import argparse
import json
from configparser import ConfigParser

from apps.task.annote.datatype.extract import extract_data_type
from apps.task.annote.purpose.predict import predict
from apps.task.annote.utils import load_json
parser = argparse.ArgumentParser()
# Prediction parameters
parser.add_argument("--lang", default="python", type=str,
                    help="Language type, default is python.")
parser.add_argument("--tokenizer_name", default="microsoft/graphcodebert-base", type=str,
                    help="Optional pretrained tokenizer name or path if not the same as model_name_or_path.")
parser.add_argument("--eval_batch_size", default=8, type=int,
                    help="Batch size per GPU/CPU for prediction.")
parser.add_argument("--output_dir", default="purpose/saved_models", type=str,
                    help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--checkpoint_prefix", default="model.bin", type=str,
                    help="File name of the model checkpoint to load from output_dir.")
parser.add_argument("--config_name", default="microsoft/graphcodebert-base", type=str,
                    help="Optional pretrained config name or path if not the same as model_name_or_path.")
parser.add_argument("--code_length", default=256, type=int,
                    help="Optional code input sequence length after tokenization.")
parser.add_argument("--data_flow_length", default=64, type=int,
                    help="Optional data flow input sequence length after tokenization.")
parser.add_argument('--seed', type=int, default=42,
                    help="Random seed for initialization.")
parser.add_argument('--n_classes', type=int, default=10,
                    help="Number of purpose classes for classification.")
args, unknown = parser.parse_known_args()
def reload_params(train_params):
    """Override the command-line defaults with values from a training-parameter dict."""
    args.n_classes = train_params['n_classes']
    args.do_train = train_params['do_train'] == 'True'
    args.do_test = train_params['do_test'] == 'True'
    args.train_batch_size = train_params['train_batch_size']
    args.eval_batch_size = train_params['eval_batch_size']
    args.epochs = train_params['epochs']
    args.lang = train_params['lang']
    args.output_dir = train_params['output_dir']
    args.code_length = train_params['code_length']
    args.data_flow_length = train_params['data_flow_length']
    args.seed = train_params['seed']
    args.train_data_file = train_params['train_data_file']
    args.test_data_file = train_params['test_data_file']
    args.model_path = train_params['model_path']
    args.gradient_accumulation_steps = train_params['gradient_accumulation_steps']
    args.learning_rate = train_params['learning_rate']
    args.weight_decay = train_params['weight_decay']
    args.adam_epsilon = train_params['adam_epsilon']
    args.max_grad_norm = train_params['max_grad_norm']
    args.max_steps = train_params['max_steps']
    args.warmup_steps = train_params['warmup_steps']
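
# Illustrative sketch of the params.json consumed by annotate() below. Only the key
# names are taken from reload_params above; every value shown here is an assumption,
# not the project's actual configuration. Note that do_train/do_test are compared
# against the string 'True', so they are strings rather than JSON booleans.
#
# {
#     "n_classes": 10,
#     "do_train": "False",
#     "do_test": "True",
#     "train_batch_size": 8,
#     "eval_batch_size": 8,
#     "epochs": 10,
#     "lang": "python",
#     "output_dir": "purpose/saved_models",
#     "code_length": 256,
#     "data_flow_length": 64,
#     "seed": 42,
#     "train_data_file": "train.jsonl",
#     "test_data_file": "test.jsonl",
#     "model_path": "model.bin",
#     "gradient_accumulation_steps": 1,
#     "learning_rate": 2e-5,
#     "weight_decay": 0.0,
#     "adam_epsilon": 1e-8,
#     "max_grad_norm": 1.0,
#     "max_steps": -1,
#     "warmup_steps": 0
# }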
def load_params():
    """Override the prediction defaults with values read from params.cfg."""
    cp = ConfigParser()
    cp.read('params.cfg', encoding='utf-8')
    args.n_classes = int(cp.get('params', 'n_classes'))
    args.eval_batch_size = int(cp.get('params', 'batch_size'))
    args.output_dir = cp.get('params', 'output_dir')
    args.code_length = int(cp.get('params', 'code_length'))
    args.data_flow_length = int(cp.get('params', 'data_flow_length'))
    args.seed = int(cp.get('params', 'seed'))
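
# Illustrative params.cfg expected by load_params(). The section and option names come
# from the cp.get calls above; the values are assumed placeholders.
#
# [params]
# n_classes = 10
# batch_size = 8
# output_dir = purpose/saved_models
# code_length = 256
# data_flow_length = 64
# seed = 42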
def annotate(source, lattices, entire=False):
    """
    :param source: path of the file or project to annotate
    :param lattices: annotation dictionary used for data_type extraction
    :param entire: if True, group the results per method; otherwise return the raw lists
    :return: a dict keyed by "<file_path>-<func_name>" when entire is True,
             otherwise the (data_type_list, purpose_list) tuple
    """
    params = load_json('apps/task/annote/params.json')
    reload_params(params)
    data_type_list = extract_data_type(source, lattices, args)
    purpose_list = predict(source, args)
    if entire:
        methods = dict()
        for data_type_single in data_type_list:
            func_key = data_type_single['file_path'] + "-" + data_type_single['func_name']
            if func_key in methods:
                methods[func_key]['data_type'].append(data_type_single)
            else:
                methods[func_key] = dict()
                methods[func_key]['data_type'] = [data_type_single]
        for purpose in purpose_list:
            func_key = purpose['file_path'] + "-" + purpose['func_name']
            if func_key in methods:
                methods[func_key]['purpose'] = purpose
            else:
                methods[func_key] = dict()
                methods[func_key]['purpose'] = purpose
        return methods
    else:
        return data_type_list, purpose_list
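
# Rough shape of the mapping returned by annotate(..., entire=True), inferred from the
# grouping logic above. The inner fields other than 'data_type' and 'purpose' depend on
# what extract_data_type and predict emit, so treat them as assumptions.
#
# {
#     "pkg/views.py-get_profile": {
#         "data_type": [{"file_path": "pkg/views.py", "func_name": "get_profile", ...}],
#         "purpose": {"file_path": "pkg/views.py", "func_name": "get_profile", ...}
#     }
# }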
if __name__ == '__main__':
    with open("datatype_dictionary.json", 'r', encoding='utf-8') as file:
        data_type = json.load(file)
    path = "/Users/liufan/Documents/实验室/隐私扫描项目/SAP检测项目/mini/Instagram_profile"
    data_type_list, purpose_list = annotate(path, data_type, entire=False)