import argparse
import json
from configparser import ConfigParser

from apps.task.annote.datatype.extract import extract_data_type
from apps.task.annote.purpose.predict import predict
from apps.task.annote.utils import load_json

parser = argparse.ArgumentParser()

# Parameters (all optional; defaults below)
parser.add_argument("--lang", default="python", type=str,
                    help="language type, default is python")
parser.add_argument("--tokenizer_name", default="microsoft/graphcodebert-base", type=str,
                    help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
parser.add_argument("--eval_batch_size", default=8, type=int,
                    help="Batch size per GPU/CPU for predict.")
parser.add_argument("--output_dir", default="purpose/saved_models", type=str,
                    help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--checkpoint_prefix", default="model.bin", type=str,
                    help="File name of the model checkpoint to load under output_dir.")
parser.add_argument("--config_name", default="microsoft/graphcodebert-base", type=str,
                    help="Optional pretrained config name or path if not the same as model_name_or_path")
parser.add_argument("--code_length", default=256, type=int,
                    help="Optional Code input sequence length after tokenization.")
parser.add_argument("--data_flow_length", default=64, type=int,
                    help="Optional Data Flow input sequence length after tokenization.")
parser.add_argument("--seed", type=int, default=42,
                    help="random seed for initialization")
parser.add_argument("--n_classes", type=int, default=10,
                    help="number of purpose classes to predict")
args, unknown = parser.parse_known_args()


def reload_params(train_params):
    """Override the parsed CLI defaults with values from a params dict loaded from JSON."""
    args.n_classes = train_params['n_classes']
    args.do_train = train_params['do_train'] == 'True'
    args.do_test = train_params['do_test'] == 'True'
    args.train_batch_size = train_params['train_batch_size']
    args.eval_batch_size = train_params['eval_batch_size']
    args.epochs = train_params['epochs']
    args.lang = train_params['lang']
    args.output_dir = train_params['output_dir']
    args.code_length = train_params['code_length']
    args.data_flow_length = train_params['data_flow_length']
    args.seed = train_params['seed']
    args.train_data_file = train_params['train_data_file']
    args.test_data_file = train_params['test_data_file']
    args.model_path = train_params['model_path']
    args.gradient_accumulation_steps = train_params['gradient_accumulation_steps']
    args.learning_rate = train_params['learning_rate']
    args.weight_decay = train_params['weight_decay']
    args.adam_epsilon = train_params['adam_epsilon']
    args.max_grad_norm = train_params['max_grad_norm']
    args.max_steps = train_params['max_steps']
    args.warmup_steps = train_params['warmup_steps']


def load_params():
    """Override the parsed CLI defaults with values from the [params] section of params.cfg."""
    cp = ConfigParser()
    cp.read('params.cfg', encoding='utf-8')
    args.n_classes = int(cp.get('params', 'n_classes'))
    args.eval_batch_size = int(cp.get('params', 'batch_size'))
    args.output_dir = cp.get('params', 'output_dir')
    args.code_length = int(cp.get('params', 'code_length'))
    args.data_flow_length = int(cp.get('params', 'data_flow_length'))
    args.seed = int(cp.get('params', 'seed'))
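
# For reference, load_params expects a params.cfg shaped like the sketch
# below. The key names are taken from the cp.get calls above; the values
# are illustrative assumptions mirroring the argparse defaults, not a file
# shipped with the project:
#
#   [params]
#   n_classes = 10
#   batch_size = 8
#   output_dir = purpose/saved_models
#   code_length = 256
#   data_flow_length = 64
#   seed = 42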

def annotate(source, lattices, entire=False):
    """
    :param source: path of the file or directory to annotate
    :param lattices: annotation dictionary for data_type
    :param entire: if True, group annotations per method; otherwise return raw lists
    :return: a dict keyed by "<file_path>-<func_name>" when entire is True,
             otherwise a (data_type_list, purpose_list) tuple
    """
    params = load_json('apps/task/annote/params.json')
    reload_params(params)
    data_type_list = extract_data_type(source, lattices, args)
    purpose_list = predict(source, args)
    if entire:
        # Merge the two annotation streams per method, keyed by file path + function name.
        methods = dict()
        for data_type_single in data_type_list:
            func_key = data_type_single['file_path'] + "-" + data_type_single['func_name']
            if func_key in methods:
                methods[func_key]['data_type'].append(data_type_single)
            else:
                methods[func_key] = dict()
                methods[func_key]['data_type'] = [data_type_single]
        for purpose in purpose_list:
            func_key = purpose['file_path'] + "-" + purpose['func_name']
            if func_key in methods:
                methods[func_key]['purpose'] = purpose
            else:
                methods[func_key] = dict()
                methods[func_key]['purpose'] = purpose
        return methods
    return data_type_list, purpose_list


if __name__ == '__main__':
    with open("datatype_dictionary.json", 'r', encoding='utf-8') as file:
        data_type = json.load(file)
    path = "/Users/liufan/Documents/实验室/隐私扫描项目/SAP检测项目/mini/Instagram_profile"
    data_type_list, purpose_list = annotate(path, data_type, entire=False)
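
# Grouped usage (a sketch): with entire=True, annotate returns one record per
# method instead of two parallel lists. The inner record fields come from
# extract_data_type and predict, so only the 'data_type' and 'purpose' keys
# shown here are guaranteed by this module:
#
#   methods = annotate(path, data_type, entire=True)
#   # methods["<file_path>-<func_name>"] ==
#   #     {'data_type': [<data_type records>], 'purpose': <purpose record>}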