import argparse
import json
import logging
import multiprocessing
import os
import pickle
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from tqdm import tqdm
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
from tree_sitter import Language, Parser

from apps.task.annote.Parser import (DFG_python, DFG_go, DFG_javascript,
                                     remove_comments_and_docstrings, tree_to_token_index,
                                     index_to_code_token)
from apps.task.annote.Parser.utils import extract_dataflow
from apps.task.annote.datatype.extract import split_file_by_func
from apps.task.annote.purpose.model import Model
from apps.task.annote.utils import walk_files

cpu_cont = 1
logger = logging.getLogger(__name__)

# data flow graph (DFG) extractors per language; only Python is supported here
dfg_function = {
    'python': DFG_python,
}

# build one tree-sitter parser (paired with its DFG extractor) per language
Parsers = {}
for lang in dfg_function:
    LANGUAGE = Language('apps/task/annote/Parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    Parsers[lang] = [parser, dfg_function[lang]]


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


class InputFeatures(object):
    """A single set of training/test features for one example."""

    def __init__(self, code_tokens, code_ids, position_idx, dfg_to_code, dfg_to_dfg,
                 file_path, func_name, label=None):
        self.label_list = ['Archive', 'Azure', 'File', 'Hash', 'Kafka',
                           'Other', 'Pseudonym', 'S3', 'Truncate', 'Visualize']
        self.code_tokens = code_tokens
        self.code_ids = code_ids
        self.position_idx = position_idx
        self.dfg_to_code = dfg_to_code
        self.dfg_to_dfg = dfg_to_dfg
        self.file_path = file_path
        self.func_name = func_name
        self.label = label

    def __str__(self):
        return "{}-{} {}".format(self.file_path, self.func_name, self.label)


def convert_examples_to_features(item):
    """
    :param item: tuple of (file_path, func_name, content, tokenizer, args)
    :return: InputFeatures
    """
    file_path, func_name, content, tokenizer, arg = item
    par = Parsers[arg.lang]
    # extract code tokens and the data flow graph for the function body
    code_tokens, dfg = extract_dataflow(content, par, arg.lang)
    code_tokens = [tokenizer.tokenize('@ ' + x)[1:] if idx != 0 else tokenizer.tokenize(x)
                   for idx, x in enumerate(code_tokens)]
    # map each original token index to its sub-token span
    ori2cur_pos = {-1: (0, 0)}
    for i in range(len(code_tokens)):
        ori2cur_pos[i] = (ori2cur_pos[i - 1][1], ori2cur_pos[i - 1][1] + len(code_tokens[i]))
    code_tokens = [y for x in code_tokens for y in x]
    # truncating
    code_tokens = code_tokens[:arg.code_length + arg.data_flow_length - 2 - min(len(dfg), arg.data_flow_length)]
    code_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
    position_idx = [i + tokenizer.pad_token_id + 1 for i in range(len(code_tokens))]
    dfg = dfg[:arg.code_length + arg.data_flow_length - len(code_tokens)]
    code_tokens += [x[0] for x in dfg]
    position_idx += [0 for x in dfg]
    code_ids += [tokenizer.unk_token_id for x in dfg]
    padding_length = arg.code_length + arg.data_flow_length - len(code_ids)
    position_idx += [tokenizer.pad_token_id] * padding_length
    code_ids += [tokenizer.pad_token_id] * padding_length
    # reindex DFG nodes so edges point to positions inside the truncated graph
    reverse_index = {}
    for idx, x in enumerate(dfg):
        reverse_index[x[1]] = idx
    for idx, x in enumerate(dfg):
        dfg[idx] = x[:-1] + ([reverse_index[i] for i in x[-1] if i in reverse_index],)
    dfg_to_dfg = [x[-1] for x in dfg]
    dfg_to_code = [ori2cur_pos[x[1]] for x in dfg]
    length = len([tokenizer.cls_token])
    dfg_to_code = [(x[0] + length, x[1] + length) for x in dfg_to_code]
    return InputFeatures(code_tokens, code_ids, position_idx, dfg_to_code, dfg_to_dfg,
                         file_path, func_name)


class TextDataset(Dataset):
    def __init__(self, input_path, tokenizer, args, pool=None):
        # `pool` is accepted for interface compatibility but not used here
        self.args = args
        self.examples = []
        data = []
        for file_name in walk_files(input_path):
            func_content_dict = split_file_by_func(file_name)
            for func_name, func_content in func_content_dict.items():
                data.append((file_name, func_name, func_content[0], tokenizer, args))
        for d in data:
            self.examples.append(convert_examples_to_features(d))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        # calculate graph-guided masked attention
        attn_mask = np.zeros((self.args.code_length + self.args.data_flow_length,
                              self.args.code_length + self.args.data_flow_length), dtype=bool)
        # calculate begin index of nodes and max length of input
        node_index = sum([i > 1 for i in self.examples[item].position_idx])
        max_length = sum([i != 1 for i in self.examples[item].position_idx])
        # sequence tokens can attend to sequence tokens
        attn_mask[:node_index, :node_index] = True
        # special tokens attend to all tokens
        for idx, i in enumerate(self.examples[item].code_ids):
            if i in [0, 2]:
                attn_mask[idx, :max_length] = True
        # nodes attend to the code tokens they are identified from
        for idx, (a, b) in enumerate(self.examples[item].dfg_to_code):
            if a < node_index and b < node_index:
                attn_mask[idx + node_index, a:b] = True
                attn_mask[a:b, idx + node_index] = True
        # nodes attend to adjacent nodes
        for idx, nodes in enumerate(self.examples[item].dfg_to_dfg):
            for a in nodes:
                if a + node_index < len(self.examples[item].position_idx):
                    attn_mask[idx + node_index, a + node_index] = True
        return (torch.tensor(self.examples[item].code_ids),
                torch.tensor(attn_mask),
                torch.tensor(self.examples[item].position_idx))


def predict(input_path, args):
    pool = multiprocessing.Pool(cpu_cont)
    # Setup CUDA, GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = 0
    args.device = device
    set_seed(args)

    config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    config.num_labels = 1
    tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
    model = RobertaForSequenceClassification.from_pretrained(args.model_name_or_path, config=config)
    model = Model(model, config, tokenizer, args)

    # load the fine-tuned checkpoint
    output_dir = os.path.join(args.output_dir, '{}'.format(args.checkpoint_prefix))
    model.load_state_dict(torch.load(output_dir, map_location=device))
    model.to(args.device)

    # build dataloader
    eval_dataset = TextDataset(input_path, tokenizer, args, pool=pool)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size, num_workers=4)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    logits = []
    for batch in eval_dataloader:
        (code_ids, attn_mask, position_idx) = [x.to(args.device) for x in batch]
        with torch.no_grad():
            log = model(code_ids, attn_mask, position_idx)
            logits.append(log.cpu().numpy())
    logits = np.concatenate(logits, 0)
    y_pre = np.argmax(logits, 1).tolist()

    result = []
    for i in range(len(eval_dataset.examples)):
        # index 5 is the 'Other' class; functions predicted as 'Other' are skipped
        if y_pre[i] != 5:
            eval_dataset.examples[i].label = eval_dataset.examples[i].label_list[y_pre[i]]
            result.append({
                'file_path': eval_dataset.examples[i].file_path.replace(input_path + '/', ''),
                'func_name': eval_dataset.examples[i].func_name,
                'purpose': eval_dataset.examples[i].label
            })
    return result
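

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): `predict` expects an argparse-style
# namespace. The attribute values below are placeholder assumptions, not the
# project's actual configuration; adjust model names, paths, and sizes to the
# checkpoint you trained.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    example_args = argparse.Namespace(
        lang='python',                      # must match a key in dfg_function
        code_length=384,                    # assumed max number of code sub-tokens
        data_flow_length=128,               # assumed max number of DFG nodes
        eval_batch_size=8,
        seed=42,
        config_name='microsoft/graphcodebert-base',       # assumed base model
        tokenizer_name='microsoft/graphcodebert-base',
        model_name_or_path='microsoft/graphcodebert-base',
        output_dir='saved_models',          # directory holding the fine-tuned weights
        checkpoint_prefix='model.bin',      # checkpoint file name inside output_dir
    )
    predictions = predict('path/to/project', example_args)
    print(json.dumps(predictions, indent=2))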