import argparse
import json
import logging
import multiprocessing
import os
import pickle
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from tqdm import tqdm
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
from tree_sitter import Language, Parser

from apps.task.annote.Parser import DFG_python, DFG_go, DFG_javascript, remove_comments_and_docstrings, \
    tree_to_token_index, index_to_code_token
from apps.task.annote.Parser.utils import extract_dataflow
from apps.task.annote.datatype.extract import split_file_by_func
from apps.task.annote.purpose.model import Model
from apps.task.annote.utils import walk_files

cpu_cont = 1
logger = logging.getLogger(__name__)

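# Map each supported source language to its data-flow-graph (DFG) extractor and build a
# tree-sitter parser for it; only Python is enabled in this module.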
dfg_function = {
    'python': DFG_python,
}

Parsers = {}
for lang in dfg_function:
    LANGUAGE = Language('apps/task/annote/Parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    Parsers[lang] = [parser, dfg_function[lang]]


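# Seed all RNGs (Python, NumPy, PyTorch) so predictions are reproducible across runs.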
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


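# Holds the model inputs for a single function: RoBERTa sub-token ids, position indices,
# and the DFG-to-code / DFG-to-DFG mappings used to build the attention mask.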
class InputFeatures(object):
    """A single set of features for one training/test example (one function)."""

    def __init__(self,
                 code_tokens,
                 code_ids,
                 position_idx,
                 dfg_to_code,
                 dfg_to_dfg,
                 file_path,
                 func_name,
                 label=None):
        self.label_list = ['Archive', 'Azure', 'File', 'Hash', 'Kafka', 'Other', 'Pseudonym', 'S3', 'Truncate',
                           'Visualize']
        self.code_tokens = code_tokens
        self.code_ids = code_ids
        self.position_idx = position_idx
        self.dfg_to_code = dfg_to_code
        self.dfg_to_dfg = dfg_to_dfg
        self.file_path = file_path
        self.func_name = func_name
        self.label = label

    def __str__(self):
        return self.file_path + "-" + self.func_name + " " + str(self.label)


def convert_examples_to_features(item):
    """Convert one (file_path, func_name, source, tokenizer, args) tuple into an InputFeatures object."""
    file_path, func_name, content, tokenizer, arg = item
    # Extract code tokens and the data-flow graph (DFG) for the function body.
    par = Parsers[arg.lang]
    code_tokens, dfg = extract_dataflow(content, par, arg.lang)
    # Sub-tokenize every code token; the '@ ' prefix forces a leading-space tokenization
    # for all but the first token, and the prefix piece itself is dropped.
    code_tokens = [tokenizer.tokenize('@ ' + x)[1:] if idx != 0 else tokenizer.tokenize(x) for idx, x in
                   enumerate(code_tokens)]
    # Map each original token index to its (start, end) span in the flattened sub-token list.
    ori2cur_pos = {-1: (0, 0)}
    for i in range(len(code_tokens)):
        ori2cur_pos[i] = (ori2cur_pos[i - 1][1], ori2cur_pos[i - 1][1] + len(code_tokens[i]))
    code_tokens = [y for x in code_tokens for y in x]
    # Truncate so that code tokens, DFG nodes and the two special tokens fit the input budget.
    code_tokens = code_tokens[:arg.code_length + arg.data_flow_length - 2 - min(len(dfg), arg.data_flow_length)]
    code_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    code_ids = tokenizer.convert_tokens_to_ids(code_tokens)
    position_idx = [i + tokenizer.pad_token_id + 1 for i in range(len(code_tokens))]
    dfg = dfg[:arg.code_length + arg.data_flow_length - len(code_tokens)]
    code_tokens += [x[0] for x in dfg]
    position_idx += [0 for x in dfg]
    code_ids += [tokenizer.unk_token_id for x in dfg]
    padding_length = arg.code_length + arg.data_flow_length - len(code_ids)
    position_idx += [tokenizer.pad_token_id] * padding_length
    code_ids += [tokenizer.pad_token_id] * padding_length
    # Re-index DFG edges so they refer to positions inside the (possibly truncated) DFG.
    reverse_index = {}
    for idx, x in enumerate(dfg):
        reverse_index[x[1]] = idx
    for idx, x in enumerate(dfg):
        dfg[idx] = x[:-1] + ([reverse_index[i] for i in x[-1] if i in reverse_index],)
    dfg_to_dfg = [x[-1] for x in dfg]
    dfg_to_code = [ori2cur_pos[x[1]] for x in dfg]
    # Shift the code spans by one to account for the leading CLS token.
    length = len([tokenizer.cls_token])
    dfg_to_code = [(x[0] + length, x[1] + length) for x in dfg_to_code]
    return InputFeatures(code_tokens, code_ids, position_idx, dfg_to_code, dfg_to_dfg, file_path, func_name)


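# Dataset that walks input_path, splits each source file into functions, and converts
# every function into an InputFeatures example.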
class TextDataset(Dataset):
    def __init__(self, input_path, tokenizer, args, pool=None):
        self.args = args
        self.examples = []
        data = []
        for file_name in walk_files(input_path):
            func_content_dict = split_file_by_func(file_name)
            for func_name, func_content in func_content_dict.items():
                data.append((file_name, func_name, func_content[0], tokenizer, args))
        for d in data:
            self.examples.append(convert_examples_to_features(d))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        # Build the graph-guided attention mask.
        attn_mask = np.zeros((self.args.code_length + self.args.data_flow_length,
                              self.args.code_length + self.args.data_flow_length), dtype=bool)
        # Begin index of the DFG nodes and length of the non-padding input.
        node_index = sum([i > 1 for i in self.examples[item].position_idx])
        max_length = sum([i != 1 for i in self.examples[item].position_idx])
        # Sequence tokens can attend to each other.
        attn_mask[:node_index, :node_index] = True
        # Special tokens (CLS/SEP) attend to all non-padding tokens.
        for idx, i in enumerate(self.examples[item].code_ids):
            if i in [0, 2]:
                attn_mask[idx, :max_length] = True
        # DFG nodes attend to the code tokens they were identified from.
        for idx, (a, b) in enumerate(self.examples[item].dfg_to_code):
            if a < node_index and b < node_index:
                attn_mask[idx + node_index, a:b] = True
                attn_mask[a:b, idx + node_index] = True
        # DFG nodes attend to adjacent DFG nodes.
        for idx, nodes in enumerate(self.examples[item].dfg_to_dfg):
            for a in nodes:
                if a + node_index < len(self.examples[item].position_idx):
                    attn_mask[idx + node_index, a + node_index] = True
        return (torch.tensor(self.examples[item].code_ids),
                torch.tensor(attn_mask),
                torch.tensor(self.examples[item].position_idx))


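# End-to-end inference: load the fine-tuned purpose classifier, featurize every function
# under input_path, and return one record per function whose predicted purpose is not 'Other'.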
def predict(input_path, args):
    pool = multiprocessing.Pool(cpu_cont)
    # Setup CUDA / GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = 0
    args.device = device
    set_seed(args)
    # Load config, tokenizer and the base RoBERTa classifier, then wrap it in the task model.
    config = RobertaConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    config.num_labels = 1
    tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
    model = RobertaForSequenceClassification.from_pretrained(args.config_name, config=config)
    model = Model(model, config, tokenizer, args)
    # Restore the fine-tuned checkpoint.
    output_dir = os.path.join(args.output_dir, '{}'.format(args.checkpoint_prefix))
    model.load_state_dict(torch.load(output_dir, map_location=device))
    model.to(args.device)
    model.eval()
    # Build the dataloader.
    eval_dataset = TextDataset(input_path, tokenizer, args, pool=pool)
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=4)
    # Multi-GPU evaluation.
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    logit = []
    for batch in eval_dataloader:
        (code_ids, attn_mask, position_idx) = [x.to(args.device) for x in batch]
        with torch.no_grad():
            log = model(code_ids, attn_mask, position_idx)
            logit.append(log.cpu().numpy())
    logit = np.concatenate(logit, 0)
    y_pre = np.argmax(logit, 1).tolist()
    # Map predicted class indices to label names; index 5 ('Other') is not reported.
    result = []
    for i in range(len(eval_dataset.examples)):
        if y_pre[i] != 5:
            eval_dataset.examples[i].label = eval_dataset.examples[i].label_list[y_pre[i]]
            result.append({
                'file_path': eval_dataset.examples[i].file_path.replace(input_path + '/', ''),
                'func_name': eval_dataset.examples[i].func_name,
                'purpose': eval_dataset.examples[i].label
            })
    return result


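# Illustrative only: a minimal sketch of how predict() might be driven from the command line.
# The default values below (pretrained model id, checkpoint location, sequence lengths,
# batch size) are assumptions for demonstration, not values taken from this repository.
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input_path', required=True, help='directory of source files to annotate')
    arg_parser.add_argument('--lang', default='python')
    arg_parser.add_argument('--config_name', default='microsoft/graphcodebert-base')
    arg_parser.add_argument('--model_name_or_path', default='microsoft/graphcodebert-base')
    arg_parser.add_argument('--tokenizer_name', default='microsoft/graphcodebert-base')
    arg_parser.add_argument('--output_dir', default='saved_models')
    arg_parser.add_argument('--checkpoint_prefix', default='checkpoint-best-f1/model.bin')
    arg_parser.add_argument('--code_length', type=int, default=256)
    arg_parser.add_argument('--data_flow_length', type=int, default=64)
    arg_parser.add_argument('--eval_batch_size', type=int, default=8)
    arg_parser.add_argument('--seed', type=int, default=42)
    cli_args = arg_parser.parse_args()
    print(json.dumps(predict(cli_args.input_path, cli_args), indent=2))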