# Copyright (c) Microsoft Corporation. # Licensed under the MIT license. from tree_sitter import Language, Parser from .utils import (remove_comments_and_docstrings, tree_to_token_index, index_to_code_token, tree_to_variable_index) def DFG_python(root_node, index_to_code, states): assignment = ['assignment', 'augmented_assignment', 'for_in_clause'] if_statement = ['if_statement'] for_statement = ['for_statement'] while_statement = ['while_statement'] do_first_statement = ['for_in_clause'] def_statement = ['default_parameter'] states = states.copy() if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('value') DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_python(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: if root_node.type == 'for_in_clause': right_nodes = [root_node.children[-1]] left_nodes = [root_node.child_by_field_name('left')] else: if root_node.child_by_field_name('right') is None: return [], states left_nodes = [x for x in root_node.child_by_field_name('left').children if x.type != ','] right_nodes = [x for x in root_node.child_by_field_name('right').children if x.type != ','] if len(right_nodes) != len(left_nodes): left_nodes = [root_node.child_by_field_name('left')] right_nodes = [root_node.child_by_field_name('right')] if len(left_nodes) == 0: left_nodes = [root_node.child_by_field_name('left')] if len(right_nodes) == 0: right_nodes = [root_node.child_by_field_name('right')] DFG = [] for node in right_nodes: temp, states = DFG_python(node, index_to_code, states) DFG += temp for left_node, right_node in zip(left_nodes, right_nodes): left_tokens_index = tree_to_variable_index(left_node, index_to_code) right_tokens_index = tree_to_variable_index(right_node, index_to_code) temp = [] for token1_index in left_tokens_index: idx1, code1 = index_to_code[token1_index] temp.append((code1, idx1, 'computedFrom', [index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1] = [idx1] DFG += temp return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in ['elif_clause', 'else_clause']: temp, current_states = DFG_python(child, index_to_code, current_states) DFG += temp else: temp, new_states = DFG_python(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for i in range(2): right_nodes = [x for x in root_node.child_by_field_name('right').children if x.type != ','] left_nodes = [x for x in root_node.child_by_field_name('left').children if x.type != ','] if len(right_nodes) != len(left_nodes): left_nodes = [root_node.child_by_field_name('left')] right_nodes = [root_node.child_by_field_name('right')] if len(left_nodes) == 0: left_nodes = [root_node.child_by_field_name('left')] if len(right_nodes) == 0: right_nodes = [root_node.child_by_field_name('right')] for node in right_nodes: temp, states = DFG_python(node, index_to_code, states) DFG += temp for left_node, right_node in zip(left_nodes, right_nodes): left_tokens_index = tree_to_variable_index(left_node, index_to_code) right_tokens_index = tree_to_variable_index(right_node, index_to_code) temp = [] for token1_index in left_tokens_index: idx1, code1 = index_to_code[token1_index] temp.append((code1, idx1, 'computedFrom', [index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1] = [idx1] DFG += temp if root_node.children[-1].type == "block": temp, states = DFG_python(root_node.children[-1], index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in while_statement: DFG = [] for i in range(2): for child in root_node.children: temp, states = DFG_python(child, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_python(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_python(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states def DFG_java(root_node, index_to_code, states): assignment = ['assignment_expression'] def_statement = ['variable_declarator'] increment_statement = ['update_expression'] if_statement = ['if_statement', 'else'] for_statement = ['for_statement'] enhanced_for_statement = ['enhanced_for_statement'] while_statement = ['while_statement'] do_first_statement = [] states = states.copy() if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('value') DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_java(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: left_nodes = root_node.child_by_field_name('left') right_nodes = root_node.child_by_field_name('right') DFG = [] temp, states = DFG_java(right_nodes, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(left_nodes, index_to_code) value_indexs = tree_to_variable_index(right_nodes, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in increment_statement: DFG = [] indexs = tree_to_variable_index(root_node, index_to_code) for index1 in indexs: idx1, code1 = index_to_code[index1] for index2 in indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] flag = False tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in if_statement and flag is False: temp, current_states = DFG_java(child, index_to_code, current_states) DFG += temp else: flag = True temp, new_states = DFG_java(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for child in root_node.children: temp, states = DFG_java(child, index_to_code, states) DFG += temp flag = False for child in root_node.children: if flag: temp, states = DFG_java(child, index_to_code, states) DFG += temp elif child.type == "local_variable_declaration": flag = True dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in enhanced_for_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('value') body = root_node.child_by_field_name('body') DFG = [] for i in range(2): temp, states = DFG_java(value, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] temp, states = DFG_java(body, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in while_statement: DFG = [] for i in range(2): for child in root_node.children: temp, states = DFG_java(child, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_java(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_java(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states def DFG_csharp(root_node, index_to_code, states): assignment = ['assignment_expression'] def_statement = ['variable_declarator'] increment_statement = ['postfix_unary_expression'] if_statement = ['if_statement', 'else'] for_statement = ['for_statement'] enhanced_for_statement = ['for_each_statement'] while_statement = ['while_statement'] do_first_statement = [] states = states.copy() if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: if len(root_node.children) == 2: name = root_node.children[0] value = root_node.children[1] else: name = root_node.children[0] value = None DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_csharp(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: left_nodes = root_node.child_by_field_name('left') right_nodes = root_node.child_by_field_name('right') DFG = [] temp, states = DFG_csharp(right_nodes, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(left_nodes, index_to_code) value_indexs = tree_to_variable_index(right_nodes, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in increment_statement: DFG = [] indexs = tree_to_variable_index(root_node, index_to_code) for index1 in indexs: idx1, code1 = index_to_code[index1] for index2 in indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] flag = False tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in if_statement and flag is False: temp, current_states = DFG_csharp(child, index_to_code, current_states) DFG += temp else: flag = True temp, new_states = DFG_csharp(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for child in root_node.children: temp, states = DFG_csharp(child, index_to_code, states) DFG += temp flag = False for child in root_node.children: if flag: temp, states = DFG_csharp(child, index_to_code, states) DFG += temp elif child.type == "local_variable_declaration": flag = True dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in enhanced_for_statement: name = root_node.child_by_field_name('left') value = root_node.child_by_field_name('right') body = root_node.child_by_field_name('body') DFG = [] for i in range(2): temp, states = DFG_csharp(value, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] temp, states = DFG_csharp(body, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in while_statement: DFG = [] for i in range(2): for child in root_node.children: temp, states = DFG_csharp(child, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_csharp(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_csharp(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states def DFG_ruby(root_node, index_to_code, states): assignment = ['assignment', 'operator_assignment'] if_statement = ['if', 'elsif', 'else', 'unless', 'when'] for_statement = ['for'] while_statement = ['while_modifier', 'until'] do_first_statement = [] def_statement = ['keyword_parameter'] if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': states = states.copy() idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('value') DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_ruby(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: left_nodes = [x for x in root_node.child_by_field_name('left').children if x.type != ','] right_nodes = [x for x in root_node.child_by_field_name('right').children if x.type != ','] if len(right_nodes) != len(left_nodes): left_nodes = [root_node.child_by_field_name('left')] right_nodes = [root_node.child_by_field_name('right')] if len(left_nodes) == 0: left_nodes = [root_node.child_by_field_name('left')] if len(right_nodes) == 0: right_nodes = [root_node.child_by_field_name('right')] if root_node.type == "operator_assignment": left_nodes = [root_node.children[0]] right_nodes = [root_node.children[-1]] DFG = [] for node in right_nodes: temp, states = DFG_ruby(node, index_to_code, states) DFG += temp for left_node, right_node in zip(left_nodes, right_nodes): left_tokens_index = tree_to_variable_index(left_node, index_to_code) right_tokens_index = tree_to_variable_index(right_node, index_to_code) temp = [] for token1_index in left_tokens_index: idx1, code1 = index_to_code[token1_index] temp.append((code1, idx1, 'computedFrom', [index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1] = [idx1] DFG += temp return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in if_statement: temp, current_states = DFG_ruby(child, index_to_code, current_states) DFG += temp else: temp, new_states = DFG_ruby(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for i in range(2): left_nodes = [root_node.child_by_field_name('pattern')] right_nodes = [root_node.child_by_field_name('value')] assert len(right_nodes) == len(left_nodes) for node in right_nodes: temp, states = DFG_ruby(node, index_to_code, states) DFG += temp for left_node, right_node in zip(left_nodes, right_nodes): left_tokens_index = tree_to_variable_index(left_node, index_to_code) right_tokens_index = tree_to_variable_index(right_node, index_to_code) temp = [] for token1_index in left_tokens_index: idx1, code1 = index_to_code[token1_index] temp.append((code1, idx1, 'computedFrom', [index_to_code[x][1] for x in right_tokens_index], [index_to_code[x][0] for x in right_tokens_index])) states[code1] = [idx1] DFG += temp temp, states = DFG_ruby(root_node.child_by_field_name('body'), index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in while_statement: DFG = [] for i in range(2): for child in root_node.children: temp, states = DFG_ruby(child, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_ruby(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_ruby(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states def DFG_go(root_node, index_to_code, states): assignment = ['assignment_statement', ] def_statement = ['var_spec'] increment_statement = ['inc_statement'] if_statement = ['if_statement', 'else'] for_statement = ['for_statement'] enhanced_for_statement = [] while_statement = [] do_first_statement = [] states = states.copy() if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('value') DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_go(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: left_nodes = root_node.child_by_field_name('left') right_nodes = root_node.child_by_field_name('right') DFG = [] temp, states = DFG_go(right_nodes, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(left_nodes, index_to_code) value_indexs = tree_to_variable_index(right_nodes, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in increment_statement: DFG = [] indexs = tree_to_variable_index(root_node, index_to_code) for index1 in indexs: idx1, code1 = index_to_code[index1] for index2 in indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] flag = False tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in if_statement and flag is False: temp, current_states = DFG_go(child, index_to_code, current_states) DFG += temp else: flag = True temp, new_states = DFG_go(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in states: if key not in new_states: new_states[key] = states[key] else: new_states[key] += states[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for child in root_node.children: temp, states = DFG_go(child, index_to_code, states) DFG += temp flag = False for child in root_node.children: if flag: temp, states = DFG_go(child, index_to_code, states) DFG += temp elif child.type == "for_clause": if child.child_by_field_name('update') is not None: temp, states = DFG_go(child.child_by_field_name('update'), index_to_code, states) DFG += temp flag = True dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_go(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_go(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states def DFG_php(root_node, index_to_code, states): assignment = ['assignment_expression', 'augmented_assignment_expression'] def_statement = ['simple_parameter'] increment_statement = ['update_expression'] if_statement = ['if_statement', 'else_clause'] for_statement = ['for_statement'] enhanced_for_statement = ['foreach_statement'] while_statement = ['while_statement'] do_first_statement = [] states = states.copy() if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('default_value') DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_php(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: left_nodes = root_node.child_by_field_name('left') right_nodes = root_node.child_by_field_name('right') DFG = [] temp, states = DFG_php(right_nodes, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(left_nodes, index_to_code) value_indexs = tree_to_variable_index(right_nodes, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in increment_statement: DFG = [] indexs = tree_to_variable_index(root_node, index_to_code) for index1 in indexs: idx1, code1 = index_to_code[index1] for index2 in indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] flag = False tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in if_statement and flag is False: temp, current_states = DFG_php(child, index_to_code, current_states) DFG += temp else: flag = True temp, new_states = DFG_php(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in states: if key not in new_states: new_states[key] = states[key] else: new_states[key] += states[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for child in root_node.children: temp, states = DFG_php(child, index_to_code, states) DFG += temp flag = False for child in root_node.children: if flag: temp, states = DFG_php(child, index_to_code, states) DFG += temp elif child.type == "assignment_expression": flag = True dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in enhanced_for_statement: name = None value = None for child in root_node.children: if child.type == 'variable_name' and value is None: value = child elif child.type == 'variable_name' and name is None: name = child break body = root_node.child_by_field_name('body') DFG = [] for i in range(2): temp, states = DFG_php(value, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] temp, states = DFG_php(body, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in while_statement: DFG = [] for i in range(2): for child in root_node.children: temp, states = DFG_php(child, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_php(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_php(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states def DFG_javascript(root_node, index_to_code, states): assignment = ['assignment_pattern', 'augmented_assignment_expression'] def_statement = ['variable_declarator'] increment_statement = ['update_expression'] if_statement = ['if_statement', 'else'] for_statement = ['for_statement'] enhanced_for_statement = [] while_statement = ['while_statement'] do_first_statement = [] states = states.copy() if (len(root_node.children) == 0 or root_node.type == 'string') and root_node.type != 'comment': idx, code = index_to_code[(root_node.start_point, root_node.end_point)] if root_node.type == code: return [], states elif code in states: return [(code, idx, 'comesFrom', [code], states[code].copy())], states else: if root_node.type == 'identifier': states[code] = [idx] return [(code, idx, 'comesFrom', [], [])], states elif root_node.type in def_statement: name = root_node.child_by_field_name('name') value = root_node.child_by_field_name('value') DFG = [] if value is None: indexs = tree_to_variable_index(name, index_to_code) for index in indexs: idx, code = index_to_code[index] DFG.append((code, idx, 'comesFrom', [], [])) states[code] = [idx] return sorted(DFG, key=lambda x: x[1]), states else: name_indexs = tree_to_variable_index(name, index_to_code) value_indexs = tree_to_variable_index(value, index_to_code) temp, states = DFG_javascript(value, index_to_code, states) DFG += temp for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'comesFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in assignment: left_nodes = root_node.child_by_field_name('left') right_nodes = root_node.child_by_field_name('right') DFG = [] temp, states = DFG_javascript(right_nodes, index_to_code, states) DFG += temp name_indexs = tree_to_variable_index(left_nodes, index_to_code) value_indexs = tree_to_variable_index(right_nodes, index_to_code) for index1 in name_indexs: idx1, code1 = index_to_code[index1] for index2 in value_indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in increment_statement: DFG = [] indexs = tree_to_variable_index(root_node, index_to_code) for index1 in indexs: idx1, code1 = index_to_code[index1] for index2 in indexs: idx2, code2 = index_to_code[index2] DFG.append((code1, idx1, 'computedFrom', [code2], [idx2])) states[code1] = [idx1] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in if_statement: DFG = [] current_states = states.copy() others_states = [] flag = False tag = False if 'else' in root_node.type: tag = True for child in root_node.children: if 'else' in child.type: tag = True if child.type not in if_statement and flag is False: temp, current_states = DFG_javascript(child, index_to_code, current_states) DFG += temp else: flag = True temp, new_states = DFG_javascript(child, index_to_code, states) DFG += temp others_states.append(new_states) others_states.append(current_states) if tag is False: others_states.append(states) new_states = {} for dic in others_states: for key in dic: if key not in new_states: new_states[key] = dic[key].copy() else: new_states[key] += dic[key] for key in states: if key not in new_states: new_states[key] = states[key] else: new_states[key] += states[key] for key in new_states: new_states[key] = sorted(list(set(new_states[key]))) return sorted(DFG, key=lambda x: x[1]), new_states elif root_node.type in for_statement: DFG = [] for child in root_node.children: temp, states = DFG_javascript(child, index_to_code, states) DFG += temp flag = False for child in root_node.children: if flag: temp, states = DFG_javascript(child, index_to_code, states) DFG += temp elif child.type == "variable_declaration": flag = True dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states elif root_node.type in while_statement: DFG = [] for i in range(2): for child in root_node.children: temp, states = DFG_javascript(child, index_to_code, states) DFG += temp dic = {} for x in DFG: if (x[0], x[1], x[2]) not in dic: dic[(x[0], x[1], x[2])] = [x[3], x[4]] else: dic[(x[0], x[1], x[2])][0] = list(set(dic[(x[0], x[1], x[2])][0] + x[3])) dic[(x[0], x[1], x[2])][1] = sorted(list(set(dic[(x[0], x[1], x[2])][1] + x[4]))) DFG = [(x[0], x[1], x[2], y[0], y[1]) for x, y in sorted(dic.items(), key=lambda t: t[0][1])] return sorted(DFG, key=lambda x: x[1]), states else: DFG = [] for child in root_node.children: if child.type in do_first_statement: temp, states = DFG_javascript(child, index_to_code, states) DFG += temp for child in root_node.children: if child.type not in do_first_statement: temp, states = DFG_javascript(child, index_to_code, states) DFG += temp return sorted(DFG, key=lambda x: x[1]), states