funcnode.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. import ast
  2. import re
  3. from algorithm.charactermatch import word_match
  4. from lattices.asttype import ast_type
  5. from utils.fileio import load_json
  6. from models.sentencenode import SuspectedSentenceNode
  7. def go_split(s, symbol):
  8. """
  9. 将代码行的内容分解成单词列表
  10. Args:
  11. s: script
  12. symbol: 分隔符
  13. Returns:
  14. """
  15. result = [s]
  16. for i in symbol:
  17. median = []
  18. for z in map(lambda x: x.split(i), result):
  19. median.extend(z)
  20. result = median
  21. return [x.replace(' ', '') for x in result if x not in [':\n', '']]
  22. def get_params(node, node_params=None):
  23. """
  24. 获取赋值语句中的函数参数
  25. 例如:"request1, request2 = a + b"
  26. return 的内容为 request1, request2
  27. node(a,b)
  28. return a,b
  29. Args:
  30. node:ast node
  31. node_params:[]
  32. Returns:
  33. [request1, request2]
  34. """
  35. if not node_params:
  36. node_params = []
  37. if isinstance(node, ast.Name):
  38. node_params.append(node.id)
  39. return node_params
  40. elif isinstance(node, ast.Call):
  41. for arg in node.args:
  42. if isinstance(arg, ast.Name):
  43. node_params.append(arg.id)
  44. else:
  45. node_params = get_params(arg, node_params)
  46. if isinstance(node.func, ast.Attribute):
  47. node_value = node.func.value
  48. while isinstance(node_value, ast.Attribute):
  49. node_params.append(node_value.attr)
  50. node_value = node_value.value
  51. if isinstance(node_value, ast.Name):
  52. node_params.append(node_value.id)
  53. elif isinstance(node, ast.List) or isinstance(node, ast.Tuple) or isinstance(node, ast.Set):
  54. for arg in node.elts:
  55. if isinstance(arg, ast.Name):
  56. node_params.append(arg.id)
  57. else:
  58. node_params = get_params(arg, node_params)
  59. else:
  60. pass
  61. return node_params
  62. def get_script(node, script_list):
  63. """
  64. Args:
  65. node: ast节点
  66. script_list:源代码字符串列表
  67. Returns:
  68. """
  69. script_ori = script_list[node.lineno - 1:node.end_lineno]
  70. script_tmp = ""
  71. if node.__class__ in ast_type:
  72. for i in range(len(script_ori)):
  73. if ":" not in script_ori[i]:
  74. script_tmp = script_tmp + script_ori[i].replace('\\\n', '').replace('\n', '')
  75. else:
  76. script_tmp = script_tmp + script_ori[i]
  77. break
  78. else:
  79. script_tmp = "".join(script_ori).replace('\\\n', '').replace('\n', '')
  80. words_list = {'methods': [], 'vars': []}
  81. get_all_words(node, node.lineno, words_list)
  82. # print("words_list:", words_list)
  83. words_in_line = go_split(script_tmp, ':.()[]{},=+-*/#&@!^\'\" ')
  84. # TODO 修改words_in_line中的 注释内容
  85. words_list['vars'] = [item for item in words_list['vars'] if
  86. item not in words_list[
  87. 'methods'] and item in words_in_line and "\"\"\"" not in item and "#" not in item]
  88. # print(words_list)
  89. words_list['methods'] = [item for item in words_list['methods'] if
  90. item in words_in_line and "\"\"\"" not in item and "#" not in item]
  91. words_list['methods'] = list(set(words_list['methods']))
  92. words_list['vars'] = list(set(words_list['vars']))
  93. # print(node.lineno, words_list)
  94. # words_list['methods'] = go_split(script_tmp, '()[]{},=+-*/#&@!^ ')
  95. # words_list['vars'] = go_split(script_tmp, '()[]{},=+-*/#&@!^ ')
  96. return script_tmp, words_list
  97. def match_data_type(script, data_type):
  98. """
  99. Args:
  100. script: 代码字符串
  101. data_type: 隐私数据类型
  102. Returns:
  103. [(data_type, word_in script),...]
  104. """
  105. private_word_list = []
  106. for word in script:
  107. for key in data_type.keys():
  108. word_std_list = data_type[key]['abbr']
  109. if word_match(word_std_list, word):
  110. private_word_list.append((key, word))
  111. private_word_list = list(set(private_word_list))
  112. if len(private_word_list) == 0:
  113. private_word_list = [("None", "none")]
  114. return private_word_list
  115. def match_purpose_type(script, purpose_dict):
  116. """
  117. Args:
  118. script:
  119. purpose_dict:
  120. Returns:
  121. purpose
  122. """
  123. purpose = []
  124. for word in script:
  125. for key in purpose_dict.keys():
  126. purpose_list = purpose_dict[key]['abbr']
  127. # print("purpose_list:", purpose_list, "word: ", word, word_match(purpose_list, word))
  128. if word_match(purpose_list, word):
  129. purpose.append(purpose_dict[key]['path'])
  130. purpose = list(set(purpose))
  131. if len(purpose) == 0:
  132. purpose = ["None"]
  133. return purpose
  134. def get_all_words(node, line_no, vars_and_methods):
  135. if isinstance(node, ast.Call):
  136. if isinstance(node.func, ast.Name):
  137. vars_and_methods['methods'].append(node.func.id)
  138. elif isinstance(node.func, ast.Attribute):
  139. vars_and_methods['methods'].append(node.func.attr)
  140. node_value = node.func.value
  141. while isinstance(node_value, ast.Attribute):
  142. vars_and_methods['methods'].append(node_value.attr)
  143. node_value = node_value.value
  144. if isinstance(node_value, ast.Name):
  145. vars_and_methods['methods'].append(node_value.id)
  146. elif isinstance(node, ast.Import):
  147. for name in node.names:
  148. vars_and_methods['methods'].append(name.name)
  149. if name.asname:
  150. vars_and_methods['methods'].append(name.asname)
  151. elif isinstance(node, ast.ImportFrom):
  152. for name in node.names:
  153. vars_and_methods['methods'].append(name.name)
  154. if name.asname:
  155. vars_and_methods['methods'].append(name.asname)
  156. vars_and_methods['methods'].extend(node.module.split("."))
  157. # if hasattr(node, 'lineno') and node.lineno == line_no:
  158. for field, value in ast.iter_fields(node):
  159. if isinstance(value, list):
  160. for item in value:
  161. if isinstance(item, str): # 添加所有单词作为变量
  162. vars_and_methods['vars'].append(item)
  163. if isinstance(item, ast.AST):
  164. get_all_words(item, line_no, vars_and_methods)
  165. elif isinstance(value, str): # 添加所有单词作为变量
  166. vars_and_methods['vars'].append(value)
  167. elif isinstance(value, ast.AST):
  168. get_all_words(value, line_no, vars_and_methods)
  169. def get_all_vars(node, line_no, var_list):
  170. if node.lineno == line_no:
  171. # recursion
  172. for field, value in ast.iter_fields(node):
  173. if isinstance(value, list):
  174. for item in value:
  175. if isinstance(item, ast.AST):
  176. get_all_vars(item, line_no, var_list)
  177. elif isinstance(value, ast.AST):
  178. get_all_vars(value, line_no, var_list)
# TODO: optimize
  180. class FuncNode:
  181. def __init__(self, func_node, file_path, lattices, script_list=None):
  182. self.func_node = func_node
  183. self.file_path = file_path
  184. self.script_list = script_list
  185. self.private_info = []
  186. self.key_variable = {}
  187. self.func_name = func_node.name
  188. self.lattices = lattices
  189. def get_sentence_nodes(self, node=None, all_nodes=None):
  190. """
  191. Args:
  192. node: ast_node
  193. all_nodes: all suspected sentence node
  194. Returns:
  195. """
  196. if node is None:
  197. node = self.func_node
  198. if all_nodes is None:
  199. all_nodes = []
  200. line_no = node.lineno
  201. data_type = self.lattices["dataType"]
  202. purpose_dict = self.lattices["purpose"]
  203. script_ori, script = get_script(node, self.script_list)
  204. private_word_list = match_data_type(script['vars'], data_type)
  205. # private_word_list = match_data_type(script['vars'], data_type) + match_data_type(script['methods'], purpose_dict)
  206. # 行所调用的方法
  207. for var in script['vars']:
  208. if var in self.key_variable:
  209. private_word_list.extend(self.key_variable[var][0])
  210. private_word_list = list(set(private_word_list))
  211. if len(private_word_list) > 1 and ('None', 'none') in private_word_list:
  212. private_word_list.remove(('None', 'none'))
  213. # print(script['methods'])
  214. purpose = match_purpose_type(script['methods'] + script['vars'], purpose_dict)
  215. # print("2", private_word_list, purpose)
  216. if not (("None", "none") in private_word_list and purpose == ["None"]):
  217. sentence_node = SuspectedSentenceNode(self.file_path, line_no, private_word_list, purpose, self.func_name,
  218. script=script_ori, methods_called=script['methods'])
  219. # print(private_word_list, purpose)
  220. all_nodes.append(sentence_node)
  221. # 考虑数据流,找到赋值语句,将被隐私数据污染的数据保存到private_word_list
  222. # TODO
  223. if isinstance(node, ast.Assign):
  224. # 不能因为找到了 隐私变量就不考虑已定义变量的传递
  225. if not ("None", "none") in private_word_list: # 存在隐私变量 被赋值变量直接添加到key_var
  226. for target in node.targets:
  227. if isinstance(target, ast.Name):
  228. self.key_variable[target.id] = (private_word_list, purpose)
  229. elif isinstance(target, ast.Attribute):
  230. self.key_variable[target.attr] = (private_word_list, purpose)
  231. else:
  232. pass
  233. # 已定义变量的传播
  234. node_params = get_params(node.value)
  235. for node_param in node_params:
  236. if node_param in list(self.key_variable.keys()): # 是否包含传递的变量 (已定义的的变量
  237. private_word_list_inherit, purpose_inherit = self.key_variable[node_param]
  238. if ("None", "none") not in private_word_list_inherit:
  239. sentence_node = SuspectedSentenceNode(self.file_path, line_no,
  240. private_word_list_inherit,
  241. purpose_inherit, self.func_name, script=script_ori,
  242. methods_called=script['methods'])
  243. all_nodes.append(sentence_node)
  244. else:
  245. sentence_node = all_nodes[-1]
  246. sentence_node.purpose.extend(purpose_inherit)
  247. sentence_node.purpose = list(set(all_nodes[-1].purpose))
  248. purpose_inherit = sentence_node.purpose
  249. new_private_info = []
  250. for type in sentence_node.private_word_list:
  251. for purpose_each in sentence_node.purpose:
  252. new_private_info.append((type[0], purpose_each))
  253. sentence_node.private_info = new_private_info
  254. for target in node.targets:
  255. if isinstance(target, ast.Name):
  256. self.key_variable[target.id] = (private_word_list_inherit, purpose_inherit)
  257. elif isinstance(target, ast.Attribute):
  258. self.key_variable[target.attr] = (private_word_list_inherit, purpose_inherit)
  259. elif isinstance(target, ast.Subscript) and isinstance(target.value, ast.Name):
  260. self.key_variable[target.value.id] = (private_word_list_inherit, purpose_inherit)
  261. else:
  262. pass
  263. elif isinstance(node, ast.AugAssign): # +=赋值
  264. if not ("None", "none") in private_word_list: # 存在隐私变量 被赋值变量直接添加到key_var
  265. if isinstance(node.target, ast.Name):
  266. self.key_variable[node.target.id] = (private_word_list, purpose)
  267. elif isinstance(node.target, ast.Attribute):
  268. self.key_variable[node.target.attr] = (private_word_list, purpose)
  269. else:
  270. pass
  271. # 已定义变量的传播
  272. node_params = get_params(node.value)
  273. for node_param in node_params:
  274. if node_param in list(self.key_variable.keys()): # 是否包含传递的变量 (已定义的的变量
  275. private_word_list_inherit, purpose_inherit = self.key_variable[node_param]
  276. if ("None", "none") not in private_word_list_inherit:
  277. sentence_node = SuspectedSentenceNode(self.file_path, line_no,
  278. private_word_list_inherit,
  279. purpose_inherit, self.func_name, script=script_ori,
  280. methods_called=script['methods'])
  281. all_nodes.append(sentence_node)
  282. else:
  283. sentence_node = all_nodes[-1]
  284. sentence_node.purpose.extend(purpose_inherit)
  285. sentence_node.purpose = list(set(all_nodes[-1].purpose))
  286. purpose_inherit = sentence_node.purpose
  287. new_private_info = []
  288. for type in sentence_node.private_word_list:
  289. for purpose_each in sentence_node.purpose:
  290. new_private_info.append((type[0], purpose_each))
  291. sentence_node.private_info = new_private_info
  292. if isinstance(node.target, ast.Name):
  293. self.key_variable[node.target.id] = (private_word_list_inherit, purpose_inherit)
  294. elif isinstance(node.target, ast.Attribute):
  295. self.key_variable[node.target.attr] = (private_word_list_inherit, purpose_inherit)
  296. elif isinstance(node.target, ast.Subscript) and isinstance(node.target.value, ast.Name):
  297. self.key_variable[node.target.value.id] = (private_word_list_inherit, purpose_inherit)
  298. else:
  299. pass
  300. # 考虑 数据关系 对象user 展示了user.username 说明变量user 被传递 隐私数据username
  301. elif isinstance(node, ast.Expr):
  302. node_params = get_params(node.value)
  303. if len(node_params) > 0:
  304. for node_param in node_params:
  305. if node_param in list(self.key_variable.keys()) and len(private_word_list) == 0:
  306. private_word_list_inherit, purpose_inherit = self.key_variable[node_param]
  307. sentence_node = SuspectedSentenceNode(self.file_path, line_no,
  308. private_word_list_inherit,
  309. purpose_inherit, self.func_name, script=script_ori,
  310. methods_called=script['methods'])
  311. all_nodes.append(sentence_node)
  312. # print(self.file_path, line_no, private_word_list, purpose)
  313. elif isinstance(node, ast.ImportFrom):
  314. for alias in node.names:
  315. self.key_variable[alias.name] = (private_word_list, purpose)
  316. for private_word in private_word_list:
  317. if not (private_word[0] == "None" and purpose[0] == "None") and private_word[0] not in [info[0] for info in
  318. self.private_info]:
  319. for p in purpose:
  320. self.private_info.append((private_word[0], p))
  321. node_son_list = []
  322. for field, value in ast.iter_fields(node):
  323. if field == "body" or field == "orelse":
  324. node_son_list.append(value)
  325. if len(node_son_list) > 0:
  326. for field in node_son_list:
  327. for node_son in field:
  328. all_nodes = self.get_sentence_nodes(node_son, all_nodes)
  329. return all_nodes
  330. def __str__(self):
  331. return self.script_list[0]
  332. if __name__ == '__main__':
  333. file = open("D:\\study\\python\\cmdb-python-master\\test.py", encoding='utf-8')
  334. lines = file.readlines()
  335. string = ""
  336. for line in lines:
  337. string += line
  338. node = ast.parse(string)
  339. words_list = {'methods': [], 'vars': []}
  340. get_all_words(node.body[1].body[0], 5, words_list)
  341. print(words_list)