12-4-HMM-recognise-XSS.py

import os
import re
from urllib.parse import unquote, parse_qsl

import joblib
import nltk
import numpy as np
from hmmlearn import hmm

# "Find black with black": the model is trained on malicious (XSS) samples only,
# so inputs that score HIGH under it are flagged as XSS.
# Minimum length of a parameter value worth processing
MIN_LEN = 6
# Number of hidden states
N = 10
# Log-likelihood threshold: scores at or above T are reported as XSS
T = -200
# Data and feature extraction. Instead of single-character (char) features,
# this step featurizes on specific phrases as the basic unit, chosen from
# domain experience; it is a token-splitting scheme. Sample payloads the
# tokenizer must handle:
# </script><script>alert(String.fromCharCode(88,83,83))</script>
# <IMG SRC=x onchange="alert(String.fromCharCode(88,83,83))">
# <;IFRAME SRC=http://ha.ckers.org/scriptlet.html <;
# ';alert(String.fromCharCode(88,83,83))//\';alert(String.fromCharCode(88,83,83))//";alert(String.fromCharCode(88,83,83))
# //\";alert(String.fromCharCode(88,83,83))//--></SCRIPT>">'><SCRIPT>alert(String.fromCharCode(88,83,83))</SCRIPT>
tokens_pattern = r'''(?x)
"[^"]+"        # quoted string, e.g. "xxxx"
|http://\S+    # URL, e.g. http://xxxx
|</\w+>        # closing tag, e.g. </xxx>
|<\w+>         # complete opening tag, e.g. <xxx>
|<\w+          # opening tag without '>', e.g. <xxxx
|\w+=          # attribute name, e.g. xxxx=
|>             # a lone '>'
|\w+\([^<]+\)  # function call, e.g. alert(String.fromCharCode(88,83,83))
|\w+           # bare word
'''
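# A quick sanity check of the pattern (illustrative inputs, results worked out
# by hand rather than taken from the original text):
#   nltk.regexp_tokenize("<script>alert(8)</script>", tokens_pattern)
#   -> ['<script>', 'alert(8)', '</script>']
#   nltk.regexp_tokenize("';alert(String.fromCharCode(88,83,83))//", tokens_pattern)
#   -> ['alert(String.fromCharCode(88,83,83))']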
# Exclude multi-byte (e.g. Chinese) noise: only accept printable characters below 127
def ischeck(str1):
    for c in str1:
        if ord(c) > 127 or ord(c) < 31:
            return False
    return True
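# For example (hypothetical inputs): ischeck("alert(8)") is True, while a value
# containing multi-byte characters such as "警告" fails the ord(c) > 127 test
# and returns False.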
# Data preprocessing: URL-decode, keep only the parameter values, normalize
# constants, then split into phrase-level tokens
def preprocessing(str1):
    result = []
    line = str1.strip('\n')
    line = unquote(line)  # URL decoding
    if len(line) >= MIN_LEN:  # ignore short URL values
        # only process the query parameters
        params = parse_qsl(line, keep_blank_values=True)
        for k, v in params:
            # value must be ASCII and at least N characters long
            if ischeck(v) and len(v) >= N:
                v, _ = re.subn(r'\d+', "8", v)  # replace numeric constants with 8
                v, _ = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:=]+', "http://u", v)  # collapse URLs to http://u
                v, _ = re.subn(r'/\*.*?\*/', "", v)  # strip /* ... */ comments (non-greedy)
                tokens = nltk.regexp_tokenize(v, tokens_pattern)  # token splitting
                result += tokens
    if result:
        return result
    return False
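# Illustrative example (values worked out by hand, not from the original text):
#   preprocessing("id=1&q=<script>alert(123)</script>")
#   -> ['<script>', 'alert(8)', '</script>']
# "id=1" is dropped because its value is shorter than N characters, and the
# digits in the surviving value are normalized to 8 before tokenization.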
# Build the vocabulary ("word bag") from the corpus
def load_wordbag(filename, max_size=100):
    tokens_list = []
    index_wordbag = 1  # next vocabulary index
    wordbag = {}  # token -> index
    with open(filename) as f:
        for line in f:
            tokens = preprocessing(line)
            if tokens:
                tokens_list += tokens
    fredist = nltk.FreqDist(tokens_list)  # token frequencies over the whole file
    # Dimensionality reduction: keep only the max_size most frequent tokens;
    # everything else is collapsed to the single out-of-vocabulary code (-1)
    # during encoding. Note that FreqDist.keys() is not frequency-ordered in
    # modern NLTK, so most_common() is used to get the actual top tokens.
    keys = [token for token, _ in fredist.most_common(max_size)]
    for localkey in keys:
        if localkey not in wordbag:  # skip tokens already in the bag
            wordbag[localkey] = index_wordbag
            index_wordbag += 1
    return wordbag
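# The resulting wordbag is a plain dict mapping token -> integer index, e.g.
# (hypothetical contents): {'<script>': 1, 'alert(8)': 2, '</script>': 3, ...}
# Indices start at 1; -1 is reserved for out-of-vocabulary tokens.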
# Training: encode every sample as a sequence of vocabulary indices and fit
# one HMM over the concatenated sequences
def train(filename, wordbag):
    x = [[-1]]  # dummy seed observation, counted as its own length-1 sequence
    x_lens = [1]
    with open(filename) as f:
        for line in f:
            words = preprocessing(line)
            if words:
                vers = []
                for word in words:
                    # encode via the vocabulary; out-of-vocabulary tokens get -1
                    if word in wordbag:
                        vers.append([wordbag[word]])
                    else:
                        vers.append([-1])
                np_vers = np.array(vers)
                x = np.concatenate([x, np_vers])
                x_lens.append(len(np_vers))
    ghmm = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    ghmm.fit(x, x_lens)
    os.makedirs("export/model", exist_ok=True)  # make sure the dump directory exists
    joblib.dump(ghmm, "export/model/hmm-xss-train_2.pkl")
    return ghmm
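# hmmlearn's fit() takes one 2-D array holding all sequences stacked row-wise,
# plus a lengths list marking where each sequence ends. A minimal toy run with
# hypothetical data, using the same call shape as above:
#   seqs = np.array([[1], [2], [3], [1], [2]])  # two sequences: [1,2,3] and [1,2]
#   hmm.GaussianHMM(n_components=2, n_iter=10).fit(seqs, [3, 2])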
# Testing: score each sample under the trained model
def test(filename, wordbag):
    # load the model back from the pkl
    ghmm = joblib.load("export/model/hmm-xss-train_2.pkl")
    with open(filename) as f:
        for line in f:
            words = preprocessing(line)
            if words:
                vers = []
                for word in words:
                    # encode exactly the same way as in train()
                    if word in wordbag:
                        vers.append([wordbag[word]])
                    else:
                        vers.append([-1])
                np_vers = np.array(vers)
                pro = ghmm.score(np_vers)  # log-likelihood of the sequence
                if pro >= T:
                    print("SCORE:(%f) XSS_URL: %s " % (pro, line))
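# Because the model is trained on malicious samples only, a HIGH log-likelihood
# means "looks like XSS". Example output line (hypothetical values):
#   SCORE:(-137.524210) XSS_URL: q=<script>alert(8)</script>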
def main():
    xss = "data/XSS/xss-200000.txt"
    # build the frequency-based vocabulary (token -> index)
    wordbag = load_wordbag(xss, 2000)
    # "find black with black": train the HMM on the XSS corpus and save it
    train(xss, wordbag)
    # score the test samples against the trained model
    test('data/XSS/test-sample.txt', wordbag)

if __name__ == '__main__':
    main()