import os
import re
from urllib.parse import unquote, parse_qsl

import joblib
import nltk
import numpy as np
from hmmlearn import hmm
MIN_LEN = 6   # ignore lines shorter than this after URL-decoding
N = 10        # number of hidden HMM states; also reused as the minimum parameter-value length
T = -200      # log-likelihood threshold: scores at or above T are flagged as XSS
tokens_pattern = r'''(?x)
 "[^"]+"          # quoted string, e.g. "xxxx"
|http://\S+       # URL, e.g. http://xxxx
|</\w+>           # closing tag, e.g. </xxx>
|<\w+>            # opening tag, e.g. <xxx>
|<\w+             # unclosed opening tag, e.g. <xxxx
|\w+=             # attribute name, e.g. xxxx=
|>                # lone closing bracket
|\w+\([^<]+\)     # function call, e.g. alert(String.fromCharCode(88,83,83))
|\w+              # bare word
'''
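# Illustrative only: applying the pattern above to a decoded payload (after the
# digit normalisation done in preprocessing()) produces tokens such as
#   nltk.regexp_tokenize('<script>alert(8)</script>', tokens_pattern)
#   -> ['<script>', 'alert(8)', '</script>']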
def ischeck(str1):
    # Keep only payloads made of (roughly) printable ASCII characters.
    for c in str1:
        if ord(c) > 127 or ord(c) < 31:
            return False
    return True
def preprocessing(str1):
    # URL-decode a line, split it into query parameters, normalise each
    # parameter value, and tokenize it with tokens_pattern.
    result = []
    line = str1.strip('\n')
    line = unquote(line)
    if len(line) >= MIN_LEN:
        params = parse_qsl(line, keep_blank_values=True)
        for key, value in params:
            if ischeck(value) and len(value) >= N:
                # Normalise: collapse digit runs to "8", URLs to "http://u",
                # and strip /* ... */ comments.
                value = re.sub(r'\d+', "8", value)
                value = re.sub(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:=]+', "http://u", value)
                value = re.sub(r'/\*.*?\*/', "", value)
                tokens = nltk.regexp_tokenize(value, tokens_pattern)
                result += tokens
    if result:
        return result
    return False
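# Illustrative only: a GET-style parameter carrying a script injection,
#   preprocessing("param=<script>alert(123)</script>")
#   -> ['<script>', 'alert(8)', '</script>']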
def load_wordbag(filename, max_size=100):
    # Build a token -> integer-id vocabulary from the max_size most
    # frequent tokens in the training file.
    tokens_list = []
    index_wordbag = 1
    wordbag = {}
    with open(filename) as f:
        for line in f:
            tokens = preprocessing(line)
            if tokens:
                tokens_list += tokens
    fredist = nltk.FreqDist(tokens_list)
    # FreqDist.keys() is not frequency-ordered in current NLTK releases,
    # so take the top tokens explicitly.
    for localkey, _ in fredist.most_common(max_size):
        if localkey not in wordbag:
            wordbag[localkey] = index_wordbag
            index_wordbag += 1
    return wordbag
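# Illustrative only: the resulting vocabulary maps frequent tokens to ids,
# e.g. {'<script>': 1, 'alert(8)': 2, '</script>': 3, ...}; tokens outside
# the vocabulary are encoded as -1 below.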
def train(filename, wordbag):
    # Encode every training sample as a sequence of vocabulary ids and fit
    # a Gaussian HMM on the concatenated sequences.
    x = [[-1]]      # dummy seed observation so np.concatenate has a base
    x_lens = [1]
    with open(filename) as f:
        for line in f:
            words = preprocessing(line)
            if words:
                vers = []
                for word in words:
                    # Map known tokens to their id, everything else to -1.
                    if word in wordbag:
                        vers.append([wordbag[word]])
                    else:
                        vers.append([-1])
                np_vers = np.array(vers)
                x = np.concatenate([x, np_vers])
                x_lens.append(len(np_vers))
    ghmm = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    ghmm.fit(x, x_lens)
    os.makedirs("export/model", exist_ok=True)  # ensure the output directory exists
    joblib.dump(ghmm, "export/model/hmm-xss-train_2.pkl")
    return ghmm
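# Note: ghmm.score() returns the log-likelihood of a sequence under the
# XSS-trained model; sequences the model explains well score high (closer
# to 0), so test() below flags everything scoring at or above T.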
def test(filename, wordbag):
    # Score each test sample against the trained model and report likely XSS.
    ghmm = joblib.load("export/model/hmm-xss-train_2.pkl")
    with open(filename) as f:
        for line in f:
            words = preprocessing(line)
            if words:
                vers = []
                for word in words:
                    if word in wordbag:
                        vers.append([wordbag[word]])
                    else:
                        vers.append([-1])
                np_vers = np.array(vers)
                pro = ghmm.score(np_vers)
                if pro >= T:
                    # score() returns a float log-likelihood, so print it as one
                    print("SCORE:(%f) XSS_URL: %s" % (pro, line.strip()))
def main():
    xss = "data/XSS/xss-200000.txt"
    # Build the vocabulary, train on the XSS corpus, then scan a test file.
    wordbag = load_wordbag(xss, 2000)
    train(xss, wordbag)
    test('data/XSS/test-sample.txt', wordbag)

if __name__ == '__main__':
    main()