123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
- from sklearn.svm import SVC
- from datasets import Datasets
- from sklearn.preprocessing import StandardScaler
- from sklearn.model_selection import train_test_split, cross_val_score
- import re
# Features extracted: URL length, presence of a third-party domain,
# count of sensitive characters, count of sensitive keywords.

def url_len(url):
    """Feature: total number of characters in the URL."""
    return len(url)
- # url是否包含第三方域名
- def url_has_domain(url):
- return 1 if re.search('(http://)|(https://)', url, re.IGNORECASE) else 0
def evil_str_count(url):
    """Feature: how many sensitive characters (< > , ' " /) appear in the URL."""
    sensitive_chars = r"""[<>,'"/]"""
    return len(re.findall(sensitive_chars, url, re.IGNORECASE))
def evil_keywords_count(url):
    """Feature: number of blacklisted XSS keyword hits (case-insensitive).

    Bug fix: the original pattern was missing a '|' between '(script=)'
    and '(%3c)', i.e. "(alert)|(script=)(%3c)|...", so 'script=' only
    matched when immediately followed by '%3c' and '%3c' never matched
    on its own. Each alternative now matches independently; capturing
    groups were dropped since only the match count is used.
    """
    blacklist = r"alert|script=|%3c|%3e|%20|onerror|onload|eval|src=|prompt"
    return len(re.findall(blacklist, url, re.IGNORECASE))
def get_feature(url):
    """Build the 4-element feature vector for a single URL.

    Order: [length, has-third-party-domain, sensitive-char count,
    sensitive-keyword count].
    """
    extractors = (url_len, url_has_domain, evil_str_count, evil_keywords_count)
    return [extract(url) for extract in extractors]
def main():
    """Train a linear-SVM XSS classifier on URL features and print accuracy."""
    urls, labels = Datasets.load_xss()
    # One feature vector per URL.
    features = [get_feature(u) for u in urls]
    # Standardize to zero mean / unit variance before the SVM.
    features = StandardScaler().fit_transform(features)
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=.3)
    # Fit a linear-kernel SVM and report hold-out accuracy.
    model = SVC(kernel='linear')
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))
    # 10-fold cross-validation — noticeably slow.
    fold_scores = cross_val_score(model, features, labels, cv=10,
                                  scoring='accuracy')
    print(fold_scores.mean())


if __name__ == "__main__":
    main()
|