123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
- from sklearn.svm import SVC
- from datasets import Datasets
- from sklearn.preprocessing import StandardScaler
- from sklearn.model_selection import train_test_split, cross_val_score
- import re
def url_len(url):
    """Return the number of characters in *url* (feature: URL length)."""
    return len(url)
def url_has_domain(url):
    """Return 1 if *url* contains an explicit http:// or https:// scheme, else 0.

    The match is case-insensitive, so e.g. "HTTP://" also counts.
    """
    match = re.search('(http://)|(https://)', url, re.IGNORECASE)
    return int(match is not None)
def evil_str_count(url):
    """Count suspicious characters in *url*.

    The character class covers <, >, comma, single/double quotes and slash —
    characters commonly used to break out of HTML/JS contexts.
    """
    hits = re.findall("[<>,\'\"/]", url, re.IGNORECASE)
    return len(hits)
def evil_keywords_count(url):
    """Count occurrences of blacklisted XSS keywords in *url* (case-insensitive).

    Bug fix: the original pattern was missing a '|' between '(script=)' and
    '(%3c)', fusing them into a single alternative that only matched the
    literal sequence 'script=%3c' — so neither 'script=' nor '%3c' on its
    own was ever counted. Each keyword is now an independent alternative.
    """
    blacklist = "(alert)|(script=)|(%3c)|(%3e)|(%20)|(onerror)|(onload)|(eval)|(src=)|(prompt)"
    # findall returns one entry per match (tuples, since the pattern has
    # groups); len() therefore counts total keyword occurrences.
    return len(re.findall(blacklist, url, re.IGNORECASE))
def get_feature(url):
    """Build the 4-element numeric feature vector for *url*:
    [length, has-scheme flag, suspicious-char count, blacklisted-keyword count].
    """
    extractors = (url_len, url_has_domain, evil_str_count, evil_keywords_count)
    return [extract(url) for extract in extractors]
def main():
    """Train and evaluate a linear SVM on hand-crafted XSS-URL features.

    Loads the XSS dataset, extracts a 4-feature vector per URL, standardizes
    the features, then prints a hold-out accuracy followed by the mean
    10-fold cross-validation accuracy.
    """
    data, y = Datasets.load_xss()
    # One fixed-length feature vector per URL.
    x = [get_feature(url) for url in data]

    # Standardize so the raw URL-length feature does not dominate the SVM.
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)

    clf = SVC(kernel='linear')
    clf.fit(x_train, y_train)
    print(clf.score(x_test, y_test))

    # 10-fold cross-validation over the full dataset for a steadier estimate.
    scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
    print(scores.mean())


if __name__ == "__main__":
    main()
|