7-4-NaiveBayesian-detect-WEBSHELL.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. from sklearn.naive_bayes import GaussianNB
  2. from sklearn.model_selection import cross_val_score, train_test_split
  3. from sklearn.feature_extraction.text import CountVectorizer
  4. from datasets import Datasets
  5. import numpy as np
  6. # 数据预处理 向量化 以2-gram
  7. def to_voc(webshell, wordpress):
  8. webshell_voc = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", token_pattern=r'\b\w+\b', min_df=1)
  9. x1 = webshell_voc.fit_transform(webshell).toarray()
  10. y1 = [1] * len(x1)
  11. wordpress_voc = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", token_pattern=r'\b\w+\b', min_df=1,
  12. vocabulary=webshell_voc.vocabulary_)
  13. x2 = wordpress_voc.fit_transform(wordpress).toarray()
  14. y2 = [0] * len(x2)
  15. x = np.concatenate((x1, x2))
  16. y = np.concatenate((y1, y2))
  17. return x, y
  18. def main():
  19. webshell, wordpress = Datasets.load_php_webshell()
  20. x, y = to_voc(webshell, wordpress)
  21. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
  22. gnb = GaussianNB()
  23. gnb.fit(x_train, y_train)
  24. print(gnb.score(x_test, y_test)) # 0.7659574468085106
  25. scores = cross_val_score(gnb, x, y, cv=3, scoring="accuracy")
  26. print(scores.mean()) # 0.7872046254399195
# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()