7-6-NaiveBayesian-detect-DGA-domain.py 849 B

123456789101112131415161718192021222324252627282930
  1. from sklearn.naive_bayes import GaussianNB
  2. from sklearn.model_selection import cross_val_score, train_test_split
  3. from sklearn.feature_extraction.text import CountVectorizer
  4. from datasets import Datasets
  5. import numpy as np
# Extract features: vectorize the input using 2-grams
  7. def get_feature(x):
  8. cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", token_pattern=r"\w", min_df=1)
  9. x = cv.fit_transform(x).toarray()
  10. return x
  11. def main():
  12. x, y = Datasets.load_dga_domain()
  13. x = get_feature(x)
  14. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
  15. gnb = GaussianNB()
  16. gnb.fit(x_train, y_train)
  17. print(gnb.score(x_test, y_test)) # 0.9422222222222222
  18. scores = cross_val_score(gnb, x, y, cv=3, scoring="accuracy")
  19. print(scores.mean()) # 0.9356666666666666
  20. if __name__ == "__main__":
  21. main()