# LiuFan / PrivacyScanData
							from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from datasets import Datasets
import numpy as np


# Feature extraction: vectorize the input as 2-grams.
def get_feature(x):
    """Convert raw text samples into a dense matrix of 2-gram counts.

    The token pattern ``\\w`` matches one word character at a time, so each
    token is a single character and every 2-gram is a pair of consecutive
    word characters. The vocabulary is learned from ``x`` itself.

    Args:
        x: Iterable of text samples accepted by ``CountVectorizer``.

    Returns:
        A dense array of 2-gram counts with one row per sample, produced by
        calling ``toarray()`` on the fitted vectorizer's output.
    """
    vectorizer = CountVectorizer(
        min_df=1,
        token_pattern=r"\w",
        decode_error="ignore",
        ngram_range=(2, 2),
    )
    # fit_transform learns the vocabulary and counts in one pass; the result
    # is densified before being handed back to the caller.
    counts = vectorizer.fit_transform(x)
    return counts.toarray()


def main():
    """Train and evaluate a Gaussian Naive Bayes model on the DGA dataset.

    Loads samples and labels via ``Datasets.load_dga_domain()``, vectorizes
    the samples with ``get_feature``, and prints two accuracy figures:

    1. the score on a 30% hold-out split, and
    2. the mean accuracy over 3-fold cross-validation on the full data.
    """
    samples, labels = Datasets.load_dga_domain()
    features = get_feature(samples)

    # No random_state is passed, so the split (and the hold-out score) can
    # differ between runs.
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.3
    )

    classifier = GaussianNB()
    classifier.fit(train_x, train_y)
    holdout_accuracy = classifier.score(test_x, test_y)
    print(holdout_accuracy)  # value noted in the original source: 0.9422222222222222

    fold_accuracies = cross_val_score(
        classifier, features, labels, cv=3, scoring="accuracy"
    )
    print(fold_accuracies.mean())  # value noted in the original source: 0.9356666666666666


# Run the training/evaluation pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()