from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from datasets import Datasets
import numpy as np


# Data preprocessing: vectorize both corpora as 2-gram counts.
def to_voc(webshell, wordpress):
    """Turn the two corpora into one dense bigram-count matrix plus labels.

    A ``CountVectorizer`` is fitted on ``webshell`` first; its learned
    vocabulary is then handed to a second vectorizer used for
    ``wordpress``, so both matrices share the same columns. Bigrams that
    occur only in ``wordpress`` are therefore not counted.

    Args:
        webshell: Documents passed to ``CountVectorizer.fit_transform``;
            every row derived from them is labelled ``1``.
        wordpress: Documents vectorized against the webshell vocabulary;
            every row derived from them is labelled ``0``.

    Returns:
        A tuple ``(x, y)``: ``x`` stacks the webshell rows followed by the
        wordpress rows, and ``y`` holds the matching labels in that order.
    """
    # Options common to both vectorizers: word bigrams only, undecodable
    # bytes skipped, and a token pattern that also accepts one-character
    # tokens.
    shared_opts = dict(ngram_range=(2, 2), decode_error="ignore", token_pattern=r'\b\w+\b', min_df=1)

    shell_vectorizer = CountVectorizer(**shared_opts)
    shell_counts = shell_vectorizer.fit_transform(webshell).toarray()

    # Reuse the webshell vocabulary so the column layout is identical.
    benign_vectorizer = CountVectorizer(vocabulary=shell_vectorizer.vocabulary_, **shared_opts)
    benign_counts = benign_vectorizer.fit_transform(wordpress).toarray()

    shell_labels = [1] * len(shell_counts)
    benign_labels = [0] * len(benign_counts)

    x = np.concatenate((shell_counts, benign_counts))
    y = np.concatenate((shell_labels, benign_labels))
    return x, y


def main():
    """Train a Gaussian Naive Bayes classifier and print two accuracy figures.

    Loads the two corpora via ``Datasets.load_php_webshell``, vectorizes
    them with ``to_voc``, then prints (1) the accuracy on a 30% hold-out
    split and (2) the mean accuracy of a 3-fold cross-validation over the
    full data set.
    """
    shell_docs, benign_docs = Datasets.load_php_webshell()
    features, labels = to_voc(shell_docs, benign_docs)

    # No random_state is given, so the split differs from run to run.
    split = train_test_split(features, labels, test_size=0.3)
    train_x, test_x, train_y, test_y = split

    classifier = GaussianNB()
    classifier.fit(train_x, train_y)
    holdout_accuracy = classifier.score(test_x, test_y)
    print(holdout_accuracy)  # previously observed: 0.7659574468085106

    fold_accuracies = cross_val_score(classifier, features, labels, cv=3, scoring="accuracy")
    mean_accuracy = fold_accuracies.mean()
    print(mean_accuracy)  # previously observed: 0.7872046254399195


# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()