#!/usr/bin/python
# coding=utf-8
"""Benchmark classical classifiers (KNN, SVM, Gaussian NB, BP neural net)
on a pre-tokenized dataset, logging accuracy / macro-recall / macro-F1 /
macro-precision averaged over several random train/test splits."""
import os
import logging

import numpy as np
import sklearn
import torch
from sklearn import metrics  # FIX: sklearn.metrics was used without an explicit import
from sklearn import svm
from sklearn.model_selection import train_test_split  # split into train/test sets
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Log everything to a file. NOTE(review): 'constract.log' looks like a typo for
# 'contrast.log', but the path is runtime behavior and is preserved as-is.
fileName = './constract.log'
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
                              datefmt='%m/%d/%Y %H:%M:%S')
handler = logging.FileHandler(filename=fileName, encoding="utf-8")
handler.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[handler])

# Paths resolved relative to this file so the bases work from any cwd.
parent_path = os.path.dirname(os.path.realpath(__file__))
grander_path = os.path.dirname(parent_path)
word_list_data_path_base = parent_path + "/word_list_data/"
word2index_path_base = grander_path + "/word2index/"
data_path = './word_list_data/'
dataset_name = "决赛自主可控众测web自主可控运维管理系统"
max_len = 64        # fixed feature-vector length per text
vocab_size = 5000
embedding_size = 64
batch_size = 16
random_state = 15   # number of random train/test splits to average over


def _fit_and_score(classifier, name, train_data, train_label, test_data, test_label):
    """Fit *classifier*, log its test-set metrics under *name*, and return
    the tuple (accuracy, macro-recall, macro-F1, macro-precision)."""
    classifier.fit(train_data, train_label)
    predicted = classifier.predict(test_data)
    acc = classifier.score(test_data, test_label)
    recall = metrics.recall_score(test_label, predicted, average="macro")
    f1 = metrics.f1_score(test_label, predicted, average="macro")
    precision = metrics.precision_score(test_label, predicted, average="macro")
    logging.info("{}Acc为{}".format(name, acc))
    logging.info("{}召回率为{}".format(name, recall))
    logging.info("{}f1_score为{}".format(name, f1))
    return acc, recall, f1, precision


def contrast():
    """Compare four classifiers on the dataset over `random_state` random
    splits; log per-split metrics, each classifier's lowest accuracy, and
    the metric averages."""
    logging.info("正在加载初始数据")
    direc = "./splited_data/"
    txts = np.load(direc + str(dataset_name) + "_train.npy", allow_pickle=True).tolist()
    labels = np.load(direc + str(dataset_name) + "_label_train.npy", allow_pickle=True).tolist()

    # Collapse (assumed) one-hot label vectors to class indices:
    # sum(i * onehot[i]) equals the argmax for a one-hot vector.
    labels_new = np.array([sum(i * v for i, v in enumerate(label)) for label in labels])

    logging.info("正在加载词表")
    word2index_path = word2index_path_base + str(dataset_name) + ".npy"
    word2index = np.load(word2index_path, allow_pickle=True).item()
    features = [text_to_feature(txt, word2index, max_len) for txt in txts]

    knn_name = "k最近邻分类器"
    svm_name = "SVM分类器"
    nb_name = "朴素贝叶斯分类器"
    bpnn_name = "bpnn分类器"
    names = [knn_name, svm_name, nb_name, bpnn_name]
    lowest = {name: 100 for name in names}        # lowest accuracy seen per classifier
    totals = {name: [0.0] * 4 for name in names}  # summed [acc, recall, f1, precision]

    for i in range(random_state):
        train_data, test_data, train_label, test_label = train_test_split(
            features, labels_new, random_state=i, train_size=0.6, test_size=0.2)

        # A fresh estimator per split. The "training..." message for Naive
        # Bayes was commented out in the original, so it stays None here.
        runs = [
            (knn_name, KNeighborsClassifier(), "正在训练k最近邻分类器"),
            (svm_name,
             svm.SVC(C=2, kernel='rbf', gamma=10, decision_function_shape='ovr'),
             "正在训练SVM分类器"),
            (nb_name, GaussianNB(), None),
            (bpnn_name,
             MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[10, 10]),
             "正在训练bpnn分类器"),
        ]
        for name, clf, start_msg in runs:
            if start_msg is not None:
                logging.info(start_msg)
            scores = _fit_and_score(clf, name, train_data, train_label,
                                    test_data, test_label)
            if scores[0] < lowest[name]:
                lowest[name] = scores[0]
            for j in range(4):
                totals[name][j] += scores[j]

    logging.info("数据集 " + dataset_name + " 结果:")
    for name in (knn_name, svm_name, nb_name):
        logging.info("{}最低准确率为{}".format(name, lowest[name]))
    # FIX: the bpnn lowest accuracy was tracked but never reported.
    logging.info("{}最低准确率为{}".format(bpnn_name, lowest[bpnn_name]))

    metric_labels = ["平均Acc", "平均召回率", "平均f1_score", "平均precision"]
    # Same report order as the original: knn/svm/nb grouped by metric,
    # then all four bpnn averages at the end.
    for j, label in enumerate(metric_labels):
        for name in (knn_name, svm_name, nb_name):
            logging.info("{}{}为{}".format(name, label, totals[name][j] / random_state))
    for j, label in enumerate(metric_labels):
        logging.info("{}{}为{}".format(bpnn_name, label, totals[bpnn_name][j] / random_state))


def text_to_feature(text, word2index, max_len):
    """Convert a token list into a fixed-length list of vocabulary indices.

    Tokens missing from *word2index* map to word2index[""], and the result is
    padded with that same index up to *max_len* (assumes "" is the pad/OOV
    entry of the vocabulary — TODO confirm against the word2index builder).
    Texts longer than *max_len* are truncated.
    """
    feature = []
    for word in text:
        feature.append(word2index[word] if word in word2index else word2index[""])
        if len(feature) == max_len:
            break
    return feature + [word2index[""]] * (max_len - len(feature))


def calculate_bi_standards(name):
    """Placeholder: load a saved torch model for later metric computation.

    NOTE(review): torch.load unpickles arbitrary objects — only call this on
    trusted files. The loaded model is currently unused; the body is a stub.
    """
    model = torch.load(name)


if __name__ == "__main__":
    contrast()