123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
import logging
import os

import numpy as np
import sklearn
import torch
from sklearn import svm
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# --- file/directory layout --------------------------------------------------
parent_path = os.path.dirname(os.path.realpath(__file__))
grander_path = os.path.dirname(parent_path)
word_list_data_path_base = parent_path + "/word_list_data/"
word2index_path_base = grander_path + "/word2index/"

# --- experiment configuration -----------------------------------------------
dataset_name = "航天中认自主可控众包测试练习赛"
max_len = 64          # feature-vector length per text
vocab_size = 5000
embedding_size = 64
batch_size = 16
random_state = 15     # number of repeated train/test splits in contrast()

# --- logging: everything goes to a UTF-8 log file ----------------------------
fileName = './constract.log'  # NOTE(review): likely a typo for "contrast.log" — kept as-is (runtime path)
formatter = logging.Formatter(
    '%(asctime)s [%(levelname)s] %(module)s: %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)
handler = logging.FileHandler(filename=fileName, encoding="utf-8")
handler.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[handler])
def _evaluate_classifier(classifier, train_data, train_label, test_data, test_label, name):
    """Fit *classifier* on the training split and evaluate it on the test split.

    Logs the training start, accuracy, macro recall and macro f1 (messages
    match the original Chinese log strings), and returns the metrics.

    Returns:
        (accuracy, recall, f1) tuple of floats.
    """
    logging.info("正在训练{}".format(name))
    classifier.fit(train_data, train_label)
    predicted = classifier.predict(test_data)
    recall = recall_score(test_label, predicted, average="macro")
    f1 = f1_score(test_label, predicted, average="macro")
    accuracy = classifier.score(test_data, test_label)
    logging.info("{}准确率为{}".format(name, accuracy))
    logging.info("{}召回率为{}".format(name, recall))
    logging.info("{}f1_score为{}".format(name, f1))
    return accuracy, recall, f1


def contrast():
    """Compare KNN, SVM and Gaussian naive Bayes baselines on the dataset.

    Loads pre-tokenized texts and label vectors from .npy files, converts each
    text to a fixed-length index feature, then repeats a 20/80 train/test
    split over `random_state` different seeds. Per run and in aggregate
    (lowest / average), logs accuracy, macro recall and macro f1 for each
    classifier.
    """
    logging.info("正在加载初始数据")
    txts = np.load(word_list_data_path_base + str(dataset_name) + ".npy", allow_pickle=True)
    labels = np.load(word_list_data_path_base + str(dataset_name) + "_label.npy", allow_pickle=True)
    # Collapse each label vector to a single class id (sum of index * weight;
    # for one-hot vectors this is exactly the hot index).
    labels_new = np.array([sum(i * v for i, v in enumerate(label)) for label in labels])

    logging.info("正在加载词表")
    word2index_path = word2index_path_base + str(dataset_name) + ".npy"
    word2index = np.load(word2index_path, allow_pickle=True).item()
    features = [text_to_feature(txt, word2index, max_len) for txt in txts]

    knn_name = "k最近邻分类器"
    svm_name = "SVM分类器"
    nb_name = "朴素贝叶斯分类器"
    names = (knn_name, svm_name, nb_name)
    lowest = {name: 100 for name in names}             # sentinel above any accuracy in [0, 1]
    sums = {name: [0.0, 0.0, 0.0] for name in names}   # running [accuracy, recall, f1] totals

    for seed in range(random_state):
        # NOTE(review): train_size=0.2 / test_size=0.8 (few-shot-style split)
        # is kept exactly as in the original experiment setup.
        train_data, test_data, train_label, test_label = train_test_split(
            features, labels_new, random_state=seed, train_size=0.2, test_size=0.8
        )
        runs = (
            (KNeighborsClassifier(), knn_name),
            (svm.SVC(C=2, kernel='rbf', gamma=10, decision_function_shape='ovr'), svm_name),
            (GaussianNB(), nb_name),
        )
        for classifier, name in runs:
            accuracy, recall, f1 = _evaluate_classifier(
                classifier, train_data, train_label, test_data, test_label, name
            )
            lowest[name] = min(lowest[name], accuracy)
            sums[name][0] += accuracy
            sums[name][1] += recall
            sums[name][2] += f1

    for name in names:
        logging.info("{}最低准确率为{}".format(name, lowest[name]))
    for name in names:
        logging.info("{}平均准确率为{}".format(name, sums[name][0] / random_state))
    for name in names:
        logging.info("{}平均召回率为{}".format(name, sums[name][1] / random_state))
    for name in names:
        logging.info("{}平均f1_score为{}".format(name, sums[name][2] / random_state))
def text_to_feature(text, word2index, max_len):
    """Convert a token sequence into a fixed-length vector of vocabulary ids.

    Tokens missing from *word2index* map to the "<unk>" id. The sequence is
    truncated to *max_len* tokens and right-padded with the "<pad>" id.
    """
    ids = []
    for token in text:
        ids.append(word2index[token] if token in word2index else word2index["<unk>"])
        if len(ids) == max_len:
            break
    padding_needed = max_len - len(ids)
    return ids + [word2index["<pad>"]] * padding_needed
def calculate_bi_standards(name):
    """Load a serialized torch checkpoint from path *name* and return it.

    TODO(review): the metric computation this function is named for is not
    implemented yet; for now it only deserializes the checkpoint (the
    original discarded the result and returned None).

    NOTE: torch.load unpickles data — only call this on trusted files.

    Args:
        name: filesystem path to a file produced by torch.save.

    Returns:
        The deserialized object (model / tensor / state dict).
    """
    # map_location="cpu" lets CPU-only machines load GPU-saved checkpoints.
    model = torch.load(name, map_location="cpu")
    return model
# Script entry point: run the classifier comparison when executed directly.
if __name__ == "__main__":
    contrast()
|