#!/usr/bin/python
# coding=utf-8
"""Compare classical classifiers (KNN, SVM, Gaussian Naive Bayes) on a
pre-tokenized text dataset, averaging accuracy over several random
train/test splits and logging per-split, worst-case and mean scores."""
import os
import logging

import numpy as np
import sklearn
from sklearn.model_selection import train_test_split  # train/test split helper
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

# All log output goes to a UTF-8 file next to the working directory.
fileName = './constract.log'
handler = [logging.FileHandler(filename=fileName, encoding="utf-8")]
logging.basicConfig(level=logging.DEBUG, handlers=handler)

# Data layout:  <grandparent>/word2index/<dataset>.npy   (vocabulary dict)
#               <parent>/word_list_data/<dataset>.npy    (tokenized texts)
#               <parent>/word_list_data/<dataset>_label.npy (label vectors)
parent_path = os.path.dirname(os.path.realpath(__file__))
grander_path = os.path.dirname(parent_path)
word_list_data_path_base = parent_path + "/word_list_data/"
word2index_path_base = grander_path + "/word2index/"

dataset_name = "航天中认自主可控众包测试练习赛"
max_len = 64          # every text is truncated/padded to this many token ids
vocab_size = 5000
embedding_size = 64
batch_size = 16
random_state = 15     # number of random splits to run (also seeds 0..14)


def contrast():
    """Train and score KNN, SVM and GaussianNB classifiers.

    Loads the tokenized texts, collapses each label vector to a scalar
    class id, converts texts to fixed-length index features, then for
    ``random_state`` different random splits fits all three models and
    logs their accuracy.  Finally logs the lowest and the mean accuracy
    per model.  Returns None; results are reported via logging only.
    """
    logging.info("正在加载初始数据")
    txts = np.load(word_list_data_path_base + str(dataset_name) + ".npy",
                   allow_pickle=True)
    labels = np.load(word_list_data_path_base + str(dataset_name) + "_label.npy",
                     allow_pickle=True)

    # Collapse each label vector to a single integer: sum of index * value.
    # For one-hot vectors this equals argmax — presumably the labels are
    # one-hot; TODO confirm against the data-preparation step.
    labels_new = []
    for label in labels:
        label_new = 0
        for i in range(len(label)):
            label_new += i * label[i]
        labels_new.append(label_new)
    labels_new = np.array(labels_new)

    logging.info("正在加载词表")
    word2index_path = word2index_path_base + str(dataset_name) + ".npy"
    # .item() unwraps the 0-d object array back into the saved dict.
    word2index = np.load(word2index_path, allow_pickle=True).item()

    features = [text_to_feature(txt, word2index, max_len) for txt in txts]

    # Track worst-case and cumulative accuracy per classifier.
    score_knn_lowest = 100
    score_svm_lowest = 100
    score_nb_lowest = 100
    score_knn_all = 0
    score_svm_all = 0
    score_nb_all = 0

    for i in range(random_state):
        # NOTE(review): trains on 20% and tests on 80% — unusual ratio,
        # looks deliberate for a low-resource comparison; confirm.
        train_data, test_data, train_label, test_label = train_test_split(
            features, labels_new,
            random_state=i, train_size=0.2, test_size=0.8)

        logging.info("正在训练k最近邻分类器")
        knn_classifier = KNeighborsClassifier()
        knn_classifier.fit(train_data, train_label)
        score_knn = knn_classifier.score(test_data, test_label)
        if score_knn < score_knn_lowest:
            score_knn_lowest = score_knn
        score_knn_all = score_knn_all + score_knn
        logging.info("k最近邻分类器准确率为{}".format(score_knn))

        logging.info("正在训练SVM分类器")
        svm_classifier = svm.SVC(C=2, kernel='rbf', gamma=10,
                                 decision_function_shape='ovr')
        svm_classifier.fit(train_data, train_label)
        score_svm = svm_classifier.score(test_data, test_label)
        if score_svm < score_svm_lowest:
            score_svm_lowest = score_svm
        score_svm_all = score_svm_all + score_svm
        logging.info("SVM分类器准确率为{}".format(score_svm))

        logging.info("正在训练朴素贝叶斯分类器")
        muNB_classifier = GaussianNB()
        muNB_classifier.fit(train_data, train_label)
        score_nb = muNB_classifier.score(test_data, test_label)
        if score_nb < score_nb_lowest:
            score_nb_lowest = score_nb
        score_nb_all = score_nb_all + score_nb
        logging.info("朴素贝叶斯分类器准确率为{}".format(score_nb))

    logging.info("k最近邻分类器最低准确率为{}".format(score_knn_lowest))
    logging.info("SVM分类器最低准确率为{}".format(score_svm_lowest))
    logging.info("朴素贝叶斯分类器最低准确率为{}".format(score_nb_lowest))
    logging.info("k最近邻分类器平均准确率为{}".format(score_knn_all / random_state))
    logging.info("SVM分类器平均准确率为{}".format(score_svm_all / random_state))
    logging.info("朴素贝叶斯分类器平均准确率为{}".format(score_nb_all / random_state))


def text_to_feature(text, word2index, max_len):
    """Map a token sequence to a fixed-length list of vocabulary indices.

    Each word is looked up in ``word2index``; out-of-vocabulary words and
    trailing padding both use ``word2index[""]`` — presumably the empty
    string is the dataset's shared unk/pad token; TODO confirm.  The
    result is truncated or right-padded to exactly ``max_len`` entries.
    """
    feature = []
    for word in text:
        if word in word2index:
            feature.append(word2index[word])
        else:
            feature.append(word2index[""])
        if len(feature) == max_len:
            break
    feature = feature + [word2index[""]] * (max_len - len(feature))
    return feature


if __name__ == "__main__":
    contrast()