# -*- coding: utf-8 -*-
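"""bilstm_attention.py

Bi-LSTM + attention classifier for crowdsourced bug reports, combined with a
lexicon-based self-training loop: a base classifier is trained on a small labeled
set, high-confidence predictions on the development set are mined for attention
keywords to build per-class lexicons, pseudo-labeled reports are moved into the
training set, and the classifier is retrained until no new labels are produced.
"""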
import copy
import sklearn.metrics  # import the metrics submodule explicitly; plain `import sklearn` does not guarantee sklearn.metrics is loaded
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import data_processor
import logging

fileName = './model_train.log'
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
                              datefmt='%m/%d/%Y %H:%M:%S')
handler = logging.FileHandler(filename=fileName, encoding="utf-8")
handler.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[handler])
torch.manual_seed(123)  # fix the random seed so every run starts from the same initialization
vocab_size = 5000       # vocabulary size
embedding_size = 64     # word-embedding dimension
num_classes = 6         # 6-way classification  todo
sentence_max_len = 64   # maximum length of a single sentence
hidden_size = 16
num_layers = 1          # one LSTM layer
num_directions = 2      # bidirectional LSTM
lr = 1e-3
batch_size = 16
epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
app_names = ["趣享GIF众包测试201908试题"]
# Hangtian (aerospace) dataset: random_state 6
# Quxiang dataset: 13
# ,"决赛自主可控众测web自主可控运维管理系统"
bug_type = ["不正常退出", "功能不完整", "用户体验", "页面布局缺陷", "性能", "安全"]
# i.e. abnormal exit, incomplete functionality, user experience, page-layout defect, performance, security
lexicon = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []}
word_with_attention = {}
n = 5   # keep the top-n highest-confidence reports per class
m = 3   # keep the top-m words by attention weight per report
t1 = 3
t2 = 8
threshold_confidence = 0.9
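# How the thresholds above are used (see test_with_lexicon / test below): for each report,
# match_num[c] counts how many of its words appear in the class-c lexicon. If the largest
# count is unique and >= t2, the report is pseudo-labeled with that lexicon class; if the
# count lies in [t1, t2) and the classifier's confidence exceeds threshold_confidence, the
# classifier's own prediction is kept as the pseudo-label.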
# Bi-LSTM model
class BiLSTMModel(nn.Module):
    # Declare the layers that hold the model parameters
    def __init__(self, embedding_size, hidden_size, num_layers, num_directions, num_classes):
        super(BiLSTMModel, self).__init__()
        self.input_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = num_directions
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, bidirectional=(num_directions == 2))
        # torch.nn.Sequential is a sequential container: the layers nested inside it are applied in order,
        # and the output of each layer is passed on to the next automatically.
        # torch.nn.Linear takes the number of input features, the number of output features and an optional
        # bias flag (a boolean, True by default).
        self.attention_weights_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),  # linear map from hidden_size to hidden_size
            nn.ReLU(inplace=True)                 # activation
        )
        self.liner = nn.Linear(hidden_size, num_classes)
        self.act_func = nn.Softmax(dim=1)
    # Forward pass: compute the required model outputs from the input x
    def forward(self, x):
        # The LSTM expects input of shape [seq_len, batch, input_size]
        # x: [batch_size, sentence_length, embedding_size]
        x = x.permute(1, 0, 2)  # [sentence_length, batch_size, embedding_size]
        # The dataset is not necessarily a multiple of the configured batch_size, so read the actual batch size from x
        batch_size = x.size(1)
        # Initial hidden and cell states for the LSTM
        h_0 = torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        c_0 = torch.randn(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(device)
        # out: [seq_len, batch, num_directions * hidden_size]; for a multi-layer LSTM, out only keeps the last
        # layer's output h_t for every time step t
        # h_n, c_n: [num_layers * num_directions, batch, hidden_size]
        out, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        # Split the bidirectional output into the forward and backward passes
        (forward_out, backward_out) = torch.chunk(out, 2, dim=2)
        out = forward_out + backward_out  # [seq_len, batch, hidden_size]
        out = out.permute(1, 0, 2)        # [batch, seq_len, hidden_size]
        # Use h_n (the per-layer representation at the last time step) to generate the attention weights
        h_n = h_n.permute(1, 0, 2)   # [batch, num_layers * num_directions, hidden_size]
        h_n = torch.sum(h_n, dim=1)  # [batch, hidden_size] after summing over layers and directions
        h_n = h_n.squeeze(dim=1)     # [batch, hidden_size]
        # Bi-LSTM + Attention adds an attention layer on top of the Bi-LSTM. A plain Bi-LSTM uses the output of
        # the last time step as the feature vector for the softmax classifier; with attention, a weight is computed
        # for every time step and the weighted sum of all time-step vectors is used as the feature vector instead,
        # which improved the results in our experiments.
        # https://blog.csdn.net/zwqjoy/article/details/96724702
        attention_w = self.attention_weights_layer(h_n)  # [batch, hidden_size]
        attention_w = attention_w.unsqueeze(dim=1)       # [batch, 1, hidden_size]
        # print(attention_w)
        attention_context = torch.bmm(attention_w, out.transpose(1, 2))  # [batch, 1, seq_len]
        # print(attention_context)
        softmax_w = F.softmax(attention_context, dim=-1)  # [batch, 1, seq_len], normalized attention weights
        # print(softmax_w)  # this is the attention weight vector
        x = torch.bmm(softmax_w, out)  # [batch, 1, hidden_size]
        x = x.squeeze(dim=1)           # [batch, hidden_size]
        x = self.liner(x)
        x = self.act_func(x)           # [batch, num_classes]
        return softmax_w, x
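# A minimal usage sketch of the model (illustrative only, not called anywhere in this file):
#   model = BiLSTMModel(embedding_size, hidden_size, num_layers, num_directions, num_classes).to(device)
#   x = torch.randn(batch_size, sentence_max_len, embedding_size).to(device)
#   attention, probs = model(x)  # attention: [batch_size, 1, sentence_max_len], probs: [batch_size, num_classes]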
# Move the newly pseudo-labeled data from the development set into the training set, then retrain the classifier.
# The class distribution of these new pseudo-labeled data should stay balanced.
# Labels are predicted jointly by the base classifier and the lexicon.
# This prediction has two stages: first, label the development set and add the newly labeled data to the training set;
# second, once all pseudo-labeled data have been added, retrain the base classifier and use the new classifier plus
# the full lexicon to predict the test set.
def develop_to_train(new_labeled_data, train_features, develop_features, train_labels, develop_labels):
    # iterate over the indices in descending order so that pop() does not shift the remaining indices
    for key in sorted(new_labeled_data, reverse=True):
        feature = develop_features.pop(key)
        del develop_labels[key]
        label_index = new_labeled_data[key]
        label = [0] * num_classes
        label[label_index] = 1  # one-hot encode the new pseudo-label
        train_labels.append(label)
        train_features.append(feature)
    return train_features, develop_features, train_labels, develop_labels
# Rerun the base classifier on the development set to obtain a set of keywords for it.
# The lexicon is collected from the classifier's prediction confidence on the development set together with the
# attention weight of each word.
def test_with_lexicon(model, develop_loader, develop_feature_origin, word2index):
    model.eval()  # evaluation mode: layers added only to aid training (batch norm, dropout, ...) are switched off
    confidence_list = []  # confidence of every prediction
    category_list = []    # predicted class of every report
    attention_list = []   # attention weights of every word
    for datas, labels in develop_loader:
        datas = datas.to(device)  # copy the input tensors to the device (GPU if available); later ops run there
        softmax_w, preds = model.forward(datas)
        softmax_w = softmax_w.squeeze(dim=1)  # [batch, seq_len]
        attention = softmax_w.tolist()
        attention_list.extend(attention)
        # pre_test = torch.argmax(preds, dim=1)
        # label_test = torch.argmax(labels, dim=1)
        # develop_true += torch.sum(pre_test == label_test).item()
        a = preds.max(dim=1)
        confidence = a[0].tolist()  # prediction confidences
        category = a[1].tolist()    # predicted classes
        confidence_list.extend(confidence)
        category_list.extend(category)
    confidence_dict = dict(zip(confidence_list, list(range(len(confidence_list)))))
    category_dict = dict(zip(list(range(len(category_list))), category_list))
    attention_dict = dict(zip(list(range(len(attention_list))), attention_list))
    develop_true = 0
    develop_all = 0
    lexicon_num = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    for i in sorted(confidence_dict, reverse=True):
        lexicon_key = category_dict[confidence_dict[i]]
        if lexicon_num[lexicon_key] <= n:  # take the n highest-confidence reports for each class
            # print(str(lexicon_key) + ":" + str(i))
            label = torch.argmax(develop_loader.dataset.tensors[1].data, dim=1)[confidence_dict[i]]
            develop_all += 1
            if label == lexicon_key:
                develop_true += 1
            # TODO pseudo-label vs. true label
            lexicon_num[lexicon_key] += 1
            lexicon_value_attention = attention_dict[confidence_dict[i]]
            lexicon_value_word = develop_feature_origin[confidence_dict[i]]
            attention2word = dict(zip(lexicon_value_attention, lexicon_value_word))
            word2attention = {}
            for j in sorted(attention2word, reverse=True):
                word = list(word2index.keys())[list(word2index.values()).index(attention2word[j])]
                if word != "<unk>" and word != "<pad>":
                    if word in word2attention.keys():
                        word2attention[word] += j
                    else:
                        word2attention[word] = j
            q = 0
            for k in sorted(word2attention.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
                if k[0] not in word_with_attention:
                    word_with_attention[k[0]] = k[1]
                if q < m and k[1] >= word_with_attention[k[0]]:  # assign the keyword's class by its attention in the sentence
                    for key, value in lexicon.items():
                        if k[0] in lexicon[key]:
                            lexicon[key].remove(k[0])
                    lexicon[lexicon_key].append(k[0])
                    q += 1
    print("Selected the top {} per class: {} correct out of {}".format(n, develop_true, develop_all))
    new_labeled_data = {}
    # At this point the per-class lexicons for this round are ready.
    # Record each newly labeled report (its index k and its new class); it will later be removed from the development
    # set and added to the training set.
    for k in range(len(confidence_list)):
        lexicon_value_word = develop_feature_origin[k]
        match_num = [0] * num_classes
        for value_word in lexicon_value_word:
            word = list(word2index.keys())[list(word2index.values()).index(value_word)]
            if word != "<unk>" and word != "<pad>":
                for l in range(num_classes):
                    if word in lexicon.get(l):
                        # note: the same word may appear in the lexicon of more than one class
                        match_num[l] = match_num[l] + 1
        max_num = max(match_num)
        # print(str(match_num) + "---" + str(confidence_list[k]))
        if match_num.count(max_num) != 1:
            continue
        elif max_num >= t2:
            # label this report with the class of the matching lexicon
            new_labeled_data[k] = match_num.index(max_num)
        elif confidence_list[k] > threshold_confidence and t1 <= max_num < t2:
            new_labeled_data[k] = category_list[k]
    # return the indices of the newly labeled reports and their new classes
    return new_labeled_data
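# Worked micro-example of the labeling rule in test_with_lexicon above (also used in
# test_with_lexicon_tensor_dis and test below); the numbers are illustrative only:
#   match_num = [1, 9, 0, 2, 0, 0]                     -> 9 >= t2 (8): pseudo-label the report as class 1
#   match_num = [4, 1, 0, 0, 0, 0], confidence = 0.95  -> t1 (3) <= 4 < t2 and 0.95 > 0.9: keep the classifier's prediction
#   match_num = [3, 3, 0, 0, 0, 0]                     -> tied maximum: leave the report unlabeled this round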
# Instead of taking the top-n predictions by confidence from the development set, order the reports by cosine
# distance and take the first n.
def test_with_lexicon_tensor_dis(model, develop_loader, develop_feature_origin, word2index):
    from classify_service.arp import get_distance_matrix_tensor, arp_tensor_result
    model.eval()  # evaluation mode: layers added only to aid training (batch norm, dropout, ...) are switched off
    confidence_list = []  # confidence of every prediction
    category_list = []    # predicted class of every report
    attention_list = []   # attention weights of every word
    for datas, labels in develop_loader:
        datas = datas.to(device)  # copy the input tensors to the device (GPU if available); later ops run there
        softmax_w, preds = model.forward(datas)
        softmax_w = softmax_w.squeeze(dim=1)  # [batch, seq_len]
        attention = softmax_w.tolist()
        attention_list.extend(attention)
        # pre_test = torch.argmax(preds, dim=1)
        # label_test = torch.argmax(labels, dim=1)
        # develop_true += torch.sum(pre_test == label_test).item()
        a = preds.max(dim=1)
        confidence = a[0].tolist()  # prediction confidences
        category = a[1].tolist()    # predicted classes
        confidence_list.extend(confidence)
        category_list.extend(category)
    confidence_dict = dict(zip(confidence_list, list(range(len(confidence_list)))))
    dis_matrix = get_distance_matrix_tensor(develop_feature_origin, 'c')
    first_id = confidence_dict[sorted(confidence_dict, reverse=True)[0]]  # use the highest-confidence report as the first item
    # first_id = get_max_index(dis_matrix)  # alternatively, start from one of the two closest reports  TODO implement
    ordered_list = arp_tensor_result(develop_feature_origin, dis_matrix, first_id)
    category_dict = dict(zip(list(range(len(category_list))), category_list))
    attention_dict = dict(zip(list(range(len(attention_list))), attention_list))
    develop_true = 0
    develop_all = 0
    lexicon_num = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
    for i in ordered_list:
        lexicon_key = category_dict[i]
        if lexicon_num[lexicon_key] <= n:  # take the n highest-ranked reports for each class
            # print(str(lexicon_key) + ":" + str(i))
            label = torch.argmax(develop_loader.dataset.tensors[1].data, dim=1)[i]
            develop_all += 1
            if label == lexicon_key:
                develop_true += 1
            lexicon_num[lexicon_key] += 1
            lexicon_value_attention = attention_dict[i]
            lexicon_value_word = develop_feature_origin[i]
            attention2word = dict(zip(lexicon_value_attention, lexicon_value_word))
            word2attention = {}
            for j in sorted(attention2word, reverse=True):
                word = list(word2index.keys())[list(word2index.values()).index(attention2word[j])]
                if word != "<unk>" and word != "<pad>":
                    if word in word2attention.keys():
                        word2attention[word] += j
                    else:
                        word2attention[word] = j
            q = 0
            for k in sorted(word2attention.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
                if k[0] not in word_with_attention:
                    word_with_attention[k[0]] = k[1]
                if q < m and k[1] >= word_with_attention[k[0]]:  # assign the keyword's class by its attention in the sentence
                    for key, value in lexicon.items():
                        if k[0] in lexicon[key]:
                            lexicon[key].remove(k[0])
                    lexicon[lexicon_key].append(k[0])
                    q += 1
    print("Selected the top {} per class: {} correct out of {}".format(n, develop_true, develop_all))
    new_labeled_data = {}
    # At this point the per-class lexicons for this round are ready.
    # Record each newly labeled report (its index k and its new class); it will later be removed from the development
    # set and added to the training set.
    for k in range(len(confidence_list)):
        lexicon_value_word = develop_feature_origin[k]
        match_num = [0] * num_classes
        for value_word in lexicon_value_word:
            word = list(word2index.keys())[list(word2index.values()).index(value_word)]
            if word != "<unk>" and word != "<pad>":
                for l in range(num_classes):
                    if word in lexicon.get(l):
                        # note: the same word may appear in the lexicon of more than one class
                        match_num[l] = match_num[l] + 1
        max_num = max(match_num)
        # print(str(match_num) + "---" + str(confidence_list[k]))
        if match_num.count(max_num) != 1:
            continue
        elif max_num >= t2:
            # label this report with the class of the matching lexicon
            new_labeled_data[k] = match_num.index(max_num)
        elif confidence_list[k] > threshold_confidence and t1 <= max_num < t2:
            new_labeled_data[k] = category_list[k]
    # return the indices of the newly labeled reports and their new classes
    return new_labeled_data
def test(model, test_loader, loss_func, test_feature_origin, word2index):
    model.eval()
    loss_val = 0.0
    corrects = 0.0
    confidence_list = []  # confidence of every prediction
    category_list = []    # predicted class of every report
    label_list = []
    for datas, labels in test_loader:
        datas = datas.to(device)
        labels = labels.to(device)
        labels_num = labels.tolist()
        label_list_tmp = []
        for label in labels_num:
            sum_label = 0
            for i in range(len(label)):
                sum_label = sum_label + label[i] * i  # convert the one-hot label back to a class index
            label_list_tmp.append(sum_label)
        softmax_w, preds = model.forward(datas)
        a = preds.max(dim=1)
        confidence = a[0].tolist()  # prediction confidences
        category = a[1].tolist()    # predicted classes
        confidence_list.extend(confidence)
        category_list.extend(category)
        label_list.extend(label_list_tmp)
        """
        loss = loss_func(preds, labels)
        loss_val += loss.item() * datas.size(0)
        # position of the highest predicted probability
        preds = torch.argmax(preds, dim=1)
        labels = torch.argmax(labels, dim=1)
        corrects += torch.sum(preds == labels).item()
        """
    for k in range(len(confidence_list)):
        lexicon_value_word = test_feature_origin[k]
        match_num = [0] * num_classes
        for value_word in lexicon_value_word:
            word = list(word2index.keys())[list(word2index.values()).index(value_word)]
            if word != "<unk>" and word != "<pad>":
                for l in range(num_classes):
                    if word in lexicon.get(l):
                        # note: the same word may appear in the lexicon of more than one class
                        match_num[l] = match_num[l] + 1
        max_num = max(match_num)
        if match_num.count(max_num) != 1:
            continue
        elif max_num >= t2:
            # override the classifier's prediction with the class of the matching lexicon
            category_list[k] = match_num.index(max_num)
    test_loss = 0
    test_acc = 1
    for i in range(len(category_list)):
        # print("report {}: category_list: {}, label_list: {}".format(i, category_list[i], label_list[i]))
        if category_list[i] == label_list[i]:
            corrects = corrects + 1
    test_acc = corrects / len(category_list)
    print("Test Loss: {}, Test Acc: {}".format(test_loss, test_acc))
    return test_acc
def test_origin(model, test_loader, loss_func):
    model.eval()
    with torch.no_grad():
        loss_val = 0.0
        corrects = 0.0
        recall_all = 0
        f1_all = 0
        pre_all = 0
        for datas, labels in test_loader:
            datas = datas.to(device)
            labels = labels.to(device)
            softmax_w, preds = model.forward(datas)
            loss = loss_func(preds, labels)
            loss_val += loss.item() * datas.size(0)
            # position of the highest predicted probability; move to CPU so sklearn.metrics can consume the tensors
            preds = torch.argmax(preds, dim=1).cpu()
            labels = torch.argmax(labels, dim=1).cpu()
            recall = sklearn.metrics.recall_score(labels, preds, average="macro", zero_division=0)
            f1 = sklearn.metrics.f1_score(labels, preds, average="macro", zero_division=0)
            pre = sklearn.metrics.precision_score(labels, preds, average="macro", zero_division=0)
            corrects += torch.sum(preds == labels).item()
            recall_all += recall
            f1_all += f1
            pre_all += pre
        test_acc = corrects / len(test_loader.dataset)
        test_recall = recall_all / len(test_loader.batch_sampler)
        test_f1 = f1_all / len(test_loader.batch_sampler)
        test_pre = pre_all / len(test_loader.batch_sampler)
        # print("Test Loss: {}, Test Acc: {}".format(test_loss, test_acc))
    return test_acc, test_recall, test_f1, test_pre
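# Note: recall/F1/precision above (and in get_evaluation below) are macro scores computed per batch and then
# averaged over batches, which can differ slightly from the same metrics computed over the whole test set at once.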
def get_evaluation(model, test_loader):
    model.eval()
    # print(model.state_dict())
    with torch.no_grad():
        corrects = 0.0
        recall_all = 0
        f1_all = 0
        pre_all = 0
        label_all = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}       # correctly predicted count per class
        test_label_all = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}  # total count per class in the test set
        for datas, labels in test_loader:
            datas = datas.to(device)
            labels = labels.to(device)
            softmax_w, preds = model.forward(datas)
            # move to CPU so sklearn.metrics can consume the tensors
            preds = torch.argmax(preds, dim=1).cpu()
            labels = torch.argmax(labels, dim=1).cpu()
            for i in range(len(preds)):
                category_a = preds[i].tolist()   # predicted class
                category_b = labels[i].tolist()  # true class
                test_label_all[category_b + 1] += 1
                if category_b == category_a:
                    label_all[category_b + 1] += 1
            recall = sklearn.metrics.recall_score(labels, preds, average="macro", zero_division=0)
            f1 = sklearn.metrics.f1_score(labels, preds, average="macro", zero_division=0)
            pre = sklearn.metrics.precision_score(labels, preds, average="macro", zero_division=0)
            corrects += torch.sum(preds == labels).item()
            # label_all[labels.]+=torch.sum(preds == labels).item()
            recall_all += recall
            f1_all += f1
            pre_all += pre
        test_acc = corrects / len(test_loader.dataset)
        test_recall = recall_all / len(test_loader.batch_sampler)
        test_f1 = f1_all / len(test_loader.batch_sampler)
        test_pre = pre_all / len(test_loader.batch_sampler)
        print(label_all)
        print(test_label_all)
        # print("Test Loss: {}, Test Acc: {}".format(test_loss, test_acc))
    return test_acc, test_recall, test_f1, test_pre
def train_origin(model, train_loader, test_loader, optimizer, loss_func, epochs):
    # best_val_acc = 0.0
    best_val_acc = test_origin(model, test_loader, loss_func)[0]
    best_model_params = copy.deepcopy(model.state_dict())
    for epoch in range(epochs):
        model.train()
        loss_val = 0.0
        corrects = 0.0
        for datas, labels in train_loader:
            datas = datas.to(device)
            labels = labels.to(device)
            attention_w, preds = model.forward(datas)  # predict with the model
            loss = loss_func(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_val += loss.item() * datas.size(0)
            # position of the highest predicted probability
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            corrects += torch.sum(preds == labels).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        # print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
        if epoch % 2 == 0:
            test_acc = test_origin(model, test_loader, loss_func)[0]
            if best_val_acc < test_acc:
                print("best:", best_val_acc, " new_best:", test_acc)
                best_val_acc = test_acc
                best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model
# Build a base classifier from the given training data; the training set is small and its class distribution is
# balanced.
# Does this base classifier overfit the training set?  todo
def train(model, train_loader, optimizer, loss_func, epochs):
    best_val_acc = 0.0
    best_model_params = copy.deepcopy(model.state_dict())
    # on the notions of epoch, batch and iteration see https://www.jianshu.com/p/22c50ded4cf7?from=groupmessage
    for epoch in range(epochs):
        model.train()
        loss_val = 0.0
        corrects = 0.0
        for datas, labels in train_loader:
            datas = datas.to(device)
            labels = labels.to(device)
            # print("training batch {}: labels: {}".format(epoch, labels))
            attention_w, preds = model.forward(datas)  # predict with the model
            loss = loss_func(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_val += loss.item() * datas.size(0)
            # position of the highest predicted probability
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            corrects += torch.sum(preds == labels).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        # print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
        if best_val_acc < train_acc:
            best_val_acc = train_acc
            best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model
def main():
    for app_name in app_names:
        processor = data_processor.DataProcessor(dataset_name=app_name)
        train_features, develop_features, test_features, train_labels, develop_labels, test_labels, word2index = \
            processor.get_datasets_origin(vocab_size=vocab_size, max_len=sentence_max_len)
        train_datasets, develop_datasets, test_datasets = processor.get_datasets(train_features, develop_features,
                                                                                 test_features, train_labels,
                                                                                 develop_labels, test_labels,
                                                                                 vocab_size=vocab_size,
                                                                                 embedding_size=embedding_size)
        logging.info("Start training model: " + app_name)
        # each batch of train_loader holds batch_size (16) items from train_features
        logging.info("Initializing PyTorch data loaders")
        train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size, shuffle=False)
        develop_loader = torch.utils.data.DataLoader(develop_datasets, batch_size=batch_size, shuffle=False)
        test_loader = torch.utils.data.DataLoader(test_datasets, batch_size=batch_size, shuffle=False)
        logging.info("Initializing the model")
        model = BiLSTMModel(embedding_size, hidden_size, num_layers, num_directions, num_classes)
        model = model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        loss_func = nn.BCELoss()
        # train the base model
        logging.info("Training the base classifier")
        # model = train(model, train_loader, optimizer, loss_func, epochs)
        model = train_origin(model, train_loader, test_loader, optimizer, loss_func, epochs)
        test_acc, test_recall, test_f1, test_pre = get_evaluation(model, test_loader)
        # torch.load("../classify_model/" + app_name + ".pth")
        # best_acc = 0
        # best_recall = 0
        # best_f1 = 0
        # best_pre = 0
        #
        # for i in range(20):
        #     test_acc, test_recall, test_f1, test_pre = get_evaluation(model, test_loader)
        #     # torch.load("../classify_model/" + app_name + ".pth")
        #
        #     if test_acc > best_acc:
        #         best_acc = test_acc
        #         best_recall = test_recall
        #         best_f1 = test_f1
        #         best_pre = test_pre
        logging.info("Base classifier accuracy: {}".format(test_acc))
        logging.info("Base classifier recall: {}".format(test_recall))
        logging.info("Base classifier precision: {}".format(test_pre))
        logging.info("Base classifier f1_score: {}".format(test_f1))
        i = 0
        while 1:
            i = i + 1
            # build the lexicon from the development set
            # new_labeled_data = test_with_lexicon(model, develop_loader, develop_features, word2index)
            new_labeled_data = test_with_lexicon_tensor_dis(model, develop_loader, develop_features, word2index)
            print("Newly relabeled data: {}".format(new_labeled_data))
            print("Current lexicon: {}".format(lexicon))
            if len(new_labeled_data) == 0:
                break
            train_features, develop_features, train_labels, develop_labels = develop_to_train(new_labeled_data,
                                                                                              train_features,
                                                                                              develop_features,
                                                                                              train_labels,
                                                                                              develop_labels)
            embed = nn.Embedding(vocab_size + 2, embedding_size)  # https://www.jianshu.com/p/63e7acc5e890
            train_features_after1 = torch.LongTensor(train_features)
            train_features_after1 = embed(train_features_after1)
            train_features_after2 = Variable(train_features_after1, requires_grad=False)
            train_labels_after = torch.FloatTensor(train_labels)
            train_datasets = torch.utils.data.TensorDataset(train_features_after2, train_labels_after)
            train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size, shuffle=False)
            develop_features_after1 = torch.LongTensor(develop_features)
            develop_features_after1 = embed(develop_features_after1)
            develop_features_after2 = Variable(develop_features_after1, requires_grad=False)
            develop_labels_after = torch.FloatTensor(develop_labels)
            develop_datasets = torch.utils.data.TensorDataset(develop_features_after2, develop_labels_after)
            develop_loader = torch.utils.data.DataLoader(develop_datasets, batch_size=batch_size, shuffle=False)
            logging.info("Starting retraining round {}".format(i))
            model = train_origin(model, train_loader, test_loader, optimizer, loss_func, epochs)
        model = train_origin(model, train_loader, test_loader, optimizer, loss_func, epochs)
        best_acc = 0
        best_recall = 0
        best_f1 = 0
        best_pre = 0
        for i in range(20):
            test_acc, test_recall, test_f1, test_pre = get_evaluation(model, test_loader)
            # torch.load("../classify_model/" + app_name + ".pth")
            if test_acc > best_acc:
                best_acc = test_acc
                best_recall = test_recall
                best_f1 = test_f1
                best_pre = test_pre
        logging.info("Training finished, test set accuracy: {}".format(best_acc))
        logging.info("Training finished, test set recall: {}".format(best_recall))
        logging.info("Training finished, test set precision: {}".format(best_pre))
        logging.info("Training finished, test set f1_score: {}".format(best_f1))
        torch.save(model, "../classify_model/" + app_name + ".pth")


if __name__ == '__main__':
    main()