data_processor.py

# -*- coding: utf-8 -*-
"""
Created on Thu May 21 19:19:01 2020
Read the data and preprocess it.
Count the 5k most frequent words in the training data and build the vocabulary
(word vectors) from them.
The test data reuses the vocabulary built from the training data.
@author:
"""
import os
import copy
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data  # newly added import
import sklearn
from sklearn import model_selection
import numpy as np
import pymysql
import classifyer
from nlpcda import Simbert
import logging

fileName = './model_train.log'
handler = [logging.FileHandler(filename=fileName, encoding="utf-8")]
logging.basicConfig(level=logging.DEBUG, handlers=handler)

simbert_config = {
    'model_path': './chinese_simbert_L-12_H-768_A-12',
    'CUDA_VISIBLE_DEVICES': '0,1',
    'max_len': 64,
    'seed': 1
}

# Bug categories: abnormal exit, incomplete functionality, user experience,
# page layout defect, performance, security. The Chinese strings are kept
# because they are matched against directory names and database labels.
bug_type = ["不正常退出", "功能不完整", "用户体验", "页面布局缺陷", "性能", "安全"]
num_classes = len(bug_type)
word2index_path_base = "../word2index/"
torch.manual_seed(123)

datas = []
labels = []


def read_file(file_path):
    # Read the whole file as a single string.
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def process_file(root_path):
    # List every file and directory directly under root_path.
    dir_or_files = os.listdir(root_path)
    for dir_file in dir_or_files:
        # Build the full path of the entry.
        dir_file_path = os.path.join(root_path, dir_file)
        # Recurse into directories, otherwise treat the entry as a data file.
        if os.path.isdir(dir_file_path):
            process_file(dir_file_path)
        else:
            if "after" in dir_file_path:
                description = read_file(dir_file_path)
                datas.append(description)
                # The ninth path component of the fixed absolute path is the
                # bug-type directory name, which serves as the label.
                label_china = dir_file_path.split("/")[8]
                label = []
                for i in bug_type:
                    if i == label_china:
                        label.append(1)
                    else:
                        label.append(0)
                labels.append(label)


class DataProcessor(object):
    def __init__(self, dataset_name=None, host=None, user=None, password=None):
        self.dataset_name = dataset_name
        self.datas_path = "./word_list_data/" + str(self.dataset_name) + ".npy"
        self.labels_path = "./word_list_data/" + str(self.dataset_name) + "_label.npy"
        self.datas_increase_path = "./word_list_data/" + str(self.dataset_name) + "_increase.npy"
        self.labels_increase_path = "./word_list_data/" + str(self.dataset_name) + "_label_increase.npy"
        self.host = host
        self.user = user
        self.password = password
        # Fall back to the local default MySQL credentials.
        if user is None or password is None:
            self.host = "127.0.0.1"
            self.user = "root"
            self.password = "123456"

    def read_text_from_db(self):
        datas = []
        labels = []
        conn = pymysql.connect(host=self.host, user=self.user, password=self.password,
                               database="mt_clerk_test", charset="utf8")
        cursor = conn.cursor()
        try:
            sql = "select id from dataset where name = %s"
            cursor.execute(sql, (str(self.dataset_name),))
            dataset_id = cursor.fetchall()[0][0]
            sql = ("select test_process,test_requirement,product_version_module,tccategory_id,name "
                   "from test_case where dataset_id = %s and tccategory_id is not null")
            cursor.execute(sql, (str(dataset_id),))
            results = cursor.fetchall()
            for row in results:
                test_process = row[0]
                test_requirement = row[1]
                product_version_module = row[2]
                tccategory_id = int(row[3])
                name = row[4]
                text = classifyer.text_after_ltp(test_process, test_requirement, product_version_module, name)
                datas.append(text)
                # One-hot encode the category id.
                label = []
                for i in range(num_classes):
                    if i == tccategory_id:
                        label.append(1)
                    else:
                        label.append(0)
                labels.append(label)
        except Exception as e:
            raise e
        finally:
            cursor.close()
            conn.close()
        np.save(self.datas_path, datas)
        np.save(self.labels_path, labels)
        return datas, labels

    def read_text_from_file_system(self):
        # Read the manually labelled data from the local file system instead of the database.
        global datas, labels
        process_file("/Users/tanghaojie/Desktop/final/手动标记后的数据/决赛自主可控众测web自主可控运维管理系统")
        return datas, labels

    def increase_data(self):
        # Augment the data with SimBERT-generated paraphrases (nlpcda).
        simbert = Simbert(config=simbert_config)
        datas_pre = np.load(self.datas_path, allow_pickle=True)
        labels_pre = np.load(self.labels_path, allow_pickle=True)
        datas = []
        labels = []
        num = 5  # number of paraphrases generated per sample
        if len(datas_pre) == len(labels_pre):
            for i in range(len(datas_pre)):
                datas.append(datas_pre[i])
                labels.append(labels_pre[i])
                synonyms = simbert.replace(sent=datas_pre[i], create_num=num)
                for j in range(num):
                    datas.append(synonyms[j][0])
                    labels.append(labels_pre[i])
        np.save(self.datas_increase_path, datas)
        np.save(self.labels_increase_path, labels)
        return datas, labels

    def word_count(self, datas):
        # Count word frequencies and sort them in descending order to find the
        # most frequent words.
        dic = {}
        for data in datas:
            for word in data:
                word = word.lower()  # lower-case all words; a no-op for Chinese  TODO
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        word_count_sorted = sorted(dic.items(), key=lambda item: item[1], reverse=True)
        return word_count_sorted  # list of (word, count) pairs, most frequent first

    def word_index(self, datas, vocab_size):
        # Build the vocabulary.
        word_count_sorted = self.word_count(datas)
        word2index = {}
        # <unk> stands for words that did not make it into the size-limited vocabulary.
        word2index["<unk>"] = 0
        # <pad> is used to pad sentences up to the maximum length.
        word2index["<pad>"] = 1
        # The effective vocabulary size is bounded by both the number of distinct
        # words and the requested limit.
        vocab_size = min(len(word_count_sorted), vocab_size)
        for i in range(vocab_size):
            word = word_count_sorted[i][0]
            word2index[word] = i + 2  # map each word to its index in the vocabulary
        word2index_path = word2index_path_base + self.dataset_name + ".npy"
        np.save(word2index_path, word2index)
        return word2index, vocab_size
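
    # Illustrative sketch (not part of the original file): with
    # datas = [["页面", "崩溃"], ["页面"]] and vocab_size = 5000, word_count
    # returns [("页面", 2), ("崩溃", 1)], so word_index yields
    # ({"<unk>": 0, "<pad>": 1, "页面": 2, "崩溃": 3}, 2).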

    def get_datasets_origin(self, vocab_size, max_len):
        # Note: nn.Embedding produces different embeddings on every run, so the
        # train, develop and test embeddings are all produced in the same pass,
        # and the same vocabulary is shared by all three splits.
        logging.info('Reading the raw data from the database')
        txt_origin, label_origin = self.read_text_from_db()
        # read_text_from_db() also writes the .npy files that increase_data()
        # consumes, so its return values are replaced by the augmented data below.
        logging.info('Augmenting the raw data')
        txt_origin, label_origin = self.increase_data()
        # txt_origin = np.load(self.datas_increase_path, allow_pickle=True).tolist()
        # label_origin = np.load(self.labels_increase_path, allow_pickle=True).tolist()
        # Count how many samples fall into each of the six classes.
        label_count = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for i in label_origin:
            label_index = 0
            for j in range(len(i)):
                if i[j] == 1:
                    label_index = j
            label_count[label_index] = label_count[label_index] + 1
        logging.info('Label distribution of the raw data: %s', label_count)
        # Overall split: 20% train; the remaining 80% is split into 20% test and 60% develop.
        train_datas, test_datas, train_labels, test_labels = sklearn.model_selection.train_test_split(
            txt_origin, label_origin, random_state=2, train_size=0.2, test_size=0.8)
        test_datas, develop_datas, test_labels, develop_labels = sklearn.model_selection.train_test_split(
            test_datas, test_labels, random_state=2, train_size=0.25, test_size=0.75)
        logging.info('Building the vocabulary')
        word_datas = copy.deepcopy(train_datas)
        word_datas.extend(develop_datas)
        word_datas.extend(test_datas)
        word2index, vocab_size = self.word_index(word_datas, vocab_size)  # vocabulary and its actual size
        logging.info('Converting sentences to word-index features')
        train_features = []
        for data in train_datas:
            feature = []
            for word in data:
                word = word.lower()  # the vocabulary is lower-cased
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])  # out-of-vocabulary words map to <unk>
                if len(feature) == max_len:  # truncate sentences longer than max_len
                    break
            # Pad sentences shorter than max_len.
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            train_features.append(feature)
        develop_features = []
        for data in develop_datas:
            feature = []
            for word in data:
                word = word.lower()  # the vocabulary is lower-cased
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])  # out-of-vocabulary words map to <unk>
                if len(feature) == max_len:  # truncate sentences longer than max_len
                    break
            # Pad sentences shorter than max_len.
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            develop_features.append(feature)
        test_features = []
        for data in test_datas:
            feature = []
            for word in data:
                word = word.lower()  # the vocabulary is lower-cased
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])  # out-of-vocabulary words map to <unk>
                if len(feature) == max_len:  # truncate sentences longer than max_len
                    break
            # Pad sentences shorter than max_len.
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            test_features.append(feature)
        return train_features, develop_features, test_features, train_labels, develop_labels, test_labels, word2index
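
    # Illustrative sketch (not part of the original file): with max_len = 4 and the
    # small vocabulary from the word_index example above, the sentence
    # ["页面", "崩溃"] becomes the feature row [2, 3, 1, 1] (two word indices
    # followed by two <pad> indices), and an unknown word would map to 0 (<unk>).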

    def get_datasets(self, train_features, develop_features, test_features, train_labels, develop_labels, test_labels,
                     vocab_size, embedding_size):
        # Convert the word indices to tensors; every row in train_features must
        # have the same length, otherwise this raises an error.
        train_features = torch.LongTensor(train_features)
        train_labels = torch.FloatTensor(train_labels)
        develop_features = torch.LongTensor(develop_features)
        develop_labels = torch.FloatTensor(develop_labels)
        test_features = torch.LongTensor(test_features)
        test_labels = torch.FloatTensor(test_labels)
        # Map the indices to embeddings. The vocabulary contains the two special
        # tokens <unk> and <pad>, so its real size is vocab_size + 2.
        embed = nn.Embedding(vocab_size + 2, embedding_size)  # https://www.jianshu.com/p/63e7acc5e890
        train_features = embed(train_features)
        develop_features = embed(develop_features)
        test_features = embed(test_features)
        # Mark the input features as not requiring gradients.
        train_features = Variable(train_features,
                                  requires_grad=False)  # https://www.cnblogs.com/henuliulei/p/11363121.html
        train_datasets = torch.utils.data.TensorDataset(train_features, train_labels)
        develop_features = Variable(develop_features,
                                    requires_grad=False)  # https://www.cnblogs.com/henuliulei/p/11363121.html
        develop_datasets = torch.utils.data.TensorDataset(develop_features, develop_labels)
        test_features = Variable(test_features, requires_grad=False)
        test_datasets = torch.utils.data.TensorDataset(test_features,
                                                       test_labels)  # https://www.cnblogs.com/hahaah/p/14914603.html
        return train_datasets, develop_datasets, test_datasets
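

# Minimal usage sketch (not part of the original file). The dataset name,
# vocabulary size, sentence length, embedding size and batch size below are
# illustrative assumptions, not values from the original project; running it
# requires the MySQL database and SimBERT model configured above.
if __name__ == "__main__":
    processor = DataProcessor(dataset_name="demo_dataset")
    (train_features, develop_features, test_features,
     train_labels, develop_labels, test_labels, word2index) = processor.get_datasets_origin(
        vocab_size=5000, max_len=64)
    # word2index contains the vocabulary plus <unk> and <pad>, so the vocabulary
    # size expected by get_datasets is len(word2index) - 2.
    train_datasets, develop_datasets, test_datasets = processor.get_datasets(
        train_features, develop_features, test_features,
        train_labels, develop_labels, test_labels,
        vocab_size=len(word2index) - 2, embedding_size=128)
    # The TensorDatasets can then be wrapped in DataLoaders for training.
    train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=16, shuffle=True)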