data_processor.py

# -*- coding: utf-8 -*-
"""
Created on Thu May 21 19:19:01 2020

Read the data and preprocess it.
Count the 5,000 most frequent words in the training data and build the vocabulary (word vectors) from them.
The test data reuses the vocabulary built from the training data.

@author:
"""
import os
import copy
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data  # newly added
import sklearn
from sklearn import model_selection
import numpy as np
import pymysql
import classifyer
from nlpcda import Simbert
import logging
import character_processor
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding, AutoRegressiveDecoder
fileName = './model_train.log'
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(module)s: %(message)s',
                              datefmt='%m/%d/%Y %H:%M:%S')
handler = logging.FileHandler(filename=fileName, encoding="utf-8")
handler.setFormatter(formatter)
logging.basicConfig(level=logging.DEBUG, handlers=[handler])

simbert_config = {
    'model_path': './chinese_roformer-sim-char_L-12_H-768_A-12',
    'CUDA_VISIBLE_DEVICES': '0,1',
    'max_len': 64,
    'seed': 1
}

bug_type = ["不正常退出", "功能不完整", "用户体验", "页面布局缺陷", "性能", "安全"]
num_classes = len(bug_type)
word2index_path_base = "../word2index/"
torch.manual_seed(123)

datas = []
labels = []
processor = character_processor.DataProcessor("./ltp_data_v3.4.0/cws.model", "./ltp_data_v3.4.0/pos.model")
synonym_dict = processor.synonym_word_dict("./ltp_data_v3.4.0/HIT-IRLab-同义词词林.txt")
def read_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        msg = f.read()
    return msg


def process_file(root_path):
    # list every file and sub-directory under this path
    dir_or_files = os.listdir(root_path)
    for dir_file in dir_or_files:
        # build the full path of the directory or file
        dir_file_path = os.path.join(root_path, dir_file)
        # decide whether the path is a directory or a file
        if os.path.isdir(dir_file_path):
            # recurse to collect every file below this directory
            process_file(dir_file_path)
        else:
            if "after" in dir_file_path:
                description = read_file(dir_file_path)
                datas.append(description)
                label_china = dir_file_path.split("/")[8]
                label = []
                for i in bug_type:
                    if i == label_china:
                        label.append(1)
                    else:
                        label.append(0)
                labels.append(label)
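# A minimal sketch (illustrative only) of the one-hot labelling done in process_file():
# the bug category parsed from the directory name becomes a 1-of-num_classes vector.
def _one_hot_sketch(label_china):
    # e.g. _one_hot_sketch("性能") -> [0, 0, 0, 0, 1, 0]
    return [1 if t == label_china else 0 for t in bug_type]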
class DataProcessor(object):
    def __init__(self, dataset_name=None, host=None, user=None, password=None):
        self.dataset_name = dataset_name
        self.datas_path = "./word_list_data/" + str(self.dataset_name) + ".npy"
        self.labels_path = "./word_list_data/" + str(self.dataset_name) + "_label.npy"
        self.datas_increase_path = "./word_list_data/" + str(self.dataset_name) + "_increase.npy"
        self.labels_increase_path = "./word_list_data/" + str(self.dataset_name) + "_label_increase.npy"
        self.host = host
        self.user = user
        self.password = password
        self.directory = "./splited_data/"
        if user is None or password is None:
            self.host = "127.0.0.1"
            self.user = "root"
            self.password = "123456"
    def read_text_from_db(self):
        datas = []
        labels = []
        conn = pymysql.connect(host=self.host, user=self.user, password=self.password, database="mt_clerk_test",
                               charset="utf8")
        cursor = conn.cursor()
        try:
            sql = "select id from dataset where name = %s"
            cursor.execute(sql, str(self.dataset_name))
            dataset_id = cursor.fetchall()[0][0]
            sql = "select test_process,test_requirement,product_version_module,tccategory_id,name from test_case where dataset_id = %s and tccategory_id is not null"
            cursor.execute(sql, str(dataset_id))
            results = cursor.fetchall()
            for row in results:
                test_process = row[0]
                test_requirement = row[1]
                product_version_module = row[2]
                tccategory_id = int(row[3])
                name = row[4]
                text = classifyer.text_after_ltp(test_process, test_requirement, product_version_module, name)
                datas.append(text)
                label = []
                for i in range(num_classes):
                    if i == tccategory_id - 1:
                        label.append(1)
                    else:
                        label.append(0)
                labels.append(label)
        except Exception as e:
            raise e
        finally:
            cursor.close()
            conn.close()
        np.save(self.datas_path, datas)
        np.save(self.labels_path, labels)
        return datas, labels
    def read_text_from_file_system(self):
        global datas, labels
        process_file("/Users/tanghaojie/Desktop/final/手动标记后的数据/决赛自主可控众测web自主可控运维管理系统")
        return datas, labels
    def increase_data(self):
        logging.info("Starting data augmentation")
        datas_pre = np.load(self.directory + self.dataset_name + "_train.npy", allow_pickle=True).tolist()
        labels_pre = np.load(self.directory + self.dataset_name + "_label_train.npy", allow_pickle=True).tolist()
        datas = []
        labels = []
        type_num_test = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
        for label in labels_pre:
            type_num_test[label.index(1) + 1] += 1
        k = 500  # target number of samples per bug type
        self.increase_by_synonms_list(datas, labels, datas_pre, labels_pre, type_num_test, k)
        np.save(self.datas_increase_path, datas)
        np.save(self.labels_increase_path, labels)
        return datas, labels
    def increase_by_synonms_list(self, datas, labels, datas_pre, labels_pre, type_num_test, k):
        if len(datas_pre) == len(labels_pre):
            for type in type_num_test:
                # how many augmented variants each sample of this type should produce
                type_num_test[type] = int(k / type_num_test[type])
            for i in range(len(datas_pre)):
                temp_type = labels_pre[i].index(1) + 1
                # print(temp_type)
                datas_append = self.get_case_increased(datas_pre[i], type_num_test[temp_type])
                for p in range(len(datas_append)):
                    labels.append(labels_pre[i])
                datas.extend(datas_append)
    def get_case_increased(self, origin, k):
        res = [[]]
        for word in origin:
            if len(res) < k:
                synonym_word_list = processor.synonym_word_list(word, synonym_dict)
            else:
                synonym_word_list = [word]
            # res = [l.append(w) for w in synonym_word_list for l in res]
            temp_res = []
            for l in res:
                for w in synonym_word_list:
                    t = l.copy()
                    t.append(w)
                    temp_res.append(t)
            res = temp_res
        return res[:k]
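    # A minimal sketch (illustrative only) of what get_case_increased() computes: in essence the
    # candidate sentences are the cartesian product of each word's synonym list, truncated to k.
    # The synonym lists below are hypothetical stand-ins for processor.synonym_word_list().
    @staticmethod
    def _synonym_expansion_sketch(k=4):
        from itertools import islice, product
        synonym_lists = [["页面", "界面"], ["崩溃", "闪退"]]  # hypothetical per-word synonyms
        # -> [['页面', '崩溃'], ['页面', '闪退'], ['界面', '崩溃'], ['界面', '闪退']]
        return [list(sentence) for sentence in islice(product(*synonym_lists), k)]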
    def increase_by_simbert(self, datas, labels, datas_pre, labels_pre, type_num_test, k):
        # TODO: scale according to the final class ratios
        if len(datas_pre) == len(labels_pre):
            simbert = Simbert(config=simbert_config)
            nums = [3, 0, 0, 3, 5, 5]
            for i in range(len(datas_pre)):
                datas.append(datas_pre[i])
                labels.append(labels_pre[i])
                synonym_list = []
                temp_type = labels_pre[i].index(1)
                if nums[temp_type] > 0:
                    for word in datas_pre[i]:
                        # synonyms = gen_synonyms(word,nums[temp_type]*5,nums[temp_type])
                        synonyms = sorted(simbert.replace(sent=word, create_num=nums[temp_type]),
                                          key=lambda item: item[1],
                                          reverse=True)
                        synonym_list.append(synonyms)
                    min_len = min([len(s) for s in synonym_list])  # SimBERT may return fewer than nums[temp_type] candidates
                    data_increased = [[synonym_list[j][i][0] for j in range(len(synonym_list))] for i in range(min_len)]
                    for j in range(len(data_increased)):
                        datas.append(data_increased[j])
                        labels.append(labels_pre[i])
    def word_count(self, datas):
        # Count word frequencies and sort them in descending order to find the most frequent words.
        dic = {}
        for data in datas:
            for word in data:
                word = word.lower()  # lower-case every word (Chinese has no case) TODO
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        word_count_sorted = sorted(dic.items(), key=lambda item: item[1], reverse=True)
        return word_count_sorted  # list of (word, count) pairs sorted by count
    def word_index(self, datas, vocab_size):
        # Build the vocabulary.
        word_count_sorted = self.word_count(datas)
        word2index = {}
        # <unk> stands for out-of-vocabulary words: the vocabulary size is limited, so some words in a sentence may not be in it.
        word2index["<unk>"] = 0
        # <pad> is used to pad sentences up to a fixed length.
        word2index["<pad>"] = 1
        # The actual vocabulary size is bounded by both the number of distinct words and the requested size.
        vocab_size = min(len(word_count_sorted), vocab_size)
        for i in range(vocab_size):
            word = word_count_sorted[i][0]
            word2index[word] = i + 2  # key: word, value: its index in the vocabulary
        word2index_path = word2index_path_base + self.dataset_name + ".npy"
        np.save(word2index_path, word2index)
        return word2index, vocab_size
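    # A minimal sketch (illustrative only) of how the vocabulary saved by word_index() can be
    # loaded back elsewhere: np.save() wraps the dict in a 0-d object array, so .item() unwraps it.
    @staticmethod
    def _load_word2index_sketch(word2index_path):
        return np.load(word2index_path, allow_pickle=True).item()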
    def get_datasets_origin(self, vocab_size, max_len):
        # Note: nn.Embedding does not produce the same embeddings across runs, so the embeddings for
        # the training and test data are generated together here.
        # The vocabulary for the test data is also built from the training data.
        logging.info('Reading the raw data')
        # txt_origin, label_origin = self.read_text_from_db()
        # txt_origin = np.load(self.datas_path, allow_pickle=True).tolist()
        # label_origin = np.load(self.labels_path, allow_pickle=True).tolist()
        # logging.info('Augmenting the raw data')
        # txt_origin, label_origin = self.increase_data()
        # txt_origin = np.load(self.datas_increase_path, allow_pickle=True).tolist()
        # label_origin = np.load(self.labels_increase_path, allow_pickle=True).tolist()
        #
        # train_datas, test_datas, train_labels, test_labels = sklearn.model_selection.train_test_split(txt_origin,
        #                                                                                               label_origin,
        #                                                                                               random_state=2,
        #                                                                                               train_size=0.6,
        #                                                                                               test_size=0.4)
        # test_datas, develop_datas, test_labels, develop_labels = sklearn.model_selection.train_test_split(test_datas,
        #                                                                                                   test_labels,
        #                                                                                                   random_state=2,
        #                                                                                                   train_size=0.5,
        #                                                                                                   test_size=0.5)
        directory = "./splited_data/"
        # train_datas = np.load(directory + self.dataset_name + "_train.npy", allow_pickle=True).tolist()
        # train_labels = np.load(directory + self.dataset_name + "_label_train.npy", allow_pickle=True).tolist()
        train_datas = np.load(self.datas_increase_path, allow_pickle=True).tolist()
        train_labels = np.load(self.labels_increase_path, allow_pickle=True).tolist()
        test_datas = np.load(directory + self.dataset_name + "_test.npy", allow_pickle=True).tolist()
        test_labels = np.load(directory + self.dataset_name + "_label_test.npy", allow_pickle=True).tolist()
        develop_datas = np.load(directory + self.dataset_name + "_develop.npy", allow_pickle=True).tolist()
        develop_labels = np.load(directory + self.dataset_name + "_label_develop.npy", allow_pickle=True).tolist()
        # txt_origin, label_origin = self.increase_data()
        logging.info('Building the vocabulary')
        word_datas = copy.deepcopy(train_datas)
        word_datas.extend(develop_datas)
        word_datas.extend(test_datas)
        word2index, vocab_size = self.word_index(word_datas, vocab_size)  # vocabulary and its actual size
        logging.info('Converting words into index features')
        train_features = []
        for data in train_datas:
            feature = []
            for word in data:
                word = word.lower()  # all vocabulary entries are lower-case
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])  # out-of-vocabulary words map to <unk>
                if len(feature) == max_len:  # truncate sentences that exceed the maximum length
                    break
            # pad sentences shorter than the maximum length
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            train_features.append(feature)
        develop_features = []
        for data in develop_datas:
            feature = []
            for word in data:
                word = word.lower()  # all vocabulary entries are lower-case
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])  # out-of-vocabulary words map to <unk>
                if len(feature) == max_len:  # truncate sentences that exceed the maximum length
                    break
            # pad sentences shorter than the maximum length
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            develop_features.append(feature)
        test_features = []
        for data in test_datas:
            feature = []
            for word in data:
                word = word.lower()  # all vocabulary entries are lower-case
                if word in word2index:
                    feature.append(word2index[word])
                else:
                    feature.append(word2index["<unk>"])  # out-of-vocabulary words map to <unk>
                if len(feature) == max_len:  # truncate sentences that exceed the maximum length
                    break
            # pad sentences shorter than the maximum length
            feature = feature + [word2index["<pad>"]] * (max_len - len(feature))
            test_features.append(feature)
        return train_features, develop_features, test_features, train_labels, develop_labels, test_labels, word2index
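    # A minimal sketch (illustrative only) of the per-sentence encoding repeated three times in
    # get_datasets_origin(): map words to vocabulary indices, truncate at max_len, pad with <pad>.
    @staticmethod
    def _encode_and_pad_sketch(sentence, word2index, max_len):
        feature = [word2index.get(w.lower(), word2index["<unk>"]) for w in sentence[:max_len]]
        return feature + [word2index["<pad>"]] * (max_len - len(feature))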
    def get_datasets(self, train_features, develop_features, test_features, train_labels, develop_labels, test_labels,
                     vocab_size, embedding_size):
        # Convert the word indices to tensors; every row of train_features must have the same length,
        # otherwise this raises an error.
        train_features = torch.LongTensor(train_features)
        train_labels = torch.FloatTensor(train_labels)
        develop_features = torch.LongTensor(develop_features)
        develop_labels = torch.FloatTensor(develop_labels)
        test_features = torch.LongTensor(test_features)
        test_labels = torch.FloatTensor(test_labels)
        # Turn the indices into embeddings.
        # The vocabulary contains the two special tokens <unk> and <pad>, so its real size is vocab_size + 2.
        embed = nn.Embedding(vocab_size + 2, embedding_size)  # https://www.jianshu.com/p/63e7acc5e890
        train_features = embed(train_features)
        develop_features = embed(develop_features)
        test_features = embed(test_features)
        # Mark the input features as not requiring gradients.
        train_features = Variable(train_features,
                                  requires_grad=False)  # https://www.cnblogs.com/henuliulei/p/11363121.html
        train_datasets = torch.utils.data.TensorDataset(train_features, train_labels)
        develop_features = Variable(develop_features,
                                    requires_grad=False)  # https://www.cnblogs.com/henuliulei/p/11363121.html
        develop_datasets = torch.utils.data.TensorDataset(develop_features, develop_labels)
        test_features = Variable(test_features, requires_grad=False)
        test_datasets = torch.utils.data.TensorDataset(test_features,
                                                       test_labels)  # https://www.cnblogs.com/hahaah/p/14914603.html
        return train_datasets, develop_datasets, test_datasets
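# A minimal sketch (illustrative only, hypothetical batch_size) of how the TensorDataset objects
# returned by DataProcessor.get_datasets() would typically be consumed through a DataLoader.
def _dataloader_usage_sketch(train_datasets, batch_size=32):
    loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size, shuffle=True)
    for batch_features, batch_labels in loader:
        # batch_features: (batch_size, max_len, embedding_size), batch_labels: (batch_size, num_classes)
        pass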
def new_split(dataset_name):
    names = [dataset_name]
    directory = "./splited_data/"
    for name in names:
        dp = DataProcessor(dataset_name=name)
        txt_origin = np.load(dp.datas_path, allow_pickle=True).tolist()
        label_origin = np.load(dp.labels_path, allow_pickle=True).tolist()
        # split 60% / 20% / 20% into train / test / develop
        train_datas, test_datas, train_labels, test_labels = sklearn.model_selection.train_test_split(txt_origin,
                                                                                                       label_origin,
                                                                                                       random_state=13,
                                                                                                       train_size=0.6,
                                                                                                       test_size=0.4)
        test_datas, develop_datas, test_labels, develop_labels = sklearn.model_selection.train_test_split(test_datas,
                                                                                                           test_labels,
                                                                                                           random_state=13,
                                                                                                           train_size=0.5,
                                                                                                           test_size=0.5)
        np.save(directory + name + "_train.npy", train_datas)
        np.save(directory + name + "_label_train.npy", train_labels)
        np.save(directory + name + "_test.npy", test_datas)
        np.save(directory + name + "_label_test.npy", test_labels)
        np.save(directory + name + "_develop.npy", develop_datas)
        np.save(directory + name + "_label_develop.npy", develop_labels)
def increase_data(names):
    for name in names:
        dp = DataProcessor(dataset_name=name)
        # train_labels = np.load(dp.directory + dp.dataset_name + "_label_train.npy", allow_pickle=True).tolist()
        # test_labels = np.load(dp.directory + dp.dataset_name + "_label_test.npy", allow_pickle=True).tolist()
        train_labels = np.load("./splited_data/" + name + "_label_train.npy", allow_pickle=True).tolist()
        test_labels = np.load(dp.labels_increase_path, allow_pickle=True).tolist()
        type_num_train = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
        type_num_test = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
        for label in test_labels:
            type_num_test[label.index(1) + 1] += 1
        for label in train_labels:
            type_num_train[label.index(1) + 1] += 1
        print("Before augmentation:", type_num_train)
        print("After augmentation:", type_num_test)
def increase_test(names):
    for name in names:
        dp = DataProcessor(dataset_name=name)
        dp.increase_data()


def read_data():
    names = ["航天中认自主可控众包测试练习赛", "决赛自主可控众测web自主可控运维管理系统"]
    for name in names:
        dp = DataProcessor(dataset_name=name)
        dp.read_text_from_db()


if __name__ == '__main__':
    direc = "./splited_data/"
    names = ['趣享GIF众包测试201908试题']
    # new_split(names[0])
    # read_data()
    increase_test(names)
    increase_data(names)