character_processor.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. #!/usr/bin/python
  2. #coding=utf-8
  3. import pyltp
  4. from pyltp import Segmentor,Postagger
  5. import os
  6. class DataProcessor(object):
  7. def __init__(self, filepath_cws, filepath_pos, user_dict_filepath = None):
  8. super(DataProcessor, self).__init__()
  9. self.my_segmentor = Segmentor(filepath_cws,user_dict_filepath)
  10. self.postagger = Postagger(filepath_pos)
  11. def __del__(self):
  12. self.my_segmentor.release()
  13. self.postagger.release()
  14. def segmentor(self, sentence):
  15. ltpword = self.my_segmentor.segment(sentence)
  16. ltpword_list = list(ltpword)
  17. return ltpword_list
  18. def stop_word_list(self, filepath):
  19. stop_word = [line.strip() for line in open(filepath, "r", encoding = "utf-8").readlines()]
  20. return stop_word
  21. def clean_word_list(self, origin_data, stop_word):
  22. clean_word = [word for word in origin_data if word not in stop_word]
  23. postags = list(self.postagger.postag(clean_word))
  24. clean_word_after_postagger = []
  25. for i in range(len(postags)):
  26. if postags[i] == "n" or postags[i] == "v":
  27. clean_word_after_postagger.append(clean_word[i])
  28. return clean_word_after_postagger
  29. def synonym_word_dict(self, filepath):
  30. synonym_dict = []
  31. for line in open(filepath, "r", encoding = "utf-8"):
  32. items = line.replace("\n", "").split(" ")
  33. index = items[0]
  34. if(index[-1] == "="):
  35. synonym_dict.append(items[1:])
  36. return synonym_dict
  37. def synonym_replace_word(self, word, synonym_dict):
  38. for each in synonym_dict:
  39. for w in each:
  40. if w == word:
  41. return each[0] # 同义词替换为同义词表中的第一个词
  42. return word
  43. def synonym_replace_sentence(self, clean_word, synonym_dict):
  44. for i in range(len(clean_word)):
  45. clean_word[i] = self.synonym_replace_word(clean_word[i], synonym_dict)
  46. return clean_word
  47. if __name__ == "__main__":
  48. processor = DataProcessor("./ltp_data_v3.4.0/cws.model", "./ltp_data_v3.4.0/pos.model")
  49. myList = processor.segmentor("我们是人工智能研究所,主要致力于分享人工智能方面的技术知识,欢迎大家一起学习。")
  50. stop_word = processor.stop_word_list("./ltp_data_v3.4.0/stop_word.txt")
  51. clean_word = processor.clean_word_list(myList, stop_word)
  52. print(clean_word)
  53. synonym_dict = processor.synonym_word_dict("./ltp_data_v3.4.0/HIT-IRLab-同义词词林.txt")
  54. clean_word_after_synonym = processor.synonym_replace_sentence(clean_word, synonym_dict)
  55. print(clean_word_after_synonym)