# Repository: Sjim/exam-question-classification
							#!/usr/bin/python
#coding=utf-8

import pyltp
from pyltp import Segmentor,Postagger
import os


class DataProcessor:
    """Text preprocessing helper built on pyltp's Segmentor and Postagger.

    Provides word segmentation, stop-word filtering combined with
    part-of-speech filtering, and synonym normalisation driven by a
    synonym-group file.
    """

    def __init__(self, filepath_cws, filepath_pos, user_dict_filepath=None):
        """Load the segmentation and part-of-speech models.

        Args:
            filepath_cws: Path handed to ``Segmentor`` as its model file.
            filepath_pos: Path handed to ``Postagger`` as its model file.
            user_dict_filepath: Optional second argument for ``Segmentor``
                (presumably a user lexicon path; confirm against the
                installed pyltp version).
        """
        super().__init__()
        self.my_segmentor = Segmentor(filepath_cws, user_dict_filepath)
        self.postagger = Postagger(filepath_pos)

    def __del__(self):
        """Release whichever models were actually loaded."""
        # __init__ may have raised before one (or both) attributes were set,
        # so look them up defensively instead of assuming they exist.
        for attr in ("my_segmentor", "postagger"):
            model = getattr(self, attr, None)
            if model is not None:
                model.release()

    def segmentor(self, sentence: str) -> list:
        """Segment *sentence* and return the resulting words as a list."""
        return list(self.my_segmentor.segment(sentence))

    def stop_word_list(self, filepath: str) -> list:
        """Read a UTF-8 stop-word file, one entry per line.

        Returns:
            The lines of the file with surrounding whitespace stripped
            (blank lines are kept as empty strings).
        """
        with open(filepath, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle]

    def clean_word_list(self, origin_data, stop_word) -> list:
        """Drop stop words, then keep only words tagged ``"n"`` or ``"v"``.

        Args:
            origin_data: Iterable of segmented words.
            stop_word: Collection of words to discard.

        Returns:
            The surviving words, in their original order.
        """
        # Set membership is O(1); the caller typically passes a long list.
        stop_words = set(stop_word)
        clean_word = [word for word in origin_data if word not in stop_words]
        if not clean_word:
            # Nothing left to tag.
            return []
        postags = self.postagger.postag(clean_word)
        # Tags come back in the same order as the words that were passed in.
        return [
            word for word, tag in zip(clean_word, postags)
            if tag in ("n", "v")
        ]

    def synonym_word_dict(self, filepath: str) -> list:
        """Load synonym groups from a UTF-8 file.

        Each line is split on single spaces. The first field is an index
        code; only lines whose code ends with ``"="`` are kept, and the
        remaining fields form one synonym group.

        Returns:
            A list of synonym groups, each a list of words.
        """
        synonym_dict = []
        with open(filepath, "r", encoding="utf-8") as handle:
            for line in handle:
                items = line.replace("\n", "").split(" ")
                # endswith() is safe on blank lines, where the code is "".
                if items[0].endswith("="):
                    synonym_dict.append(items[1:])
        return synonym_dict

    def synonym_replace_word(self, word, synonym_dict):
        """Return the canonical form of *word*.

        The canonical form is the first entry of the first synonym group
        that contains *word*; if no group contains it, *word* is returned
        unchanged.
        """
        for group in synonym_dict:
            if word in group:
                # Replace the synonym with the first word of its group.
                return group[0]
        return word

    def synonym_replace_sentence(self, clean_word, synonym_dict):
        """Replace every word of *clean_word* by its canonical synonym.

        The list is modified in place and also returned.
        """
        if not clean_word:
            return clean_word
        # Build the word -> canonical lookup once instead of rescanning all
        # groups per word. setdefault keeps the first group that mentions a
        # word, matching synonym_replace_word.
        canonical = {}
        for group in synonym_dict:
            for member in group:
                canonical.setdefault(member, group[0])
        for position, word in enumerate(clean_word):
            clean_word[position] = canonical.get(word, word)
        return clean_word

if __name__ == "__main__":
    # Demo: push one sample sentence through the full preprocessing pipeline.
    cws_model = "./ltp_data_v3.4.0/cws.model"
    pos_model = "./ltp_data_v3.4.0/pos.model"
    pipeline = DataProcessor(cws_model, pos_model)

    # Step 1: word segmentation.
    sample = "我们是人工智能研究所，主要致力于分享人工智能方面的技术知识，欢迎大家一起学习。"
    tokens = pipeline.segmentor(sample)

    # Step 2: stop-word removal plus part-of-speech filtering.
    stop_words = pipeline.stop_word_list("./ltp_data_v3.4.0/stop_word.txt")
    kept_words = pipeline.clean_word_list(tokens, stop_words)
    print(kept_words)

    # Step 3: synonym normalisation (rewrites kept_words in place).
    synonym_groups = pipeline.synonym_word_dict("./ltp_data_v3.4.0/HIT-IRLab-同义词词林.txt")
    normalised = pipeline.synonym_replace_sentence(kept_words, synonym_groups)
    print(normalised)