# text_processing.py

import spacy
import nltk
from string import punctuation

nltk.download("stopwords")
from nltk.corpus import stopwords

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the small English model; requires `python -m spacy download en_core_web_sm`
nlp = spacy.load('en_core_web_sm')
# Normalizes the text: removes digits and punctuation, lowercases
def normalizeText(text):
    text = ''.join(c for c in text if not c.isdigit())
    text = ''.join(c for c in text if c not in punctuation).lower()
    return text
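
# Illustrative call (input string is made up):
# >>> normalizeText("Hello, World 42!")
# 'hello world '   <- digits and punctuation dropped; stray spaces remain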

# Combined stop-word list from NLTK and scikit-learn, built once at import time
STOPLIST = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS)

# Processes the text: normalizes it and removes stop words
def processText(text):
    text = normalizeText(text)
    text = ' '.join(word for word in text.split() if word not in STOPLIST)
    return text
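
# Illustrative call (input string is made up); "the" is dropped as a stop word:
# >>> processText("The 3 quick brown foxes!!")
# 'quick brown foxes'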

# Tokenizes the text with spaCy and lemmatizes each token
def tokenization(text):
    tokens = nlp(text)  # run the full pipeline so lemmas are populated
    lemmas = []
    for tok in tokens:
        # spaCy 2.x lemmatizes pronouns to the placeholder "-PRON-"; keep the
        # original text in that case (newer spaCy versions return real lemmas)
        lemmas.append(tok.lemma_ if tok.lemma_ != "-PRON-" else tok.text)
    return lemmas
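
# Illustrative call (output assumes the en_core_web_sm model; exact lemmas
# can vary across spaCy versions):
# >>> tokenization("the cats ran")
# ['the', 'cat', 'run']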

# scikit-learn transformer that applies processText to every document
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [processText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
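
# --- Usage sketch (illustrative; not part of the original file) ---
# A minimal example of wiring these pieces into a scikit-learn pipeline:
# CleanTextTransformer cleans the raw strings, CountVectorizer tokenizes them
# with the custom `tokenization` function above, and a classifier sits on top.
# The sample texts and labels are made up for illustration.
if __name__ == "__main__":
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression

    train_texts = ["Win a free prize now!!!",
                   "Meeting at 10am tomorrow",
                   "Free entry to a weekly competition",
                   "See you at lunch"]
    train_labels = [1, 0, 1, 0]  # 1 = spam, 0 = ham (made-up labels)

    pipe = Pipeline([
        ('clean', CleanTextTransformer()),
        ('vectorize', CountVectorizer(tokenizer=tokenization)),
        ('classify', LogisticRegression()),
    ])
    pipe.fit(train_texts, train_labels)
    print(pipe.predict(["Claim your free prize", "Lunch meeting moved"]))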