import spacy
import nltk
from string import punctuation

nltk.download("stopwords")
from nltk.corpus import stopwords

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

nlp = spacy.load('en_core_web_sm')
# Normalizes the text: removes digits and punctuation, lowercases
def normalizeText(text):
    text = ''.join(c for c in text if not c.isdigit())
    text = ''.join(c for c in text if c not in punctuation).lower()
    return text
# Stop words pooled from NLTK and scikit-learn, built once at import time
STOPLIST = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS)

# Processes the text: normalizes it and removes stop words
def processText(text):
    text = normalizeText(text)
    return ' '.join(word for word in text.split() if word not in STOPLIST)
# Tokenizes the text and lemmatizes each token
def tokenization(text):
    # Run the full spaCy pipeline so lemmas are actually assigned
    doc = nlp(text)
    # spaCy v2 lemmatizes pronouns to the placeholder "-PRON-";
    # fall back to the lowercased surface form in that case
    return [tok.lemma_ if tok.lemma_ != "-PRON-" else tok.lower_ for tok in doc]
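
# A quick sanity check (the sample sentence is hypothetical, not from the
# original): processText strips digits, punctuation, and stop words, then
# tokenization lemmatizes whatever is left.
sample = "The 3 cats were running quickly through 42 gardens!"
cleaned = processText(sample)   # e.g. "cats running quickly gardens"
print(tokenization(cleaned))    # e.g. ['cat', 'run', 'quickly', 'garden']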
# scikit-learn compatible transformer that applies the cleaning step to
# every document in a batch
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [processText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
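
# A minimal sketch (not part of the original listing) of how these pieces
# might be wired together: the transformer cleans the raw strings, and
# CountVectorizer uses the spaCy-based tokenizer defined above.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

pipeline = Pipeline([
    ('clean', CleanTextTransformer()),
    ('vectorize', CountVectorizer(tokenizer=tokenization)),
])
features = pipeline.fit_transform(["A first document.", "And a 2nd one!"])
print(features.shape)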