# text_processing.py

import spacy
import nltk
from string import punctuation

nltk.download("stopwords")
from nltk.corpus import stopwords

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the small English model; requires `python -m spacy download en_core_web_sm`
nlp = spacy.load('en_core_web_sm')
# Normalizes the text: removes digits and punctuation, lowercases
def normalizeText(text):
    text = ''.join(c for c in text if not c.isdigit())
    text = ''.join(c for c in text if c not in punctuation).lower()
    return text
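
# Illustrative call (input string is made up):
# >>> normalizeText("Hello, World 42!")
# 'hello world '   <- digits and punctuation dropped; stray spaces remain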

# Combined stop-word list from NLTK and scikit-learn, built once at import time
STOPLIST = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS)

# Processes the text: normalizes it and removes stop words
def processText(text):
    text = normalizeText(text)
    text = ' '.join(word for word in text.split() if word not in STOPLIST)
    return text
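
# Illustrative call (input string is made up); "the" is dropped as a stop word:
# >>> processText("The 3 quick brown foxes!!")
# 'quick brown foxes'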

# Tokenizes the text with spaCy and lemmatizes each token
def tokenization(text):
    tokens = nlp(text)  # run the full pipeline so lemmas are populated
    lemmas = []
    for tok in tokens:
        # spaCy 2.x lemmatizes pronouns to the placeholder "-PRON-"; keep the
        # original text in that case (newer spaCy versions return real lemmas)
        lemmas.append(tok.lemma_ if tok.lemma_ != "-PRON-" else tok.text)
    return lemmas
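
# Illustrative call (output assumes the en_core_web_sm model; exact lemmas
# can vary across spaCy versions):
# >>> tokenization("the cats ran")
# ['the', 'cat', 'run']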

# scikit-learn transformer that applies processText to every document
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [processText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
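
# --- Usage sketch (illustrative; not part of the original file) ---
# A minimal example of wiring these pieces into a scikit-learn pipeline:
# CleanTextTransformer cleans the raw strings, CountVectorizer tokenizes them
# with the custom `tokenization` function above, and a classifier sits on top.
# The sample texts and labels are made up for illustration.
if __name__ == "__main__":
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression

    train_texts = ["Win a free prize now!!!",
                   "Meeting at 10am tomorrow",
                   "Free entry to a weekly competition",
                   "See you at lunch"]
    train_labels = [1, 0, 1, 0]  # 1 = spam, 0 = ham (made-up labels)

    pipe = Pipeline([
        ('clean', CleanTextTransformer()),
        ('vectorize', CountVectorizer(tokenizer=tokenization)),
        ('classify', LogisticRegression()),
    ])
    pipe.fit(train_texts, train_labels)
    print(pipe.predict(["Claim your free prize", "Lunch meeting moved"]))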