# nlp_utils.py
  1. from nltk.corpus import stopwords
  2. from nltk.tokenize import word_tokenize, sent_tokenize
  3. from crest.helper.utils import read_file
  4. stop_words = set(stopwords.words('english'))
  5. import string
  6. def is_whitespace(c, use_space=True):
  7. if (c == " " and use_space) or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
  8. return True
  9. return False
  10. def compact_text(paragraph_text):
  11. doc_tokens = []
  12. prev_is_whitespace = True
  13. for c in paragraph_text:
  14. if is_whitespace(c):
  15. prev_is_whitespace = True
  16. else:
  17. if prev_is_whitespace:
  18. doc_tokens.append(' ')
  19. doc_tokens.append(c)
  20. prev_is_whitespace = False
  21. else:
  22. doc_tokens.append(c)
  23. prev_is_whitespace = False
  24. return ''.join(doc_tokens)
  25. # removes punctutions and stop words from the text
  26. def normalize_text(text):
  27. state = text.lower()
  28. out = state.translate(str.maketrans('', '', string.punctuation))
  29. out = word_tokenize(out)
  30. s_ws = [w for w in out if not w in stop_words]
  31. return s_ws