visualize_3_3.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. def top_words(s: TextSeries, normalize=False) -> pd.Series:
  2. r"""
  3. Return a pandas series with index the top words and as value the count.
  4. Tokenization: split by space and remove all punctuations that are not
  5. between characters.
  6. Parameters
  7. ----------
  8. normalize : bool, optional, default=False.
  9. When set to true, return normalized values.
  10. Examples
  11. --------
  12. >>> import pandas as pd
  13. >>> import texthero as hero
  14. >>> s = pd.Series("one two two three three three")
  15. >>> hero.top_words(s)
  16. three 3
  17. two 2
  18. one 1
  19. dtype: int64
  20. """
  21. # Replace all punctuation that are NOT in-between chacarters
  22. # This means, they have either a non word-bounding \B, are at the start ^, or at the end $
  23. # As re.sub replace all and not just the matching group, add matching parenthesis to the character
  24. # to keep during replacement.
  25. # TODO replace it with tokenizer.
  26. pattern = (
  27. rf"((\w)[{string.punctuation}](?:\B|$)|(?:^|\B)[{string.punctuation}](\w))"
  28. )
  29. return (
  30. s.str.replace(
  31. pattern, r"\2 \3"
  32. ) # \2 and \3 permits to keep the character around the punctuation.
  33. .str.split() # now split by space
  34. .explode() # one word for each line
  35. .value_counts(normalize=normalize)
  36. )