1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
def top_words(s: TextSeries, normalize=False) -> pd.Series:
    r"""
    Return a pandas series with index the top words and as value the count.

    Tokenization: punctuation attached to a word but not located between
    two word characters is replaced by a space (the comma in ``stop,`` is
    dropped, the apostrophe in ``don't`` is kept), then the text is split
    on whitespace.

    Parameters
    ----------
    s : TextSeries
        Series of strings whose words are counted.
    normalize : bool, optional, default=False.
        When set to true, return normalized values.

    Returns
    -------
    pd.Series
        Words as index and their count (or relative frequency when
        ``normalize`` is true) as values, most frequent first.

    Notes
    -----
    Only one punctuation character is consumed per match, so a run of
    consecutive punctuation (``"hello!!"``) is only partially stripped and
    punctuation standing alone between spaces is kept as its own token.

    Examples
    --------
    >>> import pandas as pd
    >>> import texthero as hero
    >>> s = pd.Series("one two two three three three")
    >>> hero.top_words(s)
    three    3
    two      2
    one      1
    dtype: int64
    """
    # Imported locally so the block does not depend on the module-level
    # import list.
    import re

    # Escape the punctuation before embedding it in a character class:
    # unescaped, the ``\]`` sequence of string.punctuation is read as an
    # escaped bracket and the backslash itself is never matched.
    punctuation = re.escape(string.punctuation)

    # Match punctuation that is NOT in-between characters: it is either
    # followed by a non word-boundary \B / the end $, or preceded by the
    # start ^ / a non word-boundary \B. The neighbouring word character is
    # captured (groups 2 and 3) because the substitution replaces the whole
    # match, not just the punctuation.
    # TODO replace it with tokenizer.
    pattern = (
        rf"((\w)[{punctuation}](?:\B|$)|(?:^|\B)[{punctuation}](\w))"
    )

    # regex=True must be explicit: since pandas 2.0 ``str.replace`` treats
    # the pattern as a literal string by default, which silently disables
    # the punctuation removal.
    cleaned = s.str.replace(
        pattern, r"\2 \3", regex=True
    )  # \2 and \3 keep the character around the punctuation.

    return (
        cleaned.str.split()  # now split by whitespace
        .explode()  # one word for each line
        .value_counts(normalize=normalize)
    )
|