1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
def top_words(s: TextSeries, normalize=False) -> pd.Series:
    r"""
    Return a pandas series with index the top words and as value the count.

    Tokenization: split by whitespace and remove all punctuation that is
    not between two word characters. Punctuation inside a word (for
    example the apostrophe in ``don't``) is therefore kept, while leading
    and trailing punctuation (``"hello,"``, ``"(word"``) is stripped.

    Parameters
    ----------
    s : TextSeries
        Series of strings to tokenize and count.
    normalize : bool, optional, default=False.
        When set to true, return normalized values (relative frequencies)
        instead of raw counts.

    Returns
    -------
    pd.Series
        Index is the distinct words, values are their counts (or relative
        frequencies when ``normalize`` is true), as produced by
        ``Series.value_counts``.

    Examples
    --------
    >>> import pandas as pd
    >>> import texthero as hero
    >>> s = pd.Series("one two two three three three")
    >>> hero.top_words(s)
    three    3
    two      2
    one      1
    dtype: int64
    """
    # Local import: the module-level import block is outside this function.
    import re

    # Escape the punctuation before embedding it in a character class.
    # Unescaped, the "\]" sequence inside string.punctuation is read as an
    # escaped "]", which silently drops the backslash from the set.
    punctuation = re.escape(string.punctuation)

    # Match punctuation that is NOT in-between word characters, i.e. it is
    # followed by a non-word-boundary (\B) or the end of the string, or it
    # is preceded by the start of the string or a non-word-boundary.
    # The neighbouring word character is captured (groups 2 and 3) so the
    # replacement can put it back.
    pattern = (
        rf"((\w)[{punctuation}](?:\B|$)|(?:^|\B)[{punctuation}](\w))"
    )

    # regex=True must be explicit: since pandas 2.0 ``Series.str.replace``
    # treats the pattern as a literal string by default, which would leave
    # all punctuation in place.
    cleaned = s.str.replace(pattern, r"\2 \3", regex=True)

    return (
        cleaned.str.split()  # tokenize on whitespace
        .explode()  # one word per row
        .value_counts(normalize=normalize)
    )
|