LiuFan
/
PrivacyScanData


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344
							def top_words(s: TextSeries, normalize=False) -> pd.Series:
							    r"""
							    Return a pandas series with index the top words and as value the count.

							    Tokenization: split by space and remove all punctuations that are not
							    between characters.

							    Parameters
							    ----------
							    s : TextSeries
							        Series of strings to tokenize and count.
							    normalize : bool, optional, default=False.
							        When set to true, return normalized values (relative
							        frequencies) instead of raw counts.

							    Returns
							    -------
							    pd.Series
							        Words as index and their count (or frequency) as value, as
							        produced by ``Series.value_counts`` (most frequent first).

							    Examples
							    --------
							    >>> import pandas as pd
							    >>> import texthero as hero
							    >>> s = pd.Series("one two two three three three")
							    >>> hero.top_words(s)
							    three    3
							    two      2
							    one      1
							    dtype: int64

							    """

							    # Replace all punctuation that is NOT in-between characters.
							    # Such a punctuation sign is either followed by a non-word-boundary \B
							    # or the end $, or preceded by the start ^ or a non-word-boundary \B.
							    # The substitution replaces the whole match and not just the
							    # punctuation sign, so the neighbouring word character is captured
							    # (groups 2 and 3) and written back by the replacement.

							    # TODO replace it with tokenizer.

							    pattern = (
							        rf"((\w)[{string.punctuation}](?:\B|$)|(?:^|\B)[{string.punctuation}](\w))"
							    )

							    # regex=True is required: since pandas 2.0 ``Series.str.replace``
							    # defaults to a literal match, which would leave every punctuation
							    # sign in place and count e.g. "two" and "two!" as different words.
							    # \2 and \3 keep the character around the punctuation.
							    cleaned = s.str.replace(pattern, r"\2 \3", regex=True)

							    return (
							        cleaned.str.split()  # now split by space
							        .explode()  # one word for each line
							        .value_counts(normalize=normalize)
							    )