def top_words(s: TextSeries, normalize=False) -> pd.Series:
    r"""
    Return a pandas Series whose index is the top words and whose values are
    their counts.

    Tokenization: split by spaces and remove all punctuation that is not
    between word characters.

    Parameters
    ----------
    normalize : bool, optional, default=False
        When set to True, return relative frequencies instead of raw counts.

    Examples
    --------
    >>> import pandas as pd
    >>> import texthero as hero
    >>> s = pd.Series("one two two three three three")
    >>> hero.top_words(s)
    three    3
    two      2
    one      1
    dtype: int64
    """

    # Replace all punctuation that is NOT in between word characters.
    # This means the punctuation either follows a word character and is
    # followed by a non-word-boundary \B or the end of the string $, or it
    # precedes a word character and is preceded by the start of the string ^
    # or a non-word-boundary \B.
    # As the replacement removes the whole match and not just the punctuation,
    # the adjacent word characters are captured in groups so they can be kept
    # during replacement.

    # TODO: replace this with a proper tokenizer.

    pattern = (
        rf"((\w)[{string.punctuation}](?:\B|$)|(?:^|\B)[{string.punctuation}](\w))"
    )

    return (
        s.str.replace(
            pattern, r"\2 \3", regex=True
        )  # \2 and \3 keep the characters around the punctuation.
        .str.split()  # now split by whitespace
        .explode()  # one word per row
        .value_counts(normalize=normalize)
    )
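
# Illustrative sketch (an addition, not part of the library code): assuming the
# usual module-level imports (pandas as pd, string), the pattern above strips
# punctuation only when it is not enclosed by word characters, so contractions
# survive tokenization while leading/trailing punctuation is dropped:
#
#   >>> s = pd.Series("wow! wow! wow! it's it's fine.")
#   >>> top_words(s)
#   wow     3
#   it's    2
#   fine    1
#   dtype: int64
#
# The "!" and "." are removed because they follow a word character and are
# themselves followed by a space or the end of the string, whereas the
# apostrophe in "it's" is kept because it is surrounded by word characters.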