# visualize_3.py (9.0 KB)
  1. """
  2. Visualize insights and statistics of a text-based Pandas DataFrame.
  3. """
  4. import pandas as pd
  5. import numpy as np
  6. import plotly.express as px
  7. from wordcloud import WordCloud
  8. from texthero import preprocessing
  9. from texthero._types import TextSeries, InputSeries
  10. import string
  11. from matplotlib.colors import LinearSegmentedColormap as lsg
  12. import matplotlib.pyplot as plt
  13. from collections import Counter
  14. def scatterplot(
  15. df: pd.DataFrame,
  16. col: str,
  17. color: str = None,
  18. hover_name: str = None,
  19. hover_data: list = None,
  20. title="",
  21. return_figure=False,
  22. ):
  23. """
  24. Show scatterplot of DataFrame column using python plotly scatter.
  25. Plot the values in column col. For example, if every cell in df[col]
  26. is a list of three values (e.g. from doing PCA with 3 components),
  27. a 3D-Plot is created and every cell entry [x, y, z] is visualized
  28. as the point (x, y, z).
  29. Parameters
  30. ----------
  31. df: DataFrame with a column to be visualized.
  32. col: str
  33. The name of the column of the DataFrame to use for x and y (and z)
  34. axis.
  35. color: str, optional, default=None
  36. Name of the column to use for coloring (rows with same value get same
  37. color).
  38. hover_name: str, optional, default=None
  39. Name of the column to supply title of hover data when hovering over a
  40. point.
  41. hover_data: List[str], optional, default=[]
  42. List of column names to supply data when hovering over a point.
  43. title: str, default to "".
  44. Title of the plot.
  45. return_figure: bool, optional, default=False
  46. Function returns the figure instead of showing it if set to True.
  47. Examples
  48. --------
  49. >>> import texthero as hero
  50. >>> import pandas as pd
  51. >>> df = pd.DataFrame(["Football, Sports, Soccer",
  52. ... "music, violin, orchestra", "football, fun, sports",
  53. ... "music, fun, guitar"], columns=["texts"])
  54. >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize)
  55. >>> df["pca"] = (
  56. ... hero.tfidf(df["texts"])
  57. ... .pipe(hero.pca, n_components=3)
  58. ... )
  59. >>> df["topics"] = (
  60. ... hero.tfidf(df["texts"])
  61. ... .pipe(hero.kmeans, n_clusters=2)
  62. ... )
  63. >>> hero.scatterplot(df, col="pca", color="topics",
  64. ... hover_data=["texts"]) # doctest: +SKIP
  65. """
  66. plot_values = np.stack(df[col], axis=1)
  67. dimension = len(plot_values)
  68. if dimension < 2 or dimension > 3:
  69. raise ValueError(
  70. "The column you want to visualize has dimension < 2 or dimension > 3."
  71. " The function can only visualize 2- and 3-dimensional data."
  72. )
  73. if dimension == 2:
  74. x, y = plot_values[0], plot_values[1]
  75. fig = px.scatter(
  76. df,
  77. x=x,
  78. y=y,
  79. color=color,
  80. hover_data=hover_data,
  81. title=title,
  82. hover_name=hover_name,
  83. )
  84. else:
  85. x, y, z = plot_values[0], plot_values[1], plot_values[2]
  86. fig = px.scatter_3d(
  87. df,
  88. x=x,
  89. y=y,
  90. z=z,
  91. color=color,
  92. hover_data=hover_data,
  93. title=title,
  94. hover_name=hover_name,
  95. )
  96. if return_figure:
  97. return fig
  98. else:
  99. fig.show()
  100. """
  101. Wordcloud
  102. """
  103. @InputSeries(TextSeries)
  104. def wordcloud(
  105. s: TextSeries,
  106. font_path: str = None,
  107. width: int = 400,
  108. height: int = 200,
  109. max_words=200,
  110. mask=None,
  111. contour_width=0,
  112. contour_color="PAPAYAWHIP",
  113. min_font_size=4,
  114. background_color="PAPAYAWHIP",
  115. max_font_size=None,
  116. relative_scaling="auto",
  117. colormap=None,
  118. return_figure=False,
  119. ):
  120. """
  121. Plot wordcloud image using WordCloud from word_cloud package.
  122. Most of the arguments are very similar if not equal to the mother
  123. function. In constrast, all words are taken into account when computing
  124. the wordcloud, inclusive stopwords. They can be easily removed with
  125. preprocessing.remove_stopwords.
  126. Words are computed using generate_from_frequencies.
  127. To reduce blur in the wordcloud image, `width` and `height` should be at
  128. least 400.
  129. Parameters
  130. ----------
  131. s : :class:`texthero._types.TextSeries`
  132. font_path : str, optional, default=None
  133. Font path to the font that will be used (OTF or TTF). Defaults to
  134. DroidSansMono path on a Linux machine. If you are on another OS or
  135. don't have this font, you need to adjust this path.
  136. width : int, optional, default=400
  137. Width of the canvas.
  138. height : int, optional, default=200
  139. Height of the canvas.
  140. max_words : int, optional, default=200
  141. The maximum number of words.
  142. mask : nd-array or None, optional, default=None
  143. When set, gives a binary mask on where to draw words. When set, width
  144. and height will be ignored and the shape of mask will be used instead.
  145. All white (#FF or #FFFFFF) entries will be considerd "masked out"
  146. while other entries will be free to draw on.
  147. contour_width: float, optional, default=0
  148. If mask is not None and contour_width > 0, draw the mask contour.
  149. contour_color: str, optional, default="PAPAYAWHIP"
  150. Mask contour color.
  151. min_font_size : int, optional, default=4
  152. Smallest font size to use. Will stop when there is no more room in
  153. this size.
  154. background_color : str, optional, default="PAPAYAWHIP"
  155. Background color for the word cloud image.
  156. max_font_size : int or None, optional, default=None
  157. Maximum font size for the largest word. If None, height of the image
  158. is used.
  159. relative_scaling : float, optional, default="auto"
  160. Importance of relative word frequencies for font-size. With
  161. relative_scaling=0, only word-ranks are considered. With
  162. relative_scaling=1, a word that is twice as frequent will have twice
  163. the size. If you want to consider the word frequencies and not only
  164. their rank, relative_scaling around .5 often looks good.
  165. If 'auto' it will be set to 0.5 unless repeat is true, in which
  166. case it will be set to 0.
  167. colormap : string or matplotlib colormap, optional, default="viridis"
  168. Matplotlib colormap to randomly draw colors from for each word.
  169. """
  170. text = s.str.cat(sep=" ")
  171. if colormap is None:
  172. # Custom palette.
  173. # TODO move it under tools.
  174. corn = (255.0 / 256, 242.0 / 256, 117.0 / 256)
  175. mango_tango = (255.0 / 256, 140.0 / 256, 66.0 / 256)
  176. crayola = (63.0 / 256, 136.0 / 256, 197.0 / 256)
  177. crimson = (215.0 / 256, 38.0 / 256, 61.0 / 256)
  178. oxford_blue = (2.0 / 256, 24.0 / 256, 43.0 / 256)
  179. texthero_cm = lsg.from_list(
  180. "texthero", [corn, mango_tango, crayola, crimson, oxford_blue]
  181. )
  182. colormap = texthero_cm
  183. words = s.str.cat(sep=" ").split()
  184. wordcloud = WordCloud(
  185. font_path=font_path,
  186. width=width,
  187. height=height,
  188. max_words=max_words,
  189. mask=mask,
  190. contour_width=contour_width,
  191. contour_color=contour_color,
  192. min_font_size=min_font_size,
  193. background_color=background_color,
  194. max_font_size=max_font_size,
  195. relative_scaling=relative_scaling,
  196. colormap=colormap,
  197. # stopwords=[], # TODO. Will use generate from frequencies.
  198. # normalize_plurals=False, # TODO.
  199. ).generate_from_frequencies(dict(Counter(words)))
  200. # fig = px.imshow(wordcloud)
  201. # fig.show()
  202. fig, ax = plt.subplots(figsize=(20, 10))
  203. ax.imshow(wordcloud, interpolation="bilinear")
  204. ax.axis("off")
  205. if return_figure:
  206. return fig
  207. @InputSeries(TextSeries)
  208. def top_words(s: TextSeries, normalize=False) -> pd.Series:
  209. r"""
  210. Return a pandas series with index the top words and as value the count.
  211. Tokenization: split by space and remove all punctuations that are not
  212. between characters.
  213. Parameters
  214. ----------
  215. normalize : bool, optional, default=False.
  216. When set to true, return normalized values.
  217. Examples
  218. --------
  219. >>> import pandas as pd
  220. >>> import texthero as hero
  221. >>> s = pd.Series("one two two three three three")
  222. >>> hero.top_words(s)
  223. three 3
  224. two 2
  225. one 1
  226. dtype: int64
  227. """
  228. # Replace all punctuation that are NOT in-between chacarters
  229. # This means, they have either a non word-bounding \B, are at the start ^, or at the end $
  230. # As re.sub replace all and not just the matching group, add matching parenthesis to the character
  231. # to keep during replacement.
  232. # TODO replace it with tokenizer.
  233. pattern = (
  234. rf"((\w)[{string.punctuation}](?:\B|$)|(?:^|\B)[{string.punctuation}](\w))"
  235. )
  236. return (
  237. s.str.replace(
  238. pattern, r"\2 \3"
  239. ) # \2 and \3 permits to keep the character around the punctuation.
  240. .str.split() # now split by space
  241. .explode() # one word for each line
  242. .value_counts(normalize=normalize)
  243. )