visualize_3_2.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. def wordcloud(
  2. s: TextSeries,
  3. font_path: str = None,
  4. width: int = 400,
  5. height: int = 200,
  6. max_words=200,
  7. mask=None,
  8. contour_width=0,
  9. contour_color="PAPAYAWHIP",
  10. min_font_size=4,
  11. background_color="PAPAYAWHIP",
  12. max_font_size=None,
  13. relative_scaling="auto",
  14. colormap=None,
  15. return_figure=False,
  16. ):
  17. """
  18. Plot wordcloud image using WordCloud from word_cloud package.
  19. Most of the arguments are very similar if not equal to the mother
  20. function. In constrast, all words are taken into account when computing
  21. the wordcloud, inclusive stopwords. They can be easily removed with
  22. preprocessing.remove_stopwords.
  23. Words are computed using generate_from_frequencies.
  24. To reduce blur in the wordcloud image, `width` and `height` should be at
  25. least 400.
  26. Parameters
  27. ----------
  28. s : :class:`texthero._types.TextSeries`
  29. font_path : str, optional, default=None
  30. Font path to the font that will be used (OTF or TTF). Defaults to
  31. DroidSansMono path on a Linux machine. If you are on another OS or
  32. don't have this font, you need to adjust this path.
  33. width : int, optional, default=400
  34. Width of the canvas.
  35. height : int, optional, default=200
  36. Height of the canvas.
  37. max_words : int, optional, default=200
  38. The maximum number of words.
  39. mask : nd-array or None, optional, default=None
  40. When set, gives a binary mask on where to draw words. When set, width
  41. and height will be ignored and the shape of mask will be used instead.
  42. All white (#FF or #FFFFFF) entries will be considerd "masked out"
  43. while other entries will be free to draw on.
  44. contour_width: float, optional, default=0
  45. If mask is not None and contour_width > 0, draw the mask contour.
  46. contour_color: str, optional, default="PAPAYAWHIP"
  47. Mask contour color.
  48. min_font_size : int, optional, default=4
  49. Smallest font size to use. Will stop when there is no more room in
  50. this size.
  51. background_color : str, optional, default="PAPAYAWHIP"
  52. Background color for the word cloud image.
  53. max_font_size : int or None, optional, default=None
  54. Maximum font size for the largest word. If None, height of the image
  55. is used.
  56. relative_scaling : float, optional, default="auto"
  57. Importance of relative word frequencies for font-size. With
  58. relative_scaling=0, only word-ranks are considered. With
  59. relative_scaling=1, a word that is twice as frequent will have twice
  60. the size. If you want to consider the word frequencies and not only
  61. their rank, relative_scaling around .5 often looks good.
  62. If 'auto' it will be set to 0.5 unless repeat is true, in which
  63. case it will be set to 0.
  64. colormap : string or matplotlib colormap, optional, default="viridis"
  65. Matplotlib colormap to randomly draw colors from for each word.
  66. """
  67. text = s.str.cat(sep=" ")
  68. if colormap is None:
  69. # Custom palette.
  70. # TODO move it under tools.
  71. corn = (255.0 / 256, 242.0 / 256, 117.0 / 256)
  72. mango_tango = (255.0 / 256, 140.0 / 256, 66.0 / 256)
  73. crayola = (63.0 / 256, 136.0 / 256, 197.0 / 256)
  74. crimson = (215.0 / 256, 38.0 / 256, 61.0 / 256)
  75. oxford_blue = (2.0 / 256, 24.0 / 256, 43.0 / 256)
  76. texthero_cm = lsg.from_list(
  77. "texthero", [corn, mango_tango, crayola, crimson, oxford_blue]
  78. )
  79. colormap = texthero_cm
  80. words = s.str.cat(sep=" ").split()
  81. wordcloud = WordCloud(
  82. font_path=font_path,
  83. width=width,
  84. height=height,
  85. max_words=max_words,
  86. mask=mask,
  87. contour_width=contour_width,
  88. contour_color=contour_color,
  89. min_font_size=min_font_size,
  90. background_color=background_color,
  91. max_font_size=max_font_size,
  92. relative_scaling=relative_scaling,
  93. colormap=colormap,
  94. # stopwords=[], # TODO. Will use generate from frequencies.
  95. # normalize_plurals=False, # TODO.
  96. ).generate_from_frequencies(dict(Counter(words)))
  97. # fig = px.imshow(wordcloud)
  98. # fig.show()
  99. fig, ax = plt.subplots(figsize=(20, 10))
  100. ax.imshow(wordcloud, interpolation="bilinear")
  101. ax.axis("off")
  102. if return_figure:
  103. return fig