__init__.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. VERSION = (0, 1)
  2. __version__ = '.'.join(map(str, VERSION))
  3. DATE = "2014-02-09"
  4. """
Sources come from the django-haystack project.
This contrib is meant to be a standalone utility that can be used without haystack.
  7. """
  8. from django.utils.html import strip_tags
  9. class Highlighter(object):
  10. """
  11. This class permits to highlight the searched words (self.query) in a long text and truncates it
  12. counts number of chars == max_length
  13. """
  14. css_class = 'highlighted'
  15. html_tag = 'span'
  16. max_length = 200
  17. text_block = ''
  18. def __init__(self, query, **kwargs):
  19. self.query = query
  20. if 'max_length' in kwargs:
  21. self.max_length = int(kwargs['max_length'])
  22. if 'html_tag' in kwargs:
  23. self.html_tag = kwargs['html_tag']
  24. if 'css_class' in kwargs:
  25. self.css_class = kwargs['css_class']
  26. self.query_words = set([word.lower() for word in self.query.split() if not word.startswith('-')])
  27. def highlight(self, text_block, nchars_before=0):
  28. """
  29. This method highlight text_block and prepends chars before self.query word nchars_before chars
  30. """
  31. self.text_block = strip_tags(text_block)
  32. highlight_locations = self.find_highlightable_words()
  33. start_offset, end_offset = self.find_window(highlight_locations)
  34. return self.render_html(highlight_locations, start_offset, end_offset, nchars_before)
  35. def find_highlightable_words(self):
  36. # Use a set so we only do this once per unique word.
  37. word_positions = {}
  38. # Pre-compute the length.
  39. end_offset = len(self.text_block)
  40. lower_text_block = self.text_block.lower()
  41. for word in self.query_words:
  42. if not word in word_positions:
  43. word_positions[word] = []
  44. start_offset = 0
  45. while start_offset < end_offset:
  46. next_offset = lower_text_block.find(word, start_offset, end_offset)
  47. # If we get a -1 out of find, it wasn't found. Bomb out and
  48. # start the next word.
  49. if next_offset == -1:
  50. break
  51. word_positions[word].append(next_offset)
  52. start_offset = next_offset + len(word)
  53. return word_positions
  54. def find_window(self, highlight_locations):
  55. best_start = 0
  56. best_end = self.max_length
  57. # First, make sure we have words.
  58. if not len(highlight_locations):
  59. return (best_start, best_end)
  60. words_found = []
  61. # Next, make sure we found any words at all.
  62. #for word, offset_list in highlight_locations.items():
  63. for offset_list in highlight_locations.values():
  64. if len(offset_list):
  65. # Add all of the locations to the list.
  66. words_found.extend(offset_list)
  67. if not len(words_found):
  68. return (best_start, best_end)
  69. if len(words_found) == 1:
  70. return (words_found[0], words_found[0] + self.max_length)
  71. # Sort the list so it's in ascending order.
  72. words_found = sorted(words_found)
  73. # We now have a denormalized list of all positions were a word was
  74. # found. We'll iterate through and find the densest window we can by
  75. # counting the number of found offsets (-1 to fit in the window).
  76. highest_density = 0
  77. if words_found[:-1][0] > self.max_length:
  78. best_start = words_found[:-1][0]
  79. best_end = best_start + self.max_length
  80. for count, start in enumerate(words_found[:-1]):
  81. current_density = 1
  82. for end in words_found[count + 1:]:
  83. if end - start < self.max_length:
  84. current_density += 1
  85. else:
  86. current_density = 0
  87. # Only replace if we have a bigger (not equal density) so we
  88. # give deference to windows earlier in the document.
  89. if current_density > highest_density:
  90. best_start = start
  91. best_end = start + self.max_length
  92. highest_density = current_density
  93. return (best_start, best_end)
  94. def render_html(self, highlight_locations=None, start_offset=None, end_offset=None, nchars_before=0):
  95. # Start by chopping the block down to the proper window.
  96. if start_offset > nchars_before:
  97. start_offset = start_offset - nchars_before
  98. text = self.text_block[start_offset:end_offset]
  99. # Invert highlight_locations to a location -> term list
  100. term_list = []
  101. for term, locations in highlight_locations.items():
  102. term_list += [(loc - start_offset, term) for loc in locations]
  103. loc_to_term = sorted(term_list)
  104. # Prepare the highlight template
  105. if self.css_class:
  106. hl_start = '<%s class="%s">' % (self.html_tag, self.css_class)
  107. else:
  108. hl_start = '<%s>' % (self.html_tag)
  109. hl_end = '</%s>' % self.html_tag
  110. #highlight_length = len(hl_start + hl_end)
  111. # Copy the part from the start of the string to the first match,
  112. # and there replace the match with a highlighted version.
  113. highlighted_chunk = ""
  114. matched_so_far = 0
  115. prev = 0
  116. prev_str = ""
  117. for cur, cur_str in loc_to_term:
  118. # This can be in a different case than cur_str
  119. actual_term = text[cur:cur + len(cur_str)]
  120. # Handle incorrect highlight_locations by first checking for the term
  121. if actual_term.lower() == cur_str:
  122. highlighted_chunk += text[prev + len(prev_str):cur] + hl_start + actual_term + hl_end
  123. prev = cur
  124. prev_str = cur_str
  125. # Keep track of how far we've copied so far, for the last step
  126. matched_so_far = cur + len(actual_term)
  127. # Don't forget the chunk after the last term
  128. highlighted_chunk += text[matched_so_far:]
  129. if start_offset > 0:
  130. highlighted_chunk = '...%s' % highlighted_chunk
  131. if end_offset < len(self.text_block):
  132. highlighted_chunk = '%s...' % highlighted_chunk
  133. return highlighted_chunk