html.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. # Copyright (c) 2017 crocoite contributors
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. # THE SOFTWARE.
  20. """
  21. HTML helper
  22. """
  23. from html5lib.treewalkers.base import TreeWalker
  24. from html5lib.filters.base import Filter
  25. from html5lib import constants
  26. # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
  27. voidTags = {'area',
  28. 'base',
  29. 'br',
  30. 'col',
  31. 'embed',
  32. 'hr',
  33. 'img',
  34. 'input',
  35. 'link',
  36. 'meta',
  37. 'param',
  38. 'source',
  39. 'track',
  40. 'wbr'}
  41. # source: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes
  42. eventAttributes = {'onabort',
  43. 'onautocomplete',
  44. 'onautocompleteerror',
  45. 'onblur',
  46. 'oncancel',
  47. 'oncanplay',
  48. 'oncanplaythrough',
  49. 'onchange',
  50. 'onclick',
  51. 'onclose',
  52. 'oncontextmenu',
  53. 'oncuechange',
  54. 'ondblclick',
  55. 'ondrag',
  56. 'ondragend',
  57. 'ondragenter',
  58. 'ondragexit',
  59. 'ondragleave',
  60. 'ondragover',
  61. 'ondragstart',
  62. 'ondrop',
  63. 'ondurationchange',
  64. 'onemptied',
  65. 'onended',
  66. 'onerror',
  67. 'onfocus',
  68. 'oninput',
  69. 'oninvalid',
  70. 'onkeydown',
  71. 'onkeypress',
  72. 'onkeyup',
  73. 'onload',
  74. 'onloadeddata',
  75. 'onloadedmetadata',
  76. 'onloadstart',
  77. 'onmousedown',
  78. 'onmouseenter',
  79. 'onmouseleave',
  80. 'onmousemove',
  81. 'onmouseout',
  82. 'onmouseover',
  83. 'onmouseup',
  84. 'onmousewheel',
  85. 'onpause',
  86. 'onplay',
  87. 'onplaying',
  88. 'onprogress',
  89. 'onratechange',
  90. 'onreset',
  91. 'onresize',
  92. 'onscroll',
  93. 'onseeked',
  94. 'onseeking',
  95. 'onselect',
  96. 'onshow',
  97. 'onsort',
  98. 'onstalled',
  99. 'onsubmit',
  100. 'onsuspend',
  101. 'ontimeupdate',
  102. 'ontoggle',
  103. 'onvolumechange',
  104. 'onwaiting'}
  105. default_namespace = constants.namespaces["html"]
  106. class ChromeTreeWalker (TreeWalker):
  107. """
  108. Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
  109. """
  110. def recurse (self, node):
  111. name = node['nodeName']
  112. if name.startswith ('#'):
  113. if name == '#text':
  114. yield from self.text (node['nodeValue'])
  115. elif name == '#comment':
  116. yield self.comment (node['nodeValue'])
  117. elif name == '#document':
  118. for child in node.get ('children', []):
  119. yield from self.recurse (child)
  120. elif name == '#cdata-section':
  121. # html5lib cannot generate cdata, so we’re faking it by using
  122. # an empty tag
  123. yield from self.emptyTag (default_namespace,
  124. '![CDATA[' + node['nodeValue'] + ']]', {})
  125. else:
  126. assert False, (name, node)
  127. else:
  128. attributes = node.get ('attributes', [])
  129. convertedAttr = {}
  130. for i in range (0, len (attributes), 2):
  131. convertedAttr[(default_namespace, attributes[i])] = attributes[i+1]
  132. children = node.get ('children', [])
  133. if name.lower() in voidTags and not children:
  134. yield from self.emptyTag (default_namespace, name, convertedAttr)
  135. else:
  136. yield self.startTag (default_namespace, name, convertedAttr)
  137. for child in node.get ('children', []):
  138. yield from self.recurse (child)
  139. yield self.endTag ('', name)
  140. def __iter__ (self):
  141. assert self.tree['nodeName'] == '#document'
  142. return self.recurse (self.tree)
  143. def split (self):
  144. """
  145. Split response returned by DOM.getDocument(pierce=True) into independent documents
  146. """
  147. def recurse (node):
  148. contentDocument = node.get ('contentDocument')
  149. if contentDocument:
  150. assert contentDocument['nodeName'] == '#document'
  151. yield contentDocument
  152. yield from recurse (contentDocument)
  153. for child in node.get ('children', []):
  154. yield from recurse (child)
  155. if self.tree['nodeName'] == '#document':
  156. yield self.tree
  157. yield from recurse (self.tree)
  158. class StripTagFilter (Filter):
  159. """
  160. Remove arbitrary tags
  161. """
  162. def __init__ (self, source, tags):
  163. Filter.__init__ (self, source)
  164. self.tags = set (map (str.lower, tags))
  165. def __iter__(self):
  166. delete = 0
  167. for token in Filter.__iter__(self):
  168. tokenType = token['type']
  169. if tokenType in {'StartTag', 'EmptyTag'}:
  170. if delete > 0 or token['name'].lower () in self.tags:
  171. delete += 1
  172. if delete == 0:
  173. yield token
  174. if tokenType == 'EndTag' and delete > 0:
  175. delete -= 1
  176. class StripAttributeFilter (Filter):
  177. """
  178. Remove arbitrary HTML attributes
  179. """
  180. def __init__ (self, source, attributes):
  181. Filter.__init__ (self, source)
  182. self.attributes = set (map (str.lower, attributes))
  183. def __iter__(self):
  184. for token in Filter.__iter__(self):
  185. data = token.get ('data')
  186. if data and token['type'] in {'StartTag', 'EmptyTag'}:
  187. newdata = {}
  188. for (namespace, k), v in data.items ():
  189. if k.lower () not in self.attributes:
  190. newdata[(namespace, k)] = v
  191. token['data'] = newdata
  192. yield token