statusandheaders.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. """
  2. Representation and parsing of HTTP-style status + headers
  3. """
  4. from six.moves import range
  5. from six import iteritems
  6. from warcio.utils import to_native_str, headers_to_str_headers
  7. import uuid
  8. from six.moves.urllib.parse import quote
  9. import re
  10. #=================================================================
  11. class StatusAndHeaders(object):
  12. ENCODE_HEADER_RX = re.compile(r'[=]["\']?([^;"]+)["\']?(?=[;]?)')
  13. """
  14. Representation of parsed http-style status line and headers
  15. Status Line if first line of request/response
  16. Headers is a list of (name, value) tuples
  17. An optional protocol which appears on first line may be specified
  18. If is_http_request is true, split http verb (instead of protocol) from start of statusline
  19. """
  20. def __init__(self, statusline, headers, protocol='', total_len=0, is_http_request=False):
  21. if is_http_request:
  22. protocol, statusline = statusline.split(' ', 1)
  23. self.statusline = statusline
  24. self.headers = headers_to_str_headers(headers)
  25. self.protocol = protocol
  26. self.total_len = total_len
  27. self.headers_buff = None
  28. def get_header(self, name, default_value=None):
  29. """
  30. return header (name, value)
  31. if found
  32. """
  33. name_lower = name.lower()
  34. for value in self.headers:
  35. if value[0].lower() == name_lower:
  36. return value[1]
  37. return default_value
  38. def add_header(self, name, value):
  39. self.headers.append((name, value))
  40. def replace_header(self, name, value):
  41. """
  42. replace header with new value or add new header
  43. return old header value, if any
  44. """
  45. name_lower = name.lower()
  46. for index in range(len(self.headers) - 1, -1, -1):
  47. curr_name, curr_value = self.headers[index]
  48. if curr_name.lower() == name_lower:
  49. self.headers[index] = (curr_name, value)
  50. return curr_value
  51. self.headers.append((name, value))
  52. return None
  53. def remove_header(self, name):
  54. """
  55. Remove header (case-insensitive)
  56. return True if header removed, False otherwise
  57. """
  58. name_lower = name.lower()
  59. for index in range(len(self.headers) - 1, -1, -1):
  60. if self.headers[index][0].lower() == name_lower:
  61. del self.headers[index]
  62. return True
  63. return False
  64. def get_statuscode(self):
  65. """
  66. Return the statuscode part of the status response line
  67. (Assumes no protocol in the statusline)
  68. """
  69. code = self.statusline.split(' ', 1)[0]
  70. return code
  71. def validate_statusline(self, valid_statusline):
  72. """
  73. Check that the statusline is valid, eg. starts with a numeric
  74. code. If not, replace with passed in valid_statusline
  75. """
  76. code = self.get_statuscode()
  77. try:
  78. code = int(code)
  79. assert(code > 0)
  80. return True
  81. except(ValueError, AssertionError):
  82. self.statusline = valid_statusline
  83. return False
  84. def add_range(self, start, part_len, total_len):
  85. """
  86. Add range headers indicating that this a partial response
  87. """
  88. content_range = 'bytes {0}-{1}/{2}'.format(start,
  89. start + part_len - 1,
  90. total_len)
  91. self.statusline = '206 Partial Content'
  92. self.replace_header('Content-Range', content_range)
  93. self.replace_header('Content-Length', str(part_len))
  94. self.replace_header('Accept-Ranges', 'bytes')
  95. return self
  96. def compute_headers_buffer(self, header_filter=None):
  97. """
  98. Set buffer representing headers
  99. """
  100. # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info)
  101. self.headers_buff = self.to_ascii_bytes(header_filter)
  102. def __repr__(self):
  103. return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \
  104. headers = {2})".format(self.protocol, self.statusline, self.headers)
  105. def __ne__(self, other):
  106. return not (self == other)
  107. def __eq__(self, other):
  108. if not other:
  109. return False
  110. return (self.statusline == other.statusline and
  111. self.headers == other.headers and
  112. self.protocol == other.protocol)
  113. def __str__(self, exclude_list=None):
  114. return self.to_str(exclude_list)
  115. def __bool__(self):
  116. return bool(self.statusline or self.headers)
  117. __nonzero__ = __bool__
  118. def to_str(self, filter_func=None):
  119. string = self.protocol
  120. if string and self.statusline:
  121. string += ' '
  122. if self.statusline:
  123. string += self.statusline
  124. if string:
  125. string += '\r\n'
  126. for h in self.headers:
  127. if filter_func:
  128. h = filter_func(h)
  129. if not h:
  130. continue
  131. string += ': '.join(h) + '\r\n'
  132. return string
  133. def to_bytes(self, filter_func=None, encoding='utf-8'):
  134. return self.to_str(filter_func).encode(encoding) + b'\r\n'
  135. def to_ascii_bytes(self, filter_func=None):
  136. """ Attempt to encode the headers block as ascii
  137. If encoding fails, call percent_encode_non_ascii_headers()
  138. to encode any headers per RFCs
  139. """
  140. try:
  141. string = self.to_str(filter_func)
  142. string = string.encode('ascii')
  143. except (UnicodeEncodeError, UnicodeDecodeError):
  144. self.percent_encode_non_ascii_headers()
  145. string = self.to_str(filter_func)
  146. string = string.encode('ascii')
  147. return string + b'\r\n'
  148. def percent_encode_non_ascii_headers(self, encoding='UTF-8'):
  149. """ Encode any headers that are not plain ascii
  150. as UTF-8 as per:
  151. https://tools.ietf.org/html/rfc8187#section-3.2.3
  152. https://tools.ietf.org/html/rfc5987#section-3.2.2
  153. """
  154. def do_encode(m):
  155. return "*={0}''".format(encoding) + quote(to_native_str(m.group(1)))
  156. for index in range(len(self.headers) - 1, -1, -1):
  157. curr_name, curr_value = self.headers[index]
  158. try:
  159. # test if header is ascii encodable, no action needed
  160. curr_value.encode('ascii')
  161. except:
  162. # if single value header, (eg. no ';'), %-encode entire header
  163. if ';' not in curr_value:
  164. new_value = quote(curr_value)
  165. else:
  166. # %-encode value in ; name="value"
  167. new_value = self.ENCODE_HEADER_RX.sub(do_encode, curr_value)
  168. if new_value == curr_value:
  169. new_value = quote(curr_value)
  170. self.headers[index] = (curr_name, new_value)
  171. # act like a (case-insensitive) dictionary of headers, much like other
  172. # python http headers apis including http.client.HTTPMessage
  173. # and requests.structures.CaseInsensitiveDict
  174. get = get_header
  175. __getitem__ = get_header
  176. __setitem__ = replace_header
  177. __delitem__ = remove_header
  178. def __contains__(self, key):
  179. return bool(self[key])
  180. #=================================================================
  181. def _strip_count(string, total_read):
  182. length = len(string)
  183. return string.rstrip(), total_read + length
  184. #=================================================================
  185. class StatusAndHeadersParser(object):
  186. """
  187. Parser which consumes a stream support readline() to read
  188. status and headers and return a StatusAndHeaders object
  189. """
  190. def __init__(self, statuslist, verify=True):
  191. self.statuslist = statuslist
  192. self.verify = verify
  193. def parse(self, stream, full_statusline=None):
  194. """
  195. parse stream for status line and headers
  196. return a StatusAndHeaders object
  197. support continuation headers starting with space or tab
  198. """
  199. # status line w newlines intact
  200. if full_statusline is None:
  201. full_statusline = stream.readline()
  202. full_statusline = self.decode_header(full_statusline)
  203. statusline, total_read = _strip_count(full_statusline, 0)
  204. headers = []
  205. # at end of stream
  206. if total_read == 0:
  207. raise EOFError()
  208. elif not statusline:
  209. return StatusAndHeaders(statusline=statusline,
  210. headers=headers,
  211. protocol='',
  212. total_len=total_read)
  213. # validate only if verify is set
  214. if self.verify:
  215. protocol_status = self.split_prefix(statusline, self.statuslist)
  216. if not protocol_status:
  217. msg = 'Expected Status Line starting with {0} - Found: {1}'
  218. msg = msg.format(self.statuslist, statusline)
  219. raise StatusAndHeadersParserException(msg, full_statusline)
  220. else:
  221. protocol_status = statusline.split(' ', 1)
  222. line, total_read = _strip_count(self.decode_header(stream.readline()), total_read)
  223. while line:
  224. result = line.split(':', 1)
  225. if len(result) == 2:
  226. name = result[0].rstrip(' \t')
  227. value = result[1].lstrip()
  228. else:
  229. name = result[0]
  230. value = None
  231. next_line, total_read = _strip_count(self.decode_header(stream.readline()),
  232. total_read)
  233. # append continuation lines, if any
  234. while next_line and next_line.startswith((' ', '\t')):
  235. if value is not None:
  236. value += next_line
  237. next_line, total_read = _strip_count(self.decode_header(stream.readline()),
  238. total_read)
  239. if value is not None:
  240. header = (name, value)
  241. headers.append(header)
  242. line = next_line
  243. if len(protocol_status) > 1:
  244. statusline = protocol_status[1].strip()
  245. else:
  246. statusline = ''
  247. return StatusAndHeaders(statusline=statusline,
  248. headers=headers,
  249. protocol=protocol_status[0],
  250. total_len=total_read)
  251. @staticmethod
  252. def split_prefix(key, prefixs):
  253. """
  254. split key string into prefix and remainder
  255. for first matching prefix from a list
  256. """
  257. key_upper = key.upper()
  258. for prefix in prefixs:
  259. if key_upper.startswith(prefix):
  260. plen = len(prefix)
  261. return (key_upper[:plen], key[plen:])
  262. @staticmethod
  263. def make_warc_id(id_=None):
  264. if not id_:
  265. id_ = uuid.uuid4()
  266. return '<urn:uuid:{0}>'.format(id_)
  267. @staticmethod
  268. def decode_header(line):
  269. try:
  270. # attempt to decode as utf-8 first
  271. return to_native_str(line, 'utf-8')
  272. except:
  273. # if fails, default to ISO-8859-1
  274. return to_native_str(line, 'iso-8859-1')
  275. #=================================================================
  276. class StatusAndHeadersParserException(Exception):
  277. """
  278. status + headers parsing exception
  279. """
  280. def __init__(self, msg, statusline):
  281. super(StatusAndHeadersParserException, self).__init__(msg)
  282. self.statusline = statusline