utils.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. import io
  2. from urllib.parse import urlparse
  3. from scrapy.statscollectors import MemoryStatsCollector
  4. from tqdm import tqdm
  5. from warcio.archiveiterator import ArchiveIterator
  6. from warcio.statusandheaders import StatusAndHeaders
  7. # Status/messages taken from <https://en.wikipedia.org/wiki/List_of_HTTP_status_codes>
  8. HTTP_STATUS_CODES = {
  9. 100: "Continue",
  10. 101: "Switching Protocols",
  11. 102: "Processing",
  12. 103: "Early Hints",
  13. 200: "OK",
  14. 201: "Created",
  15. 202: "Accepted",
  16. 203: "Non-Authoritative Information",
  17. 204: "No Content",
  18. 205: "Reset Content",
  19. 206: "Partial Content",
  20. 207: "Multi-Status",
  21. 208: "Already Reported",
  22. 218: "This is fine", # Unofficial/Apache Web Server
  23. 226: "IM Used",
  24. 300: "Multiple Choices",
  25. 301: "Moved Permanently",
  26. 302: "Found",
  27. 303: "See Other",
  28. 304: "Not Modified",
  29. 305: "Use Proxy",
  30. 306: "Switch Proxy",
  31. 307: "Temporary Redirect",
  32. 308: "Permanent Redirect",
  33. 400: "Bad Request",
  34. 401: "Unauthorized",
  35. 402: "Payment Required",
  36. 403: "Forbidden",
  37. 404: "Not Found",
  38. 405: "Method Not Allowed",
  39. 406: "Not Acceptable",
  40. 407: "Proxy Authentication Required",
  41. 408: "Request Timeout",
  42. 409: "Conflict",
  43. 410: "Gone",
  44. 411: "Length Required",
  45. 412: "Precondition Failed",
  46. 413: "Payload Too Large",
  47. 414: "URI Too Long",
  48. 415: "Unsupported Media Type",
  49. 416: "Range Not Satisfiable",
  50. 417: "Expectation Failed",
  51. 418: "I'm a teapot",
  52. 419: "Page Expired", # Unofficial/Laravel Framework
  53. 420: "Enhance Your Calm", # Unofficial/Twitter
  54. 421: "Misdirected Request",
  55. 422: "Unprocessable Entity",
  56. 423: "Locked",
  57. 424: "Failed Dependency",
  58. 425: "Too Early",
  59. 426: "Upgrade Required",
  60. 428: "Precondition Required",
  61. 429: "Too Many Requests",
  62. 430: "Request Header Fields Too Large", # Unofficial/Shopify
  63. 431: "Request Header Fields Too Large",
  64. 440: "Login Time-out", # Unofficial/Internet Information Services
  65. 444: "No Response", # Unofficial/nginx
  66. 449: "Retry With", # Unofficial/Internet Information Services
  67. 450: "Blocked by Windows Parental Controls", # Unofficial/Microsoft
  68. 451: "Redirect", # Unofficial/Internet Information Services
  69. 451: "Unavailable For Legal Reasons",
  70. 460: "Client closed the connection", # Unofficial/AWS Elastic Load Balancer
  71. 463: "Too many forward IPs", # Unofficial/AWS Elastic Load Balancer
  72. 494: "Request header too large", # Unofficial/nginx
  73. 495: "SSL Certificate Error", # Unofficial/nginx
  74. 496: "SSL Certificate Required", # Unofficial/nginx
  75. 497: "HTTP Request Sent to HTTPS Port", # Unofficial/nginx
  76. 498: "Invalid Token", # Unofficial/Esri
  77. 499: "Client Closed Request", # Unofficial/nginx
  78. 499: "Token Required", # Unofficial/Esri
  79. 500: "Internal Server Error",
  80. 501: "Not Implemented",
  81. 502: "Bad Gateway",
  82. 503: "Service Unavailable",
  83. 504: "Gateway Timeout",
  84. 505: "HTTP Version Not Supported",
  85. 506: "Variant Also Negotiates",
  86. 507: "Insufficient Storage",
  87. 508: "Loop Detected",
  88. 509: "Bandwidth Limit Exceeded", # Unofficial/Apache Web Server/cPanel
  89. 510: "Not Extended",
  90. 511: "Network Authentication Required",
  91. 520: "Web Server Returned an Unknown Error", # Unofficial/Cloudflare
  92. 521: "Web Server Is Down", # Unofficial/Cloudflare
  93. 522: "Connection Timed Out", # Unofficial/Cloudflare
  94. 523: "Origin Is Unreachable", # Unofficial/Cloudflare
  95. 524: "A Timeout Occurred", # Unofficial/Cloudflare
  96. 525: "SSL Handshake Failed", # Unofficial/Cloudflare
  97. 526: "Invalid SSL Certificate", # Unofficial/Cloudflare
  98. 526: "Invalid SSL Certificate", # Unofficial/Cloudflare/Cloud Foundry
  99. 527: "Railgun Error", # Unofficial/Cloudflare
  100. 530: "Cloudflare Error", # Unofficial/Cloudflare
  101. 530: "Site is frozen", # Unofficial
  102. 598: "Network read timeout error", # Unofficial/Informal convention
  103. }
  104. def get_urls_from_file(filename, encoding="utf-8"):
  105. with open(filename, encoding=encoding) as fobj:
  106. for line in fobj:
  107. yield line.strip()
  108. class WarcReader:
  109. def __init__(self, filename):
  110. self.filename = filename
  111. self.__fobj = None
  112. def __iter__(self):
  113. self.__fobj = open(self.filename, mode="rb")
  114. self.__iterator = ArchiveIterator(self.__fobj)
  115. return self
  116. def __next__(self):
  117. try:
  118. item = next(self.__iterator)
  119. except StopIteration:
  120. self.__fobj.close()
  121. self.__iterator = self.__fobj = None
  122. raise
  123. else:
  124. return item
  125. def get_response(self, uri):
  126. for record in self:
  127. if (
  128. record.rec_type == "response"
  129. and record.rec_headers.get_header("WARC-Target-URI") == uri
  130. ):
  131. return record
  132. def __del__(self):
  133. if self.__fobj is not None:
  134. self.__fobj.close()
  135. class StdoutStatsCollector(MemoryStatsCollector):
  136. def __init__(self, *args, **kwargs):
  137. super().__init__(*args, **kwargs)
  138. self.progress_bar = tqdm(
  139. desc="Downloading", unit="req", unit_scale=True, dynamic_ncols=True
  140. )
  141. def inc_value(self, key, count=1, start=0, spider=None):
  142. super().inc_value(key, count=count, start=start, spider=spider)
  143. if key == "response_received_count":
  144. self.progress_bar.n = self._stats["response_received_count"]
  145. self.progress_bar.refresh()
  146. def get_headers_list(headers):
  147. # TODO: fix if list has more than one value
  148. # TODO: decode properly
  149. return [
  150. (key.decode("ascii"), value[0].decode("ascii"))
  151. for key, value in headers.items()
  152. ]
  153. def write_warc_request_response(writer, response):
  154. request = response.request
  155. path = request.url[request.url.find("/", len(urlparse(request.url).scheme) + 3) :]
  156. http_headers = StatusAndHeaders(
  157. # TODO: fix HTTP version
  158. f"{request.method} {path} HTTP/1.1",
  159. get_headers_list(request.headers),
  160. is_http_request=True,
  161. )
  162. writer.write_record(
  163. writer.create_warc_record(request.url, "request", http_headers=http_headers)
  164. )
  165. # XXX: we're currently guessing the status "title" by its code, but this
  166. # title may not be the original from HTTP server.
  167. status_title = HTTP_STATUS_CODES.get(response.status, "Unknown")
  168. http_headers = StatusAndHeaders(
  169. f"{response.status} {status_title}",
  170. get_headers_list(response.headers),
  171. protocol=response.protocol,
  172. is_http_request=False,
  173. )
  174. # TODO: what about redirects?
  175. writer.write_record(
  176. writer.create_warc_record(
  177. response.url,
  178. "response",
  179. payload=io.BytesIO(response.body),
  180. http_headers=http_headers,
  181. )
  182. )
  183. def resource_matches_base_url(absolute_url, allowed):
  184. clean_allowed = []
  185. for allow in allowed:
  186. if not allow.startswith("http"):
  187. allow = f"http://{allow}"
  188. clean_allowed.append(urlparse(allow.replace("www.", "")))
  189. parsed_url = urlparse(absolute_url.replace("www.", ""))
  190. return (
  191. any(
  192. a.netloc == parsed_url.netloc and parsed_url.path.startswith(a.path)
  193. for a in clean_allowed
  194. )
  195. or not allowed
  196. )