123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
- import io
- from urllib.parse import urlparse
- from scrapy.statscollectors import MemoryStatsCollector
- from tqdm import tqdm
- from warcio.archiveiterator import ArchiveIterator
- from warcio.statusandheaders import StatusAndHeaders
# Status/messages taken from <https://en.wikipedia.org/wiki/List_of_HTTP_status_codes>
#
# A few unofficial codes are claimed by more than one vendor. A dict literal
# silently keeps only the last duplicate key, so each code is listed exactly
# once here (with the value that was already effective at runtime) and the
# competing meaning is recorded in a comment instead of a dead entry.
HTTP_STATUS_CODES = {
    # 1xx
    100: "Continue",
    101: "Switching Protocols",
    102: "Processing",
    103: "Early Hints",
    # 2xx
    200: "OK",
    201: "Created",
    202: "Accepted",
    203: "Non-Authoritative Information",
    204: "No Content",
    205: "Reset Content",
    206: "Partial Content",
    207: "Multi-Status",
    208: "Already Reported",
    218: "This is fine",  # Unofficial/Apache Web Server
    226: "IM Used",
    # 3xx
    300: "Multiple Choices",
    301: "Moved Permanently",
    302: "Found",
    303: "See Other",
    304: "Not Modified",
    305: "Use Proxy",
    306: "Switch Proxy",
    307: "Temporary Redirect",
    308: "Permanent Redirect",
    # 4xx
    400: "Bad Request",
    401: "Unauthorized",
    402: "Payment Required",
    403: "Forbidden",
    404: "Not Found",
    405: "Method Not Allowed",
    406: "Not Acceptable",
    407: "Proxy Authentication Required",
    408: "Request Timeout",
    409: "Conflict",
    410: "Gone",
    411: "Length Required",
    412: "Precondition Failed",
    413: "Payload Too Large",
    414: "URI Too Long",
    415: "Unsupported Media Type",
    416: "Range Not Satisfiable",
    417: "Expectation Failed",
    418: "I'm a teapot",
    419: "Page Expired",  # Unofficial/Laravel Framework
    420: "Enhance Your Calm",  # Unofficial/Twitter
    421: "Misdirected Request",
    422: "Unprocessable Entity",
    423: "Locked",
    424: "Failed Dependency",
    425: "Too Early",
    426: "Upgrade Required",
    428: "Precondition Required",
    429: "Too Many Requests",
    430: "Request Header Fields Too Large",  # Unofficial/Shopify
    431: "Request Header Fields Too Large",
    440: "Login Time-out",  # Unofficial/Internet Information Services
    444: "No Response",  # Unofficial/nginx
    449: "Retry With",  # Unofficial/Internet Information Services
    450: "Blocked by Windows Parental Controls",  # Unofficial/Microsoft
    # 451 is also used unofficially by Internet Information Services
    # for "Redirect"; the official meaning is kept.
    451: "Unavailable For Legal Reasons",
    460: "Client closed the connection",  # Unofficial/AWS Elastic Load Balancer
    463: "Too many forward IPs",  # Unofficial/AWS Elastic Load Balancer
    494: "Request header too large",  # Unofficial/nginx
    495: "SSL Certificate Error",  # Unofficial/nginx
    496: "SSL Certificate Required",  # Unofficial/nginx
    497: "HTTP Request Sent to HTTPS Port",  # Unofficial/nginx
    498: "Invalid Token",  # Unofficial/Esri
    # 499 is also used unofficially by nginx for "Client Closed Request".
    499: "Token Required",  # Unofficial/Esri
    # 5xx
    500: "Internal Server Error",
    501: "Not Implemented",
    502: "Bad Gateway",
    503: "Service Unavailable",
    504: "Gateway Timeout",
    505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates",
    507: "Insufficient Storage",
    508: "Loop Detected",
    509: "Bandwidth Limit Exceeded",  # Unofficial/Apache Web Server/cPanel
    510: "Not Extended",
    511: "Network Authentication Required",
    520: "Web Server Returned an Unknown Error",  # Unofficial/Cloudflare
    521: "Web Server Is Down",  # Unofficial/Cloudflare
    522: "Connection Timed Out",  # Unofficial/Cloudflare
    523: "Origin Is Unreachable",  # Unofficial/Cloudflare
    524: "A Timeout Occurred",  # Unofficial/Cloudflare
    525: "SSL Handshake Failed",  # Unofficial/Cloudflare
    526: "Invalid SSL Certificate",  # Unofficial/Cloudflare/Cloud Foundry
    527: "Railgun Error",  # Unofficial/Cloudflare
    # 530 is also used unofficially by Cloudflare for "Cloudflare Error".
    530: "Site is frozen",  # Unofficial
    598: "Network read timeout error",  # Unofficial/Informal convention
}
def get_urls_from_file(filename, encoding="utf-8"):
    """Lazily yield every line of *filename* with surrounding whitespace removed.

    The file is opened as text using *encoding* and stays open only while the
    generator is being consumed. Blank lines are not filtered out: they are
    yielded as empty strings.
    """
    with open(filename, encoding=encoding) as url_file:
        yield from map(str.strip, url_file)
class WarcReader:
    """Iterate over the records of a WARC file using ``ArchiveIterator``.

    The file is opened when iteration starts (``iter(reader)``) and closed
    when the records are exhausted, when :meth:`close` is called, or when the
    reader is garbage collected. The reader is its own iterator, so starting
    a new iteration restarts from the beginning of the file.
    """

    def __init__(self, filename):
        self.filename = filename
        self.__fobj = None
        # Initialised here so that `next()` before `iter()` (or after
        # exhaustion) finds a well-defined state instead of a missing
        # attribute.
        self.__iterator = None

    def close(self):
        """Close the underlying file, if open. Safe to call more than once."""
        fobj, self.__fobj, self.__iterator = self.__fobj, None, None
        if fobj is not None:
            fobj.close()

    def __iter__(self):
        # Release a handle left over from a previous, unfinished iteration
        # before opening the file again; otherwise it would leak.
        self.close()
        self.__fobj = open(self.filename, mode="rb")
        self.__iterator = ArchiveIterator(self.__fobj)
        return self

    def __next__(self):
        # Iterator protocol: once there is nothing to read, keep raising
        # StopIteration rather than failing on the cleared iterator.
        if self.__iterator is None:
            raise StopIteration
        try:
            return next(self.__iterator)
        except StopIteration:
            self.close()
            raise

    def get_response(self, uri):
        """Return the first ``response`` record whose target URI equals *uri*.

        Scans the file from the start and returns ``None`` when no record
        matches. On a match the file is deliberately left open.
        NOTE(review): presumably the returned record reads its payload from
        the still-open stream - confirm before closing here.
        """
        for record in self:
            if (
                record.rec_type == "response"
                and record.rec_headers.get_header("WARC-Target-URI") == uri
            ):
                return record
        return None

    def __del__(self):
        self.close()
class StdoutStatsCollector(MemoryStatsCollector):
    """Stats collector that mirrors the received-response count on a tqdm bar."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        bar_options = {
            "desc": "Downloading",
            "unit": "req",
            "unit_scale": True,
            "dynamic_ncols": True,
        }
        self.progress_bar = tqdm(**bar_options)

    def inc_value(self, key, count=1, start=0, spider=None):
        """Increment stat *key* and redraw the bar when it is the response count."""
        super().inc_value(key, count=count, start=start, spider=spider)
        if key != "response_received_count":
            return
        # Set the absolute position from the stored stat rather than adding
        # `count`, so the bar always shows the collector's own value.
        bar = self.progress_bar
        bar.n = self._stats[key]
        bar.refresh()
def _decode_header_bytes(raw):
    """Decode header bytes as UTF-8, falling back to ISO-8859-1."""
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # ISO-8859-1 maps every byte to a code point, so this cannot fail.
        return raw.decode("iso-8859-1")


def get_headers_list(headers):
    """Flatten a bytes header mapping into a list of ``(name, value)`` strings.

    *headers* maps a bytes header name to a sequence of bytes values. Every
    value is emitted as its own pair, so repeated headers (for example
    several ``Set-Cookie`` lines) are all preserved, in order. A header whose
    value sequence is empty contributes no pairs.

    Names and values are decoded as UTF-8 with an ISO-8859-1 fallback, so
    non-ASCII bytes no longer raise ``UnicodeDecodeError``; pure-ASCII input
    decodes exactly as before.
    """
    return [
        (_decode_header_bytes(key), _decode_header_bytes(value))
        for key, values in headers.items()
        for value in values
    ]
def _request_target(url):
    """Return the request-line target (path, params and query) for *url*."""
    parsed = urlparse(url)
    # A URL without a path (e.g. "http://example.com") is requested as "/".
    target = parsed.path or "/"
    if parsed.params:
        target = f"{target};{parsed.params}"
    if parsed.query:
        target = f"{target}?{parsed.query}"
    # The fragment is intentionally dropped: it is never part of the
    # request line sent to the server.
    return target


def write_warc_request_response(writer, response):
    """Write a ``request`` record and a ``response`` record for *response*.

    *writer* must provide ``create_warc_record`` and ``write_record``;
    *response* must expose ``request``, ``status``, ``headers``, ``protocol``,
    ``url`` and ``body``. The request record is rebuilt from
    ``response.request`` (method, URL, headers); the response record carries
    the response body as its payload.
    """
    request = response.request
    # Derive the target from the parsed URL. Searching for the first "/"
    # after the scheme breaks on URLs with no path, where `find` returns -1
    # and the slice yields the URL's last character.
    path = _request_target(request.url)
    http_headers = StatusAndHeaders(
        # TODO: fix HTTP version
        f"{request.method} {path} HTTP/1.1",
        get_headers_list(request.headers),
        is_http_request=True,
    )
    writer.write_record(
        writer.create_warc_record(request.url, "request", http_headers=http_headers)
    )

    # XXX: we're currently guessing the status "title" by its code, but this
    # title may not be the original from HTTP server.
    status_title = HTTP_STATUS_CODES.get(response.status, "Unknown")
    http_headers = StatusAndHeaders(
        f"{response.status} {status_title}",
        get_headers_list(response.headers),
        protocol=response.protocol,
        is_http_request=False,
    )
    # TODO: what about redirects?
    writer.write_record(
        writer.create_warc_record(
            response.url,
            "response",
            payload=io.BytesIO(response.body),
            http_headers=http_headers,
        )
    )
def _strip_www(netloc):
    """Return *netloc* without a leading ``www.`` label."""
    return netloc[4:] if netloc.startswith("www.") else netloc


def resource_matches_base_url(absolute_url, allowed):
    """Tell whether *absolute_url* falls under one of the *allowed* base URLs.

    Each entry of *allowed* is a base URL, with or without an ``http://`` or
    ``https://`` scheme (``http://`` is assumed when missing). A URL matches
    an entry when the hosts are equal, ignoring a leading ``www.`` on either
    side, and the URL's path starts with the entry's path. The scheme itself
    is not compared.

    Returns ``True`` when *allowed* is empty, i.e. nothing is restricted.
    """
    if not allowed:
        return True

    target = urlparse(absolute_url)
    target_host = _strip_www(target.netloc)
    for allow in allowed:
        # Test for a real scheme prefix: a bare `startswith("http")` also
        # catches hosts such as "httpbin.org", which then parse with an
        # empty netloc and can never match.
        if not allow.startswith(("http://", "https://")):
            allow = f"http://{allow}"
        base = urlparse(allow)
        # Only a leading "www." on the host is ignored; stripping the
        # substring anywhere would equate unrelated hosts and alter paths.
        if _strip_www(base.netloc) == target_host and target.path.startswith(
            base.path
        ):
            return True
    return False
|