archiver_2.py 7.5 KB

from functools import partial
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import argparse
import logging
import multiprocessing as mp
import re
import requests
import time
import xml.etree.ElementTree as ET

# Library version
__version__ = "1.9.1"

# String used to prefix local sitemaps
LOCAL_PREFIX = "file://"


def format_archive_url(url):
    """Given a URL, constructs an Archive URL to submit the archive request."""
    logging.debug("Creating archive URL for %s", url)
    SAVE_URL = "https://web.archive.org/save/"
    request_url = SAVE_URL + url

    return request_url


def call_archiver(request_url, rate_limit_wait, session):
    """Submit a url to the Internet Archive to archive."""
    if rate_limit_wait > 0:
        logging.debug("Sleeping for %s", rate_limit_wait)
        time.sleep(rate_limit_wait)

    logging.info("Calling archive url %s", request_url)
    r = session.head(request_url, allow_redirects=True)
    try:
        # Raise `requests.exceptions.HTTPError` if 4XX or 5XX status
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logging.exception(e)
        raise


def get_namespace(element):
    """Extract the namespace using a regular expression."""
    match = re.match(r"\{.*\}", element.tag)
    return match.group(0) if match else ""


def download_remote_sitemap(sitemap_url, session):
    """Download the sitemap of the target website."""
    logging.debug("Downloading: %s", sitemap_url)
    r = session.get(sitemap_url)
    try:
        # Raise `requests.exceptions.HTTPError` if 4XX or 5XX status
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logging.exception(e)
        raise
    else:
        return r.text.encode("utf-8")


def load_local_sitemap(sitemap_filepath):
    """Load a local sitemap and return it as a string."""
    logging.debug("Loading local sitemap: %s", sitemap_filepath)

    if sitemap_filepath.startswith(LOCAL_PREFIX):
        sitemap_filepath = sitemap_filepath[len(LOCAL_PREFIX):]

    # Try to open the file, error on failure
    try:
        logging.debug("Opening local file '%s'", sitemap_filepath)
        with open(sitemap_filepath, "r") as fp:
            contents = fp.read()
    except IOError as e:
        logging.exception(e)
        raise

    return contents


def sitemap_is_local(sitemap_url):
    """Returns True if we believe a URI to be local, False otherwise."""
    return sitemap_url.startswith(LOCAL_PREFIX) or sitemap_url.startswith("/")


def extract_pages_from_sitemap(site_map_text):
    """Extract the various pages from the sitemap text."""
    root = ET.fromstring(site_map_text)

    # Sitemaps use a namespace in the XML, which we need to read
    namespace = get_namespace(root)

    urls = []
    for loc_node in root.findall(".//{}loc".format(namespace)):
        urls.append(loc_node.text)

    return set(urls)


def main():
    # Command line parsing
    parser = argparse.ArgumentParser(
        prog="archiver",
        description="A script to back up web pages with the Internet Archive",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s {version}".format(version=__version__),
    )
    parser.add_argument(
        "urls",
        nargs="*",
        default=[],
        help="the URLs of the pages to archive",
    )
    parser.add_argument(
        "--file",
        help="path to a file containing urls to save (one url per line)",
        required=False,
    )
    parser.add_argument(
        "--sitemaps",
        nargs="+",
        default=[],
        help="one or more URIs to sitemaps listing pages to archive; local paths must be prefixed with '{f}'".format(f=LOCAL_PREFIX),
        required=False,
    )
    parser.add_argument(
        "--log",
        help="set the logging level, defaults to WARNING",
        dest="log_level",
        default=logging.WARNING,
        choices=[
            "DEBUG",
            "INFO",
            "WARNING",
            "ERROR",
            "CRITICAL",
        ],
    )
    parser.add_argument(
        "--log-to-file",
        help="redirect logs to a file",
        dest="log_file",
        default=None,
    )
    parser.add_argument(
        "--archive-sitemap-also",
        help="also submit the URL of the sitemap to be archived",
        dest="archive_sitemap",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--jobs",
        "-j",
        help="run this many concurrent URL submissions, defaults to 1",
        default=1,
        type=int,
    )
    parser.add_argument(
        "--rate-limit-wait",
        help="number of seconds to wait between page requests to avoid flooding the archive site, defaults to 5; also used as the backoff factor for retries",
        dest="rate_limit_in_sec",
        default=5,
        type=int,
    )
    args = parser.parse_args()

    # Set the logging level based on the arguments
    #
    # If `filename` is None, the constructor will set up a stream, otherwise it
    # will use the file specified.
    logging.basicConfig(level=args.log_level, filename=args.log_file)

    logging.debug("Archiver Version: %s", __version__)
    logging.debug("Arguments: %s", args)

    archive_urls = []

    # Add the regular pages
    if args.urls:
        logging.info("Adding page URLs to archive")
        logging.debug("Page URLs to archive: %s", args.urls)
        archive_urls += map(format_archive_url, args.urls)

    # Set up retry and backoff
    session = requests.Session()
    session.max_redirects = 100  # Raise the redirect limit from the default of 30
    retries = Retry(
        total=5,
        backoff_factor=args.rate_limit_in_sec,
        status_forcelist=[500, 502, 503, 504, 520],
    )
    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.mount("http://", HTTPAdapter(max_retries=retries))

    # Download and process the sitemaps
    remote_sitemaps = set()
    logging.info("Parsing sitemaps")
    for sitemap_url in args.sitemaps:
        # Save the remote ones, in case the user wants us to back them up
        if sitemap_is_local(sitemap_url):
            logging.debug("The sitemap '%s' is local.", sitemap_url)
            sitemap_xml = load_local_sitemap(sitemap_url)
        else:
            logging.debug("The sitemap '%s' is remote.", sitemap_url)
            if args.archive_sitemap:
                remote_sitemaps.add(sitemap_url)
            sitemap_xml = download_remote_sitemap(sitemap_url, session=session)

        for url in extract_pages_from_sitemap(sitemap_xml):
            archive_urls.append(format_archive_url(url))

    # Archive the sitemap as well, if requested
    if args.archive_sitemap:
        logging.info("Archiving sitemaps")
        if remote_sitemaps:
            archive_urls += map(format_archive_url, remote_sitemaps)
        else:
            logging.debug("No remote sitemaps to back up.")

    # And URLs from file
    if args.file:
        logging.info("Reading urls from file: %s", args.file)
        with open(args.file) as file:
            urls_from_file = (u.strip() for u in file.readlines() if u.strip())
            archive_urls += map(format_archive_url, urls_from_file)

    # Deduplicate URLs
    archive_urls = set(archive_urls)

    # Archive the URLs
    logging.debug("Archive URLs: %s", archive_urls)
    pool = mp.Pool(processes=args.jobs)
    partial_call = partial(
        call_archiver, rate_limit_wait=args.rate_limit_in_sec, session=session
    )
    pool.map(partial_call, archive_urls)
    pool.close()
    pool.join()


if __name__ == "__main__":
    main()
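
# ---------------------------------------------------------------------------
# Example invocations (illustrative only; built from the flags defined above.
# The URLs and file paths are placeholders, not values from this project):
#
#   python archiver_2.py https://example.com/page.html
#   python archiver_2.py --sitemaps https://example.com/sitemap.xml --jobs 4
#   python archiver_2.py --sitemaps file:///var/www/sitemap.xml --archive-sitemap-also
#   python archiver_2.py --file urls.txt --rate-limit-wait 10 --log INFO
# ---------------------------------------------------------------------------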