wayback_archiver.py

import time, requests
from loguru import logger
from bs4 import BeautifulSoup

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig


class WaybackArchiver(Archiver):
    """
    This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}",
    but that might not be desirable since the webpage might have been archived a long time ago and thus have changed.
    """
  12. name = "wayback"
  13. def __init__(self, storage: Storage, driver, config: WaybackConfig):
  14. super(WaybackArchiver, self).__init__(storage, driver)
  15. self.config = config
  16. self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        if self.config is None:
            logger.error('Missing Wayback config')
            return False
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        # screenshot the live page up front so failure results still include one
        screenshot = self.get_screenshot(url)
  24. logger.debug(f"POSTing {url=} to web.archive.org")
  25. ia_headers = {
  26. "Accept": "application/json",
  27. "Authorization": f"LOW {self.config.key}:{self.config.secret}"
  28. }
  29. r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
  30. if r.status_code != 200:
  31. logger.warning(f"Internet archive failed with status of {r.status_code}")
  32. return ArchiveResult(status="Internet archive failed", screenshot=screenshot)
  33. if 'job_id' not in r.json() and 'message' in r.json():
  34. return self.custom_retry(r.json(), screenshot=screenshot)
  35. job_id = r.json()['job_id']
  36. logger.debug(f"GETting status for {job_id=} on {url=}")
  37. status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
  38. retries = 0
  39. # TODO: make the job queue parallel -> consider propagation of results back to sheet though
  40. # wait 90-120 seconds for the archive job to finish
  41. while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
  42. time.sleep(3)
  43. try:
  44. logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
  45. status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
  46. except:
  47. time.sleep(1)
  48. retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)

        status_json = status_r.json()
        if status_json['status'] != 'success':
            return self.custom_retry(status_json, screenshot=screenshot)

        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

        try:
            # fetch the archived copy to recover a page title for the result
            req = requests.get(archive_url)
            parsed = BeautifulSoup(req.content, 'html.parser')
            title = parsed.find('title').text
            if title == 'Wayback Machine':
                # the Wayback Machine's own placeholder, not the page's title
                title = 'Could not get title'
        except Exception:
            title = 'Could not get title'

        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
        return self.seen_urls[url]

    def custom_retry(self, json_data, **kwargs):
        logger.warning(f"Internet archive failed, returned JSON:\n{json_data}")
        if "please try again" in str(json_data).lower():
            return self.signal_retry_in(**kwargs)
        if "this host has been already captured" in str(json_data).lower():
            return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600)  # 24h to 36h later
        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
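

# Usage sketch (hypothetical values; assumes a configured Storage, a Selenium
# driver for screenshots, and S3-style keys from https://archive.org/account/s3.php):
#
#   config = WaybackConfig(key="myaccesskey", secret="mysecret")
#   archiver = WaybackArchiver(storage, driver, config)
#   result = archiver.download("https://example.com/article")
#   if result and result.status == 'success':
#       print(result.cdn_url)  # https://web.archive.org/web/<timestamp>/<url>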