import time

import requests
from bs4 import BeautifulSoup
from loguru import logger

from configs import WaybackConfig
from storages import Storage

from .base_archiver import Archiver, ArchiveResult


class WaybackArchiver(Archiver):
    """
    This archiver could implement check_if_exists by going to "https://web.archive.org/web/{url}"
    (a sketch of that idea follows __init__ below), but that might not be desirable, since the
    page may have been archived long ago and have changed since.
    """
    name = "wayback"

    def __init__(self, storage: Storage, driver, config: WaybackConfig):
        super().__init__(storage, driver)
        self.config = config
        self.seen_urls = {}
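
    # Sketch only: the class docstring suggests a check_if_exists based on looking up an
    # existing snapshot. One way to do that (an assumption, not part of the original
    # archiver; the method name is hypothetical) is the Wayback Machine availability API
    # at https://archive.org/wayback/available. Per the docstring's caveat, an old
    # snapshot may predate changes to the page, so re-archiving may still be preferable.
    def _find_existing_snapshot(self, url):
        try:
            r = requests.get('https://archive.org/wayback/available', params={'url': url}, timeout=30)
            closest = r.json().get('archived_snapshots', {}).get('closest', {})
            if closest.get('available'):
                # e.g. https://web.archive.org/web/<timestamp>/<url>
                return closest.get('url')
        except (requests.RequestException, ValueError):
            pass
        return None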

    def download(self, url, check_if_exists=False):
        if self.config is None:
            logger.error('Missing Wayback config')
            return False
        if check_if_exists and url in self.seen_urls:
            return self.seen_urls[url]

        screenshot = self.get_screenshot(url)

        logger.debug(f"POSTing {url=} to web.archive.org")
        # Save Page Now (SPN2) authenticates with Internet Archive S3-style keys: "LOW <access>:<secret>"
        ia_headers = {
            "Accept": "application/json",
            "Authorization": f"LOW {self.config.key}:{self.config.secret}"
        }
        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})
        if r.status_code != 200:
            logger.warning(f"Internet Archive failed with status {r.status_code}")
            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)

        # A successful save request returns a job_id; any other payload (e.g. an error
        # "message") is handed to the retry logic instead of raising a KeyError.
        response_json = r.json()
        if 'job_id' not in response_json:
            return self.custom_retry(response_json, screenshot=screenshot)
        job_id = response_json['job_id']
- logger.debug(f"GETting status for {job_id=} on {url=}")
- status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)

        retries = 0
        # TODO: make the job queue parallel -> consider propagation of results back to sheet though
        # Poll until the archive job finishes: up to 30 retries at 3s each (~90s), plus up
        # to 1s extra per failed status request, i.e. roughly 90-120 seconds in total.
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)
            try:
                logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
                status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
            except requests.RequestException:
                time.sleep(1)
            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)

        status_json = status_r.json()
        if status_json['status'] != 'success':
            return self.custom_retry(status_json, screenshot=screenshot)

        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

        try:
            req = requests.get(archive_url)
            parsed = BeautifulSoup(req.content, 'html.parser')
            title = parsed.find('title').text
            # A bare "Wayback Machine" title means the capture has no usable page title
            if title == 'Wayback Machine':
                title = 'Could not get title'
        except Exception:
            title = "Could not get title"

        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
        return self.seen_urls[url]

    def custom_retry(self, json_data, **kwargs):
        logger.warning(f"Internet Archive returned failure JSON:\n{json_data}")
        if "please try again" in str(json_data).lower():
            return self.signal_retry_in(**kwargs)
        if "this host has been already captured" in str(json_data).lower():
            # retry 24h to 36h later
            return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600)
        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)
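
# Minimal usage sketch (illustrative only: Storage, driver and WaybackConfig are
# project-specific, and the constructor arguments shown here are assumptions):
#
#   config = WaybackConfig(key="<IA access key>", secret="<IA secret key>")
#   archiver = WaybackArchiver(storage, driver, config)
#   result = archiver.download("https://example.com/some-page", check_if_exists=True)
#   if result and result.status == 'success':
#       print(result.cdn_url, result.title)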