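"""Google Sheets driven auto-archiver.

Iterates over every worksheet in the configured spreadsheet, runs a prioritised
list of archivers against each row's URL, and writes the outcome (status,
archive link, thumbnail, hash, ...) back into the sheet.
"""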
import os, datetime, traceback, random, tempfile

from loguru import logger
from slugify import slugify

from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
from storages import Storage

random.seed()


def update_sheet(gw, row, result: ArchiveResult):
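    """Batch-write the fields of an ArchiveResult into `row`, only filling
    cells whose column exists and is still empty; status is always updated."""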
    cell_updates = []
    row_values = gw.get_row(row)

    def batch_if_valid(col, val, final_value=None):
        # queue an update for this column if there is a value to write, the
        # column exists in the sheet, and the current cell is still empty
        final_value = final_value or val
        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
            cell_updates.append((row, col, final_value))

    cell_updates.append((row, 'status', result.status))

    batch_if_valid('archive', result.cdn_url)
    batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).isoformat())
    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail_index', result.thumbnail_index)
    batch_if_valid('title', result.title)
    batch_if_valid('duration', result.duration, str(result.duration))
    batch_if_valid('screenshot', result.screenshot)
    batch_if_valid('hash', result.hash)
    if result.timestamp is not None:
        if isinstance(result.timestamp, int):
            # treat int timestamps as UTC epoch seconds; passing tz= avoids
            # fromtimestamp() first converting to the machine's local timezone
            timestamp_string = datetime.datetime.fromtimestamp(result.timestamp, tz=datetime.timezone.utc).isoformat()
        elif isinstance(result.timestamp, str):
            timestamp_string = result.timestamp
        else:
            timestamp_string = result.timestamp.isoformat()
        batch_if_valid('timestamp', timestamp_string)
    gw.batch_set_cell(cell_updates)


def missing_required_columns(gw: GWorksheet) -> bool:
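    """Return True (after logging a warning) if the worksheet lacks any of the
    columns this script cannot work without: 'url' and 'status'."""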
    missing = False
    for required_col in ['url', 'status']:
        if not gw.col_exists(required_col):
            logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
            missing = True
    return missing


def process_sheet(c: Config):
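    """Process every worksheet in the configured spreadsheet: for each row that
    has a URL but no final status, try the archivers in priority order and
    record the first result back into the sheet."""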
    sh = c.gsheets_client.open(c.sheet)

    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
        logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)

        if missing_required_columns(gw): continue

        # archives will default to being in a folder 'doc_name/worksheet_name'
        default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
        c.set_folder(default_folder)
        storage = c.get_storage()
        # loop through rows in worksheet
        for row in range(1 + c.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url')
            original_status = gw.get_cell(row, 'status')
            # re-read the status from the live sheet only when the cached value
            # looks unprocessed, so a row another process has already claimed
            # is not archived twice
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')

            is_retry = False
            if url == '' or status not in ['', None]:
                is_retry = Archiver.should_retry_from_status(status)
                if not is_retry: continue
            # All checks done - archival process starts here
            try:
                gw.set_cell(row, 'status', 'Archive in progress')

                url = expand_url(url)
                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))

                # make a new driver so each spreadsheet row is idempotent
                c.recreate_webdriver()

                # order matters, first to succeed excludes remaining
                active_archivers = [
                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
                    TiktokArchiver(storage, c.webdriver),
                    TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                    TelegramArchiver(storage, c.webdriver),
                    TwitterArchiver(storage, c.webdriver),
                    VkArchiver(storage, c.webdriver, c.vk_config),
                    WaybackArchiver(storage, c.webdriver, c.wayback_config)
                ]
                for archiver in active_archivers:
                    logger.debug(f'Trying {archiver} on {row=}')

                    try:
                        result = archiver.download(url, check_if_exists=c.check_if_exists)
                    except KeyboardInterrupt as e:
                        raise e  # so the higher level catch can catch it
                    except Exception as e:
                        result = False
                        logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

                    if result:
                        success = result.status in ['success', 'already archived']
                        result.status = f"{archiver.name}: {result.status}"
                        if success:
                            logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                            break
                        # only 1 retry possible for now
                        if is_retry and Archiver.is_retry(result.status):
                            result.status = Archiver.remove_retry(result.status)
                        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
                if result:
                    update_sheet(gw, row, result)
                else:
                    gw.set_cell(row, 'status', 'failed: no archiver')
            except KeyboardInterrupt:
                # catches keyboard interruptions to do a clean exit
                logger.warning(f"caught interrupt on {row=}, {url=}")
                gw.set_cell(row, 'status', '')
                c.destroy_webdriver()
                exit()
            except Exception as e:
                logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
                gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')

        logger.success(f'Finished worksheet {wks.title}')


@logger.catch
def main():
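    """Parse the configuration, then process the whole spreadsheet, keeping
    temporary downloads in a throwaway directory that is deleted on exit."""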
    c = Config()
    c.parse()
    logger.info(f'Opening document {c.sheet} for header {c.header}')
    with tempfile.TemporaryDirectory(dir="./") as tmpdir:
        Storage.TMP_FOLDER = tmpdir
        process_sheet(c)
    c.destroy_webdriver()


if __name__ == '__main__':
    main()
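# Example invocation (a sketch: the exact CLI flags are defined by
# Config.parse(), which lives outside this file, so the flag name below is
# an assumption):
#   python auto_archive.py --sheet "My Archiving Sheet"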