# auto_archive.py
  1. import os, datetime, traceback, random, tempfile
  2. from loguru import logger
  3. from slugify import slugify
  4. from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, TwitterApiArchiver, VkArchiver, WaybackArchiver, ArchiveResult, Archiver
  5. from utils import GWorksheet, mkdir_if_not_exists, expand_url
  6. from configs import Config
  7. from storages import Storage
# Seed the module-level random number generator. Called with no argument, so
# the seed comes from an OS randomness source when available, otherwise from
# the current system time.
# NOTE(review): `random` is not used anywhere else in the visible part of this
# file -- confirm whether this call is still needed.
random.seed()
  9. def update_sheet(gw, row, result: ArchiveResult):
  10. cell_updates = []
  11. row_values = gw.get_row(row)
  12. def batch_if_valid(col, val, final_value=None):
  13. final_value = final_value or val
  14. if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
  15. cell_updates.append((row, col, final_value))
  16. cell_updates.append((row, 'status', result.status))
  17. batch_if_valid('archive', result.cdn_url)
  18. batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
  19. batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
  20. batch_if_valid('thumbnail_index', result.thumbnail_index)
  21. batch_if_valid('title', result.title)
  22. batch_if_valid('duration', result.duration, str(result.duration))
  23. batch_if_valid('screenshot', result.screenshot)
  24. batch_if_valid('hash', result.hash)
  25. if result.timestamp is not None:
  26. if type(result.timestamp) == int:
  27. timestamp_string = datetime.datetime.fromtimestamp(result.timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
  28. elif type(result.timestamp) == str:
  29. timestamp_string = result.timestamp
  30. else:
  31. timestamp_string = result.timestamp.isoformat()
  32. batch_if_valid('timestamp', timestamp_string)
  33. gw.batch_set_cell(cell_updates)
  34. def missing_required_columns(gw: GWorksheet):
  35. missing = False
  36. for required_col in ['url', 'status']:
  37. if not gw.col_exists(required_col):
  38. logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
  39. missing = True
  40. return missing
  41. def process_sheet(c: Config):
  42. sh = c.gsheets_client.open(c.sheet)
  43. # loop through worksheets to check
  44. for ii, wks in enumerate(sh.worksheets()):
  45. logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
  46. gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
  47. if missing_required_columns(gw): continue
  48. # archives will default to being in a folder 'doc_name/worksheet_name'
  49. default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
  50. c.set_folder(default_folder)
  51. storage = c.get_storage()
  52. # loop through rows in worksheet
  53. for row in range(1 + c.header, gw.count_rows() + 1):
  54. url = gw.get_cell(row, 'url')
  55. original_status = gw.get_cell(row, 'status')
  56. status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
  57. is_retry = False
  58. if url == '' or status not in ['', None]:
  59. is_retry = Archiver.should_retry_from_status(status)
  60. if not is_retry: continue
  61. # All checks done - archival process starts here
  62. try:
  63. gw.set_cell(row, 'status', 'Archive in progress')
  64. url = expand_url(url)
  65. c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
  66. # make a new driver so each spreadsheet row is idempotent
  67. c.recreate_webdriver()
  68. # order matters, first to succeed excludes remaining
  69. active_archivers = [
  70. TelethonArchiver(storage, c.webdriver, c.telegram_config),
  71. TiktokArchiver(storage, c.webdriver),
  72. TwitterApiArchiver(storage, c.webdriver, c.twitter_config),
  73. YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
  74. TelegramArchiver(storage, c.webdriver),
  75. TwitterArchiver(storage, c.webdriver),
  76. VkArchiver(storage, c.webdriver, c.vk_config),
  77. WaybackArchiver(storage, c.webdriver, c.wayback_config)
  78. ]
  79. for archiver in active_archivers:
  80. logger.debug(f'Trying {archiver} on {row=}')
  81. try:
  82. result = archiver.download(url, check_if_exists=c.check_if_exists)
  83. except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
  84. except Exception as e:
  85. result = False
  86. logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
  87. if result:
  88. success = result.status in ['success', 'already archived']
  89. result.status = f"{archiver.name}: {result.status}"
  90. if success:
  91. logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
  92. break
  93. # only 1 retry possible for now
  94. if is_retry and Archiver.is_retry(result.status):
  95. result.status = Archiver.remove_retry(result.status)
  96. logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
  97. if result:
  98. update_sheet(gw, row, result)
  99. else:
  100. gw.set_cell(row, 'status', 'failed: no archiver')
  101. except KeyboardInterrupt:
  102. # catches keyboard interruptions to do a clean exit
  103. logger.warning(f"caught interrupt on {row=}, {url=}")
  104. gw.set_cell(row, 'status', '')
  105. c.destroy_webdriver()
  106. exit()
  107. except Exception as e:
  108. logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
  109. gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')
  110. logger.success(f'Finished worksheet {wks.title}')
  111. @logger.catch
  112. def main():
  113. c = Config()
  114. c.parse()
  115. logger.info(f'Opening document {c.sheet} for header {c.header}')
  116. with tempfile.TemporaryDirectory(dir="./") as tmpdir:
  117. Storage.TMP_FOLDER = tmpdir
  118. process_sheet(c)
  119. c.destroy_webdriver()
# Run the archiver only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()