config.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. import argparse, yaml, json
  2. import gspread
  3. from loguru import logger
  4. from selenium import webdriver
  5. from dataclasses import asdict
  6. from selenium.common.exceptions import TimeoutException
  7. from utils import GWorksheet, getattr_or
  8. from .wayback_config import WaybackConfig
  9. from .telethon_config import TelethonConfig
  10. from .selenium_config import SeleniumConfig
  11. from .vk_config import VkConfig
  12. from .twitter_api_config import TwitterApiConfig
  13. from storages import S3Config, S3Storage, GDStorage, GDConfig, LocalStorage, LocalConfig
  class Config:
      """
      Controls the current execution parameters and manages API configurations

      Usage:
        c = Config() # initializes the argument parser
        c.parse() # parses the values and initializes the Services and API clients
        # you can then access the Services and APIs like 'c.s3_config'

      All the configurations available as cmd line options, when included, will
      override the configurations in the config.yaml file.

      Configurations are split between:
      1. "secrets" containing API keys for generating services - not kept in memory
      2. "execution" containing specific execution configurations
      """

      AVAILABLE_STORAGES = {"s3", "gd", "local"}

      def __init__(self):
          """Build the command line parser; nothing is read or connected until parse()."""
          self.parser = self.get_argument_parser()
          self.folder = ""

      def parse(self):
          """Parse the command line arguments, then load the YAML file given by --config."""
          self.args = self.parser.parse_args()
          logger.success('Command line arguments parsed successfully')
          self.config_file = self.args.config
          self.read_config_yaml()
          logger.info(f'APIs and Services initialized:\n{self}')

      def read_config_yaml(self):
          """
          Load self.config_file and populate the execution settings and the
          API/service configurations from it.

          The "secrets" section is removed from self.config once it has been consumed.

          Raises:
              AssertionError: if no sheet is configured, or if the selected storage
                  has no matching entry under "secrets".
          """
          with open(self.config_file, "r", encoding="utf-8") as inf:
              # an empty YAML document loads as None: treat it as an empty mapping
              self.config = yaml.safe_load(inf) or {}

          # a section that is present but left empty also loads as None
          self._read_execution(self.config.get("execution") or {})
          self._read_secrets(self.config.get("secrets") or {})

          # delete to prevent leaks; pop() tolerates a file without a "secrets" section
          self.config.pop("secrets", None)

      def _read_execution(self, execution):
          """Populate the run settings from the "execution" mapping and self.args."""
          # ----------------------EXECUTION - execution configurations
          self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
          assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
          self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
          self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))

          # store_true flags are False (never None) when omitted, hence the `or` fallback
          self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
          if self.save_logs:
              self.set_log_files()
          self.check_if_exists = getattr(self.args, "check_if_exists") or execution.get("check_if_exists", False)

          # Column names come from config and can be overwritten by CMD
          # in the end all are considered as lower case
          config_column_names = execution.get("column_names", {})
          self.column_names = {
              k: getattr_or(self.args, k, config_column_names.get(k, default_name)).lower()
              for k, default_name in GWorksheet.COLUMN_NAMES.items()
          }

          # selenium driver
          selenium_configs = execution.get("selenium", {})
          self.selenium_config = SeleniumConfig(
              timeout_seconds=int(selenium_configs.get("timeout_seconds", SeleniumConfig.timeout_seconds)),
              window_width=int(selenium_configs.get("window_width", SeleniumConfig.window_width)),
              window_height=int(selenium_configs.get("window_height", SeleniumConfig.window_height))
          )
          # placeholder until recreate_webdriver() creates a real driver
          self.webdriver = "not initialized"

      def _read_secrets(self, secrets):
          """Create the API clients and service configurations from the "secrets" mapping."""
          # ---------------------- SECRETS - APIs and service configurations
          # assert selected storage credentials exist
          for key, name in [("s3", "s3"), ("gd", "google_drive"), ("local", "local")]:
              assert self.storage != key or name in secrets, f"selected storage '{key}' requires secrets.'{name}' in {self.config_file}"

          # google sheets config
          self.gsheets_client = gspread.service_account(
              filename=secrets.get("google_sheets", {}).get("service_account", 'service_account.json')
          )

          # facebook config
          self.facebook_cookie = secrets.get("facebook", {}).get("cookie", None)

          # s3 config
          if "s3" in secrets:
              s3 = secrets["s3"]
              self.s3_config = S3Config(
                  bucket=s3["bucket"],
                  region=s3["region"],
                  key=s3["key"],
                  secret=s3["secret"],
                  endpoint_url=s3.get("endpoint_url", S3Config.endpoint_url),
                  cdn_url=s3.get("cdn_url", S3Config.cdn_url),
                  key_path=s3.get("key_path", S3Config.key_path),
                  # argparse stores '--s3-private' under the dest 's3_private'
                  private=getattr(self.args, "s3_private", False) or s3.get("private", S3Config.private)
              )

          # GDrive config
          if "google_drive" in secrets:
              gd = secrets["google_drive"]
              self.gd_config = GDConfig(
                  root_folder_id=gd.get("root_folder_id"),
                  service_account=gd.get("service_account", GDConfig.service_account)
              )

          # local storage config
          if "local" in secrets:
              self.local_config = LocalConfig(
                  save_to=secrets["local"].get("save_to", LocalConfig.save_to),
              )

          # wayback machine config
          if "wayback" in secrets:
              self.wayback_config = WaybackConfig(
                  key=secrets["wayback"]["key"],
                  secret=secrets["wayback"]["secret"],
              )
          else:
              self.wayback_config = None
              logger.debug(f"'wayback' key not present in the {self.config_file=}")

          # telethon config
          if "telegram" in secrets:
              self.telegram_config = TelethonConfig(
                  api_id=secrets["telegram"]["api_id"],
                  api_hash=secrets["telegram"]["api_hash"],
                  bot_token=secrets["telegram"].get("bot_token", None)
              )
          else:
              self.telegram_config = None
              logger.debug(f"'telegram' key not present in the {self.config_file=}")

          # twitter config
          if "twitter" in secrets:
              self.twitter_config = TwitterApiConfig(
                  bearer_token=secrets["twitter"].get("bearer_token"),
                  consumer_key=secrets["twitter"].get("consumer_key"),
                  consumer_secret=secrets["twitter"].get("consumer_secret"),
                  access_token=secrets["twitter"].get("access_token"),
                  access_secret=secrets["twitter"].get("access_secret"),
              )
          else:
              self.twitter_config = None
              logger.debug(f"'twitter' key not present in the {self.config_file=}")

          # vk config
          if "vk" in secrets:
              self.vk_config = VkConfig(
                  username=secrets["vk"]["username"],
                  password=secrets["vk"]["password"]
              )
          else:
              self.vk_config = None
              logger.debug(f"'vk' key not present in the {self.config_file=}")

      def set_log_files(self):
          """Register one log file sink per level under logs/."""
          # called only when config.execution.save_logs=true
          logger.add("logs/1trace.log", level="TRACE")
          logger.add("logs/2info.log", level="INFO")
          logger.add("logs/3success.log", level="SUCCESS")
          logger.add("logs/4warning.log", level="WARNING")
          logger.add("logs/5error.log", level="ERROR")

      def get_argument_parser(self):
          """
          Creates the CMD line arguments. 'python auto_archive.py --help'
          """
          parser = argparse.ArgumentParser(description='Automatically archive social media posts, videos, and images from a Google Sheets document. The command line arguments will always override the configurations in the provided YAML config file (--config), only some high-level options are allowed via the command line and the YAML configuration file is the preferred method. The sheet must have the "url" and "status" for the archiver to work. ')

          parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='config.yaml')
          # sorted so that the choices are listed in a stable order in --help
          parser.add_argument('--storage', action='store', dest='storage', help='which storage to use [execution.storage in config.yaml]', choices=sorted(Config.AVAILABLE_STORAGES))
          parser.add_argument('--sheet', action='store', dest='sheet', help='the name of the google sheets document [execution.sheet in config.yaml]')
          parser.add_argument('--header', action='store', dest='header', help='1-based index for the header row [execution.header in config.yaml]')
          parser.add_argument('--check-if-exists', action='store_true', dest='check_if_exists', help='when possible checks if the URL has been archived before and does not archive the same URL twice [execution.check_if_exists]')
          parser.add_argument('--save-logs', action='store_true', dest='save_logs', help='creates or appends execution logs to files logs/LEVEL.log [execution.save_logs]')
          parser.add_argument('--s3-private', action='store_true', help='Store content without public access permission (only for storage=s3) [secrets.s3.private in config.yaml]')

          # one '--col-<name>' option per known worksheet column
          for k, v in GWorksheet.COLUMN_NAMES.items():
              if k in ["url", "folder"]:
                  help_text = f"the name of the column to READ {k} FROM (default='{v}')"
              else:
                  help_text = f"the name of the column to FILL WITH {k} (default='{v}')"
              parser.add_argument(f'--col-{k}', action='store', dest=k, help=help_text)

          return parser

      def set_folder(self, folder):
          """
          update the folder in each of the storages
          """
          self.folder = folder
          # configs and storages only exist once configured/created, so skip missing ones
          for attr in ("s3_config", "s3_storage", "gd_config", "gd_storage", "local_config", "local_storage"):
              if hasattr(self, attr):
                  getattr(self, attr).folder = folder

      def get_storage(self):
          """
          returns the configured type of storage, creating if needed

          Raises:
              ValueError: if self.storage is not one of Config.AVAILABLE_STORAGES.
          """
          # the storage is built only when missing, so a cached one is never rebuilt
          if self.storage == "s3":
              if getattr(self, "s3_storage", None) is None:
                  self.s3_storage = S3Storage(self.s3_config)
              return self.s3_storage
          elif self.storage == "gd":
              if getattr(self, "gd_storage", None) is None:
                  self.gd_storage = GDStorage(self.gd_config)
              return self.gd_storage
          elif self.storage == "local":
              if getattr(self, "local_storage", None) is None:
                  self.local_storage = LocalStorage(self.local_config)
              return self.local_storage
          raise ValueError(f"storage {self.storage} not implemented, available: {Config.AVAILABLE_STORAGES}")

      def destroy_webdriver(self):
          """Close and quit the current webdriver, if a real one exists; safe to call repeatedly."""
          driver = getattr(self, "webdriver", None)
          # a str value is the "not initialized" placeholder, not a driver
          if driver is None or isinstance(driver, str):
              return
          try:
              driver.close()
          finally:
              # drop the reference first so a failing quit() cannot leave a dead driver behind
              self.webdriver = None
              driver.quit()

      def recreate_webdriver(self):
          """Replace the current webdriver with a new headless Firefox sized per selenium_config."""
          options = webdriver.FirefoxOptions()
          options.headless = True
          options.set_preference('network.protocol-handler.external.tg', False)
          try:
              new_webdriver = webdriver.Firefox(options=options)
              # only destroy if creation is successful
              self.destroy_webdriver()
              self.webdriver = new_webdriver
              self.webdriver.set_window_size(self.selenium_config.window_width,
                                             self.selenium_config.window_height)
              self.webdriver.set_page_load_timeout(self.selenium_config.timeout_seconds)
          except TimeoutException as e:
              logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

      def __str__(self) -> str:
          """Return a JSON summary of the settings; services are reported as booleans only."""
          driver = getattr(self, "webdriver", None)
          return json.dumps({
              "config_file": self.config_file,
              "sheet": self.sheet,
              "storage": self.storage,
              "header": self.header,
              "check_if_exists": self.check_if_exists,
              "save_logs": self.save_logs,
              "selenium_config": asdict(self.selenium_config),
              # the str placeholder means no driver has been created yet
              "selenium_webdriver": driver is not None and not isinstance(driver, str),
              "s3_config": hasattr(self, "s3_config"),
              "s3_private": getattr_or(getattr(self, "s3_config", {}), "private", None),
              "gd_config": hasattr(self, "gd_config"),
              "local_config": hasattr(self, "local_config"),
              "wayback_config": self.wayback_config is not None,
              "telegram_config": self.telegram_config is not None,
              "twitter_config": self.twitter_config is not None,
              "vk_config": self.vk_config is not None,
              "gsheets_client": self.gsheets_client is not None,
              "column_names": self.column_names,
          }, ensure_ascii=False, indent=4)