connector.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. #!/usr/bin/env python3
  2. # coding=utf-8
  3. import configparser
  4. import importlib.resources
  5. import itertools
  6. import logging
  7. import logging.handlers
  8. import re
  9. import shutil
  10. import socket
  11. from abc import ABCMeta, abstractmethod
  12. from datetime import datetime
  13. from enum import Enum, auto
  14. from pathlib import Path
  15. from typing import Callable, Iterator
  16. import appdirs
  17. import praw
  18. import praw.exceptions
  19. import praw.models
  20. import prawcore
  21. from bdfr import exceptions as errors
  22. from bdfr.configuration import Configuration
  23. from bdfr.download_filter import DownloadFilter
  24. from bdfr.file_name_formatter import FileNameFormatter
  25. from bdfr.oauth2 import OAuth2Authenticator, OAuth2TokenManager
  26. from bdfr.site_authenticator import SiteAuthenticator
  27. logger = logging.getLogger(__name__)
  28. class RedditTypes:
  29. class SortType(Enum):
  30. CONTROVERSIAL = auto()
  31. HOT = auto()
  32. NEW = auto()
  33. RELEVENCE = auto()
  34. RISING = auto()
  35. TOP = auto()
  36. class TimeType(Enum):
  37. ALL = 'all'
  38. DAY = 'day'
  39. HOUR = 'hour'
  40. MONTH = 'month'
  41. WEEK = 'week'
  42. YEAR = 'year'
  43. class RedditConnector(metaclass=ABCMeta):
  44. def __init__(self, args: Configuration):
  45. self.args = args
  46. self.config_directories = appdirs.AppDirs('bdfr', 'BDFR')
  47. self.run_time = datetime.now().isoformat()
  48. self._setup_internal_objects()
  49. self.reddit_lists = self.retrieve_reddit_lists()
  50. def _setup_internal_objects(self):
  51. self.determine_directories()
  52. self.load_config()
  53. self.create_file_logger()
  54. self.read_config()
  55. self.parse_disabled_modules()
  56. self.download_filter = self.create_download_filter()
  57. logger.log(9, 'Created download filter')
  58. self.time_filter = self.create_time_filter()
  59. logger.log(9, 'Created time filter')
  60. self.sort_filter = self.create_sort_filter()
  61. logger.log(9, 'Created sort filter')
  62. self.file_name_formatter = self.create_file_name_formatter()
  63. logger.log(9, 'Create file name formatter')
  64. self.create_reddit_instance()
  65. self.args.user = list(filter(None, [self.resolve_user_name(user) for user in self.args.user]))
  66. self.excluded_submission_ids = set.union(
  67. self.read_id_files(self.args.exclude_id_file),
  68. set(self.args.exclude_id),
  69. )
  70. self.args.link = list(itertools.chain(self.args.link, self.read_id_files(self.args.include_id_file)))
  71. self.master_hash_list = {}
  72. self.authenticator = self.create_authenticator()
  73. logger.log(9, 'Created site authenticator')
  74. self.args.skip_subreddit = self.split_args_input(self.args.skip_subreddit)
  75. self.args.skip_subreddit = set([sub.lower() for sub in self.args.skip_subreddit])
  76. def read_config(self):
  77. """Read any cfg values that need to be processed"""
  78. if self.args.max_wait_time is None:
  79. self.args.max_wait_time = self.cfg_parser.getint('DEFAULT', 'max_wait_time', fallback=120)
  80. logger.debug(f'Setting maximum download wait time to {self.args.max_wait_time} seconds')
  81. if self.args.time_format is None:
  82. option = self.cfg_parser.get('DEFAULT', 'time_format', fallback='ISO')
  83. if re.match(r'^[\s\'\"]*$', option):
  84. option = 'ISO'
  85. logger.debug(f'Setting datetime format string to {option}')
  86. self.args.time_format = option
  87. if not self.args.disable_module:
  88. self.args.disable_module = [self.cfg_parser.get('DEFAULT', 'disabled_modules', fallback='')]
  89. # Update config on disk
  90. with open(self.config_location, 'w') as file:
  91. self.cfg_parser.write(file)
  92. def parse_disabled_modules(self):
  93. disabled_modules = self.args.disable_module
  94. disabled_modules = self.split_args_input(disabled_modules)
  95. disabled_modules = set([name.strip().lower() for name in disabled_modules])
  96. self.args.disable_module = disabled_modules
  97. logger.debug(f'Disabling the following modules: {", ".join(self.args.disable_module)}')
  98. def create_reddit_instance(self):
  99. if self.args.authenticate:
  100. logger.debug('Using authenticated Reddit instance')
  101. if not self.cfg_parser.has_option('DEFAULT', 'user_token'):
  102. logger.log(9, 'Commencing OAuth2 authentication')
  103. scopes = self.cfg_parser.get('DEFAULT', 'scopes', fallback='identity, history, read, save')
  104. scopes = OAuth2Authenticator.split_scopes(scopes)
  105. oauth2_authenticator = OAuth2Authenticator(
  106. scopes,
  107. self.cfg_parser.get('DEFAULT', 'client_id'),
  108. self.cfg_parser.get('DEFAULT', 'client_secret'),
  109. )
  110. token = oauth2_authenticator.retrieve_new_token()
  111. self.cfg_parser['DEFAULT']['user_token'] = token
  112. with open(self.config_location, 'w') as file:
  113. self.cfg_parser.write(file, True)
  114. token_manager = OAuth2TokenManager(self.cfg_parser, self.config_location)
  115. self.authenticated = True
  116. self.reddit_instance = praw.Reddit(
  117. client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
  118. client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
  119. user_agent=socket.gethostname(),
  120. token_manager=token_manager,
  121. )
  122. else:
  123. logger.debug('Using unauthenticated Reddit instance')
  124. self.authenticated = False
  125. self.reddit_instance = praw.Reddit(
  126. client_id=self.cfg_parser.get('DEFAULT', 'client_id'),
  127. client_secret=self.cfg_parser.get('DEFAULT', 'client_secret'),
  128. user_agent=socket.gethostname(),
  129. )
  130. def retrieve_reddit_lists(self) -> list[praw.models.ListingGenerator]:
  131. master_list = []
  132. master_list.extend(self.get_subreddits())
  133. logger.log(9, 'Retrieved subreddits')
  134. master_list.extend(self.get_multireddits())
  135. logger.log(9, 'Retrieved multireddits')
  136. master_list.extend(self.get_user_data())
  137. logger.log(9, 'Retrieved user data')
  138. master_list.extend(self.get_submissions_from_link())
  139. logger.log(9, 'Retrieved submissions for given links')
  140. return master_list
  141. def determine_directories(self):
  142. self.download_directory = Path(self.args.directory).resolve().expanduser()
  143. self.config_directory = Path(self.config_directories.user_config_dir)
  144. self.download_directory.mkdir(exist_ok=True, parents=True)
  145. self.config_directory.mkdir(exist_ok=True, parents=True)
  146. def load_config(self):
  147. self.cfg_parser = configparser.ConfigParser()
  148. if self.args.config:
  149. if (cfg_path := Path(self.args.config)).exists():
  150. self.cfg_parser.read(cfg_path)
  151. self.config_location = cfg_path
  152. return
  153. possible_paths = [
  154. Path('./config.cfg'),
  155. Path('./default_config.cfg'),
  156. Path(self.config_directory, 'config.cfg'),
  157. Path(self.config_directory, 'default_config.cfg'),
  158. ]
  159. self.config_location = None
  160. for path in possible_paths:
  161. if path.resolve().expanduser().exists():
  162. self.config_location = path
  163. logger.debug(f'Loading configuration from {path}')
  164. break
  165. if not self.config_location:
  166. with importlib.resources.path('bdfr', 'default_config.cfg') as path:
  167. self.config_location = path
  168. shutil.copy(self.config_location, Path(self.config_directory, 'default_config.cfg'))
  169. if not self.config_location:
  170. raise errors.BulkDownloaderException('Could not find a configuration file to load')
  171. self.cfg_parser.read(self.config_location)
  172. def create_file_logger(self):
  173. main_logger = logging.getLogger()
  174. if self.args.log is None:
  175. log_path = Path(self.config_directory, 'log_output.txt')
  176. else:
  177. log_path = Path(self.args.log).resolve().expanduser()
  178. if not log_path.parent.exists():
  179. raise errors.BulkDownloaderException(f'Designated location for logfile does not exist')
  180. backup_count = self.cfg_parser.getint('DEFAULT', 'backup_log_count', fallback=3)
  181. file_handler = logging.handlers.RotatingFileHandler(
  182. log_path,
  183. mode='a',
  184. backupCount=backup_count,
  185. )
  186. if log_path.exists():
  187. try:
  188. file_handler.doRollover()
  189. except PermissionError:
  190. logger.critical(
  191. 'Cannot rollover logfile, make sure this is the only '
  192. 'BDFR process or specify alternate logfile location')
  193. raise
  194. formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] - %(message)s')
  195. file_handler.setFormatter(formatter)
  196. file_handler.setLevel(0)
  197. main_logger.addHandler(file_handler)
  198. @staticmethod
  199. def sanitise_subreddit_name(subreddit: str) -> str:
  200. pattern = re.compile(r'^(?:https://www\.reddit\.com/)?(?:r/)?(.*?)/?$')
  201. match = re.match(pattern, subreddit)
  202. if not match:
  203. raise errors.BulkDownloaderException(f'Could not find subreddit name in string {subreddit}')
  204. return match.group(1)
  205. @staticmethod
  206. def split_args_input(entries: list[str]) -> set[str]:
  207. all_entries = []
  208. split_pattern = re.compile(r'[,;]\s?')
  209. for entry in entries:
  210. results = re.split(split_pattern, entry)
  211. all_entries.extend([RedditConnector.sanitise_subreddit_name(name) for name in results])
  212. return set(all_entries)
  213. def get_subreddits(self) -> list[praw.models.ListingGenerator]:
  214. out = []
  215. subscribed_subreddits = set()
  216. if self.args.subscribed:
  217. if self.args.authenticate:
  218. try:
  219. subscribed_subreddits = list(self.reddit_instance.user.subreddits(limit=None))
  220. subscribed_subreddits = set([s.display_name for s in subscribed_subreddits])
  221. except prawcore.InsufficientScope:
  222. logger.error('BDFR has insufficient scope to access subreddit lists')
  223. else:
  224. logger.error('Cannot find subscribed subreddits without an authenticated instance')
  225. if self.args.subreddit or subscribed_subreddits:
  226. for reddit in self.split_args_input(self.args.subreddit) | subscribed_subreddits:
  227. if reddit == 'friends' and self.authenticated is False:
  228. logger.error('Cannot read friends subreddit without an authenticated instance')
  229. continue
  230. try:
  231. reddit = self.reddit_instance.subreddit(reddit)
  232. try:
  233. self.check_subreddit_status(reddit)
  234. except errors.BulkDownloaderException as e:
  235. logger.error(e)
  236. continue
  237. if self.args.search:
  238. out.append(reddit.search(
  239. self.args.search,
  240. sort=self.sort_filter.name.lower(),
  241. limit=self.args.limit,
  242. time_filter=self.time_filter.value,
  243. ))
  244. logger.debug(
  245. f'Added submissions from subreddit {reddit} with the search term "{self.args.search}"')
  246. else:
  247. out.append(self.create_filtered_listing_generator(reddit))
  248. logger.debug(f'Added submissions from subreddit {reddit}')
  249. except (errors.BulkDownloaderException, praw.exceptions.PRAWException) as e:
  250. logger.error(f'Failed to get submissions for subreddit {reddit}: {e}')
  251. return out
  252. def resolve_user_name(self, in_name: str) -> str:
  253. if in_name == 'me':
  254. if self.authenticated:
  255. resolved_name = self.reddit_instance.user.me().name
  256. logger.log(9, f'Resolved user to {resolved_name}')
  257. return resolved_name
  258. else:
  259. logger.warning('To use "me" as a user, an authenticated Reddit instance must be used')
  260. else:
  261. return in_name
  262. def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
  263. supplied_submissions = []
  264. for sub_id in self.args.link:
  265. if len(sub_id) == 6:
  266. supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
  267. else:
  268. supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
  269. return [supplied_submissions]
  270. def determine_sort_function(self) -> Callable:
  271. if self.sort_filter is RedditTypes.SortType.NEW:
  272. sort_function = praw.models.Subreddit.new
  273. elif self.sort_filter is RedditTypes.SortType.RISING:
  274. sort_function = praw.models.Subreddit.rising
  275. elif self.sort_filter is RedditTypes.SortType.CONTROVERSIAL:
  276. sort_function = praw.models.Subreddit.controversial
  277. elif self.sort_filter is RedditTypes.SortType.TOP:
  278. sort_function = praw.models.Subreddit.top
  279. else:
  280. sort_function = praw.models.Subreddit.hot
  281. return sort_function
  282. def get_multireddits(self) -> list[Iterator]:
  283. if self.args.multireddit:
  284. if len(self.args.user) != 1:
  285. logger.error(f'Only 1 user can be supplied when retrieving from multireddits')
  286. return []
  287. out = []
  288. for multi in self.split_args_input(self.args.multireddit):
  289. try:
  290. multi = self.reddit_instance.multireddit(self.args.user[0], multi)
  291. if not multi.subreddits:
  292. raise errors.BulkDownloaderException
  293. out.append(self.create_filtered_listing_generator(multi))
  294. logger.debug(f'Added submissions from multireddit {multi}')
  295. except (errors.BulkDownloaderException, praw.exceptions.PRAWException, prawcore.PrawcoreException) as e:
  296. logger.error(f'Failed to get submissions for multireddit {multi}: {e}')
  297. return out
  298. else:
  299. return []
  300. def create_filtered_listing_generator(self, reddit_source) -> Iterator:
  301. sort_function = self.determine_sort_function()
  302. if self.sort_filter in (RedditTypes.SortType.TOP, RedditTypes.SortType.CONTROVERSIAL):
  303. return sort_function(reddit_source, limit=self.args.limit, time_filter=self.time_filter.value)
  304. else:
  305. return sort_function(reddit_source, limit=self.args.limit)
  306. def get_user_data(self) -> list[Iterator]:
  307. if any([self.args.submitted, self.args.upvoted, self.args.saved]):
  308. if not self.args.user:
  309. logger.warning('At least one user must be supplied to download user data')
  310. return []
  311. generators = []
  312. for user in self.args.user:
  313. try:
  314. self.check_user_existence(user)
  315. except errors.BulkDownloaderException as e:
  316. logger.error(e)
  317. continue
  318. if self.args.submitted:
  319. logger.debug(f'Retrieving submitted posts of user {self.args.user}')
  320. generators.append(self.create_filtered_listing_generator(
  321. self.reddit_instance.redditor(user).submissions,
  322. ))
  323. if not self.authenticated and any((self.args.upvoted, self.args.saved)):
  324. logger.warning('Accessing user lists requires authentication')
  325. else:
  326. if self.args.upvoted:
  327. logger.debug(f'Retrieving upvoted posts of user {self.args.user}')
  328. generators.append(self.reddit_instance.redditor(user).upvoted(limit=self.args.limit))
  329. if self.args.saved:
  330. logger.debug(f'Retrieving saved posts of user {self.args.user}')
  331. generators.append(self.reddit_instance.redditor(user).saved(limit=self.args.limit))
  332. return generators
  333. else:
  334. return []
  335. def check_user_existence(self, name: str):
  336. user = self.reddit_instance.redditor(name=name)
  337. try:
  338. if user.id:
  339. return
  340. except prawcore.exceptions.NotFound:
  341. raise errors.BulkDownloaderException(f'Could not find user {name}')
  342. except AttributeError:
  343. if hasattr(user, 'is_suspended'):
  344. raise errors.BulkDownloaderException(f'User {name} is banned')
  345. def create_file_name_formatter(self) -> FileNameFormatter:
  346. return FileNameFormatter(self.args.file_scheme, self.args.folder_scheme, self.args.time_format)
  347. def create_time_filter(self) -> RedditTypes.TimeType:
  348. try:
  349. return RedditTypes.TimeType[self.args.time.upper()]
  350. except (KeyError, AttributeError):
  351. return RedditTypes.TimeType.ALL
  352. def create_sort_filter(self) -> RedditTypes.SortType:
  353. try:
  354. return RedditTypes.SortType[self.args.sort.upper()]
  355. except (KeyError, AttributeError):
  356. return RedditTypes.SortType.HOT
  357. def create_download_filter(self) -> DownloadFilter:
  358. return DownloadFilter(self.args.skip, self.args.skip_domain)
  359. def create_authenticator(self) -> SiteAuthenticator:
  360. return SiteAuthenticator(self.cfg_parser)
  361. @abstractmethod
  362. def download(self):
  363. pass
  364. @staticmethod
  365. def check_subreddit_status(subreddit: praw.models.Subreddit):
  366. if subreddit.display_name in ('all', 'friends'):
  367. return
  368. try:
  369. assert subreddit.id
  370. except prawcore.NotFound:
  371. raise errors.BulkDownloaderException(f"Source {subreddit.display_name} cannot be found")
  372. except prawcore.Redirect:
  373. raise errors.BulkDownloaderException(f"Source {subreddit.display_name} does not exist")
  374. except prawcore.Forbidden:
  375. raise errors.BulkDownloaderException(f'Source {subreddit.display_name} is private and cannot be scraped')
  376. @staticmethod
  377. def read_id_files(file_locations: list[str]) -> set[str]:
  378. out = []
  379. for id_file in file_locations:
  380. id_file = Path(id_file).resolve().expanduser()
  381. if not id_file.exists():
  382. logger.warning(f'ID file at {id_file} does not exist')
  383. continue
  384. with open(id_file, 'r') as file:
  385. for line in file:
  386. out.append(line.strip())
  387. return set(out)