downloader.py

#!/usr/bin/env python3
# coding=utf-8

import hashlib
import logging.handlers
import os
import time
from datetime import datetime
from multiprocessing import Pool
from pathlib import Path

import praw
import praw.exceptions
import praw.models

from bdfr import exceptions as errors
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
from bdfr.site_downloaders.download_factory import DownloadFactory

logger = logging.getLogger(__name__)
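

# Kept at module level so multiprocessing.Pool can pickle it and send it to
# worker processes; hashes a file in 1 MiB chunks to keep memory use flat.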
def _calc_hash(existing_file: Path):
    chunk_size = 1024 * 1024
    md5_hash = hashlib.md5()
    with open(existing_file, 'rb') as file:
        chunk = file.read(chunk_size)
        while chunk:
            md5_hash.update(chunk)
            chunk = file.read(chunk_size)
    file_hash = md5_hash.hexdigest()
    return existing_file, file_hash


class RedditDownloader(RedditConnector):
    def __init__(self, args: Configuration):
        super().__init__(args)
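        # Pre-hash everything already on disk so duplicate detection also
        # covers files from previous runs when the search_existing option is set.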
        if self.args.search_existing:
            self.master_hash_list = self.scan_existing_files(self.download_directory)
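
    # Drain every submission generator prepared by the connector (subreddits,
    # users, links, etc.) and attempt each submission in turn.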
    def download(self):
        for generator in self.reddit_lists:
            for submission in generator:
                self._download_submission(submission)
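
    # Apply every user-configured filter, then hand the submission URL to the
    # matching site downloader and write its resources to disk.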
    def _download_submission(self, submission: praw.models.Submission):
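        # Filter cascade: any matching branch logs the reason and aborts before
        # a network download is attempted. The type check runs first, since the
        # later checks assume submission attributes exist.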
        if not isinstance(submission, praw.models.Submission):
            logger.warning(f'{submission.id} is not a submission')
            return
        elif submission.id in self.excluded_submission_ids:
            logger.debug(f'Object {submission.id} in exclusion list, skipping')
            return
        elif submission.subreddit.display_name.lower() in self.args.skip_subreddit:
            logger.debug(f'Submission {submission.id} in {submission.subreddit.display_name} in skip list')
            return
        elif (submission.author and submission.author.name in self.args.ignore_user) or \
                (submission.author is None and 'DELETED' in self.args.ignore_user):
            logger.debug(
                f'Submission {submission.id} in {submission.subreddit.display_name} skipped'
                f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user')
            return
        elif self.args.min_score and submission.score < self.args.min_score:
            logger.debug(
                f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]")
            return
        elif self.args.max_score and self.args.max_score < submission.score:
            logger.debug(
                f"Submission {submission.id} filtered due to score {submission.score} > [{self.args.max_score}]")
            return
        elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or (
                self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio
        ):
            logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})")
            return
        elif not self.download_filter.check_url(submission.url):
            logger.debug(f'Submission {submission.id} filtered due to URL {submission.url}')
            return
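
        # Pick the site-specific downloader class for this URL; links no module
        # can handle raise NotADownloadableLinkError from the factory.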
        logger.debug(f'Attempting to download submission {submission.id}')
        try:
            downloader_class = DownloadFactory.pull_lever(submission.url)
            downloader = downloader_class(submission)
            logger.debug(f'Using {downloader_class.__name__} with url {submission.url}')
        except errors.NotADownloadableLinkError as e:
            logger.error(f'Could not download submission {submission.id}: {e}')
            return
        if downloader_class.__name__.lower() in self.args.disable_module:
            logger.debug(f'Submission {submission.id} skipped due to disabled module {downloader_class.__name__}')
            return
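        # Resolve the submission into concrete downloadable resources; site
        # scraping failures skip the submission entirely.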
        try:
            content = downloader.find_resources(self.authenticator)
        except errors.SiteDownloaderError as e:
            logger.error(f'Site {downloader_class.__name__} failed to download submission {submission.id}: {e}')
            return
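        # One submission can map to several files (e.g. a gallery); each gets
        # its own destination path from the file name formatter.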
        for destination, res in self.file_name_formatter.format_resource_paths(content, self.download_directory):
            if destination.exists():
                logger.debug(f'File {destination} from submission {submission.id} already exists, continuing')
                continue
            elif not self.download_filter.check_resource(res):
                logger.debug(f'Download filter removed {submission.id} file with URL {submission.url}')
                continue
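            # Fetch the resource bytes, passing through the configured maximum
            # wait time for the resource's retry logic.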
            try:
                res.download({'max_wait_time': self.args.max_wait_time})
            except errors.BulkDownloaderException as e:
                logger.error(f'Failed to download resource {res.url} in submission {submission.id} '
                             f'with downloader {downloader_class.__name__}: {e}')
                return
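            # The downloaded bytes are hashed so identical content is only
            # stored once across the whole download directory.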
            resource_hash = res.hash.hexdigest()
            destination.parent.mkdir(parents=True, exist_ok=True)
            if resource_hash in self.master_hash_list:
                if self.args.no_dupes:
                    logger.info(
                        f'Resource hash {resource_hash} from submission {submission.id} downloaded elsewhere')
                    return
                elif self.args.make_hard_links:
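                    # Creates a hard link at destination pointing at the
                    # existing copy. Note Path.link_to was removed in Python
                    # 3.12; hardlink_to (with the argument order reversed) is
                    # its replacement.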
                    self.master_hash_list[resource_hash].link_to(destination)
                    logger.info(
                        f'Hard link made linking {destination} to {self.master_hash_list[resource_hash]}'
                        f' in submission {submission.id}')
                    return
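            # Write the resource to disk; a failed write abandons the submission.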
            try:
                with open(destination, 'wb') as file:
                    file.write(res.content)
                logger.debug(f'Written file to {destination}')
            except OSError as e:
                logger.exception(e)
                logger.error(f'Failed to write file in submission {submission.id} to {destination}: {e}')
                return
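            # Backdate the file's access/modification times to the submission's
            # creation time so downloads sort chronologically on disk.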
            creation_time = time.mktime(datetime.fromtimestamp(submission.created_utc).timetuple())
            os.utime(destination, (creation_time, creation_time))
            self.master_hash_list[resource_hash] = destination
            logger.debug(f'Hash added to master list: {resource_hash}')
            logger.info(f'Downloaded submission {submission.id} from {submission.subreddit.display_name}')
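
    # Walk the download directory and hash every existing file, building the
    # {hash: path} map used for duplicate detection.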
    @staticmethod
    def scan_existing_files(directory: Path) -> dict[str, Path]:
        files = []
        for (dirpath, dirnames, filenames) in os.walk(directory):
            files.extend([Path(dirpath, file) for file in filenames])
        logger.info(f'Calculating hashes for {len(files)} files')
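        # Fixed pool of 15 worker processes; each hashes files via the
        # module-level _calc_hash helper.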
        pool = Pool(15)
        results = pool.map(_calc_hash, files)
        pool.close()
        hash_list = {file_hash: file_path for file_path, file_hash in results}
        return hash_list