# archiver.py
  1. #!/usr/bin/env python3
  2. # coding=utf-8
import json
import logging
import re
from typing import Iterator, Union

import dict2xml
import praw.models
import yaml

from bdfr.archive_entry.base_archive_entry import BaseArchiveEntry
from bdfr.archive_entry.comment_archive_entry import CommentArchiveEntry
from bdfr.archive_entry.submission_archive_entry import SubmissionArchiveEntry
from bdfr.configuration import Configuration
from bdfr.connector import RedditConnector
from bdfr.exceptions import ArchiverError
from bdfr.resource import Resource
  17. logger = logging.getLogger(__name__)
  18. class Archiver(RedditConnector):
  19. def __init__(self, args: Configuration):
  20. super(Archiver, self).__init__(args)
  21. def download(self):
  22. for generator in self.reddit_lists:
  23. for submission in generator:
  24. if (submission.author and submission.author.name in self.args.ignore_user) or \
  25. (submission.author is None and 'DELETED' in self.args.ignore_user):
  26. logger.debug(
  27. f'Submission {submission.id} in {submission.subreddit.display_name} skipped'
  28. f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user')
  29. continue
  30. if submission.id in self.excluded_submission_ids:
  31. logger.debug(f'Object {submission.id} in exclusion list, skipping')
  32. continue
  33. logger.debug(f'Attempting to archive submission {submission.id}')
  34. self.write_entry(submission)
  35. def get_submissions_from_link(self) -> list[list[praw.models.Submission]]:
  36. supplied_submissions = []
  37. for sub_id in self.args.link:
  38. if len(sub_id) == 6:
  39. supplied_submissions.append(self.reddit_instance.submission(id=sub_id))
  40. elif re.match(r'^\w{7}$', sub_id):
  41. supplied_submissions.append(self.reddit_instance.comment(id=sub_id))
  42. else:
  43. supplied_submissions.append(self.reddit_instance.submission(url=sub_id))
  44. return [supplied_submissions]
  45. def get_user_data(self) -> list[Iterator]:
  46. results = super(Archiver, self).get_user_data()
  47. if self.args.user and self.args.all_comments:
  48. sort = self.determine_sort_function()
  49. for user in self.args.user:
  50. logger.debug(f'Retrieving comments of user {user}')
  51. results.append(sort(self.reddit_instance.redditor(user).comments, limit=self.args.limit))
  52. return results
  53. @staticmethod
  54. def _pull_lever_entry_factory(praw_item: (praw.models.Submission, praw.models.Comment)) -> BaseArchiveEntry:
  55. if isinstance(praw_item, praw.models.Submission):
  56. return SubmissionArchiveEntry(praw_item)
  57. elif isinstance(praw_item, praw.models.Comment):
  58. return CommentArchiveEntry(praw_item)
  59. else:
  60. raise ArchiverError(f'Factory failed to classify item of type {type(praw_item).__name__}')
  61. def write_entry(self, praw_item: (praw.models.Submission, praw.models.Comment)):
  62. if self.args.comment_context and isinstance(praw_item, praw.models.Comment):
  63. logger.debug(f'Converting comment {praw_item.id} to submission {praw_item.submission.id}')
  64. praw_item = praw_item.submission
  65. archive_entry = self._pull_lever_entry_factory(praw_item)
  66. if self.args.format == 'json':
  67. self._write_entry_json(archive_entry)
  68. elif self.args.format == 'xml':
  69. self._write_entry_xml(archive_entry)
  70. elif self.args.format == 'yaml':
  71. self._write_entry_yaml(archive_entry)
  72. else:
  73. raise ArchiverError(f'Unknown format {self.args.format} given')
  74. logger.info(f'Record for entry item {praw_item.id} written to disk')
  75. def _write_entry_json(self, entry: BaseArchiveEntry):
  76. resource = Resource(entry.source, '', lambda: None, '.json')
  77. content = json.dumps(entry.compile())
  78. self._write_content_to_disk(resource, content)
  79. def _write_entry_xml(self, entry: BaseArchiveEntry):
  80. resource = Resource(entry.source, '', lambda: None, '.xml')
  81. content = dict2xml.dict2xml(entry.compile(), wrap='root')
  82. self._write_content_to_disk(resource, content)
  83. def _write_entry_yaml(self, entry: BaseArchiveEntry):
  84. resource = Resource(entry.source, '', lambda: None, '.yaml')
  85. content = yaml.dump(entry.compile())
  86. self._write_content_to_disk(resource, content)
  87. def _write_content_to_disk(self, resource: Resource, content: str):
  88. file_path = self.file_name_formatter.format_path(resource, self.download_directory)
  89. file_path.parent.mkdir(exist_ok=True, parents=True)
  90. with open(file_path, 'w', encoding="utf-8") as file:
  91. logger.debug(
  92. f'Writing entry {resource.source_submission.id} to file in {resource.extension[1:].upper()}'
  93. f' format at {file_path}')
  94. file.write(content)