pushshift.py

'''
On January 29, 2018, reddit announced the death of the ?timestamp cloudsearch
parameter for submissions. RIP.
https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/dtfcdn0

This module interfaces with api.pushshift.io to restore this functionality.
It also provides new features previously impossible through reddit alone, such
as scanning all of a user's comments.
'''
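# Rough usage sketch (illustrative only; it assumes this module is imported as
# part of the timesearch package and that bot.py defines CONTACT_INFO, which
# the check below requires):
#
#     from . import pushshift
#     for comment in pushshift.get_comments_from_user('spez'):
#         print(comment.fullname, comment.created_utc, comment.body[:60])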
import html
import requests
import time
import traceback

from . import common

from voussoirkit import ratelimiter
from voussoirkit import vlogging

log = vlogging.get_logger(__name__)

print('Thank you Jason Baumgartner of Pushshift.io!')

USERAGENT = 'Timesearch ({version}) ({contact})'
API_URL = 'https://api.pushshift.io/reddit/'

DEFAULT_PARAMS = {
    'size': 1000,
    'sort': 'asc',
    'sort_type': 'created_utc',
}
# Pushshift does not supply attributes that are null. So we fill them back in.
FALLBACK_ATTRIBUTES = {
    'distinguished': None,
    'edited': False,
    'link_flair_css_class': None,
    'link_flair_text': None,
    'score': 0,
    'selftext': '',
}
contact_info_message = '''
Please add a CONTACT_INFO string variable to your bot.py file.
This will be added to your pushshift useragent.
'''.strip()
if not getattr(common.bot, 'CONTACT_INFO', ''):
    raise ValueError(contact_info_message)

useragent = USERAGENT.format(version=common.VERSION, contact=common.bot.CONTACT_INFO)
ratelimit = None
session = requests.Session()
session.headers.update({'User-Agent': useragent})
class DummyObject:
    '''
    These classes are used to convert the JSON data we get from pushshift into
    objects so that the rest of timesearch can operate transparently.

    This requires a bit of whack-a-mole including:
    - Fleshing out the attributes which PS did not include because they were
      null (we use FALLBACK_ATTRIBUTES to replace them).
    - Providing the convenience methods and @properties that PRAW provides.
    - Mimicking the rich attributes like author and subreddit.
    '''
    def __init__(self, **attributes):
        for (key, val) in attributes.items():
            if key == 'author':
                val = DummyObject(name=val)
            elif key == 'subreddit':
                val = DummyObject(display_name=val)
            elif key in ['body', 'selftext']:
                val = html.unescape(val)

            setattr(self, key, val)

        for (key, val) in FALLBACK_ATTRIBUTES.items():
            if not hasattr(self, key):
                setattr(self, key, val)

# In rare cases, things sometimes don't have a subreddit.
# Promo posts seem to be one example.
FALLBACK_ATTRIBUTES['subreddit'] = DummyObject(display_name=None)
class DummySubmission(DummyObject):
    @property
    def fullname(self):
        return 't3_' + self.id

class DummyComment(DummyObject):
    @property
    def fullname(self):
        return 't1_' + self.id
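# Illustration of the conversion (hypothetical values, not real pushshift
# output):
#
#     comment = DummyComment(id='abc123', author='spez', subreddit='books', body='&amp; more')
#     comment.author.name             # 'spez'
#     comment.subreddit.display_name  # 'books'
#     comment.body                    # '& more' (HTML entities unescaped)
#     comment.score                   # 0, filled in from FALLBACK_ATTRIBUTES
#     comment.fullname                # 't1_abc123'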
def _normalize_subreddit(subreddit):
    if isinstance(subreddit, str):
        return subreddit
    else:
        return subreddit.display_name

def _normalize_user(user):
    if isinstance(user, str):
        return user
    else:
        return user.name
def _pagination_core(url, params, dummy_type, lower=None, upper=None):
    if upper is not None:
        params['before'] = upper
    if lower is not None:
        params['after'] = lower

    setify = lambda items: set(item['id'] for item in items)

    prev_batch_ids = set()

    while True:
        for retry in range(5):
            try:
                batch = get(url, params)
            except requests.exceptions.HTTPError as exc:
                traceback.print_exc()
                print('Retrying in 5...')
                time.sleep(5)
            else:
                break

        log.debug('Got batch of %d items.', len(batch))

        batch_ids = setify(batch)
        if len(batch_ids) == 0 or batch_ids.issubset(prev_batch_ids):
            break

        submissions = [dummy_type(**x) for x in batch if x['id'] not in prev_batch_ids]
        submissions.sort(key=lambda x: x.created_utc)

        # Take the latest-1 to avoid the lightning strike chance that two posts
        # have the same timestamp and this occurs at a page boundary.
        # Since ?after=latest would cause us to miss that second one.
        params['after'] = submissions[-1].created_utc - 1
        yield from submissions

        prev_batch_ids = batch_ids
        ratelimit.limit()
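# Sketch of driving the paginator directly (parameter names are the ones in the
# signature above; the subreddit and timestamps are illustrative):
#
#     for submission in _pagination_core(
#         url='submission/search/',
#         params={'subreddit': 'learnpython'},
#         dummy_type=DummySubmission,
#         lower=1577836800,   # after  2020-01-01 00:00:00 UTC
#         upper=1580515200,   # before 2020-02-01 00:00:00 UTC
#     ):
#         print(submission.fullname, submission.created_utc)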
def _initialize_ratelimiter():
    global ratelimit
    if ratelimit is not None:
        return

    log.debug('Initializing pushshift ratelimiter.')
    url = 'https://api.pushshift.io/meta'
    response = session.get(url)
    response.raise_for_status()
    response = response.json()
    limit = response['server_ratelimit_per_minute']
    log.debug('Pushshift ratelimit is %d requests per minute.', limit)
    ratelimit = ratelimiter.Ratelimiter(allowance=limit, period=60)
def get(url, params=None):
    _initialize_ratelimiter()

    if not url.startswith('https://'):
        url = API_URL + url.lstrip('/')

    if params is None:
        params = {}

    for (key, val) in DEFAULT_PARAMS.items():
        params.setdefault(key, val)

    log.debug('Requesting %s with %s', url, params)
    ratelimit.limit()
    response = session.get(url, params=params)
    response.raise_for_status()
    response = response.json()
    data = response['data']
    return data
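# For example (illustrative author; 'comment/search/' is the endpoint already
# used by the helpers below):
#
#     data = get('comment/search/', {'author': 'spez'})
#     # data is the list under the response's 'data' key, one dict per comment.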
def get_comments_from_submission(submission):
    if isinstance(submission, str):
        submission_id = common.t3_prefix(submission)[3:]
    else:
        submission_id = submission.id

    params = {'link_id': submission_id}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
    )
    yield from comments
def get_comments_from_subreddit(subreddit, **kwargs):
    subreddit = _normalize_subreddit(subreddit)
    params = {'subreddit': subreddit}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
        **kwargs
    )
    yield from comments

def get_comments_from_user(user, **kwargs):
    user = _normalize_user(user)
    params = {'author': user}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
        **kwargs
    )
    yield from comments

def get_submissions_from_subreddit(subreddit, **kwargs):
    subreddit = _normalize_subreddit(subreddit)
    params = {'subreddit': subreddit}
    submissions = _pagination_core(
        url='submission/search/',
        params=params,
        dummy_type=DummySubmission,
        **kwargs
    )
    yield from submissions

def get_submissions_from_user(user, **kwargs):
    user = _normalize_user(user)
    params = {'author': user}
    submissions = _pagination_core(
        url='submission/search/',
        params=params,
        dummy_type=DummySubmission,
        **kwargs
    )
    yield from submissions
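# The subreddit/user helpers forward their keyword arguments to
# _pagination_core, so lower/upper timestamps can be passed straight through
# (illustrative values):
#
#     for submission in get_submissions_from_subreddit('learnpython', lower=1577836800):
#         print(submission.fullname, submission.created_utc)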
def supplement_reddit_data(dummies, chunk_size=100):
    '''
    Given an iterable of the Dummy Pushshift objects, yield them back and also
    yield the live Reddit objects they refer to according to reddit's /api/info.
    The live object will always come after the corresponding dummy object.

    By doing this, we enjoy the strengths of both data sources: Pushshift
    will give us deleted or removed objects that reddit would not, and reddit
    gives us up-to-date scores and text bodies.
    '''
    chunks = common.generator_chunker(dummies, chunk_size)
    for chunk in chunks:
        log.debug('Supplementing %d items with live reddit data.', len(chunk))
        ids = [item.fullname for item in chunk]
        live_copies = list(common.r.info(ids))
        live_copies = {item.fullname: item for item in live_copies}
        for item in chunk:
            yield item
            live_item = live_copies.get(item.fullname, None)
            if live_item:
                yield live_item
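# Usage sketch (assumes common.r is the praw.Reddit instance used elsewhere in
# timesearch; the subreddit name is illustrative):
#
#     dummies = get_submissions_from_subreddit('learnpython')
#     for item in supplement_reddit_data(dummies, chunk_size=100):
#         # Each pushshift dummy arrives first, followed (when reddit still has
#         # the object) by the live praw object with the same fullname.
#         print(item.fullname, item.score)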