'''
On January 29, 2018, reddit announced the death of the ?timestamp cloudsearch
parameter for submissions. RIP.
https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/dtfcdn0

This module interfaces with api.pushshift.io to restore this functionality.
It also provides new features previously impossible through reddit alone, such
as scanning all of a user's comments.
'''
import html
import requests
import time
import traceback

from . import common

from voussoirkit import ratelimiter
from voussoirkit import vlogging

log = vlogging.get_logger(__name__)
print('Thank you Jason Baumgartner of Pushshift.io!')

USERAGENT = 'Timesearch ({version}) ({contact})'
API_URL = 'https://api.pushshift.io/reddit/'

DEFAULT_PARAMS = {
    'size': 1000,
    'sort': 'asc',
    'sort_type': 'created_utc',
}
# Pushshift does not supply attributes that are null. So we fill them back in.
FALLBACK_ATTRIBUTES = {
    'distinguished': None,
    'edited': False,
    'link_flair_css_class': None,
    'link_flair_text': None,
    'score': 0,
    'selftext': '',
}
contact_info_message = '''
Please add a CONTACT_INFO string variable to your bot.py file.
This will be added to your pushshift useragent.
'''.strip()

if not getattr(common.bot, 'CONTACT_INFO', ''):
    raise ValueError(contact_info_message)

useragent = USERAGENT.format(version=common.VERSION, contact=common.bot.CONTACT_INFO)
ratelimit = None

session = requests.Session()
session.headers.update({'User-Agent': useragent})
class DummyObject:
    '''
    These classes are used to convert the JSON data we get from pushshift into
    objects so that the rest of timesearch can operate transparently.
    This requires a bit of whack-a-mole including:
    - Fleshing out the attributes which PS did not include because they were
      null (we use FALLBACK_ATTRIBUTES to replace them).
    - Providing the convenience methods and @properties that PRAW provides.
    - Mimicking the rich attributes like author and subreddit.
    '''
    def __init__(self, **attributes):
        for (key, val) in attributes.items():
            if key == 'author':
                val = DummyObject(name=val)
            elif key == 'subreddit':
                val = DummyObject(display_name=val)
            elif key in ['body', 'selftext']:
                val = html.unescape(val)
            setattr(self, key, val)

        for (key, val) in FALLBACK_ATTRIBUTES.items():
            if not hasattr(self, key):
                setattr(self, key, val)

# In rare cases, things sometimes don't have a subreddit.
# Promo posts seem to be one example.
FALLBACK_ATTRIBUTES['subreddit'] = DummyObject(display_name=None)
class DummySubmission(DummyObject):
    @property
    def fullname(self):
        return 't3_' + self.id

class DummyComment(DummyObject):
    @property
    def fullname(self):
        return 't1_' + self.id
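
# A minimal sketch of how these dummies behave. The field values below are
# hypothetical, not taken from a real Pushshift response:
#
#     >>> data = {'id': 'abc123', 'author': 'someuser', 'subreddit': 'learnpython',
#     ...         'body': 'ham &amp; eggs', 'created_utc': 1517202000}
#     >>> comment = DummyComment(**data)
#     >>> comment.fullname
#     't1_abc123'
#     >>> comment.author.name
#     'someuser'
#     >>> comment.body            # html-unescaped by __init__
#     'ham & eggs'
#     >>> comment.score           # filled in from FALLBACK_ATTRIBUTES
#     0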
def _normalize_subreddit(subreddit):
    if isinstance(subreddit, str):
        return subreddit
    else:
        return subreddit.display_name

def _normalize_user(user):
    if isinstance(user, str):
        return user
    else:
        return user.name
def _pagination_core(url, params, dummy_type, lower=None, upper=None):
    if upper is not None:
        params['before'] = upper
    if lower is not None:
        params['after'] = lower

    setify = lambda items: set(item['id'] for item in items)

    prev_batch_ids = set()
    while True:
        for retry in range(5):
            try:
                batch = get(url, params)
            except requests.exceptions.HTTPError as exc:
                last_exception = exc
                traceback.print_exc()
                print('Retrying in 5...')
                time.sleep(5)
            else:
                break
        else:
            # All retries failed. Raise instead of proceeding with a stale or
            # unbound `batch`.
            raise last_exception

        log.debug('Got batch of %d items.', len(batch))
        batch_ids = setify(batch)
        if len(batch_ids) == 0 or batch_ids.issubset(prev_batch_ids):
            break
        submissions = [dummy_type(**x) for x in batch if x['id'] not in prev_batch_ids]
        submissions.sort(key=lambda x: x.created_utc)
        # Subtract 1 from the latest timestamp to avoid the lightning-strike
        # chance that two posts share the same timestamp right at a page
        # boundary, since ?after=<latest> would cause us to miss the second
        # one. Items re-fetched because of this are filtered by prev_batch_ids.
        params['after'] = submissions[-1].created_utc - 1
        yield from submissions
        prev_batch_ids = batch_ids
        ratelimit.limit()
def _initialize_ratelimiter():
    global ratelimit
    if ratelimit is not None:
        return

    log.debug('Initializing pushshift ratelimiter.')
    url = 'https://api.pushshift.io/meta'
    response = session.get(url)
    response.raise_for_status()
    response = response.json()
    limit = response['server_ratelimit_per_minute']
    log.debug('Pushshift ratelimit is %d requests per minute.', limit)
    ratelimit = ratelimiter.Ratelimiter(allowance=limit, period=60)
def get(url, params=None):
    '''
    Make a single GET request to pushshift and return the `data` list from
    the JSON response. Relative urls are resolved against API_URL, and any
    missing parameters are filled in from DEFAULT_PARAMS.
    '''
    _initialize_ratelimiter()

    if not url.startswith('https://'):
        url = API_URL + url.lstrip('/')

    if params is None:
        params = {}

    for (key, val) in DEFAULT_PARAMS.items():
        params.setdefault(key, val)

    log.debug('Requesting %s with %s', url, params)
    ratelimit.limit()
    response = session.get(url, params=params)
    response.raise_for_status()
    response = response.json()
    data = response['data']
    return data
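
# Sketch of calling get() directly. The username and timestamp are
# hypothetical; 'comment/search/' and the 'author' / 'after' parameters are
# the same ones used by the wrappers below:
#
#     >>> batch = get('comment/search/', {'author': 'someuser', 'after': 1517184000})
#     >>> # `batch` is the list of raw dicts from pushshift's `data` field.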
def get_comments_from_submission(submission):
    if isinstance(submission, str):
        submission_id = common.t3_prefix(submission)[3:]
    else:
        submission_id = submission.id

    params = {'link_id': submission_id}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
    )
    yield from comments
def get_comments_from_subreddit(subreddit, **kwargs):
    subreddit = _normalize_subreddit(subreddit)
    params = {'subreddit': subreddit}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
        **kwargs
    )
    yield from comments

def get_comments_from_user(user, **kwargs):
    user = _normalize_user(user)
    params = {'author': user}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
        **kwargs
    )
    yield from comments

def get_submissions_from_subreddit(subreddit, **kwargs):
    subreddit = _normalize_subreddit(subreddit)
    params = {'subreddit': subreddit}
    submissions = _pagination_core(
        url='submission/search/',
        params=params,
        dummy_type=DummySubmission,
        **kwargs
    )
    yield from submissions

def get_submissions_from_user(user, **kwargs):
    user = _normalize_user(user)
    params = {'author': user}
    submissions = _pagination_core(
        url='submission/search/',
        params=params,
        dummy_type=DummySubmission,
        **kwargs
    )
    yield from submissions
def supplement_reddit_data(dummies, chunk_size=100):
    '''
    Given an iterable of the Dummy Pushshift objects, yield them back and also
    yield the live reddit objects they refer to according to reddit's
    /api/info. The live object will always come after the corresponding dummy
    object.

    By doing this, we enjoy the strengths of both data sources: Pushshift
    will give us deleted or removed objects that reddit would not, and reddit
    gives us up-to-date scores and text bodies.
    '''
    chunks = common.generator_chunker(dummies, chunk_size)
    for chunk in chunks:
        log.debug('Supplementing %d items with live reddit data.', len(chunk))
        ids = [item.fullname for item in chunk]
        live_copies = list(common.r.info(ids))
        live_copies = {item.fullname: item for item in live_copies}
        for item in chunk:
            yield item
            live_item = live_copies.get(item.fullname, None)
            if live_item:
                yield live_item
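
# A minimal usage sketch tying the public generators together. The subreddit
# name and timestamp are hypothetical, and supplement_reddit_data assumes
# that common.r (the PRAW reddit instance used elsewhere in timesearch) has
# already been set up:
#
#     >>> dummies = get_submissions_from_subreddit('learnpython', lower=1514764800)
#     >>> for item in supplement_reddit_data(dummies, chunk_size=100):
#     ...     print(item.fullname, item.created_utc)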