get_comments.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. import traceback
  2. from . import common
  3. from . import exceptions
  4. from . import pushshift
  5. from . import tsdb
  6. def get_comments(
  7. subreddit=None,
  8. username=None,
  9. specific_submission=None,
  10. do_supplement=True,
  11. lower=None,
  12. upper=None,
  13. ):
  14. if not specific_submission and not common.is_xor(subreddit, username):
  15. raise exceptions.NotExclusive(['subreddit', 'username'])
  16. if username and specific_submission:
  17. raise exceptions.NotExclusive(['username', 'specific_submission'])
  18. common.login()
  19. if specific_submission:
  20. (database, subreddit) = tsdb.TSDB.for_submission(specific_submission, do_create=True, fix_name=True)
  21. specific_submission = common.t3_prefix(specific_submission)[3:]
  22. specific_submission = common.r.submission(specific_submission)
  23. database.insert(specific_submission)
  24. elif subreddit:
  25. (database, subreddit) = tsdb.TSDB.for_subreddit(subreddit, do_create=True, fix_name=True)
  26. else:
  27. (database, username) = tsdb.TSDB.for_user(username, do_create=True, fix_name=True)
  28. cur = database.sql.cursor()
  29. if lower is None:
  30. lower = 0
  31. if lower == 'update':
  32. query_latest = 'SELECT created FROM comments ORDER BY created DESC LIMIT 1'
  33. if subreddit:
  34. # Instead of blindly taking the highest timestamp currently in the db,
  35. # we must consider the case that the user has previously done a
  36. # specific_submission scan and now wants to do a general scan, which
  37. # would trick the latest timestamp into missing anything before that
  38. # specific submission.
  39. query = '''
  40. SELECT created FROM comments WHERE NOT EXISTS (
  41. SELECT 1 FROM submissions
  42. WHERE submissions.idstr == comments.submission
  43. AND submissions.augmented_at IS NOT NULL
  44. )
  45. ORDER BY created DESC LIMIT 1
  46. '''
  47. unaugmented = cur.execute(query).fetchone()
  48. if unaugmented:
  49. lower = unaugmented[0] - 1
  50. else:
  51. latest = cur.execute(query_latest).fetchone()
  52. if latest:
  53. lower = latest[0] - 1
  54. if username:
  55. latest = cur.execute(query_latest).fetchone()
  56. if latest:
  57. lower = latest[0] - 1
  58. if lower == 'update':
  59. lower = 0
  60. if specific_submission:
  61. comments = pushshift.get_comments_from_submission(specific_submission)
  62. elif subreddit:
  63. comments = pushshift.get_comments_from_subreddit(subreddit, lower=lower, upper=upper)
  64. elif username:
  65. comments = pushshift.get_comments_from_user(username, lower=lower, upper=upper)
  66. if do_supplement:
  67. comments = pushshift.supplement_reddit_data(comments, chunk_size=100)
  68. comments = common.generator_chunker(comments, 500)
  69. form = '{lower} ({lower_unix}) - {upper} ({upper_unix}) +{gain}'
  70. for chunk in comments:
  71. step = database.insert(chunk)
  72. message = form.format(
  73. lower=common.human(chunk[0].created_utc),
  74. upper=common.human(chunk[-1].created_utc),
  75. lower_unix=int(chunk[0].created_utc),
  76. upper_unix=int(chunk[-1].created_utc),
  77. gain=step['new_comments'],
  78. )
  79. print(message)
  80. if specific_submission:
  81. query = '''
  82. UPDATE submissions
  83. set augmented_at = ?
  84. WHERE idstr == ?
  85. '''
  86. bindings = [common.get_now(), specific_submission.fullname]
  87. cur.execute(query, bindings)
  88. database.sql.commit()
  89. def get_comments_argparse(args):
  90. return get_comments(
  91. subreddit=args.subreddit,
  92. username=args.username,
  93. #limit=common.int_none(args.limit),
  94. #threshold=common.int_none(args.threshold),
  95. #num_thresh=common.int_none(args.num_thresh),
  96. specific_submission=args.specific_submission,
  97. do_supplement=args.do_supplement,
  98. lower=args.lower,
  99. upper=args.upper,
  100. )