tsdb.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. import os
  2. import sqlite3
  3. import time
  4. import types
  5. from . import common
  6. from . import exceptions
  7. from . import pushshift
  8. from voussoirkit import pathclass
  9. from voussoirkit import sqlhelpers
  10. from voussoirkit import vlogging
  11. log = vlogging.get_logger(__name__)
  12. # For backwards compatibility reasons, this list of format strings will help
  13. # timesearch find databases that are using the old filename style.
  14. # The final element will be used if none of the previous ones were found.
  15. DB_FORMATS_SUBREDDIT = [
  16. '.\\{name}.db',
  17. '.\\subreddits\\{name}\\{name}.db',
  18. '.\\{name}\\{name}.db',
  19. '.\\databases\\{name}.db',
  20. '.\\subreddits\\{name}\\{name}.db',
  21. ]
  22. DB_FORMATS_USER = [
  23. '.\\@{name}.db',
  24. '.\\users\\@{name}\\@{name}.db',
  25. '.\\@{name}\\@{name}.db',
  26. '.\\databases\\@{name}.db',
  27. '.\\users\\@{name}\\@{name}.db',
  28. ]
  29. DATABASE_VERSION = 2
  30. DB_VERSION_PRAGMA = f'''
  31. PRAGMA user_version = {DATABASE_VERSION};
  32. '''
  33. DB_PRAGMAS = f'''
  34. '''
  35. DB_INIT = f'''
  36. {DB_PRAGMAS}
  37. {DB_VERSION_PRAGMA}
  38. ----------------------------------------------------------------------------------------------------
  39. CREATE TABLE IF NOT EXISTS config(
  40. key TEXT,
  41. value TEXT
  42. );
  43. ----------------------------------------------------------------------------------------------------
  44. CREATE TABLE IF NOT EXISTS submissions(
  45. idint INT,
  46. idstr TEXT,
  47. created INT,
  48. self INT,
  49. nsfw INT,
  50. author TEXT,
  51. title TEXT,
  52. url TEXT,
  53. selftext TEXT,
  54. score INT,
  55. subreddit TEXT,
  56. distinguish INT,
  57. textlen INT,
  58. num_comments INT,
  59. flair_text TEXT,
  60. flair_css_class TEXT,
  61. augmented_at INT,
  62. augmented_count INT
  63. );
  64. CREATE INDEX IF NOT EXISTS submission_index ON submissions(idstr);
  65. ----------------------------------------------------------------------------------------------------
  66. CREATE TABLE IF NOT EXISTS comments(
  67. idint INT,
  68. idstr TEXT,
  69. created INT,
  70. author TEXT,
  71. parent TEXT,
  72. submission TEXT,
  73. body TEXT,
  74. score INT,
  75. subreddit TEXT,
  76. distinguish TEXT,
  77. textlen INT
  78. );
  79. CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr);
  80. ----------------------------------------------------------------------------------------------------
  81. CREATE TABLE IF NOT EXISTS submission_edits(
  82. idstr TEXT,
  83. previous_selftext TEXT,
  84. replaced_at INT
  85. );
  86. CREATE INDEX IF NOT EXISTS submission_edits_index ON submission_edits(idstr);
  87. ----------------------------------------------------------------------------------------------------
  88. CREATE TABLE IF NOT EXISTS comment_edits(
  89. idstr TEXT,
  90. previous_body TEXT,
  91. replaced_at INT
  92. );
  93. CREATE INDEX IF NOT EXISTS comment_edits_index ON comment_edits(idstr);
  94. '''
  95. DEFAULT_CONFIG = {
  96. 'store_edits': True,
  97. }
  98. SQL_SUBMISSION_COLUMNS = [
  99. 'idint',
  100. 'idstr',
  101. 'created',
  102. 'self',
  103. 'nsfw',
  104. 'author',
  105. 'title',
  106. 'url',
  107. 'selftext',
  108. 'score',
  109. 'subreddit',
  110. 'distinguish',
  111. 'textlen',
  112. 'num_comments',
  113. 'flair_text',
  114. 'flair_css_class',
  115. 'augmented_at',
  116. 'augmented_count',
  117. ]
  118. SQL_COMMENT_COLUMNS = [
  119. 'idint',
  120. 'idstr',
  121. 'created',
  122. 'author',
  123. 'parent',
  124. 'submission',
  125. 'body',
  126. 'score',
  127. 'subreddit',
  128. 'distinguish',
  129. 'textlen',
  130. ]
  131. SQL_EDITS_COLUMNS = [
  132. 'idstr',
  133. 'text',
  134. 'replaced_at',
  135. ]
  136. SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)}
  137. SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)}
  138. SUBMISSION_TYPES = (common.praw.models.Submission, pushshift.DummySubmission)
  139. COMMENT_TYPES = (common.praw.models.Comment, pushshift.DummyComment)
  140. class DBEntry:
  141. '''
  142. This class converts a tuple row from the database into an object so that
  143. you can access the attributes with dot notation.
  144. '''
  145. def __init__(self, dbrow):
  146. if dbrow[1].startswith('t3_'):
  147. columns = SQL_SUBMISSION_COLUMNS
  148. self.object_type = 'submission'
  149. else:
  150. columns = SQL_COMMENT_COLUMNS
  151. self.object_type = 'comment'
  152. self.id = None
  153. self.idstr = None
  154. for (index, attribute) in enumerate(columns):
  155. setattr(self, attribute, dbrow[index])
  156. def __repr__(self):
  157. return 'DBEntry(\'%s\')' % self.id
  158. class TSDB:
  159. def __init__(self, filepath, *, do_create=True, skip_version_check=False):
  160. self.filepath = pathclass.Path(filepath)
  161. if not self.filepath.is_file:
  162. if not do_create:
  163. raise exceptions.DatabaseNotFound(self.filepath)
  164. print('New database', self.filepath.relative_path)
  165. self.filepath.parent.makedirs(exist_ok=True)
  166. self.breakdown_dir = self.filepath.parent.with_child('breakdown')
  167. self.offline_reading_dir = self.filepath.parent.with_child('offline_reading')
  168. self.index_dir = self.filepath.parent.with_child('index')
  169. self.styles_dir = self.filepath.parent.with_child('styles')
  170. self.wiki_dir = self.filepath.parent.with_child('wiki')
  171. existing_database = self.filepath.exists
  172. self.sql = sqlite3.connect(self.filepath.absolute_path)
  173. self.cur = self.sql.cursor()
  174. if existing_database:
  175. if not skip_version_check:
  176. self._check_version()
  177. self._load_pragmas()
  178. else:
  179. self._first_time_setup()
  180. self.config = {}
  181. for (key, default_value) in DEFAULT_CONFIG.items():
  182. self.cur.execute('SELECT value FROM config WHERE key == ?', [key])
  183. existing_value = self.cur.fetchone()
  184. if existing_value is None:
  185. self.cur.execute('INSERT INTO config VALUES(?, ?)', [key, default_value])
  186. self.config[key] = default_value
  187. else:
  188. existing_value = existing_value[0]
  189. if isinstance(default_value, int):
  190. existing_value = int(existing_value)
  191. self.config[key] = existing_value
  192. def _check_version(self):
  193. '''
  194. Compare database's user_version against DATABASE_VERSION,
  195. raising exceptions.DatabaseOutOfDate if not correct.
  196. '''
  197. existing = self.cur.execute('PRAGMA user_version').fetchone()[0]
  198. if existing != DATABASE_VERSION:
  199. raise exceptions.DatabaseOutOfDate(
  200. current=existing,
  201. new=DATABASE_VERSION,
  202. filepath=self.filepath,
  203. )
  204. def _first_time_setup(self):
  205. self.sql.executescript(DB_INIT)
  206. self.sql.commit()
  207. def _load_pragmas(self):
  208. self.sql.executescript(DB_PRAGMAS)
  209. self.sql.commit()
  210. def __repr__(self):
  211. return 'TSDB(%s)' % self.filepath
  212. @staticmethod
  213. def _pick_filepath(formats, name):
  214. '''
  215. Starting with the most specific and preferred filename format, check
  216. if there is an existing database that matches the name we're looking
  217. for, and return that path. If none of them exist, then use the most
  218. preferred filepath.
  219. '''
  220. for form in formats:
  221. path = form.format(name=name)
  222. if os.path.isfile(path):
  223. break
  224. return pathclass.Path(path)
  225. @classmethod
  226. def _for_object_helper(cls, name, path_formats, do_create=True, fix_name=False):
  227. if name != os.path.basename(name):
  228. filepath = pathclass.Path(name)
  229. else:
  230. filepath = cls._pick_filepath(formats=path_formats, name=name)
  231. database = cls(filepath=filepath, do_create=do_create)
  232. if fix_name:
  233. return (database, name_from_path(name))
  234. return database
  235. @classmethod
  236. def for_submission(cls, submission_id, fix_name=False, *args, **kwargs):
  237. subreddit = common.subreddit_for_submission(submission_id)
  238. database = cls.for_subreddit(subreddit, *args, **kwargs)
  239. if fix_name:
  240. return (database, subreddit.display_name)
  241. return database
  242. @classmethod
  243. def for_subreddit(cls, name, do_create=True, fix_name=False):
  244. if isinstance(name, common.praw.models.Subreddit):
  245. name = name.display_name
  246. elif not isinstance(name, str):
  247. raise TypeError(name, 'should be str or Subreddit.')
  248. return cls._for_object_helper(
  249. name,
  250. do_create=do_create,
  251. fix_name=fix_name,
  252. path_formats=DB_FORMATS_SUBREDDIT,
  253. )
  254. @classmethod
  255. def for_user(cls, name, do_create=True, fix_name=False):
  256. if isinstance(name, common.praw.models.Redditor):
  257. name = name.name
  258. elif not isinstance(name, str):
  259. raise TypeError(name, 'should be str or Redditor.')
  260. return cls._for_object_helper(
  261. name,
  262. do_create=do_create,
  263. fix_name=fix_name,
  264. path_formats=DB_FORMATS_USER,
  265. )
  266. def check_for_edits(self, obj, existing_entry):
  267. '''
  268. If the item's current text doesn't match the stored text, decide what
  269. to do.
  270. Firstly, make sure to ignore deleted comments.
  271. Then, if the database is configured to store edited text, do so.
  272. Finally, return the body that we want to store in the main table.
  273. '''
  274. if isinstance(obj, SUBMISSION_TYPES):
  275. existing_body = existing_entry[SQL_SUBMISSION['selftext']]
  276. body = obj.selftext
  277. else:
  278. existing_body = existing_entry[SQL_COMMENT['body']]
  279. body = obj.body
  280. if body != existing_body:
  281. if should_keep_existing_text(obj):
  282. body = existing_body
  283. elif self.config['store_edits']:
  284. self.insert_edited(obj, old_text=existing_body)
  285. return body
  286. def insert(self, objects, commit=True):
  287. if not isinstance(objects, (list, tuple, types.GeneratorType)):
  288. objects = [objects]
  289. log.debug('Trying to insert %d objects.', len(objects))
  290. new_values = {
  291. 'tsdb': self,
  292. 'new_submissions': 0,
  293. 'new_comments': 0,
  294. }
  295. methods = {
  296. common.praw.models.Submission: (self.insert_submission, 'new_submissions'),
  297. common.praw.models.Comment: (self.insert_comment, 'new_comments'),
  298. }
  299. methods[pushshift.DummySubmission] = methods[common.praw.models.Submission]
  300. methods[pushshift.DummyComment] = methods[common.praw.models.Comment]
  301. for obj in objects:
  302. (method, key) = methods.get(type(obj), (None, None))
  303. if method is None:
  304. raise TypeError('Unsupported', type(obj), obj)
  305. status = method(obj)
  306. new_values[key] += status
  307. if commit:
  308. log.debug('Committing insert.')
  309. self.sql.commit()
  310. log.debug('Done inserting.')
  311. return new_values
  312. def insert_edited(self, obj, old_text):
  313. '''
  314. Having already detected that the item has been edited, add a record to
  315. the appropriate *_edits table containing the text that is being
  316. replaced.
  317. '''
  318. if isinstance(obj, SUBMISSION_TYPES):
  319. table = 'submission_edits'
  320. key = 'previous_selftext'
  321. else:
  322. table = 'comment_edits'
  323. key = 'previous_body'
  324. if obj.edited is False:
  325. replaced_at = int(time.time())
  326. else:
  327. replaced_at = int(obj.edited)
  328. postdata = {
  329. 'idstr': obj.fullname,
  330. key: old_text,
  331. 'replaced_at': replaced_at,
  332. }
  333. cur = self.sql.cursor()
  334. (qmarks, bindings) = sqlhelpers.insert_filler(postdata)
  335. query = f'INSERT INTO {table} {qmarks}'
  336. cur.execute(query, bindings)
  337. def insert_submission(self, submission):
  338. cur = self.sql.cursor()
  339. cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname])
  340. existing_entry = cur.fetchone()
  341. if submission.author is None:
  342. author = '[DELETED]'
  343. else:
  344. author = submission.author.name
  345. if not existing_entry:
  346. if submission.is_self:
  347. # Selfpost's URL leads back to itself, so just ignore it.
  348. url = None
  349. elif hasattr(submission, 'crosspost_parent') and getattr(submission, 'crosspost_parent_list'):
  350. url = submission.crosspost_parent_list[0]['permalink']
  351. else:
  352. url = getattr(submission, 'url', None)
  353. if url and url.startswith('/r/'):
  354. url = 'https://reddit.com' + url
  355. postdata = {
  356. 'idint': common.b36(submission.id),
  357. 'idstr': submission.fullname,
  358. 'created': submission.created_utc,
  359. 'self': submission.is_self,
  360. 'nsfw': submission.over_18,
  361. 'author': author,
  362. 'title': submission.title,
  363. 'url': url,
  364. 'selftext': submission.selftext,
  365. 'score': submission.score,
  366. 'subreddit': submission.subreddit.display_name,
  367. 'distinguish': submission.distinguished,
  368. 'textlen': len(submission.selftext),
  369. 'num_comments': submission.num_comments,
  370. 'flair_text': submission.link_flair_text,
  371. 'flair_css_class': submission.link_flair_css_class,
  372. 'augmented_at': None,
  373. 'augmented_count': None,
  374. }
  375. (qmarks, bindings) = sqlhelpers.insert_filler(postdata)
  376. query = f'INSERT INTO submissions {qmarks}'
  377. cur.execute(query, bindings)
  378. else:
  379. selftext = self.check_for_edits(submission, existing_entry=existing_entry)
  380. query = '''
  381. UPDATE submissions SET
  382. nsfw = coalesce(?, nsfw),
  383. score = coalesce(?, score),
  384. selftext = coalesce(?, selftext),
  385. distinguish = coalesce(?, distinguish),
  386. num_comments = coalesce(?, num_comments),
  387. flair_text = coalesce(?, flair_text),
  388. flair_css_class = coalesce(?, flair_css_class)
  389. WHERE idstr == ?
  390. '''
  391. bindings = [
  392. submission.over_18,
  393. submission.score,
  394. selftext,
  395. submission.distinguished,
  396. submission.num_comments,
  397. submission.link_flair_text,
  398. submission.link_flair_css_class,
  399. submission.fullname
  400. ]
  401. cur.execute(query, bindings)
  402. return existing_entry is None
  403. def insert_comment(self, comment):
  404. cur = self.sql.cursor()
  405. cur.execute('SELECT * FROM comments WHERE idstr == ?', [comment.fullname])
  406. existing_entry = cur.fetchone()
  407. if comment.author is None:
  408. author = '[DELETED]'
  409. else:
  410. author = comment.author.name
  411. if not existing_entry:
  412. postdata = {
  413. 'idint': common.b36(comment.id),
  414. 'idstr': comment.fullname,
  415. 'created': comment.created_utc,
  416. 'author': author,
  417. 'parent': comment.parent_id,
  418. 'submission': comment.link_id,
  419. 'body': comment.body,
  420. 'score': comment.score,
  421. 'subreddit': comment.subreddit.display_name,
  422. 'distinguish': comment.distinguished,
  423. 'textlen': len(comment.body),
  424. }
  425. (qmarks, bindings) = sqlhelpers.insert_filler(postdata)
  426. query = f'INSERT INTO comments {qmarks}'
  427. cur.execute(query, bindings)
  428. else:
  429. body = self.check_for_edits(comment, existing_entry=existing_entry)
  430. query = '''
  431. UPDATE comments SET
  432. score = coalesce(?, score),
  433. body = coalesce(?, body),
  434. distinguish = coalesce(?, distinguish)
  435. WHERE idstr == ?
  436. '''
  437. bindings = [
  438. comment.score,
  439. body,
  440. comment.distinguished,
  441. comment.fullname
  442. ]
  443. cur.execute(query, bindings)
  444. return existing_entry is None
  445. def name_from_path(filepath):
  446. '''
  447. In order to support usage like
  448. > timesearch livestream -r D:\\some\\other\\filepath\\learnpython.db
  449. this function extracts the subreddit name / username based on the given
  450. path, so that we can pass it into `r.subreddit` / `r.redditor` properly.
  451. '''
  452. if isinstance(filepath, pathclass.Path):
  453. filepath = filepath.basename
  454. else:
  455. filepath = os.path.basename(filepath)
  456. name = os.path.splitext(filepath)[0]
  457. name = name.strip('@')
  458. return name
  459. def should_keep_existing_text(obj):
  460. '''
  461. Under certain conditions we do not want to update the entry in the db
  462. with the most recent copy of the text. For example, if the post has
  463. been deleted and the text now shows '[deleted]' we would prefer to
  464. keep whatever we already have.
  465. This function puts away the work I would otherwise have to duplicate
  466. for both submissions and comments.
  467. '''
  468. body = obj.selftext if isinstance(obj, SUBMISSION_TYPES) else obj.body
  469. if obj.author is None and body in ['[removed]', '[deleted]']:
  470. return True
  471. greasy = ['has been overwritten', 'pastebin.com/64GuVi2F']
  472. if any(grease in body for grease in greasy):
  473. return True
  474. return False