write_html.py

#!/usr/bin/env python
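"""Render a static HTML archive from the CSV data fetched by fetch_links.py.

Reads data/<subreddit>/<YYYY>/<MM>/<DD>/links.csv (plus one <link id>.csv of
comments per link) and writes a browsable site under r/. A sketch of the
output layout, inferred from the path-building code below:

    r/index.html                      subreddit index
    r/<subreddit>/index.html          links by score (the default sort)
    r/<subreddit>/index-comments/     links by comment count
    r/<subreddit>/index-date/         links by date
    r/<subreddit>/search.html         title search page
    r/<subreddit>/comments/...        one page per link
    r/user/<author>.html              one page per author
"""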
from datetime import datetime, date, timedelta
import argparse
import csv
import os
import re
import snudown
import psutil

url_project = 'https://github.com/libertysoft3/reddit-html-archiver'
links_per_page = 30
pager_skip = 10
pager_skip_long = 100
start_date = date(2005, 1, 1)
end_date = datetime.today().date() + timedelta(days=1)
source_data_links = 'links.csv'
max_comment_depth = 8  # mostly for mobile, which might be silly
removed_content_identifiers = ['[deleted]', 'deleted', '[removed]', 'removed']
default_sort = 'score'
sort_indexes = {
    'score': {
        'default': 1,
        'slug': 'score'
    },
    'num_comments': {
        'default': 0,
        'slug': 'comments',
    },
    'created_utc': {
        'default': 1000198000,
        'slug': 'date',
    }
}
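# each sort gets its own index: the default sort ('score') is written to
# r/<subreddit>/index.html, the others to r/<subreddit>/index-<slug>/index.html,
# i.e. index-comments/ and index-date/ (see write_subreddit_pages below)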
missing_comment_score_label = 'n/a'

template_index = ''
with open('templates/index.html', 'r', encoding='utf-8') as file:
    template_index = file.read()
template_subreddit = ''
with open('templates/subreddit.html', 'r', encoding='utf-8') as file:
    template_subreddit = file.read()
template_link = ''
with open('templates/link.html', 'r', encoding='utf-8') as file:
    template_link = file.read()
template_comment = ''
with open('templates/partial_comment.html', 'r', encoding='utf-8') as file:
    template_comment = file.read()
template_search = ''
with open('templates/search.html', 'r', encoding='utf-8') as file:
    template_search = file.read()
template_user = ''
with open('templates/user.html', 'r', encoding='utf-8') as file:
    template_user = file.read()
template_sub_link = ''
with open('templates/partial_menu_item.html', 'r', encoding='utf-8') as file:
    template_sub_link = file.read()
template_user_url = ''
with open('templates/partial_user.html', 'r', encoding='utf-8') as file:
    template_user_url = file.read()
template_link_url = ''
with open('templates/partial_link.html', 'r', encoding='utf-8') as file:
    template_link_url = file.read()
template_search_link = ''
with open('templates/partial_search_link.html', 'r', encoding='utf-8') as file:
    template_search_link = file.read()
template_index_sub = ''
with open('templates/partial_index_subreddit.html', 'r', encoding='utf-8') as file:
    template_index_sub = file.read()
template_index_pager_link = ''
with open('templates/partial_subreddit_pager_link.html', 'r', encoding='utf-8') as file:
    template_index_pager_link = file.read()
template_selftext = ''
with open('templates/partial_link_selftext.html', 'r', encoding='utf-8') as file:
    template_selftext = file.read()
template_user_page_link = ''
with open('templates/partial_user_link.html', 'r', encoding='utf-8') as file:
    template_user_page_link = file.read()
template_url = ''
with open('templates/partial_url.html', 'r', encoding='utf-8') as file:
    template_url = file.read()

process = psutil.Process(os.getpid())

def generate_html(min_score=0, min_comments=0, hide_deleted_comments=False):
    delta = timedelta(days=1)
    subs = get_subs()
    user_index = {}
    processed_subs = []
    stat_links = 0
    stat_filtered_links = 0
    for sub in subs:
        # write link pages
        # print('generate_html() processing %s %s kb' % (sub, int(int(process.memory_info().rss) / 1024)))
        stat_sub_links = 0
        stat_sub_filtered_links = 0
        stat_sub_comments = 0
        d = start_date
        while d <= end_date:
            raw_links = load_links(d, sub, True)
            stat_links += len(raw_links)
            stat_sub_links += len(raw_links)
            for l in raw_links:
                if validate_link(l, min_score, min_comments):
                    write_link_page(subs, l, sub, hide_deleted_comments)
                    stat_filtered_links += 1
                    stat_sub_filtered_links += 1
                    if 'comments' in l:
                        stat_sub_comments += len(l['comments'])
            d += delta
        if stat_sub_filtered_links > 0:
            processed_subs.append({'name': sub, 'num_links': stat_sub_filtered_links})
        print('%s: %s links filtered to %s' % (sub, stat_sub_links, stat_sub_filtered_links))
        # write subreddit pages
        valid_sub_links = []
        d = start_date
        while d <= end_date:
            raw_links = load_links(d, sub)
            for l in raw_links:
                if validate_link(l, min_score, min_comments):
                    valid_sub_links.append(l)
                    # collect links for user pages
                    # TODO: this is the least performant bit. load and generate user pages user by user instead.
                    l['subreddit'] = sub
                    if l['author'] not in user_index.keys():
                        user_index[l['author']] = []
                    user_index[l['author']].append(l)
            d += delta
        write_subreddit_pages(sub, subs, valid_sub_links, stat_sub_filtered_links, stat_sub_comments)
        write_subreddit_search_page(sub, subs, valid_sub_links, stat_sub_filtered_links, stat_sub_comments)
    # write user pages
    write_user_page(processed_subs, user_index)
    # write index page
    write_index(processed_subs)
    print('all done. %s links filtered to %s' % (stat_links, stat_filtered_links))

def write_subreddit_pages(subreddit, subs, link_index, stat_sub_filtered_links, stat_sub_comments):
    if len(link_index) == 0:
        return True
    for sort in sort_indexes.keys():
        links = sorted(link_index, key=lambda k: (int(k[sort]) if k[sort] != '' else sort_indexes[sort]['default']), reverse=True)
        pages = list(chunks(links, links_per_page))
        page_num = 0
        sort_based_prefix = '../'
        if sort == default_sort:
            sort_based_prefix = ''
        # render subreddits list
        subs_menu_html = ''
        for sub in subs:
            sub_url = sort_based_prefix + '../' + sub + '/index.html'
            subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
        for page in pages:
            page_num += 1
            # print('%s page' % (page))
            links_html = ''
            for l in page:
                author_link_html = template_user_url
                author_url = sort_based_prefix + '../user/' + l['author'] + '.html'
                author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
                link_url = l['url']
                link_comments_url = sort_based_prefix + l['permalink'].lower().strip('/')
                link_comments_url = link_comments_url.replace('r/' + subreddit + '/', '')
                idpath = '/'.join(list(l['id']))
                link_comments_url = link_comments_url.replace(l['id'], idpath)
                link_comments_url += '.html'
                if l['is_self'] is True or l['is_self'] == 'True':
                    link_url = link_comments_url
                index_link_data_map = {
                    '###TITLE###': l['title'],
                    '###URL###': link_url,
                    '###URL_COMMENTS###': link_comments_url,
                    '###SCORE###': str(l['score']),
                    '###NUM_COMMENTS###': l['num_comments'] if int(l['num_comments']) > 0 else str(0),
                    '###DATE###': datetime.utcfromtimestamp(int(l['created_utc'])).strftime('%Y-%m-%d'),
                    '###LINK_DOMAIN###': '(self.' + subreddit + ')' if l['is_self'] is True or l['is_self'] == 'True' else '',
                    '###HTML_AUTHOR_URL###': author_link_html,
                }
                link_html = template_link_url
                for key, value in index_link_data_map.items():
                    link_html = link_html.replace(key, value)
                links_html += link_html + '\n'
            index_page_data_map = {
                '###INCLUDE_PATH###': sort_based_prefix + '../',
                '###TITLE###': 'by ' + sort_indexes[sort]['slug'] + ' page ' + str(page_num) + ' of ' + str(len(pages)),
                '###SUB###': subreddit,
                '###ARCH_NUM_POSTS###': str(stat_sub_filtered_links),
                '###ARCH_NUM_COMMENTS###': str(stat_sub_comments),
                '###URL_SUBS###': sort_based_prefix + '../index.html',
                '###URL_PROJECT###': url_project,
                '###URL_IDX_SCORE###': sort_based_prefix + 'index.html',
                '###URL_IDX_CMNT###': sort_based_prefix + 'index-' + sort_indexes['num_comments']['slug'] + '/index.html',
                '###URL_IDX_DATE###': sort_based_prefix + 'index-' + sort_indexes['created_utc']['slug'] + '/index.html',
                '###URL_SEARCH###': sort_based_prefix + 'search.html',
                '###URL_IDX_SCORE_CSS###': 'active' if sort == 'score' else '',
                '###URL_IDX_CMNT_CSS###': 'active' if sort == 'num_comments' else '',
                '###URL_IDX_DATE_CSS###': 'active' if sort == 'created_utc' else '',
                '###URL_SEARCH_CSS###': '',
                '###HTML_LINKS###': links_html,
                '###HTML_SUBS_MENU###': subs_menu_html,
                '###HTML_PAGER###': get_pager_html(page_num, len(pages)),
            }
            page_html = template_subreddit
            for key, value in index_page_data_map.items():
                page_html = page_html.replace(key, value)
            # write file
            suffix = '-' + str(page_num) + '.html'
            if page_num == 1:
                suffix = '.html'
            filename = 'index' + suffix
            if sort == default_sort:
                filepath = 'r/' + subreddit + '/' + filename
            else:
                filepath = 'r/' + subreddit + '/index-' + sort_indexes[sort]['slug'] + '/' + filename
            if not os.path.isfile(filepath):
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                with open(filepath, 'w', encoding='utf-8') as file:
                    file.write(page_html)
                # print('wrote %s %s, %s links' % (sort, filepath, len(page)))
    return True

def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False):
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if os.path.isfile(filepath):
        return True
    created = datetime.utcfromtimestamp(int(link['created_utc']))
    sorted_comments = []
    if len(link['comments']) > 0:
        sorted_comments = sort_comments(link['comments'], hide_deleted_comments)
    # traverse up to root dir, depends on id length
    static_include_path = ''
    for i in range(len(link['id']) + 2):
        static_include_path += '../'
    # render comments
    comments_html = ''
    for c in sorted_comments:
        css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <= max_comment_depth else str(max_comment_depth))
        if c['author'] == link['author'] and c['author'] not in removed_content_identifiers:
            css_classes += ' op'
        if c['stickied'].lower() == 'true' or c['stickied'] is True:
            css_classes += ' stickied'
        # author link
        url = static_include_path + 'user/' + c['author'] + '.html'
        author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])
        comment_data_map = {
            '###ID###': c['id'],
            '###PARENT_ID###': c['parent_id'],
            '###DEPTH###': str(c['depth']),
            '###DATE###': created.strftime('%Y-%m-%d'),
            '###SCORE###': str(c['score']) if len(str(c['score'])) > 0 else missing_comment_score_label,
            '###BODY###': snudown.markdown(c['body'].replace('&gt;', '>')),
            '###CSS_CLASSES###': css_classes,
            '###CLASS_SCORE###': 'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else 'badge-secondary',
            '###HTML_AUTHOR_URL###': author_link_html,
        }
        comment_html = template_comment
        for key, value in comment_data_map.items():
            comment_html = comment_html.replace(key, value)
        comments_html += comment_html + '\n'
    # render subreddits list
    subs_menu_html = ''
    for sub in subreddits:
        sub_url = static_include_path + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
    # render selftext
    selftext_html = ''
    if len(link['selftext']) > 0:
        selftext_html = template_selftext.replace('###SELFTEXT###', snudown.markdown(link['selftext'].replace('&gt;', '>')))
    # author link
    url = static_include_path + 'user/' + link['author'] + '.html'
    author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])
    html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
    if link['is_self'] is True or link['is_self'].lower() == 'true':
        html_title = link['title']
    # render link page
    link_data_map = {
        '###INCLUDE_PATH###': static_include_path,
        '###SUB###': subreddit,
        '###TITLE###': link['title'],
        '###ID###': link['id'],
        '###DATE###': created.strftime('%Y-%m-%d'),
        '###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d') if link['retrieved_on'] != '' else 'n/a',
        '###SCORE###': str(link['score']),
        '###NUM_COMMENTS###': str(link['num_comments']),
        '###URL_PROJECT###': url_project,
        '###URL_SUBS###': static_include_path + 'index.html',
        '###URL_SUB###': static_include_path + subreddit + '/index.html',
        '###URL_SUB_CMNT###': static_include_path + subreddit + '/index-' + sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_SUB_DATE###': static_include_path + subreddit + '/index-' + sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###': static_include_path + subreddit + '/search.html',
        '###HTML_SUBS_MENU###': subs_menu_html,
        '###HTML_SELFTEXT###': selftext_html,
        '###HTML_COMMENTS###': comments_html,
        '###HTML_AUTHOR_URL###': author_link_html,
        '###HTML_TITLE###': html_title,
    }
    html = template_link
    for key, value in link_data_map.items():
        html = html.replace(key, value)
    # write html
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(html)
        # print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))
    return True

def write_subreddit_search_page(subreddit, subs, link_index, stat_sub_filtered_links, stat_sub_comments):
    if len(link_index) == 0:
        return True
    # sort alphabetically by title, ignoring non-word characters
    links = sorted(link_index, key=lambda k: re.sub(r'\W+', '', k['title']).lower())
    # render subreddits list
    subs_menu_html = ''
    for sub in subs:
        sub_url = '../' + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
    links_html = ''
    for l in links:
        link_comments_url = l['permalink'].lower().strip('/').replace('r/' + subreddit + '/', '')
        idpath = '/'.join(list(l['id']))
        link_comments_url = link_comments_url.replace(l['id'], idpath)
        link_comments_url += '.html'
        index_link_data_map = {
            '###TITLE###': l['title'],
            '###URL###': link_comments_url,
        }
        link_html = template_search_link
        for key, value in index_link_data_map.items():
            link_html = link_html.replace(key, value)
        links_html += link_html + '\n'
    index_page_data_map = {
        '###INCLUDE_PATH###': '../',
        '###TITLE###': 'search',
        '###SUB###': subreddit,
        '###ARCH_NUM_POSTS###': str(stat_sub_filtered_links),
        '###ARCH_NUM_COMMENTS###': str(stat_sub_comments),
        '###URL_SUBS###': '../index.html',
        '###URL_PROJECT###': url_project,
        '###URL_IDX_SCORE###': 'index.html',
        '###URL_IDX_CMNT###': 'index-' + sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_IDX_DATE###': 'index-' + sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###': 'search.html',
        '###URL_IDX_SCORE_CSS###': '',
        '###URL_IDX_CMNT_CSS###': '',
        '###URL_IDX_DATE_CSS###': '',
        '###URL_SEARCH_CSS###': 'active',
        '###HTML_LINKS###': links_html,
        '###HTML_SUBS_MENU###': subs_menu_html,
    }
    page_html = template_search
    for key, value in index_page_data_map.items():
        page_html = page_html.replace(key, value)
    # write file
    filename = 'search.html'
    filepath = 'r/' + subreddit + '/' + filename
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(page_html)
        # print('wrote %s, %s links' % (filepath, len(links)))
    return True

def write_user_page(subs, user_index):
    if len(user_index.keys()) == 0:
        return False
    # subreddits list
    subs_menu_html = ''
    for sub in subs:
        sub_url = '../' + sub['name'] + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub['name'])
    for user in user_index.keys():
        links = user_index[user]
        links.sort(key=lambda k: (int(k['score']) if k['score'] != '' else sort_indexes['score']['default']), reverse=True)
        links_html = ''
        for l in links:
            author_link_html = template_user_url
            author_url = l['author'] + '.html'
            author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
            link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/')
            link_comments_url = '../' + link_comments_url
            idpath = '/'.join(list(l['id']))
            link_comments_url = link_comments_url.replace(l['id'], idpath)
            link_comments_url += '.html'
            link_url = l['url']
            if l['is_self'] is True or l['is_self'] == 'True':
                link_url = link_comments_url
            link_data_map = {
                '###TITLE###': l['title'],
                '###URL###': link_url,
                '###URL_COMMENTS###': link_comments_url,
                '###SCORE###': str(l['score']),
                '###NUM_COMMENTS###': str(l['num_comments']) if int(l['num_comments']) > 0 else str(0),
                '###DATE###': datetime.utcfromtimestamp(int(l['created_utc'])).strftime('%Y-%m-%d'),
                '###SUB###': l['subreddit'],
                '###SUB_URL###': '../' + l['subreddit'] + '/index.html',
                '###HTML_AUTHOR_URL###': author_link_html,
            }
            link_html = template_user_page_link
            for key, value in link_data_map.items():
                link_html = link_html.replace(key, value)
            links_html += link_html + '\n'
        page_data_map = {
            '###INCLUDE_PATH###': '../',
            '###TITLE###': 'user/' + user,
            '###ARCH_NUM_POSTS###': str(len(links)),
            '###URL_USER###': user + '.html',
            '###URL_SUBS###': '../index.html',
            '###URL_PROJECT###': url_project,
            '###HTML_LINKS###': links_html,
            '###HTML_SUBS_MENU###': subs_menu_html,
        }
        page_html = template_user
        for key, value in page_data_map.items():
            page_html = page_html.replace(key, value)
        filepath = 'r/user/' + user + '.html'
        if not os.path.isfile(filepath):
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(page_html)
            # print('wrote %s' % (filepath))
    return True

def write_index(subs):
    if len(subs) == 0:
        return False
    subs.sort(key=lambda k: k['name'].casefold())
    stat_num_links = 0
    links_html = ''
    subs_menu_html = ''
    for sub in subs:
        sub_url = sub['name'] + '/index.html'
        links_html += template_index_sub.replace('#URL_SUB#', sub_url).replace('#SUB#', sub['name']).replace('#NUM_LINKS#', str(sub['num_links']))
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub['name'])
        stat_num_links += sub['num_links']
    index_page_data_map = {
        '###INCLUDE_PATH###': '',
        '###TITLE###': 'subreddits',
        '###URL_SUBS###': 'index.html',
        '###URL_PROJECT###': url_project,
        '###ARCH_NUM_POSTS###': str(stat_num_links),
        '###HTML_LINKS###': links_html,
        '###HTML_SUBS_MENU###': subs_menu_html,
    }
    page_html = template_index
    for key, value in index_page_data_map.items():
        page_html = page_html.replace(key, value)
    filepath = 'r/index.html'
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(page_html)
        # print('wrote %s' % (filepath))
    return True

# a 'top' comments sort with orphaned comments (incomplete data) rendered last
def sort_comments(comments, hide_deleted_comments=False):
    sorted_comments = []
    if len(comments) == 0:
        return sorted_comments
    parent_map = {}
    id_map = {}
    top_level_comments = []
    link_id = comments[0]['link_id']
    depth = 0
    for c in comments:
        c['depth'] = depth
        id_map[c['id']] = c
        parent_map[c['id']] = c['parent_id']
        # add stickied comments
        if c['stickied'].lower() == 'true':
            sorted_comments.append(c)
        # store top level comments
        elif c['parent_id'] == c['link_id']:
            top_level_comments.append(c)
    # sort non stickied top level comments
    if len(top_level_comments) > 0:
        top_level_comments = sorted(top_level_comments, key=lambda k: (int(k['score']) if k['score'] != '' else 1), reverse=True)
        sorted_comments += top_level_comments
    # add each top level comment's child comments
    sorted_linear_comments = []
    for c in sorted_comments:
        # only remove deleted comments if no children
        if hide_deleted_comments and c['body'] in removed_content_identifiers and 't1_' + c['id'] not in parent_map.values():
            pass
        else:
            sorted_linear_comments.append(c)
            child_comments = get_comment_tree_list([], depth + 1, c, id_map, parent_map, hide_deleted_comments)
            if len(child_comments) > 0:
                sorted_linear_comments += child_comments
    # add orphaned comments
    for c in comments:
        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
            if hide_deleted_comments and c['body'] in removed_content_identifiers:
                continue
            sorted_linear_comments.append(c)
    # print('sort_comments() in %s out %s show deleted: %s' % (len(comments), len(sorted_comments), hide_deleted_comments))
    return sorted_linear_comments
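
# a small worked example of the ordering above, assuming no stickied and no
# deleted comments: given top-level comments a (score 10) and b (score 5),
# the linear output is a, then a's children depth-first (each level sorted
# by score), then b and b's children, and finally any orphans whose parent
# id does not appear in the archived data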

def get_comment_tree_list(tree, depth, parent_comment, id_map, parent_map, hide_deleted_comments):
    parent_id = 't1_' + parent_comment['id']
    child_comments = []
    for key, value in parent_map.items():
        if value == parent_id:
            if hide_deleted_comments and id_map[key]['body'] in removed_content_identifiers and 't1_' + key not in parent_map.values():
                pass
            else:
                child_comments.append(id_map[key])
    # sort children by score
    # TODO: sort by score and # of child comments
    if len(child_comments) > 0:
        child_comments = sorted(child_comments, key=lambda k: (int(k['score']) if k['score'] != '' else 1), reverse=True)
        for child_comment in child_comments:
            child_comment['depth'] = depth
            tree.append(child_comment)
            tree = get_comment_tree_list(tree, depth + 1, child_comment, id_map, parent_map, hide_deleted_comments)
    return tree

def validate_link(link, min_score=0, min_comments=0):
    if not link:
        return False
    elif 'id' not in link.keys():
        return False
    # apply multiple conditions as an OR, keep high score low comments and high comment low score links/posts
    # TODO this should be configurable
    if min_score > 0 and min_comments > 0:
        if int(link['score']) < min_score and int(link['num_comments']) < min_comments:
            return False
    else:
        if min_score > 0 and int(link['score']) < min_score:
            return False
        if min_comments > 0 and int(link['num_comments']) < min_comments:
            return False
    return True
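
# e.g. with min_score=100 and min_comments=100, a 5000-point link with only
# 2 comments is kept, as is a 3-point link with 500 comments; a link with 50
# points and 50 comments fails both thresholds and is dropped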

def load_links(date, subreddit, with_comments=False):
    links = []
    if not date or not subreddit:
        return links
    date_path = date.strftime("%Y/%m/%d")
    daily_path = 'data/' + subreddit + '/' + date_path
    daily_links_path = daily_path + '/' + source_data_links
    if os.path.isfile(daily_links_path):
        links = []
        with open(daily_links_path, 'r', encoding='utf-8') as links_file:
            reader = csv.DictReader(links_file)
            for link_row in reader:
                if with_comments and 'id' in link_row.keys():
                    comments = []
                    comments_file_path = daily_path + '/' + link_row['id'] + '.csv'
                    if os.path.isfile(comments_file_path):
                        with open(comments_file_path, 'r', encoding='utf-8') as comments_file:
                            comments_reader = csv.DictReader(comments_file)
                            for comment_row in comments_reader:
                                comments.append(comment_row)
                    link_row['comments'] = comments
                links.append(link_row)
    return links

def get_subs():
    subs = []
    if not os.path.isdir('data'):
        print('ERROR: no data, run fetch_links.py first')
        return subs
    for d in os.listdir('data'):
        if os.path.isdir('data' + '/' + d):
            subs.append(d.lower())
    return subs
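
# pager link targets reuse the index file naming from write_subreddit_pages:
# page 1 is index.html, page n > 1 is index-<n>.html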

def get_pager_html(page_num=1, pages=1):
    html_pager = ''
    # previous
    css = ''
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if page_num - 1 > 1:
        url += '-' + str(page_num - 1)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '&lsaquo;').replace('#CSS_CLASS#', css)
    # skip back
    css = ''
    prev_skip = page_num - pager_skip
    if prev_skip < 1:
        prev_skip = 1
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if prev_skip > 1:
        url += '-' + str(prev_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '&lsaquo;&lsaquo;').replace('#CSS_CLASS#', css)
    # skip back far
    css = ''
    prev_skip = page_num - pager_skip_long
    if prev_skip < 1:
        prev_skip = 1
    if page_num == 1:
        css += ' disabled'
    url = 'index'
    if prev_skip > 1:
        url += '-' + str(prev_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '&lsaquo;&lsaquo;&lsaquo;').replace('#CSS_CLASS#', css)
    # n - 1
    start = -2
    if page_num + 1 > pages:
        start -= 1
    if page_num + 2 > pages:
        start -= 1
    for prev_page_num in range(start, 0):
        if page_num + prev_page_num > 0:
            css = ''
            url = 'index'
            if page_num + prev_page_num > 1:
                url += '-' + str(page_num + prev_page_num)
            url += '.html'
            if prev_page_num < -1:
                css = 'd-none d-sm-block'
            html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', str(page_num + prev_page_num)).replace('#CSS_CLASS#', css)
    # n
    url = 'index'
    if page_num > 1:
        url += '-' + str(page_num)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', str(page_num)).replace('#CSS_CLASS#', 'active')
    # n + 1
    css = ''
    end = 3
    if page_num - 1 < 1:
        end += 1
    if page_num - 2 < 1:
        end += 1
    for next_page_num in range(1, end):
        if page_num + next_page_num <= pages:
            if next_page_num > 1:
                css = 'd-none d-sm-block'
            html_pager += template_index_pager_link.replace('#URL#', 'index' + '-' + str(page_num + next_page_num) + '.html').replace('#TEXT#', str(page_num + next_page_num)).replace('#CSS_CLASS#', css)
    # skip forward far
    next_skip = page_num + pager_skip_long
    css = ''
    if page_num == pages:
        css += ' disabled'
    if next_skip > pages:
        next_skip = pages
    url = 'index'
    if next_skip > 1:
        url += '-' + str(next_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '&rsaquo;&rsaquo;&rsaquo;').replace('#CSS_CLASS#', css)
    # skip forward
    next_skip = page_num + pager_skip
    css = ''
    if page_num == pages:
        css = 'disabled'
    if next_skip > pages:
        next_skip = pages
    url = 'index'
    if next_skip > 1:
        url += '-' + str(next_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '&rsaquo;&rsaquo;').replace('#CSS_CLASS#', css)
    # next
    css = ''
    next_num = page_num + 1
    if page_num == pages:
        css = 'disabled'
        next_num = pages
    html_pager += template_index_pager_link.replace('#URL#', 'index' + '-' + str(next_num) + '.html').replace('#TEXT#', '&rsaquo;').replace('#CSS_CLASS#', css)
    return html_pager

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
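
# example invocations, assuming data/ was already populated by fetch_links.py:
#   python write_html.py
#   python write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments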

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-score', default=0, help='limit post rendering, default 0')
    parser.add_argument('--min-comments', default=0, help='limit post rendering, default 0')
    parser.add_argument('--hide-deleted-comments', action='store_true', help='exclude deleted and removed comments where possible')
    args = parser.parse_args()
    hide_deleted_comments = False
    if args.hide_deleted_comments:
        hide_deleted_comments = True
    args.min_score = int(args.min_score)
    args.min_comments = int(args.min_comments)
    generate_html(args.min_score, args.min_comments, hide_deleted_comments)