#! /usr/bin/env python

from datetime import datetime, date, timedelta
import argparse
import csv
import os
import re
import snudown
import psutil

url_project = 'https://github.com/libertysoft3/reddit-html-archiver'
links_per_page = 30
pager_skip = 10
pager_skip_long = 100
start_date = date(2005, 1, 1)
end_date = datetime.today().date() + timedelta(days=1)
source_data_links = 'links.csv'
max_comment_depth = 8  # mostly for mobile, which might be silly
removed_content_identifiers = ['[deleted]', 'deleted', '[removed]', 'removed']
default_sort = 'score'
sort_indexes = {
    'score': {
        'default': 1,
        'slug': 'score'
    },
    'num_comments': {
        'default': 0,
        'slug': 'comments',
    },
    'created_utc': {
        'default': 1000198000,
        'slug': 'date',
    }
}
missing_comment_score_label = 'n/a'
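
# the 'default' values above substitute for empty CSV fields when sorting, so
# rows with a missing score, comment count, or timestamp still sort
# deterministically (see the sorted() calls below)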

template_index = ''
with open('templates/index.html', 'r', encoding='utf-8') as file:
    template_index = file.read()
template_subreddit = ''
with open('templates/subreddit.html', 'r', encoding='utf-8') as file:
    template_subreddit = file.read()
template_link = ''
with open('templates/link.html', 'r', encoding='utf-8') as file:
    template_link = file.read()
template_comment = ''
with open('templates/partial_comment.html', 'r', encoding='utf-8') as file:
    template_comment = file.read()
template_search = ''
with open('templates/search.html', 'r', encoding='utf-8') as file:
    template_search = file.read()
template_user = ''
with open('templates/user.html', 'r', encoding='utf-8') as file:
    template_user = file.read()
template_sub_link = ''
with open('templates/partial_menu_item.html', 'r', encoding='utf-8') as file:
    template_sub_link = file.read()
template_user_url = ''
with open('templates/partial_user.html', 'r', encoding='utf-8') as file:
    template_user_url = file.read()
template_link_url = ''
with open('templates/partial_link.html', 'r', encoding='utf-8') as file:
    template_link_url = file.read()
template_search_link = ''
with open('templates/partial_search_link.html', 'r', encoding='utf-8') as file:
    template_search_link = file.read()
template_index_sub = ''
with open('templates/partial_index_subreddit.html', 'r', encoding='utf-8') as file:
    template_index_sub = file.read()
template_index_pager_link = ''
with open('templates/partial_subreddit_pager_link.html', 'r', encoding='utf-8') as file:
    template_index_pager_link = file.read()
template_selftext = ''
with open('templates/partial_link_selftext.html', 'r', encoding='utf-8') as file:
    template_selftext = file.read()
template_user_page_link = ''
with open('templates/partial_user_link.html', 'r', encoding='utf-8') as file:
    template_user_page_link = file.read()
template_url = ''
with open('templates/partial_url.html', 'r', encoding='utf-8') as file:
    template_url = file.read()

process = psutil.Process(os.getpid())
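
# Render the full archive: per-link comment pages, paginated subreddit index
# pages for each sort order, a per-subreddit search page, user pages, and the
# top-level subreddit index, in that order.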
def generate_html(min_score=0, min_comments=0, hide_deleted_comments=False):
    delta = timedelta(days=1)
    subs = get_subs()
    user_index = {}
    processed_subs = []
    stat_links = 0
    stat_filtered_links = 0
    for sub in subs:
        # write link pages
        # print('generate_html() processing %s %s kb' % (sub, int(int(process.memory_info().rss) / 1024)))
        stat_sub_links = 0
        stat_sub_filtered_links = 0
        stat_sub_comments = 0
        d = start_date
        while d <= end_date:
            raw_links = load_links(d, sub, True)
            stat_links += len(raw_links)
            stat_sub_links += len(raw_links)
            for l in raw_links:
                if validate_link(l, min_score, min_comments):
                    write_link_page(subs, l, sub, hide_deleted_comments)
                    stat_filtered_links += 1
                    stat_sub_filtered_links += 1
                    if 'comments' in l:
                        stat_sub_comments += len(l['comments'])
            d += delta
        if stat_sub_filtered_links > 0:
            processed_subs.append({'name': sub, 'num_links': stat_sub_filtered_links})
        print('%s: %s links filtered to %s' % (sub, stat_sub_links, stat_sub_filtered_links))
        # write subreddit pages
        valid_sub_links = []
        d = start_date
        while d <= end_date:
            raw_links = load_links(d, sub)
            for l in raw_links:
                if validate_link(l, min_score, min_comments):
                    valid_sub_links.append(l)
                    # collect links for user pages
                    # TODO: this is the least performant bit. load and generate user pages user by user instead.
                    l['subreddit'] = sub
                    if l['author'] not in user_index.keys():
                        user_index[l['author']] = []
                    user_index[l['author']].append(l)
            d += delta
        write_subreddit_pages(sub, subs, valid_sub_links, stat_sub_filtered_links, stat_sub_comments)
        write_subreddit_search_page(sub, subs, valid_sub_links, stat_sub_filtered_links, stat_sub_comments)
    # write user pages
    write_user_page(processed_subs, user_index)
    # write index page
    write_index(processed_subs)
    print('all done. %s links filtered to %s' % (stat_links, stat_filtered_links))
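
# Output layout written below (subreddit name illustrative): the default
# 'score' sort goes to r/<sub>/index.html, index-2.html, ...; the other sorts
# go to r/<sub>/index-comments/ and r/<sub>/index-date/ with the same
# index.html, index-2.html, ... pagination.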
def write_subreddit_pages(subreddit, subs, link_index, stat_sub_filtered_links, stat_sub_comments):
    if len(link_index) == 0:
        return True
    for sort in sort_indexes.keys():
        links = sorted(link_index, key=lambda k: (int(k[sort]) if k[sort] != '' else sort_indexes[sort]['default']), reverse=True)
        pages = list(chunks(links, links_per_page))
        page_num = 0
        sort_based_prefix = '../'
        if sort == default_sort:
            sort_based_prefix = ''
        # render subreddits list
        subs_menu_html = ''
        for sub in subs:
            sub_url = sort_based_prefix + '../' + sub + '/index.html'
            subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
        for page in pages:
            page_num += 1
            # print('%s page' % (page))
            links_html = ''
            for l in page:
                author_link_html = template_user_url
                author_url = sort_based_prefix + '../user/' + l['author'] + '.html'
                author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
                link_url = l['url']
                link_comments_url = sort_based_prefix + l['permalink'].lower().strip('/')
                link_comments_url = link_comments_url.replace('r/' + subreddit + '/', '')
                idpath = '/'.join(list(l['id']))
                link_comments_url = link_comments_url.replace(l['id'], idpath)
                link_comments_url += '.html'
                if l['is_self'] is True or l['is_self'] == 'True':
                    link_url = link_comments_url
                index_link_data_map = {
                    '###TITLE###': l['title'],
                    '###URL###': link_url,
                    '###URL_COMMENTS###': link_comments_url,
                    '###SCORE###': str(l['score']),
                    '###NUM_COMMENTS###': l['num_comments'] if int(l['num_comments']) > 0 else str(0),
                    '###DATE###': datetime.utcfromtimestamp(int(l['created_utc'])).strftime('%Y-%m-%d'),
                    '###LINK_DOMAIN###': '(self.' + subreddit + ')' if l['is_self'] is True or l['is_self'] == 'True' else '',
                    '###HTML_AUTHOR_URL###': author_link_html,
                }
                link_html = template_link_url
                for key, value in index_link_data_map.items():
                    link_html = link_html.replace(key, value)
                links_html += link_html + '\n'
            index_page_data_map = {
                '###INCLUDE_PATH###': sort_based_prefix + '../',
                '###TITLE###': 'by ' + sort_indexes[sort]['slug'] + ' page ' + str(page_num) + ' of ' + str(len(pages)),
                '###SUB###': subreddit,
                '###ARCH_NUM_POSTS###': str(stat_sub_filtered_links),
                '###ARCH_NUM_COMMENTS###': str(stat_sub_comments),
                '###URL_SUBS###': sort_based_prefix + '../index.html',
                '###URL_PROJECT###': url_project,
                '###URL_IDX_SCORE###': sort_based_prefix + 'index.html',
                '###URL_IDX_CMNT###': sort_based_prefix + 'index-' + sort_indexes['num_comments']['slug'] + '/index.html',
                '###URL_IDX_DATE###': sort_based_prefix + 'index-' + sort_indexes['created_utc']['slug'] + '/index.html',
                '###URL_SEARCH###': sort_based_prefix + 'search.html',
                '###URL_IDX_SCORE_CSS###': 'active' if sort == 'score' else '',
                '###URL_IDX_CMNT_CSS###': 'active' if sort == 'num_comments' else '',
                '###URL_IDX_DATE_CSS###': 'active' if sort == 'created_utc' else '',
                '###URL_SEARCH_CSS###': '',
                '###HTML_LINKS###': links_html,
                '###HTML_SUBS_MENU###': subs_menu_html,
                '###HTML_PAGER###': get_pager_html(page_num, len(pages)),
            }
            page_html = template_subreddit
            for key, value in index_page_data_map.items():
                page_html = page_html.replace(key, value)
            # write file
            suffix = '-' + str(page_num) + '.html'
            if page_num == 1:
                suffix = '.html'
            filename = 'index' + suffix
            if sort == default_sort:
                filepath = 'r/' + subreddit + '/' + filename
            else:
                filepath = 'r/' + subreddit + '/index-' + sort_indexes[sort]['slug'] + '/' + filename
            if not os.path.isfile(filepath):
                os.makedirs(os.path.dirname(filepath), exist_ok=True)
                with open(filepath, 'w', encoding='utf-8') as file:
                    file.write(page_html)
                # print('wrote %s %s, %s links' % (sort, filepath, len(page)))
    return True
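
# Link pages shard the base36 link id into one directory per character (see
# the permalink example below); presumably this keeps any single comments/
# directory from accumulating too many files.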
def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False):
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if os.path.isfile(filepath):
        return True
    created = datetime.utcfromtimestamp(int(link['created_utc']))
    sorted_comments = []
    if len(link['comments']) > 0:
        sorted_comments = sort_comments(link['comments'], hide_deleted_comments)
    # traverse up to root dir, depends on id length
    static_include_path = ''
    for i in range(len(link['id']) + 2):
        static_include_path += '../'
    # render comments
    comments_html = ''
    for c in sorted_comments:
        css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <= max_comment_depth else str(max_comment_depth))
        if c['author'] == link['author'] and c['author'] not in removed_content_identifiers:
            css_classes += ' op'
        if c['stickied'].lower() == 'true' or c['stickied'] is True:
            css_classes += ' stickied'
        # author link
        url = static_include_path + 'user/' + c['author'] + '.html'
        author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])
        comment_data_map = {
            '###ID###': c['id'],
            '###PARENT_ID###': c['parent_id'],
            '###DEPTH###': str(c['depth']),
            '###DATE###': created.strftime('%Y-%m-%d'),
            '###SCORE###': str(c['score']) if len(str(c['score'])) > 0 else missing_comment_score_label,
            '###BODY###': snudown.markdown(c['body'].replace('&gt;', '>')),
            '###CSS_CLASSES###': css_classes,
            '###CLASS_SCORE###': 'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else 'badge-secondary',
            '###HTML_AUTHOR_URL###': author_link_html,
        }
        comment_html = template_comment
        for key, value in comment_data_map.items():
            comment_html = comment_html.replace(key, value)
        comments_html += comment_html + '\n'
    # render subreddits list
    subs_menu_html = ''
    for sub in subreddits:
        sub_url = static_include_path + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
    # render selftext
    selftext_html = ''
    if len(link['selftext']) > 0:
        selftext_html = template_selftext.replace('###SELFTEXT###', snudown.markdown(link['selftext'].replace('&gt;', '>')))
    # author link
    url = static_include_path + 'user/' + link['author'] + '.html'
    author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])
    html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
    if link['is_self'] is True or link['is_self'].lower() == 'true':
        html_title = link['title']
    # render link page
    link_data_map = {
        '###INCLUDE_PATH###': static_include_path,
        '###SUB###': subreddit,
        '###TITLE###': link['title'],
        '###ID###': link['id'],
        '###DATE###': created.strftime('%Y-%m-%d'),
        '###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d') if link['retrieved_on'] != '' else 'n/a',
        '###SCORE###': str(link['score']),
        '###NUM_COMMENTS###': str(link['num_comments']),
        '###URL_PROJECT###': url_project,
        '###URL_SUBS###': static_include_path + 'index.html',
        '###URL_SUB###': static_include_path + subreddit + '/index.html',
        '###URL_SUB_CMNT###': static_include_path + subreddit + '/index-' + sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_SUB_DATE###': static_include_path + subreddit + '/index-' + sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###': static_include_path + subreddit + '/search.html',
        '###HTML_SUBS_MENU###': subs_menu_html,
        '###HTML_SELFTEXT###': selftext_html,
        '###HTML_COMMENTS###': comments_html,
        '###HTML_AUTHOR_URL###': author_link_html,
        '###HTML_TITLE###': html_title,
    }
    html = template_link
    for key, value in link_data_map.items():
        html = html.replace(key, value)
    # write html to the filepath computed above
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(html)
        # print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))
    return True
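
# The search page is a single static, alphabetically sorted list of every
# archived link title in the subreddit; any actual query filtering is
# presumably handled client-side by the search template.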
def write_subreddit_search_page(subreddit, subs, link_index, stat_sub_filtered_links, stat_sub_comments):
    if len(link_index) == 0:
        return True
    # name sort?
    links = sorted(link_index, key=lambda k: re.sub(r'\W+', '', k['title']).lower())
    # render subreddits list
    subs_menu_html = ''
    for sub in subs:
        sub_url = '../' + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)
    links_html = ''
    for l in links:
        link_comments_url = l['permalink'].lower().strip('/').replace('r/' + subreddit + '/', '')
        idpath = '/'.join(list(l['id']))
        link_comments_url = link_comments_url.replace(l['id'], idpath)
        link_comments_url += '.html'
        index_link_data_map = {
            '###TITLE###': l['title'],
            '###URL###': link_comments_url,
        }
        link_html = template_search_link
        for key, value in index_link_data_map.items():
            link_html = link_html.replace(key, value)
        links_html += link_html + '\n'
    index_page_data_map = {
        '###INCLUDE_PATH###': '../',
        '###TITLE###': 'search',
        '###SUB###': subreddit,
        '###ARCH_NUM_POSTS###': str(stat_sub_filtered_links),
        '###ARCH_NUM_COMMENTS###': str(stat_sub_comments),
        '###URL_SUBS###': '../index.html',
        '###URL_PROJECT###': url_project,
        '###URL_IDX_SCORE###': 'index.html',
        '###URL_IDX_CMNT###': 'index-' + sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_IDX_DATE###': 'index-' + sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###': 'search.html',
        '###URL_IDX_SCORE_CSS###': '',
        '###URL_IDX_CMNT_CSS###': '',
        '###URL_IDX_DATE_CSS###': '',
        '###URL_SEARCH_CSS###': 'active',
        '###HTML_LINKS###': links_html,
        '###HTML_SUBS_MENU###': subs_menu_html,
    }
    page_html = template_search
    for key, value in index_page_data_map.items():
        page_html = page_html.replace(key, value)
    # write file
    filename = 'search.html'
    filepath = 'r/' + subreddit + '/' + filename
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(page_html)
        # print('wrote %s, %s links' % (filepath, len(links)))
    return True

def write_user_page(subs, user_index):
    if len(user_index.keys()) == 0:
        return False
    # subreddits list
    subs_menu_html = ''
    for sub in subs:
        sub_url = '../' + sub['name'] + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub['name'])
    for user in user_index.keys():
        links = user_index[user]
        links.sort(key=lambda k: (int(k['score']) if k['score'] != '' else sort_indexes['score']['default']), reverse=True)
        links_html = ''
        for l in links:
            author_link_html = template_user_url
            author_url = l['author'] + '.html'
            author_link_html = author_link_html.replace('###URL_AUTHOR###', author_url).replace('###AUTHOR###', l['author'])
            link_comments_url = l['permalink'].lower().replace('/r/', '').strip('/')
            link_comments_url = '../' + link_comments_url
            idpath = '/'.join(list(l['id']))
            link_comments_url = link_comments_url.replace(l['id'], idpath)
            link_comments_url += '.html'
            link_url = l['url']
            if l['is_self'] is True or l['is_self'] == 'True':
                link_url = link_comments_url
            link_data_map = {
                '###TITLE###': l['title'],
                '###URL###': link_url,
                '###URL_COMMENTS###': link_comments_url,
                '###SCORE###': str(l['score']),
                '###NUM_COMMENTS###': str(l['num_comments']) if int(l['num_comments']) > 0 else str(0),
                '###DATE###': datetime.utcfromtimestamp(int(l['created_utc'])).strftime('%Y-%m-%d'),
                '###SUB###': l['subreddit'],
                '###SUB_URL###': '../' + l['subreddit'] + '/index.html',
                '###HTML_AUTHOR_URL###': author_link_html,
            }
            link_html = template_user_page_link
            for key, value in link_data_map.items():
                link_html = link_html.replace(key, value)
            links_html += link_html + '\n'
        page_data_map = {
            '###INCLUDE_PATH###': '../',
            '###TITLE###': 'user/' + user,
            '###ARCH_NUM_POSTS###': str(len(links)),
            '###URL_USER###': user + '.html',
            '###URL_SUBS###': '../index.html',
            '###URL_PROJECT###': url_project,
            '###HTML_LINKS###': links_html,
            '###HTML_SUBS_MENU###': subs_menu_html,
        }
        page_html = template_user
        for key, value in page_data_map.items():
            page_html = page_html.replace(key, value)
        filepath = 'r/user/' + user + '.html'
        if not os.path.isfile(filepath):
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(page_html)
            # print('wrote %s' % (filepath))
    return True

def write_index(subs):
    if len(subs) == 0:
        return False
    subs.sort(key=lambda k: k['name'].casefold())
    stat_num_links = 0
    links_html = ''
    subs_menu_html = ''
    for sub in subs:
        sub_url = sub['name'] + '/index.html'
        links_html += template_index_sub.replace('#URL_SUB#', sub_url).replace('#SUB#', sub['name']).replace('#NUM_LINKS#', str(sub['num_links']))
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub['name'])
        stat_num_links += sub['num_links']
    index_page_data_map = {
        '###INCLUDE_PATH###': '',
        '###TITLE###': 'subreddits',
        '###URL_SUBS###': 'index.html',
        '###URL_PROJECT###': url_project,
        '###ARCH_NUM_POSTS###': str(stat_num_links),
        '###HTML_LINKS###': links_html,
        '###HTML_SUBS_MENU###': subs_menu_html,
    }
    page_html = template_index
    for key, value in index_page_data_map.items():
        page_html = page_html.replace(key, value)
    filepath = 'r/index.html'
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(page_html)
        # print('wrote %s' % (filepath))
    return True
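
# Resulting comment order, concretely: stickied comments first, then top level
# comments by descending score, each followed depth-first by its score-sorted
# children, with orphaned comments (missing parents) appended last.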
# a 'top' comments sort with orphaned comments (incomplete data) rendered last
def sort_comments(comments, hide_deleted_comments=False):
    sorted_comments = []
    if len(comments) == 0:
        return sorted_comments
    parent_map = {}
    id_map = {}
    top_level_comments = []
    link_id = comments[0]['link_id']
    depth = 0
    for c in comments:
        c['depth'] = depth
        id_map[c['id']] = c
        parent_map[c['id']] = c['parent_id']
        # add stickied comments
        if c['stickied'].lower() == 'true':
            sorted_comments.append(c)
        # store top level comments
        elif c['parent_id'] == c['link_id']:
            top_level_comments.append(c)
    # sort non stickied top level comments
    if len(top_level_comments) > 0:
        top_level_comments = sorted(top_level_comments, key=lambda k: (int(k['score']) if k['score'] != '' else 1), reverse=True)
        sorted_comments += top_level_comments
    # add each top level comment's child comments
    sorted_linear_comments = []
    for c in sorted_comments:
        # only remove deleted comments if no children
        if hide_deleted_comments and c['body'] in removed_content_identifiers and 't1_' + c['id'] not in parent_map.values():
            pass
        else:
            sorted_linear_comments.append(c)
            child_comments = get_comment_tree_list([], depth + 1, c, id_map, parent_map, hide_deleted_comments)
            if len(child_comments) > 0:
                sorted_linear_comments += child_comments
    # add orphaned comments
    for c in comments:
        if c['parent_id'] != link_id and c['parent_id'].replace('t1_', '') not in id_map.keys():
            if hide_deleted_comments and c['body'] in removed_content_identifiers:
                continue
            sorted_linear_comments.append(c)
    # print('sort_comments() in %s out %s show deleted: %s' % (len(comments), len(sorted_comments), hide_deleted_comments))
    return sorted_linear_comments

def get_comment_tree_list(tree, depth, parent_comment, id_map, parent_map, hide_deleted_comments):
    parent_id = 't1_' + parent_comment['id']
    child_comments = []
    for key, value in parent_map.items():
        if value == parent_id:
            if hide_deleted_comments and id_map[key]['body'] in removed_content_identifiers and 't1_' + key not in parent_map.values():
                pass
            else:
                child_comments.append(id_map[key])
    # sort children by score
    # TODO: sort by score and # of child comments
    if len(child_comments) > 0:
        child_comments = sorted(child_comments, key=lambda k: (int(k['score']) if k['score'] != '' else 1), reverse=True)
        for child_comment in child_comments:
            child_comment['depth'] = depth
            tree.append(child_comment)
            tree = get_comment_tree_list(tree, depth + 1, child_comment, id_map, parent_map, hide_deleted_comments)
    return tree

def validate_link(link, min_score=0, min_comments=0):
    if not link:
        return False
    elif 'id' not in link:
        return False
    # apply both thresholds as an OR: keep high-score/low-comment and
    # high-comment/low-score posts
    # TODO: this should be configurable
    if min_score > 0 and min_comments > 0:
        if int(link['score']) < min_score and int(link['num_comments']) < min_comments:
            return False
    else:
        if min_score > 0 and int(link['score']) < min_score:
            return False
        if min_comments > 0 and int(link['num_comments']) < min_comments:
            return False
    return True
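
# Expected input layout, as read below (dates illustrative):
#   data/<subreddit>/2015/01/01/links.csv      one row per link
#   data/<subreddit>/2015/01/01/<link id>.csv  one row per comment on that link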
def load_links(date, subreddit, with_comments=False):
    links = []
    if not date or not subreddit:
        return links
    date_path = date.strftime("%Y/%m/%d")
    daily_path = 'data/' + subreddit + '/' + date_path
    daily_links_path = daily_path + '/' + source_data_links
    if os.path.isfile(daily_links_path):
        with open(daily_links_path, 'r', encoding='utf-8') as links_file:
            reader = csv.DictReader(links_file)
            for link_row in reader:
                if with_comments and 'id' in link_row.keys():
                    comments = []
                    comments_file_path = daily_path + '/' + link_row['id'] + '.csv'
                    if os.path.isfile(comments_file_path):
                        with open(comments_file_path, 'r', encoding='utf-8') as comments_file:
                            comments_reader = csv.DictReader(comments_file)
                            for comment_row in comments_reader:
                                comments.append(comment_row)
                    link_row['comments'] = comments
                links.append(link_row)
    return links

def get_subs():
    subs = []
    if not os.path.isdir('data'):
        print('ERROR: no data, run fetch_links.py first')
        return subs
    for d in os.listdir('data'):
        if os.path.isdir('data' + '/' + d):
            subs.append(d.lower())
    return subs

def get_pager_html(page_num=1, pages=1):
    html_pager = ''
    # previous
    css = ''
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if page_num - 1 > 1:
        url += '-' + str(page_num - 1)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '‹').replace('#CSS_CLASS#', css)
    # skip back
    css = ''
    prev_skip = page_num - pager_skip
    if prev_skip < 1:
        prev_skip = 1
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if prev_skip > 1:
        url += '-' + str(prev_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '‹‹').replace('#CSS_CLASS#', css)
    # skip back far
    css = ''
    prev_skip = page_num - pager_skip_long
    if prev_skip < 1:
        prev_skip = 1
    if page_num == 1:
        css = 'disabled'
    url = 'index'
    if prev_skip > 1:
        url += '-' + str(prev_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '‹‹‹').replace('#CSS_CLASS#', css)
    # n - 1
    start = -2
    if page_num + 1 > pages:
        start -= 1
    if page_num + 2 > pages:
        start -= 1
    for prev_page_num in range(start, 0):
        if page_num + prev_page_num > 0:
            css = ''
            url = 'index'
            if page_num + prev_page_num > 1:
                url += '-' + str(page_num + prev_page_num)
            url += '.html'
            if prev_page_num < -1:
                css = 'd-none d-sm-block'
            html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', str(page_num + prev_page_num)).replace('#CSS_CLASS#', css)
    # n
    url = 'index'
    if page_num > 1:
        url += '-' + str(page_num)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', str(page_num)).replace('#CSS_CLASS#', 'active')
    # n + 1
    css = ''
    end = 3
    if page_num - 1 < 1:
        end += 1
    if page_num - 2 < 1:
        end += 1
    for next_page_num in range(1, end):
        if page_num + next_page_num <= pages:
            if next_page_num > 1:
                css = 'd-none d-sm-block'
            html_pager += template_index_pager_link.replace('#URL#', 'index-' + str(page_num + next_page_num) + '.html').replace('#TEXT#', str(page_num + next_page_num)).replace('#CSS_CLASS#', css)
    # skip forward far
    next_skip = page_num + pager_skip_long
    css = ''
    if page_num == pages:
        css = 'disabled'
    if next_skip > pages:
        next_skip = pages
    url = 'index'
    if next_skip > 1:
        url += '-' + str(next_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '›››').replace('#CSS_CLASS#', css)
    # skip forward
    next_skip = page_num + pager_skip
    css = ''
    if page_num == pages:
        css = 'disabled'
    if next_skip > pages:
        next_skip = pages
    url = 'index'
    if next_skip > 1:
        url += '-' + str(next_skip)
    url += '.html'
    html_pager += template_index_pager_link.replace('#URL#', url).replace('#TEXT#', '››').replace('#CSS_CLASS#', css)
    # next
    css = ''
    next_num = page_num + 1
    if page_num == pages:
        css = 'disabled'
        next_num = pages
    html_pager += template_index_pager_link.replace('#URL#', 'index-' + str(next_num) + '.html').replace('#TEXT#', '›').replace('#CSS_CLASS#', css)
    return html_pager

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-score', default=0, type=int, help='limit post rendering, default 0')
    parser.add_argument('--min-comments', default=0, type=int, help='limit post rendering, default 0')
    parser.add_argument('--hide-deleted-comments', action='store_true', help='exclude deleted and removed comments where possible')
    args = parser.parse_args()
    generate_html(args.min_score, args.min_comments, args.hide_deleted_comments)
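
# Example invocation, run from the directory containing data/ and templates/
# (threshold values illustrative):
#   python write_html.py --min-score 100 --min-comments 100 --hide-deleted-comments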