scrap.py

# -*- coding: utf-8 -*-
import argparse
import codecs
import shutil
import os
import json
import hashlib
from operator import attrgetter

import bleach
import dateutil.parser
import requests
from jinja2 import Environment, FileSystemLoader

THEME_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'theme')
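

# The three classes below are plain wrappers around items from the exported
# Graph API JSON: they pull out the fields the templates need (author, date,
# linkified message text, likes, comments and attached pictures).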
class Author(object):
    def __init__(self, data):
        self.id = data['id']
        self.name = data['name']


class Comment(object):
    def __init__(self, item):
        self.id = item['id']
        self.content = None
        self.picture = item['picture'] if 'picture' in item else None
        self.content = bleach.linkify(item['message']) if 'message' in item else None
        self.author = Author(item['from'])
        self.date = dateutil.parser.parse(item['created_time'])
        self.likes = [Author(d) for d in item['likes']['data']] if 'likes' in item else []


class Entry(object):
    def __init__(self, item):
        self.id = item['id']
        self.picture = item['picture'] if 'picture' in item else None
        self.content = bleach.linkify(item['message']) if 'message' in item else None
        self.author = Author(item['from']) if 'from' in item else None
        self.date = dateutil.parser.parse(item['created_time'])
        self.likes = [Author(d) for d in item['likes']['data']] if 'likes' in item else []
        self.comments = [Comment(d) for d in item['comments']['data']] if 'comments' in item else []


def render_template(output_path, tpl_name, filename, **options):
    env = Environment(loader=FileSystemLoader(THEME_PATH))
    template = env.get_template(tpl_name)
    output = template.render(**options)
    full_path = os.path.join(output_path, filename)
    with codecs.open(full_path, 'w+', encoding='utf-8') as f:
        f.write(output)


def copy(source, destination):
    """Recursively copy source into destination.

    Taken from pelican.

    If source is a file, destination has to be a file as well.
    The function is able to copy either files or directories.

    :param source: the source file or directory
    :param destination: the destination file or directory
    """
    source_ = os.path.abspath(os.path.expanduser(source))
    destination_ = os.path.abspath(os.path.expanduser(destination))

    if os.path.isfile(source_):
        dst_dir = os.path.dirname(destination_)
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        shutil.copy2(source_, destination_)

    elif os.path.isdir(source_):
        if not os.path.exists(destination_):
            os.makedirs(destination_)
        if not os.path.isdir(destination_):
            return

        for src_dir, subdirs, others in os.walk(source_):
            dst_dir = os.path.join(destination_,
                                   os.path.relpath(src_dir, source_))
            if not os.path.isdir(dst_dir):
                # Parent directories are known to exist, so 'mkdir' suffices.
                os.mkdir(dst_dir)

            for o in others:
                src_path = os.path.join(src_dir, o)
                dst_path = os.path.join(dst_dir, o)
                if os.path.isfile(src_path):
                    shutil.copy2(src_path, dst_path)
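

# `download` stores each remote file under the md5 hash of its URL, so a
# picture that was already fetched on a previous run is not downloaded twice.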
def download(url, output_path):
    print("downloading %s" % url)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    m = hashlib.md5()
    # hashlib expects bytes rather than str under Python 3.
    m.update(url.encode('utf-8'))
    filename = m.hexdigest()
    file_path = os.path.join(output_path, filename)

    if not os.path.exists(file_path):
        resp = requests.get(url, stream=True)
        if resp.status_code == 200:
            with open(file_path, 'wb') as f:
                resp.raw.decode_content = True
                shutil.copyfileobj(resp.raw, f)
    return filename
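

# get_attachments queries the Graph API attachments edge for an entry and
# stores the downloaded picture filenames on entry.pictures: photo attachments
# are downloaded directly, albums are expanded through their subattachments,
# and only the first attachment's pictures end up on the entry.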
def get_attachments(item, image_path, token):
    resp = requests.get('https://graph.facebook.com/v2.12/%s/attachments' % item.id, params={
        'access_token': token,
    })
    if 'data' in resp.json() and resp.json()['data']:
        data = resp.json()['data']
        item.pictures = [get_from_type(d, image_path) for d in data][0]


def get_from_type(item, image_path):
    if item['type'] == 'photo':
        return [download(item['media']['image']['src'], image_path)]
    elif item['type'] == 'album':
        return [get_from_type(d, image_path)[0] for d in item['subattachments']['data']]


def parse_data(data):
    entries = [Entry(d) for d in data]
    entries.sort(key=attrgetter('date'))
    return entries


def enhance_entries(entries, output_path, token):
    pictures_path = os.path.join(output_path, 'pictures')
    for entry in entries:
        get_attachments(entry, pictures_path, token)
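

# Main pipeline: load the JSON dump, build Entry objects sorted by date,
# download the attached pictures, then render everything into index.html.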
def generate_archive(data, output_path, token):
    with open(data, 'r') as f:
        data_json = json.load(f)
    entries = parse_data(data_json)
    enhance_entries(entries, output_path, token)
    render_template(output_path, 'index.html', 'index.html', entries=entries)


def copy_assets(output_path):
    copy(os.path.join(THEME_PATH, 'fonts'), os.path.join(output_path, 'fonts'))
    copy(os.path.join(THEME_PATH, 'assets'), os.path.join(output_path, 'assets'))


def parse_args():
    parser = argparse.ArgumentParser(description='Generate facebook group archive pages.')
    parser.add_argument('--data', dest='data', default='data.json',
                        help='Location of the JSON file containing the data.')
    parser.add_argument('--output', dest='output_path',
                        default='output',
                        help='Path where to output the generated files.')
    parser.add_argument('--token', dest='token', help='the access token from Facebook graph API.')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    generate_archive(args.data, args.output_path, args.token)
    copy_assets(args.output_path)
    print('')
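
# Example invocation (the token value is a placeholder; --data and --output
# fall back to 'data.json' and 'output' as defined in parse_args):
#
#     python scrap.py --data data.json --output output --token <ACCESS_TOKEN>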