fetcher.py

"""Fetch Facebook group feeds, posts, and comments via the Graph API,
storing everything as JSON files under per-group directories."""
import os
import json
from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode, quote

import requests

import config
from groupie import models
from groupie.utils import get_path
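
# A 'pointer' file per group records the paging URL where the previous run
# stopped, so each run only fetches posts newer than what is already on disk.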
def get_pointer(group):
    path = group.get_path('pointer')
    if os.path.exists(path):
        with open(path) as fp:
            return fp.read().strip()
    return None
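
# Normalize the paging URL before saving it as the new pointer.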
def update_pointer(group, ptr):
    print('Update pointer:', ptr)
    if ptr is None:
        print('Not updating...')
        return
    scheme, netloc, path, query, frag = urlsplit(ptr)
    # Drop '__'-prefixed bookkeeping params; XXX: repeated params are
    # collapsed to their first value.
    query = {k: v[0] for k, v in parse_qs(query).items()
             if not k.startswith('__')}
    ptr = urlunsplit((scheme, netloc, path, urlencode(query), frag))
    with open(group.get_path('pointer'), 'w') as fp:
        fp.write(ptr)
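
# Walk a post's comment thread page by page, writing one JSON file per
# comment under <group>/comments/<post id>/.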
def fetch_comments(group, post):
    comments = post.get('comments')
    if not comments:
        return
    while True:
        comment_dir = ensure_dir(group.slug, 'comments', post['id'])
        for comment in comments.get('data', ()):
            with open(os.path.join(comment_dir, comment['id']), 'w') as fp:
                json.dump(comment, fp)
        next_url = comments.get('paging', {}).get('next')
        if not next_url:
            break
        print('Fetching comments:', next_url)
        comments = requests.get(next_url).json()
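
# Store each post as JSON and pull down its full comment thread.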
def update_posts(group, posts):
    for post in posts:
        fetch_comments(group, post)
        with open(group.get_path('posts', post['id']), 'w') as fp:
            json.dump(post, fp)
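
# Page through the group feed. Returns the 'previous' paging URL of the
# first page fetched, which update_pointer() saves for the next run.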
def fetch_feed(group, initial_url=None):
    # With a saved pointer we fetch exactly one page of updates; without
    # one we page through the entire feed from the top.
    only_once = True
    if not initial_url:
        only_once = False
        params = {'access_token': config.ACCESS_TOKEN, 'limit': '100'}
        initial_url = 'https://graph.facebook.com/%s/feed?%s' % (
            quote(group.id), urlencode(params))
    update_url = None
    url = initial_url
    while url:
        print('*', url)
        r = requests.get(url)
        data = r.json()
        update_posts(group, data.get('data', []))
        paging = data.get('paging', {})
        url = paging.get('next')
        if update_url is None:
            update_url = paging.get('previous')
        if only_once:
            break
    return update_url
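
# Snapshot the group's metadata to the 'info' file.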
def fetch_info(group):
    r = requests.get('https://graph.facebook.com/%s' % group.id)
    with open(group.get_path('info'), 'w') as fp:
        json.dump(r.json(), fp)
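
# mkdir -p style helper: create the directory if needed and return its path.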
def ensure_dir(*components):
    path = get_path(*components)
    os.makedirs(path, exist_ok=True)
    return path
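
# Fetch everything for one configured group, then advance its pointer.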
def main(group_slug):
    print('** Fetching:', group_slug)
    try:
        group = models.Group.get(group_slug)
    except models.GroupNotFound:
        # First run for this group: create it from the configured id.
        group = models.Group({})
        group.slug = group_slug
        group.id = config.GROUPS[group_slug]
    ensure_dir(group.slug)
    ensure_dir(group.slug, 'posts')
    ensure_dir(group.slug, 'comments')
    fetch_info(group)
    update_pointer(group, fetch_feed(group, get_pointer(group)))

if __name__ == '__main__':
    for slug in config.GROUPS:
        main(slug)
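
# For reference, a minimal config.py this script assumes (both names are
# taken from the uses above; the values here are placeholders):
#
#     ACCESS_TOKEN = '...'  # a Graph API token with access to the groups
#     GROUPS = {'my-group': '123456789'}  # slug -> Facebook group id
#
# Running `python fetcher.py` then fetches every configured group in turn.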