# write_rss_feed.py -- AWS Lambda: rebuild podcast RSS feeds when mp3s change on S3.
from __future__ import print_function

import calendar
import json
import re
from os import path
from xml.sax.saxutils import escape

try:  # Python 2
    from urllib import quote, quote_plus, unquote_plus
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.parse import quote, quote_plus, unquote_plus
    from urllib.parse import urljoin

try:  # Python 2 (old capitalised module name)
    from email.Utils import formatdate
except ImportError:  # Python 3
    from email.utils import formatdate

import boto3
from botocore.exceptions import ClientError
  18. class LambdaTestButton(Exception):
  19. pass
  20. print('Loading function')
  21. s3 = boto3.client('s3')
  22. FEED_TEMPLATE = """
  23. <rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" version="2.0">
  24. <channel>
  25. <title>{title}</title>
  26. <description>{description}</description>
  27. <link>{url}</link>
  28. <language>en-us</language>{items}
  29. </channel>
  30. </rss>
  31. """
  32. ITEM_TEMPLATE = """
  33. <item>
  34. <title>{title}</title>
  35. <description />
  36. <enclosure url="{url}" type="audio/mpeg" length="{filesize}" />
  37. <itunes:duration>{length_secs}</itunes:duration>
  38. <pubDate>{date}</pubDate>
  39. </item>"""
  40. DOMAIN = 'http://{bucket}.s3-{region}.amazonaws.com'
  41. FEED_FILENAME = 'feed.xml'
  42. TEST_BUCKET = 'sourcebucket'
  43. def natural_key(string_):
  44. """Split string_ into number / letter words, so e.g. A2 is lower than A10
  45. From http://stackoverflow.com/a/3033342/15890"""
  46. return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]
  47. def rssfeed(feed_data, items):
  48. item_xml = ''.join(
  49. ITEM_TEMPLATE.format(**item) for item in items
  50. )
  51. return FEED_TEMPLATE.format(items=item_xml, **feed_data)
  52. def deltaed_date_as_str(base_date, delta):
  53. dsecs = delta * 24 * 60 * 60
  54. return formatdate(dsecs + float(base_date.strftime('%s')))
  55. def episode_data(i, object_data, bucket, region):
  56. key = object_data['Key']
  57. fn = path.basename(key)
  58. title = path.splitext(fn)[0]
  59. filesize = object_data['Size']
  60. dt = object_data['LastModified']
  61. domain = DOMAIN.format(bucket=bucket, region=region)
  62. return {
  63. 'title': escape(title),
  64. 'url': urljoin(domain, quote_plus(key, safe='/')),
  65. 'filesize': filesize,
  66. # dumb guess about duration
  67. 'length_secs': filesize / 1500,
  68. 'date': deltaed_date_as_str(dt, i),
  69. }
  70. def get_episode_data(bucket, folder, region):
  71. """Extract the following episode data:
  72. title, url, filesize, length_secs, date
  73. """
  74. folder = (folder.rstrip('/') + '/').lstrip('/')
  75. print('s3.list_objects_v2(Bucket={!r}, Prefix={!r})'.format(
  76. bucket, folder))
  77. data = s3.list_objects_v2(Bucket=bucket, Prefix=folder)
  78. episodes = sorted(
  79. data['Contents'],
  80. key=lambda x: natural_key(x['Key']),
  81. reverse=True,)
  82. return [
  83. episode_data(i, obj, bucket, region)
  84. for i, obj in enumerate(episodes)
  85. if obj['Key'] != folder
  86. if obj['Key'].endswith(('.mp3', '.m4a', '.m4b'))
  87. if not obj['Key'].startswith('_')
  88. ]
  89. def write_feed(bucket, folder, region):
  90. episode_data = get_episode_data(bucket, folder, region)
  91. feed_path = path.join(folder, FEED_FILENAME)
  92. domain = DOMAIN.format(bucket=bucket, region=region)
  93. encoded_path = quote_plus(feed_path, safe='/')
  94. feed_url = urljoin(domain, encoded_path)
  95. print(feed_path, feed_url)
  96. feed_data = {
  97. 'title': escape(folder),
  98. 'description': escape(folder),
  99. 'url': feed_url,
  100. 'path': feed_path,
  101. 'encoded_path': encoded_path,
  102. }
  103. feed = rssfeed(feed_data, episode_data)
  104. print(feed)
  105. s3.put_object(
  106. Bucket=bucket,
  107. Key=feed_path,
  108. Body=feed,
  109. ContentType='application/xml'
  110. )
  111. return feed_data
  112. def write_index(bucket, feed_data):
  113. try:
  114. index = s3.get_object(
  115. Bucket=bucket,
  116. Key='feeds.json',)
  117. feed_index = json.load(index['Body'])
  118. except ClientError as e:
  119. error_code = e.response['Error']['Code']
  120. if error_code == 'NoSuchKey':
  121. feed_index = {}
  122. else:
  123. raise e
  124. feed_path = feed_data['encoded_path']
  125. feed_index[feed_path] = feed_data
  126. s3.put_object(
  127. Bucket=bucket,
  128. Key='feeds.json',
  129. Body=json.dumps(feed_index, indent=4),
  130. ContentType='application/json'
  131. )
  132. index_template = """
  133. <html>
  134. <body>
  135. {}
  136. </body>
  137. </html>
  138. """
  139. feed_links = [
  140. '<li><a href="{0[url]}">{0[title]}</a></li>'.format(feed)
  141. for feed in feed_index.values()
  142. ]
  143. html = index_template.format('<br>\n'.join(feed_links))
  144. s3.put_object(
  145. Bucket=bucket,
  146. Key='index.html',
  147. Body=html,
  148. ContentType='text/html'
  149. )
  150. def get_bucket(event):
  151. upload = event['Records'][0]['s3']
  152. try:
  153. bucket = upload['bucket']['name']
  154. except KeyError:
  155. raise LambdaTestButton
  156. else:
  157. if bucket == TEST_BUCKET:
  158. raise LambdaTestButton
  159. return bucket
  160. def get_default_bucket():
  161. return [
  162. b['Name'] for b in s3.list_buckets()['Buckets']
  163. if 'podcast' in b['Name']][0]
  164. def get_folders(event, bucket):
  165. print('get_folders')
  166. upload = event['Records'][0]['s3']
  167. key = unquote_plus(upload['object']['key'])
  168. print('Key={}'.format(key))
  169. folder = path.dirname(key)
  170. print('Folder={}'.format(folder))
  171. if folder:
  172. return {folder}
  173. key_data = s3.list_objects_v2(Bucket=bucket)
  174. keys = [k['Key'] for k in key_data['Contents']]
  175. print('keys={}'.format(keys))
  176. return {path.dirname(key) for key in keys if path.dirname(key)}
  177. def get_region(event, is_test_button):
  178. if is_test_button:
  179. return 'eu-west-1'
  180. return event['Records'][0]['awsRegion']
  181. def lambda_handler(event, context):
  182. """Write an RSS Podcast Feed upon any change to mp3s on S3.
  183. - An mp3 file has just been uploaded / deleted
  184. - Extract the podcast name from the "folder"
  185. - Collect details from each mp3 in the folder:
  186. - Filename
  187. - Size
  188. - Generate RSS Feed XML
  189. - Write RSS Feed
  190. """
  191. print("Received event: {}".format(json.dumps(event, indent=2)))
  192. is_test_button = False
  193. try:
  194. bucket = get_bucket(event)
  195. except LambdaTestButton:
  196. is_test_button = True
  197. bucket = get_default_bucket()
  198. region = get_region(event, is_test_button)
  199. folders = get_folders(event, bucket)
  200. print('Folders={}'.format(folders))
  201. print('Region={}, Bucket={}'.format(region, bucket))
  202. log_data = {}
  203. for folder in folders:
  204. print('Folder={}'.format(folder))
  205. feed_data = write_feed(bucket, folder, region)
  206. write_index(bucket, feed_data)
  207. log_data[folder] = feed_data
  208. return log_data