reddit_ingest.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import requests, json, praw
  2. from api_authentication import reddit_obj, get_aws_secret
  3. # Create the Reddit object (read only) to use in API calls
  4. cred = get_aws_secret("hpg-keys","us-west-2")["reddit"]
  5. REDDIT = reddit_obj(
  6. client_id=cred['clientid'],
  7. client_secret=cred['clientsecret'],
  8. user=cred['user'])
  9. def get_posts(subreddit_name, post_count):
  10. # retrieve post titles from subreddits and return as a list to process.
  11. titles = [r.title for r in REDDIT.subreddit(subreddit_name).hot(limit=post_count)]
  12. return titles
  13. def parse_title(title):
  14. """Try to parse a title for a song name and artist
  15. This is based off common naming convention used in subreddit... no ML yet :)
  16. Rules:::
  17. - any post starting with [FRESH] will follow with <ARTIST> - <SONG> ...
  18. - posts with <NAME> - <WORDS> is almost always a song
  19. - any titles with `ft.` will have a feature, meaning a song"""
  20. # Rule One: look for a dash within post title or if it is a discussion post
  21. dash_loc = title.find('-')
  22. if dash_loc == -1 or title[:12].lower() == '[discussion]':
  23. return None
  24. else:
  25. pass
  26. # Look out for content after brackets
  27. if title[0] == "[":
  28. artist = title[(title.find(']')+2) : (dash_loc)]
  29. song = title[(dash_loc+1) :]
  30. # Look out for a regular song post
  31. else:
  32. artist = title[: dash_loc]
  33. song = title[dash_loc+1 :]
  34. # Now parse any extra off the end of a post
  35. info_loc = title.rfind('(')
  36. # has -1 index if there is none. If it comes after dash, it is info.
  37. song = song if info_loc < dash_loc else title[(dash_loc+1) : (info_loc)]
  38. # trim more extra off if in brackets
  39. song = song if song.find('[') == -1 else song[: song.find('[')-1]
  40. # handle features on a track.
  41. artist = artist if "ft." not in artist else artist[: artist.find('ft.')]
  42. # return dictionary of song posts
  43. song_post = {"artist":artist, "track": song}
  44. print(f"\nSong post we found so far:\n\t{song_post}")
  45. return song_post