mailCollect.py

#!/usr/bin/env python
# NOTE: this script targets Python 2 (it uses urllib2).
import sys
import urllib2
import re
from optparse import OptionParser
import time
from datetime import date, timedelta
import boto3  # pip install boto3
# configure aws cli

# Matches both plain addresses ("user@example.com") and lightly obfuscated
# ones ("user at example dot com").
regex = re.compile(("([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
                    "{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
                    "\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"))


def get_emails(s):
    """Returns an iterator of matched emails found in string s."""
    # Removing lines that start with '//' because the regular expression
    # mistakenly matches patterns like 'http://foo@bar.com' as '//foo@bar.com'.
    return (email[0] for email in re.findall(regex, s) if not email[0].startswith('//'))

#def find_emails(str):
#    emails = re.findall(r'[\w\.-]+@[\w\.-]+', str)
#    for email in emails:
#        print email
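# Illustrative example (input made up for this note, not part of the original script):
#   >>> list(get_emails('leaked: alice@example.com, bob at example dot org'))
#   ['alice@example.com', 'bob at example dot org']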


def read_url(url, f):
    """Scrape paste links from `url` and write any e-mail addresses found
    in pastebin pastes to the open file object `f`."""
    website = urllib2.urlopen(url)
    # read html code
    html = website.read()
    # Keep only the part of the page posted today: cut everything from
    # yesterday's "Posted:" timestamp onwards.
    yesterday = date.today() - timedelta(1)
    todays = html[:html.find(yesterday.strftime("Posted:%a, %d %b %Y"))]
    # use re.findall to get all the links
    links = re.findall('"((http|ftp)s?://.*?)"', todays)
    for link in links:
        if "pastebin" in link[0]:
            try:
                website = urllib2.urlopen(link[0])
            except urllib2.HTTPError as e:
                print("Error")
                print(link)
                print(e)
            else:
                mail = website.read()
                for email in get_emails(mail):  # TODO replace get with find ???
                    # print email
                    f.write(email + '\n')
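# NOTE (assumption): read_url() relies on https://haveibeenpwned.com/Pastes/Latest
# rendering paste links next to "Posted:<date>" labels in its HTML; if that page
# layout changes, the slice on yesterday's date above will no longer work.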


if __name__ == '__main__':
    filename = time.strftime("Mail-%d-%m-%Y.csv")
    if len(sys.argv) == 2:
        filename = sys.argv[1] + "-parsed-" + filename
    with open(filename, 'w') as f:
        # f.write('This is a test\n')
        url = "https://haveibeenpwned.com/Pastes/Latest"
        if len(sys.argv) == 2:
            print(sys.argv[1])
            with open(sys.argv[1], "r") as myfile:
                data = myfile.readlines()
            for line in data:
                for email in get_emails(line):
                    f.write(email + ',\n')
        #read_url(url, f)
    # Upload the finished CSV to S3 after the with-block has closed the file.
    s3 = boto3.resource('s3')
    with open(filename, 'rb') as data:
        print(s3.Bucket('magazinosmail').put_object(Key=filename, Body=data))
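# Usage (inferred from the argument handling above, not documented in the source;
# dump.txt stands for any input file):
#   python mailCollect.py            -> writes Mail-DD-MM-YYYY.csv; nothing is
#                                       collected unless the read_url() call is re-enabled
#   python mailCollect.py dump.txt   -> extracts addresses from dump.txt into
#                                       dump.txt-parsed-Mail-DD-MM-YYYY.csv
# In both cases the CSV is then uploaded to the 'magazinosmail' S3 bucket, which
# assumes AWS credentials have already been configured ("configure aws cli" above).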