mailCollect.py

#!/usr/bin/env python
# NOTE: this script targets Python 2 (it uses urllib2).
import sys
import urllib2
import re
from optparse import OptionParser
import time
from datetime import date, timedelta
import boto3  # pip install boto3
# configure aws cli

# Matches both plain addresses ("user@example.com") and lightly obfuscated
# ones ("user at example dot com").
regex = re.compile(("([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
                    "{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
                    "\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"))


def get_emails(s):
    """Returns an iterator of matched emails found in string s."""
    # Removing lines that start with '//' because the regular expression
    # mistakenly matches patterns like 'http://foo@bar.com' as '//foo@bar.com'.
    return (email[0] for email in re.findall(regex, s) if not email[0].startswith('//'))

#def find_emails(str):
#    emails = re.findall(r'[\w\.-]+@[\w\.-]+', str)
#    for email in emails:
#        print email
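# Illustrative example (input made up for this note, not part of the original script):
#   >>> list(get_emails('leaked: alice@example.com, bob at example dot org'))
#   ['alice@example.com', 'bob at example dot org']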


def read_url(url, f):
    """Scrape paste links from `url` and write any e-mail addresses found
    in pastebin pastes to the open file object `f`."""
    website = urllib2.urlopen(url)
    # read html code
    html = website.read()
    # Keep only the part of the page posted today: cut everything from
    # yesterday's "Posted:" timestamp onwards.
    yesterday = date.today() - timedelta(1)
    todays = html[:html.find(yesterday.strftime("Posted:%a, %d %b %Y"))]
    # use re.findall to get all the links
    links = re.findall('"((http|ftp)s?://.*?)"', todays)
    for link in links:
        if "pastebin" in link[0]:
            try:
                website = urllib2.urlopen(link[0])
            except urllib2.HTTPError as e:
                print("Error")
                print(link)
                print(e)
            else:
                mail = website.read()
                for email in get_emails(mail):  # TODO replace get with find ???
                    # print email
                    f.write(email + '\n')
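# NOTE (assumption): read_url() relies on https://haveibeenpwned.com/Pastes/Latest
# rendering paste links next to "Posted:<date>" labels in its HTML; if that page
# layout changes, the slice on yesterday's date above will no longer work.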


if __name__ == '__main__':
    filename = time.strftime("Mail-%d-%m-%Y.csv")
    if len(sys.argv) == 2:
        filename = sys.argv[1] + "-parsed-" + filename
    with open(filename, 'w') as f:
        # f.write('This is a test\n')
        url = "https://haveibeenpwned.com/Pastes/Latest"
        if len(sys.argv) == 2:
            print(sys.argv[1])
            with open(sys.argv[1], "r") as myfile:
                data = myfile.readlines()
            for line in data:
                for email in get_emails(line):
                    f.write(email + ',\n')
        #read_url(url, f)
    # Upload the finished CSV to S3 after the with-block has closed the file.
    s3 = boto3.resource('s3')
    with open(filename, 'rb') as data:
        print(s3.Bucket('magazinosmail').put_object(Key=filename, Body=data))
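# Usage (inferred from the argument handling above, not documented in the source;
# dump.txt stands for any input file):
#   python mailCollect.py            -> writes Mail-DD-MM-YYYY.csv; nothing is
#                                       collected unless the read_url() call is re-enabled
#   python mailCollect.py dump.txt   -> extracts addresses from dump.txt into
#                                       dump.txt-parsed-Mail-DD-MM-YYYY.csv
# In both cases the CSV is then uploaded to the 'magazinosmail' S3 bucket, which
# assumes AWS credentials have already been configured ("configure aws cli" above).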