import re
from functools import reduce

import nltk
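
# Derupe a tweet: split it into sentences, score each candidate fragment, and
# keep the single strongest fragment as a new, shorter tweet. Assumes the NLTK
# punkt tokenizer data has already been downloaded (nltk.download('punkt')).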

def score_by_length(rated_parts):
    '''Take a list of (phrase_part, score) tuples and boost the shorter parts:
    +2 for the shortest part, +1 for the second shortest.'''
    if len(rated_parts) < 2:
        return rated_parts
    lengths = sorted(len(part) for part, _ in rated_parts)
    rescored = []
    for part, score in rated_parts:
        length = len(part)
        if length == lengths[0]:
            score += 2
        elif length == lengths[1]:
            score += 1
        rescored.append((part, score))
    return rescored

def rupe_score(phrase):
    '''
    Rupe scoring: score a phrase from a tweet, sometimes truncating the phrase further.
    '''
    score = 0
    # If it starts with "But", dock points
    if re.match(r"^[,\s]?[Bb]ut\s", phrase):
        score -= 4

    # Blocklist: reject these phrases outright
    if phrase in ['i.e.', 'ie.', 'e.g.', 'eg.', 'More', 'More.']:
        return (phrase, -100)

    # If it includes "but" in the middle, keep only the part before it and add a point
    if re.match(r".+[,\s]but\s", phrase):
        phrase = re.split(r"\sbut\s", phrase)[0]
        score += 1
    return (phrase, score)

def best_part(rated_parts):
    # Fold the rated tweet parts into a single suggestion; on a tie of scores,
    # the earlier part takes precedence
    part = reduce(lambda x, y: y if y[1] > x[1] else x, rated_parts, (None, -100))
    if part[0] is None:
        return None
    # Only suggest parts between 6 and 89 characters long
    n = len(part[0])
    if 5 < n < 90:
        return part[0]
    return None

def derupe(tweet):
    # Initialize the tokenizer each time? Why not; no need to complicate a bot
    # that tweets once or twice a day
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    parts = tokenizer.tokenize(tweet)
    ratings = [rupe_score(part.strip()) for part in parts]
    length_ranked_ratings = score_by_length(ratings)
    deruped_tweet = best_part(length_ranked_ratings)
    # Abort if the new tweet is barely shorter than the original
    if deruped_tweet and len(deruped_tweet) > (len(tweet) - 5) and len(tweet) > 10:
        return None
    return deruped_tweet

def main():
    with open('test-tweets.txt', 'r') as tweet_file:
        tweets = [t.rstrip() for t in tweet_file.readlines()]

    for tweet in tweets:
        deruped_tweet = derupe(tweet)
        if deruped_tweet:
            print("ORIGINAL: %s" % tweet)
            print("NEW TWEET: %s" % deruped_tweet)

if __name__ == '__main__':
    main()