# rupes.py (2.4 KB)

  1. import nltk
  2. import re
  3. def score_by_length(rated_parts):
  4. ''' Take a list of (phrase_part, score) tuples and score them by minimum length '''
  5. if len(rated_parts) < 2:
  6. return rated_parts
  7. scores = sorted([len(item[0]) for item in rated_parts])
  8. b = []
  9. for t in rated_parts:
  10. l = len(t[0])
  11. score = t[1]
  12. if l == scores[0]:
  13. score += 2
  14. elif l == scores[1]:
  15. score += 1
  16. b += [(t[0], score)]
  17. return b
  18. def rupe_score(phrase):
  19. '''
  20. Rupe scoring: Score phrases from a tweet, sometimes truncating the phrase further.
  21. '''
  22. score = 0
  23. # If it starts with "But" then dock points
  24. if re.match("^[,\s]?[Bb]ut\s", phrase):
  25. score -= 4
  26. # Remove Blocklist
  27. if phrase in ['i.e.', 'ie.', 'e.g.', 'eg.', 'More', 'More.']:
  28. return (phrase, -100)
  29. # If it includes "but" in the middle, just strip it out and add points
  30. if re.match(".+[,\s]but\s", phrase):
  31. phrase = re.split("\sbut\s", phrase)[0]
  32. score += 1
  33. return (phrase, score)
  34. def best_part(rated_parts):
  35. # Fold rated tweet parts into a single suggestion
  36. # if parts of tweet have the same score, the earlier one takes precedence
  37. part = reduce(lambda x,y: y if y[1] > x[1] else x, rated_parts, (None, -100))
  38. n = len(part[0])
  39. if n < 90 and n > 5:
  40. return part[0]
  41. else:
  42. return None
  43. def derupe(tweet):
  44. # Initialize tokenizer each time? Why not, no need to complicate a bot
  45. # that tweets once or twice a day
  46. tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
  47. parts = tokenizer.tokenize(tweet)
  48. ratings = [rupe_score(part.strip()) for part in parts]
  49. length_ranked_ratings = score_by_length(ratings)
  50. deruped_tweet = best_part(length_ranked_ratings)
  51. # Abort if the new tweet is too much of the old one
  52. if deruped_tweet and len(deruped_tweet) > (len(tweet)-5) and len(tweet) > 10:
  53. return None
  54. return deruped_tweet
  55. def main():
  56. tweet_file = open('test-tweets.txt', 'r+')
  57. tweets = [t.rstrip() for t in tweet_file.readlines()]
  58. tweet_file.close()
  59. for tweet in tweets:
  60. deruped_tweet = derupe(tweet)
  61. if deruped_tweet:
  62. print ("ORIGINAL: %s" % tweet)
  63. print ("NEW TWEET: %s" % deruped_tweet)
  64. if __name__ == '__main__':
  65. main()