redact_address_3.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334
  1. def addy2(w):
  2. gm = re.sub(r'(\d\w+\b)\s+\b(\w+)\b\s+(\w+)\s+\b\D+$', r'\1 \2 \3', w)
  3. return gm
  4. ###################### Test addy2
  5. #print(addy2(w[0]))
  6. # Get the exact addresses
  7. data['addressadd'] = data['Text'].apply(addy)
  8. data['addressadd2'] = data['addressadd'].map(lambda x: [addy2(i) for i in x])
  9. #print(data)
  10. # Replace the addresses from the data set.
  11. # reduce() applies the lambda function in a loop to compute the cumulative sum of the items in numbers.
  12. # The lambda function takes two arguments and returns their sum. numbers = [1, 2, 3, 4] reduce(lambda a, b: a + b, numbers)
  13. # itertools.product() computes the Cartesian product of the given iterables, e.g. arr1 = [1, 2] arr2 = [5, 6] list(product(arr1, arr2)) => [(1, 5), (1, 6), (2, 5), (2, 6)]; if strings are passed, each char of the first string is paired with each char of the other. Below, each extracted address is paired with the single replacement string 'xxx'.
  14. # The use of list comprehension and mapping functions to eliminate the need to explicitly create loops and to increase speed.
  15. # Full Explanation
  16. # We use apply to go through the whole series; for each row we apply reduce.
  17. data['Redacted_Address'] = data.apply(lambda x: reduce(lambda a, r: a.replace(*r),list(product(x['addressadd2'], ['xxx'])), # this is part of the second lambda
  18. x['Text']), axis=1) # this is part of the first lambda
  19. # reduce(lambda a, b: a + b, numbers)
  20. return pd.Series(data['Redacted_Address'])
  21. ############################################### Test
  22. ####################################################
  23. # # Here, by combining regex with list comprehension and a few mapping functions, we are able to identify and remove several different address formats from a data set.
  24. # Texts = ["I had an ok experience and I live close by 2000 Vernier rd grosse pointe woods MI 48236. I had a good time at 2999 vernier",
  25. # "I used to know someone who lived at, 2025 magna rd grosse pointe MI 48237 they loved it and told us many cool stories about the lake", "I liked their services 22000 moross rd, detroit MI 48236", "lots of diverse life experiences at 6233 orlina st, apt 1001, detroit MI 48217",
  26. # "2013 1st ambonstreet", "245e ousterkade 9", "oh yeah, I had a great time at 20225 Liverni a really really good time" ]
  27. # data = pd.Series(Texts)
  28. # print(address(data))