redact_all_1.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. def redact(data):
  2. # Redaction
  3. original_text = data
  4. print('Redact Dates and Times...')
  5. data = redact_datetime.datestimes(data)
  6. sleep(1)
  7. print('Redact Address...')
  8. data = redact_address.address(data)
  9. sleep(1)
  10. print('Redact Numerics...')
  11. data = redact_numerics.numerics(data)
  12. sleep(1)
  13. print('Redact Emails...')
  14. data = redact_emails.email(data)
  15. sleep(1)
  16. print('Redact Names...')
  17. data = redact_names.names(data)
  18. sleep(1)
  19. print('Redact Urls...')
  20. data = redact_urls.urls(data)
  21. sleep(1)
  22. df = pd.DataFrame(columns=['Original', 'Redacted'])
  23. df['Original'] = original_text
  24. df['Redacted'] = data
  25. return data
  26. ############################################ Test
  27. #################################################
  28. # The following is a simulated dataset that contains addresses, sensitive numerics, emails, entity names, URLs, and dates/times.
  29. # Texts = [
  30. # 'I had an ok experience and I live close by 2000 Vernier rd grosse pointe woods MI 48236. I had a good time at 2999 vernier',
  31. # 'I used to know someone who lived at, 2025 magna rd grosse pointe MI 48237 they loved it and told us many cool stories about the lake',
  32. # 'I liked their services 22000 moross rd, detroit MI 48236", "lots of diverse life experiences at 6233 orlina st, apt 1001, detroit MI 48217',
  33. # '2013 1st ambonstreet", "245e ousterkade 9", "oh yeah, I had a great time at 20225 Liverni a really really good time',
  34. # '1231451469', '42.2', '123 145 1469', '123.145.1469', '(123) 145.1469', '(123) 145 1469',
  35. # '(123) 145–1469', '123–145–1469', '+1(123) 145–1469 ', '1234567890999111', '123HELLO56',
  36. # '-123', '04/04/1998', 'it’s015–96–0342 you know my number call me', '+123–145–1469',
  37. # '48236–123', 'I live close to (42.293564, -83.638916)', '123-4-5648', '1-234-5-6789',
  38. # 'I used these two email mouafek.ayadi@esprit.tn, moufak.ayadi@oddo-bhf.com',
  39. # 'this is another email afek.aadi@esit.com',
  40. # 'they work at Microsoft, and my name is Sami',
  41. # 'Google CEO is Sunder Pichai',
  42. # 'http://url.com','http://www.url.com/',
  43. # 'https://url.com/bla3/blah3/', 'www.google.com',
  44. # 'I eat potato at 05:30 PM and im happy, then i eat again at 10:12 AM',
  45. # '2018-03-14 06:08:18, he went on 2018-03-15 06:08:18,2018-03-15 slkfldfjezli'
  46. # ]
  47. # print(redact(pd.Series(Texts)).values)