redact_all.py 4.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. import pandas as pd
  2. from time import sleep
  3. import redact_address
  4. import redact_numerics
  5. import redact_emails
  6. import redact_names
  7. import redact_urls
  8. import redact_datetime
  9. # Texts = [
  10. # 'I had an ok experience and I live close by 2000 Vernier rd grosse pointe woods MI 48236. I had a good time at 2999 vernier',
  11. # 'I used to know someone who lived at, 2025 magna rd grosse pointe MI 48237 they loved it and told us many cool stories about the lake',
  12. # 'I liked their services 22000 moross rd, detroit MI 48236", "lots of diverse life experiences at 6233 orlina st, apt 1001, detroit MI 48217',
  13. # '2013 1st ambonstreet", "245e ousterkade 9", "oh yeah, I had a great time at 20225 Liverni a really really good time',
  14. # '1231451469', '42.2', '123 145 1469', '123.145.1469', '(123) 145.1469', '(123) 145 1469',
  15. # '(123) 145–1469', '123–145–1469', '+1(123) 145–1469 ', '1234567890999111', '123HELLO56',
  16. # '-123', '04/04/1998', 'it’s015–96–0342 you know my number call me', '+123–145–1469',
  17. # '48236–123', 'I live close to (42.293564, -83.638916)', '123-4-5648', '1-234-5-6789',
  18. # 'I used these two email mouafek.ayadi@esprit.tn, moufak.ayadi@oddo-bhf.com',
  19. # 'this is another email afek.aadi@esit.com',
  20. # 'they work at Microsoft, and my name is Sami',
  21. # 'Google CEO is Sunder Pichai',
  22. # 'http://url.com','http://www.url.com/',
  23. # 'https://url.com/bla3/blah3/', 'www.google.com',
  24. # 'I eat potato at 05:30 PM and im happy, then i eat again at 10:12 AM',
  25. # '2018-03-14 06:08:18, he went on 2018-03-15 06:08:18,2018-03-15 slkfldfjezli'
  26. # ]
  27. #
  28. # print(redact_address.address(pd.Series(Texts)))
  29. # print(redact_numerics.numerics(pd.Series(Texts)))
  30. # print(redact_emails.email(pd.Series(Texts)))
  31. # print(redact_names.names(pd.Series(Texts)))
  32. # print(redact_urls.urls(pd.Series(Texts)))
  33. # print(redact_datetime.datestimes(pd.Series(Texts)))
  def redact(data, *, pause_seconds=1):
      """Run every redaction pass over ``data`` and return the final result.

      The passes run in a fixed order -- dates/times, addresses, numerics,
      emails, names, URLs -- and each pass receives the output of the
      previous one. A progress message is printed before each pass.

      Args:
          data: The text collection handed to the first pass. The
              commented-out examples in this file pass a ``pandas.Series``
              of strings; which types are actually accepted is decided by
              the ``redact_*`` modules (not visible here).
          pause_seconds: Seconds to sleep after each pass. Defaults to 1,
              which matches the previously hard-coded delay; pass 0 to
              run the passes back to back.

      Returns:
          Whatever the last pass (``redact_urls.urls``) returns.
      """
      # Ordered pipeline: (progress message, redaction function).
      # The order is preserved from the original implementation.
      steps = (
          ('Redact Dates and Times...', redact_datetime.datestimes),
          ('Redact Address...', redact_address.address),
          ('Redact Numerics...', redact_numerics.numerics),
          ('Redact Emails...', redact_emails.email),
          ('Redact Names...', redact_names.names),
          ('Redact Urls...', redact_urls.urls),
      )
      for message, apply_pass in steps:
          print(message)
          data = apply_pass(data)
          if pause_seconds:
              sleep(pause_seconds)
      # NOTE: an 'Original'/'Redacted' DataFrame used to be assembled here
      # and then thrown away; only the redacted data was ever returned, so
      # that dead work has been removed.
      return data
  59. ############################################ Test
  60. #################################################
  61. # The following is a simulated dataset that contains addresses, sensitive numerics, emails, entity names, URLs, and dates/times.
  62. # Texts = [
  63. # 'I had an ok experience and I live close by 2000 Vernier rd grosse pointe woods MI 48236. I had a good time at 2999 vernier',
  64. # 'I used to know someone who lived at, 2025 magna rd grosse pointe MI 48237 they loved it and told us many cool stories about the lake',
  65. # 'I liked their services 22000 moross rd, detroit MI 48236", "lots of diverse life experiences at 6233 orlina st, apt 1001, detroit MI 48217',
  66. # '2013 1st ambonstreet", "245e ousterkade 9", "oh yeah, I had a great time at 20225 Liverni a really really good time',
  67. # '1231451469', '42.2', '123 145 1469', '123.145.1469', '(123) 145.1469', '(123) 145 1469',
  68. # '(123) 145–1469', '123–145–1469', '+1(123) 145–1469 ', '1234567890999111', '123HELLO56',
  69. # '-123', '04/04/1998', 'it’s015–96–0342 you know my number call me', '+123–145–1469',
  70. # '48236–123', 'I live close to (42.293564, -83.638916)', '123-4-5648', '1-234-5-6789',
  71. # 'I used these two email mouafek.ayadi@esprit.tn, moufak.ayadi@oddo-bhf.com',
  72. # 'this is another email afek.aadi@esit.com',
  73. # 'they work at Microsoft, and my name is Sami',
  74. # 'Google CEO is Sunder Pichai',
  75. # 'http://url.com','http://www.url.com/',
  76. # 'https://url.com/bla3/blah3/', 'www.google.com',
  77. # 'I eat potato at 05:30 PM and im happy, then i eat again at 10:12 AM',
  78. # '2018-03-14 06:08:18, he went on 2018-03-15 06:08:18,2018-03-15 slkfldfjezli'
  79. # ]
  80. # print(redact(pd.Series(Texts)).values)