redact_numerics.py 1.3 KB

1234567891011121314151617181920212223242526
  1. import re
  2. import pandas as pd
# Redact sensitive numbers (phone numbers, SSNs, latitude/longitude coordinates,
# some ZIP codes) in the data by replacing them with 'xxx', using the functions below.
  5. def sensitive_numerics(number):
  6. number = re.sub(r'[\+\(]?\d[\d .\-\(\)]{6,}', r'xxx', number)
  7. # [0-9] matches a single digit in the range 0 through 9 (inclusive), {4} indicates that four such digits should occur in a row,
  8. # - means a hyphen, and | means an OR and separates the two patterns you mention.
  9. # '123-4-5648' '1-234-5-6789'
  10. number = re.sub('[0-9]-[0-9]{3}-[0-9]-[0-9]{4}|[0-9]{3}-[0-9]-[0-9]{4}', 'xxx', number)
  11. return number
  12. def numerics(data):
  13. return pd.Series(data.apply(sensitive_numerics))
  14. ############################################# Test
  15. ##################################################
  16. # Texts = ['1231451469', '42.2', '123 145 1469', '123.145.1469', '(123) 145.1469', '(123) 145 1469',
  17. # '(123) 145–1469', '123–145–1469', '+1(123) 145–1469 ', '1234567890999111', '123HELLO56',
  18. # '-123', '04/04/1998', 'it’s015–96–0342 you know my number call me', '+123–145–1469',
  19. # '48236–123', 'I live close to (42.293564, -83.638916)', '123-4-5648', '1-234-5-6789']
  20. # data = pd.Series(Texts)
  21. # print(numerics(data))