# 03-pseudonymize-data-in-power-bi-python.py
import os
import pickle
import re

import pandas as pd
from faker import Faker
from faker.providers import internet
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
  8. # Function used to pseudonymize a text containing emails
  9. def anonymizeEmail(text_to_anonymize, country):
  10. # Initialize Faker
  11. fake = Faker(faker_locales_dict[country])
  12. fake.add_provider(internet)
  13. analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=["EMAIL_ADDRESS"], language='en')
  14. matched_emails = {}
  15. for match in analyzer_results:
  16. email = text_to_anonymize[match.start:match.end]
  17. if email not in emails_dict:
  18. fake_email = fake.safe_email()
  19. while (fake_email in emails_dict.values()) or (fake_email in emails_dict):
  20. fake_email = fake.safe_email()
  21. emails_dict[email] = fake_email
  22. matched_emails[email] = fake_email
  23. else:
  24. fake_email = emails_dict[email]
  25. matched_emails[email] = fake_email
  26. anonymized_result = text_to_anonymize
  27. for email in matched_emails:
  28. anonymized_result = anonymized_result.replace(email, matched_emails[email])
  29. return anonymized_result
  30. # Function used to pseudonymize a text containing names
  31. def anonymizeName(text_to_anonymize, country):
  32. # Initialize Faker
  33. fake = Faker(faker_locales_dict[country])
  34. analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=["PERSON"], language='en')
  35. matched_names = {}
  36. for match in analyzer_results:
  37. name = text_to_anonymize[match.start:match.end]
  38. if name not in names_dict:
  39. fake_name = fake.name()
  40. while (fake_name in names_dict.values()) or (fake_name in names_dict):
  41. fake_name = fake.name()
  42. names_dict[name] = fake_name
  43. matched_names[name] = fake_name
  44. else:
  45. fake_name = names_dict[name]
  46. matched_names[name] = fake_name
  47. anonymized_result = text_to_anonymize
  48. for name in matched_names:
  49. anonymized_result = anonymized_result.replace(name, matched_names[name])
  50. return anonymized_result
  51. # For testing purpose you can load the Excel content directly here.
  52. # Just uncomment the following 2 lines.
  53. # # Load the Excel content in a dataframe
  54. # dataset = pd.read_excel(r'D:\<your-path>\Chapter06\CustomersCreditCardAttempts.xlsx', engine='openpyxl')
  55. # Load mapping dictionaries from PKL files if they exist, otherwise create empty dictionaries
  56. pkls_path = r'D:\<your-path>\Chapter06\pkls'
  57. emails_dict_pkl_path = os.path.join(pkls_path, 'emails_dict.pkl')
  58. names_dict_pkl_path = os.path.join(pkls_path, 'names_dict.pkl')
  59. if os.path.isfile(emails_dict_pkl_path):
  60. emails_dict = pickle.load(open(emails_dict_pkl_path, "rb"))
  61. else:
  62. emails_dict = {}
  63. if os.path.isfile(names_dict_pkl_path):
  64. names_dict = pickle.load(open(names_dict_pkl_path, "rb"))
  65. else:
  66. names_dict = {}
  67. # Define locale and language dictionaries
  68. faker_locales_dict = {'UNITED STATES': 'en_US', 'ITALY': 'it_IT', 'GERMANY': 'de_DE'}
  69. # Initialize Presidio's analyzer and anonymizer
  70. # https://microsoft.github.io/presidio/supported_entities/
  71. analyzer = AnalyzerEngine()
  72. anonymizer = AnonymizerEngine()
  73. # Create a copy of the source dataset
  74. df = dataset.copy()
  75. # Apply the function anonymizeName for each value of the Name column
  76. df.Name = pd.Series([anonymizeName(text, country) for (text, country) in zip(df['Name'], df['Country'])])
  77. # Apply the function anonymizeEmail for each value of the Email column
  78. df.Email = pd.Series([anonymizeEmail(text, country) for (text, country) in zip(df['Email'], df['Country'])])
  79. # Column Notes is 'object' data type as it contains lot of NaN and
  80. # Pandas doesn't recognize it as string. So it has to be cast to string
  81. # in order to be anonymized. Then replace it with its anonymization
  82. df.Notes = pd.Series(
  83. [anonymizeName(text, country) for (text, country) in zip(df['Notes'].astype('str'), df['Country'])])
  84. df.Notes = pd.Series(
  85. [anonymizeEmail(text, country) for (text, country) in zip(df['Notes'].astype('str'), df['Country'])])
  86. # # Prevent Pandas to truncate strings in cells
  87. # pd.set_option('display.max_colwidth', None)
  88. # # Show both the dataframes
  89. # dataset
  90. # df
  91. # Write emails and names dictionaries to PKL files
  92. pickle.dump(emails_dict, open(emails_dict_pkl_path, "wb"))
  93. pickle.dump(names_dict, open(names_dict_pkl_path, "wb"))