123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- import os
- import pickle
- import pandas as pd
- from presidio_analyzer import AnalyzerEngine
- from presidio_anonymizer import AnonymizerEngine
- from faker import Faker
- from faker.providers import internet
# Function used to pseudonymize a text containing emails
def anonymizeEmail(text_to_anonymize, country):
    """Replace every e-mail address detected in a text with a fake one.

    Addresses are located with the module-level Presidio ``analyzer``
    (always queried with ``language='en'``) and mapped to pseudonyms through
    the module-level ``emails_dict``: an address already present there keeps
    its existing pseudonym, a new one gets a freshly generated address that
    is neither a key nor a value of ``emails_dict``. New mappings are stored
    in ``emails_dict`` as a side effect.

    Args:
        text_to_anonymize: Text that may contain e-mail addresses.
        country: Key of ``faker_locales_dict`` selecting the Faker locale.

    Returns:
        The text with every occurrence of each detected address replaced by
        its pseudonym.

    Raises:
        KeyError: If ``country`` is not a key of ``faker_locales_dict``.
    """
    import re  # local import: keeps this block self-contained

    # Initialize Faker
    fake = Faker(faker_locales_dict[country])
    fake.add_provider(internet)

    analyzer_results = analyzer.analyze(
        text=text_to_anonymize, entities=["EMAIL_ADDRESS"], language='en')

    matched_emails = {}
    for match in analyzer_results:
        email = text_to_anonymize[match.start:match.end]
        if not email:
            # A zero-length span carries nothing to replace.
            continue
        if email not in emails_dict:
            fake_email = fake.safe_email()
            # Regenerate until the pseudonym collides with neither a real
            # address nor a pseudonym that is already in use.
            while (fake_email in emails_dict.values()) or (fake_email in emails_dict):
                fake_email = fake.safe_email()
            emails_dict[email] = fake_email
        matched_emails[email] = emails_dict[email]

    if not matched_emails:
        return text_to_anonymize

    # Substitute in ONE pass, trying longer addresses first. Chained
    # str.replace calls would corrupt an address that contains another
    # matched address as a substring, and could rewrite a pseudonym that
    # was inserted by an earlier replacement.
    longest_first = sorted(matched_emails, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(item) for item in longest_first))
    return pattern.sub(
        lambda found: matched_emails[found.group(0)], text_to_anonymize)
# Function used to pseudonymize a text containing names
def anonymizeName(text_to_anonymize, country):
    """Replace every person name detected in a text with a fake one.

    Names are located with the module-level Presidio ``analyzer`` (always
    queried with ``language='en'``) and mapped to pseudonyms through the
    module-level ``names_dict``: a name already present there keeps its
    existing pseudonym, a new one gets a freshly generated name that is
    neither a key nor a value of ``names_dict``. New mappings are stored in
    ``names_dict`` as a side effect.

    Args:
        text_to_anonymize: Text that may contain person names.
        country: Key of ``faker_locales_dict`` selecting the Faker locale.

    Returns:
        The text with every occurrence of each detected name replaced by its
        pseudonym.

    Raises:
        KeyError: If ``country`` is not a key of ``faker_locales_dict``.
    """
    import re  # local import: keeps this block self-contained

    # Initialize Faker
    fake = Faker(faker_locales_dict[country])

    analyzer_results = analyzer.analyze(
        text=text_to_anonymize, entities=["PERSON"], language='en')

    matched_names = {}
    for match in analyzer_results:
        name = text_to_anonymize[match.start:match.end]
        if not name:
            # A zero-length span carries nothing to replace.
            continue
        if name not in names_dict:
            fake_name = fake.name()
            # Regenerate until the pseudonym collides with neither a real
            # name nor a pseudonym that is already in use.
            while (fake_name in names_dict.values()) or (fake_name in names_dict):
                fake_name = fake.name()
            names_dict[name] = fake_name
        matched_names[name] = names_dict[name]

    if not matched_names:
        return text_to_anonymize

    # Substitute in ONE pass, trying longer names first. With chained
    # str.replace calls, a short match such as "John" would be replaced
    # inside a longer match such as "John Smith", leaving "Smith" in the
    # output; a pseudonym inserted earlier could also be rewritten.
    longest_first = sorted(matched_names, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(item) for item in longest_first))
    return pattern.sub(
        lambda found: matched_names[found.group(0)], text_to_anonymize)
# For testing purposes you can load the Excel content directly here.
# Just uncomment the following 2 lines.

# # Load the Excel content in a dataframe
# dataset = pd.read_excel(r'D:\<your-path>\Chapter06\CustomersCreditCardAttempts.xlsx', engine='openpyxl')

# Load mapping dictionaries from PKL files if they exist, otherwise create empty dictionaries
pkls_path = r'D:\<your-path>\Chapter06\pkls'
emails_dict_pkl_path = os.path.join(pkls_path, 'emails_dict.pkl')
names_dict_pkl_path = os.path.join(pkls_path, 'names_dict.pkl')

# NOTE(review): unpickling can execute arbitrary code. These files are
# expected to be the ones written at the end of this script; do not point
# pkls_path at a location that untrusted parties can write to.
if os.path.isfile(emails_dict_pkl_path):
    # "with" closes the handle deterministically instead of leaking it.
    with open(emails_dict_pkl_path, "rb") as pkl_file:
        emails_dict = pickle.load(pkl_file)
else:
    emails_dict = {}

if os.path.isfile(names_dict_pkl_path):
    with open(names_dict_pkl_path, "rb") as pkl_file:
        names_dict = pickle.load(pkl_file)
else:
    names_dict = {}

# Define locale and language dictionaries
faker_locales_dict = {'UNITED STATES': 'en_US', 'ITALY': 'it_IT', 'GERMANY': 'de_DE'}

# Initialize Presidio's analyzer and anonymizer
# https://microsoft.github.io/presidio/supported_entities/
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Create a copy of the source dataset
df = dataset.copy()

# The anonymized values are assigned as plain lists so that they are written
# positionally. Wrapping them in pd.Series would create a fresh 0..n-1 index
# and pandas would align on it, producing NaN or misplaced rows whenever the
# dataframe does not have a default index.

# Apply the function anonymizeName for each value of the Name column
df['Name'] = [
    anonymizeName(text, country)
    for (text, country) in zip(df['Name'], df['Country'])
]

# Apply the function anonymizeEmail for each value of the Email column
df['Email'] = [
    anonymizeEmail(text, country)
    for (text, country) in zip(df['Email'], df['Country'])
]

# Column Notes is 'object' data type as it contains a lot of NaN and
# pandas doesn't recognize it as string. So it has to be cast to string
# in order to be anonymized. Then replace it with its anonymization
# (names first, then e-mail addresses on the already processed text).
df['Notes'] = [
    anonymizeName(text, country)
    for (text, country) in zip(df['Notes'].astype('str'), df['Country'])
]
df['Notes'] = [
    anonymizeEmail(text, country)
    for (text, country) in zip(df['Notes'].astype('str'), df['Country'])
]

# # Prevent pandas from truncating strings in cells
# pd.set_option('display.max_colwidth', None)

# # Show both the dataframes
# dataset
# df

# Write emails and names dictionaries to PKL files.
# "with" guarantees the data is flushed and the files are closed.
with open(emails_dict_pkl_path, "wb") as pkl_file:
    pickle.dump(emails_dict, pkl_file)
with open(names_dict_pkl_path, "wb") as pkl_file:
    pickle.dump(names_dict, pkl_file)
|