redact_names.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. import pandas as pd
  2. import nltk
  3. import os
  4. PROXY = ""
  5. os.environ["HTTP_PROXY"] = PROXY
  6. os.environ["HTTPS_PROXY"] = PROXY
  7. # redact proper names and company names using nltk
  8. def names(df):
  9. nltk.set_proxy(None)
  10. #nltk.download()
  11. # print('\nTag each row sentences\n')
  12. tagged_sentences = []
  13. for text in range(len(df)):
  14. sentence = df.iloc[text]
  15. # print('sentence\n', sentence)
  16. tagged_sentences.append(nltk.tag.pos_tag(sentence.split()))
  17. # print('tagged_sentences\n', tagged_sentences[text])
  18. # print('\nEdit each row sentences\n')
  19. edited_sentences = []
  20. for sentence in range(len(tagged_sentences)):
  21. edited_sentence = []
  22. for word, tag in tagged_sentences[sentence]:
  23. if tag == 'NNP' or tag == 'NNPS' or tag == 'NNS':
  24. edited_sentence.append('xxx')
  25. else:
  26. edited_sentence.append(word)
  27. edited_sentences.append(str(' '.join(edited_sentence)))
  28. # print('edited_sentences\n', edited_sentences)
  29. return pd.Series(edited_sentences)
  30. ######################### Test
  31. ##############################
  32. # Texts = ["they work at Microsoft, and my name is Sami", "Google CEO is Sunder Pichai"]
  33. # data = pd.Series(Texts)
  34. # print(names(data))