redact_names_1.py 1.1 KB

123456789101112131415161718192021222324252627282930313233
  1. def names(df):
  2. nltk.set_proxy(None)
  3. #nltk.download()
  4. # print('\nTag each row sentences\n')
  5. tagged_sentences = []
  6. for text in range(len(df)):
  7. sentence = df.iloc[text]
  8. # print('sentence\n', sentence)
  9. tagged_sentences.append(nltk.tag.pos_tag(sentence.split()))
  10. # print('tagged_sentences\n', tagged_sentences[text])
  11. # print('\nEdit each row sentences\n')
  12. edited_sentences = []
  13. for sentence in range(len(tagged_sentences)):
  14. edited_sentence = []
  15. for word, tag in tagged_sentences[sentence]:
  16. if tag == 'NNP' or tag == 'NNPS' or tag == 'NNS':
  17. edited_sentence.append('xxx')
  18. else:
  19. edited_sentence.append(word)
  20. edited_sentences.append(str(' '.join(edited_sentence)))
  21. # print('edited_sentences\n', edited_sentences)
  22. return pd.Series(edited_sentences)
  ######################### Test ##############################
  # Texts = ["they work at Microsoft, and my name is Sami", "Google CEO is Sundar Pichai"]
  # data = pd.Series(Texts)
  # print(names(data))