main_28.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  def _random_word_context(self, text, max_trial=10):
      """Pick a random in-dictionary word from *text* along with its context.

      The text is split on whitespace and up to ``max_trial`` random
      positions are sampled. A token is accepted as-is when it is long
      enough (``self.min_word_len``), short enough (fewer than
      ``DEFAULT_MAX_CHARACTER_POSITIONS - 4`` characters) and its lowercase
      form is in ``self.dictionary``. Otherwise one leading and one trailing
      punctuation character are detached and the remainder is re-checked;
      such a token is additionally rejected when the neighbouring words
      indicate it sits inside a ``[** ... **]`` anonymized field.

      Args:
          text: Whitespace-separated source text.
          max_trial: Maximum number of random positions to try.

      Returns:
          A tuple ``(word, left_context, right_context)``. The contexts are
          the space-joined tokens before/after the chosen token; punctuation
          detached from the token is kept as a separate token on the
          corresponding side.

      Raises:
          ValueError: If ``text`` has no tokens or no acceptable word was
              found within ``max_trial`` attempts.
      """
      # Same character set as before; the backslash is now escaped explicitly
      # instead of relying on the invalid escape sequence "\^".
      puncs = frozenset("[]!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-")
      max_len = DEFAULT_MAX_CHARACTER_POSITIONS - 4

      words = text.split()
      if not words:
          # random.randint(0, -1) would raise an unrelated "empty range" error.
          raise ValueError('failed to choose word')

      def is_eligible(candidate):
          # Length bounds plus case-insensitive dictionary membership.
          return (self.min_word_len <= len(candidate) < max_len
                  and candidate.lower() in self.dictionary)

      trial = 0
      while trial < max_trial:
          trial += 1
          w_idx = random.randint(0, len(words) - 1)
          word = words[w_idx]
          left_res, right_res = [], []

          # If the raw token is already in vocab, it's good to go.
          # NOTE(review): this fast path skips the anonymized-field check
          # below, as the original appears to do -- confirm that is intended.
          if not is_eligible(word):
              # Otherwise, detach one punctuation char from each end and
              # check again.
              if word[0] in puncs:
                  left_res, word = [word[0]], word[1:]
              if not word:
                  continue  # The token was just a single punctuation char.
              if word[-1] in puncs:
                  right_res, word = [word[-1]], word[:-1]
              if not is_eligible(word):
                  continue

              # Reject words inside an anonymized field: a closing marker
              # ahead with no opening marker before it ...
              right_snip = ' '.join(words[w_idx + 1:w_idx + 5])
              if '**]' in right_snip and '[**' not in right_snip:
                  continue
              # ... or an opening marker behind with no closing marker.
              # Clamp the start at 0: a negative start would be counted from
              # the end of the list and inspect the wrong words.
              left_snip = ' '.join(words[max(0, w_idx - 4):w_idx])
              if '[**' in left_snip and '**]' not in left_snip:
                  continue

          left_context = ' '.join(words[:w_idx] + left_res)
          right_context = ' '.join(right_res + words[w_idx + 1:])
          return word, left_context, right_context

      raise ValueError('failed to choose word')