12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
def _random_word_context(self, text, max_trial=10):
    """Pick a random dictionary word from ``text`` together with its context.

    ``text`` is split on whitespace and up to ``max_trial`` tokens are drawn
    uniformly at random. A token is accepted when its length is at least
    ``self.min_word_len`` and below ``DEFAULT_MAX_CHARACTER_POSITIONS - 4``,
    and its lowercase form is in ``self.dictionary``. If the raw token fails,
    one leading and one trailing punctuation character are detached and the
    remainder is checked again; in that case the draw is also rejected when
    the four tokens on either side suggest the token sits inside a
    ``[** ... **]`` anonymized field.

    Args:
        text: Input string to sample from.
        max_trial: Maximum number of random draws before giving up.

    Returns:
        A ``(word, left_context, right_context)`` tuple of strings. Any
        detached punctuation becomes its own space-separated token at the
        end of the left context / start of the right context.

    Raises:
        ValueError: If ``text`` contains no tokens, or no acceptable word
            was found within ``max_trial`` draws.
    """
    # The backslash is escaped explicitly; the character set is unchanged.
    puncs = frozenset("[]!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-")
    max_len = DEFAULT_MAX_CHARACTER_POSITIONS - 4

    words = text.split()
    if not words:
        # random.randint(0, -1) would fail with an unrelated message.
        raise ValueError('failed to choose word')

    def is_candidate(token):
        # Length window plus case-insensitive dictionary membership.
        return (self.min_word_len <= len(token) < max_len
                and token.lower() in self.dictionary)

    for _ in range(max_trial):
        w_idx = random.randint(0, len(words) - 1)
        word, left_res, right_res = words[w_idx], [], []

        # If the word is already in vocab, it's good to go.
        # NOTE(review): this fast path skips the anonymized-field check
        # below; kept as in the original -- confirm that this is intended.
        if is_candidate(word):
            break

        # Otherwise, detach one punctuation char from each end and re-check.
        if word[0] in puncs:
            word, left_res = word[1:], [word[0]]
        if not word:
            continue  # The token was just a single punctuation char.
        if word[-1] in puncs:
            word, right_res = word[:-1], [word[-1]]
        if not is_candidate(word):
            continue

        # Reject tokens that appear to lie inside an anonymized field: a
        # closing '**]' to the right with no opener before it, or an opening
        # '[**' to the left with no closer after it.
        right_snip = ' '.join(words[w_idx + 1:w_idx + 5])
        if '**]' in right_snip and '[**' not in right_snip:
            continue
        # Clamp the start: a negative index would wrap around to the end of
        # the list and yield an empty/wrong window for the first few tokens.
        left_snip = ' '.join(words[max(0, w_idx - 4):w_idx])
        if '[**' in left_snip and '**]' not in left_snip:
            continue

        # Pass!
        break
    else:
        raise ValueError('failed to choose word')

    left_context = ' '.join(words[:w_idx] + left_res)
    right_context = ' '.join(right_res + words[w_idx + 1:])
    return word, left_context, right_context
|