def _random_word_context(self, text, max_trial=10):
    """Pick a random dictionary word from ``text`` and split the text around it.

    A whitespace-separated token is drawn uniformly at random, up to
    ``max_trial`` times. A token is accepted when it is at least
    ``self.min_word_len`` characters long, shorter than
    ``DEFAULT_MAX_CHARACTER_POSITIONS - 4`` characters, and its lowercase
    form is in ``self.dictionary``. If the raw token fails that test, one
    punctuation character is detached from each end and the remainder is
    tested again; in that case the token must also not sit inside an
    anonymized field (an unclosed ``[**`` within the four tokens to its left,
    or an unopened ``**]`` within the four tokens to its right).

    Args:
        text: Source text; tokenized with ``str.split()``.
        max_trial: Maximum number of random draws before giving up.

    Returns:
        A ``(word, left_context, right_context)`` tuple of strings. Detached
        punctuation is kept as a separate token at the inner edge of the
        corresponding context.

    Raises:
        ValueError: If ``text`` has no tokens or no acceptable word is drawn
            within ``max_trial`` attempts.
    """
    # Same character set as before; the backslash is escaped explicitly so the
    # literal no longer relies on the invalid escape sequence "\^".
    puncs = frozenset("[]!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-")
    words = text.split()
    if not words:
        # random.randint(0, -1) would fail with an unrelated "empty range"
        # message; report the same failure as an exhausted search instead.
        raise ValueError('failed to choose word')

    max_len = DEFAULT_MAX_CHARACTER_POSITIONS - 4

    def is_candidate(token):
        # Length window plus case-insensitive dictionary membership.
        return (self.min_word_len <= len(token) < max_len
                and token.lower() in self.dictionary)

    for _ in range(max_trial):
        w_idx = random.randint(0, len(words) - 1)
        word, left_res, right_res = words[w_idx], [], []

        # If the word is already in vocab, it's good to go.
        # NOTE(review): nesting was reconstructed from a collapsed source
        # line; as read, this path skips the anonymized-field check - confirm.
        if not is_candidate(word):
            # Otherwise, detach one punctuation char at each end and re-check.
            if word[0] in puncs:
                word, left_res = word[1:], [word[0]]
            if not word:
                continue  # The word was just a punc
            if word[-1] in puncs:
                word, right_res = word[:-1], [word[-1]]
            if not is_candidate(word):
                continue

            # Check whether it's inside an anonymized field.
            right_snip = ' '.join(words[w_idx + 1:w_idx + 5])
            if '**]' in right_snip and '[**' not in right_snip:
                continue
            # Clamp the window start: a negative start index would wrap
            # around to the end of the list and hide the left context.
            left_snip = ' '.join(words[max(0, w_idx - 4):w_idx])
            if '[**' in left_snip and '**]' not in left_snip:
                continue

        return (word,
                ' '.join(words[:w_idx] + left_res),
                ' '.join(right_res + words[w_idx + 1:]))

    raise ValueError('failed to choose word')