def __next__(self):
    """Build and return the next parsed example from the note stream.

    Rows are pulled from ``self.note_iterrows`` until one yields a usable
    note: its stripped ``TEXT`` must be at least 2000 characters long and
    ``self._random_word_context(note)`` must succeed, returning a
    ``(correct, left, right)`` triple. Notes that fail either condition are
    skipped.

    The selected word is lower-cased and, unless a random draw falls below
    ``self.no_corruption_prob``, replaced by
    ``self.word_corrupter.corrupt_word(correct)``. Both context strings are
    passed through ``self.mimic_pseudo.pseudonymize`` and
    ``self._process_note`` and then trimmed to the 128 space-separated
    tokens closest to the word.

    Returns:
        Whatever ``self._parse_row`` returns for the row
        ``[-1, note_id, typo, left, right, correct]``.

    Raises:
        StopIteration: if ``self.note_iterrows`` is still exhausted right
            after ``self._load_random_csv()`` has been called.
    """
    min_note_chars = 2000  # notes shorter than this (after strip) are skipped
    context_tokens = 128   # tokens kept on each side of the selected word

    # Select the next usable note.
    while True:
        try:
            _, row = next(self.note_iterrows)
        except StopIteration:
            # Current source is exhausted; presumably _load_random_csv()
            # replaces self.note_iterrows. The second next() is deliberately
            # unguarded, so an empty reload ends the iteration.
            self._load_random_csv()
            _, row = next(self.note_iterrows)

        note_id = int(row.ROW_ID)
        note = row.TEXT.strip()
        if len(note) < min_note_chars:
            continue

        try:
            correct, left, right = self._random_word_context(note)
        except Exception:
            # Best effort: a note that yields no usable word/context is
            # skipped. Only Exception is caught so KeyboardInterrupt and
            # SystemExit are not swallowed.
            continue
        break

    # Corrupt and pseudonymize.
    correct = correct.lower()
    if random.uniform(0, 1) >= self.no_corruption_prob:
        typo = self.word_corrupter.corrupt_word(correct)
    else:
        typo = correct

    left = self._process_note(self.mimic_pseudo.pseudonymize(left))
    # Keep the tokens nearest the word: the tail of the left context ...
    left = ' '.join(left.split(' ')[-context_tokens:])

    right = self._process_note(self.mimic_pseudo.pseudonymize(right))
    # ... and the head of the right context.
    right = ' '.join(right.split(' ')[:context_tokens])

    # Parse: same column layout as a CSV row, with -1 as the first field.
    temp_csv_row = [-1, note_id, typo, left, right, correct]
    return self._parse_row(temp_csv_row)