1234567891011121314151617181920212223242526272829303132333435363738394041 |
def __next__(self):
    """Build and return the next parsed example.

    Rows are drawn from ``self.note_iterrows``. When that iterator is
    exhausted, ``self._load_random_csv()`` is called and the draw is
    retried once. A row is accepted when its stripped ``TEXT`` has at
    least 2000 characters and ``self._random_word_context`` succeeds on it;
    otherwise the next row is tried.

    The selected word is lower-cased and then replaced by
    ``self.word_corrupter.corrupt_word(...)`` unless a uniform draw in
    [0, 1] falls below ``self.no_corruption_prob``. Both contexts are
    pseudonymized and passed through ``self._process_note``; the left
    context keeps its last 128 space-separated tokens and the right
    context its first 128.

    Returns:
        The result of ``self._parse_row`` for the row
        ``[-1, note_id, typo, left, right, correct]``.

    Raises:
        StopIteration: If ``self.note_iterrows`` is still empty right
            after ``self._load_random_csv()``.
    """
    min_note_length = 2000  # notes shorter than this (after strip) are skipped
    context_tokens = 128  # space-separated tokens kept on each side

    # Select the next usable note.
    while True:
        try:
            _, row = next(self.note_iterrows)
        except StopIteration:
            # Current CSV is used up: load another one and retry once.
            self._load_random_csv()
            _, row = next(self.note_iterrows)
        note_id = int(row.ROW_ID)
        note = row.TEXT.strip()
        if len(note) < min_note_length:
            continue
        try:
            correct, left, right = self._random_word_context(note)
        except Exception:
            # Best-effort sampling: a note that yields no word/context is
            # skipped. ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit still stop the loop.
            continue
        break

    # Corrupt and pseudonymize.
    correct = correct.lower()
    if random.uniform(0, 1) >= self.no_corruption_prob:
        typo = self.word_corrupter.corrupt_word(correct)
    else:
        typo = correct
    left = self.mimic_pseudo.pseudonymize(left)
    left = self._process_note(left)
    left = ' '.join(left.split(' ')[-context_tokens:])
    right = self.mimic_pseudo.pseudonymize(right)
    right = self._process_note(right)
    right = ' '.join(right.split(' ')[:context_tokens])

    # Parse; the leading -1 fills the first column of the temporary row.
    temp_csv_row = [-1, note_id, typo, left, right, correct]
    return self._parse_row(temp_csv_row)
|