1234567891011121314151617181920212223242526 |
- def _make_sentence(self, tokens_left, tokens_right, seq_length=128):
- len_left = len(tokens_left)
- len_right = len(tokens_right)
- cut_len = len_left + len_right - (seq_length - 1)
- if cut_len > 0:
- cut_left = len_left - seq_length // 2
- cut_right = len_right - (seq_length - 1) // 2
- if cut_left < 0:
- cut_left, cut_right = 0, cut_left + cut_right
- elif cut_right < 0:
- cut_left, cut_right = cut_left + cut_right, 0
- else:
- cut_left, cut_right = 0, 0
- tokens_left = tokens_left[cut_left:]
- # tokens_right = tokens_right[:-cut_right]
- tokens_right = tokens_right[:len(tokens_right) - cut_right]
- tokens = tokens_left + [self.bert_tokenizer.mask_token] + tokens_right
- attention_mask = [1] * len(tokens_left) + [1] + [1] * len(tokens_right)
- if len(tokens) < seq_length:
- num_padding = seq_length - len(tokens)
- tokens += [self.bert_tokenizer.pad_token] * num_padding
- attention_mask += [0] * num_paddi
|