Fix pad_1: truncate token_type_ids to max_len when input_seq exceeds max_len

e5d444a3 · 朱学凯 · fc1caac2 · e5d444a3 · e5d444a3
--- a/dataset.py
+++ b/dataset.py
@@ -188,6 +188,7 @@ class Data_Encoder_mol(data.Dataset):
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int), np.ones((len(p) + 1), dtype=np.int)))
        if len(input_seq) > self.max_len:
            input_seq = input_seq[:self.max_len-1] + [self.sep_id]
+            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)

--- a/test.py
+++ b/test.py
 from transformers import BertTokenizer
 from modeling_bert import BertForMaskedLM
 import torch
+import numpy as np
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+a = np.ones((5))
-# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-seq_a = "The capital of France is [MASK]."
-seq_b = "The capital of France is Paris."
-choice0 = "It is eaten with a fork and a knife."
-choice1 = "It is eaten while held in the hand."
-inputs = tokenizer([[seq_a, seq_b], [choice0, choice1]], padding=True)
-labels = tokenizer("The capital of France is Paris.", return_tensors="pt") #["input_ids"]
+b = a[:4]
 print('----------------')
\ No newline at end of file