提交 e5d444a3 作者: 朱学凯

fix pad_1

上级 fc1caac2
......@@ -188,6 +188,7 @@ class Data_Encoder_mol(data.Dataset):
token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int), np.ones((len(p) + 1), dtype=np.int)))
if len(input_seq) > self.max_len:
input_seq = input_seq[:self.max_len-1] + [self.sep_id]
token_type_ids = token_type_ids[:self.max_len]
else:
token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
......
# Scratch/demo script exercising HuggingFace BERT tokenization.
# NOTE(review): depends on the third-party `transformers` package and the
# project-local `modeling_bert` module; `from_pretrained` presumably also
# downloads the 'bert-base-uncased' vocabulary — network access assumed.
from transformers import BertTokenizer
from modeling_bert import BertForMaskedLM
import torch
import numpy as np
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Model load kept disabled — only the tokenizer is exercised below.
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
seq_a = "The capital of France is [MASK]."
seq_b = "The capital of France is Paris."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
# Batch-encode two sentence pairs; padding=True presumably pads every
# encoding to the longest sequence in the batch — TODO confirm against
# the transformers tokenizer docs.
inputs = tokenizer([[seq_a, seq_b], [choice0, choice1]], padding=True)
# Encode the target sentence as PyTorch tensors; the trailing comment shows
# how one would extract just the token ids from the returned encoding.
labels = tokenizer("The capital of France is Paris.", return_tensors="pt") #["input_ids"]
# Quick numpy slicing sanity check: b is a length-4 slice of the ones vector a.
a = np.ones((5))
b = a[:4]
print('----------------')
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论