Commit 106fd505  Author: Zhu Xuekai

add pre-train

Parent 9366316a
from subword_nmt.apply_bpe import BPE
import codecs
import json
import numpy as np
from tqdm import tqdm
import math
import random


def get_tokenzie_seq(file, save, mask=False):
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    with open(file['seq'], 'r') as f:
        seq = f.readlines()
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    with open(file["affinity"], 'r') as f:
        affinity = f.readlines()
    # BPE segmenters for drug SMILES strings and protein sequences
    bpe_codes_drug = codecs.open('./config/drug_codes_chembl.txt')
    dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    bpe_codes_prot = codecs.open('./config/protein_codes_uniprot.txt')
    pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    with open(save, "w") as f:
        for i in tqdm(range(len(seq))):
            d = dbpe.process_line(smile[i].strip()).split()
            p = pbpe.process_line(seq[i].strip()).split()
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            affinity_num = affinity[i].strip()
            item = {
                "seq": " ".join(final_seq),
                "affinity": affinity_num
            }
            new_item = json.dumps(item)
            f.write(new_item + '\n')
def random_mask(input_seq, mask_proportion=0.15):
    mask_len = math.ceil(len(input_seq) * mask_proportion)
    # sample distinct positions (replace=False avoids masking the same index twice)
    mask_token_posi = np.random.choice(len(input_seq), mask_len, replace=False)
    for i in mask_token_posi:
        choice = random.random()
        if choice < 0.8:
            input_seq[i] = "[MASK]"
            # mask_vec[i] = 1
        # elif choice >= 0.8 and choice < 0.9:
    return input_seq
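The commented-out `elif` hints at BERT's full 80/10/10 corruption scheme (80% `[MASK]`, 10% random token, 10% unchanged), of which only the 80% branch is implemented. A hedged sketch of the complete scheme, assuming a `vocab_tokens` list of candidate replacement tokens that this script does not define:

    def random_mask_bert(input_seq, vocab_tokens, mask_proportion=0.15):
        # full BERT-style corruption; vocab_tokens is an assumed token list
        mask_len = math.ceil(len(input_seq) * mask_proportion)
        for i in np.random.choice(len(input_seq), mask_len, replace=False):
            choice = random.random()
            if choice < 0.8:
                input_seq[i] = "[MASK]"                     # 80%: mask
            elif choice < 0.9:
                input_seq[i] = random.choice(vocab_tokens)  # 10%: random token
            # remaining 10%: leave the token unchanged
        return input_seq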
if __name__ == '__main__':
    # file_train = {"sps": './data/train/train_sps',
    #               'seq': './data/train/train_protein_seq',
    #               "smile": './data/train/train_smile',
    #               "affinity": './data/train/train_ic50',
    #               }
    # save = "./data/tokenize_data/train.tokenize"
    # save_mask = "./data/tokenize_data/train.tokenize.mask"
    df_test = {"sps": './data/test/test_sps',
               'seq': './data/test/test_protein_seq',
               "smile": './data/test/test_smile',
               "affinity": './data/test/test_ic50',
               }
    save = "./data/tokenize_data/test.tokenize"
    get_tokenzie_seq(df_test, save)
    # get_tokenzie_seq(file_train, save_mask, mask=True)
\ No newline at end of file
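Each line of the resulting tokenize file is a standalone JSON record pairing the [CLS]/[SEP]-delimited token sequence with its affinity label; a hypothetical record (the BPE segmentation shown is illustrative, not real output):

    {"seq": "[CLS] CC (= O ) Oc1ccccc1 [SEP] MKT AYIA KQR [SEP]", "affinity": "7.21"}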
@@ -271,6 +271,56 @@ class Data_Encoder_LM(data.Dataset):
        return " ".join(d), " ".join(p), y
        # return len(d), len(p)


class Data_Provide(data.Dataset):
    def __init__(self, train_file, mask_file):
        'Initialization'
        # load data
        with open(train_file, 'r') as f:
            self.seq = f.readlines()
        with open(mask_file, 'r') as f:
            self.seq_mask = f.readlines()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.seq)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        # Load data and get label
        item = json.loads(self.seq[index])
        mask_item = json.loads(self.seq_mask[index])
        seq = item["seq"]
        seq_mask = mask_item["seq"]
        y = np.float64(item["affinity"])
        return seq, seq_mask, y


class Data_Gen(data.Dataset):
    def __init__(self, train_file):
        'Initialization'
        # load data
        with open(train_file, 'r') as f:
            self.seq = f.readlines()
        # with open(mask_file, 'r') as f:
        #     self.seq_mask = f.readlines()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.seq)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        # Load data and get label
        item = json.loads(self.seq[index])
        # mask_item = json.loads(self.seq_mask[index])
        seq = item["seq"]
        # seq_mask = mask_item["seq"]
        y = np.float64(item["affinity"])
        return seq, y
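A minimal usage sketch for the new dataset classes, assuming the tokenize files written by the preprocessing script exist at these paths (batch size illustrative):

    from torch.utils.data import DataLoader

    dataset = Data_Provide("data/tokenize_data/train.tokenize",
                           "data/tokenize_data/train.tokenize.mask")
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    for seq, seq_mask, affinity in loader:
        # seq and seq_mask are tuples of space-joined token strings;
        # id conversion happens later, in Tokenizer.convert_token_to_ids
        break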
def get_task(task_name):
    tokenizer_config = {"vocab_file": './config/vocab.txt',
@@ -320,12 +370,8 @@ def get_task(task_name):
        return df, tokenizer_config
-    elif task_name.lower() in ['train_mol', "pre-train"]:
-        df_train = {"sps": './data/train/train_sps',
-                    'seq': './data/train/train_protein_seq',
-                    "smile": './data/train/train_smile',
-                    "affinity": './data/train/train_ic50',
-                    }
+    elif task_name.lower() in ['train_mol']:
+        df_train = "data/tokenize_data/train.tokenize"
        tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                            "vocab_pair": './config/drug_codes_chembl.txt',
@@ -353,6 +399,20 @@ def get_task(task_name):
        return df_test, tokenizer_config
    elif task_name.lower() == 'pre-train':
        df_train_mask = "data/tokenize_data/train.tokenize.mask"
        df_train = "data/tokenize_data/train.tokenize"
        tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                            "vocab_pair": './config/drug_codes_chembl.txt',
                            "vocab_pair_p": './config/protein_codes_uniprot.txt',
                            "begin_id": '[CLS]',
                            "separate_id": "[SEP]",
                            "max_len": 595
                            }
        return df_train, df_train_mask, tokenizer_config
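Note the asymmetry in return arity: 'pre-train' is the only branch that returns three values, so callers unpack it differently (both entry scripts below follow this):

    data_file, tokenizer_config = get_task('train_mol')             # two values
    data_file, data_mask, tokenizer_config = get_task('pre-train')  # three values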
def random_mask(input_seq, mask_proportion=0.15):
    input = [i.split() for i in input_seq]
    mask_len = [math.ceil(len(i) * mask_proportion) for i in input]
@@ -378,37 +438,46 @@ class Tokenizer(object):
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])

-    def convert_token_to_ids(self, d, p):
-        mask_d = random_mask(d)
-        mask_p = random_mask(p)
-        input_seq = [[self.begin_id] + i + [self.sep_id] + j + [self.sep_id] for i, j in zip(mask_d, mask_p)]
-        input_seq_ori = [[self.begin_id] + i.split() + [self.sep_id] + j.split() + [self.sep_id] for i, j in zip(d, p)]
+    def seq2emb_encoder_simple(self, input_seq, vocab):
+        try:
+            ids = np.asarray([vocab[i] for i in input_seq])
+        except KeyError:
+            # caveat: a single unknown token collapses the whole sequence to one 0 id
+            ids = np.array([0])
+        return ids
+
+    def convert_token_to_ids(self, seq):
+        # input_seq = [[self.begin_id] + i + [self.sep_id] + j + [self.sep_id] for i, j in zip(mask_d, mask_p)]
+        # input_seq_ori = [[self.begin_id] + i.split() + [self.sep_id] + j.split() + [self.sep_id] for i, j in zip(d, p)]
+        # mask_posi = np.concatenate((np.zeros(1), mask_d_posi, np.zeros(1), mask_p_posi, np.zeros(1)))
+        # token_type_ids = [[np.concatenate((np.zeros((len(d) + 2), dtype=np.int), np.ones((len(p) + 1), dtype=np.int)))] for d, p in zip(mask_d, mask_p)]
-        for i, seq in enumerate(input_seq):
-            if len(seq) > self.max_len:
-                input_seq[i] = seq[:self.max_len-1] + [self.sep_id]
-                input_seq_ori[i] = seq[:self.max_len-1] + [self.sep_id]
+        # seq = seq.split()
+        all_seq = [i.split() for i in seq]
+        for i, seq_i in enumerate(all_seq):
+            if len(seq_i) > self.max_len:
+                all_seq[i] = seq_i[:self.max_len-1] + [self.sep_id]
+                # input_seq_ori[i] = seq[:self.max_len-1] + [self.sep_id]
        # token_type_ids = token_type_ids[:self.max_len]
        # mask_posi = mask_posi[:self.max_len]
        # else:
        #     mask_posi = np.pad(mask_posi, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        #     token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
-        all_seq = []
-        all_seq_ori = []
+        all_seq_ids = []
+        # all_seq_ori = []
        # all_mask = []
-        for seq, ori in zip(input_seq, input_seq_ori):
-            input = seq2emb_encoder_simple(seq, self.max_len, self.vocab)
-            input_ori = seq2emb_encoder_simple(ori, self.max_len, self.vocab)
-            all_seq.append(torch.from_numpy(input).long())
-            all_seq_ori.append(torch.from_numpy(input_ori).long())
-        input = pad_sequence(all_seq, batch_first=True)
-        input_ori = pad_sequence(all_seq_ori, batch_first=True)
+        for seq in all_seq:
+            input = self.seq2emb_encoder_simple(seq, self.vocab)
+            # input_ori = seq2emb_encoder_simple(ori, self.max_len, self.vocab)
+            all_seq_ids.append(torch.from_numpy(input).long())
+            # all_seq_ori.append(torch.from_numpy(input_ori).long())
+        input = pad_sequence(all_seq_ids, batch_first=True)
+        # input_ori = pad_sequence(all_seq_ori, batch_first=True)
        input_mask = input != 0
        # input_mask = pad_sequence(all_mask)
        # return torch.from_numpy(input).long(), torch.from_numpy(input_mask).long(), torch.from_numpy(token_type_ids).long()
-        return input, input_mask, input_ori
+        # return input, input_mask, input_ori
+        return input, input_mask
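A sketch of driving the reworked tokenizer with a batch of strings (shapes hedged; the `input != 0` mask assumes id 0 is reserved for padding in vocab_mol.txt):

    # seqs: a batch of space-joined token strings, e.g. one batch from Data_Gen
    input_ids, attention_mask = tokenizer.convert_token_to_ids(seqs)
    # input_ids: LongTensor [batch, longest_seq], zero-padded by pad_sequence
    # attention_mask: same-shape boolean tensor, True at non-pad positions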
......
@@ -1864,6 +1864,7 @@ class BertAffinityModel(BertPreTrainedModel):
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.mlp = Multilayer_perceptron(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        # self.pooler = BertPooler(config) if add_pooling_layer else None
        self.init_weights()
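The new lm_head projects each final hidden state onto the vocabulary, producing the per-token logits consumed by the masked-LM loss in the pre-training loop below; a self-contained shape check (hidden_size and vocab_size here are illustrative, the real values come from the BertConfig):

    import torch
    import torch.nn as nn

    hidden_size, vocab_size = 768, 20000              # illustrative values
    lm_head = nn.Linear(hidden_size, vocab_size)
    hidden_states = torch.randn(2, 595, hidden_size)  # [batch, max_len, hidden]
    logits = lm_head(hidden_states)                   # [batch, max_len, vocab_size]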
......
from argparse import ArgumentParser
-from dataset import Data_Encoder, get_task, Data_Encoder_mol
+from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
@@ -12,7 +12,7 @@ torch.set_default_tensor_type(torch.DoubleTensor)
-def train(args, model, dataset):
+def train(args, model, dataset, tokenizer):
    data_loder_para = {'batch_size': args.batch_size,
                       'shuffle': True,
                       'num_workers': args.workers,
@@ -38,11 +38,12 @@ def train(args, model, dataset):
    print('begin training')
    # training
    for epoch in range(args.epochs):
-        for i, (input, token_type_ids, input_mask, affinity) in enumerate(data_generator):
+        for i, (input, affinity) in enumerate(data_generator):
            # use cuda
            # input model
-            # if torch.cuda.is_available():
-            pred_affinity = model(input_ids=input.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
+            input_ids, attention_mask = tokenizer.convert_token_to_ids(input)
+            # pred_affinity = model(input_ids=input_ids.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
+            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.cuda().unsqueeze(-1))
            # else:
            #     pred_affinity = model(input_ids=input, token_type_ids=token_type_ids, attention_mask=input_mask)
@@ -66,7 +67,7 @@ def train(args, model, dataset):
    print('training over')
    writer.close()

-def test(args, model, dataset):
+def test(args, model, dataset, tokenizer):
    data_loder_para = {'batch_size': args.batch_size,
                       'shuffle': False,
                       'num_workers': args.workers,
@@ -76,6 +77,7 @@ def test(args, model, dataset):
    with torch.no_grad():
        # if torch.cuda.is_available():
        model.load_state_dict(torch.load(args.init), strict=True)
+        model.cuda()
        # else:
        #     model.load_state_dict(torch.load(args.init, map_location=torch.device('cpu')), strict=True)
        model.eval()
@@ -84,12 +86,9 @@ def test(args, model, dataset):
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
-            for i, (input, token_type_ids, input_mask, affinity) in enumerate(tqdm(data_generator)):
-                # if torch.cuda.is_available():
-                model.cuda()
-                pred_affinity = model(input_ids=input.cuda(), token_type_ids=token_type_ids.cuda(),
-                                      attention_mask=input_mask.cuda())
-                # else:
+            for i, (input, affinity) in enumerate(tqdm(data_generator)):
+                input_ids, attention_mask = tokenizer.convert_token_to_ids(input)
+                pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
                # pred_affinity = model(input_ids=input, token_type_ids=token_type_ids, attention_mask=input_mask)
                pred_affinity = pred_affinity.cpu().numpy().squeeze(-1)
                for res in pred_affinity:
@@ -103,11 +102,12 @@ def main(args):
    # load data
    data_file, tokenizer_config = get_task(args.task)
    # dataset = Data_Encoder(data_file, tokenizer_config)
-    dataset = Data_Encoder_mol(data_file, tokenizer_config)
+    dataset = Data_Gen(data_file)
    # create model
    print('------------------create model---------------------------')
    config = BertConfig.from_pretrained(args.config)
    model = BertAffinityModel(config)
+    tokenizer = Tokenizer(tokenizer_config)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
@@ -117,10 +117,10 @@ def main(args):
    print('task name : {}'.format(args.task))
    if args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100', 'train_mol']:
-        train(args, model, dataset)
+        train(args, model, dataset, tokenizer)
    elif args.task in ['test', 'test_mol']:
-        test(args, model, dataset)
+        test(args, model, dataset, tokenizer)
@@ -155,10 +155,12 @@ if __name__ == '__main__':
    # local test
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    args.task = 'train_mol'
-    args.savedir = 'local_test_train'
+    # args.savedir = 'local_test_train'
+    args.savedir = 'train'
    args.epochs = 10
    args.lr = 1e-5
-    args.config = './config/config_layer_3_mol.json'
+    args.config = './config/config_layer_6_mol.json'
......
from argparse import ArgumentParser
-from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Encoder_LM, Tokenizer
+from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Encoder_LM, Tokenizer, Data_Provide
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
@@ -39,15 +39,17 @@ def train(args, model, dataset, tokenizer):
    print('begin training')
    # training
    for epoch in range(args.epochs):
-        for i, (drug, protein, affinity) in enumerate(data_generator):
-            input, input_mask, input_ori = tokenizer.convert_token_to_ids(drug, protein)
+        for i, (seq, seq_mask, affinity) in enumerate(data_generator):
+            input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)
+            label, _ = tokenizer.convert_token_to_ids(seq)
            # pred_affinity = model(input_ids=input.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
-            logits = model(input_ids=input.cuda(), attention_mask=input_mask.cuda())
+            logits = model(input_ids=input_random_mask.cuda(), attention_mask=attention_mask.cuda())
            # loss = 0
-            pred_logits = logits[input == 1]
-            label = input_ori[input == 1]
-            loss = loss_fct(pred_logits, label.cuda())
+            posi = torch.where(input_random_mask == 1)
+            pred_logits = logits[posi]
+            target = label[posi]
+            loss = loss_fct(pred_logits, target.cuda())
            # else:
            #     pred_affinity = model(input_ids=input, token_type_ids=token_type_ids, attention_mask=input_mask)
            #     loss = loss_fct(pred_affinity, affinity.unsqueeze(-1))
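The masked-LM loss above keeps only the masked positions: `input_random_mask == 1` relies on `[MASK]` mapping to id 1 in vocab_mol.txt. A hedged equivalent with the id looked up explicitly (assumes the tokenizer's vocab behaves as a token-to-id dict and loss_fct is a token-level cross entropy):

    mask_id = tokenizer.vocab["[MASK]"]              # assumed dict lookup, not shown in this diff
    posi = torch.where(input_random_mask == mask_id)
    pred_logits = logits[posi]                       # [n_masked, vocab_size]
    target = label[posi]                             # [n_masked] original token ids
    loss = loss_fct(pred_logits, target.cuda())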
@@ -105,9 +107,9 @@ def test(args, model, dataset):
def main(args):
    # load data
-    data_file, tokenizer_config = get_task(args.task)
+    data_file, data_mask, tokenizer_config = get_task(args.task)
    # dataset = Data_Encoder(data_file, tokenizer_config)
-    dataset = Data_Encoder_LM(data_file, tokenizer_config)
+    dataset = Data_Provide(data_file, data_mask)
    tokenizer = Tokenizer(tokenizer_config)
    # create model
    print('------------------create model---------------------------')
@@ -160,9 +162,10 @@ if __name__ == '__main__':
    # local test
    os.environ["CUDA_VISIBLE_DEVICES"] = "5"
    args.task = 'pre-train'
-    args.savedir = 'mask-LM-quick'
+    args.savedir = 'mask-LM-lr-1e-4-1019'
    # args.savedir = 'train'
    args.epochs = 30
-    args.lr = 1e-5
+    args.lr = 1e-4
    args.config = './config/config_layer_6_mol.json'
......