提交 2a4d6f3f 作者: 朱学凯

add some change

上级 a43ff66c
import numpy as np
import re
def eval_result(pred, label):
    """Compute RMSE and the Pearson correlation matrix of predictions vs. labels.

    Args:
        pred: sequence of predicted affinity values.
        label: sequence of ground-truth values, same length as ``pred``.

    Returns:
        Tuple ``(rmse, pearson_co)`` where ``rmse`` is the root-mean-square
        error and ``pearson_co`` is the 2x2 matrix from ``np.corrcoef``
        (callers read entry ``[0, 1]``).
    """
    pred_arr = np.asarray(pred)
    label_arr = np.asarray(label)
    # Mean of squared residuals, then its square root.
    residual = pred_arr - label_arr
    rmse = np.sqrt((residual ** 2).sum() / len(pred_arr))
    pearson_co = np.corrcoef(pred_arr, label_arr)
    return rmse, pearson_co
def eval(pred_path, label_path):
    """Evaluate a prediction file against a label file and report metrics.

    Reads one float per line from each file, computes RMSE and the Pearson
    correlation via ``eval_result``, writes a one-line summary next to the
    predictions (the ``test.txt`` part of ``pred_path`` replaced by
    ``eval_results``), and prints the same line to stdout.
    """
    with open(pred_path, 'r') as f:
        pred = [float(line.strip()) for line in f.readlines()]
    with open(label_path, 'r') as f:
        label = [float(line.strip()) for line in f.readlines()]
    remse, r_mat = eval_result(pred, label)
    r = r_mat[0, 1]
    # NOTE(review): if pred_path does not contain 'test.txt', replace() is a
    # no-op and this overwrites the prediction file itself -- confirm callers
    # always pass a .../test.txt path.
    save_path = pred_path.replace('test.txt', 'eval_results')
    summary = 'RMSE : {} ; Pearson Correlation Coefficient : {}'.format(remse, r)
    with open(save_path, 'w') as f:
        f.write(summary)
    print(summary)
# Script entry point: recover the latest prediction directory from the test
# launch script and evaluate its predictions against the test labels.
if __name__ == '__main__':
    with open('pre_test.sh', 'r') as f:
        pred_dir = f.readline()
    # Token 5 of the launch command is assumed to be the --output=... argument,
    # and its last path component the run directory name -- TODO confirm this
    # index whenever pre_test.sh's argument list changes (it was [4] before a
    # --b flag was inserted).
    pred_dir = pred_dir.split()[5].split('/')[-1]
    pred_result = './predict/{}/test.txt'.format(pred_dir)
    test_label_path = './data/test_ic50'
    eval(pred_result, test_label_path)
CUDA_VISIBLE_DEVICES=1 python run_interaction.py --task=test --output=./predict/lr-1e-5-batch-32-e-10-layer3-0417-add-type-ids-and-mask-step-24711 --config=./config/config_layer_3.json --init=./model/lr-1e-5-batch-32-e-10-layer3-0417-add-type-ids-and-mask/epoch-2-step-24711-loss-0.9765299144620636.pth --do_eval=True
\ No newline at end of file
CUDA_VISIBLE_DEVICES=1 python run_interaction.py --task=test --b=64 --output=./predict/lr-1e-5-batch-32-e-10-layer6-0420-step-74133 --config=./config/config_layer_6.json --init=./model/lr-1e-5-batch-32-e-10-layer6-0420/epoch-8-step-74133-loss-0.8989318833651185.pth --do_eval=True
\ No newline at end of file
from argparse import ArgumentParser
from dataset import Data_Encoder
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
from torch.utils.tensorboard import SummaryWriter
import os
from tqdm import tqdm
# Make float64 the default: every tensor created without an explicit dtype
# below is a DoubleTensor.  NOTE(review): set_default_tensor_type is
# deprecated in recent torch; torch.set_default_dtype(torch.float64) is the
# modern equivalent -- confirm before upgrading torch.
torch.set_default_tensor_type(torch.DoubleTensor)
def get_task(task_name):
    """Return the data-file paths and tokenizer settings for a task.

    Args:
        task_name: 'train' or 'test' (case-insensitive).

    Returns:
        Tuple ``(data_files, tokenizer_config)``: ``data_files`` maps
        'sps'/'smile'/'affinity' to their file paths; ``tokenizer_config``
        holds the tokenizer settings (identical for both splits).

    Raises:
        ValueError: for any other task name.  The original silently fell off
        the end and returned None, which made main() fail later with an
        opaque "cannot unpack NoneType" error even though argparse accepts
        tasks like 'channel'/'ER'/'GPCR'/'kinase'.
    """
    task = task_name.lower()
    if task not in ('train', 'test'):
        raise ValueError('unsupported task: {!r}'.format(task_name))
    # The tokenizer configuration was duplicated verbatim for both branches;
    # defined once here.
    tokenizer_config = {"vocab_file": './config/vocab.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 256,
                        }
    # Only the train/test prefix of the data files differs.
    data_files = {"sps": './data/{}_sps'.format(task),
                  "smile": './data/{}_smile'.format(task),
                  "affinity": './data/{}_ic50'.format(task),
                  }
    return data_files, tokenizer_config
def train(args, model, dataset):
    """Train ``model`` on ``dataset`` with Adam and MSE loss.

    Args:
        args: parsed CLI namespace; uses ``batch_size``, ``workers``, ``lr``,
            ``epochs`` and ``savedir`` (log/checkpoint directory name).
        model: module called as model(input_ids=..., token_type_ids=...,
            attention_mask=...) returning a predicted affinity tensor.
        dataset: torch Dataset yielding (input_ids, token_type_ids,
            attention_mask, affinity) tuples.

    Side effects: writes TensorBoard scalars under ./log/<savedir> and
    periodic checkpoints under ./model/<savedir>/.
    """
    data_generator = DataLoader(dataset,
                                batch_size=args.batch_size,
                                shuffle=True,
                                num_workers=args.workers)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # BUGFIX: num_step // 10 is 0 for short runs (< 10 total steps), and
    # `step % 0` below raised ZeroDivisionError; clamp the checkpoint
    # interval to at least 1.
    save_step = max(1, num_step // 10)
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        for i, (input, token_type_ids, input_mask, affinity) in enumerate(data_generator):
            if use_cuda:
                pred_affinity = model(input_ids=input.cuda(),
                                      token_type_ids=token_type_ids.cuda(),
                                      attention_mask=input_mask.cuda())
                loss = loss_fct(pred_affinity, affinity.cuda().unsqueeze(-1))
            else:
                pred_affinity = model(input_ids=input,
                                      token_type_ids=token_type_ids,
                                      attention_mask=input_mask)
                loss = loss_fct(pred_affinity, affinity.unsqueeze(-1))
            step += 1
            writer.add_scalar('loss', loss, global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            # Logged every step; restore an `if i % 100 == 0:` guard to throttle.
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # Checkpoint from the second epoch onward, roughly 10x per run.
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # BUGFIX: os.mkdir fails when './model' itself is missing and
                # races concurrent runs; makedirs(exist_ok=True) handles both.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss))
    print('training over')
    writer.close()
def test(args, model, dataset):
    """Run inference over ``dataset`` and write one prediction per line.

    Loads weights from ``args.init`` (CPU-mapped when no GPU is available),
    writes predictions to <args.output>/<args.task>.txt, and optionally
    shells out to eval.py when ``args.do_eval`` is truthy.
    """
    data_generator = DataLoader(dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=args.workers)
    use_cuda = torch.cuda.is_available()
    with torch.no_grad():
        if use_cuda:
            model.load_state_dict(torch.load(args.init), strict=True)
            # Moved out of the batch loop: weights only need moving once.
            model.cuda()
        else:
            model.load_state_dict(torch.load(args.init, map_location=torch.device('cpu')), strict=True)
        model.eval()
        # BUGFIX: os.mkdir fails when parent directories are missing;
        # makedirs(exist_ok=True) also avoids racing a concurrent run.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            for input, token_type_ids, input_mask, affinity in tqdm(data_generator):
                if use_cuda:
                    pred_affinity = model(input_ids=input.cuda(),
                                          token_type_ids=token_type_ids.cuda(),
                                          attention_mask=input_mask.cuda())
                else:
                    pred_affinity = model(input_ids=input,
                                          token_type_ids=token_type_ids,
                                          attention_mask=input_mask)
                pred_affinity = pred_affinity.cpu().numpy()
                # BUGFIX: the original looped over range(args.batch_size),
                # which raises IndexError on a final partial batch (dataset
                # size not divisible by batch size); iterate the batch's
                # actual rows instead.
                for pred in pred_affinity[:, 0]:
                    f.write(str(pred) + '\n')
    if args.do_eval:
        os.system('python eval.py')
def main(args):
    """Load the task's dataset, build a BertAffinityModel, and dispatch.

    Tasks other than 'train'/'test' fall through without running anything
    (matching the original if/elif chain).
    """
    data_file, tokenizer_config = get_task(args.task)
    dataset = Data_Encoder(data_file, tokenizer_config)
    print('------------------creat model---------------------------')
    model = BertAffinityModel(BertConfig.from_pretrained(args.config))
    print('model name : BertAffinity')
    print('task name : {}'.format(args.task))
    # Dispatch table replaces the if/elif chain; unknown tasks are a no-op.
    runner = {'train': train, 'test': test}.get(args.task)
    if runner is not None:
        runner(args, model, dataset)
if __name__ == '__main__':
    # CLI entry point: parse arguments and hand off to main().
    parser = ArgumentParser(description='BertAffinity')
    parser.add_argument('-b', '--batch-size', default=8, type=int,
                        metavar='N',
                        help='mini-batch size (default: 8), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                        help='number of data loading workers (default: 0)')
    parser.add_argument('--epochs', default=50, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--task', choices=['train', 'test', 'channel', 'ER', 'GPCR', 'kinase'],
                        default='train', type=str, metavar='TASK',
                        help='Task name. Could be train, test, channel, ER, GPCR, kinase.')
    parser.add_argument('--lr', '--learning-rate', default=1e-5, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--config', default='./config/config.json', type=str, help='model config file path')
    parser.add_argument('--savedir', default='train', type=str, help='log and model save path')
    parser.add_argument('--init', default='model', type=str, help='init checkpoint')
    parser.add_argument('--output', default='predict', type=str, help='result save path')
    # BUGFIX: the original used type=bool, an argparse trap -- bool('False')
    # is True, so '--do_eval=False' still enabled evaluation.  Parse the
    # string explicitly; '--do_eval=True' (as used by pre_test.sh) keeps
    # working, and falsy spellings now actually parse as False.
    parser.add_argument('--do_eval', default=False,
                        type=lambda v: str(v).strip().lower() in ('true', '1', 'yes', 't'),
                        help='do eval (accepts true/false)')
    args = parser.parse_args()
    main(args)
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -32,7 +32,7 @@ def eval(pred_path, label_path):
if __name__ == '__main__':
with open('pre_test.sh', 'r') as f:
pred_dir = f.readline()
pred_dir = pred_dir.split()[4].split('/')[-1]
pred_dir = pred_dir.split()[5].split('/')[-1]
pred_result = './predict/{}/test.txt'.format(pred_dir)
test_label_path = './data/test_ic50'
eval(pred_result, test_label_path)
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
CUDA_VISIBLE_DEVICES=1 python run_interaction.py --task=test --output=./predict/lr-1e-5-batch-32-e-10-layer3-0417-add-type-ids-and-mask-step-24711 --config=./config/config_layer_3.json --init=./model/lr-1e-5-batch-32-e-10-layer3-0417-add-type-ids-and-mask/epoch-2-step-24711-loss-0.9765299144620636.pth --do_eval=True
\ No newline at end of file
CUDA_VISIBLE_DEVICES=1 python run_interaction.py --task=test --b=64 --output=./predict/lr-1e-5-batch-32-e-10-layer6-0420-step-74133 --config=./config/config_layer_6.json --init=./model/lr-1e-5-batch-32-e-10-layer6-0420/epoch-8-step-74133-loss-0.8989318833651185.pth --do_eval=True
\ No newline at end of file
RMSE : 1.0240501538228575 ; Pearson Correlation Coefficient : 0.7363279193240799
\ No newline at end of file
RMSE : 1.0240501538228575 ; Pearson Correlation Coefficient : 0.7363279193240799
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.0055980079639106 ; Pearson Correlation Coefficient : 0.7307091262603383
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.1229353121987031 ; Pearson Correlation Coefficient : 0.6521638915583137
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.1092552623206697 ; Pearson Correlation Coefficient : 0.6740378152907927
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.0635720063061838 ; Pearson Correlation Coefficient : 0.698596143686921
\ No newline at end of file
RMSE : 1.0635720063061838 ; Pearson Correlation Coefficient : 0.698596143686921
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论