提交 590b8bca 作者: 朱学凯

add attentiondta

上级 a9e031a9
# Auto detect text files and perform LF normalization
* text=auto
import tensorflow as tf
import pandas as pd
import numpy as np
import DTA_model as model
import os
MAX_SEQ_LEN = 1200
MAX_SMI_LEN = 100
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score # R square
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
def calculateMSE(X, Y):
    """Return the mean squared error between two equal-length sequences.

    Args:
        X: sequence of values (e.g. predictions).
        Y: sequence of values compared against X; must be at least as long
            as X (shorter Y raises IndexError, as in the original).

    Returns:
        float: mean of the squared element-wise differences.

    Raises:
        ZeroDivisionError: if X is empty (unchanged from the original).
    """
    # sum() over a generator replaces the original manual append/accumulate
    # loop; indexing by range(len(X)) preserves the original's exact
    # behavior for mismatched lengths.
    return sum((Y[i] - X[i]) ** 2 for i in range(len(X))) / len(X)
def parser(record):
    """Deserialize one serialized TFRecord example into tensors.

    Returns a (drug, protein, affinity) triple: the drug SMILES indices
    (int32, length MAX_SMI_LEN), the protein sequence indices (int32,
    length MAX_SEQ_LEN) and the affinity label (float32, length 1).
    """
    feature_spec = {
        'drug': tf.FixedLenFeature([MAX_SMI_LEN], dtype=tf.int64),
        'protein': tf.FixedLenFeature([MAX_SEQ_LEN], dtype=tf.int64),
        'affinity': tf.FixedLenFeature([1], dtype=tf.float32)
    }
    example = tf.parse_single_example(
        serialized=record, features=feature_spec)
    # Downstream embedding lookups take int32 indices; the label stays float32.
    drug = tf.cast(example['drug'], tf.int32)
    protein = tf.cast(example['protein'], tf.int32)
    return drug, protein, example['affinity']
def test(file, test_path):
    """Evaluate a trained AttentionDTA model on one TFRecord test set.

    Args:
        file: dataset name, used to build the results directory
            ("./results/<file>/"). BUG FIX: the original ignored this
            parameter and read the module global ``dataname`` instead,
            which only worked because every caller set that global first.
        test_path: path to the TFRecord file holding the test examples.

    Side effects:
        Restores the latest checkpoint from ./train/model0/ and writes one
        prediction per line to ./results/<file>/test.txt.
    """
    with tf.Graph().as_default():
        dataset = tf.data.TFRecordDataset(test_path)
        dataset = dataset.map(parser)
        dataset = dataset.batch(batch_size=3)
        iterator = dataset.make_initializable_iterator()
        drug_to_embeding, proteins_to_embeding, labels_batch = \
            iterator.get_next()
        # keep_prob=1 / trainlabel=0 puts the model in inference mode.
        _, _, test_label = model.inference(
            drug_to_embeding,
            proteins_to_embeding,
            regularizer=None,
            keep_prob=1,
            trainlabel=0)
        mean_squared_eror = tf.losses.mean_squared_error(
            test_label, labels_batch)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(iterator.initializer)
            ckpt = tf.train.get_checkpoint_state("./train/model0/")
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
            predictions_eval = []
            labels_eval = []
            try:
                while True:
                    # Only predictions and labels are kept; the inputs and
                    # the per-batch MSE are evaluated but unused (kept for
                    # parity with the original run list).
                    _, _, p, l, _ = sess.run(
                        [drug_to_embeding, proteins_to_embeding,
                         test_label, labels_batch, mean_squared_eror])
                    predictions_eval.append(p)
                    labels_eval.append(l)
            except tf.errors.OutOfRangeError:
                pass  # dataset exhausted
            predictions_eval = np.concatenate(predictions_eval)
            labels_eval = np.concatenate(labels_eval)
            labels_eval.resize([labels_eval.shape[0], 1])
            # Use the `file` argument (not the global `dataname`) so the
            # function no longer depends on caller-set module state.
            RESULT_PATH = "./results/" + file + "/"
            if os.path.exists(RESULT_PATH) is False:
                os.makedirs(RESULT_PATH)
            with open(RESULT_PATH + "test.txt", "w") as out:
                for pred in predictions_eval:
                    out.write(str(pred[0]) + '\n')
            print("----------------test over-----------------")
if __name__ == '__main__':
    # Evaluate each held-out dataset in turn. `dataname` stays a module
    # global, assigned before every call (the original relied on this).
    # "davis" was evaluated in an earlier revision; kept commented out.
    # dataname = "davis"
    for dataname in ("test", "kinase", "GPCR", "ER", "channel"):
        test_path = "./tfrecord/" + dataname + "/file.tfrecord"
        test(dataname, test_path)
import tensorflow as tf
import numpy as np
import DTA_model as model
# from tensorflow.python.client import timeline
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"  # pin training to GPU 5
# dataname = "davis"
# dataname = "deepaffinity"
# 5-fold cross-validation
# Number of cross-validation folds/models to train (1 = a single model).
cross_num = 1
LEARNING_RATE_BASE = 0.0001  # Adam learning rate
# REGULARIZATION_RATE = 0.00001
EPOCH = 75  # number of passes over the training TFRecord
#
# if dataname == "kiba":
# batch_size = 100
# TESTNUM = (118256/5)*4/100
dataname = "train"  # dataset name; selects tfrecord, log and model dirs
batch_size = 64
# Steps per epoch for a 4/5 training split (unused in the current loop).
TESTNUM = (30056/5)*4/100
MAX_SEQ_LEN = 1200  # fixed protein sequence length in the TFRecords
MAX_SMI_LEN = 100  # fixed SMILES string length in the TFRecords
Train_path = "./tfrecord/" + dataname + "/file.tfrecord"
MODEL_SAVE_PATH = "./" + dataname + "/model%d/"  # %d = fold index
MODEL_NAME = "model.ckpt"
def parser(record):
    """Parse a serialized TFRecord example into (drug, protein, affinity).

    NOTE(review): duplicates the parser in DTA_test.py; both deserialize
    int64 index vectors plus a float32 affinity label.
    """
    schema = {
        'drug': tf.FixedLenFeature([MAX_SMI_LEN], dtype=tf.int64),
        'protein': tf.FixedLenFeature([MAX_SEQ_LEN], dtype=tf.int64),
        'affinity': tf.FixedLenFeature([1], dtype=tf.float32)
    }
    parsed = tf.parse_single_example(serialized=record, features=schema)
    # Cast index vectors to int32 for embedding lookups downstream.
    return (tf.cast(parsed['drug'], tf.int32),
            tf.cast(parsed['protein'], tf.int32),
            parsed['affinity'])
def train(num, train_path):
    """Train one AttentionDTA model on a TFRecord training set.

    Args:
        num: fold/model index; selects the checkpoint and log directories.
        train_path: path to the TFRecord file with training examples.

    Side effects:
        Writes TensorBoard summaries and a text log under
        ./<dataname>/path/to/log<num>/ and saves a checkpoint to
        MODEL_SAVE_PATH % num whenever the running mean training MSE
        improves.
    """
    with tf.variable_scope("input"):
        dataset = tf.data.TFRecordDataset(train_path)
        dataset = dataset.map(parser)
        dataset = dataset.repeat(EPOCH).shuffle(500).batch(
            batch_size=batch_size)
        train_iterator = dataset.make_initializable_iterator()
        train_drug, train_proteins_to_embeding, train_labels_batch = \
            train_iterator.get_next()
    # keep_prob=0.9 / trainlabel=1 enables dropout and training-mode
    # batch-norm inside the model.
    _, _, train_label = model.inference(
        train_drug,
        train_proteins_to_embeding,
        regularizer=None, keep_prob=0.9, trainlabel=1
    )
    global_step = tf.Variable(0, trainable=False)
    with tf.name_scope("train_loss_function"):
        mean_squared_eror = tf.losses.mean_squared_error(
            train_label, train_labels_batch)
        tf.summary.scalar("mean_squared_eror", mean_squared_eror)
    with tf.name_scope("train_step"):
        learning_rate = LEARNING_RATE_BASE
        # Run UPDATE_OPS first so batch-norm moving statistics refresh
        # before each optimizer step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_step = tf.train.AdamOptimizer(learning_rate).minimize(
                mean_squared_eror, global_step=global_step)
        with tf.control_dependencies([train_step]):
            train_op = tf.no_op(name='train')
    merged = tf.summary.merge_all()
    summary_write = tf.summary.FileWriter(
        "./" + dataname + "/path/to/log%d" %
        num, tf.get_default_graph())
    # Checkpoint trainable variables plus batch-norm moving averages.
    var_list = [var for var in tf.global_variables() if "moving" in var.name]
    var_list += tf.trainable_variables()
    saver = tf.train.Saver(var_list=var_list, max_to_keep=20)
    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess, \
            open("./" + dataname + "/path/to/log%d/log.txt" % num, "w") as f:
        print("beginning training")
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))
        sess.run(train_iterator.initializer)
        step = 0
        best_mse = 100  # lowest running-mean MSE seen so far
        trainMSElist = []
        try:
            while True:
                step += 1
                # NOTE(review): FULL_TRACE profiling on every step is
                # expensive; kept because the original did the same.
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _, MSE, now_step = sess.run(
                    [merged, train_op, mean_squared_eror, global_step],
                    options=run_options, run_metadata=run_metadata)
                # FIX: the original bound this line to a local named `str`,
                # shadowing the builtin.
                log_line = "%s-model:%d-step:%d;train_MSE:%g;" % (
                    dataname, num, now_step, MSE)
                f.write(log_line + "\n")
                trainMSElist.append(MSE)
                if step % 10 == 0:
                    summary_write.add_summary(summary, now_step)
                    summary_write.add_run_metadata(
                        run_metadata, tag=("step%d" % step),
                        global_step=step)
                    # Mean MSE over the steps since the last report
                    # (sum()/len() replaces the original manual loop).
                    trainMSE = sum(trainMSElist) / len(trainMSElist)
                    print(
                        "%s-model:%d-epoch:%d;train_MSE:%g;" %
                        (dataname, num, now_step, trainMSE))
                    trainMSElist = []
                    if trainMSE < best_mse:
                        saver.save(
                            sess,
                            os.path.join(MODEL_SAVE_PATH % num, MODEL_NAME),
                            global_step=global_step)
                        best_mse = trainMSE
                        print("save model")
        except tf.errors.OutOfRangeError:
            pass  # training data exhausted after EPOCH repeats
        summary_write.close()
def main(argv=None):
    """Entry point: train `cross_num` models, resetting the graph between folds."""
    for fold in range(cross_num):
        tf.reset_default_graph()
        save_dir = MODEL_SAVE_PATH % fold
        if os.path.exists(save_dir) is False:
            os.makedirs(save_dir)
        print("The No.%d model" % fold)
        train(fold, Train_path)
if __name__ == '__main__':
    tf.app.run()
# AttentionDTA_BIBM
AttentionDTA: prediction of drug–target binding affinity using an attention model. Paper: https://ieeexplore.ieee.org/abstract/document/8983125
This repository contains the source code and the data.
## AttentionDTA
<div align="center">
<p><img src="model.jpg" width="600" /></p>
</div>
## Setup and dependencies
Dependencies:
- python 3.6
- tensorflow >=1.9
- numpy
## Resources:
+ README.md: this file.
+ tfrecord: The original data set and data set processing code are saved in this folder.
+ davis_div.txt: Under the 5-fold cross-validation setting, there is a division of the training set and the test set of the davis dataset.
+ kiba_div.txt: Under the 5-fold cross-validation setting, there is a division of the training set and the test set of the kiba dataset.
+ davis_str_all.txt
+ kiba_str_all.txt
+ dataset.py: create data in tfrecord format according to (kiba/davis)_div.txt
+ DTA_train.py: train an AttentionDTA model.
+ DTA_model.py: AttentionDTA model architecture
+ DTA_test.py: test trained models
# Step-by-step running:
## 1. Create data in tfrecord format
python dataset.py
## 2. Train a prediction model
python DTA_train.py
To train a model using training data.
## 3. Predict affinity with trained models
python DTA_test.py
\ No newline at end of file
RMSE : 1.977229018807965 ; Pearson Correlation Coefficient : 0.13056623322435212
\ No newline at end of file
RMSE : 1.8548911640057175 ; Pearson Correlation Coefficient : 0.19903442590966353
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.7273183993658947 ; Pearson Correlation Coefficient : 0.04453206966512105
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.756065730513885 ; Pearson Correlation Coefficient : 0.23311044821975194
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.1846278906245926 ; Pearson Correlation Coefficient : 0.6965380003561024
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论