提交 590b8bca 作者: 朱学凯

add attentiondta

上级 a9e031a9
# Auto detect text files and perform LF normalization
* text=auto
import tensorflow as tf
import pandas as pd
import numpy as np
import DTA_model as model
import os
MAX_SEQ_LEN = 1200
MAX_SMI_LEN = 100
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score # R square
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
def calculateMSE(X, Y):
    """Return the mean squared error between two equal-length sequences.

    Args:
        X: sequence of values (e.g. predictions).
        Y: sequence of values compared against X; must be at least as long
            as X (shorter Y raises IndexError, as in the original).

    Returns:
        float: mean of the squared element-wise differences.

    Raises:
        ZeroDivisionError: if X is empty (unchanged from the original).
    """
    # sum() over a generator replaces the original manual append/accumulate
    # loop; indexing by range(len(X)) preserves the original's exact
    # behavior for mismatched lengths.
    return sum((Y[i] - X[i]) ** 2 for i in range(len(X))) / len(X)
def parser(record):
    """Deserialize one serialized TFRecord example into tensors.

    Returns a (drug, protein, affinity) triple: the drug SMILES indices
    (int32, length MAX_SMI_LEN), the protein sequence indices (int32,
    length MAX_SEQ_LEN) and the affinity label (float32, length 1).
    """
    feature_spec = {
        'drug': tf.FixedLenFeature([MAX_SMI_LEN], dtype=tf.int64),
        'protein': tf.FixedLenFeature([MAX_SEQ_LEN], dtype=tf.int64),
        'affinity': tf.FixedLenFeature([1], dtype=tf.float32)
    }
    example = tf.parse_single_example(
        serialized=record, features=feature_spec)
    # Downstream embedding lookups take int32 indices; the label stays float32.
    drug = tf.cast(example['drug'], tf.int32)
    protein = tf.cast(example['protein'], tf.int32)
    return drug, protein, example['affinity']
def test(file, test_path):
    """Evaluate a trained AttentionDTA model on one TFRecord test set.

    Args:
        file: dataset name, used to build the results directory
            ("./results/<file>/"). BUG FIX: the original ignored this
            parameter and read the module global ``dataname`` instead,
            which only worked because every caller set that global first.
        test_path: path to the TFRecord file holding the test examples.

    Side effects:
        Restores the latest checkpoint from ./train/model0/ and writes one
        prediction per line to ./results/<file>/test.txt.
    """
    with tf.Graph().as_default():
        dataset = tf.data.TFRecordDataset(test_path)
        dataset = dataset.map(parser)
        dataset = dataset.batch(batch_size=3)
        iterator = dataset.make_initializable_iterator()
        drug_to_embeding, proteins_to_embeding, labels_batch = \
            iterator.get_next()
        # keep_prob=1 / trainlabel=0 puts the model in inference mode.
        _, _, test_label = model.inference(
            drug_to_embeding,
            proteins_to_embeding,
            regularizer=None,
            keep_prob=1,
            trainlabel=0)
        mean_squared_eror = tf.losses.mean_squared_error(
            test_label, labels_batch)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(iterator.initializer)
            ckpt = tf.train.get_checkpoint_state("./train/model0/")
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
            predictions_eval = []
            labels_eval = []
            try:
                while True:
                    # Only predictions and labels are kept; the inputs and
                    # the per-batch MSE are evaluated but unused (kept for
                    # parity with the original run list).
                    _, _, p, l, _ = sess.run(
                        [drug_to_embeding, proteins_to_embeding,
                         test_label, labels_batch, mean_squared_eror])
                    predictions_eval.append(p)
                    labels_eval.append(l)
            except tf.errors.OutOfRangeError:
                pass  # dataset exhausted
            predictions_eval = np.concatenate(predictions_eval)
            labels_eval = np.concatenate(labels_eval)
            labels_eval.resize([labels_eval.shape[0], 1])
            # Use the `file` argument (not the global `dataname`) so the
            # function no longer depends on caller-set module state.
            RESULT_PATH = "./results/" + file + "/"
            if os.path.exists(RESULT_PATH) is False:
                os.makedirs(RESULT_PATH)
            with open(RESULT_PATH + "test.txt", "w") as out:
                for pred in predictions_eval:
                    out.write(str(pred[0]) + '\n')
            print("----------------test over-----------------")
if __name__ == '__main__':
    # Evaluate each held-out dataset in turn. `dataname` stays a module
    # global, assigned before every call (the original relied on this).
    # "davis" was evaluated in an earlier revision; kept commented out.
    # dataname = "davis"
    for dataname in ("test", "kinase", "GPCR", "ER", "channel"):
        test_path = "./tfrecord/" + dataname + "/file.tfrecord"
        test(dataname, test_path)
import tensorflow as tf
import numpy as np
import DTA_model as model
# from tensorflow.python.client import timeline
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"  # pin training to GPU 5
# dataname = "davis"
# dataname = "deepaffinity"
# 5-fold cross-validation
# Number of cross-validation folds/models to train (1 = a single model).
cross_num = 1
LEARNING_RATE_BASE = 0.0001  # Adam learning rate
# REGULARIZATION_RATE = 0.00001
EPOCH = 75  # number of passes over the training TFRecord
#
# if dataname == "kiba":
# batch_size = 100
# TESTNUM = (118256/5)*4/100
dataname = "train"  # dataset name; selects tfrecord, log and model dirs
batch_size = 64
# Steps per epoch for a 4/5 training split (unused in the current loop).
TESTNUM = (30056/5)*4/100
MAX_SEQ_LEN = 1200  # fixed protein sequence length in the TFRecords
MAX_SMI_LEN = 100  # fixed SMILES string length in the TFRecords
Train_path = "./tfrecord/" + dataname + "/file.tfrecord"
MODEL_SAVE_PATH = "./" + dataname + "/model%d/"  # %d = fold index
MODEL_NAME = "model.ckpt"
def parser(record):
    """Parse a serialized TFRecord example into (drug, protein, affinity).

    NOTE(review): duplicates the parser in DTA_test.py; both deserialize
    int64 index vectors plus a float32 affinity label.
    """
    schema = {
        'drug': tf.FixedLenFeature([MAX_SMI_LEN], dtype=tf.int64),
        'protein': tf.FixedLenFeature([MAX_SEQ_LEN], dtype=tf.int64),
        'affinity': tf.FixedLenFeature([1], dtype=tf.float32)
    }
    parsed = tf.parse_single_example(serialized=record, features=schema)
    # Cast index vectors to int32 for embedding lookups downstream.
    return (tf.cast(parsed['drug'], tf.int32),
            tf.cast(parsed['protein'], tf.int32),
            parsed['affinity'])
def train(num, train_path):
    """Train one AttentionDTA model on a TFRecord training set.

    Args:
        num: fold/model index; selects the checkpoint and log directories.
        train_path: path to the TFRecord file with training examples.

    Side effects:
        Writes TensorBoard summaries and a text log under
        ./<dataname>/path/to/log<num>/ and saves a checkpoint to
        MODEL_SAVE_PATH % num whenever the running mean training MSE
        improves.
    """
    with tf.variable_scope("input"):
        dataset = tf.data.TFRecordDataset(train_path)
        dataset = dataset.map(parser)
        dataset = dataset.repeat(EPOCH).shuffle(500).batch(
            batch_size=batch_size)
        train_iterator = dataset.make_initializable_iterator()
        train_drug, train_proteins_to_embeding, train_labels_batch = \
            train_iterator.get_next()
    # keep_prob=0.9 / trainlabel=1 enables dropout and training-mode
    # batch-norm inside the model.
    _, _, train_label = model.inference(
        train_drug,
        train_proteins_to_embeding,
        regularizer=None, keep_prob=0.9, trainlabel=1
    )
    global_step = tf.Variable(0, trainable=False)
    with tf.name_scope("train_loss_function"):
        mean_squared_eror = tf.losses.mean_squared_error(
            train_label, train_labels_batch)
        tf.summary.scalar("mean_squared_eror", mean_squared_eror)
    with tf.name_scope("train_step"):
        learning_rate = LEARNING_RATE_BASE
        # Run UPDATE_OPS first so batch-norm moving statistics refresh
        # before each optimizer step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_step = tf.train.AdamOptimizer(learning_rate).minimize(
                mean_squared_eror, global_step=global_step)
        with tf.control_dependencies([train_step]):
            train_op = tf.no_op(name='train')
    merged = tf.summary.merge_all()
    summary_write = tf.summary.FileWriter(
        "./" + dataname + "/path/to/log%d" %
        num, tf.get_default_graph())
    # Checkpoint trainable variables plus batch-norm moving averages.
    var_list = [var for var in tf.global_variables() if "moving" in var.name]
    var_list += tf.trainable_variables()
    saver = tf.train.Saver(var_list=var_list, max_to_keep=20)
    config = tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess, \
            open("./" + dataname + "/path/to/log%d/log.txt" % num, "w") as f:
        print("beginning training")
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))
        sess.run(train_iterator.initializer)
        step = 0
        best_mse = 100  # lowest running-mean MSE seen so far
        trainMSElist = []
        try:
            while True:
                step += 1
                # NOTE(review): FULL_TRACE profiling on every step is
                # expensive; kept because the original did the same.
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _, MSE, now_step = sess.run(
                    [merged, train_op, mean_squared_eror, global_step],
                    options=run_options, run_metadata=run_metadata)
                # FIX: the original bound this line to a local named `str`,
                # shadowing the builtin.
                log_line = "%s-model:%d-step:%d;train_MSE:%g;" % (
                    dataname, num, now_step, MSE)
                f.write(log_line + "\n")
                trainMSElist.append(MSE)
                if step % 10 == 0:
                    summary_write.add_summary(summary, now_step)
                    summary_write.add_run_metadata(
                        run_metadata, tag=("step%d" % step),
                        global_step=step)
                    # Mean MSE over the steps since the last report
                    # (sum()/len() replaces the original manual loop).
                    trainMSE = sum(trainMSElist) / len(trainMSElist)
                    print(
                        "%s-model:%d-epoch:%d;train_MSE:%g;" %
                        (dataname, num, now_step, trainMSE))
                    trainMSElist = []
                    if trainMSE < best_mse:
                        saver.save(
                            sess,
                            os.path.join(MODEL_SAVE_PATH % num, MODEL_NAME),
                            global_step=global_step)
                        best_mse = trainMSE
                        print("save model")
        except tf.errors.OutOfRangeError:
            pass  # training data exhausted after EPOCH repeats
        summary_write.close()
def main(argv=None):
    """Entry point: train `cross_num` models, resetting the graph between folds."""
    for fold in range(cross_num):
        tf.reset_default_graph()
        save_dir = MODEL_SAVE_PATH % fold
        if os.path.exists(save_dir) is False:
            os.makedirs(save_dir)
        print("The No.%d model" % fold)
        train(fold, Train_path)
if __name__ == '__main__':
    tf.app.run()
# AttentionDTA_BIBM
AttentionDTA: prediction of drug–target binding affinity using an attention model. Paper: https://ieeexplore.ieee.org/abstract/document/8983125
This repository contains the source code and the data.
## AttentionDTA
<div align="center">
<p><img src="model.jpg" width="600" /></p>
</div>
## Setup and dependencies
Dependencies:
- python 3.6
- tensorflow >=1.9
- numpy
## Resources:
+ README.md: this file.
+ tfrecord: The original data set and data set processing code are saved in this folder.
+ davis_div.txt: Under the 5-fold cross-validation setting, there is a division of the training set and the test set of the davis dataset.
+ kiba_div.txt: Under the 5-fold cross-validation setting, there is a division of the training set and the test set of the kiba dataset.
+ davis_str_all.txt
+ kiba_str_all.txt
+ dataset.py: create data in tfrecord format according to (kiba/davis)_div.txt
+ DTA_train.py: train an AttentionDTA model.
+ DTA_model.py: AttentionDTA model architecture
+ DTA_test.py: test trained models
# Step-by-step running:
## 1. Create data in tfrecord format
python dataset.py
## 2. Train a prediction model
python DTA_train.py
To train a model using training data.
## 3. Predict affinity with trained models
python DTA_test.py
\ No newline at end of file
RMSE : 1.977229018807965 ; Pearson Correlation Coefficient : 0.13056623322435212
\ No newline at end of file
RMSE : 1.8548911640057175 ; Pearson Correlation Coefficient : 0.19903442590966353
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.7273183993658947 ; Pearson Correlation Coefficient : 0.04453206966512105
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.756065730513885 ; Pearson Correlation Coefficient : 0.23311044821975194
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.1846278906245926 ; Pearson Correlation Coefficient : 0.6965380003561024
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论