提交 a9e031a9 作者: 朱学凯

add deepdta

上级 33c9c6b2
AttentionDTA_BIBM @ d8c8a667
Subproject commit d8c8a6673c75c5e457ccf5bc0c187b131a06b7a1
DeepDTA @ 2c9cbafd
Subproject commit 2c9cbafdfb383f2f03bcea4b231b90a072e65b15
# About DeepDTA: deep drug-target binding affinity prediction
The approach used in this work is the modeling of protein sequences and compound 1D representations (SMILES) with convolutional neural networks (CNNs) to predict the binding affinity value of drug-target pairs.
![Figure](https://github.com/hkmztrk/DeepDTA/blob/master/docs/figures/deepdta.PNG)
# Installation
## Data
Please see the [readme](https://github.com/hkmztrk/DeepDTA/blob/master/data/README.md) for detailed explanation.
## Requirements
You'll need to install the following in order to run the code.
* [Python 3.4 or higher](https://www.python.org/downloads/)
* [Keras 2.x](https://pypi.org/project/Keras/)
* [Tensorflow 1.x](https://www.tensorflow.org/install/)
* numpy
* matplotlib
You have to place "data" folder under "source" directory.
# Usage
```
python run_experiments.py --num_windows 32 \
--seq_window_lengths 8 12 \
--smi_window_lengths 4 8 \
--batch_size 256 \
--num_epoch 100 \
--max_seq_len 1000 \
--max_smi_len 100 \
--dataset_path 'data/kiba/' \
--problem_type 1 \
--log_dir 'logs/'
```
**For citation:**
```
@article{ozturk2018deepdta,
title={DeepDTA: deep drug--target binding affinity prediction},
author={{\"O}zt{\"u}rk, Hakime and {\"O}zg{\"u}r, Arzucan and Ozkirimli, Elif},
journal={Bioinformatics},
volume={34},
number={17},
pages={i821--i829},
year={2018},
publisher={Oxford University Press}
}
```
import argparse
import os
def argparser():
    """Build and parse the command-line flags for DeepDTA experiments.

    Unknown arguments are ignored (``parse_known_args``), so this is safe to
    call from wrappers that add their own flags.

    Returns:
        argparse.Namespace holding the parsed (or default) flag values.
    """
    parser = argparse.ArgumentParser()
    # --- model hyper-parameters ---
    parser.add_argument(
        '--seq_window_lengths',
        type=int,
        nargs='+',
        # Fixed typo ("seperated") and the example flag name, which
        # previously referenced a non-existent --window_lengths.
        help='Space separated list of motif filter lengths. (ex, --seq_window_lengths 4 8 12)'
    )
    parser.add_argument(
        '--smi_window_lengths',
        type=int,
        nargs='+',
        help='Space separated list of motif filter lengths. (ex, --smi_window_lengths 4 8)'
    )
    parser.add_argument(
        '--num_windows',
        type=int,
        nargs='+',
        help='Space separated list of the number of motif filters corresponding to length list. (ex, --num_windows 100 200 100)'
    )
    parser.add_argument(
        '--num_hidden',
        type=int,
        default=0,
        help='Number of neurons in hidden layer.'
    )
    parser.add_argument(
        '--num_classes',
        type=int,
        default=0,
        help='Number of classes (families).'
    )
    parser.add_argument(
        '--max_seq_len',
        type=int,
        default=0,
        help='Maximum length of input protein sequences.'
    )
    parser.add_argument(
        '--max_smi_len',
        type=int,
        default=0,
        # Was a copy-paste of --max_seq_len's help text.
        help='Maximum length of input SMILES strings.'
    )
    # --- training hyper-parameters ---
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.001,
        help='Initial learning rate.'
    )
    parser.add_argument(
        '--num_epoch',
        type=int,
        default=100,
        help='Number of epochs to train.'
    )
    parser.add_argument(
        '--batch_size',
        type=int,
        default=256,
        help='Batch size. Must divide evenly into the dataset sizes.'
    )
    parser.add_argument(
        '--dataset_path',
        type=str,
        default='/data/kiba/',
        help='Directory for input data.'
    )
    parser.add_argument(
        '--problem_type',
        type=int,
        default=1,
        help='Type of the prediction problem (1-4)'
    )
    parser.add_argument(
        '--binary_th',
        type=float,
        default=0.0,
        help='Threshold to split data into binary classes'
    )
    parser.add_argument(
        '--is_log',
        type=int,
        default=0,
        help='Use log transformation for Y.'
    )
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default='',
        help='Path to write checkpoint file.'
    )
    parser.add_argument(
        '--log_dir',
        type=str,
        default='/tmp',
        help='Directory for log data.'
    )
    parser.add_argument(
        '--out',
        type=str,
        default='/pred',
        # Was a copy-paste of --log_dir's help text.
        help='Directory for prediction output.'
    )
    parser.add_argument(
        '--model',
        type=str,
        default='/model',
        # Was a copy-paste of --log_dir's help text.
        help='Directory for saved models.'
    )
    FLAGS, unparsed = parser.parse_known_args()
    # check validity
    #assert( len(FLAGS.window_lengths) == len(FLAGS.num_windows) )
    return FLAGS
def logging(msg, FLAGS):
    """Append one message line to ``log.txt`` inside ``FLAGS.log_dir``.

    NOTE: this shadows the stdlib ``logging`` module name; callers in this
    project rely on the name, so it is kept as-is.
    """
    log_file = os.path.join(FLAGS.log_dir, "log.txt")
    with open(log_file, "a") as handle:
        handle.write("%s\n" % msg)
    #print(msg)
import sys, re, math, time
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
import collections
from collections import OrderedDict
from matplotlib.pyplot import cm
from spacy import load
#from keras.preprocessing.sequence import pad_sequences
## ######################## ##
#
# Define CHARSET, CHARLEN
#
## ######################## ##
# CHARPROTSET = { 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, \
# 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, \
# 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, 'X': 20, \
# 'O': 20, 'U': 20,
# 'B': (2, 11),
# 'Z': (3, 13),
# 'J': (7, 9) }
# CHARPROTLEN = 21
# Integer codes for the 25 protein sequence letters (1-based; 0 is reserved
# for padding). Includes the ambiguity codes B/Z/J/X/O/U as distinct symbols.
CHARPROTSET = { "A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6,
"F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
"O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18,
"U": 19, "T": 20, "W": 21,
"V": 22, "Y": 23, "X": 24,
"Z": 25 }
# Size of the protein alphabet above.
CHARPROTLEN = 25
# Integer codes for characters of *canonical* SMILES strings (1-based;
# 0 is reserved for padding).
CHARCANSMISET = { "#": 1, "%": 2, ")": 3, "(": 4, "+": 5, "-": 6,
".": 7, "1": 8, "0": 9, "3": 10, "2": 11, "5": 12,
"4": 13, "7": 14, "6": 15, "9": 16, "8": 17, "=": 18,
"A": 19, "C": 20, "B": 21, "E": 22, "D": 23, "G": 24,
"F": 25, "I": 26, "H": 27, "K": 28, "M": 29, "L": 30,
"O": 31, "N": 32, "P": 33, "S": 34, "R": 35, "U": 36,
"T": 37, "W": 38, "V": 39, "Y": 40, "[": 41, "Z": 42,
"]": 43, "_": 44, "a": 45, "c": 46, "b": 47, "e": 48,
"d": 49, "g": 50, "f": 51, "i": 52, "h": 53, "m": 54,
"l": 55, "o": 56, "n": 57, "s": 58, "r": 59, "u": 60,
"t": 61, "y": 62}
# Size of the canonical SMILES alphabet above.
CHARCANSMILEN = 62
# Integer codes for characters of *isomeric* SMILES strings, which may also
# contain stereo/charge markers ("/", "\\", "@"). 1-based; 0 is padding.
CHARISOSMISET = {"#": 29, "%": 30, ")": 31, "(": 1, "+": 32, "-": 33, "/": 34, ".": 2,
"1": 35, "0": 3, "3": 36, "2": 4, "5": 37, "4": 5, "7": 38, "6": 6,
"9": 39, "8": 7, "=": 40, "A": 41, "@": 8, "C": 42, "B": 9, "E": 43,
"D": 10, "G": 44, "F": 11, "I": 45, "H": 12, "K": 46, "M": 47, "L": 13,
"O": 48, "N": 14, "P": 15, "S": 49, "R": 16, "U": 50, "T": 17, "W": 51,
"V": 18, "Y": 52, "[": 53, "Z": 19, "]": 54, "\\": 20, "a": 55, "c": 56,
"b": 21, "e": 57, "d": 22, "g": 58, "f": 23, "i": 59, "h": 24, "m": 60,
"l": 25, "o": 61, "n": 26, "s": 62, "r": 27, "u": 63, "t": 28, "y": 64}
# Size of the isomeric SMILES alphabet above.
CHARISOSMILEN = 64
## ######################## ##
#
# Encoding Helpers
#
## ######################## ##
# Y = -(np.log10(Y/(math.pow(math.e,9))))
def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SMILES string.

    Returns a (MAX_SMI_LEN, alphabet_size) array; row i has a single 1 in
    the column for character i (codes in smi_ch_ind are 1-based, hence -1).
    Characters past MAX_SMI_LEN are dropped; short inputs leave zero rows.
    """
    encoded = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))
    for pos, symbol in enumerate(line[:MAX_SMI_LEN]):
        encoded[pos, smi_ch_ind[symbol] - 1] = 1
    return encoded
def one_hot_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """One-hot encode a protein sequence.

    Returns a (MAX_SEQ_LEN, alphabet_size) array; row i has a single 1 in
    the column for residue i (codes in smi_ch_ind are 1-based, hence -1).
    Residues past MAX_SEQ_LEN are dropped; short inputs leave zero rows.
    """
    encoded = np.zeros((MAX_SEQ_LEN, len(smi_ch_ind)))
    for pos, residue in enumerate(line[:MAX_SEQ_LEN]):
        encoded[pos, smi_ch_ind[residue] - 1] = 1
    return encoded
def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Integer-encode a SMILES string.

    Returns a length-MAX_SMI_LEN vector of character codes from smi_ch_ind;
    unused trailing positions stay 0 (the padding code). Characters past
    MAX_SMI_LEN are dropped.
    """
    codes = np.zeros(MAX_SMI_LEN)
    for pos, symbol in enumerate(line[:MAX_SMI_LEN]):
        codes[pos] = smi_ch_ind[symbol]
    return codes
def label_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """Integer-encode a protein sequence.

    Returns a length-MAX_SEQ_LEN vector of residue codes from smi_ch_ind;
    unused trailing positions stay 0 (the padding code). Residues past
    MAX_SEQ_LEN are dropped.
    """
    codes = np.zeros(MAX_SEQ_LEN)
    for pos, residue in enumerate(line[:MAX_SEQ_LEN]):
        codes[pos] = smi_ch_ind[residue]
    return codes
## ######################## ##
#
# DATASET Class
#
## ######################## ##
# works for large dataset
class DataSet(object):
    """Loader for the DeepDTA benchmark datasets (kiba/davis folder layout).

    Holds the encoding alphabets and maximum lengths; ``read_sets`` returns
    the cross-validation fold indices and ``parse_data`` returns the encoded
    drugs, proteins and affinity matrix.
    """

    def __init__(self, fpath, setting_no, seqlen, smilen, need_shuffle=False):
        # Maximum encoded lengths for protein sequences / SMILES strings.
        self.SEQLEN = seqlen
        self.SMILEN = smilen
        #self.NCLASSES = n_classes
        self.charseqset = CHARPROTSET
        self.charseqset_size = CHARPROTLEN
        self.charsmiset = CHARISOSMISET  ###HERE CAN BE EDITED
        self.charsmiset_size = CHARISOSMILEN
        self.PROBLEMSET = setting_no
        # read raw file
        # self._raw = self.read_sets( FLAGS)
        # iteration flags
        # self._num_data = len(self._raw)

    def read_sets(self, FLAGS):  ### fpath should be the dataset folder /kiba/ or /davis/
        """Return (test_fold, train_folds) index lists for the CV setting."""
        fpath = FLAGS.dataset_path
        setting_no = FLAGS.problem_type
        print("Reading %s start" % fpath)
        # Use context managers so the fold files are closed deterministically
        # (the original passed open(...) straight to json.load and leaked
        # the file objects).
        with open(fpath + "folds/test_fold_setting" + str(setting_no) + ".txt") as f:
            test_fold = json.load(f)
        with open(fpath + "folds/train_fold_setting" + str(setting_no) + ".txt") as f:
            train_folds = json.load(f)
        return test_fold, train_folds

    def parse_data(self, FLAGS, with_label=True):
        """Load and encode the dataset.

        Returns:
            XD: list of encoded SMILES (label vectors or one-hot matrices).
            XT: list of encoded protein sequences (same encoding choice).
            Y:  affinity matrix (optionally log-transformed).
        """
        fpath = FLAGS.dataset_path
        print("Read %s start" % fpath)
        with open(fpath + "ligands_can.txt") as f:
            ligands = json.load(f, object_pairs_hook=OrderedDict)
        with open(fpath + "proteins.txt") as f:
            proteins = json.load(f, object_pairs_hook=OrderedDict)
        with open(fpath + "Y", "rb") as f:
            Y = pickle.load(f, encoding='latin1')  ### TODO: read from raw
        if FLAGS.is_log:
            # Convert nM affinities to a pKd-style log scale.
            Y = -(np.log10(Y / (math.pow(10, 9))))
        XD = []
        XT = []
        if with_label:
            # Integer label encoding (for embedding layers).
            for d in ligands.keys():
                XD.append(label_smiles(ligands[d], self.SMILEN, self.charsmiset))
            for t in proteins.keys():
                XT.append(label_sequence(proteins[t], self.SEQLEN, self.charseqset))
        else:
            # One-hot encoding.
            for d in ligands.keys():
                XD.append(one_hot_smiles(ligands[d], self.SMILEN, self.charsmiset))
            for t in proteins.keys():
                XT.append(one_hot_sequence(proteins[t], self.SEQLEN, self.charseqset))
        return XD, XT, Y
class DataSet_for_new(object):
    """Variant of DataSet that reads plain-text inputs (one record per line)
    instead of the original JSON/pickle benchmark layout.

    NOTE(review): unlike DataSet, ``parse_data`` indexes FLAGS.dataset_path
    with "ligand", "protein" and "y" keys, so it is expected to be a mapping
    of file paths rather than a directory string -- confirm against callers.
    """

    def __init__(self, fpath, setting_no, seqlen, smilen, need_shuffle=False):
        # Maximum encoded lengths for protein sequences / SMILES strings.
        self.SEQLEN = seqlen
        self.SMILEN = smilen
        #self.NCLASSES = n_classes
        self.charseqset = CHARPROTSET
        self.charseqset_size = CHARPROTLEN
        self.charsmiset = CHARISOSMISET  ###HERE CAN BE EDITED
        self.charsmiset_size = CHARISOSMILEN
        self.PROBLEMSET = setting_no
        # read raw file
        # self._raw = self.read_sets( FLAGS)
        # iteration flags
        # self._num_data = len(self._raw)

    def read_sets(self, FLAGS):  ### fpath should be the dataset folder /kiba/ or /davis/
        """Return (test_fold, train_folds) index lists for the CV setting."""
        fpath = FLAGS.dataset_path
        setting_no = FLAGS.problem_type
        print("Reading %s start" % fpath)
        # Context managers close the fold files deterministically (the
        # original leaked the file objects opened inline).
        with open(fpath + "folds/test_fold_setting" + str(setting_no) + ".txt") as f:
            test_fold = json.load(f)
        with open(fpath + "folds/train_fold_setting" + str(setting_no) + ".txt") as f:
            train_folds = json.load(f)
        return test_fold, train_folds

    def parse_data(self, FLAGS, with_label=True):
        """Load and encode the dataset from plain-text files.

        Returns:
            XD: list of integer-encoded SMILES vectors.
            XT: list of integer-encoded protein sequence vectors.
            Y:  list of label strings, one per line of the "y" file.
        """
        fpath = FLAGS.dataset_path
        print("Read %s start" % fpath)

        def load_file(file):
            # One stripped line per record.
            with open(file, 'r') as f:
                return [line.strip() for line in f]

        ligands = load_file(fpath["ligand"])
        proteins = load_file(fpath["protein"])
        # Changed data input: labels now come from a plain text file instead
        # of the pickled affinity matrix.
        Y = load_file(fpath["y"])
        XD = []
        XT = []
        if with_label:
            for d in ligands:
                XD.append(label_smiles(d, self.SMILEN, self.charsmiset))
            for t in proteins:
                XT.append(label_sequence(t, self.SEQLEN, self.charseqset))
        # One-hot branch of the original DataSet is intentionally not
        # supported here; with_label=False returns empty XD/XT.
        return XD, XT, Y
import numpy as np
def get_aupr(Y, P):
    """Compute area under the precision-recall curve via the external auc.jar.

    Y: true values (array or scipy sparse matrix); entries > 0 are positives.
    P: predicted scores, same shape as Y.
    Returns the AUPR parsed from auc.jar's output.
    NOTE: requires java and auc.jar in the working directory and writes
    temp.txt / foo.txt there as scratch files.
    """
    # Local import: the original called subprocess without ever importing it,
    # which raised NameError at call time.
    import subprocess
    if hasattr(Y, 'A'):  # densify scipy sparse matrices
        Y = Y.A
    if hasattr(P, 'A'):
        P = P.A
    Y = np.where(Y > 0, 1, 0).ravel()
    P = P.ravel()
    # with-blocks ensure the scratch files are flushed/closed even on error.
    with open("temp.txt", 'w') as f:
        for score, label in zip(P, Y):
            f.write("%f %d\n" % (score, label))
    with open("foo.txt", 'w') as f:
        subprocess.call(["java", "-jar", "auc.jar", "temp.txt", "list"], stdout=f)
    with open("foo.txt") as f:
        lines = f.readlines()
    # auc.jar prints the AUPR as the last token of its second-to-last line.
    return float(lines[-2].split()[-1])
def get_cindex(Y, P):
    """Concordance index of predictions P against true affinities Y.

    For every pair with Y[i] > Y[j], scores 1 if P agrees (P[i] > P[j]),
    0.5 on a tie, 0 otherwise; returns the mean over such pairs, or 0 if
    no comparable pair exists.
    """
    summ = 0
    pair = 0
    for i in range(1, len(Y)):
        # j < i, so i == j is impossible; the original's `i is not j` guard
        # (identity comparison on ints -- a bug pattern) was redundant.
        for j in range(i):
            if Y[i] > Y[j]:
                pair += 1
                summ += 1 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])
    # `!=`, not `is not`: identity comparison on small ints only works by
    # accident of CPython caching and warns on >=3.8.
    if pair != 0:
        return summ / pair
    else:
        return 0
def r_squared_error(y_obs, y_pred):
    """Squared Pearson correlation (r^2) between observed and predicted values.

    Identical math to the original, but uses scalar means directly instead of
    building a full list of repeated means per element (an O(n) allocation
    that NumPy broadcasting makes unnecessary).
    """
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    y_obs_mean = np.mean(y_obs)
    y_pred_mean = np.mean(y_pred)
    # Numerator: squared covariance-like cross term.
    mult = sum((y_pred - y_pred_mean) * (y_obs - y_obs_mean))
    mult = mult * mult
    # Denominator: product of the two sums of squared deviations.
    y_obs_sq = sum((y_obs - y_obs_mean) * (y_obs - y_obs_mean))
    y_pred_sq = sum((y_pred - y_pred_mean) * (y_pred - y_pred_mean))
    return mult / float(y_obs_sq * y_pred_sq)
def get_k(y_obs, y_pred):
    """Slope of the least-squares regression of y_obs on y_pred through the origin."""
    obs = np.array(y_obs)
    pred = np.array(y_pred)
    return sum(obs * pred) / float(sum(pred * pred))
def squared_error_zero(y_obs, y_pred):
    """r0^2: coefficient of determination for the through-origin fit k*y_pred.

    Identical math to the original, but uses the scalar mean directly instead
    of building a list of repeated means per element.
    """
    k = get_k(y_obs, y_pred)
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    y_obs_mean = np.mean(y_obs)
    # Residual sum of squares around the through-origin fit.
    upp = sum((y_obs - (k * y_pred)) * (y_obs - (k * y_pred)))
    # Total sum of squares around the observed mean.
    down = sum((y_obs - y_obs_mean) * (y_obs - y_obs_mean))
    return 1 - (upp / float(down))
def get_rm2(ys_orig, ys_line):
    """Modified squared correlation metric rm^2 = r^2 * (1 - sqrt(|r^2^2 - r0^2^2|))."""
    r2 = r_squared_error(ys_orig, ys_line)
    r02 = squared_error_zero(ys_orig, ys_line)
    penalty = np.sqrt(np.absolute((r2 * r2) - (r02 * r02)))
    return r2 * (1 - penalty)
\ No newline at end of file
python run_experiments.py --num_windows 32 \
--seq_window_lengths 8 12 \
--smi_window_lengths 4 8 \
--batch_size 256 \
--num_epoch 100 \
--max_seq_len 1000 \
--max_smi_len 100 \
--dataset_path 'data/kiba/' \
--problem_type 1 \
--is_log 0 \
--log_dir 'logs/'
RMSE : 1.485262829572667 ; Pearson Correlation Coefficient : 0.17810684496134926
\ No newline at end of file
RMSE : 1.4034081434737957 ; Pearson Correlation Coefficient : 0.2416971016625298
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.4512754880382956 ; Pearson Correlation Coefficient : 0.11439684637165645
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.2566115226505494 ; Pearson Correlation Coefficient : 0.3483553292295794
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 0.9898476924424743 ; Pearson Correlation Coefficient : 0.7531926430165059
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
python run_baseline.py
\ No newline at end of file
MolTrans @ 47ac16b8
Subproject commit 47ac16b8c158b080ba6cdaec74cd7aa9c1332b73
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论