提交 a9e031a9 作者: 朱学凯

add deepdta

上级 33c9c6b2
AttentionDTA_BIBM @ d8c8a667
Subproject commit d8c8a6673c75c5e457ccf5bc0c187b131a06b7a1
DeepDTA @ 2c9cbafd
Subproject commit 2c9cbafdfb383f2f03bcea4b231b90a072e65b15
# About DeepDTA: deep drug-target binding affinity prediction
The approach used in this work is the modeling of protein sequences and compound 1D representations (SMILES) with convolutional neural networks (CNNs) to predict the binding affinity value of drug-target pairs.
![Figure](https://github.com/hkmztrk/DeepDTA/blob/master/docs/figures/deepdta.PNG)
# Installation
## Data
Please see the [readme](https://github.com/hkmztrk/DeepDTA/blob/master/data/README.md) for detailed explanation.
## Requirements
You'll need to install the following in order to run the code.
* [Python 3.4 or higher](https://www.python.org/downloads/)
* [Keras 2.x](https://pypi.org/project/Keras/)
* [Tensorflow 1.x](https://www.tensorflow.org/install/)
* numpy
* matplotlib
You have to place "data" folder under "source" directory.
# Usage
```
python run_experiments.py --num_windows 32 \
--seq_window_lengths 8 12 \
--smi_window_lengths 4 8 \
--batch_size 256 \
--num_epoch 100 \
--max_seq_len 1000 \
--max_smi_len 100 \
--dataset_path 'data/kiba/' \
--problem_type 1 \
--log_dir 'logs/'
```
**For citation:**
```
@article{ozturk2018deepdta,
title={DeepDTA: deep drug--target binding affinity prediction},
author={{\"O}zt{\"u}rk, Hakime and {\"O}zg{\"u}r, Arzucan and Ozkirimli, Elif},
journal={Bioinformatics},
volume={34},
number={17},
pages={i821--i829},
year={2018},
publisher={Oxford University Press}
}
```
import argparse
import os
def argparser():
    """Build and parse the command-line flags for DeepDTA experiments.

    Unknown arguments are ignored (``parse_known_args``), so this is safe to
    call from wrappers that add their own flags.

    Returns:
        argparse.Namespace holding the parsed (or default) flag values.
    """
    parser = argparse.ArgumentParser()
    # --- model hyper-parameters ---
    parser.add_argument(
        '--seq_window_lengths',
        type=int,
        nargs='+',
        # Fixed typo ("seperated") and the example flag name, which
        # previously referenced a non-existent --window_lengths.
        help='Space separated list of motif filter lengths. (ex, --seq_window_lengths 4 8 12)'
    )
    parser.add_argument(
        '--smi_window_lengths',
        type=int,
        nargs='+',
        help='Space separated list of motif filter lengths. (ex, --smi_window_lengths 4 8)'
    )
    parser.add_argument(
        '--num_windows',
        type=int,
        nargs='+',
        help='Space separated list of the number of motif filters corresponding to length list. (ex, --num_windows 100 200 100)'
    )
    parser.add_argument(
        '--num_hidden',
        type=int,
        default=0,
        help='Number of neurons in hidden layer.'
    )
    parser.add_argument(
        '--num_classes',
        type=int,
        default=0,
        help='Number of classes (families).'
    )
    parser.add_argument(
        '--max_seq_len',
        type=int,
        default=0,
        help='Maximum length of input protein sequences.'
    )
    parser.add_argument(
        '--max_smi_len',
        type=int,
        default=0,
        # Was a copy-paste of --max_seq_len's help text.
        help='Maximum length of input SMILES strings.'
    )
    # --- training hyper-parameters ---
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.001,
        help='Initial learning rate.'
    )
    parser.add_argument(
        '--num_epoch',
        type=int,
        default=100,
        help='Number of epochs to train.'
    )
    parser.add_argument(
        '--batch_size',
        type=int,
        default=256,
        help='Batch size. Must divide evenly into the dataset sizes.'
    )
    parser.add_argument(
        '--dataset_path',
        type=str,
        default='/data/kiba/',
        help='Directory for input data.'
    )
    parser.add_argument(
        '--problem_type',
        type=int,
        default=1,
        help='Type of the prediction problem (1-4)'
    )
    parser.add_argument(
        '--binary_th',
        type=float,
        default=0.0,
        help='Threshold to split data into binary classes'
    )
    parser.add_argument(
        '--is_log',
        type=int,
        default=0,
        help='Use log transformation for Y.'
    )
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default='',
        help='Path to write checkpoint file.'
    )
    parser.add_argument(
        '--log_dir',
        type=str,
        default='/tmp',
        help='Directory for log data.'
    )
    parser.add_argument(
        '--out',
        type=str,
        default='/pred',
        # Was a copy-paste of --log_dir's help text.
        help='Directory for prediction output.'
    )
    parser.add_argument(
        '--model',
        type=str,
        default='/model',
        # Was a copy-paste of --log_dir's help text.
        help='Directory for saved models.'
    )
    FLAGS, unparsed = parser.parse_known_args()
    # check validity
    #assert( len(FLAGS.window_lengths) == len(FLAGS.num_windows) )
    return FLAGS
def logging(msg, FLAGS):
    """Append one message line to ``log.txt`` inside ``FLAGS.log_dir``.

    NOTE: this shadows the stdlib ``logging`` module name; callers in this
    project rely on the name, so it is kept as-is.
    """
    log_file = os.path.join(FLAGS.log_dir, "log.txt")
    with open(log_file, "a") as handle:
        handle.write("%s\n" % msg)
    #print(msg)
import sys, re, math, time
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
import collections
from collections import OrderedDict
from matplotlib.pyplot import cm
from spacy import load
#from keras.preprocessing.sequence import pad_sequences
## ######################## ##
#
# Define CHARSET, CHARLEN
#
## ######################## ##
# CHARPROTSET = { 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, \
# 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, \
# 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, 'X': 20, \
# 'O': 20, 'U': 20,
# 'B': (2, 11),
# 'Z': (3, 13),
# 'J': (7, 9) }
# CHARPROTLEN = 21
# Integer codes for the 25 protein sequence letters (1-based; 0 is reserved
# for padding). Includes the ambiguity codes B/Z/J/X/O/U as distinct symbols.
CHARPROTSET = { "A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6,
"F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
"O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18,
"U": 19, "T": 20, "W": 21,
"V": 22, "Y": 23, "X": 24,
"Z": 25 }
# Size of the protein alphabet above.
CHARPROTLEN = 25
# Integer codes for characters of *canonical* SMILES strings (1-based;
# 0 is reserved for padding).
CHARCANSMISET = { "#": 1, "%": 2, ")": 3, "(": 4, "+": 5, "-": 6,
".": 7, "1": 8, "0": 9, "3": 10, "2": 11, "5": 12,
"4": 13, "7": 14, "6": 15, "9": 16, "8": 17, "=": 18,
"A": 19, "C": 20, "B": 21, "E": 22, "D": 23, "G": 24,
"F": 25, "I": 26, "H": 27, "K": 28, "M": 29, "L": 30,
"O": 31, "N": 32, "P": 33, "S": 34, "R": 35, "U": 36,
"T": 37, "W": 38, "V": 39, "Y": 40, "[": 41, "Z": 42,
"]": 43, "_": 44, "a": 45, "c": 46, "b": 47, "e": 48,
"d": 49, "g": 50, "f": 51, "i": 52, "h": 53, "m": 54,
"l": 55, "o": 56, "n": 57, "s": 58, "r": 59, "u": 60,
"t": 61, "y": 62}
# Size of the canonical SMILES alphabet above.
CHARCANSMILEN = 62
# Integer codes for characters of *isomeric* SMILES strings, which may also
# contain stereo/charge markers ("/", "\\", "@"). 1-based; 0 is padding.
CHARISOSMISET = {"#": 29, "%": 30, ")": 31, "(": 1, "+": 32, "-": 33, "/": 34, ".": 2,
"1": 35, "0": 3, "3": 36, "2": 4, "5": 37, "4": 5, "7": 38, "6": 6,
"9": 39, "8": 7, "=": 40, "A": 41, "@": 8, "C": 42, "B": 9, "E": 43,
"D": 10, "G": 44, "F": 11, "I": 45, "H": 12, "K": 46, "M": 47, "L": 13,
"O": 48, "N": 14, "P": 15, "S": 49, "R": 16, "U": 50, "T": 17, "W": 51,
"V": 18, "Y": 52, "[": 53, "Z": 19, "]": 54, "\\": 20, "a": 55, "c": 56,
"b": 21, "e": 57, "d": 22, "g": 58, "f": 23, "i": 59, "h": 24, "m": 60,
"l": 25, "o": 61, "n": 26, "s": 62, "r": 27, "u": 63, "t": 28, "y": 64}
# Size of the isomeric SMILES alphabet above.
CHARISOSMILEN = 64
## ######################## ##
#
# Encoding Helpers
#
## ######################## ##
# Y = -(np.log10(Y/(math.pow(math.e,9))))
def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SMILES string.

    Returns a (MAX_SMI_LEN, alphabet_size) array; row i has a single 1 in
    the column for character i (codes in smi_ch_ind are 1-based, hence -1).
    Characters past MAX_SMI_LEN are dropped; short inputs leave zero rows.
    """
    encoded = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))
    for pos, symbol in enumerate(line[:MAX_SMI_LEN]):
        encoded[pos, smi_ch_ind[symbol] - 1] = 1
    return encoded
def one_hot_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """One-hot encode a protein sequence.

    Returns a (MAX_SEQ_LEN, alphabet_size) array; row i has a single 1 in
    the column for residue i (codes in smi_ch_ind are 1-based, hence -1).
    Residues past MAX_SEQ_LEN are dropped; short inputs leave zero rows.
    """
    encoded = np.zeros((MAX_SEQ_LEN, len(smi_ch_ind)))
    for pos, residue in enumerate(line[:MAX_SEQ_LEN]):
        encoded[pos, smi_ch_ind[residue] - 1] = 1
    return encoded
def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Integer-encode a SMILES string.

    Returns a length-MAX_SMI_LEN vector of character codes from smi_ch_ind;
    unused trailing positions stay 0 (the padding code). Characters past
    MAX_SMI_LEN are dropped.
    """
    codes = np.zeros(MAX_SMI_LEN)
    for pos, symbol in enumerate(line[:MAX_SMI_LEN]):
        codes[pos] = smi_ch_ind[symbol]
    return codes
def label_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """Integer-encode a protein sequence.

    Returns a length-MAX_SEQ_LEN vector of residue codes from smi_ch_ind;
    unused trailing positions stay 0 (the padding code). Residues past
    MAX_SEQ_LEN are dropped.
    """
    codes = np.zeros(MAX_SEQ_LEN)
    for pos, residue in enumerate(line[:MAX_SEQ_LEN]):
        codes[pos] = smi_ch_ind[residue]
    return codes
## ######################## ##
#
# DATASET Class
#
## ######################## ##
# works for large dataset
class DataSet(object):
    """Loader for the DeepDTA benchmark datasets (kiba/davis folder layout).

    Holds the encoding alphabets and maximum lengths; ``read_sets`` returns
    the cross-validation fold indices and ``parse_data`` returns the encoded
    drugs, proteins and affinity matrix.
    """

    def __init__(self, fpath, setting_no, seqlen, smilen, need_shuffle=False):
        # Maximum encoded lengths for protein sequences / SMILES strings.
        self.SEQLEN = seqlen
        self.SMILEN = smilen
        #self.NCLASSES = n_classes
        self.charseqset = CHARPROTSET
        self.charseqset_size = CHARPROTLEN
        self.charsmiset = CHARISOSMISET  ###HERE CAN BE EDITED
        self.charsmiset_size = CHARISOSMILEN
        self.PROBLEMSET = setting_no
        # read raw file
        # self._raw = self.read_sets( FLAGS)
        # iteration flags
        # self._num_data = len(self._raw)

    def read_sets(self, FLAGS):  ### fpath should be the dataset folder /kiba/ or /davis/
        """Return (test_fold, train_folds) index lists for the CV setting."""
        fpath = FLAGS.dataset_path
        setting_no = FLAGS.problem_type
        print("Reading %s start" % fpath)
        # Use context managers so the fold files are closed deterministically
        # (the original passed open(...) straight to json.load and leaked
        # the file objects).
        with open(fpath + "folds/test_fold_setting" + str(setting_no) + ".txt") as f:
            test_fold = json.load(f)
        with open(fpath + "folds/train_fold_setting" + str(setting_no) + ".txt") as f:
            train_folds = json.load(f)
        return test_fold, train_folds

    def parse_data(self, FLAGS, with_label=True):
        """Load and encode the dataset.

        Returns:
            XD: list of encoded SMILES (label vectors or one-hot matrices).
            XT: list of encoded protein sequences (same encoding choice).
            Y:  affinity matrix (optionally log-transformed).
        """
        fpath = FLAGS.dataset_path
        print("Read %s start" % fpath)
        with open(fpath + "ligands_can.txt") as f:
            ligands = json.load(f, object_pairs_hook=OrderedDict)
        with open(fpath + "proteins.txt") as f:
            proteins = json.load(f, object_pairs_hook=OrderedDict)
        with open(fpath + "Y", "rb") as f:
            Y = pickle.load(f, encoding='latin1')  ### TODO: read from raw
        if FLAGS.is_log:
            # Convert nM affinities to a pKd-style log scale.
            Y = -(np.log10(Y / (math.pow(10, 9))))
        XD = []
        XT = []
        if with_label:
            # Integer label encoding (for embedding layers).
            for d in ligands.keys():
                XD.append(label_smiles(ligands[d], self.SMILEN, self.charsmiset))
            for t in proteins.keys():
                XT.append(label_sequence(proteins[t], self.SEQLEN, self.charseqset))
        else:
            # One-hot encoding.
            for d in ligands.keys():
                XD.append(one_hot_smiles(ligands[d], self.SMILEN, self.charsmiset))
            for t in proteins.keys():
                XT.append(one_hot_sequence(proteins[t], self.SEQLEN, self.charseqset))
        return XD, XT, Y
class DataSet_for_new(object):
    """Variant of DataSet that reads plain-text inputs (one record per line)
    instead of the original JSON/pickle benchmark layout.

    NOTE(review): unlike DataSet, ``parse_data`` indexes FLAGS.dataset_path
    with "ligand", "protein" and "y" keys, so it is expected to be a mapping
    of file paths rather than a directory string -- confirm against callers.
    """

    def __init__(self, fpath, setting_no, seqlen, smilen, need_shuffle=False):
        # Maximum encoded lengths for protein sequences / SMILES strings.
        self.SEQLEN = seqlen
        self.SMILEN = smilen
        #self.NCLASSES = n_classes
        self.charseqset = CHARPROTSET
        self.charseqset_size = CHARPROTLEN
        self.charsmiset = CHARISOSMISET  ###HERE CAN BE EDITED
        self.charsmiset_size = CHARISOSMILEN
        self.PROBLEMSET = setting_no
        # read raw file
        # self._raw = self.read_sets( FLAGS)
        # iteration flags
        # self._num_data = len(self._raw)

    def read_sets(self, FLAGS):  ### fpath should be the dataset folder /kiba/ or /davis/
        """Return (test_fold, train_folds) index lists for the CV setting."""
        fpath = FLAGS.dataset_path
        setting_no = FLAGS.problem_type
        print("Reading %s start" % fpath)
        # Context managers close the fold files deterministically (the
        # original leaked the file objects opened inline).
        with open(fpath + "folds/test_fold_setting" + str(setting_no) + ".txt") as f:
            test_fold = json.load(f)
        with open(fpath + "folds/train_fold_setting" + str(setting_no) + ".txt") as f:
            train_folds = json.load(f)
        return test_fold, train_folds

    def parse_data(self, FLAGS, with_label=True):
        """Load and encode the dataset from plain-text files.

        Returns:
            XD: list of integer-encoded SMILES vectors.
            XT: list of integer-encoded protein sequence vectors.
            Y:  list of label strings, one per line of the "y" file.
        """
        fpath = FLAGS.dataset_path
        print("Read %s start" % fpath)

        def load_file(file):
            # One stripped line per record.
            with open(file, 'r') as f:
                return [line.strip() for line in f]

        ligands = load_file(fpath["ligand"])
        proteins = load_file(fpath["protein"])
        # Changed data input: labels now come from a plain text file instead
        # of the pickled affinity matrix.
        Y = load_file(fpath["y"])
        XD = []
        XT = []
        if with_label:
            for d in ligands:
                XD.append(label_smiles(d, self.SMILEN, self.charsmiset))
            for t in proteins:
                XT.append(label_sequence(t, self.SEQLEN, self.charseqset))
        # One-hot branch of the original DataSet is intentionally not
        # supported here; with_label=False returns empty XD/XT.
        return XD, XT, Y
import numpy as np
def get_aupr(Y, P):
    """Compute area under the precision-recall curve via the external auc.jar.

    Y: true values (array or scipy sparse matrix); entries > 0 are positives.
    P: predicted scores, same shape as Y.
    Returns the AUPR parsed from auc.jar's output.
    NOTE: requires java and auc.jar in the working directory and writes
    temp.txt / foo.txt there as scratch files.
    """
    # Local import: the original called subprocess without ever importing it,
    # which raised NameError at call time.
    import subprocess
    if hasattr(Y, 'A'):  # densify scipy sparse matrices
        Y = Y.A
    if hasattr(P, 'A'):
        P = P.A
    Y = np.where(Y > 0, 1, 0).ravel()
    P = P.ravel()
    # with-blocks ensure the scratch files are flushed/closed even on error.
    with open("temp.txt", 'w') as f:
        for score, label in zip(P, Y):
            f.write("%f %d\n" % (score, label))
    with open("foo.txt", 'w') as f:
        subprocess.call(["java", "-jar", "auc.jar", "temp.txt", "list"], stdout=f)
    with open("foo.txt") as f:
        lines = f.readlines()
    # auc.jar prints the AUPR as the last token of its second-to-last line.
    return float(lines[-2].split()[-1])
def get_cindex(Y, P):
    """Concordance index of predictions P against true affinities Y.

    For every pair with Y[i] > Y[j], scores 1 if P agrees (P[i] > P[j]),
    0.5 on a tie, 0 otherwise; returns the mean over such pairs, or 0 if
    no comparable pair exists.
    """
    summ = 0
    pair = 0
    for i in range(1, len(Y)):
        # j < i, so i == j is impossible; the original's `i is not j` guard
        # (identity comparison on ints -- a bug pattern) was redundant.
        for j in range(i):
            if Y[i] > Y[j]:
                pair += 1
                summ += 1 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])
    # `!=`, not `is not`: identity comparison on small ints only works by
    # accident of CPython caching and warns on >=3.8.
    if pair != 0:
        return summ / pair
    else:
        return 0
def r_squared_error(y_obs, y_pred):
    """Squared Pearson correlation (r^2) between observed and predicted values.

    Identical math to the original, but uses scalar means directly instead of
    building a full list of repeated means per element (an O(n) allocation
    that NumPy broadcasting makes unnecessary).
    """
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    y_obs_mean = np.mean(y_obs)
    y_pred_mean = np.mean(y_pred)
    # Numerator: squared covariance-like cross term.
    mult = sum((y_pred - y_pred_mean) * (y_obs - y_obs_mean))
    mult = mult * mult
    # Denominator: product of the two sums of squared deviations.
    y_obs_sq = sum((y_obs - y_obs_mean) * (y_obs - y_obs_mean))
    y_pred_sq = sum((y_pred - y_pred_mean) * (y_pred - y_pred_mean))
    return mult / float(y_obs_sq * y_pred_sq)
def get_k(y_obs, y_pred):
    """Slope of the least-squares regression of y_obs on y_pred through the origin."""
    obs = np.array(y_obs)
    pred = np.array(y_pred)
    return sum(obs * pred) / float(sum(pred * pred))
def squared_error_zero(y_obs, y_pred):
    """r0^2: coefficient of determination for the through-origin fit k*y_pred.

    Identical math to the original, but uses the scalar mean directly instead
    of building a list of repeated means per element.
    """
    k = get_k(y_obs, y_pred)
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    y_obs_mean = np.mean(y_obs)
    # Residual sum of squares around the through-origin fit.
    upp = sum((y_obs - (k * y_pred)) * (y_obs - (k * y_pred)))
    # Total sum of squares around the observed mean.
    down = sum((y_obs - y_obs_mean) * (y_obs - y_obs_mean))
    return 1 - (upp / float(down))
def get_rm2(ys_orig, ys_line):
    """Modified squared correlation metric rm^2 = r^2 * (1 - sqrt(|r^2^2 - r0^2^2|))."""
    r2 = r_squared_error(ys_orig, ys_line)
    r02 = squared_error_zero(ys_orig, ys_line)
    penalty = np.sqrt(np.absolute((r2 * r2) - (r02 * r02)))
    return r2 * (1 - penalty)
\ No newline at end of file
python run_experiments.py --num_windows 32 \
--seq_window_lengths 8 12 \
--smi_window_lengths 4 8 \
--batch_size 256 \
--num_epoch 100 \
--max_seq_len 1000 \
--max_smi_len 100 \
--dataset_path 'data/kiba/' \
--problem_type 1 \
--is_log 0 \
--log_dir 'logs/'
RMSE : 1.485262829572667 ; Pearson Correlation Coefficient : 0.17810684496134926
\ No newline at end of file
RMSE : 1.4034081434737957 ; Pearson Correlation Coefficient : 0.2416971016625298
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.4512754880382956 ; Pearson Correlation Coefficient : 0.11439684637165645
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 1.2566115226505494 ; Pearson Correlation Coefficient : 0.3483553292295794
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
RMSE : 0.9898476924424743 ; Pearson Correlation Coefficient : 0.7531926430165059
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
python run_baseline.py
\ No newline at end of file
MolTrans @ 47ac16b8
Subproject commit 47ac16b8c158b080ba6cdaec74cd7aa9c1332b73
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论