Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
F
FingerDTA
概览
概览
详情
活动
周期分析
版本库
存储库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
mszjaas
FingerDTA
Commits
5788ff27
提交
5788ff27
authored
12月 19, 2020
作者:
mszjaas
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
FingerDTA template
上级
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
429 行增加
和
0 行删除
+429
-0
data.py
data.py
+63
-0
fingerDTA.py
fingerDTA.py
+115
-0
generate_fingerprint.py
generate_fingerprint.py
+89
-0
train_and_evaluate.py
train_and_evaluate.py
+162
-0
没有找到文件。
data.py
0 → 100644
浏览文件 @
5788ff27
import pickle
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Five-fold train / fixed test index splits, reusing the fold files published
# by DeepDTA.  Each entry indexes into the flattened list of measured
# drug/protein pairs built below (label_row_inds / label_col_inds).
test_fold = json.load(open("test_fold_setting1.txt"))  # from DeepDTA
train_folds = json.load(open("train_fold_setting1.txt"))  # from DeepDTA

# Protein store: 'seq' holds the encoded amino-acid sequences (consumed as
# numpy arrays by torch.from_numpy in Datas.__getitem__).
with open(r'KIBA_protein.pickle', 'rb') as f:
    store = pickle.load(f)
seqs = store['seq']

# Ligand store: encoded SMILES plus precomputed molecular fingerprints.
with open(r'KIBA_ligand.pickle', 'rb') as f:
    store = pickle.load(f)
drugs = store['smiles']
drug_fps = store['fingerprint']

# Protein fingerprints produced by generate_fingerprint.py.
prot_fps = np.load('KIBA_fingerprint.npy')

# Drug x protein affinity matrix; NaN marks unmeasured pairs.
# NOTE(review): the file name says 'davis' while every other input here is
# KIBA — confirm this is the intended relation matrix.
with open(r'davis_relation.pickle', 'rb') as f:
    relationship = pickle.load(f)

# Row/column indices of every measured (non-NaN) pair; the fold files above
# index into these parallel arrays.
label_row_inds, label_col_inds = np.where(np.isnan(relationship) == False)
class Datas(Dataset):
    """Dataset over measured drug/protein pairs for one cross-validation fold.

    ``index`` selects the fold (0..4) and ``data_type`` selects the split:
    'train' (all training folds except ``index``), 'valid' (fold ``index``
    itself) or 'test' (the fixed DeepDTA test fold).

    Each item is moved to the GPU eagerly, so a CUDA device is required.
    """

    def __init__(self, index, data_type):
        selected = []
        if data_type == 'train':
            # Every training fold except the held-out validation fold.
            for fold_id in range(5):
                if fold_id != index:
                    selected.extend(train_folds[fold_id])
        elif data_type == 'valid':
            selected.extend(train_folds[index])
        elif data_type == 'test':
            selected = test_fold
        self.indexes = selected

    def __getitem__(self, index):
        # Resolve the flat pair index into its drug row / protein column.
        pair = self.indexes[index]
        drug_i = label_row_inds[pair]
        protein_i = label_col_inds[pair]
        drug = torch.from_numpy(drugs[drug_i]).float().cuda()
        drug_fp = torch.from_numpy(drug_fps[drug_i]).float().cuda()
        protein = torch.from_numpy(seqs[protein_i]).float().cuda()
        prot_fp = torch.from_numpy(prot_fps[protein_i]).float().cuda()
        affinity = torch.tensor(relationship[drug_i][protein_i]).float().cuda()
        return drug, drug_fp, protein, prot_fp, affinity

    def __len__(self):
        return len(self.indexes)
# five fold: one entry per cross-validation fold, each mapping a split name
# ('train' / 'test' / 'valid') to its DataLoader.
datas = []
for fold in range(5):
    loaders = {
        split: DataLoader(Datas(fold, split), batch_size=128, shuffle=True)
        for split in ('train', 'test', 'valid')
    }
    datas.append(loaders)
\ No newline at end of file
fingerDTA.py
0 → 100644
浏览文件 @
5788ff27
import
torch
from
torch
import
nn
# Dense Convolutional Block
class ConvBlock(nn.Module):
    """Densely connected 1-D convolution block.

    Four convolutions with kernel sizes 1/3/5/7; each branch sees the input
    concatenated with every previous branch output.  The four branch outputs
    (each ``length_out // 4`` channels) are concatenated along the channel
    dimension and passed through ReLU, mapping ``length_in`` channels to
    ``length_out`` channels overall while preserving sequence length.
    """

    def __init__(self, length_in, length_out):
        super(ConvBlock, self).__init__()
        branch = length_out // 4  # each of the 4 branches emits a quarter
        self.x1 = nn.Conv1d(length_in, branch, kernel_size=1)
        self.x2 = nn.Conv1d(length_in + branch, branch, kernel_size=3, padding=1)
        self.x3 = nn.Conv1d(length_in + 2 * branch, branch, kernel_size=5, padding=2)
        self.x4 = nn.Conv1d(length_in + 3 * branch, branch, kernel_size=7, padding=3)

    def forward(self, data_in):
        # Dense connectivity: each branch consumes all earlier branch outputs
        # plus the raw input.
        x1 = self.x1(data_in)
        x2 = self.x2(torch.cat((x1, data_in), dim=1))
        x3 = self.x3(torch.cat((x2, x1, data_in), dim=1))
        x4 = self.x4(torch.cat((x3, x2, x1, data_in), dim=1))
        merged = torch.cat((x1, x2, x3, x4), dim=1)
        return nn.functional.relu(merged, inplace=False)
class CNN(nn.Module):
    """Three stacked ConvBlocks: ``type_num`` -> 128 -> 256 -> 96 channels.

    Sequence length is preserved throughout; only the channel count changes.
    """

    def __init__(self, type_num=64):
        super(CNN, self).__init__()
        self.x1 = ConvBlock(type_num, 128)
        self.x2 = ConvBlock(128, 256)
        self.x3 = ConvBlock(256, 96)

    def forward(self, data_in):
        # Feed the input through the three dense blocks in order.
        out = data_in
        for block in (self.x1, self.x2, self.x3):
            out = block(out)
        return out
class FC(nn.Module):
    """Linear layer with optional dropout, finished by a leaky-ReLU.

    ``dropout=True`` (default) inserts an ``nn.Dropout()`` between the linear
    layer and the activation; the flag is fixed at construction time.
    """

    def __init__(self, dim_in, dim_out, dropout=True):
        super(FC, self).__init__()
        self.x1 = nn.Linear(dim_in, dim_out)
        self.x2 = torch.nn.Dropout()
        self.dropout = dropout

    def forward(self, x):
        out = self.x1(x)
        if self.dropout:
            out = self.x2(out)
        return nn.functional.leaky_relu(out, inplace=False)
class fp_FC(nn.Module):
    """Two-layer MLP for fingerprint vectors: dim_in -> 512 -> dim_out.

    Dropout follows the first linear layer; no activation is applied
    (activations are intentionally left out, matching the original design).
    """

    def __init__(self, dim_in, dim_out):
        super(fp_FC, self).__init__()
        self.x1 = nn.Linear(dim_in, 512)
        self.x2 = torch.nn.Dropout()
        self.x3 = nn.Linear(512, dim_out)

    def forward(self, x):
        hidden = self.x2(self.x1(x))
        return self.x3(hidden)
class FingerDTA(nn.Module):
    """Drug-target affinity model combining sequence CNNs with fingerprint MLPs.

    The encoded drug (64 channels) and protein (21 channels) each pass through
    a CNN; their 1024-bit fingerprints pass through small MLPs whose 96-dim
    outputs gate the CNN feature maps channel-wise.  The gated maps are
    global-max-pooled, concatenated, and regressed to a single affinity value.
    """

    def __init__(self):
        super(FingerDTA, self).__init__()
        self.drug_model = CNN(64)      # 64 SMILES symbol channels
        self.protein_model = CNN(21)   # 21 amino-acid channels
        self.fp_drug = fp_FC(1024, 96)
        self.fp_protein = fp_FC(1024, 96)
        self.fc1 = FC(192, 1024)
        self.fc2 = FC(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 1)

    def forward(self, drug, drug_fp, protein, prot_fp):
        drug_feat = self.drug_model(drug)
        drug_gate = self.fp_drug(drug_fp)
        protein_feat = self.protein_model(protein)
        protein_gate = self.fp_protein(prot_fp)
        # Embed the fingerprint into the convolutional output: broadcast the
        # 96-dim fingerprint embedding across the sequence axis as a
        # channel-wise gate.
        drug_out = drug_gate.unsqueeze(2) * drug_feat
        protein_out = protein_gate.unsqueeze(2) * protein_feat
        # Global max pooling over the sequence axis -> (batch, 96) each.
        drug_out = nn.functional.adaptive_max_pool1d(drug_out, output_size=1).squeeze(2)
        protein_out = nn.functional.adaptive_max_pool1d(protein_out, output_size=1).squeeze(2)
        merged = torch.cat((drug_out, protein_out), dim=1)
        # Fully connected regression head.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            merged = layer(merged)
        return merged
\ No newline at end of file
generate_fingerprint.py
0 → 100644
浏览文件 @
5788ff27
import
numpy
as
np
from
gensim.models
import
word2vec
from
sklearn.cluster
import
AgglomerativeClustering
def generate_slices(path, slice_len=5):
    """Decode one-hot protein matrices into overlapping slice-code strings.

    Parameters
    ----------
    path : str or numpy.ndarray
        Path of an ``.npy`` file holding the one-hot sequences, or
        (generalization, backward compatible) the already-loaded array
        itself.  Expected layout: (protein, position, 21), where an all-zero
        position row marks the padded end of a sequence.
    slice_len : int
        Number of consecutive residues folded into one slice code.

    Returns
    -------
    list[list[str]]
        For each protein, the decimal-string encoding of every
        length-``slice_len`` window, reading the residue indices as
        base-21 digits.
    """
    seqs = np.load(path) if isinstance(path, str) else np.asarray(path)

    # (1) one-hot rows -> residue index lists, stopping at the first
    # all-zero row (padding).
    aas = []  # amino acids, one index list per protein
    for mat in seqs:
        aa = []
        for row in mat:
            hit = np.flatnonzero(row == 1)
            if hit.size == 0:
                break  # padding reached: the rest of the matrix is empty
            aa.append(int(hit[0]))
        aas.append(aa)

    # (2) slide a window of slice_len residues and encode it in base 21.
    proteins = []
    total_slice_num = 0
    for aa in aas:
        protein = []
        for start in range(len(aa) - slice_len + 1):
            code = 0
            for residue in aa[start:start + slice_len]:
                # 21 kinds of amino acid -- base 21 to base 10
                code = code * 21 + residue
            protein.append(str(code))
            total_slice_num += 1
        proteins.append(protein)

    print("totally {} slices".format(total_slice_num))
    return proteins
#########################################
# (1) generate slice
############################################
slice_len = 5
all_slices = generate_slices('all_seq.npy', slice_len)
KIBA_slices = generate_slices('KIBA_seq.npy', slice_len)

#########################################
# (2) generate vector for each slice (word2vec)
############################################
slice_window = 10 - slice_len
# BUG FIX: the original passed callbacks=[monitor()] but `monitor` was never
# defined anywhere (NameError at startup); training runs without callbacks.
model = word2vec.Word2Vec(all_slices, sg=0, size=64, window=slice_window,
                          min_count=3, negative=3, sample=0.001, hs=1,
                          workers=4, batch_words=10, iter=10000, alpha=0.0001)
all_words = model.wv.index2word
print("totally {} words".format(len(all_words)))
model.save("word2vec.model")

#########################################
# (3) cluster into 1024 classes of slices
############################################
# BUG FIX: KIBA_slices is a list of per-protein slice *lists*; the original
# appended model[<list>] (a 2-D array per protein, leaving a ragged list that
# AgglomerativeClustering cannot fit) and later used the lists themselves as
# dict keys (TypeError: unhashable).  Work on the flattened slice sequence,
# skipping slices dropped from the vocabulary by min_count.
flat_slices = [s for protein in KIBA_slices for s in protein if s in model.wv]
KIBA_vector = [model[s] for s in flat_slices]
ac = AgglomerativeClustering(n_clusters=1024)
cls = ac.fit_predict(np.array(KIBA_vector))
print(cls)
# class for all slices in all protein

##########################################
# (4) map all slice to 1024 class
############################################
# Later occurrences of a repeated slice overwrite earlier ones, exactly as in
# the original mapping loop.
slice_dic = {}
for i, s in enumerate(flat_slices):
    slice_dic[s] = cls[i]

##########################################
# (5) generate onehot encoding
############################################
# (duplicate mid-file `import numpy as np` removed; numpy is imported at the
# top of the file)
protein_onehot = np.zeros((442, 1024))  # 442 proteins in the KIBA set
for i, protein in enumerate(KIBA_slices):
    for s in protein:
        if s in slice_dic:
            protein_onehot[i][slice_dic[s]] = 1
np.save('KIBA_fingerprint.npy', protein_onehot)
\ No newline at end of file
train_and_evaluate.py
0 → 100644
浏览文件 @
5788ff27
import
numpy
as
np
import
os
import
torch
from
torch
import
nn
from
tqdm
import
tqdm
from
data
import
datas
from
fingerDTA
import
FingerDTA
def CI(P, Y):
    """Concordance index of predictions ``P`` against ground truth ``Y``.

    Over every pair with Y[i] > Y[j], score 1 for a concordant prediction
    (P[i] > P[j]) and 0.5 for a tied one (P[i] == P[j]); return the average
    over all comparable pairs, or 0 when no such pair exists.
    """
    pair = 0
    summ = 0
    for i in range(1, len(Y)):
        for j in range(i):
            if Y[i] > Y[j]:
                pair += 1
                summ += 1 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])
    return summ / pair if pair != 0 else 0
def r_squared_error(y_obs, y_pred):
    """Squared Pearson correlation coefficient (r^2) of y_obs vs y_pred."""
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    obs_dev = y_obs - np.mean(y_obs)
    pred_dev = y_pred - np.mean(y_pred)
    covariance = sum(pred_dev * obs_dev)
    # r^2 = cov^2 / (var_obs * var_pred), with the common 1/n factors cancelled.
    return (covariance * covariance) / (sum(obs_dev * obs_dev) * sum(pred_dev * pred_dev))
def get_k(y_obs, y_pred):
    """Slope of the least-squares regression through the origin: y_obs ~ k * y_pred."""
    obs = np.array(y_obs)
    pred = np.array(y_pred)
    return np.dot(obs, pred) / np.dot(pred, pred)
def squared_error_zero(y_obs, y_pred):
    """r0^2: coefficient of determination of the zero-intercept fit y_obs = k * y_pred."""
    k = get_k(y_obs, y_pred)
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    residual = y_obs - k * y_pred
    upp = sum(residual * residual)          # residual sum of squares of the k-fit
    centered = y_obs - np.mean(y_obs)
    down = sum(centered * centered)         # total sum of squares
    return 1 - (upp / down)
def get_rm2(ys_line, ys_orig):
    """rm^2 metric: r2 * (1 - sqrt(|r2^2 - r0^2|)).

    Combines the ordinary r^2 and the zero-intercept r0^2 of the predicted
    values against the observations.
    """
    r2 = r_squared_error(ys_orig, ys_line)
    r02 = squared_error_zero(ys_orig, ys_line)
    gap = np.sqrt(np.absolute((r2 * r2) - (r02 * r02)))
    return r2 * (1 - gap)
def evaluate_final(model):
    # Final evaluation on the 'test' split: prints mean MSE, concordance
    # index (CI) and rm^2 over the whole test DataLoader.
    # Relies on module globals: `data` (this fold's DataLoaders) and `loss`
    # (the CUDA MSELoss); requires a CUDA device via the dataset/model.
    batch = 0
    loss_value = 0
    P = []  # per-batch prediction arrays
    Y = []  # per-batch ground-truth affinity arrays
    model.eval()
    for drug, drug_fp, protein, prot_fp, affinity in data['test']:
        batch += 1
        # DataLoader yields (batch, length, channels); Conv1d expects
        # (batch, channels, length).
        drug = drug.permute(0, 2, 1)
        protein = protein.permute(0, 2, 1)
        judge = model(drug, drug_fp, protein, prot_fp)
        P.append(judge.squeeze(1).detach().cpu().numpy())
        Y.append(affinity.detach().cpu().numpy())
        loss_value += loss(judge, affinity.unsqueeze(1)).detach().cpu()
    # Flatten per-batch arrays into one vector each for the ranking metrics.
    P = np.concatenate((P), axis=0)
    Y = np.concatenate((Y), axis=0)
    CI_index = CI(P, Y)
    rm2_index = get_rm2(P, Y)
    print("MSE", loss_value / batch, "\n")
    print("CI_index", CI_index, "\n")
    print("rm2_index", rm2_index, "\n")
def evaluate(model, epoch):
    # Validation pass for one epoch: computes mean MSE on the 'valid' split,
    # appends it to the run log, and checkpoints the model whenever the loss
    # improves on the global best (`Losssss`).
    # Relies on module globals: `data`, `loss`, `log_name`, `state_name`,
    # `Losssss` (mutated here).
    global Losssss
    batch = 0
    loss_value = 0
    model.eval()
    for drug, drug_fp, protein, prot_fp, affinity in data['valid']:
        batch += 1
        # (batch, length, channels) -> (batch, channels, length) for Conv1d.
        drug = drug.permute(0, 2, 1)
        protein = protein.permute(0, 2, 1)
        judge = model(drug, drug_fp, protein, prot_fp)
        loss_value += loss(judge, affinity.unsqueeze(1)).detach().cpu()
    # Append this epoch's mean validation loss to <log_name>.log in the CWD.
    with open(os.path.join(os.path.abspath(os.curdir), log_name + '.log'), 'a') as f:
        f.write("epoch " + str(epoch) + ": " + str(loss_value / batch) + '\n')
    print("MSE", loss_value / batch, "\n")
    if loss_value / batch < Losssss:
        # New best validation loss: remember it and save a checkpoint.
        Losssss = loss_value / batch
        save_model(model, os.path.join(os.path.abspath(os.curdir), state_name + '.state'))
def train(model, optimizer):
    # Train for 300 epochs on the 'train' split, running a validation pass
    # (which also checkpoints on improvement) after every epoch.
    # Relies on module globals: `data` (DataLoaders) and `loss` (MSELoss).
    global pre_auc
    progress = tqdm(range(300))
    pre_auc = -1  # NOTE(review): set but never read anywhere visible in this file.
    for epoch in progress:
        model.train()
        for batch, [drug, drug_fp, protein, prot_fp, affinity] in enumerate(data['train']):
            batch += 1  # 1-based batch number for the progress display only
            # (batch, length, channels) -> (batch, channels, length) for Conv1d.
            drug = drug.permute(0, 2, 1)
            protein = protein.permute(0, 2, 1)
            judge = model(drug, drug_fp, protein, prot_fp)
            loss_value = loss(judge, affinity.unsqueeze(1))
            progress.set_description('epoch: {} batch: {} loss: {}'.format(epoch, batch, loss_value))
            # Standard optimization step.
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
        evaluate(model, epoch)
def save_model(model, name):
    """Persist only the model's parameters (its state_dict) to `name`."""
    state = model.state_dict()
    torch.save(state, name)
def load_model(model, name):
    """Restore the model's parameters from the state_dict file at `name`."""
    state = torch.load(name)
    model.load_state_dict(state)
#####################
# train
########################
data_i = 0
# five fold: 0, 1, 2, 3, 4
data = datas[data_i]  # DataLoaders for the chosen fold (built in data.py)
Losssss = 9000000  # best validation loss so far; large sentinel for "infinity"
model_type = 'fingerdta'
log_name = 'fingerdta' + str(data_i)     # validation-loss log file stem
state_name = 'fingerdta' + str(data_i)   # checkpoint file stem
loss = nn.MSELoss().cuda()
model = FingerDTA().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train(model, optimizer)

#####################
# evaluate
########################
# Re-create the model and load the best checkpoint saved during training.
# NOTE(review): training saves to '<state_name>.state' in the CWD, but this
# loads from '<model_type>/<model_type><data_i>.state' (a subdirectory) —
# confirm the two paths are meant to agree or that files are moved between
# the phases.
model = FingerDTA().cuda()
load_model(model, os.path.join(os.path.abspath(os.curdir), model_type, '{}{}.state'.format(model_type, data_i)))
evaluate_final(model)
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论