提交 5788ff27 作者: mszjaas

FingerDTA template

上级
import pickle
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
# --- Load fold splits and feature stores (module level, runs on import) ---

# Standard DeepDTA fold index files (JSON lists of pair indices).
test_fold = json.load(open("test_fold_setting1.txt"))  # from DeepDTA
train_folds = json.load(open("train_fold_setting1.txt"))  # from DeepDTA

# Protein one-hot sequence matrices.
with open(r'KIBA_protein.pickle', 'rb') as f:
    store = pickle.load(f)
seqs = store['seq']

# Ligand SMILES encodings and ligand fingerprints.
with open(r'KIBA_ligand.pickle', 'rb') as f:
    store = pickle.load(f)
drugs = store['smiles']
drug_fps = store['fingerprint']

# Protein "fingerprints" produced by the word2vec/clustering script.
prot_fps = np.load('KIBA_fingerprint.npy')

# Affinity matrix (drug x protein); NaN marks unmeasured pairs.
# NOTE(review): file name says "davis" while every other input is KIBA —
# confirm this is the intended affinity matrix.
with open(r'davis_relation.pickle', 'rb') as f:
    relationship = pickle.load(f)

# Row/column indices of every measured (drug, protein) pair.
label_row_inds, label_col_inds = np.where(~np.isnan(relationship))
class Datas(Dataset):
    """Five-fold dataset over measured (drug, protein, affinity) pairs.

    ``index`` selects the validation fold (0..4); ``data_type`` is one of
    'train' (the other four folds), 'valid' (fold ``index``) or 'test'
    (the held-out DeepDTA test fold).
    """

    def __init__(self, index, data_type):
        if data_type == 'train':
            # Every fold except the validation one, in ascending fold order.
            indexes = [i for f in range(5) if f != index for i in train_folds[f]]
        elif data_type == 'valid':
            indexes = list(train_folds[index])
        elif data_type == 'test':
            indexes = test_fold
        else:
            indexes = []
        self.indexes = indexes

    def __getitem__(self, index):
        pair = self.indexes[index]
        d_idx = label_row_inds[pair]
        p_idx = label_col_inds[pair]
        # NOTE(review): .cuda() inside a Dataset forces single-process
        # loading — confirm the DataLoaders use num_workers=0.
        affinity = torch.tensor(relationship[d_idx][p_idx]).float().cuda()
        protein = torch.from_numpy(seqs[p_idx]).float().cuda()
        prot_fp = torch.from_numpy(prot_fps[p_idx]).float().cuda()
        drug = torch.from_numpy(drugs[d_idx]).float().cuda()
        drug_fp = torch.from_numpy(drug_fps[d_idx]).float().cuda()
        return drug, drug_fp, protein, prot_fp, affinity

    def __len__(self):
        return len(self.indexes)
# Build one {train, test, valid} DataLoader triple per fold (five-fold CV).
datas = []
for fold in range(5):
    datas.append({
        split: DataLoader(Datas(fold, split), batch_size=128, shuffle=True)
        for split in ('train', 'test', 'valid')
    })
\ No newline at end of file
import torch
from torch import nn
# Dense Convolutional Block
class ConvBlock(nn.Module):
    """Densely-connected 1-D conv block with kernel sizes 1/3/5/7.

    Each branch sees the original input concatenated with all earlier
    branch outputs; each branch emits ``length_out // 4`` channels, and
    the four branch outputs are concatenated and passed through ReLU,
    so sequence length is preserved.
    """

    def __init__(self, length_in, length_out):
        super(ConvBlock, self).__init__()
        branch = length_out // 4  # channels produced by each branch
        self.x1 = nn.Conv1d(length_in, branch, kernel_size=1)
        self.x2 = nn.Conv1d(length_in + branch, branch, kernel_size=3, padding=1)
        self.x3 = nn.Conv1d(length_in + 2 * branch, branch, kernel_size=5, padding=2)
        self.x4 = nn.Conv1d(length_in + 3 * branch, branch, kernel_size=7, padding=3)

    def forward(self, data_in):
        # Keep newest branch output first so each conv's input is
        # (newest, ..., oldest, data_in) — same order as the original.
        feats = [self.x1(data_in)]
        for conv in (self.x2, self.x3, self.x4):
            feats.insert(0, conv(torch.cat(feats + [data_in], dim=1)))
        # Final concatenation is (x1, x2, x3, x4).
        data_out = torch.cat(feats[::-1], dim=1)
        return nn.functional.relu(data_out, inplace=False)
class CNN(nn.Module):
    """Three stacked ConvBlocks: type_num -> 128 -> 256 -> 96 channels.

    ``type_num`` is the number of input channels (one-hot symbol types).
    Sequence length is unchanged by every block.
    """

    def __init__(self, type_num=64):
        super(CNN, self).__init__()
        self.x1 = ConvBlock(type_num, 128)
        self.x2 = ConvBlock(128, 256)
        self.x3 = ConvBlock(256, 96)

    def forward(self, data_in):
        out = data_in
        for block in (self.x1, self.x2, self.x3):
            out = block(out)
        return out
class FC(nn.Module):
    """Linear -> (optional Dropout) -> LeakyReLU.

    ``dropout=True`` enables the Dropout layer between the linear map
    and the activation (only active in training mode).
    """

    def __init__(self, dim_in, dim_out, dropout=True):
        super(FC, self).__init__()
        self.x1 = nn.Linear(dim_in, dim_out)
        self.x2 = torch.nn.Dropout()
        self.dropout = dropout  # whether forward applies self.x2

    def forward(self, x):
        out = self.x1(x)
        if self.dropout:
            out = self.x2(out)
        return nn.functional.leaky_relu(out, inplace=False)
class fp_FC(nn.Module):
    """Fingerprint encoder: Linear(dim_in, 512) -> Dropout -> Linear(512, dim_out).

    No final activation — the raw projection is returned.
    """

    def __init__(self, dim_in, dim_out):
        super(fp_FC, self).__init__()
        self.x1 = nn.Linear(dim_in, 512)
        self.x2 = torch.nn.Dropout()
        self.x3 = nn.Linear(512, dim_out)

    def forward(self, x):
        hidden = self.x2(self.x1(x))
        return self.x3(hidden)
class FingerDTA(nn.Module):
    """Drug-target affinity regressor.

    Inputs to ``forward``:
      drug:    (batch, 64, drug_len)  one-hot SMILES, channels first
      drug_fp: (batch, 1024)          ligand fingerprint vector
      protein: (batch, 21, prot_len)  one-hot sequence, channels first
      prot_fp: (batch, 1024)          protein fingerprint vector
    Output: (batch, 1) predicted affinity.
    """

    def __init__(self):
        super(FingerDTA, self).__init__()
        self.drug_model = CNN(64)      # 64 SMILES symbol types
        self.protein_model = CNN(21)   # 21 amino-acid types
        self.fp_drug = fp_FC(1024, 96)
        self.fp_protein = fp_FC(1024, 96)
        self.fc1 = FC(192, 1024)
        self.fc2 = FC(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 1)

    def forward(self, drug, drug_fp, protein, prot_fp):
        # Convolutional features: (batch, 96, length).
        drug_feat = self.drug_model(drug)
        prot_feat = self.protein_model(protein)
        # Fingerprint embeddings: (batch, 96).
        drug_gate = self.fp_drug(drug_fp)
        prot_gate = self.fp_protein(prot_fp)
        # Channel-wise gating of conv features by the fingerprint embedding,
        # then global max pooling over the length dimension.
        drug_vec = nn.functional.adaptive_max_pool1d(
            drug_gate.unsqueeze(2) * drug_feat, output_size=1).squeeze(2)
        prot_vec = nn.functional.adaptive_max_pool1d(
            prot_gate.unsqueeze(2) * prot_feat, output_size=1).squeeze(2)
        merged = torch.cat((drug_vec, prot_vec), dim=1)  # (batch, 192)
        # Regression head.
        out = self.fc2(self.fc1(merged))
        out = self.fc4(self.fc3(out))
        return out
\ No newline at end of file
import numpy as np
from gensim.models import word2vec
from sklearn.cluster import AgglomerativeClustering
def generate_slices(path, slice_len=5):
    """Decode one-hot protein sequences and cut them into overlapping slices.

    ``path`` points to an .npy array of shape (n_proteins, max_len, 21)
    where each residue row is one-hot over 21 amino-acid types and padding
    rows are all zero.  Every window of ``slice_len`` consecutive residues
    is encoded as a base-21 integer and returned as a string, so the
    result can feed word2vec directly as tokens.

    Returns a list with one list of slice strings per protein.
    """
    seqs = np.load(path)
    proteins = []
    total_slice_num = 0
    for mat in seqs:
        onehot = np.asarray(mat)
        # The first residue row containing no 1 marks the end of the
        # (zero-padded) sequence.
        blank = np.where((onehot == 1).sum(axis=1) == 0)[0]
        end = int(blank[0]) if blank.size else len(onehot)
        # Index of the (first) 1 in each real residue row.
        aa = np.argmax(onehot[:end] == 1, axis=1)
        codes = []
        for start in range(end - slice_len + 1):
            code = 0
            for off in range(slice_len):
                # Horner's rule: treat the window as a base-21 number.
                code = code * 21 + int(aa[start + off])
            codes.append(str(code))
        total_slice_num += len(codes)
        proteins.append(codes)
    print("totally {} slices".format(total_slice_num))
    return proteins
#########################################
# (1) generate slice
############################################
slice_len = 5
all_slices = generate_slices('all_seq.npy', slice_len)
KIBA_slices = generate_slices('KIBA_seq.npy', slice_len)
#########################################
# (2) generate vector for each slice (word2vec)
############################################
slice_window = 10 - slice_len
# NOTE(review): `monitor` is not defined anywhere in this file —
# callbacks=[monitor()] raises NameError unless it is supplied elsewhere.
# Also `size`/`iter` are gensim<4.0 parameter names (renamed to
# vector_size/epochs in 4.x) — confirm the pinned gensim version.
model = word2vec.Word2Vec(all_slices, sg=0, size=64, window=slice_window, min_count=3, negative=3, sample=0.001, hs=1, workers=4, batch_words=10, iter=10000, alpha=0.0001, callbacks=[monitor()])
all_words = model.wv.index2word
# print(all_slices)
print("totally {} words".format(len(all_words)))
model.save("word2vec.model")
#########################################
# (3) cluster into 1024 classes of slices
############################################
KIBA_vector = []
for slice in KIBA_slices:
    # `slice` is a *list* of slice strings (one protein), so model[slice]
    # yields a 2-D array of per-slice vectors; `slice` also shadows the builtin.
    KIBA_vector.append(model[slice])
ac = AgglomerativeClustering(n_clusters=1024)
# NOTE(review): KIBA_vector is a list of variable-length 2-D arrays, but
# fit_predict expects one (n_samples, n_features) matrix — the per-protein
# matrices presumably need concatenating first; verify before running.
cls = ac.fit_predict(KIBA_vector)
print(cls) # class for all slices in all protein
##########################################
# (4) map all slice to 1024 class
############################################
slice_dic = {}
for i, slice in enumerate(KIBA_slices):
    # NOTE(review): `slice` is a list here and lists are unhashable, so this
    # raises TypeError as written; the intended key looks like each slice
    # *string* paired with its per-slice cluster label — confirm.
    slice_dic[slice] = cls[i]
##########################################
# (5) generate onehot encoding
############################################
import numpy as np
# One row per protein; bit j is set when the protein contains any slice
# assigned to cluster j.
protein_onehot = np.zeros((442, 1024)) # 442 proteins
for i, protein in enumerate(KIBA_slices):
    for slice in protein:
        if slice in slice_dic:
            protein_onehot[i][slice_dic[slice]] = 1
np.save('KIBA_fingerprint.npy', protein_onehot)
\ No newline at end of file
import numpy as np
import os
import torch
from torch import nn
from tqdm import tqdm
from data import datas
from fingerDTA import FingerDTA
def CI(P, Y):
    """Concordance index of predictions ``P`` against labels ``Y``.

    For every pair (i, j) with Y[i] > Y[j], a concordant prediction
    (P[i] > P[j]) scores 1 and a tie (P[i] == P[j]) scores 0.5.
    Returns the mean score over all comparable pairs, or 0 when no
    comparable pair exists (e.g. constant or empty ``Y``).
    """
    pair = 0
    summ = 0.0
    # j < i already guarantees i != j, so the original's extra
    # `if i != j` check was redundant and has been dropped.
    for i in range(1, len(Y)):
        for j in range(i):
            if Y[i] > Y[j]:
                pair += 1
                summ += 1.0 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])
    return summ / pair if pair else 0
def r_squared_error(y_obs, y_pred):
    """Squared Pearson correlation (r^2) between observed and predicted."""
    obs = np.asarray(y_obs)
    pred = np.asarray(y_pred)
    obs_c = obs - obs.mean()
    pred_c = pred - pred.mean()
    cov = np.dot(pred_c, obs_c)
    # r^2 = cov^2 / (var_obs * var_pred), up to the common 1/n factors.
    return cov * cov / (np.dot(obs_c, obs_c) * np.dot(pred_c, pred_c))
def get_k(y_obs, y_pred):
    """Slope of the zero-intercept least-squares fit y_obs ~= k * y_pred."""
    obs = np.asarray(y_obs)
    pred = np.asarray(y_pred)
    return np.dot(obs, pred) / np.dot(pred, pred)
def squared_error_zero(y_obs, y_pred):
    """r0^2: coefficient of determination of the zero-intercept fit k*y_pred."""
    k = get_k(y_obs, y_pred)
    obs = np.asarray(y_obs)
    pred = np.asarray(y_pred)
    resid = obs - k * pred          # residuals of the through-origin fit
    dev = obs - np.mean(obs)        # deviations from the observed mean
    return 1 - (np.dot(resid, resid) / np.dot(dev, dev))
def get_rm2(ys_line, ys_orig):
    """Modified r_m^2 metric: r2 * (1 - sqrt(|r2^2 - r0^2|))."""
    r2 = r_squared_error(ys_orig, ys_line)
    r02 = squared_error_zero(ys_orig, ys_line)
    gap = np.absolute(r2 * r2 - r02 * r02)
    return r2 * (1 - np.sqrt(gap))
def evaluate_final(model):
    """Run ``model`` over the test loader and print MSE, CI and rm2.

    Relies on module-level globals: ``data`` (DataLoader dict) and
    ``loss`` (MSELoss).  Fix: wraps inference in ``torch.no_grad()`` —
    the original built autograd graphs for every batch, wasting memory
    during pure evaluation.
    """
    batch = 0
    loss_value = 0
    P = []
    Y = []
    model.eval()
    with torch.no_grad():
        for drug, drug_fp, protein, prot_fp, affinity in data['test']:
            batch += 1
            # Move the channel axis to dim 1 for Conv1d.
            drug = drug.permute(0, 2, 1)
            protein = protein.permute(0, 2, 1)
            judge = model(drug, drug_fp, protein, prot_fp)
            P.append(judge.squeeze(1).cpu().numpy())
            Y.append(affinity.cpu().numpy())
            loss_value += loss(judge, affinity.unsqueeze(1)).cpu()
    P = np.concatenate(P, axis=0)
    Y = np.concatenate(Y, axis=0)
    CI_index = CI(P, Y)
    rm2_index = get_rm2(P, Y)
    print("MSE", loss_value / batch, "\n")
    print("CI_index", CI_index, "\n")
    print("rm2_index", rm2_index, "\n")
def evaluate(model, epoch):
    """Validate ``model``, append MSE to the log, checkpoint on improvement.

    Uses globals: ``data``, ``loss``, ``Losssss`` (best validation loss so
    far), ``log_name`` and ``state_name``.  Fix: wraps inference in
    ``torch.no_grad()`` — the original tracked gradients during validation.
    """
    global Losssss
    batch = 0
    loss_value = 0
    model.eval()
    with torch.no_grad():
        for drug, drug_fp, protein, prot_fp, affinity in data['valid']:
            batch += 1
            # Move the channel axis to dim 1 for Conv1d.
            drug = drug.permute(0, 2, 1)
            protein = protein.permute(0, 2, 1)
            judge = model(drug, drug_fp, protein, prot_fp)
            loss_value += loss(judge, affinity.unsqueeze(1)).cpu()
    with open(os.path.join(os.path.abspath(os.curdir), log_name + '.log'), 'a') as f:
        f.write("epoch " + str(epoch) + ": " + str(loss_value / batch) + '\n')
    print("MSE", loss_value / batch, "\n")
    if loss_value / batch < Losssss:
        Losssss = loss_value / batch
        save_model(model, os.path.join(os.path.abspath(os.curdir), state_name + '.state'))
def train(model, optimizer):
    """300-epoch training loop; validates (and checkpoints) after each epoch.

    Uses globals: ``data`` (DataLoader dict), ``loss`` and ``evaluate``.
    """
    global pre_auc
    progress = tqdm(range(300))
    pre_auc = -1  # kept for parity with the original script; not read here
    for epoch in progress:
        model.train()
        for step, (drug, drug_fp, protein, prot_fp, affinity) in enumerate(data['train'], start=1):
            # Channels-first layout for Conv1d.
            drug = drug.permute(0, 2, 1)
            protein = protein.permute(0, 2, 1)
            judge = model(drug, drug_fp, protein, prot_fp)
            loss_value = loss(judge, affinity.unsqueeze(1))
            progress.set_description('epoch: {} batch: {} loss: {}'.format(epoch, step, loss_value))
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
        evaluate(model, epoch)
def save_model(model, name):
    """Serialize the model's parameters (state_dict) to the file *name*."""
    torch.save(model.state_dict(), name)
def load_model(model, name):
    """Restore parameters saved by ``save_model`` into *model* (in place)."""
    model.load_state_dict(torch.load(name))
#####################
# train
########################
# Module-level driver: trains on one fold, then reloads the best
# checkpoint and reports final test metrics.  The names below are read
# as globals by train()/evaluate()/evaluate_final() — do not rename.
data_i = 0 # five fold: 0, 1, 2, 3, 4
data = datas[data_i]  # {'train', 'valid', 'test'} DataLoaders for this fold
Losssss = 9000000  # best validation loss so far (sentinel: effectively +inf)
model_type = 'fingerdta'
log_name = 'fingerdta' + str(data_i)    # stem of the validation-loss log file
state_name = 'fingerdta' + str(data_i)  # stem of the checkpoint written by evaluate()
loss = nn.MSELoss().cuda()
model = FingerDTA().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train(model, optimizer)
#####################
# evaluate
########################
# Fresh model instance; weights come from the best checkpoint.
# NOTE(review): this loads from a '<model_type>/' subdirectory, but
# evaluate() saves the checkpoint to the current directory — confirm the
# paths agree (or that the file is moved between runs).
model = FingerDTA().cuda()
load_model(model, os.path.join(os.path.abspath(os.curdir),model_type, '{}{}.state'.format(model_type, data_i)))
evaluate_final(model)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论