Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
F
FingerDTA
概览
概览
详情
活动
周期分析
版本库
存储库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
mszjaas
FingerDTA
Commits
5788ff27
提交
5788ff27
authored
12月 19, 2020
作者:
mszjaas
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
FingerDTA template
上级
隐藏空白字符变更
内嵌
并排
正在显示
4 个修改的文件
包含
429 行增加
和
0 行删除
+429
-0
data.py
data.py
+63
-0
fingerDTA.py
fingerDTA.py
+115
-0
generate_fingerprint.py
generate_fingerprint.py
+89
-0
train_and_evaluate.py
train_and_evaluate.py
+162
-0
没有找到文件。
data.py
0 → 100644
浏览文件 @
5788ff27
import pickle
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Five-fold train / fixed test index splits, reusing the fold files published
# by DeepDTA.  Each entry indexes into the flattened list of measured
# drug/protein pairs built below (label_row_inds / label_col_inds).
test_fold = json.load(open("test_fold_setting1.txt"))  # from DeepDTA
train_folds = json.load(open("train_fold_setting1.txt"))  # from DeepDTA

# Protein store: 'seq' holds the encoded amino-acid sequences (consumed as
# numpy arrays by torch.from_numpy in Datas.__getitem__).
with open(r'KIBA_protein.pickle', 'rb') as f:
    store = pickle.load(f)
seqs = store['seq']

# Ligand store: encoded SMILES plus precomputed molecular fingerprints.
with open(r'KIBA_ligand.pickle', 'rb') as f:
    store = pickle.load(f)
drugs = store['smiles']
drug_fps = store['fingerprint']

# Protein fingerprints produced by generate_fingerprint.py.
prot_fps = np.load('KIBA_fingerprint.npy')

# Drug x protein affinity matrix; NaN marks unmeasured pairs.
# NOTE(review): the file name says 'davis' while every other input here is
# KIBA — confirm this is the intended relation matrix.
with open(r'davis_relation.pickle', 'rb') as f:
    relationship = pickle.load(f)

# Row/column indices of every measured (non-NaN) pair; the fold files above
# index into these parallel arrays.
label_row_inds, label_col_inds = np.where(np.isnan(relationship) == False)
class Datas(Dataset):
    """Dataset over measured drug/protein pairs for one cross-validation fold.

    ``index`` selects the fold (0..4) and ``data_type`` selects the split:
    'train' (all training folds except ``index``), 'valid' (fold ``index``
    itself) or 'test' (the fixed DeepDTA test fold).

    Each item is moved to the GPU eagerly, so a CUDA device is required.
    """

    def __init__(self, index, data_type):
        selected = []
        if data_type == 'train':
            # Every training fold except the held-out validation fold.
            for fold_id in range(5):
                if fold_id != index:
                    selected.extend(train_folds[fold_id])
        elif data_type == 'valid':
            selected.extend(train_folds[index])
        elif data_type == 'test':
            selected = test_fold
        self.indexes = selected

    def __getitem__(self, index):
        # Resolve the flat pair index into its drug row / protein column.
        pair = self.indexes[index]
        drug_i = label_row_inds[pair]
        protein_i = label_col_inds[pair]
        drug = torch.from_numpy(drugs[drug_i]).float().cuda()
        drug_fp = torch.from_numpy(drug_fps[drug_i]).float().cuda()
        protein = torch.from_numpy(seqs[protein_i]).float().cuda()
        prot_fp = torch.from_numpy(prot_fps[protein_i]).float().cuda()
        affinity = torch.tensor(relationship[drug_i][protein_i]).float().cuda()
        return drug, drug_fp, protein, prot_fp, affinity

    def __len__(self):
        return len(self.indexes)
# five fold: one entry per cross-validation fold, each mapping a split name
# ('train' / 'test' / 'valid') to its DataLoader.
datas = []
for fold in range(5):
    loaders = {
        split: DataLoader(Datas(fold, split), batch_size=128, shuffle=True)
        for split in ('train', 'test', 'valid')
    }
    datas.append(loaders)
\ No newline at end of file
fingerDTA.py
0 → 100644
浏览文件 @
5788ff27
import
torch
from
torch
import
nn
# Dense Convolutional Block
class ConvBlock(nn.Module):
    """Densely connected 1-D convolution block.

    Four convolutions with kernel sizes 1/3/5/7; each branch sees the input
    concatenated with every previous branch output.  The four branch outputs
    (each ``length_out // 4`` channels) are concatenated along the channel
    dimension and passed through ReLU, mapping ``length_in`` channels to
    ``length_out`` channels overall while preserving sequence length.
    """

    def __init__(self, length_in, length_out):
        super(ConvBlock, self).__init__()
        branch = length_out // 4  # each of the 4 branches emits a quarter
        self.x1 = nn.Conv1d(length_in, branch, kernel_size=1)
        self.x2 = nn.Conv1d(length_in + branch, branch, kernel_size=3, padding=1)
        self.x3 = nn.Conv1d(length_in + 2 * branch, branch, kernel_size=5, padding=2)
        self.x4 = nn.Conv1d(length_in + 3 * branch, branch, kernel_size=7, padding=3)

    def forward(self, data_in):
        # Dense connectivity: each branch consumes all earlier branch outputs
        # plus the raw input.
        x1 = self.x1(data_in)
        x2 = self.x2(torch.cat((x1, data_in), dim=1))
        x3 = self.x3(torch.cat((x2, x1, data_in), dim=1))
        x4 = self.x4(torch.cat((x3, x2, x1, data_in), dim=1))
        merged = torch.cat((x1, x2, x3, x4), dim=1)
        return nn.functional.relu(merged, inplace=False)
class CNN(nn.Module):
    """Three stacked ConvBlocks: ``type_num`` -> 128 -> 256 -> 96 channels.

    Sequence length is preserved throughout; only the channel count changes.
    """

    def __init__(self, type_num=64):
        super(CNN, self).__init__()
        self.x1 = ConvBlock(type_num, 128)
        self.x2 = ConvBlock(128, 256)
        self.x3 = ConvBlock(256, 96)

    def forward(self, data_in):
        # Feed the input through the three dense blocks in order.
        out = data_in
        for block in (self.x1, self.x2, self.x3):
            out = block(out)
        return out
class FC(nn.Module):
    """Linear layer with optional dropout, finished by a leaky-ReLU.

    ``dropout=True`` (default) inserts an ``nn.Dropout()`` between the linear
    layer and the activation; the flag is fixed at construction time.
    """

    def __init__(self, dim_in, dim_out, dropout=True):
        super(FC, self).__init__()
        self.x1 = nn.Linear(dim_in, dim_out)
        self.x2 = torch.nn.Dropout()
        self.dropout = dropout

    def forward(self, x):
        out = self.x1(x)
        if self.dropout:
            out = self.x2(out)
        return nn.functional.leaky_relu(out, inplace=False)
class fp_FC(nn.Module):
    """Two-layer MLP for fingerprint vectors: dim_in -> 512 -> dim_out.

    Dropout follows the first linear layer; no activation is applied
    (activations are intentionally left out, matching the original design).
    """

    def __init__(self, dim_in, dim_out):
        super(fp_FC, self).__init__()
        self.x1 = nn.Linear(dim_in, 512)
        self.x2 = torch.nn.Dropout()
        self.x3 = nn.Linear(512, dim_out)

    def forward(self, x):
        hidden = self.x2(self.x1(x))
        return self.x3(hidden)
class FingerDTA(nn.Module):
    """Drug-target affinity model combining sequence CNNs with fingerprint MLPs.

    The encoded drug (64 channels) and protein (21 channels) each pass through
    a CNN; their 1024-bit fingerprints pass through small MLPs whose 96-dim
    outputs gate the CNN feature maps channel-wise.  The gated maps are
    global-max-pooled, concatenated, and regressed to a single affinity value.
    """

    def __init__(self):
        super(FingerDTA, self).__init__()
        self.drug_model = CNN(64)      # 64 SMILES symbol channels
        self.protein_model = CNN(21)   # 21 amino-acid channels
        self.fp_drug = fp_FC(1024, 96)
        self.fp_protein = fp_FC(1024, 96)
        self.fc1 = FC(192, 1024)
        self.fc2 = FC(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 1)

    def forward(self, drug, drug_fp, protein, prot_fp):
        drug_feat = self.drug_model(drug)
        drug_gate = self.fp_drug(drug_fp)
        protein_feat = self.protein_model(protein)
        protein_gate = self.fp_protein(prot_fp)
        # Embed the fingerprint into the convolutional output: broadcast the
        # 96-dim fingerprint embedding across the sequence axis as a
        # channel-wise gate.
        drug_out = drug_gate.unsqueeze(2) * drug_feat
        protein_out = protein_gate.unsqueeze(2) * protein_feat
        # Global max pooling over the sequence axis -> (batch, 96) each.
        drug_out = nn.functional.adaptive_max_pool1d(drug_out, output_size=1).squeeze(2)
        protein_out = nn.functional.adaptive_max_pool1d(protein_out, output_size=1).squeeze(2)
        merged = torch.cat((drug_out, protein_out), dim=1)
        # Fully connected regression head.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            merged = layer(merged)
        return merged
\ No newline at end of file
generate_fingerprint.py
0 → 100644
浏览文件 @
5788ff27
import
numpy
as
np
from
gensim.models
import
word2vec
from
sklearn.cluster
import
AgglomerativeClustering
def generate_slices(path, slice_len=5):
    """Decode one-hot protein matrices into overlapping slice-code strings.

    Parameters
    ----------
    path : str or numpy.ndarray
        Path of an ``.npy`` file holding the one-hot sequences, or
        (generalization, backward compatible) the already-loaded array
        itself.  Expected layout: (protein, position, 21), where an all-zero
        position row marks the padded end of a sequence.
    slice_len : int
        Number of consecutive residues folded into one slice code.

    Returns
    -------
    list[list[str]]
        For each protein, the decimal-string encoding of every
        length-``slice_len`` window, reading the residue indices as
        base-21 digits.
    """
    seqs = np.load(path) if isinstance(path, str) else np.asarray(path)

    # (1) one-hot rows -> residue index lists, stopping at the first
    # all-zero row (padding).
    aas = []  # amino acids, one index list per protein
    for mat in seqs:
        aa = []
        for row in mat:
            hit = np.flatnonzero(row == 1)
            if hit.size == 0:
                break  # padding reached: the rest of the matrix is empty
            aa.append(int(hit[0]))
        aas.append(aa)

    # (2) slide a window of slice_len residues and encode it in base 21.
    proteins = []
    total_slice_num = 0
    for aa in aas:
        protein = []
        for start in range(len(aa) - slice_len + 1):
            code = 0
            for residue in aa[start:start + slice_len]:
                # 21 kinds of amino acid -- base 21 to base 10
                code = code * 21 + residue
            protein.append(str(code))
            total_slice_num += 1
        proteins.append(protein)

    print("totally {} slices".format(total_slice_num))
    return proteins
#########################################
# (1) generate slice
############################################
slice_len = 5
all_slices = generate_slices('all_seq.npy', slice_len)
KIBA_slices = generate_slices('KIBA_seq.npy', slice_len)

#########################################
# (2) generate vector for each slice (word2vec)
############################################
slice_window = 10 - slice_len
# BUG FIX: the original passed callbacks=[monitor()] but `monitor` was never
# defined anywhere (NameError at startup); training runs without callbacks.
model = word2vec.Word2Vec(all_slices, sg=0, size=64, window=slice_window,
                          min_count=3, negative=3, sample=0.001, hs=1,
                          workers=4, batch_words=10, iter=10000, alpha=0.0001)
all_words = model.wv.index2word
print("totally {} words".format(len(all_words)))
model.save("word2vec.model")

#########################################
# (3) cluster into 1024 classes of slices
############################################
# BUG FIX: KIBA_slices is a list of per-protein slice *lists*; the original
# appended model[<list>] (a 2-D array per protein, leaving a ragged list that
# AgglomerativeClustering cannot fit) and later used the lists themselves as
# dict keys (TypeError: unhashable).  Work on the flattened slice sequence,
# skipping slices dropped from the vocabulary by min_count.
flat_slices = [s for protein in KIBA_slices for s in protein if s in model.wv]
KIBA_vector = [model[s] for s in flat_slices]
ac = AgglomerativeClustering(n_clusters=1024)
cls = ac.fit_predict(np.array(KIBA_vector))
print(cls)
# class for all slices in all protein

##########################################
# (4) map all slice to 1024 class
############################################
# Later occurrences of a repeated slice overwrite earlier ones, exactly as in
# the original mapping loop.
slice_dic = {}
for i, s in enumerate(flat_slices):
    slice_dic[s] = cls[i]

##########################################
# (5) generate onehot encoding
############################################
# (duplicate mid-file `import numpy as np` removed; numpy is imported at the
# top of the file)
protein_onehot = np.zeros((442, 1024))  # 442 proteins in the KIBA set
for i, protein in enumerate(KIBA_slices):
    for s in protein:
        if s in slice_dic:
            protein_onehot[i][slice_dic[s]] = 1
np.save('KIBA_fingerprint.npy', protein_onehot)
\ No newline at end of file
train_and_evaluate.py
0 → 100644
浏览文件 @
5788ff27
import
numpy
as
np
import
os
import
torch
from
torch
import
nn
from
tqdm
import
tqdm
from
data
import
datas
from
fingerDTA
import
FingerDTA
def CI(P, Y):
    """Concordance index of predictions ``P`` against ground truth ``Y``.

    Over every pair with Y[i] > Y[j], score 1 for a concordant prediction
    (P[i] > P[j]) and 0.5 for a tied one (P[i] == P[j]); return the average
    over all comparable pairs, or 0 when no such pair exists.
    """
    pair = 0
    summ = 0
    for i in range(1, len(Y)):
        for j in range(i):
            if Y[i] > Y[j]:
                pair += 1
                summ += 1 * (P[i] > P[j]) + 0.5 * (P[i] == P[j])
    return summ / pair if pair != 0 else 0
def r_squared_error(y_obs, y_pred):
    """Squared Pearson correlation coefficient (r^2) of y_obs vs y_pred."""
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    obs_dev = y_obs - np.mean(y_obs)
    pred_dev = y_pred - np.mean(y_pred)
    covariance = sum(pred_dev * obs_dev)
    # r^2 = cov^2 / (var_obs * var_pred), with the common 1/n factors cancelled.
    return (covariance * covariance) / (sum(obs_dev * obs_dev) * sum(pred_dev * pred_dev))
def get_k(y_obs, y_pred):
    """Slope of the least-squares regression through the origin: y_obs ~ k * y_pred."""
    obs = np.array(y_obs)
    pred = np.array(y_pred)
    return np.dot(obs, pred) / np.dot(pred, pred)
def squared_error_zero(y_obs, y_pred):
    """r0^2: coefficient of determination of the zero-intercept fit y_obs = k * y_pred."""
    k = get_k(y_obs, y_pred)
    y_obs = np.array(y_obs)
    y_pred = np.array(y_pred)
    residual = y_obs - k * y_pred
    upp = sum(residual * residual)          # residual sum of squares of the k-fit
    centered = y_obs - np.mean(y_obs)
    down = sum(centered * centered)         # total sum of squares
    return 1 - (upp / down)
def get_rm2(ys_line, ys_orig):
    """rm^2 metric: r2 * (1 - sqrt(|r2^2 - r0^2|)).

    Combines the ordinary r^2 and the zero-intercept r0^2 of the predicted
    values against the observations.
    """
    r2 = r_squared_error(ys_orig, ys_line)
    r02 = squared_error_zero(ys_orig, ys_line)
    gap = np.sqrt(np.absolute((r2 * r2) - (r02 * r02)))
    return r2 * (1 - gap)
def evaluate_final(model):
    # Final evaluation on the 'test' split: prints mean MSE, concordance
    # index (CI) and rm^2 over the whole test DataLoader.
    # Relies on module globals: `data` (this fold's DataLoaders) and `loss`
    # (the CUDA MSELoss); requires a CUDA device via the dataset/model.
    batch = 0
    loss_value = 0
    P = []  # per-batch prediction arrays
    Y = []  # per-batch ground-truth affinity arrays
    model.eval()
    for drug, drug_fp, protein, prot_fp, affinity in data['test']:
        batch += 1
        # DataLoader yields (batch, length, channels); Conv1d expects
        # (batch, channels, length).
        drug = drug.permute(0, 2, 1)
        protein = protein.permute(0, 2, 1)
        judge = model(drug, drug_fp, protein, prot_fp)
        P.append(judge.squeeze(1).detach().cpu().numpy())
        Y.append(affinity.detach().cpu().numpy())
        loss_value += loss(judge, affinity.unsqueeze(1)).detach().cpu()
    # Flatten per-batch arrays into one vector each for the ranking metrics.
    P = np.concatenate((P), axis=0)
    Y = np.concatenate((Y), axis=0)
    CI_index = CI(P, Y)
    rm2_index = get_rm2(P, Y)
    print("MSE", loss_value / batch, "\n")
    print("CI_index", CI_index, "\n")
    print("rm2_index", rm2_index, "\n")
def evaluate(model, epoch):
    # Validation pass for one epoch: computes mean MSE on the 'valid' split,
    # appends it to the run log, and checkpoints the model whenever the loss
    # improves on the global best (`Losssss`).
    # Relies on module globals: `data`, `loss`, `log_name`, `state_name`,
    # `Losssss` (mutated here).
    global Losssss
    batch = 0
    loss_value = 0
    model.eval()
    for drug, drug_fp, protein, prot_fp, affinity in data['valid']:
        batch += 1
        # (batch, length, channels) -> (batch, channels, length) for Conv1d.
        drug = drug.permute(0, 2, 1)
        protein = protein.permute(0, 2, 1)
        judge = model(drug, drug_fp, protein, prot_fp)
        loss_value += loss(judge, affinity.unsqueeze(1)).detach().cpu()
    # Append this epoch's mean validation loss to <log_name>.log in the CWD.
    with open(os.path.join(os.path.abspath(os.curdir), log_name + '.log'), 'a') as f:
        f.write("epoch " + str(epoch) + ": " + str(loss_value / batch) + '\n')
    print("MSE", loss_value / batch, "\n")
    if loss_value / batch < Losssss:
        # New best validation loss: remember it and save a checkpoint.
        Losssss = loss_value / batch
        save_model(model, os.path.join(os.path.abspath(os.curdir), state_name + '.state'))
def train(model, optimizer):
    # Train for 300 epochs on the 'train' split, running a validation pass
    # (which also checkpoints on improvement) after every epoch.
    # Relies on module globals: `data` (DataLoaders) and `loss` (MSELoss).
    global pre_auc
    progress = tqdm(range(300))
    pre_auc = -1  # NOTE(review): set but never read anywhere visible in this file.
    for epoch in progress:
        model.train()
        for batch, [drug, drug_fp, protein, prot_fp, affinity] in enumerate(data['train']):
            batch += 1  # 1-based batch number for the progress display only
            # (batch, length, channels) -> (batch, channels, length) for Conv1d.
            drug = drug.permute(0, 2, 1)
            protein = protein.permute(0, 2, 1)
            judge = model(drug, drug_fp, protein, prot_fp)
            loss_value = loss(judge, affinity.unsqueeze(1))
            progress.set_description('epoch: {} batch: {} loss: {}'.format(epoch, batch, loss_value))
            # Standard optimization step.
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
        evaluate(model, epoch)
def save_model(model, name):
    """Persist only the model's parameters (its state_dict) to `name`."""
    state = model.state_dict()
    torch.save(state, name)
def load_model(model, name):
    """Restore the model's parameters from the state_dict file at `name`."""
    state = torch.load(name)
    model.load_state_dict(state)
#####################
# train
########################
data_i = 0
# five fold: 0, 1, 2, 3, 4
data = datas[data_i]  # DataLoaders for the chosen fold (built in data.py)
Losssss = 9000000  # best validation loss so far; large sentinel for "infinity"
model_type = 'fingerdta'
log_name = 'fingerdta' + str(data_i)     # validation-loss log file stem
state_name = 'fingerdta' + str(data_i)   # checkpoint file stem
loss = nn.MSELoss().cuda()
model = FingerDTA().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
train(model, optimizer)

#####################
# evaluate
########################
# Re-create the model and load the best checkpoint saved during training.
# NOTE(review): training saves to '<state_name>.state' in the CWD, but this
# loads from '<model_type>/<model_type><data_i>.state' (a subdirectory) —
# confirm the two paths are meant to agree or that files are moved between
# the phases.
model = FingerDTA().cuda()
load_model(model, os.path.join(os.path.abspath(os.curdir), model_type, '{}{}.state'.format(model_type, data_i)))
evaluate_final(model)
\ No newline at end of file
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论