Commit 101977f0 Author: 朱学凯

add mol encode

Parent cb4c0cc6
......@@ -2,6 +2,7 @@ model/
*.ipynb_checkpoints/
.idea/
.DS_Store
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
......
......@@ -15,5 +15,5 @@
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
"vocab_size": 23614
}
......@@ -15,5 +15,5 @@
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
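For reference, the run script below loads this new config (presumably ./config/config_layer_3_mol.json, the path the local-test block points at) through the repo's own BertConfig; a minimal sketch using only paths from this commit:

from configuration_bert import BertConfig  # local module, imported the same way in the run script

# vocab_size (40235) must match the line count of config/vocab_mol.txt, and
# max_position_embeddings (595) must cover the tokenizer's max_len of 595
# set in get_task('train_mol').
config = BertConfig.from_pretrained('./config/config_layer_3_mol.json')
print(config.vocab_size, config.max_position_embeddings)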
(source diff too large to display)
_PAD
_GO
_EOS
_UNK
[PAD]
[MASK]
[CLS]
[SEP]
[UNK]
[unused1]
[unused2]
[unused3]
......
(source diff too large to display)
......@@ -145,6 +145,51 @@ class Data_Encoder(data.Dataset):
        return torch.from_numpy(input).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y
        # return len(d), len(p)
class Data_Encoder_mol(data.Dataset):
    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # load data
        # with open(train_file["sps"], 'r') as f:
        #     self.sps = f.readlines()
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # define tokenizer
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
        bpe_codes_prot = codecs.open(tokenizer_config["vocab_pair_p"])
        self.pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.smile)

    def __getitem__(self, index):
        'Generates one sample of data'
        # tokenize the SMILES string and the protein sequence with their BPE codes
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.pbpe.process_line(self.seq[index].strip()).split()
        y = np.float64(self.affinity[index].strip())
        # [CLS] drug tokens [SEP] protein tokens [SEP]
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # segment ids: 0 for the drug part (incl. [CLS]/[SEP]), 1 for the protein part;
        # np.int is removed in NumPy >= 1.24, so use np.int64 explicitly
        token_type_ids = np.concatenate((np.zeros(len(d) + 2, dtype=np.int64), np.ones(len(p) + 1, dtype=np.int64)))
        # assumes len(input_seq) <= max_len; np.pad would fail on a negative pad width
        token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return torch.from_numpy(input).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y
        # return len(d), len(p)
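seq2emb_encoder is defined earlier in dataset.py and is not part of this diff; below is a minimal sketch of the behavior its call site implies (token ids padded to max_len plus a 0/1 attention mask — the '[UNK]' fallback is an assumption):

import numpy as np

def seq2emb_encoder_sketch(tokens, max_len, vocab):
    # Map tokens to vocab ids (assumed: unknown tokens fall back to '[UNK]'),
    # truncate to max_len, and zero-pad; the mask marks real tokens with 1.
    ids = [vocab.get(t, vocab['[UNK]']) for t in tokens][:max_len]
    input_ids = np.zeros(max_len, dtype=np.int64)
    input_ids[:len(ids)] = ids
    input_mask = np.zeros(max_len, dtype=np.int64)
    input_mask[:len(ids)] = 1
    return input_ids, input_mask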
def get_task(task_name):
......@@ -195,6 +240,24 @@ def get_task(task_name):
        return df, tokenizer_config
    elif task_name.lower() == 'train_mol':
        df_train = {"sps": './data/train/train_sps',
                    'seq': './data/train/train_protein_seq',
                    "smile": './data/train/train_smile',
                    "affinity": './data/train/train_ic50',
                    }
        tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                            "vocab_pair": './config/drug_codes_chembl.txt',
                            "vocab_pair_p": './config/protein_codes_uniprot.txt',
                            "begin_id": '[CLS]',
                            "separate_id": "[SEP]",
                            "max_len": 595
                            }
        return df_train, tokenizer_config
......@@ -211,30 +274,14 @@ if __name__ == "__main__":
    # vocab = load_vocab(vocab_file)
    # test train
    df_train = {"sps": './data/train_sps',
                "smile": './data/train_smile',
                "affinity": './data/train_ic50',
                }
    df_test = {"sps": './data/test_sps',
               "smile": './data/test_smile',
               "affinity": './data/test_ic50',
               }
    tokenizer_config = {"vocab_file": './config/vocab.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 256
                        }
    task = 'train_mol'
    data_file, tokenizer_config = get_task(task)
    params = {'batch_size': 1,
              'shuffle': False,
              'num_workers': 0
              }
    # trainset = Data_Encoder(df_train, tokenizer_config)
    # training_generator = data.DataLoader(trainset, **params)
    # with open('utils/train_data_analyse.csv', 'w', newline='') as f:
    #     csv_f = csv.writer(f)
    #     csv_f.writerow(['drug_len', 'protein_len'])
    #     for i, (len_d, len_p) in tqdm(enumerate(training_generator)):
    #         d = len_d.numpy()[0]
    #         p = len_p.numpy()[0]
    #         csv_f.writerow([str(d), str(p)])
    trainset = Data_Encoder_mol(data_file, tokenizer_config)
    training_generator = data.DataLoader(trainset, **params)
    # Data_Encoder_mol yields (input_ids, token_type_ids, input_mask, y), not two lengths
    for i, (input_ids, token_type_ids, input_mask, y) in tqdm(enumerate(training_generator)):
        print(input_ids.shape, token_type_ids.shape, input_mask.shape, y)
from argparse import ArgumentParser
from dataset import Data_Encoder, get_task
from dataset import Data_Encoder, get_task, Data_Encoder_mol
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
......@@ -102,7 +102,8 @@ def test(args, model, dataset):
def main(args):
    # load data
    data_file, tokenizer_config = get_task(args.task)
    dataset = Data_Encoder(data_file, tokenizer_config)
    # dataset = Data_Encoder(data_file, tokenizer_config)
    dataset = Data_Encoder_mol(data_file, tokenizer_config)
    # create model
    print('------------------create model---------------------------')
    config = BertConfig.from_pretrained(args.config)
......@@ -110,7 +111,7 @@ def main(args):
    print('model name : BertAffinity')
    print('task name : {}'.format(args.task))
    if args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100']:
    if args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100', 'train_mol']:
        train(args, model, dataset)
    elif args.task in ['test']:
......@@ -119,9 +120,6 @@ def main(args):
if __name__ == '__main__':
    # get parameter
    parser = ArgumentParser(description='BertAffinity')
......@@ -152,11 +150,11 @@ if __name__ == '__main__':
    # local test
    # args.task = 'train_z_10'
    # args.savedir = 'local_test_train'
    # args.epochs = 10
    # args.lr = 1e-5
    # args.config = './config/config_layer_3.json'
    args.task = 'train_mol'
    args.savedir = 'local_test_train'
    args.epochs = 10
    args.lr = 1e-5
    args.config = './config/config_layer_3_mol.json'
......
CUDA_VISIBLE_DEVICES=4 python run_interaction.py --b=64 --task=train --epochs=30 --lr=1e-5 --savedir=lr-1e-5-batch-64-e-30-layer3-0505 --config=./config/config_layer_3.json
CUDA_VISIBLE_DEVICES=4 python run_interaction.py --b=64 --task=train_mol --epochs=30 --lr=1e-5 --savedir=lr-1e-5-batch-64-e-30-layer3-0505 --config=./config/config_layer_3_mol.json
from subword_nmt.apply_bpe import BPE
import codecs
import collections
bpe_codes_drug = codecs.open('../config/drug_codes_chembl.txt')
dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
bpe_codes_prot = codecs.open('../config/protein_codes_uniprot.txt')
pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
def load_file(file):
    data = []
    with open(file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            data.append(line.strip('\n'))
    return data
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
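A quick usage check: because the build script below writes the ten special tokens first, '[PAD]' should map to index 0 in vocab_mol.txt, matching "pad_token_id": 0 in the configs (assuming the vocab file was produced by that script):

vocab = load_vocab('../config/vocab_mol.txt')
assert vocab['[PAD]'] == 0      # consistent with "pad_token_id": 0
print(len(vocab))               # expected to equal the mol config's vocab_size (40235)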
def seq2vec(protein, drug):
    start_token = '[CLS]'
    sep_token = '[SEP]'
    prots = load_file(protein)
    drugs = load_file(drug)
    for p, d in zip(prots, drugs):
        d = dbpe.process_line(d).split()
        p = pbpe.process_line(p).split()
        # tokens is assembled but only the protein token count is inspected here
        tokens = [start_token] + d + [sep_token] + p + [sep_token]
        print(len(p))
if __name__ == '__main__':
    seq = '../data/test/test_protein_seq'
    smile = '../data/train/train_smile'  # note: protein file is from the test split, SMILES from train
    vocab = '../config/vocab_mol.txt'
    seq2vec(seq, smile)
\ No newline at end of file
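The print(len(p)) above presumably serves to eyeball tokenized lengths; a hypothetical helper that computes the relevant maximum directly (the +3 accounts for [CLS] and the two [SEP] tokens) — the kind of figure that would motivate max_len = 595 in get_task('train_mol'):

def max_input_len(protein_file, smile_file):
    # Longest combined input: [CLS] + drug tokens + [SEP] + protein tokens + [SEP]
    prots, drugs = load_file(protein_file), load_file(smile_file)
    return max(len(dbpe.process_line(d).split()) + len(pbpe.process_line(p).split()) + 3
               for p, d in zip(prots, drugs))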
import pandas as pd
import numpy as np

sub_csv = pd.read_csv('../ESPF/subword_units_map_chembl.csv')
sub_csv = pd.read_csv('../config/subword_units_map_chembl.csv')
idx2word_d = sub_csv['index'].values
sub_csv = pd.read_csv('../config/subword_units_map_uniprot.csv')
idx2word_p = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
with open('../ESPF/vocab76.to', 'r') as f:
    prot_vocab = f.readlines()
special_tokens = np.array(['[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]'])
all_tokens = np.concatenate((special_tokens, idx2word_p, idx2word_d))
save = '../config/vocab_mol.txt'
with open('../ESPF/vocab.txt', 'w') as f:
    for prot in prot_vocab:
        f.write(prot.strip() + '\n')
with open('../ESPF/vocab.txt', 'a') as f:
    for drug in idx2word_d:
        f.write(drug + '\n')
with open(save, 'w') as f:
    for token in all_tokens:
        f.write(str(token) + '\n')
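A sanity check tying this script to the new config (a sketch; 40235 is the "vocab_size" declared in the mol config above):

# vocab_mol.txt = 10 special tokens + protein subwords + drug subwords, so its
# length should equal the mol config's vocab_size.
print(len(all_tokens))
assert len(all_tokens) == 40235, "vocab size must match the config's vocab_size"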
(two more source diffs too large to display)