提交 101977f0 作者: 朱学凯

add mol encode

上级 cb4c0cc6
...@@ -2,6 +2,7 @@ model/ ...@@ -2,6 +2,7 @@ model/
*.ipynb_checkpoints/ *.ipynb_checkpoints/
.idea/ .idea/
.DS_Store .DS_Store
data/
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
......
...@@ -15,5 +15,5 @@ ...@@ -15,5 +15,5 @@
"num_hidden_layers": 12, "num_hidden_layers": 12,
"pad_token_id": 0, "pad_token_id": 0,
"type_vocab_size": 2, "type_vocab_size": 2,
"vocab_size": 23615 "vocab_size": 23614
} }
...@@ -15,5 +15,5 @@ ...@@ -15,5 +15,5 @@
"num_hidden_layers": 3, "num_hidden_layers": 3,
"pad_token_id": 0, "pad_token_id": 0,
"type_vocab_size": 2, "type_vocab_size": 2,
"vocab_size": 23615 "vocab_size": 23614
} }
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
_PAD [PAD]
_GO [MASK]
_EOS
_UNK
[CLS] [CLS]
[SEP] [SEP]
[UNK]
[unused1] [unused1]
[unused2] [unused2]
[unused3] [unused3]
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -145,6 +145,51 @@ class Data_Encoder(data.Dataset): ...@@ -145,6 +145,51 @@ class Data_Encoder(data.Dataset):
return torch.from_numpy(input).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y return torch.from_numpy(input).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y
# return len(d), len(p) # return len(d), len(p)
class Data_Encoder_mol(data.Dataset):
    """Drug–protein affinity dataset built from BPE-tokenized SMILES and sequences.

    Each sample is the pair encoded as ``[CLS] drug [SEP] protein [SEP]``,
    returned as (input_ids, token_type_ids, input_mask, affinity).

    Args:
        train_file: dict with file paths under keys 'seq', 'smile', 'affinity'
            (one example per line in each file, line-aligned across files).
        tokenizer_config: dict with 'begin_id', 'separate_id', 'max_len',
            'vocab_file', 'vocab_pair' (drug BPE codes), 'vocab_pair_p'
            (protein BPE codes).
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Load raw text data; files are assumed line-aligned — TODO confirm.
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()

        # Tokenizer configuration: special tokens, padded length, vocabulary.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])

        # Separate BPE models for drugs (SMILES) and proteins.
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
        bpe_codes_prot = codecs.open(tokenizer_config["vocab_pair_p"])
        self.pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.smile)

    def __getitem__(self, index):
        'Generates one sample of data'
        # BPE-tokenize the drug SMILES and the protein sequence.
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.pbpe.process_line(self.seq[index].strip()).split()
        y = np.float64(self.affinity[index].strip())

        # [CLS] drug [SEP] protein [SEP]
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]

        # Segment ids: 0 for [CLS]+drug+[SEP], 1 for protein+[SEP].
        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # use an explicit np.int64 dtype instead.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64),
                                         np.ones((len(p) + 1), dtype=np.int64)))
        # FIX: truncate before padding so np.pad never gets a negative width
        # when the tokenized pair is longer than max_len (np.pad would raise).
        # For pairs that fit, this is identical to the previous behavior.
        token_type_ids = token_type_ids[:self.max_len]
        token_type_ids = np.pad(token_type_ids,
                                (0, self.max_len - len(token_type_ids)),
                                'constant', constant_values=0)

        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return torch.from_numpy(input).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y
def get_task(task_name): def get_task(task_name):
...@@ -195,6 +240,24 @@ def get_task(task_name): ...@@ -195,6 +240,24 @@ def get_task(task_name):
return df, tokenizer_config return df, tokenizer_config
elif task_name.lower() == 'train_mol':
df_train = {"sps": './data/train/train_sps',
'seq': './data/train/train_protein_seq',
"smile": './data/train/train_smile',
"affinity": './data/train/train_ic50',
}
tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
"vocab_pair": './config/drug_codes_chembl.txt',
"vocab_pair_p": './config/protein_codes_uniprot.txt',
"begin_id": '[CLS]',
"separate_id": "[SEP]",
"max_len": 595
}
return df_train, tokenizer_config
...@@ -211,30 +274,14 @@ if __name__ == "__main__": ...@@ -211,30 +274,14 @@ if __name__ == "__main__":
# vocab = load_vocab(vocab_file) # vocab = load_vocab(vocab_file)
# test train # test train
df_train = {"sps": './data/train_sps', task = 'train_mol'
"smile": './data/train_smile', data_file, tokenizer_config = get_task(task)
"affinity": './data/train_ic50',
}
df_test = {"sps": './data/test_sps',
"smile": './data/test_smile',
"affinity": './data/test_ic50',
}
tokenizer_config = {"vocab_file": './config/vocab.txt',
"vocab_pair": './config/drug_codes_chembl.txt',
"begin_id": '[CLS]',
"separate_id": "[SEP]",
"max_len": 256
}
params = {'batch_size': 1, params = {'batch_size': 1,
'shuffle': False, 'shuffle': False,
'num_workers': 0 'num_workers': 0
} }
# trainset = Data_Encoder(df_train, tokenizer_config) trainset = Data_Encoder_mol(data_file, tokenizer_config)
# training_generator = data.DataLoader(trainset, **params) training_generator = data.DataLoader(trainset, **params)
# with open('utils/train_data_analyse.csv', 'w', newline='') as f: for i, (len_d, len_p) in tqdm(enumerate(training_generator)):
# csv_f = csv.writer(f) d = len_d.numpy()[0]
# csv_f.writerow(['drup_len', 'protein_len']) p = len_p.numpy()[0]
# for i, (len_d, len_p) in tqdm(enumerate(training_generator)):
# d = len_d.numpy()[0]
# p = len_p.numpy()[0]
# csv_f.writerow([str(d), str(p)])
from argparse import ArgumentParser from argparse import ArgumentParser
from dataset import Data_Encoder, get_task from dataset import Data_Encoder, get_task, Data_Encoder_mol
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from configuration_bert import BertConfig from configuration_bert import BertConfig
...@@ -102,7 +102,8 @@ def test(args, model, dataset): ...@@ -102,7 +102,8 @@ def test(args, model, dataset):
def main(args): def main(args):
# load data # load data
data_file, tokenizer_config = get_task(args.task) data_file, tokenizer_config = get_task(args.task)
dataset = Data_Encoder(data_file, tokenizer_config) # dataset = Data_Encoder(data_file, tokenizer_config)
dataset = Data_Encoder_mol(data_file, tokenizer_config)
# creat model # creat model
print('------------------creat model---------------------------') print('------------------creat model---------------------------')
config = BertConfig.from_pretrained(args.config) config = BertConfig.from_pretrained(args.config)
...@@ -110,7 +111,7 @@ def main(args): ...@@ -110,7 +111,7 @@ def main(args):
print('model name : BertAffinity') print('model name : BertAffinity')
print('task name : {}'.format(args.task)) print('task name : {}'.format(args.task))
if args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100']: if args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100', 'train_mol']:
train(args, model, dataset) train(args, model, dataset)
elif args.task in ['test']: elif args.task in ['test']:
...@@ -119,9 +120,6 @@ def main(args): ...@@ -119,9 +120,6 @@ def main(args):
if __name__ == '__main__': if __name__ == '__main__':
# get parameter # get parameter
parser = ArgumentParser(description='BertAffinity') parser = ArgumentParser(description='BertAffinity')
...@@ -152,11 +150,11 @@ if __name__ == '__main__': ...@@ -152,11 +150,11 @@ if __name__ == '__main__':
# local test # local test
# args.task = 'train_z_10' args.task = 'train_mol'
# args.savedir = 'local_test_train' args.savedir = 'local_test_train'
# args.epochs = 10 args.epochs = 10
# args.lr = 1e-5 args.lr = 1e-5
# args.config = './config/config_layer_3.json' args.config = './config/config_layer_3_mol.json'
......
CUDA_VISIBLE_DEVICES=4 python run_interaction.py --b=64 --task=train --epochs=30 --lr=1e-5 --savedir=lr-1e-5-batch-64-e-30-layer3-0505 --config=./config/config_layer_3.json CUDA_VISIBLE_DEVICES=4 python run_interaction.py --b=64 --task=train_mol --epochs=30 --lr=1e-5 --savedir=lr-1e-5-batch-64-e-30-layer3-0505 --config=./config/config_layer_3.json
from subword_nmt.apply_bpe import BPE
import codecs
import collections
# Module-level BPE tokenizers shared by the helpers below:
# dbpe segments drug SMILES strings, pbpe segments protein sequences.
# NOTE(review): these codecs.open handles are never closed; acceptable for a
# one-shot utility script, but they stay open for the module's lifetime.
bpe_codes_drug = codecs.open('../config/drug_codes_chembl.txt')
dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
bpe_codes_prot = codecs.open('../config/protein_codes_uniprot.txt')
pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
def load_file(file):
    """Read *file* and return its lines with trailing newlines removed."""
    with open(file, 'r') as handle:
        return [row.strip('\n') for row in handle]
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for position, line in enumerate(reader):
            vocab[line.rstrip("\n")] = position
    return vocab
def seq2vec(protein, drug):
    """BPE-tokenize paired protein/drug files; print each protein's subword count.

    Args:
        protein: path to a file of protein sequences, one per line.
        drug: path to a file of SMILES strings, line-aligned with *protein*.
    """
    cls_tok, sep_tok = '[CLS]', '[SEP]'
    proteins = load_file(protein)
    smiles = load_file(drug)
    for prot_line, smi_line in zip(proteins, smiles):
        drug_units = dbpe.process_line(smi_line).split()
        prot_units = pbpe.process_line(prot_line).split()
        # Full input layout kept for reference; only the length is reported.
        tokens = [cls_tok] + drug_units + [sep_tok] + prot_units + [sep_tok]
        print(len(prot_units))
if __name__ == '__main__':
    # NOTE(review): protein sequences come from data/test while SMILES come
    # from data/train — presumably intentional for this debug run; verify
    # the pairing before reusing this script.
    protein_path = '../data/test/test_protein_seq'
    smiles_path = '../data/train/train_smile'
    vocab_path = '../config/vocab_mol.txt'  # currently unused by seq2vec
    seq2vec(protein_path, smiles_path)
\ No newline at end of file
import pandas as pd import pandas as pd
import numpy as np
sub_csv = pd.read_csv('../config/subword_units_map_chembl.csv')
sub_csv = pd.read_csv('../ESPF/subword_units_map_chembl.csv')
idx2word_d = sub_csv['index'].values idx2word_d = sub_csv['index'].values
sub_csv = pd.read_csv('../config/subword_units_map_uniprot.csv')
idx2word_p = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d)))) # words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
with open('../ESPF/vocab76.to', 'r') as f: spqcial_tokens = np.array(['[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]'])
prot_vocab = f.readlines()
all_tokens = np.concatenate((spqcial_tokens, idx2word_p, idx2word_d))
save = '../config/vocab_mol.txt'
with open('../ESPF/vocab.txt', 'w') as f: with open(save, 'w') as f:
for prot in prot_vocab: for token in all_tokens:
f.write(prot.strip() + '\n') f.write(str(token) + '\n')
with open('../ESPF/vocab.txt', 'a') as f:
for drug in idx2word_d:
f.write(drug + '\n')
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论