提交 3d456595 作者: 朱学凯

start

上级 2f72dddc
File added
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (code)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CPI.iml" filepath="$PROJECT_DIR$/.idea/CPI.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
import numpy as np
import pandas as pd
import torch
from torch.utils import data
import json
import collections
from sklearn.preprocessing import OneHotEncoder
from subword_nmt.apply_bpe import BPE
import codecs
from tqdm import tqdm
import csv
# vocab_path = './ESPF/protein_codes_uniprot.txt'
# bpe_codes_protein = codecs.open(vocab_path)
# pbpe = BPE(bpe_codes_protein, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_uniprot.csv')
#
# idx2word_p = sub_csv['index'].values
# words2idx_p = dict(zip(idx2word_p, range(0, len(idx2word_p))))
# vocab_path = './ESPF/drug_codes_chembl.txt'
# bpe_codes_drug = codecs.open(vocab_path)
# dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_chembl.csv')
#
# idx2word_d = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
# max_d = 205
# max_p = 545
def load_vocab(vocab_file):
    """Read a vocabulary file into an ordered token -> index mapping.

    Args:
        vocab_file: path to a UTF-8 text file with one token per line.

    Returns:
        collections.OrderedDict mapping each token to its 0-based line
        number, preserving file order.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for idx, line in enumerate(reader):
            vocab[line.rstrip("\n")] = idx
    return vocab
def protein2emb_encoder(x, words2idx_p):
    """Encode a comma-separated protein (SPS) token string as index ids.

    Args:
        x: protein sub-structure string, tokens separated by commas.
        words2idx_p: dict mapping protein token -> integer vocabulary index.

    Returns:
        Tuple ``(ids, mask)``, both of length ``max_p`` (152):
        ``ids`` is the token-index array, zero-padded (or truncated) to
        ``max_p``; ``mask`` is 1 for real tokens and 0 for padding.
    """
    max_p = 152
    tokens = x.split(',')
    try:
        ids = np.asarray([words2idx_p[t] for t in tokens])
    except KeyError:
        # Fix: was a bare `except:` that swallowed every exception.
        # An out-of-vocabulary token anywhere collapses the whole sequence
        # to a single padding index — preserved best-effort behaviour.
        ids = np.array([0])
    l = len(ids)
    if l < max_p:
        ids = np.pad(ids, (0, max_p - l), 'constant', constant_values=0)
        input_mask = ([1] * l) + ([0] * (max_p - l))
    else:
        ids = ids[:max_p]
        input_mask = [1] * max_p
    return ids, np.asarray(input_mask)
def drug2emb_encoder(x, dbpe, words2idx_d):
    """Encode a drug SMILES string as BPE sub-word index ids.

    Args:
        x: raw SMILES string.
        dbpe: BPE tokenizer exposing ``process_line(str) -> str`` (sub-words
            separated by spaces).
        words2idx_d: dict mapping drug sub-word -> integer vocabulary index.

    Returns:
        Tuple ``(ids, mask)``, both of length ``max_d`` (50):
        ``ids`` is the sub-word-index array, zero-padded (or truncated) to
        ``max_d``; ``mask`` is 1 for real tokens and 0 for padding.
    """
    max_d = 50
    tokens = dbpe.process_line(x).split()
    try:
        ids = np.asarray([words2idx_d[t] for t in tokens])
    except KeyError:
        # Fix: was a bare `except:`; an OOV sub-word collapses the whole
        # sequence to a single padding index (original best-effort behaviour).
        ids = np.array([0])
    # Fix: removed stray debug `print(i1)` that fired on every sample.
    l = len(ids)
    if l < max_d:
        ids = np.pad(ids, (0, max_d - l), 'constant', constant_values=0)
        input_mask = ([1] * l) + ([0] * (max_d - l))
    else:
        ids = ids[:max_d]
        input_mask = [1] * max_d
    return ids, np.asarray(input_mask)
def seq2emb_encoder(input_seq, max_len, vocab):
    """Map a token sequence to padded vocabulary ids plus an attention mask.

    Args:
        input_seq: iterable of string tokens (e.g. [CLS] + drug + [SEP] + ...).
        max_len: fixed output length; shorter sequences are zero-padded,
            longer ones truncated.
        vocab: dict mapping token -> integer vocabulary index.

    Returns:
        Tuple ``(ids, mask)`` of numpy arrays of length ``max_len``;
        ``mask`` is 1 for real tokens and 0 for padding.
    """
    try:
        ids = np.asarray([vocab[t] for t in input_seq])
    except KeyError:
        # Fix: was a bare `except:`; an OOV token collapses the whole
        # sequence to a single padding index (original best-effort behaviour).
        ids = np.array([0])
    l = len(ids)
    # Fix: removed debug logging that opened './utils/data_analyse_train.tsv'
    # in 'w' mode on EVERY call — it overwrote itself each time, crashed when
    # the directory was absent, and throttled the DataLoader hot path.
    if l < max_len:
        ids = np.pad(ids, (0, max_len - l), 'constant', constant_values=0)
        input_mask = np.array(([1] * l) + ([0] * (max_len - l)))
    else:
        ids = ids[:max_len]
        input_mask = np.array([1] * max_len)
    return ids, input_mask
class Data_Encoder(data.Dataset):
    """PyTorch Dataset pairing drug SMILES / protein SPS sequences with affinities.

    Loads three parallel, line-aligned text files (SPS, SMILES, affinity),
    BPE-tokenizes each SMILES, splits each SPS on commas, and encodes the
    concatenated token sequence with a shared vocabulary.
    """
    def __init__(self, train_file, tokenizer_config):
        """Load data files and build the tokenizer.

        train_file: dict with file paths under "sps", "smile", "affinity",
            special tokens "begin_id" / "separate_id", and "max_len".
        tokenizer_config: dict with "vocab_file" (token vocabulary, one per
            line) and "vocab_pair" (BPE merge codes for drug SMILES).
        """
        self.begin_id = train_file["begin_id"]
        self.sep_id = train_file["separate_id"]
        self.max_len = train_file["max_len"]
        # The three files are assumed line-aligned: line i of each refers to
        # the same sample — TODO confirm against the data-prep pipeline.
        with open(train_file["sps"], 'r') as f:
            self.sps = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        # NOTE(review): this codecs handle is never closed explicitly; BPE
        # reads the merge table from it at construction time.
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    def __len__(self):
        """Number of samples (= number of SPS lines)."""
        return len(self.sps)
    def __getitem__(self, index):
        """Return (encoded_input_ids, affinity_string) for one sample."""
        # BPE sub-words for the drug; comma tokens for the protein.
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.sps[index].strip().split(',')
        # NOTE(review): the affinity label is returned as a raw string, not
        # cast to float — the training loop presumably converts it; verify.
        y = self.affinity[index].strip()
        # [CLS] drug [SEP] protein [SEP]
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # NOTE(review): input_mask is computed but discarded here — confirm
        # whether the model needs it; if so, return it as well.
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return input, y
if __name__ == "__main__":
# local test
# dataFolder = './IC50/SPS/train_smile'
# with open(dataFolder, 'r') as f:
# train_smi = f.readlines()
# drug_smi = train_smi[0]
# d_v, input_mask_d = drug2emb_encoder(drug_smi)
# test load vocab
# vocab_file = './ESPF/vocab.txt'
# vocab = load_vocab(vocab_file)
# test train
df_train = {"sps": './IC50/SPS/train_sps',
"smile": './IC50/SPS/train_smile',
"affinity": './IC50/SPS/train_ic50',
"vocab_file": './ESPF/vocab.txt',
"begin_id": '[CLS]',
"separate_id": "[SEP]",
"max_len": 256
}
tokenizer_config = {"vocab_file": './ESPF/vocab.txt',
"vocab_pair": './ESPF/drug_codes_chembl.txt'
}
params = {'batch_size': 5,
'shuffle': False,
'num_workers': 0,
'drop_last': True}
trainset = Data_Encoder(df_train, tokenizer_config)
training_generator = data.DataLoader(trainset)
for i, (input, affinity) in tqdm(enumerate(training_generator)):
print('----------------')
"[ 4 209 1755 3175 6267 15433 911 535 5272 5 25 34
19 30 11 28 11 31 21 15 12 44 12 14
58 42 11 29 20 27 11 33 17 15 19 34
11 18 12 28 11 78 11 47 35 26 11 15
70 36 32 27 58 15 11 14 52 11 14 16
18 12 57 11 14 19 18 16 5]",69,"
"
import pandas as pd

# One-off helper script: build a combined vocabulary file by concatenating
# the protein vocabulary ('../ESPF/vocab76.to') with the drug sub-word
# vocabulary taken from the ChEMBL subword-units map.
sub_csv = pd.read_csv('../ESPF/subword_units_map_chembl.csv')
# Drug sub-word tokens live in the CSV's 'index' column.
idx2word_d = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
with open('../ESPF/vocab76.to', 'r') as f:
    prot_vocab = f.readlines()
# First write the protein tokens (normalising line endings)...
with open('../ESPF/vocab.txt', 'w') as f:
    for prot in prot_vocab:
        f.write(prot.strip() + '\n')
# ...then append the drug tokens, so protein indices come first in vocab.txt.
with open('../ESPF/vocab.txt', 'a') as f:
    for drug in idx2word_d:
        f.write(drug + '\n')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论