BiTransDPI · Commit 106fd505

Authored Oct 18, 2021 by 朱学凯
Parent: 9366316a

add pre-train

Showing 6 changed files with 202 additions and 54 deletions:

    data_preprocessing.py                                                 +73    -0
    dataset.py                                                            +95   -26
    log/mask-LM-1018/events.out.tfevents.1634542578.gpu-athena.31905.0     +0    -0
    modeling_bert.py                                                       +1    -0
    run_interaction.py                                                    +19   -17
    run_pretraining.py                                                    +14   -11
data_preprocessing.py (new file, 0 → 100644)

from subword_nmt.apply_bpe import BPE
import codecs
import json
import numpy as np
from tqdm import tqdm
import math
import random


def get_tokenzie_seq(file, save, mask=False):
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    with open(file['seq'], 'r') as f:
        seq = f.readlines()
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    with open(file["affinity"], 'r') as f:
        affinity = f.readlines()

    bpe_codes_drug = codecs.open('./config/drug_codes_chembl.txt')
    dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    bpe_codes_prot = codecs.open('./config/protein_codes_uniprot.txt')
    pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    with open(save, "w") as f:
        for i in tqdm(range(len(seq))):
            d = dbpe.process_line(smile[i].strip()).split()
            p = pbpe.process_line(seq[i].strip()).split()
            if mask == True:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            affinity_num = affinity[i].strip()
            item = {"seq": " ".join(final_seq), "affinity": affinity_num}
            new_item = json.dumps(item)
            f.write(new_item + '\n')


def random_mask(input_seq, mask_proportion=0.15):
    mask_len = math.ceil(len(input_seq) * mask_proportion)
    mask_token_posi = np.random.choice(len(input_seq), mask_len)
    for i in mask_token_posi:
        choice = random.random()
        if choice < 0.8:
            input_seq[i] = "[MASK]"
            # mask_vec[i] = 1
        # elif choice >= 0.8 and choice < 0.9:
    return input_seq


if __name__ == '__main__':
    # file_train = {"sps": './data/train/train_sps',
    #               'seq': './data/train/train_protein_seq',
    #               "smile": './data/train/train_smile',
    #               "affinity": './data/train/train_ic50',
    #               }
    # save = "./data/tokenize_data/train.tokenize"
    # save_mask = "./data/tokenize_data/train.tokenize.mask"
    df_test = {"sps": './data/test/test_sps',
               'seq': './data/test/test_protein_seq',
               "smile": './data/test/test_smile',
               "affinity": './data/test/test_ic50',
               }
    save = "./data/tokenize_data/test.tokenize"
    get_tokenzie_seq(df_test, save)
    # get_tokenzie_seq(file_train, save_mask, mask=True)
\ No newline at end of file
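For reference, a minimal sketch (not part of this commit) of reading back one record written by get_tokenzie_seq; the output path comes from the __main__ block above and the "seq"/"affinity" keys from the json.dumps call, while everything else is illustrative:

    import json

    # Each line of the .tokenize file is one JSON object holding a space-joined
    # token sequence ([CLS] drug BPE tokens [SEP] protein BPE tokens [SEP])
    # and the affinity label stored as a string.
    with open("./data/tokenize_data/test.tokenize", "r") as f:
        record = json.loads(f.readline())

    tokens = record["seq"].split()
    affinity = float(record["affinity"])
    print(len(tokens), affinity)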
dataset.py

@@ -271,6 +271,56 @@ class Data_Encoder_LM(data.Dataset):
         return " ".join(d), " ".join(p), y
         # return len(d), len(p)

+
+class Data_Provide(data.Dataset):
+    def __init__(self, train_file, mask_file):
+        'Initialization'
+        # load data
+        with open(train_file, 'r') as f:
+            self.seq = f.readlines()
+        with open(mask_file, 'r') as f:
+            self.seq_mask = f.readlines()
+
+    def __len__(self):
+        'Denotes the total number of samples'
+        return len(self.seq)
+
+    def __getitem__(self, index):
+        'Generates one sample of data'
+        # Select sample
+        # Load data and get label
+        item = json.loads(self.seq[index])
+        mask_item = json.loads(self.seq_mask[index])
+        seq = item["seq"]
+        seq_mask = mask_item["seq"]
+        y = np.float64(item["affinity"])
+        return seq, seq_mask, y
+
+
+class Data_Gen(data.Dataset):
+    def __init__(self, train_file):
+        'Initialization'
+        # load data
+        with open(train_file, 'r') as f:
+            self.seq = f.readlines()
+        # with open(mask_file, 'r') as f:
+        #     self.seq_mask = f.readlines()
+
+    def __len__(self):
+        'Denotes the total number of samples'
+        return len(self.seq)
+
+    def __getitem__(self, index):
+        'Generates one sample of data'
+        # Select sample
+        # Load data and get label
+        item = json.loads(self.seq[index])
+        # mask_item = json.loads(self.seq_mask[index])
+        seq = item["seq"]
+        # seq_mask = mask_item["seq"]
+        y = np.float64(item["affinity"])
+        return seq, y
+
+
 def get_task(task_name):
     tokenizer_config = {"vocab_file": './config/vocab.txt',
@@ -320,12 +370,8 @@ def get_task(task_name):
         return df, tokenizer_config
-    elif task_name.lower() in ['train_mol', "pre-train"]:
-        df_train = {"sps": './data/train/train_sps',
-                    'seq': './data/train/train_protein_seq',
-                    "smile": './data/train/train_smile',
-                    "affinity": './data/train/train_ic50',
-                    }
+    elif task_name.lower() in ['train_mol']:
+        df_train = "data/tokenize_data/train.tokenize"
         tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                             "vocab_pair": './config/drug_codes_chembl.txt',
@@ -353,6 +399,20 @@ def get_task(task_name):
         return df_test, tokenizer_config
+    elif task_name.lower() == 'pre-train':
+        df_train_mask = "data/tokenize_data/train.tokenize.mask"
+        df_train = "data/tokenize_data/train.tokenize"
+        tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
+                            "vocab_pair": './config/drug_codes_chembl.txt',
+                            "vocab_pair_p": './config/protein_codes_uniprot.txt',
+                            "begin_id": '[CLS]',
+                            "separate_id": "[SEP]",
+                            "max_len": 595
+                            }
+        return df_train, df_train_mask, tokenizer_config
+
+
 def random_mask(input_seq, mask_proportion=0.15):
     input = [i.split() for i in input_seq]
     mask_len = [math.ceil(len(i) * mask_proportion) for i in input]
@@ -378,37 +438,46 @@ class Tokenizer(object):
         self.max_len = tokenizer_config["max_len"]
         self.vocab = load_vocab(tokenizer_config["vocab_file"])

-    def convert_token_to_ids(self, d, p):
-        mask_d = random_mask(d)
-        mask_p = random_mask(p)
-        input_seq = [[self.begin_id] + i + [self.sep_id] + j + [self.sep_id] for i, j in zip(mask_d, mask_p)]
-        input_seq_ori = [[self.begin_id] + i.split() + [self.sep_id] + j.split() + [self.sep_id] for i, j in zip(d, p)]
+    def seq2emb_encoder_simple(self, input_seq, vocab):
+        try:
+            ids = np.asarray([vocab[i] for i in input_seq])
+        except:
+            ids = np.array([0])
+        return ids
+
+    def convert_token_to_ids(self, seq):
+        # input_seq = [[self.begin_id] + i + [self.sep_id] + j + [self.sep_id] for i, j in zip(mask_d, mask_p)]
+        # input_seq_ori = [[self.begin_id] + i.split() + [self.sep_id] + j.split() + [self.sep_id] for i, j in zip(d, p)]
+        # mask_posi = np.concatenate((np.zeros(1), mask_d_posi, np.zeros(1), mask_p_posi, np.zeros(1)))
+        # token_type_ids = [[np.concatenate((np.zeros((len(d) + 2), dtype=np.int), np.ones((len(p) + 1), dtype=np.int)))] for d, p in zip(mask_d, mask_p)]
-        for i, seq in enumerate(input_seq):
-            if len(seq) > self.max_len:
-                input_seq[i] = seq[:self.max_len - 1] + [self.sep_id]
-                input_seq_ori[i] = seq[:self.max_len - 1] + [self.sep_id]
+        # seq = seq.split()
+        all_seq = [i.split() for i in seq]
+        for i, seq_i in enumerate(all_seq):
+            if len(seq_i) > self.max_len:
+                all_seq[i] = seq_i[:self.max_len - 1] + [self.sep_id]
+                # input_seq_ori[i] = seq[:self.max_len-1] + [self.sep_id]
                 # token_type_ids = token_type_ids[:self.max_len]
             # mask_posi = mask_posi[:self.max_len]
         # else:
         #     mask_posi = np.pad(mask_posi, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
         #     token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
-        all_seq = []
-        all_seq_ori = []
+        all_seq_ids = []
+        # all_seq_ori = []
         # all_mask = []
-        for seq, ori in zip(input_seq, input_seq_ori):
-            input = seq2emb_encoder_simple(seq, self.max_len, self.vocab)
-            input_ori = seq2emb_encoder_simple(ori, self.max_len, self.vocab)
-            all_seq.append(torch.from_numpy(input).long())
-            all_seq_ori.append(torch.from_numpy(input_ori).long())
-        input = pad_sequence(all_seq, batch_first=True)
-        input_ori = pad_sequence(all_seq_ori, batch_first=True)
+        for seq in all_seq:
+            input = self.seq2emb_encoder_simple(seq, self.vocab)
+            # input_ori = seq2emb_encoder_simple(ori, self.max_len, self.vocab)
+            all_seq_ids.append(torch.from_numpy(input).long())
+            # all_seq_ori.append(torch.from_numpy(input_ori).long())
+        input = pad_sequence(all_seq_ids, batch_first=True)
+        # input_ori = pad_sequence(all_seq_ori, batch_first=True)
         input_mask = input != 0
         # input_mask = pad_sequence(all_mask)
         # return torch.from_numpy(input).long(), torch.from_numpy(input_mask).long(), torch.from_numpy(token_type_ids).long()
-        return input, input_mask, input_ori
+        # return input, input_mask, input_ori
+        return input, input_mask
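Read together with run_pretraining.py below, the pre-training data path appears to be: Data_Provide yields the original and masked token strings written by data_preprocessing.py, and the new convert_token_to_ids turns each batch of strings into padded id tensors with an attention mask. A minimal sketch (not part of this commit), assuming the 'pre-train' tokenize files and config returned by get_task exist on disk:

    from torch.utils.data import DataLoader
    from dataset import Data_Provide, Tokenizer, get_task

    train_file, mask_file, tokenizer_config = get_task('pre-train')
    dataset = Data_Provide(train_file, mask_file)        # yields (seq, seq_mask, affinity) per sample
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    tokenizer = Tokenizer(tokenizer_config)

    seq, seq_mask, affinity = next(iter(loader))
    input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)  # padded ids + mask
    label, _ = tokenizer.convert_token_to_ids(seq)                                # ids of the unmasked sequence
    print(input_random_mask.shape, attention_mask.shape, label.shape)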
log/mask-LM-1018/events.out.tfevents.1634542578.gpu-athena.31905.0 (new file, 0 → 100644)

File added
modeling_bert.py

@@ -1864,6 +1864,7 @@ class BertAffinityModel(BertPreTrainedModel):
         self.embeddings = BertEmbeddings(config)
         self.encoder = BertEncoder(config)
         self.mlp = Multilayer_perceptron(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
         # self.pooler = BertPooler(config) if add_pooling_layer else None
         self.init_weights()
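The single added line gives BertAffinityModel a language-model head for the pre-training objective. A hedged sketch of what that projection does, assuming it is applied to the encoder's final hidden states (the corresponding forward-pass change is not part of this hunk); the sizes are illustrative only:

    import torch
    import torch.nn as nn

    hidden_size, vocab_size = 768, 30000              # illustrative values, not the repo's config
    lm_head = nn.Linear(hidden_size, vocab_size)      # same shape as the added self.lm_head

    hidden_states = torch.randn(2, 595, hidden_size)  # (batch, seq_len, hidden); 595 = max_len in the pre-train config
    logits = lm_head(hidden_states)                   # (batch, seq_len, vocab_size) per-token vocabulary logits
    print(logits.shape)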
run_interaction.py

 from argparse import ArgumentParser
-from dataset import Data_Encoder, get_task, Data_Encoder_mol
+from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
 import torch
 from torch.utils.data import DataLoader
 from configuration_bert import BertConfig
@@ -12,7 +12,7 @@ torch.set_default_tensor_type(torch.DoubleTensor)
-def train(args, model, dataset):
+def train(args, model, dataset, tokenizer):
     data_loder_para = {'batch_size': args.batch_size,
                        'shuffle': True,
                        'num_workers': args.workers,
@@ -38,11 +38,12 @@ def train(args, model, dataset):
     print('begin training')
     # training
     for epoch in range(args.epochs):
-        for i, (input, token_type_ids, input_mask, affinity) in enumerate(data_generator):
+        for i, (input, affinity) in enumerate(data_generator):
             # use cuda
             # input model
             # if torch.cuda.is_available():
-            pred_affinity = model(input_ids=input.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
+            input_ids, attention_mask = tokenizer.convert_token_to_ids(input)
+            # pred_affinity = model(input_ids=input_ids.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
+            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
             loss = loss_fct(pred_affinity, affinity.cuda().unsqueeze(-1))
             # else:
             #     pred_affinity = model(input_ids=input, token_type_ids=token_type_ids, attention_mask=input_mask)
@@ -66,7 +67,7 @@ def train(args, model, dataset):
     print('training over')
     writer.close()

-def test(args, model, dataset):
+def test(args, model, dataset, tokenizer):
     data_loder_para = {'batch_size': args.batch_size,
                        'shuffle': False,
                        'num_workers': args.workers,
@@ -76,6 +77,7 @@ def test(args, model, dataset):
     with torch.no_grad():
         # if torch.cuda.is_available():
         model.load_state_dict(torch.load(args.init), strict=True)
+        model.cuda()
         # else:
         #     model.load_state_dict(torch.load(args.init, map_location=torch.device('cpu')), strict=True)
         model.eval()
@@ -84,12 +86,9 @@ def test(args, model, dataset):
     result = args.output + '/' + '{}.txt'.format(args.task)
     print('begin predicting')
     with open(result, 'w') as f:
-        for i, (input, token_type_ids, input_mask, affinity) in enumerate(tqdm(data_generator)):
-            # if torch.cuda.is_available():
-            model.cuda()
-            pred_affinity = model(input_ids=input.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
-            # else:
+        for i, (input, affinity) in enumerate(tqdm(data_generator)):
+            input_ids, attention_mask = tokenizer.convert_token_to_ids(input)
+            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
             # pred_affinity = model(input_ids=input, token_type_ids=token_type_ids, attention_mask=input_mask)
             pred_affinity = pred_affinity.cpu().numpy().squeeze(-1)
             for res in pred_affinity:
@@ -103,11 +102,12 @@ def main(args):
     # load data
     data_file, tokenizer_config = get_task(args.task)
     # dataset = Data_Encoder(data_file, tokenizer_config)
-    dataset = Data_Encoder_mol(data_file, tokenizer_config)
+    dataset = Data_Gen(data_file)
     # creat model
     print('------------------creat model---------------------------')
     config = BertConfig.from_pretrained(args.config)
     model = BertAffinityModel(config)
+    tokenizer = Tokenizer(tokenizer_config)
     if torch.cuda.device_count() > 1:
         print("Let's use", torch.cuda.device_count(), "GPUs!")
@@ -117,10 +117,10 @@ def main(args):
     print('task name : {}'.format(args.task))
     if args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100', 'train_mol']:
-        train(args, model, dataset)
+        train(args, model, dataset, tokenizer)
     elif args.task in ['test', 'test_mol']:
-        test(args, model, dataset)
+        test(args, model, dataset, tokenizer)
@@ -155,10 +155,12 @@ if __name__ == '__main__':
     # local test
     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     args.task = 'train_mol'
-    args.savedir = 'local_test_train'
+    # args.savedir = 'local_test_train'
+    args.savedir = 'train'
     args.epochs = 10
     args.lr = 1e-5
-    args.config = './config/config_layer_3_mol.json'
+    args.config = './config/config_layer_6_mol.json'
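With these changes, fine-tuning reads raw token strings from Data_Gen and converts them to tensors inside the loop. A minimal sketch (not part of this commit) of that flow, assuming the 'train_mol' tokenize file and config returned by get_task exist; the model call and loss mirror the train() loop above and are left as a comment:

    from torch.utils.data import DataLoader
    from dataset import Data_Gen, Tokenizer, get_task

    data_file, tokenizer_config = get_task('train_mol')
    loader = DataLoader(Data_Gen(data_file), batch_size=16, shuffle=True)
    tokenizer = Tokenizer(tokenizer_config)

    for seq, affinity in loader:
        input_ids, attention_mask = tokenizer.convert_token_to_ids(seq)
        # pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda()) would follow, as in train()
        print(input_ids.shape, attention_mask.shape, affinity.shape)
        break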
run_pretraining.py

 from argparse import ArgumentParser
-from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Encoder_LM, Tokenizer
+from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Encoder_LM, Tokenizer, Data_Provide
 import torch
 from torch.utils.data import DataLoader
 from configuration_bert import BertConfig
@@ -39,15 +39,17 @@ def train(args, model, dataset, tokenizer):
     print('begin training')
     # training
     for epoch in range(args.epochs):
-        for i, (drug, protein, affinity) in enumerate(data_generator):
-            input, input_mask, input_ori = tokenizer.convert_token_to_ids(drug, protein)
+        for i, (seq, seq_mask, affinity) in enumerate(data_generator):
+            input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)
+            label, _ = tokenizer.convert_token_to_ids(seq)
             # pred_affinity = model(input_ids=input.cuda(), token_type_ids=token_type_ids.cuda(), attention_mask=input_mask.cuda())
-            logits = model(input_ids=input.cuda(), attention_mask=input_mask.cuda())
+            logits = model(input_ids=input_random_mask.cuda(), attention_mask=attention_mask.cuda())
             # loss = 0
-            pred_logits = logits[input == 1]
-            label = input_ori[input == 1]
-            loss = loss_fct(pred_logits, label.cuda())
+            posi = torch.where(input_random_mask == 1)
+            pred_logits = logits[posi]
+            target = label[posi]
+            loss = loss_fct(pred_logits, target.cuda())
             # else:
             #     pred_affinity = model(input_ids=input, token_type_ids=token_type_ids, attention_mask=input_mask)
             #     loss = loss_fct(pred_affinity, affinity.unsqueeze(-1))
@@ -105,9 +107,9 @@ def test(args, model, dataset):
 def main(args):
     # load data
-    data_file, tokenizer_config = get_task(args.task)
+    data_file, data_mask, tokenizer_config = get_task(args.task)
     # dataset = Data_Encoder(data_file, tokenizer_config)
-    dataset = Data_Encoder_LM(data_file, tokenizer_config)
+    dataset = Data_Provide(data_file, data_mask)
     tokenizer = Tokenizer(tokenizer_config)
     # creat model
     print('------------------creat model---------------------------')
@@ -160,9 +162,10 @@ if __name__ == '__main__':
     # local test
     os.environ["CUDA_VISIBLE_DEVICES"] = "5"
     args.task = 'pre-train'
-    args.savedir = 'mask-LM-quick'
+    args.savedir = 'mask-LM-lr-1e-4-1019'
     # args.savedir = 'train'
     args.epochs = 30
-    args.lr = 1e-5
+    args.lr = 1e-4
     args.config = './config/config_layer_6_mol.json'
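The rewritten loss block only scores masked positions: torch.where(input_random_mask == 1) collects the (batch, position) indices whose input id equals 1, which appears to be the [MASK] id in vocab_mol.txt (an assumption; the vocabulary file is not part of this diff). A self-contained sketch with dummy tensors, assuming loss_fct is a cross-entropy loss as is usual for masked-LM training:

    import torch
    import torch.nn as nn

    batch, seq_len, vocab_size = 2, 8, 100
    logits = torch.randn(batch, seq_len, vocab_size)          # model output
    input_random_mask = torch.randint(2, vocab_size, (batch, seq_len))
    input_random_mask[0, 3] = 1                               # pretend these positions held [MASK] (id 1, assumed)
    input_random_mask[1, 5] = 1
    label = torch.randint(2, vocab_size, (batch, seq_len))    # ids of the original, unmasked sequence

    posi = torch.where(input_random_mask == 1)                # (batch_idx, pos_idx) of the masked tokens
    pred_logits = logits[posi]                                # (num_masked, vocab_size)
    target = label[posi]                                      # (num_masked,)
    loss = nn.CrossEntropyLoss()(pred_logits, target)         # assumed form of loss_fct
    print(loss.item())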