提交 074f30dc 作者: 杨志辉

Initial commit

上级
model/
model0/
tfrecord/
train/
logs/
log/
visualize_attention/attention_mat.npy
baselines/
case_study/
experment_result/
predict/
doc/
utils/train_data_analyse.csv
utils/test_data_analyse.csv
utils/data_analyse.xlsx
*.ipynb_checkpoints/
.idea/
.DS_Store
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
{
// 使用 IntelliSense 了解相关属性。
// 悬停以查看现有属性的描述。
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: 当前文件",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"cwd": "${fileDirname}"
}
]
}
\ No newline at end of file
## Title ##
Advancing Drug-Target Interaction Prediction with BERT and Subsequence Embedding
## Abstract ##
Exploring the relationship between proteins and drugs plays a significant role in discovering new synthetic drugs. Drug-Target Interaction (DTI) prediction is a fundamental task in studying the relationship between proteins and drugs. Unlike encoding proteins by individual amino acids, we use amino-acid subsequences to encode proteins, which better simulates the biological process of DTI. For this purpose, we propose a novel deep learning framework based on Bidirectional Encoder Representations from Transformers (BERT), which integrates high-frequency subsequence embedding and transfer learning to perform the DTI prediction task. As the first key module, subsequence embedding makes it possible to explore the functional interaction units in drug and protein sequences, which in turn helps to identify DTI modules. As the second key module, transfer learning helps the model learn common DTI features from protein and drug sequences in a large dataset. Overall, the BERT-based model can learn two kinds of features through the multi-head self-attention mechanism: internal features of individual sequences, and interaction features between proteins and drugs. Compared with other methods, BERT-based methods enable more DTI-related features to be discovered from the general features of proteins and drugs through transfer learning. We conducted extensive experiments on the DTI prediction task using three different benchmark datasets. The experimental results show that the model achieves average prediction metrics higher than those of most baseline methods. To verify the importance of transfer learning, we conducted an ablation study on the datasets, and the results demonstrate the superiority of transfer learning. In addition, we tested the scalability of the model on datasets with unseen drugs and proteins, and the experimental results show that its scalability is acceptable.
## How to use ##
Use the following command to pretrain the model:
```shell
sh pretrain.sh
```
Use the following command to fine-tune the model:
```shell
sh fine_tune.sh
```
Use the following command to predict the binding affinity score:
```shell
sh test.sh
```
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"max_len": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 9,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "00ef6494-43ce-43de-b91c-a8039d19fdcb",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "84cdc51a-91e9-4af5-8404-5ec2b5059044",
"metadata": {},
"outputs": [],
"source": [
"# 读取分子token字典\n",
"with open('subword_units_map_chembl.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" chembl_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" chembl_token_dict[token] = frequency\n",
" # print(token, frequency)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "82e79dba-ec91-463e-821e-778197f240d7",
"metadata": {},
"outputs": [],
"source": [
"# 读取蛋白质token字典\n",
"with open('subword_units_map_uniprot.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" uniprot_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" uniprot_token_dict[token] = frequency\n",
" # print(token, frequency) "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "5ed22d0d-87e5-47d6-9f1f-914f904150c2",
"metadata": {},
"outputs": [],
"source": [
"#创建一个special token字典\n",
"special_token_dict = {\n",
" '[PAD]': 1,\n",
" '[MASK]': 1,\n",
" '[CLS]': 1,\n",
" '[SEP]': 1,\n",
" '[UNK]': 1,\n",
" '[unused1]': 1,\n",
" '[unused2]': 1,\n",
" '[unused3]': 1,\n",
" '[unused4]': 1,\n",
" '[unused5]': 1\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "8e0472c9-1cdb-4fd5-8483-72acc0308e58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L\n",
"V\n",
"S\n",
"I\n",
"T\n",
"A\n",
"R\n",
"M\n",
"P\n",
"H\n",
"Z\n",
"F\n",
"K\n",
"O\n",
"B\n",
"C\n",
"X\n",
"N\n",
"SS\n",
"NN\n",
"CS\n",
"FC\n",
"CN\n",
"NC\n",
"CC\n",
"CCS\n",
"CCN\n"
]
}
],
"source": [
"token_frequency = {}\n",
"for token, frequency in chembl_token_dict.items():\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in uniprot_token_dict.items():\n",
" if token in token_frequency:\n",
" print(token)\n",
" token_frequency[token] += frequency\n",
" else:\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in special_token_dict.items():\n",
" token_frequency[token] = frequency"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a6e2d04c-900b-4446-844a-4cd9e8d1cfa2",
"metadata": {},
"outputs": [],
"source": [
"#存储到pickle中\n",
"with open('token_frequency.pickle', 'wb') as f:\n",
" pickle.dump(token_frequency, f)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "64bc73cc-aae2-4741-a98b-aae147c705fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"字典中的item数量为: 40208\n"
]
}
],
"source": [
"# 从pickle文件中读取字典\n",
"with open('token_frequency.pickle', 'rb') as f:\n",
" token_frequency = pickle.load(f)\n",
"\n",
"# 获取字典中的item数量\n",
"num_items = len(token_frequency.items())\n",
"\n",
"# 输出item数量\n",
"print(\"字典中的item数量为:\", num_items)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "33d4a5e1-4a03-41ae-a0ee-f60059d571c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_frequency[')N7']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b55735c-e495-4d7b-bf9f-600e665bf8db",
"metadata": {},
"outputs": [],
"source": [
"###注意:蛋白质token有16,693个,分子token有23,532个,special token有10个,共计40,235个\n",
"###创建的pickle文件中有分子和蛋白质交叉的字符,所以合并后有40208个\n",
"#L\n",
"# V\n",
"# S\n",
"# I\n",
"# T\n",
"# A\n",
"# R\n",
"# M\n",
"# P\n",
"# H\n",
"# Z\n",
"# F\n",
"# K\n",
"# O\n",
"# B\n",
"# C\n",
"# X\n",
"# N\n",
"# SS\n",
"# NN\n",
"# CS\n",
"# FC\n",
"# CN\n",
"# NC\n",
"# CC\n",
"# CCS\n",
"# CCN"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
#!/usr/bin/env python
# coding: utf-8
"""Build a merged subword-token frequency table.

Reads the molecule (ChEMBL) and protein (UniProt) subword-unit CSVs,
merges their token frequencies together with a small set of special
tokens, and saves the combined dictionary to ``token_frequency.pickle``.

Converted from a Jupyter notebook; the original ``# In[..]`` cell
markers are kept for traceability.

NOTE(review): per the trailing notes in the original notebook, there are
16,693 protein tokens, 23,532 molecule tokens and 10 special tokens
(40,235 in total); 27 tokens occur in both vocabularies, so the merged
dictionary holds 40,208 items.
"""

# In[1]:
import csv
import pickle


def _load_token_frequencies(path):
    """Return a {token: frequency} dict read from a subword-units CSV.

    The CSV is expected to have a header row, with the token string in
    column index 2 and its integer frequency in column index 3.
    """
    token_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            token_dict[row[2]] = int(row[3])
    return token_dict


# In[24]:
# Read the molecule (ChEMBL) token dictionary.
chembl_token_dict = _load_token_frequencies('subword_units_map_chembl.csv')

# In[25]:
# Read the protein (UniProt) token dictionary.
uniprot_token_dict = _load_token_frequencies('subword_units_map_uniprot.csv')

# In[26]:
# Special tokens, each assigned a nominal frequency of 1.
special_token_dict = {
    '[PAD]': 1,
    '[MASK]': 1,
    '[CLS]': 1,
    '[SEP]': 1,
    '[UNK]': 1,
    '[unused1]': 1,
    '[unused2]': 1,
    '[unused3]': 1,
    '[unused4]': 1,
    '[unused5]': 1
}

# In[28]:
# Merge the three dictionaries.  Tokens shared between the molecule and
# protein vocabularies have their frequencies summed; each shared token
# is printed for inspection (27 such tokens, e.g. L, V, S, ..., CC, CCS,
# CCN -- single amino-acid letters and short SMILES fragments collide).
token_frequency = dict(chembl_token_dict)

for token, frequency in uniprot_token_dict.items():
    if token in token_frequency:
        print(token)  # token present in both vocabularies
        token_frequency[token] += frequency
    else:
        token_frequency[token] = frequency

token_frequency.update(special_token_dict)

# In[20]:
# Persist the merged table.
with open('token_frequency.pickle', 'wb') as f:
    pickle.dump(token_frequency, f)

# In[21]:
# Reload the pickle and report the item count as a sanity check.
with open('token_frequency.pickle', 'rb') as f:
    token_frequency = pickle.load(f)

num_items = len(token_frequency)
print("字典中的item数量为:", num_items)

# In[23]:
# Spot-check one merged token (this displayed 100 in the original
# notebook); raises KeyError if the token is unexpectedly missing.
token_frequency[')N7']

# In[ ]:
This source diff could not be displayed because it is too large. You can view the blob instead.
#version: 0.2
L L
A A
A L
V L
G L
E L
S L
G G
S S
E E
T L
D L
R L
I L
A V
K L
A G
V V
A E
K K
S G
A I
P L
A R
A D
A S
Q L
T G
A K
V E
N L
F L
V I
V G
A T
K E
R R
V D
V S
P G
I E
P E
I G
I D
V T
R E
I S
A Q
D G
V K
D E
P S
Y L
R G
I T
A F
N G
K G
A P
V R
T T
I K
F G
S E
A N
V P
H L
I N
M L
I R
T S
T E
V N
Q Q
F S
D D
I P
F E
K S
Q E
Q G
D S
R S
Y G
T P
K R
K N
K D
V F
A Y
P P
K T
N N
A M
R D
V Q
N E
I I
F D
H G
Y E
N S
T D
P D
Q S
Y S
A H
M G
K P
I F
T R
Q R
M E
V Y
M S
K I
K Q
C L
N P
I Q
W L
E G
T F
V H
R P
Y D
N D
C G
R F
K F
V M
Q D
I Y
H S
R T
K Y
R Q
R N
Q P
T N
S D
R I
F F
R K
T I
H E
M D
T Q
A C
R Y
H P
A LL
D P
F P
S P
G E
T K
T Y
AA L
V C
F N
M P
I H
E S
A W
Q N
E D
I V
T V
LL L
F T
Q K
R H
Y P
G S
R V
E K
M N
C S
F Y
Q T
K V
D F
D I
N F
N I
K H
Q I
N Y
Q F
M T
D Y
N K
A GL
N T
D T
D R
M K
E R
Q Y
D K
P T
F R
W G
A VL
S T
N R
M R
F I
A EL
Q H
AA G
S R
C E
F K
D N
V W
S K
P R
Y R
LL G
T H
E N
A SL
AL G
S F
P F
Q V
A GG
S N
P N
S I
M I
Y Y
E I
A DL
H D
P K
E T
Y F
A RL
P I
H H
Y N
D V
P V
A IL
P Q
Y T
GG G
A TL
H R
S Q
E Q
H F
A EE
K M
A KL
A SS
I C
P Y
M Q
S Y
D Q
N Q
M F
S V
GL G
LL E
VL G
A SG
W E
H T
Y K
M V
C D
SL G
H I
V AA
A KK
E P
H Y
C P
W S
V LL
EL G
H Q
F Q
H N
I M
Y Q
AL E
R M
F V
G D
N V
T M
A TG
RL G
LL S
C R
A QL
AV G
Y I
H K
IL G
AA E
W D
A PL
T C
M M
SS S
E F
A FL
P H
EL E
VL E
EE E
TL G
AE G
AL S
VV G
LL D
T W
SS G
DL G
Q M
KL G
AD G
F H
W R
V AL
F C
SL S
A PG
VI G
D M
N H
A NL
AI G
C T
D H
E V
GL S
M Y
I W
R C
S H
R LL
K C
EE G
R W
AR G
VL S
AA S
N M
V AV
K W
VD G
E Y
VE G
K LL
A PE
AK G
T LL
V GG
V VL
C F
V AG
C N
D W
M H
V SL
Y H
K AL
K EL
R AL
K KL
GL P
A PS
GL D
A NG
PE G
SL E
W Q
KE G
V TL
AK E
V EL
FL G
GL E
P M
V AE
IL E
DL E
DL S
AV E
PL G
C Q
A YL
F M
ID G
W N
I LL
I AA
IL S
C Y
AR E
T AL
V EE
VL D
TG E
KK G
VV E
C K
C I
EL S
F W
T AA
RL E
RE G
V SG
AL D
AI E
RL S
GG S
V SS
R AA
TL S
A ML
V DL
TL E
LL P
RR G
NL G
VK G
QL G
AG E
KL E
AL R
IE G
V GL
K AA
SG S
SL P
W F
Q LL
ID E
AV S
V PL
W T
I AL
D LL
R EL
AQ G
AF G
AG S
A HL
W P
Q AL
V TG
V AD
R VL
V IL
KL S
K VL
W I
VV S
V KE
VD E
V AS
D AL
N C
I GG
V AI
K SL
K TL
D C
V AT
V AR
Y M
V KK
VL P
I AE
PL S
I EE
AD E
AS E
I AG
AA R
N W
C H
K IL
K EE
VG E
QL E
T SS
S M
VI E
SG E
C C
AA D
KK E
K RL
I SS
V RL
VS E
SL D
K AV
V RE
Q W
VV D
T GG
R RL
AA AA
TG S
N LL
V KL
VI S
T VL
PL P
K GL
YL G
V AK
K DL
EL D
NN NN
P LL
Q C
R GG
A YG
Y C
I DL
I VG
FL S
Q AA
V PG
T GL
P W
F LL
IS E
P C
R SL
V RR
N AL
K AI
FL E
NL S
IL D
I AR
I SL
Y W
I KE
QL S
VI D
H C
Q RL
P AA
T SL
AG D
H M
NL E
I AV
I GL
SE G
IG E
DG S
M AA
DE G
I SG
AT S
SS E
AI S
FE G
AL P
W Y
I VL
RG E
R AV
T AE
R GL
R IL
VT E
PL E
I IG
I AS
K VV
K AG
VG S
I VE
HL G
T EE
T TL
DL D
AV D
P PG
I KK
R KL
R VV
KG E
I AD
Q EL
LL N
AT E
K AK
V NG
R DL
F GG
K AE
R TL
QQ QQ
VG D
R AR
A HG
I TL
R AG
I EL
T AV
Q VL
T VE
RR E
DG E
I AK
D EE
DL P
H LL
VT S
GG E
V PE
PG S
T EL
K NL
IT S
V PS
AE S
AQ E
P EL
K SS
ML G
V RG
T VV
V NL
Q KL
D VL
Q TL
M AL
T AG
IT E
IG S
VD S
Q QL
T AT
T PE
I VS
I ID
T AS
V FL
I VV
I PL
AI D
PG E
V AQ
I IE
Q AV
W K
I IL
P SS
I AT
T PL
Q SL
TE G
P AL
I TG
VK S
N SL
T VT
K DG
IK S
K QL
I RE
AR S
FG S
K VI
VE S
KK S
IN E
AA P
R EE
N KL
R AE
SS SS
D EL
H W
K GG
M AK
PS P
I VD
D IL
V QL
K SG
IE S
I IS
T IL
R QL
I KL
NG E
QE G
TL P
T SG
N IL
F GL
AD S
AF S
R SG
AM G
N VL
M AE
K PL
RL P
V AN
K VE
T DL
RR S
AK S
T AI
VR S
F AA
I AI
F SL
T VS
T KL
EL P
N AA
N SS
D AA
PP P
V AP
F SS
K TT
Q IL
R AI
A WL
DE D
TL D
R SS
IE D
R PL
M LL
IK D
R DG
VK D
F SG
Y LL
R PG
VN E
T RL
M C
I RL
PS E
Q AE
AS D
V FG
W H
YE G
T PG
R TG
AS P
Q GL
F EE
YL E
VS D
K TG
V AF
ID S
YL S
R FL
Q AG
PE S
I VT
I RR
I NL
F AG
K AF
ML E
K IS
P VL
T AR
N TL
F AL
P EE
TT S
R VG
VD D
T AD
IR S
Q EE
K AR
T TG
M AR
P GG
M KK
R KK
PE D
I AQ
Q AI
KL D
K RR
M AG
K VG
K IE
T FL
N EL
K VT
I PG
M AT
T VG
Q VV
SE S
T KE
KL P
AN S
N AV
A CL
V FS
T IG
EE D
K VK
T IS
K IK
T VI
I IT
T IT
QQ G
VE D
M AV
P AG
R VI
N GL
VR D
K PG
R IG
AE D
IL P
K IT
EE S
NG S
KG S
RL D
M SL
VT D
GG D
RG S
K PE
P SL
R IE
W M
P GL
IN S
P SG
IS D
T KK
HL E
T AK
VL N
T IE
M AS
FG E
V QG
A CG
I FL
K AS
V FE
K VS
I VK
R VE
AN E
F DL
K IG
K AD
KE S
R NL
Q DL
AF E
T DG
F DE
AL N
I KG
N DL
F VL
Q AR
AK D
I VI
T PS
N RL
IR D
T VR
K NG
K RG
I IN
QG E
TG D
P DG
FL D
T SE
T QL
N AG
F VG
T VD
R AS
F AV
M W
P AV
N VV
P VV
ID D
T VK
R KE
V QE
SS D
V YL
P AP
K DE
I IK
Q PL
I PE
HL S
R AD
K AQ
DD D
K AT
K VD
R YL
I NG
M GL
Y GL
M VL
K TS
K SE
NE G
VN S
R AK
K RE
R VS
T NL
LL Q
N AI
F AE
H AL
I AN
M SS
M KL
IG D
T DE
IT D
F TL
K YL
R VK
M EE
Y AA
I FG
K ID
AD D
I PS
N PL
N KK
V YG
AQ S
H PL
I VR
VT P
H SL
K DS
R VR
NL D
K FL
F DG
N IS
NL P
AT D
I VN
M GG
K TE
R DE
N VG
T RE
TT E
N GG
I AP
N SG
M SG
F AR
AL Q
H GL
VV P
F VS
N EE
M SE
AA Q
I RG
AY E
R FG
SS P
R VT
R KR
ME G
F AD
T AP
T ID
DS D
F IL
R IT
DE S
Y SL
PG D
AR D
Y VL
F AS
F EL
DD S
P TG
Q VI
N PE
T VP
R IS
K YG
F SE
PL D
R KG
F KE
Y AL
ML S
Q SS
V AY
M AD
VF D
F VE
SG D
N VI
N IG
N AS
N VS
M VE
F ID
I IP
F VD
V QQ
F IG
ALL L
T AQ
T DS
F IS
P DL
M EL
M KE
T RR
LL H
K AN
R AQ
F TG
KK D
RE S
N QL
YL D
P AS
I QE
YG S
M AI
R AT
K IR
T KG
N IE
F VV
Y DL
AF D
I FS
K IN
K VR
TE S
T IK
Q AQ
V AH
R IR
W C
CL G
Y GG
I QL
WL G
N AE
N NL
DG D
F IE
Q KK
Q NL
H AA
M TL
N ID
YG E
VS P
V ML
AM E
AT P
H TL
T AF
W W
Q RR
H GG
I AF
EE EE
Q VE
Q KE
Q IE
N VE
F AK
T FG
R VD
K RS
Q GG
VQ S
K PS
R SE
F KK
FS D
H VD
M RL
I IR
F KL
Y RL
QG S
Y EE
Q AK
KG D
N AK
V AM
T IR
FE D
N KE
F PG
K AP
RG D
QL D
C W
Q AS
N VT
TS D
K II
FG D
VN D
T IN
QQ E
R PS
K TP
C LL
I VP
Q PG
R KS
AV P
Y AG
N FL
Y EL
M DL
Q SG
T RS
PP PP
K IP
FE S
N AT
H RL
T FS
Y SG
Q VG
Y SS
KN E
K VP
QL P
Y IL
N PD
F PE
T RG
F FL
H PE
N IK
V YE
V YS
N AR
V HL
KN S
R PE
N IT
AY S
M IE
KE D
R DS
T KS
R ID
N AD
N VD
P TL
I YG
H VL
F VT
Y VG
R VP
P VT
Y TL
F NL
F AI
K VN
IN D
H AV
F TS
F AT
N IP
M AQ
Q AT
F DD
M DE
RE D
Q AD
Q VS
K QE
R HL
F RL
ML D
M VV
M VG
EG E
Q IS
Y AE
IT P
F AP
K VF
F FG
T IP
F RR
R QG
Q FL
M IL
R AP
M TE
H IL
VI P
GG GG
T VN
V MG
I YL
NG D
ID P
K DD
R AF
M TT
H AG
T YL
HG E
Q IG
N SE
T AN
R PD
T KR
Q IK
H EL
AA AL
K QG
M KR
PS D
Q RE
C GG
F KG
P AE
R QR
HG S
R VF
R IK
AD Y
IS P
HL D
VD P
I AY
TS P
SE D
T NG
R DD
F VK
M RE
F TT
F IN
Q VT
N VK
M IG
T DD
N VP
M PL
Q AF
F VI
M AN
K FG
T KD
Y AS
N IN
M NL
F PL
Q VK
H SS
H PD
F DS
Q VR
R TE
T FD
Q YL
N TT
AP P
T QE
I VF
F VP
Y AR
Y FL
M VD
AL AA
R TR
N YL
I QS
VG P
AG P
TP D
AP D
AN P
Q KG
K HL
Y KL
F IT
N FS
K ML
F TE
F RE
F VR
W LL
T VQ
RR D
NN S
Q RG
T RD
P KG
F KD
R TS
M TG
P VI
K AY
Y KE
M TD
VP D
R QE
ALL G
M VS
Y AD
P VS
K IF
V CG
I AH
N PS
M RR
Y AV
R FS
H KL
R IP
I FE
N TG
I QG
Y DE
I VQ
TE D
I AM
R FE
Q PS
T VF
P IL
R NG
Q SE
Y VV
Q PE
F IK
T NS
F FS
M VK
F AN
LL LL
N KN
Y RR
K VY
H DL
I YE
N PG
R TP
Q DG
K QQ
N TS
QE S
AD P
H SG
T NE
T KT
V ME
R KT
Y TG
MG S
H VV
K QR
I QQ
N KS
FL P
C M
N KG
C SS
I FD
R YG
T YG
M TS
T QG
R NE
Y ID
MG E
F NE
M QL
H NL
Y VE
Y VS
AM S
R ML
P RG
N TP
AAL G
Q VD
CL S
Q IR
Y RE
M PG
T HG
Q TG
R PP
F RG
AH E
VY D
F KS
V HG
Y IE
I YS
R IN
F AQ
K FE
N RR
Q ID
N DE
Q VP
R KN
R KD
P QL
AN D
K PD
Y PL
T KN
Q RS
R II
N II
Y KK
Y AK
Y RG
T NP
CG S
IG P
AL H
F TP
C GL
F VN
IP D
M KS
R TT
Y VD
Y SE
M IK
P VG
K FS
R QQ
I QR
R FD
Y PG
F KN
C SL
M VT
K AM
I VH
YG D
Y AT
Y IS
M DG
F TD
P RL
F QL
Q KR
M VR
R VQ
NN E
P KL
N KD
N AQ
F RS
R HG
P AT
N DD
AA AV
P PL
M AF
M KN
H PG
M AP
F IR
T FE
H VE
T AY
N DG
Q IT
R AN
T HL
H AI
Y DG
CL E
H AE
F KR
F FD
R NS
P VE
M KD
ME S
P VD
N KT
KS D
T II
NE S
P AR
F RD
F PD
Y IG
F YL
K VQ
H QL
N VN
Q AP
H AD
RS D
N IR
YE S
I HL
Y YL
M KT
H VG
N AN
AA LL
P AI
M VI
Q TE
AH S
Y QL
T ML
KK P
R IF
Q KS
Q DE
F NS
Q ML
T QS
T IF
Q HL
M ID
C SG
Y AI
K QS
P FL
AA GL
K FD
R AM
QQ S
Q PP
WL E
P AD
MS D
AQ D
R AY
H TG
VQ D
Q DS
Q TT
F FE
Y VP
K YS
Q NG
N DS
N VR
Y KG
IQ D
M IS
Q TS
N AF
F VF
P RR
LLL G
N RE
N NG
P PS
I YD
H VI
N RS
AA AG
T YS
T KP
F PS
H RR
P TS
I ML
TT D
Y NL
N IF
C AL
QE D
Y FS
R KI
T AH
F YG
R KP
R NP
N TE
N KI
N AP
P DE
Y KD
H FG
M IT
H VS
V CL
R VN
H AR
Y VT
AY D
H EE
C AA
AL AL
C VV
Y RD
F KT
M TP
F NN
T YE
H ID
T NN
P NG
N HL
N FG
M DS
NS D
Q TP
M IR
V MS
VH E
K PP
H FL
T ND
P TP
T IQ
QG D
F NG
R KQ
H PS
Q KT
I VY
NE D
Q VF
P KK
KR D
M PS
MG D
K YE
R TD
Y FG
N FE
M FL
P VP
Y VI
F ND
F IP
M FG
M RS
NNNN NNNN
P TT
F AF
VH S
T AM
AE AL
YS D
WL S
M KG
V AC
Q IP
Q VQ
H IS
Y VK
R TF
Q IN
EG S
C VL
T QQ
Q FG
H TS
H IG
R YE
I HS
P NL
R KF
Q VN
AA AR
Q AN
Y KN
F QG
Q KD
M RG
H SE
Y DD
HG D
F QE
R QS
TT TT
R VY
H IE
H HL
R ND
M PE
N KR
AL AE
N RF
Y PD
Y TS
Y RS
C AV
Y QG
I CL
Y VR
F AY
Y PE
Y IT
Y IK
M DD
V WL
M KQ
M QE
T VY
AAL E
AA C
F II
AW G
Q KQ
Y AQ
Y IN
P AK
Y TE
P KE
T KI
N RG
Q NS
M TR
Y DS
AGL G
N TD
KT D
AV AA
N VF
V LLL
GG P
H AS
Y AP
K HG
H IT
R IQ
VM D
Q AM
F RN
H AP
R AH
H YL
YE D
H EG
M RD
M IN
P AQ
Q FE
P DS
H RS
N FD
F TR
V AAL
QS D
C DL
ALL E
N KP
H VT
P VK
P IG
F IF
M KI
H DG
R YD
Y FE
H RG
LL C
N TF
Q YG
AL C
Y TD
M NP
H AK
F KP
T IY
T KF
H KK
Y QE
Y YG
P RS
I HE
Q II
AE AA
H VP
Y TT
R MG
SL C
N YG
N YS
M TN
M VN
Q KI
Q NE
H DE
R IY
CG E
T VM
Y QQ
AM D
Q DD
K YD
H NG
Y VN
AGG G
I ME
T QD
T KQ
P QG
H AT
N IY
Q PD
Y IP
M NS
Y TP
R NN
AG LL
F VQ
AC E
H RD
P VR
M NE
P RE
P IS
Q FS
C RL
Y IR
H IP
VC S
M IP
F YE
Y KS
Q KN
F QS
F TF
C AG
Y NG
R WL
K AH
W AA
AL AG
N IQ
C SE
N QE
H VR
T QR
M PP
EL AA
P AF
C EE
R YS
Y NS
P TE
T MG
AA GG
F YS
N QS
F KQ
A A</w>
M AH
Q IQ
H KE
N RD
I HG
ME D
Y PS
P PE
LL W
P IT
M QQ
P KS
P VF
M QR
LL AL
Q HG
M NN
EL AK
Q RD
N PP
C AS
KN D
F QQ
DS DS
WL D
H QR
M VF
Q TR
Q NN
Q IF
I MG
Q KP
M FS
F NP
HS D
W SL
N KF
AIL G
Y AN
T YD
F QR
C KE
N QQ
M AM
W AL
M AY
AR AL
AVL G
I VM
Y AF
T KY
KK KK
W GG
M VQ
F HL
N RN
M VP
H FS
K MG
C VG
LL GL
C EL
P IE
Q AY
Y NE
N RT
C TG
T VH
F TN
R VM
H IK
N ML
AA AE
R VH
P DD
W GL
H KG
EL AE
K VM
SG P
K MS
H VF
N VQ
W KE
N TN
T QP
VL AL
Q FD
W DL
VC E
F IQ
N AY
M NG
M FE
M YL
N TR
AG AG
EG D
W KK
AL RE
H RE
P NS
F HP
AC S
F PP
AE EL
SL SS
H TE
AE LL
H IR
KL KE
C VS
I CG
P KD
W VL
N RP
P YL
F QN
C IL
W RE
P AN
Y YD
CL D
K VH
H TT
F HG
AK EL
SL EE
C TL
Y KT
V AAG
R KY
V ALL
VL AA
T HS
P RP
C KK
LL SL
AL GL
AS AS
W RL
K ME
AE RL
H VK
H TD
Q TD
M RQ
LL EE
T MS
Y FD
N QG
H QQ
I HP
P FS
AG AA
C PG
Y NN
Q MS
N RI
PP D
N YE
M KF
HE S
H AF
N AM
AL AR
TL AA
Q NP
AI AA
F TQ
F ML
Y TR
H VN
F YD
W VE
I MS
F RQ
LL AA
P FG
C AI
H TP
N VY
M KP
N YD
W IL
Y YS
W TL
EL EE
M QG
H IN
F IY
EL VE
AV LL
W SS
N KQ
Q KF
K MD
W RR
LLL E
EL KK
M FD
KS TL
AL AD
C DG
AS GG
EL AR
M TQ
Y QP
GL GL
M ML
N FF
M ND
F QD
Y RP
VL C
R QD
H FE
H FD
T MD
Y VQ
AEL G
P TR
F VY
Y VF
R HP
W EE
AW E
M PD
H QG
C FL
Q VY
C VE
R HS
Y QS
C DE
F RP
C PS
AEL E
RR RR
M VM
C VI
P TD
Q TQ
VV AA
R CL
N MG
AD AV
AE KL
H AQ
H DD
Y AH
W KL
Q YS
N QR
W RG
Q AH
P IP
Y QR
AL GG
Y II
SL GL
Y NP
AL EE
K K</w>
P ID
C RE
R ME
T ME
Q IY
N TI
T CL
LLL S
P QS
AD AL
Y KR
I MD
AS LL
M RK
W AG
AA VL
N QD
DD DD
C PL
Y RK
W VS
Y RQ
Q TF
C RR
EE LL
C KL
SL SL
P KP
F CG
SG SG
N IH
AA RL
C IS
Q WL
TG KT
AR RL
V AW
P KR
AD LL
GE S
P VQ
KQ D
W VV
AA AS
Y AY
VE EL
T HP
H KR
H YG
W EL
AA EL
Y HL
M RF
AL AS
ASL G
M QS
M IF
K HS
ARL G
VL AG
F TK
W AS
ALL D
Y QD
IE EL
P RD
P HL
AE AE
C IG
ALL S
W SE
I AC
N KY
KE KL
W AR
AH D
Q KY
P VN
N IV
VG AG
P QQ
Y V
F QP
P KT
M IQ
AA AK
C PE
M TI
EE EL
AE AV
AL SL
RS RS
W AV
TGE KP
F RF
H KD
H AN
F AM
P KN
SS SL
M TK
GL SG
H YE
T WL
F RT
W EG
K WL
I AAL
M TF
Q YE
AV AV
C NL
HE D
W VT
C AE
Y IF
F KI
W RD
N FP
H DS
C VD
F RK
W DE
Y YE
C SD
Y RN
LL AE
Q FP
P QE
Y ND
C AR
H RF
W AE
LL AG
AW S
LL DE
Q VM
F HS
VL VG
W SG
H AH
AA AI
F VH
F CL
X X
AL VE
ATL G
M RN
QQQQ QQQQ
H KS
M KY
W IE
C PP
LL TL
AL RR
C DP
AL RL
H ML
LL KE
M QD
VL GL
GE D
W KG
AD VV
N RY
LL K
F MG
A K</w>
H V
H TF
Q TN
H HP
Q MG
P IR
H AY
Q TI
P IK
GL PG
Y IQ
AP AP
AL EL
TL EE
IG AG
AL KE
C QL
GL AV
Y RF
M ME
M II
W AI
K CL
RL AA
NN D
LL KK
Y KP
LL EL
W QL
LL SS
H NP
C RS
IL AA
F TI
I MP
H DP
Q CL
RE KL
Q VH
RL AE
R HE
N VH
Y HG
N QN
N FN
A E</w>
W NG
VK P
T AC
W IG
N TY
H QE
Y ML
LL AR
W NL
M RP
C AT
AL VL
AL AQ
VE KL
M RT
N QP
VK EL
AA AP
ASS G
AV AR
V AGL
ADL G
C VP
H RP
F RY
KL EE
AS SL
W RS
T HE
H TR
AL AV
AAL S
T VC
C AD
F AH
RL GL
M YE
AK AA
K HE
H NE
C RG
TG SG
V AEL
AE AI
M RI
M HL
AVL E
K AC
VW S
ASS S
P IF
LL R
AG VD
C EG
I VC
H II
EL IE
H VQ
H KT
PG P
Y KQ
AR AA
Y VH
C IE
C TS
T CG
H IF
VE RL
C SP
VV AG
N RQ
W VK
GL AL
M FF
F HE
N CL
P II
R MS
Y KI
P AY
N WL
VL AK
R ALL
W QE
N HS
AG AV
Y FP
RL ID
AA K
Y TN
P VY
AL K
Y PP
DL AA
H KN
Y IY
H RQ
GL AA
AR LL
H HG
P QP
C TE
V ADL
T CS
M YS
VL GG
AG VP
P IN
Y RT
SS LL
SL AA
KE KE
AG AS
M MG
W GS
VE AL
SG SS
LLL D
M MS
H QS
VE W
AI AG
N ME
T AW
Q ND
N YP
KL AE
LL FL
N TQ
AS IL
I CS
W VD
VE AA
Y TF
EL KE
VI AA
QQ D
W SP
TP EE
W PE
Q ME
F IH
AGL S
AD AA
Q FF
Q TY
P TF
W PL
M YG
M VH
C VK
F KY
AS AA
C ID
C DS
VL EL
AT AA
Y RY
P FE
H YS
N VM
W DG
AV AE
LL KL
AD AI
N RK
M RV
LL SG
F TY
C EK
GG GS
N TK
IE KL
AV AG
C VR
I LLL
AR EL
P RF
RL EE
W AK
Q ALL
KT TL
AL AI
GL SL
VL VV
I WL
C YG
Y FN
Y KY
C KS
AR GL
AT VI
AI VG
AL AK
C VN
M RH
AG VG
LL DL
V AVL
M VY
TL H
AL IG
LL PL
F QK
W GE
GL PL
LL RE
W AD
C FS
D VV
C TT
Q TK
W VR
P AH
KG KG
P NP
F ME
AA SL
C NE
N FT
H PP
AG RG
C AK
Y VY
W DD
AA RR
Q HS
W KS
V ALG
VL SG
VH D
I AAG
ASL E
W KR
KL KK
IL SG
H IQ
Q IV
VD SS
AG TG
W TG
LL AK
GG AA
Y RI
H IH
AGL E
Q YD
W SD
TL AR
Q HP
M MD
F KV
AA KL
VV VG
LL VL
W KD
Q FY
F CS
IK EL
C HG
F TV
GG SS
W QG
R AC
K VC
N QF
VL AV
VD AL
V AGG
LL AQ
IL IL
CG D
C YL
C IK
N HP
AA GE
VV AV
N QI
C IT
AL Y
F VM
W AT
AD EL
AA AQ
C KN
SL Y
P FD
C IN
T LLL
N MS
W PG
M QN
C TP
RL R
H FP
IH D
W VP
W FL
W VG
TL AG
F VC
AL VG
P YS
AK KL
SG KS
DL VL
ASL S
F IV
P ML
H KP
LL RR
C RD
AV VV
AL TE
Y FF
V LLG
VL DE
R VC
Q HE
W IS
C ES
VE AG
AL AT
SS SG
H FF
C NS
AK RL
AI AE
W ID
I CE
AKL G
TV EE
AL F
M QP
W DS
C AQ
W IT
RL RE
DL VE
AR AR
C NG
N AH
GL R
AV EL
AE QL
I AW
C KG
AI RE
F WG
AA KK
M FP
W IR
Q FN
RG IT
C AF
H TI
C FG
AL KK
H ND
V GGG
W QR
LL GG
I ALL
VL AE
W DP
F YP
LL AS
AG AD
AAL D
DL RR
AEE E
N FY
AA AT
C AP
F RV
C ED
F KF
VS SL
AL T
C VT
AA SG
K HP
T MR
IS SL
KE EL
C IR
VP VG
N QT
IE RL
Q TV
H QD
Y QN
EL GL
C QE
IE TL
C KD
I AGL
C TD
F QT
LL RL
T MP
H MG
C HL
M QK
N MD
M TY
H RH
AL VD
W NS
D AV
LL AV
AG SG
VV EG
LL SE
DL KK
AI AR
VL AD
N RH
W IK
W NE
EL AG
P RT
K LLE
SL SE
VL RL
EE AL
P ND
Q IH
DL AR
SL ED
Y KF
TL SG
VS GG
VV GL
TL AE
N HG
KL GL
LL IL
P YG
AV VG
P IY
IE AA
P NE
EL RE
K LLL
Y FT
H VY
AV AD
AS TS
R CG
H RI
PL PP
M MN
SL AL
VL AT
W FD
SL TL
SL SG
M RY
F RH
ATL E
AIL E
LL NL
W TP
GL GG
N QY
Y TI
VD EL
VL DL
AA EE
W VN
P VH
AK AL
T ALL
VL EE
H HS
W RQ
P QT
W AP
VL TG
R MN
Q YP
W FG
AG AT
LL AT
AG KS
Y TH
RL AD
H YD
Y AM
W QQ
LL TG
H VH
VL SL
C PD
P QD
K MK
KL VE
AK AG
AL VV
W VI
I AEL
DE DE
V EEG
SL AE
AL TG
C RF
SG KT
GL AG
V WG
V ARL
IL K
R MD
AGG S
M HG
N HE
KL RE
AS GL
VL SS
Y FR
KL IE
H QP
AL TL
T AAL
DL AK
AG EL
W TE
VP VV
F RI
VS LL
AA TL
H NN
AR R
W KN
P FF
AV VD
AEE G
VC D
K MP
IL SL
LL AD
KL AK
KL AD
GL TG
A L</w>
VR EL
AA Y
AC D
AA RE
N KV
AE VL
DE EL
DL AG
C KR
LL IG
P QR
A S</w>
W ED
M IY
LLG D
EL PE
C AN
K ALL
V ATL
M CG
H NS
AA VV
C VF
ADL E
M YD
IE AG
Y TY
RL AR
F WL
SG AS
GG FS
AKL E
W KT
V GLG
R LLL
EL AQ
Y RH
LL VS
LL PE
IL DE
C RQ
P AM
Y IH
EL VK
AL SS
GL TL
VE AV
W TT
H DF
Q AC
IL KK
DL SS
AR RR
P KI
V AVG
N TV
P RY
K MN
AV IG
GL IL
R AAL
H KI
F MS
AE AG
W TD
VT GG
M KH
R AW
KG KK
C RN
C DD
H IY
Y TK
AL DL
Y NF
H RT
D DL
AE AR
ID EL
H RK
H KF
P NN
I MK
VD DL
AI AD
Y TQ
KE AL
M FN
C FD
TG AG
IE DL
W IN
AA VG
KE LL
VV GG
SL GG
SL PL
P FP
Y KV
P RN
RG RG
Y CL
Y CG
V ASG
SL AS
IP VI
AL AF
RE EL
AA EG
VL EG
M AC
AL RD
AD DL
LL VV
GL GE
VL AS
GL T
VL VE
Q CG
H CL
C FE
II SG
ANL G
P YE
P TK
AVL S
W YL
VE QL
RL AK
AD IL
C QS
P RQ
EE IE
AP AA
AL SG
LL T
TL TE
LL VG
H RY
AS VS
Y QK
C QG
VT SS
VI AG
ATL S
VL KK
LL AI
AL EG
K CG
W PD
KL AA
KE IE
P TQ
M KV
AD RL
I MN
W TS
C NP
LL DS
GL VL
AV AK
RE RE
AV AL
AA IL
W FS
LL EG
LL PG
EE VE
Y HS
IV AG
VP GG
F KH
VK KL
EL RR
W ES
C KT
VI GG
AV K
ID DL
C IP
TT SS
V VLG
IE AI
GL EL
F MD
SL VL
N QK
AL KL
TL TL
LL GE
K SLG
AE VV
AD VI
LL ES
NL KK
IV GG
V ASL
AL IL
Y NI
AA TG
V AIL
IE QL
C KP
F MP
AL SE
W AQ
T AAG
PL SL
IL KE
VG AT
TD EE
P VM
AE GL
VS EL
Y MG
R MK
P HS
P KF
LL VD
SL AR
SL AG
PE EL
W NP
W EK
VL AR
N KH
K AW
RL VD
LL TE
C QQ
LL EK
VW E
Y FY
QL AE
GL DL
AL VS
TL KE
AV AS
Y HE
Y NY
TG TG
M TV
RL RL
AV AT
AE VI
VL H
IL EE
AV VE
SL KK
AV SS
Q MD
Y QT
I MT
EL IK
AG GL
VS AA
SL KE
EL SE
I AGG
EE KL
EL VR
DL VV
R CS
AV AN
N CG
C TF
LL TS
KL SG
AD VL
GL SS
AK AK
VV VV
M IH
D AG
VE DL
A R</w>
VG Y
TL AS
R ALG
H KQ
V AIG
AL IE
N MP
SL AV
AQ AA
AG VL
AEL S
ID RL
TG AV
AL VT
GL VV
DE VI
P TI
Y ME
M AKK
AL PG
AG T
AK DL
P HG
DG VV
SL RR
RL KE
IL GG
LL GS
EE AE
LL KN
SL VS
P TN
IV AA
VL KE
AE VE
DL IE
Y YP
EL K
IL AS
ASG S
AE IE
AK VV
AI VE
VD VV
GL K
SL RE
RL RR
AI GG
A V</w>
I ARL
GL N
QL EE
KE KK
H HE
H AM
VL K
VT VT
SS TG
LL KS
IG SG
QE RL
C IY
W ND
AA VE
LL DG
DE DD
R MP
II GG
H RV
KL AG
AS AV
AE TL
SL PS
EL AD
P YP
KE IK
AQL G
AL VR
AL AN
Y FK
VL GE
DL DL
W AF
H RN
SS SE
DL VK
DL PG
KL SE
ID AG
KE TL
VL SE
SL TE
TG AA
P NI
EL VD
AL GE
C PN
VL F
EE AV
T AEE
IK PE
ADL S
N RV
SS GS
AVL D
SL TG
N CS
ATG E
KL ID
R LLE
RL AG
APE G
VI DG
LL SD
IL AE
AL KD
AA KE
AL ES
IL GL
C II
AFL G
C VQ
VG SG
H TN
R MR
N LLG
C VY
C CL
P MP
P IQ
I DLG
I ASL
EL EG
Y MD
KK EE
EL VS
VG AA
GL AI
H IC
RL AS
H FN
SL IL
P KY
AI EE
VK AG
GL VE
TV AA
DE IL
VAA E
RL AQ
AA SS
Y NT
TL SS
AT GG
C PF
W PS
C QR
VT VE
LL KD
AV KE
SL DL
F LLL
VL DG
IL SE
K AKK
H TQ
EL ED
AI AL
Q FT
GG AG
VL AQ
M QT
P MG
H YP
VV IG
SL DE
DL AS
DL PL
VV EE
NL TL
D VI
D GL
Y QY
SL AD
I AIL
AA VS
K MI
AA EK
Y NK
SL SD
DL AE
EL KD
AT AT
AI IG
RL RG
LL QL
EL EK
AP PP
W RN
T LLG
AL PL
AA IG
EL SS
R WG
KL SS
F AC
AV EE
RE IL
M YP
M HP
H DY
AV TG
M FT
H GGG
GL AR
P RK
PS PS
AA AD
C RP
ARL E
I ALG
P KQ
AL QE
VL GS
TS TS
W VQ
T MN
VD K
CG KAF
VL TL
VE VV
C TR
PL AG
AI KE
V ELG
DL EE
N MI
GL VG
GL AE
AQ LL
M IV
AL VK
KL K
VP VI
IL TL
GL KE
GG RG
P TY
GG SG
VL ES
VG RG
AD GS
AA KR
F QH
AV RE
F AW
C NN
AQ AE
TL RE
TG SS
SL K
IG RG
PE DL
SL ES
NG KL
M HE
C AY
AG IG
W FE
VK RL
LL SP
SL W
KL SD
I WG
Y CS
AR QL
VL VD
AV ID
TL R
H DI
IL SS
I LLG
Y RV
D KL
VL IG
IS GG
VE AI
II AA
AV RR
V AEG
Y VC
VL RE
Y QF
K CS
C CS
AL EK
C FR
VV SG
VL DS
EL DE
QL RE
IK RL
C DF
TG EG
P TV
AV GG
AG AR
EL AS
AKL S
C ER
TL DE
RR R
IL VG
I ADL
AQL E
VK PG
H DR
IP GL
C IF
AL KN
RS RG
IL TG
W QS
H FY
M HS
LL QE
C KQ
C CG
AG SE
AE DL
PL GG
KL VD
W IP
M CL
LL RD
AV SG
AV IE
AI AS
TL RL
F QF
IL AD
N AC
PG VG
AG VT
PE PE
AL KG
VE GE
M MP
C YS
KE IL
SL AT
DE VR
AE K
VV VD
VV DL
AA DG
AGL D
T ALG
I GGG
Y QI
VE TL
SS SP
GL VD
AL NL
V ATG
EL R
LL RG
SG KK
VG VG
T CE
II AG
DG TT
VS GL
Q MP
W AN
QL AR
EE AI
V ADG
SL R
P YD
ARL S
AA VR
VK TL
SS AS
AL SD
VK DL
VE GL
VL AN
PG PG
AL KR
Y FI
RL VG
N VC
AS VV
DE NG
AV KG
AA DL
Y NR
GG VL
VE LL
AV GL
W YG
VS R
TT TP
AQ TL
Y VM
IL AG
K MR
VP EL
VK VV
KL KL
HH HH
AE P
T GGG
Q AAL
C KI
TG P
VR GL
VL VT
D AI
IL TE
EE RL
AQ AV
Y WL
AQ RL
SS TL
W KQ
V K</w>
RE RL
PE GL
VS EE
SL VD
H QN
H FT
AL AP
AV KK
N MN
DD EE
AS AG
SL KD
SL IE
R AVL
KL AR
KG VL
VV DG
SL PG
AV VT
QL KE
AIL S
V LLE
LL IS
DL SE
DL VD
W NN
AEL D
Q AW
PS PP
C ML
SL IG
IK KL
DL AD
T PEG
AT AF
A G</w>
Y AC
Q VC
H NF
VV EL
V ASS
EE AK
VT PL
PE LL
IL R
GG PG
KE AI
N LLL
VLL S
TL AT
IL EG
AL RK
AA RG
F CE
QL VE
H WL
H KY
SS VS
FS GL
DL SG
DL GL
AL RS
AA GS
QR EL
P HP
T MT
EL TE
K MT
V AKL
H FK
SL VG
Y KH
APL G
H QT
C RK
KD AG
IS EL
W TR
VL PP
AK GL
LL AN
FL GL
LL QQ
DG KL
AL QL
AA IE
KT VL
AS VL
SL GS
C PT
VE VE
AA PG
SL IS
PL TL
NL SS
AL FG
AA VT
VV AT
SL VT
AE RE
H TH
P QN
M ALL
SL KL
AG IE
W QP
PL PG
W KP
VV SE
C PK
RL SE
AA VK
RS IP
AW D
AKK G
PE TL
LL ED
IP VV
VS TL
AN PL
F MN
AL DG
GL AD
GG EL
GG KG
H YR
GG TL
C VC
P FT
LL AP
AV VL
AG F
Y MS
AK AI
SL RL
SL FS
KE RL
H DN
LL NE
IT DG
PL SE
AL GS
LL VE
DL VS
AL RG
LL RS
DG TL
SL SP
VG EG
DL RD
VF AG
C HS
T AGL
KL KS
IL AV
ID GL
C RT
IT VP
AQ AQ
V AEE
LL PS
AL TT
AD SG
PL AA
SL KS
PL SS
I MR
H ME
EL EL
AL ER
W HL
VW D
RE QL
VS RL
CG KS
RS P
H YT
VS EG
TL AK
VL R
TL SE
H MP
PG SG
TL RG
M FR
LL KQ
AS AL
VD AA
SL VE
IL N
Q LLL
AG VE
C QD
VI VV
SL AK
H QF
TL EG
RL SG
K ELG
AG ID
RV AE
EL KS
H CG
GL IG
RL GG
IL RE
VV AE
LL TT
SL TS
N QH
F QI
Y YR
LL KR
AT AS
AI AN
AR KL
C YE
AL QG
T MK
EL KR
PL DL
GL TE
AL NE
H TY
H KH
SS VT
GL KL
AG KG
D GG
AR VL
I MQ
AK AR
QL AA
IL AT
IL AI
EE VK
VV VT
EL IN
AD AG
KI IK
C DK
AP SG
AL ID
F HD
M QI
AE AD
W QD
M FY
H IV
VG TG
RL VE
I AVL
IL DL
QL AK
AG TT
N AAL
W DT
GL RR
EL SD
DE DL
KL KD
Q WG
P NT
K ALE
TL TG
H TV
EE AA
AR EG
VD GE
VL VL
KE AE
EL IS
N ALL
R LLG
QL QQ
KL KG
T EEE
SL NL
R MI
AL KS
KI KE
AD TL
Y HP
AR AG
SS GG
AK EG
SS W
VL KN
DL VI
C IQ
AT LL
AE GE
EL ID
KI AD
D AD
C FN
EE DL
H TK
SS DS
M AAL
GL SE
VD AI
VV T
W RT
P RI
AA TT
P NF
IG QL
SL IN
VL VS
SL QE
RL IG
KL AQ
EE AR
C AM
P QF
N MF
AV RD
AV EG
V SLG
H VM
ASG E
V L</w>
SL DD
P WL
VI AS
IK DL
AG VV
VL Q
R GGG
LL NS
IG KG
M AW
KK IL
H QH
SL PP
K ALG
DL DG
C DI
AT TT
AL TR
GLG D
AK SG
VE IL
SS AA
K AAL
AL AH
SL F
EL RD
H MD
EL IR
IS GL
II VD
C DR
W SR
RH D
KI IE
K ELE
VG VV
RL AT
EE KE
AI KK
VG GL
H DT
TV AE
SL DS
I VLG
P HE
AG AE
DL IS
AL IS
AT VT
AG VS
SL PE
VL SD
VI SG
TL KK
SL GE
K AAG
AV RL
V SSG
RV AR
AA QL
LL KG
FL DE
Q AVL
QP VE
AI GL
VL PG
AF GG
VD GL
KL QE
SS TS
EL QE
C RY
LL F
LL DD
VS QL
VL T
I WS
SS SD
C KF
C ND
ID AV
W KF
VS AS
VI AE
H NY
IK AG
W RK
IV SG
VK AA
TL AD
TI EE
SL RS
SL FL
DL KE
T WG
VI VG
VV AI
R AAG
D RL
TL ED
KI AE
T WE
VS IL
Q VW
D AT
TL GL
EE EG
TG KS
VL TS
RD VN
EL KN
KI KK
C FP
AD GL
NL SE
H FR
M QF
IL KN
AG SS
AA AN
AI VV
C QP
W FF
H QK
FL EE
EL IG
P FI
H NI
AV RG
AS AR
VI EG
LL ER
AT AL
SL RG
KV KE
VL KD
AS RL
IE AL
AF SG
AI AK
AA FL
H QI
PL PE
KL KN
W IF
R MT
N AW
KH D
AT VE
GL ID
PL VR
P FY
IG DG
QL QE
C YD
W FN
TL TS
TI AA
SS PS
FL AM
AE VR
M VC
KK IS
AV IL
AG EG
IE K
TK EE
M FK
LL FG
H CS
DL IK
VGG S
RL TE
M YR
M ASL
KK AK
AG AL
SL VV
EL NE
N QV
AP GG
VD VL
VP EE
NL SG
I AKL
AA VQ
VG KL
RL SS
P CL
C DY
GL PE
VE VR
IR PL
EL GE
F AAL
TP PP
R VLG
H QY
EL GG
AE RG
F QY
DE VV
DE LL
AG DL
W ML
TL AL
AV VS
AFL S
AD KL
V TLG
V KEG
FL SS
DG SG
EL DL
Y QH
VI GE
KI IG
GL SP
AP VV
K LLG
AV AI
PL VV
VP P
VK DG
I AEE
VL AI
PL IG
LL TD
ID AA
AG IL
IS AS
I ASG
DL IL
SL T
PL GS
KI EE
H YF
H HD
RV AK
AFL E
AG AI
TL DG
IS AL
RL IE
FL GG
D SG
RR RG
KR KK
K RLG
IT GG
F MT
TL AQ
PL TG
IV GL
C FF
VI AT
R MQ
KV AE
KI VE
RL KS
IL NL
AG FE
TL T
RL KL
EL SG
C TN
AA T
DL TE
DE VL
AV KN
VS VG
SL EG
ID AI
W ME
FL KK
DL PE
RL KR
Q MN
KL TG
IL KD
GG LL
C PR
K R</w>
IL GS
DE GG
VV GE
VK AL
AV PG
EL KG
AV SL
AE AQ
W RP
AYL G
VT AI
R CE
NL EE
VE TP
SL KG
FL KE
VK EE
PL PS
AR RG
EL ES
TL KL
LL Y
II IG
IE KE
GL TP
VK VG
N WG
RL VR
DL KL
AT VL
AT SS
AE RR
QL AD
VI EE
P YY
K AEE
AV VR
VT AA
D KS
TR FF
KL GE
EL PL
AG KT
SL VN
TL KD
QL AQ
AE AN
TL SD
VE Y
SL VR
IV DL
VG IG
P MS
EE ED
KS KK
RE AA
GL SD
GG DG
AS EG
A Q</w>
RL RD
VL PL
TL PL
AI IS
V ANL
VL FS
GL DE
RL QE
M MK
K ASL
GL W
KI VD
K CE
AT AR
VL NL
AV EK
F MR
IL KL
P RH
AS AE
QL AG
P IH
LL QS
IL T
VV ID
ASS E
IT AD
IK EE
AI SS
VD KL
GL VS
IL EK
F GGG
RL VV
VI DL
VG AV
V AQL
TV SS
R MV
R ALE
H MS
VI AR
R ARL
VT AG
RL GE
QQQQ QQ
AI AQ
RL ED
IR EL
AK TL
VL VI
AP EE
KK LL
VV DS
VE VT
V AKK
PE KL
SS IS
YL EG
TL VG
GL IS
DE IE
AV SE
APG S
AG IP
VL EK
IK AA
IF GG
GL Y
ATG S
AML G
T ATL
Q LLE
N MT
GL KK
II KK
D TL
AK GG
VL FL
RK RR
PG SS
KG VV
VP AD
IK NL
H KV
RE LL
M QH
KL EK
RI VE
SL RD
PL PL
H DK
Y HD
VL TD
NNNNNNNN NNNNNNNN
IL AK
EE DE
AQ AL
C QN
AV KR
AS KL
APL S
IR DL
VS KK
SL VK
F MI
AL PS
VL RR
C AH
TV SG
RL RS
M QY
AV T
AS DL
R LLD
YL GG
P RV
Y IV
RL W
IK TL
I ATL
AA SE
LL AF
KL DL
F HF
AL DP
AG IS
QL SE
AI RR
AP EL
VK VT
T SSS
FE EL
C WL
C RI
W VF
AK VI
PS TP
II EE
DE VE
DE AR
AE IG
H NK
TV EL
R AGL
AI VD
Y IC
SL Q
II AD
GL VT
AK VL
IE NL
DG SS
AE NL
AE ID
DL KG
AV SD
AL ED
GL VR
FL FL
DL VG
AS AI
W AH
AL RQ
AG KL
IL DS
EL KL
DSDS DSDS
I K</w>
D IE
VV AD
R R</w>
P CG
LL RQ
WY FL
VS TS
II DL
IL IG
IL ES
VS AG
SG TE
I AVG
EE KK
AS VG
AE VD
EL RK
W RF
DS TS
W IQ
QL GG
AN VV
VI AD
TL PE
K ARL
P KH
EL VN
N VLL
GG VI
AR Q
AE IL
SL AI
P QK
KV AD
AA KG
VR RL
LL EQ
TL RR
R AGG
C RH
TG RR
SL KN
SL ID
K A</w>
SE SE
II LL
EL AN
AT AG
VT EG
KE VI
EE VV
GL RL
AP AS
TG VG
EL I
TT PS
AV IS
AL DE
GG VV
VAA S
Q CS
LL QR
EL T
AR TL
FL SG
AQL S
VL TE
M MT
LL KT
TT TG
SS EE
KI ID
K WG
H FI
GG PL
EL TG
AK EE
AD GE
VL KR
AI AI
W YD
PL IT
NG SG
AK RR
VL KG
V DLG
TL KN
AK GE
V APG
VL VN
SL NS
NL AA
P NR
II KE
AT VV
AL EN
Y WG
II VG
AL TD
TV AS
GL EG
W HG
K AVL
IV AV
IL SD
AL VI
VS VS
DE VK
IS KL
V AKG
SL KR
MS KS
AP PG
M YF
GL DS
AL QQ
PL SP
M WL
IS LL
II RE
I TLG
VG KS
IR KL
AR GE
VV AS
VE VK
SS KK
SL AQ
IT RD
I WD
AR AV
AI IE
Q MT
VD AV
RL KK
AP AG
VL RS
I ATG
AT AV
W YS
RE TL
AF AA
VE KG
I GLG
AL PP
VG RL
KR AR
K AGG
D ID
VE AE
RP GG
F VW
W MG
VF GG
I WE
AL IR
V RLG
I ADG
IL VD
C HP
VS N
II GL
RL QG
NL IE
KE GL
AA ID
W ER
IL FL
IL DG
C TK
AG VK
T GLG
NK ID
DL RE
I GGS
DE ID
AL IT
VL Y
C KV
RL PL
D FL
VL VR
VL KL
M AEL
VT VG
SL N
SG KL
RS VI
QL VS
PG GL
TV AV
IS EE
AR DL
VE VI
RL Q
XX XX
V VLE
TV TL
AG AK
AA QG
W EQ
SL AP
QL KK
IE KG
AI LL
D YL
C HE
AT SG
AN AV
IL PG
IL KG
V ELE
PE P
KV EE
SL IT
AS PS
C PQ
C PI
SG KG
R ARG
PL VG
VV SS
R ELG
MG Y
IV KE
D DE
DL SD
T CP
IT EE
C TI
W EN
VI DE
VE AT
NL AR
M YN
FG AP
DL VR
AKK E
DL PS
RL EG
AV GE
YL RD
W QK
TE AA
LL KP
IS VS
AL TP
IG LL
VT EE
V APL
TL PG
GL IN
AD IE
TL RD
RG VD
KK RR
K AKL
KG TG
GL PD
W SF
QT RE
AI ID
VR AG
AL PE
GL AS
W ET
C DV
AR GG
VK VI
AI VT
SG IL
RV PL
V ILG
SL AF
IS RL
EL QG
YL KE
VD RL
KV KK
AV AQ
QE KL
IR AG
IP VG
VT FD
II SS
TL VE
DG TP
QQ QL
DL DD
VT VV
AL FS
VP AG
TV AG
TG KP
SL EK
SG VV
LL NG
IL RR
VI GL
PE AA
P VC
M AKL
KL DE
DG KT
KL TE
SG AV
FD VV
AF AG
VI KE
RL KG
IY P
W YE
RS AA
RR AR
PV AG
LL PP
IL DP
IG ID
W DK
VL DD
LL VT
T SSG
T ASS
LL DR
IG QG
I AKK
EL ER
RG TL
NL KE
I AEG
EL FE
AL TS
AF AR
TL IE
RL SD
VL ED
RE AI
F VLL
VL KS
RS GG
NL KL
ASL D
QV AA
H YK
AS AT
GL KG
AS IS
VS AT
QL SS
Q MR
Q ALG
EL KT
AL KQ
V ARG
VL PE
VG ID
KR KL
IE GE
DL IT
AS ES
TL VP
P FN
DE DG
AGG E
T WS
RL TG
IL PE
VG DG
IP AD
AA VD
VL RD
VI ES
C KY
AK LL
V SSS
KL VK
AG IT
W IY
VL ID
P QY
K AGL
EE TL
DE AA
AS TL
VV VS
I VVG
AR VE
KR VD
TV TG
GL DG
AK QL
VT PE
VI SS
IR GG
VLP WG
SL QL
KK IG
IE VE
Q MI
TS TT
PL RD
F HH
SL NE
N CE
H NR
VP AA
M MQ
IL KS
IK VV
EL RS
AP AE
AI TG
RI IE
HVD HG
F HN
AK AE
W SK
VI IG
K VLE
C DN
VL RG
KN KK
IV AT
F HQ
VR GG
T LLS
QL VG
IV KK
AL QR
C TH
AA YL
RV GG
PP AP
N HD
AD AD
RL VS
RL TL
SS NS
R RLG
IS IL
F HR
M FI
KV RR
DL TL
SL TD
FE GG
VS AE
GL FL
AL KT
V PEG
T VVG
LL DP
IS KE
AI AT
DL KD
C TY
SL IR
RL FG
RG Y
PL TS
EL DD
EE IK
QL KS
PL SG
IL NS
AS VT
C QK
VV VE
K AIL
V ALE
KL TD
II TG
EL TR
AI KG
EL AV
DL ID
VL FG
TT LL
QL VR
W RH
VD AG
IG FG
DG KK
C NK
C DT
VAV S
KE VE
I VIG
AK VT
V KKG
IE VI
GL GS
FL KS
VF TG
N VW
N HF
AHL G
R AEL
PG TP
K VLG
DE AI
AV TE
R ASG
PE VV
P NY
Y HF
Y CE
R K</w>
KL GG
IP AG
H YY
EL TD
TL KS
T AVL
SP EE
H VC
AT IE
R MF
RI AE
GL VP
P QH
KQ KL
IE KD
SL TP
LL PD
GL RG
P AW
M HD
RS AL
IR RL
D SS
D SE
VT IT
VLPWG QMS
AG IK
AD IG
AD AE
VL TT
RV AA
PS TS
KL AS
GL Q
W ST
W SQ
VR IG
TL GG
DL GG
AD SL
Y KM
LL ID
KN IL
AI PY
PL AV
NL ID
DL KR
IP EE
KL ED
TV RG
TP TP
TG SL
SG IG
RR K
APG E
KG VI
DG VT
AS FL
AL DS
D VT
TL ES
TL AV
P YR
LL FS
W DR
AF SS
GL RS
DL RS
VL PS
NNNN NN
KK GE
IV EL
D VE
TE EL
P YF
IR NL
AL FL
TL DD
RL SP
KI IN
RD KL
DE AV
AI EK
TL NL
EL VG
AE Q
GL KD
DG EE
D AP
TL SL
RI AA
IS NL
AF AD
V ALS
IL GE
AI AV
R SLG
PE AV
KKL S
D NL
P QI
KI RE
IS AG
AP Y
AL VN
AI RL
GL AT
GG VG
VT SG
LL VK
PG VL
AL FE
AS EE
AK VE
KL IK
RE AG
M MR
H NT
C TC
KL Y
IS PL
H TGEKP
D IG
TS AA
TI SS
I MV
SL IP
RR RL
M AGL
KL VS
DL VT
AD QL
IG VV
AQ AG
AE TG
TPG HVD
AD VD
VR SG
MS GG
I SSG
GL IT
C YT
AD AT
A D</w>
C YR
IG AT
AP SS
V A</w>
I LLS
GL VI
GL EE
FL KD
EL VQ
DG DG
AS RR
VR EE
KV AA
IS AA
AQ QL
AK AV
TG VV
PG VP
PE QL
IV EE
IV AL
EL IT
AS VE
V VLS
VK AV
R AIL
KL IS
IP VS
AE ML
AE KK
M YY
GG AV
VI LL
V KLG
PD VV
IK N
F HY
Y MP
R WE
P ME
P AC
K GLE
IL ED
C TQ
AQ VV
AE IT
VT PG
LL SR
KE AA
ID EE
TV AT
GL KN
AL QD
AG RS
NL TD
AD VS
VI KK
QL SG
IL VV
AG VH
W HS
VT VK
RE AR
DL EG
T MF
II SL
DL DE
DG RL
YL KK
AL VP
VV TE
VG SS
R VLE
DL KN
GG RT
D KK
AG IR
H KM
DT PG
IL TS
AS AP
PS AP
KL RS
VK AT
VE IE
VE NL
Q MK
VE IG
SL VP
DL DS
AD YL
DL T
AQ AR
VI NG
RL GS
LL SK
KE VV
IK SL
WN FGS
VQ AA
TG AP
KD KK
ANG AS
AK N
AA AY
VS Y
KE KG
ID Y
C VH
AG KK
AE VT
TG TP
SS FS
C NY
AG AP
AD FL
M QV
KT KK
IV LL
RSIP NKL
EL N
VS KG
RV LL
RD GL
K AEL
TL VV
SL TT
RD DL
NG VV
I VDG
GL KS
FE KL
AK PG
VL TP
T LLE
PE AI
KR KR
P FK
IN KL
DE N
W RY
V ANG
VL QE
VI DS
SS IG
SG AA
ID QL
AF AV
EL Q
DL KS
VS AV
VS AL
T VLG
VR AA
VL PD
VD VI
IL Q
F GLG
EL AT
C NT
AG RR
VK AI
SL H
KE AV
EE RE
DL TG
W TQ
TGS NN
IV RR
GL PV
AD VE
QH D
KV AV
AS RG
VT VD
P FR
Y HR
VSG S
SS TT
RL AV
PL VE
N PLG
K KLG
HL RS
GL DP
GG IL
M MF
K GLG
IE AV
YL HS
VV RR
KI AA
VS IG
SS ST
QL TE
W QN
NL TS
KE RE
FWG ATVI
FL ES
EL QD
AK TG
SL FG
RK KL
AL SP
VV TG
VG KG
V TGE
N ALG
KI IS
AQ GE
R ILG
IL NG
APL E
AL W
Q CE
VK SG
I MI
D AE
VT VS
M AVL
VP VE
VP VD
P NK
P MD
P LLL
LL EN
IL VT
AL DD
AK NL
N HH
KL NL
AE RF
SE DE
QL ID
C FT
VV AR
TL SP
PP PG
NS NS
DL SL
F QV
AD PE
NG TL
KP EE
IS VG
EL AI
EL AL
P YT
M CS
KE IT
KE GE
AI SL
TL NE
SK EE
RG IP
R AKL
KS KS
VL EN
TT IP
KL ES
ID ID
I ASS
W DF
TI AT
PL AE
LL TP
FL EG
V DLS
RG VV
C FY
W HP
VK NL
TL VS
RD RD
IV AE
TS SL
T LLD
R LLS
PL AR
II AE
WYFL FAY
W AY
VTL E
RV TG
VR KL
VI GS
KK IE
K AAE
DE AL
C NR
GL FG
VI VD
H CE
FS TL
VK GE
KP KK
IG AE
VS TG
VL NS
R ASL
GL NL
AE H
T ARL
GL RE
YL SP
VL AP
PG VV
IV SS
FL SE
C NI
SL IK
RV RR
IL PL
F LLG
YL TR
Y QV
VLPWGQMS FWGATVI
VG VE
K L</w>
IS QL
II RR
H AC
DL N
VT AS
SL AN
R ADL
LL QG
DS EE
D PE
AG TE
LL TR
IL KR
GG FG
RL IT
NE KL
IS TL
ANL S
VL NG
V REG
RL AL
F ALL
EL RQ
VL ND
RS RR
Q AEL
PD AV
W CG
VR VV
VR PG
IE DG
AK RQ
AD ID
VL ER
R GLG
EL TL
AS QG
RE AV
I EEG
GL AQ
DL IN
M ASS
KL IT
DG EG
YL KS
Y LLL
T MI
M YT
I MF
H LLL
VR DF
VP VS
KL PE
IE AE
FL IG
AI AP
WYFLFAY AIL
IS EG
AN AA
MG EL
II AS
F AGL
VL IT
VI DD
IL QE
FL RR
VK VE
SL KT
C DQ
RG ID
N HN
IS KK
C YP
Y TV
EE IR
DL EK
DE IK
AD VG
RG SG
NL IK
NG EL
TL GE
N HT
AG FD
IG IL
IE KK
I AQL
C PV
KL NE
FS SL
C MG
AS PE
RI AR
M HR
LL RT
LL ND
AV GS
AG EE
IL SP
VK QL
NI VG
TL KR
I IDG
ER IL
C AC
W TF
DL AL
W PP
QM NE
N AGL
KL FG
KE IS
VE GG
GL TS
AT AE
Q AAE
PL EE
NG NG
IE VG
D IT
C VM
VP SS
M HY
VI RE
T AGG
DL ES
AD AR
W II
SG VG
PP PS
K LLD
IV RE
DL Y
W KY
TG TS
IT SS
ID GE
AT PE
AL IK
YL SS
VL NE
VGG E
VL QG
TT AA
II AR
IE TP
TG EI
QE AV
KK IK
PL AS
AK VS
VP VT
V DLE
QL AI
KK IT
IV AS
IE GL
FL DD
AF AE
VD Y
T AEL
PG IG
FL AG
RG RR
IL VN
F AEL
C ME
SL DG
I VEG
HL GG
DG VI
AV AY
AQ EL
V EEE
RG RL
IE RE
FL NL
DL VN
AL RN
VL QL
IL IS
F ASL
KK KS
IT PG
GG EG
Q ARL
C PY
W DY
SG AR
KG KT
F MK
I SLG
GL PS
AL SK
TG KL
RI RE
KG VQ
IL TD
IG RR
AIL D
IG AV
GL AF
A N</w>
IV RG
IP SS
AT TG
Y MN
VK RR
VG AD
IG KN
K WE
IL RL
DG TE
W SN
TV AD
TL ID
SL PD
RV KE
M LLL
KR VV
IC D
H QV
H CD
AS TT
AG VR
VP PP
VL SP
NG IK
F VAA
TS PL
TL AI
SSSS SSSS
KL IN
W CL
VL KT
LL RP
AP AT
VD FL
V AFL
GL DD
AP KG
DL FE
AA FG
SD EE
EL QL
EL PP
AR AI
YG TG
T MQ
KKL E
AE KE
AA TE
VQ AV
TL Q
NL PE
YL EE
VG IN
KTT TT
AG VI
WYFLFAYAIL RSIPNKL
W NK
TS EE
II AV
NL NL
IT PE
GG RL
VP AS
VL VK
TN LLS
P CS
VD SL
SL QD
N MK
AI SE
TV RE
TG TT
AG RL
W IH
V AKE
TL TR
PS RR
NL DE
IE SL
I IEG
GG TS
IK PG
C LLL
IL RG
EL RL
AL IP
TH D
KI AK
W MS
HR DL
SG SD
EL QR
D PG
C NF
VD VS
PS AS
KT VK
IV AI
IK DG
I AKG
DL GE
AT AK
AE AK
RE AY
Q ALE
KP LL
I SSS
Q LLG
AA IR
W FP
VR AV
R AAE
M VW
DD DE
PL IL
KI GL
DG SL
C KH
VI AV
KL RG
II AT
VP LL
VL DP
RV AV
IG EG
FS GG
VS DG
TL QE
TG KG
KL VN
KL EG
F HK
W WL
W TN
P QV
KL RL
F HT
VP VL
VE KE
EL DR
VF VG
RL VN
PL GL
IS TS
AG Y
W TY
IK AI
F AVL
AA SF
P HF
KR RL
QL RR
IT GL
SG TT
R ELE
QL IE
KT VV
KS LL
AG TP
AF LL
VF AA
VD PE
TL PS
FL AE
EL FG
AG K
W TK
V RRG
TG VI
RI EE
AQ AI
V FLG
SL TR
PL RL
P WG
IG HG
EE RR
T ASG
NL KG
M HQ
GL VK
EL VT
IL AL
DS AV
AR AK
M ADL
H YN
IE RG
I ILG
VF DL
TR TR
TP SS
TE KL
P KV
KI VK
DE RL
C IC
VV DE
Q MV
TT NS
RL KD
RK LL
KK AI
D RE
AL RP
IE ID
FL AA
C QT
T CD
RV SG
RG KG
DL EL
TG FG
T AAE
QG VD
PG VT
EE ID
Y HH
VP AP
PL TT
AV ES
VS ES
VR AR
SS IL
PS VV
IG VI
TL N
SS VG
GL AK
C FK
VK SL
QE QE
Q MQ
Q ELG
NL PL
IV PL
IP TN
I EEE
I ANL
VI VT
V APE
SG AG
GL PP
GL IR
AS KK
SS RS
RR GE
Q ALR
KQ RS
AE VS
N K</w>
IV AR
D VR
AE TE
VD SE
VAG S
NG IG
IL AN
EE KR
AL QS
VE VS
R V</w>
QL RD
IG AY
AT VG
V PLG
IL RD
F VLG
W DI
V SLS
P VW
RG AA
K TLG
AI DG
TG KK
EL SL
AL IN
W VY
T ASL
I RLG
I KEG
DT VV
TL TD
QL VN
QL EK
PE GE
AE PL
TG KR
RV KK
M ATL
AS PG
AL AY
VT VP
TL VD
IL VL
DL NL
D KE
AL EI
AA IT
AA IQ
TL EK
RG VT
IR SL
IK NG
TG RS
T K</w>
EL NL
AK DG
AI SG
AI GE
AA RQ
QL GL
LL VR
IP VT
F CD
AE SE
RI ID
Y VW
VE GS
TL TP
TL IS
TL IG
QL VD
LL VP
IG KS
V ARE
LL QD
AK IL
AG PG
QQ RL
PS PL
AP TP
AA AF
YL AE
VV EK
TL VT
QL R
EL RG
IR N
AML E
AA ER
SS AT
IV NG
H VW
GG GGG
VL RQ
VE K
RI RR
ID NL
GG AS
AK AF
PS SL
KE NL
IK AV
AE SL
VR DG
VE VL
VD EE
TV AR
KL DP
W AM
VS AF
IK SG
FL TG
AL HL
KI AR
VE N
C KC
VS PE
V GLS
PG SP
KL IG
IT AI
IS DE
LL VN
KK GS
AT DL
TE DG
PS GL
PG VY
IR AA
PL VI
KL TR
KL N
KE IG
EL VV
AS KG
W FT
VS AI
VL AF
EL FS
Y AW
RP RR
RI AQ
M HH
IP SG
AD AF
VR DL
NL FL
AH GG
AG SL
VG AI
R ALS
KL SL
EL VL
AE DG
W NT
TL RS
RD RR
PT PS
I AAE
RP NVG
IG EE
VI VE
LL DK
VQ VV
VI RR
TG EE
IQ AV
DD FD
AA AM
YD AI
TI AR
M AAG
KR EE
DE AE
D IK
AS TE
AS IT
AQ RQ
AL VF
VT SP
VN TT
VL ST
IP KG
EL IP
AT VS
AE AM
RI KE
N AVL
KP DL
ID PE
TI AG
RG VP
II NG
AT VK
Y ALL
VI TS
V NLG
TL K
IE GG
DS SL
VT IG
M ARL
QI AA
NL KS
N ATL
IS TT
AP PE
W DN
VE Q
SS GL
SS GE
RV RE
LL NN
EE GL
DL TS
D IP
AS IG
AF HF
SG TS
IL TT
AL EQ
KL FS
IV AD
DG SE
TV RR
KI VV
I AIG
C MS
TV KE
HE TGSNN
VE RR
RI VV
QL IG
DD AI
AY RE
VL IS
K LLS
IV AN
IN PE
AR VV
AP AV
C NQ
AR QG
ER AG
EE GE
D VD
AE KF
SS PG
PL AT
PD NY
NL AS
AE GG
VV PS
SG VT
SG SE
Q A</w>
LL IE
IL F
I APL
VS VD
VEL S
TW IGG
RI VD
Q AAG
K ALS
FS P
AT IT
AG TS
VV RS
KS GG
FL KN
DL QL
R VIG
IV TG
AR RE
AL FD
VE SL
N HI
FL SL
AI P
VS PS
M YK
LL RK
IK P
AT GL
TP EL
RL PE
PL VT
I AYL
AE GS
VE RG
T ADL
KL VV
IN EL
H NQ
IS PE
DS DD
VD EG
SG VI
RV RG
IS TG
DL SK
D RQ
AK RG
VLPWGQMSFWGATVI TNLLS
KL KQ
QL DE
KL QL
IL ND
EL KQ
AS TG
AL AM
KK KE
IK VI
AE KG
T AIL
LL QN
KK PE
IG VG
I ELG
PG H
NG SS
LL ST
VKK E
GG ML
EL DG
AS AQ
VL AY
IL FS
DL AT
AP GL
VV KG
VG VN
TL HL
RDVN YG
IS KP
DL AN
TL AN
R A</w>
ME AL
K S</w>
AI IP
AG FG
RG VR
R G</w>
NL PS
KL RD
IT KE
IR KT
I KKG
AI TR
A P</w>
W HE
VG EE
TL TT
IH TGEKP
DL AQ
AV ER
AT TF
AG AY
VV FG
VI AF
TI SG
RI IG
R SSG
QL KQ
W EI
TP AE
KL RR
IL NE
VQ AR
IS ES
DL PP
D VS
AD ML
TP RG
PL RG
M YQ
KV RE
AR KR
YL DG
SS TE
SL VI
M ALG
LL AH
GG IG
F AAG
W DQ
ID AL
DG AA
AI EL
VTL S
NL RS
AF VS
W VM
TT TS
SS RE
R ILE
MK KL
M HT
M CE
YL RE
VV ES
AT EE
AR RD
AK EK
W VH
TL GS
M AEE
IQ EL
GL RD
F WS
DL ER
AR GS
AL RT
VE VP
RL DE
RG EE
RF RQ
QL AS
IS N
D PL
AK P
VV AL
VD IL
R VVG
R CP
DL TD
AK IE
W SI
VY SG
RS PS
IT VN
FL RE
DV VL
VV LL
R CR
PE VL
KL Q
KK AG
IT ID
VE SG
IP AN
AS PL
VAG E
TI TG
IF DE
FL SD
DL AV
Y GGG
VS FG
VG AL
AG SD
VT VR
TR KG
TL QR
TL FG
R AKK
P CP
K AKE
AA SV
V APS
NL KN
NG KK
KD IE
FL GS
AA ES
VG SL
QL IR
NL SD
IQ AF
IK GL
F WD
D FS
VS KL
VI TG
V FEG
T ELG
QL VK
QL N
EE IL
AA QR
YF RD
VE VG
TS PE
PL TP
NG KS
IS KS
VT TT
VE VD
VAV E
RR EG
NL TG
NL AK
NG KP
K AIE
H AW
EL PD
VV KE
VP VP
VE AQ
TV AL
ST SS
QP QP
QL EG
KD KE
FL KR
Q LLS
I LLE
FG FK
EL QQ
AS RS
AR IE
AL SR
AH ID
AG DG
KD IL
DN VV
DL IR
VL IE
RV RL
RR RE
PL RE
AD IS
VS VP
VE AN
T TLG
R WR
IT TS
IN KK
AP AR
LL YL
EL EN
DL NE
DE KK
Q WE
NL IS
I PEG
D SL
W YR
W QT
VV RE
VS AK
VI SE
VG KT
VD ES
TE AE
T AEG
AI ES
AF GL
T MV
SG RR
F SLG
F ALG
DL RL
RS RE
R CD
KG VN
IL DD
VT SL
IN EE
VS KE
VS FE
VI GD
SV EE
PE RL
M MI
IT SG
AS KR
AL VQ
AD KK
TT GG
NI SS
DL NS
DE AG
AD VP
Y HN
V E</w>
TG EL
PL IS
KL DD
KI SS
KI FG
K EEE
IW GGFS
AT TS
W SY
TG KE
IL AR
FL VG
AD EE
R ATL
C HT
AG AN
VI SD
VF SG
SL SK
LL SN
DL IG
AA RD
V AYL
KV AK
DL TP
AV VK
AL DR
VI AN
KG KL
FL DS
ANL E
Y AAL
TR SG
T SLG
SL QS
RG EG
NL TE
GL QG
FS RL
T SLS
GL KR
FS EL
AL NG
VP SG
RR SS
FL RD
AS EL
AS AK
AA TS
RI AK
IL VS
AP AK
V ILE
RL IN
PP SS
NL IG
KG ID
AE IS
W IC
VD AT
KT IE
KEL S
IP AV
GL TR
V GLE
T IKD
H LLG
D PS
RG ED
KI VG
C IH
YL KD
DL ED
VV KK
R WS
IR Y
AE AF
AA QQ
SL ST
KL GS
IL KQ
II AI
AT AI
AG KP
PL VL
ID TL
AS QL
AE PG
QE EL
D FE
C QI
VP PG
IT TL
GL AP
SL FD
RI AN
LL SI
LL QP
IT TT
FL DL
AA SI
W MD
VK N
SS KS
RR RS
Q AEE
PL EG
NL IN
NI KK
ID ES
D RS
D IS
KL AL
VS VV
KR ML
AT SE
VS DE
K ILG
IE IG
TG IS
TG DG
RE AF
P HH
IT DE
F MQ
AI AF
VI KS
VF VD
P MT
M HF
KL FE
IT AS
IG IG
GL ED
AT SL
AL YD
V GLD
NE EL
GL KP
AE KS
VK PE
I AFL
FL N
C FI
VK RG
QE QL
KG Y
II AF
VL SK
VL HE
T GGS
T AKL
RE GL
DD IE
VS PL
QL IK
PE VQ
LL QK
KL KT
KL H
AS TR
VR Q
VR NL
KV SG
AR PG
AA HG
VV PG
VP IG
TP DG
RV AG
R ASS
AD VT
TL VL
T AVG
RI KK
KV TG
KI H
FG IG
VT AV
VI PL
T VIG
PL NL
LL HL
GL IP
C QF
LL IN
IE KS
EL FD
DG AE
AT IG
VE KD
PV PG
NR PL
VK AR
IE Q
DL KT
C QH
AS VR
TWIGG QPVE
LL DT
KQ RL
K ARG
I LLD
FG AF
AT PG
RI VK
PE VT
KK RL
KD KG
II AK
EE EK
DG AV
AG QL
QE IL
NL RE
I L</w>
GG QQ
AP PS
LL IR
K VIG
K ILE
DE KG
YL AK
VL IP
K CD
IE SG
EI AK
AG AQ
SS ES
R AQL
QQ AQ
PL KG
KD VV
GL VN
FL AR
DE AQ
SL EL
KP GG
VS VE
NG EE
KL KR
FL RS
ANG E
T VLS
QL RS
GL YG
AT SP
AA PS
VI PE
TG RP
T CR
LL DY
AA F
SG DS
R SSS
LL YG
IL Y
VI KG
KI KG
VS DL
VR TL
NS SL
NL RR
LL PF
K E</w>
AF KN
AYL S
AS FS
AR DG
VT ED
VI AQ
RL ES
IP GG
DV AK
AD IT
VV RG
VG FG
RE ML
KR AV
IS AV
IP TT
TS TL
P ALL
KL F
CG KT
AS KS
VR VL
SL EN
KI TG
IP VD
GL AY
FD AI
VK IG
TV GG
QL AV
N VLG
KE AG
K VVG
IT ES
FL TL
D AR
VR NG
LL RF
KT VT
F APL
RG NP
KK VI
IL PS
I APS
TE KE
NK YG
IR IG
C QY
AD GG
TP AP
T APS
SG TG
IR KK
IE TS
GL TT
Q ASL
P AAL
I AKE
FG DG
C FQ
VSL E
VI ID
RP SS
RI AV
PS PG
KT VE
FL EN
FG AD
AT IL
AE AS
VR SS
RL VL
DV IL
DE RR
AT RG
AL PD
AAAA AA
VV GS
RL TR
Q AIL
IR TL
DI VL
DI IK
Y AAG
YL DL
TI AD
NG IS
N ASL
KR VL
K ADL
H AAL
EL QK
EL IL
VV AK
VR SE
SG VD
PS PE
KT KE
K WS
I DLE
H WG
VQ RD
SG RG
IK AQ
IG AI
F WE
AV TL
ATL TRFF
VK TG
RL DG
NL EK
KR GG
IR EE
YL SE
VT DE
T RLG
KL VG
KK EG
KF KE
FHP YY
AE YG
W KI
TV PL
ML AT
I KLG
AG SP
V AFG
T ILG
RK AA
LL TN
LL RN
KK VL
F LLE
AE VG
YL VG
VV PE
VS TT
NL KD
EL PG
D AS
TS IS
P HR
LL ET
IP VF
VS AY
VE KS
SL YS
NN NS
KV GL
AT SD
VT AL
VI R
SL RQ
RR ID
KL AN
IS DS
Y IM
VLPWGQMSFWGATVITNLLS AIPY
TR IR
SL RN
KI AQ
IT TE
IE NG
DL AP
VN AG
RV EE
LL TQ
FD RG
AA RS
Y MT
TI RE
RL AN
QL SD
QE AI
Q MF
LL FE
VR VD
T ALS
SS RR
RL IS
RL DD
II VR
EL TS
AG TD
AG AF
VK VK
VE NG
VD FS
LL LLL
IG KK
I PLG
V RLS
TT EE
TE SE
RE KE
PL FL
K SLE
IL AQ
DL H
D AQ
VT TG
RF IE
FD AL
AI RD
VG VP
TR VG
TI RG
RI SG
I WR
AE N
TK KL
RK AR
IT VD
IE N
D AK
C MD
A I</w>
TP AV
PL AL
KT EE
ID YL
AI DL
AE VK
VE SS
KV IL
IV TL
D TS
C HR
AI ER
VQ KL
VD TG
RG IR
RD AV
M LLG
D DG
AT DP
AP VS
W NY
NL SL
K G</w>
IP VE
IL RS
FD AV
VP PS
VL AH
TP VT
W TI
IV SL
IK VN
GG AL
AS HL
VR P
TS PG
RI VG
QL Q
EL QS
AI SD
AF GE
Y CD
V AHL
RL QQ
RG FS
QL IS
PE TT
IN P
II VN
EL TQ
VE SE
TP SG
VR AF
TG ST
RR SG
NL VK
KS KL
KS IS
II ID
EL F
VS ED
IE IE
W NF
IL QG
EL SP
EL DS
VV TS
VD ID
RK VV
IP TP
I CLG
DE VT
AS KE
YL KN
T ATG
NL AE
IL IN
CT GG
VS VT
TL NS
T VGD
RR QL
PL TE
N LLE
KI IT
K DLG
K CP
C CE
YL H
VL VP
VF DG
P HK
NL VE
N A</w>
LL VI
LL AY
IS FL
FE KE
VF VN
TP AD
PS TT
LL VQ
II FG
GL FS
FE GE
VL HG
VI VR
SS SN
IT AA
ID KL
F IDE
AT AQ
AD VN
VR VG
RL VK
RL KN
QL ES
P MN
F ADL
W FY
RV SS
NL KR
I VDE
AT TP
AK RE
Y HY
VE DG
RG SS
LL IP
FE TL
DV FS
DT AG
VI EK
RL AM
RG FG
QV AE
PG TS
K SLS
IT AE
AI KN
AI GS
IL AF
EL PS
AG DP
VL QS
VK SE
RL TS
IE VV
AK AQ
RV AL
RE RR
QI EE
PL RR
PG TT
KI QE
K ATG
GL YE
GGG TFD
C HF
IL PD
IK AM
F ATL
ANGAS MFF
AG ES
H ALL
EL IQ
D PD
AR AE
YD PN
M HN
LLFL HETGSNN
K N</w>
C YY
AV DS
AS VD
QI AE
KI KL
GG GE
FL EK
VL SR
TG IG
N HR
GG GL
DL QE
PL AN
KE GS
IE AR
GG VF
F AGG
AI KR
AG FS
AD AP
AA SP
AA IS
AA IN
VG MG
VE AR
SV SS
Q LLD
IR AV
DL VQ
AI DE
VL HL
VK VN
QL GE
NI AK
MR GL
DD VI
AI KD
TS TG
TI VV
Q G</w>
IL EN
FL QS
DD ID
AV IT
AA VI
AA ML
RI TG
KK VV
K ASS
DL SP
AG EI
TV RL
T ALE
KK KL
DG IG
AS RD
APE S
VT GL
SS DD
R S</w>
DI AG
AL ET
VL RN
VL NN
K AVE
NG SL
KV AR
II FL
DL RP
VL IN
VK GL
RE DG
RD RL
PG TG
PG AV
NL EG
ME KL
LL KY
KK KG
IWGGFS VDK
AV DE
AM GG
AA SD
AA KN
RD SG
QL ED
QE KE
TL IT
TL DS
RK RL
PR GL
PQ GG
KK AR
KD IK
IL IT
IKPE WYFLFAYAILRSIPNKL
ID AR
FS EE
AG PS
VV FL
RG IS
RE GE
C CP
AL QK
VE QG
TL DL
PL PD
KI AV
FE AR
AS TP
AL DK
TR PE
KQ LL
IL TP
FLAM HY
AP VP
W KH
T EEG
I SLS
GG AR
FE TF
AYL E
YL VN
TD VV
KR IR
IS PD
I AQG
GG KL
EL TP
AP VI
Q VLE
N WS
MV KE
DE GE
AI RQ
YL ES
YL EK
VL QQ
QE TT
PL GE
M HI
KT AV
KE IR
FE DL
AF AQ
AA DE
RG ER
II AQ
DL IP
AG IN
AA KP
KK VG
IQ QL
FL KG
AS VP
AI ED
AD EG
VT VN
TT P
SG EL
RP GL
QE HS
M GGG
IQ RL
I DLS
AK VG
A H</w>
YL PE
QI AR
PV VL
KI VN
H VAA
AI IN
AA DD
TV AN
TI PG
RG RE
Q AKK
PV AV
EE ES
DS VT
AA ED
T KLG
SS SSS
QL IN
II KL
DL AI
AQ KR
AP AL
R AEE
NG KT
LL NP
KI SG
K VVE
IK SS
EL EQ
AE IR
VK NG
V RGE
TV AQ
TL KG
T AKK
Q AGL
KE TE
IE KN
F CP
F APS
DL IF
DG KP
AL QN
VS RS
KG RL
IT AT
IG KD
GL NP
FG KG
AT ES
VV VN
SE AE
LL ML
H MT
AL NN
VP AE
T L</w>
Q RLG
PE VI
NL GS
NL ES
ID SL
FL FG
AN IG
AD SS
PPPP PP
GD VV
AT RL
SG PS
QL PL
ID TS
F MF
N HY
KK AV
IT SL
IS DL
VNG E
VD TS
RE IE
M WE
LL IK
I ILS
D IR
C YN
AT IN
AE YL
VV DD
TL EQ
R WD
R SLS
M ASG
KK VS
IG NG
FL TE
Y HI
IS PS
FG VE
AI EG
VL IL
VD VT
N CP
GG VLAL
EL SK
EL AP
DG RG
AS VI
QL KN
NL RL
AI KS
VSG E
V ILS
SL QQ
QL EQ
FG SG
DE IR
NG KE
KI TL
IV PE
IL QQ
H VPL
AA TP
TI KE
IG VS
FR VT
ATLTRFF AFHF
W NR
RT PL
NL DP
IL NN
F ARL
AT PS
AT AD
VV FD
VD VE
SS TP
Q GLG
NL GE
LL SQ
KV AP
KI RR
AI PG
R AVG
IL QS
FS IG
EE AQ
AT KE
AN FL
VG DS
TL PP
SS PL
SG KP
SE KE
R KLG
P HD
NE IK
I APG
H ADY
GG N
EL VP
AP DG
VL KQ
T APG
KL VI
IN NN
GG NG
FL RG
FL AD
W PT
VL HS
KS KG
IS ED
AL KP
YE CL
KV SS
IG VL
AA SK
T A</w>
VG VS
RT AR
M AAS
KL SK
IE VP
C GGG
AI FG
YL AQ
Y HK
VD VD
TE VE
QG SG
KR PL
K RLS
IG TL
I TLE
GL NG
AT AN
SL ET
QL KG
IP TL
I ARG
AP TT
VT DL
VR AL
TK VT
T SGS
SG TP
RL ST
I FLG
I ALS
F AKL
EE IT
AS DG
AI VS
TV AK
TL TN
TI AV
QL QS
LL DF
II VS
DI AR
DE AD
AT KG
AA QE
SL YL
QL AL
NL GG
KT PL
EL AY
DE KL
DE DS
AG KR
W PI
VI VN
V AQG
RL PS
AK AD
VR VR
T SLP
SL NG
NL FE
FL NE
D VG
AS AF
W QY
VS FD
P YN
M AIL
IG IS
DI KE
Y LLG
VL TR
TL FE
RD AL
IG VE
FS KE
AV SP
V AML
T ADG
NL SP
MK IG
IF SS
QL NE
EE TE
AS DS
RG DD
QL RL
IT SD
FY TG
VR N
NL GL
FS TS
FG VT
R ELS
IP PP
II IS
ID TG
FS AT
AV IR
AL TF
YE VS
W YF
W FK
VV RT
VI KN
SL QG
ML EE
IL YG
II VT
II QE
EL SR
AQ RR
AI TS
VF TT
PS EE
NL IP
KG IK
K ILS
FL AK
VQ TL
VG RR
ML SS
ML KE
KL AV
IT PL
IE AK
H VLL
AL FT
VV TD
VK GG
RV AQ
PS VE
N HQ
ID SS
C TM
APS E
AD VR
W QH
VS RD
MAT AF
IN KE
GL RF
AA PP
AA EQ
RL PG
QV AR
PL AP
II TL
I APE
FL DG
EL GS
DL FD
AV ED
AR RS
YL KG
TI AK
I GLS
C FV
VV IT
VF SE
LL EI
VV IS
TT VT
Q AKL
PS KK
PE AP
NR AP
DT IE
AF AK
RE AL
NS KL
NL DD
N CI
KT AE
YL P
VK VL
N VAA
N SSS
N GLG
K ATL
HL AR
AI IL
VK VD
MP VG
HL PE
GG RR
DV TG
DI IE
C HD
AI Y
AI NE
VK AE
TS TP
R DLG
H MN
FG SS
DE F
AQ GS
TK DG
SV AV
RG QR
N CD
IS VP
AL SQ
AL SN
YL TE
PL AK
M AKG
KD AV
GG FL
EE VL
DE IT
D KG
VK AK
TV FE
AEE S
VPE S
NL PP
II AN
IG AR
GL SI
GG SD
VS FS
SS DG
RL YD
RE KG
PPG PPG
PE PG
LL IT
AT TL
VS RG
VS IT
VG DL
TE TE
PL RS
PL IE
F LLS
AQ VI
YL ED
YD KL
WNFGS LLG
VT AK
V R</w>
TI IN
SL QR
SL AY
SG TL
QE AE
FL KL
AM ED
AD ES
VS TP
PI IE
NL IL
DI KK
AG RD
VT ID
TI KK
TI AE
RG VI
QV AD
Q VLG
IS ID
TV NG
RQ RL
K VEG
IS NS
VT IN
TG AR
VN AA
VD DE
TR AG
PS TL
ML P
M GLG
K EEG
IP AE
I VLE
C FC
AV TD
VV KN
VL TQ
VI NR
TG AS
SS IN
SG ES
GL RN
GG RF
AV KS
T VKG
SL FE
NL VG
K DLE
IF VN
EE VI
DD EL
W FR
VD GS
SS AG
RD EL
D RR
D QL
VS AR
VP FS
SL YD
SL RP
Q WD
NL VN
N AAG
M VLG
IK QL
DR VV
DL FN
AT RE
AT EL
RV KL
PL TR
KV AI
VL IF
SS PE
PP TP
K TLS
IK GG
IG AK
I IDE
AN PE
AD FE
YL AG
VS ID
KI IR
K TLE
K AKG
II KG
IG KE
HL AA
DE HG
AT IS
AT EG
W PV
VD VN
TV KK
TE R
T VLE
RS KS
NL DS
NL DL
M CD
K VLD
IP AS
IK PD
AT RR
AD SE
TQ EE
Q ALS
N ARL
KV AG
KL PL
IL KT
IK NS
AQ VG
VQ AI
RD VE
RD SE
PG AR
IS DG
FL PL
AT AP
VT TL
SG EG
QI NG
KS TT
GL ES
AM AA
RR GD
RR AK
I ADE
EE QL
AV FG
VD ED
TV FS
TD SE
SG FG
N HK
LL SF
IS DD
IE KT
F HI
AN RL
AKK S
YL SD
YL RS
TG ES
RE KK
QG RL
PE IL
NI IN
LL VF
KL TS
TE GE
KG EE
H WD
AR SR
AN IL
AI DD
VP GL
Q KLG
PE VR
K AFL
IE KQ
FL RL
DS VV
AN AS
AL YG
AL NS
W SV
T AIG
LL TK
IV AF
FE DG
DL SR
VS RE
VAD E
RL RQ
PS AV
P HT
NL KQ
KE HL
D AM
AE KR
AD ED
NG DT
IS KD
DL TR
DL RG
DK VV
DK VK
DG R
AK SD
W DV
VL EQ
RT REG
QK QR
IF GL
ID SE
ID PD
D VF
AI RG
AE HL
AA FS
YL SG
W VC
W PR
VTG S
TL EL
T WD
PP EE
V KLE
TN KY
PK HL
IR DG
GG VP
VT TS
VS DS
KG FG
K ASG
AV TP
AG H
AA PL
V S</w>
SL AH
R FLS
M ATG
KV KL
GG DL
FL GE
AL VH
AA RK
TS AK
TL SK
RL Y
PL AD
KE KD
IG IP
ID AE
EL AH
AF VE
RI GL
KE AK
IK DS
II RG
II AL
DGTT TAT
AN AI
TE SG
SS VL
Q ADL
FL AL
AV VI
VN KK
T AQL
T AAS
RI AS
NL FG
LL HS
KR IS
IWGGFSVDK ATLTRFFAFHF
II VE
GL H
AV Y
AV RS
AF AS
VV KS
TY GG
NL VP
KE N
FI FG
DV AR
AP VE
VN SP
VG EL
SR KS
RP VV
Q CP
IL GD
FD AA
VD DG
NL NS
KD SG
ITVP AY
F CR
AV RT
VV TL
SS AQ
QR QR
PS AVG
LL FP
KV AL
ID IE
FL RN
FL QR
DL FS
DE SG
TK EL
T PLG
SS RL
SE EL
NG VL
KL NS
KE VK
AP VL
AA TR
SL VQ
IL PF
IK AD
HL AE
GL IE
CG SG
AI TE
VT AE
SL SR
RI AF
PG EG
KK KR
KD GL
FG EK
AA EI
VP DG
TI IG
M DLE
KK IR
IK PS
FL QE
AV TS
AV KD
AS RE
VV KR
TG AT
SL KQ
RL NE
QL DS
NL VQ
NI AR
K WR
EE EI
AS ED
RG TF
NT VL
IT AL
IS IG
AI VK
TK AG
R VLS
LL CL
IR KG
IP VL
IL C
YE KL
QT PL
QI RE
K SSS
DS VS
DL YL
AS YL
AA VP
AA H
RL QD
QL AN
NS TS
N LLD
IN SP
GL TD
FL TS
AV VQ
ANG S
W WG
VT KE
QP QQ
PL DP
F WQ
AL ATL
AD IN
VI TD
VD PS
QE IE
PG KG
IS KG
DG AR
AQ KL
AM EK
VN VG
TT SG
T AFL
SS EL
SL DP
NL KT
DG KG
AF EE
YL NS
YL AD
VV ED
VS AP
TP FF
RV TL
QL QR
Q K</w>
N ILG
IL EL
GL RP
GL IF
GL EK
FS KL
FL Y
DL GS
D VK
VS DD
RV AI
RL H
QE ML
KI VR
YL KR
VI KD
P GLG
LL PR
I VLD
FR TT
FE DS
DE PL
TV PS
TGE RP
T APL
NL IT
N AIL
IR KS
AV DD
AP TS
VN SS
T GLS
ML KL
IQ VE
I AMG
DK IP
DE KE
AS PD
AL TH
YG SD
VP KE
VK IP
TI RR
QG KL
PL IR
HL GD
GG AQ
EE SE
DG RR
VI AL
SS FL
RL EK
QL PP
KT IT
K CR
I VLS
SS IT
RL SR
M VVG
K RLE
IE DD
DN VG
AQ SL
AE NE
VS IS
TL DP
PV SS
PH IKPEWYFLFAYAILRSIPNKL
FG ID
F LLD
DV AD
AE SG
VK PS
TL IN
QV RE
QL TD
NL QR
NL NG
KL SP
IR NE
FD PE
F AAE
AS ID
AAL AA
YL ID
Y MK
VR PE
TT VV
QL PE
NG LL
HL RE
DL TT
AV DG
AS H
AP IG
TP P
SL EQ
II Y
I VVE
FG FD
AR KG
YL GE
SL SF
Q AAS
P LLG
NG AG
KL IL
VT ES
VR VE
VI EQ
TV PG
KE KN
IK AN
FS KK
DE AK
D KD
AI RS
R KLE
ML RE
KG ND
GL DF
FR PG
FL QG
D AN
AS DD
VR Y
TV LL
Q GGG
PL SD
KE TG
K VIE
GL IY
FD RL
EE AG
AF GS
VL FF
LLL RR
K AVG
K ANL
IG RL
I SLE
FS PE
VE IP
TL Y
QL KD
KT LL
IK TG
AP TR
AN SG
AL MG
AD FG
T VDG
N MV
IT AG
FL PE
VT KG
VR LL
TL QS
IV AQ
FT KD
F ADG
AE NG
VI TE
VI IS
VG PG
VE IT
N WE
ML DG
LL DQ
KN KD
KL ND
ED VE
AE RQ
AA FD
YL AR
VT EL
VS RR
VS KS
V GLP
RE IT
R AEG
PS AT
NL VD
FS PD
D AF
AG TY
TI IE
RS PL
NP EL
KT KS
I AAS
EEEE EEEE
AL SF
VS VL
VS PG
VAD S
SS VV
PS GG
IK VG
I ELE
AL ND
AL DN
AI QE
VLPWGQMSFWGATVITNLLSAIPY IG
TP DE
RI GG
Q WS
PL KL
KK EK
K AEG
IV AK
IE IL
DL IQ
V QEG
TK SG
M WG
I ALE
DS QR
C GLP
AV AF
VE AF
VD AD
V AIE
TL QQ
SG EE
RP DL
RE AE
R AFL
PD P
KR VR
KP VV
GL FD
FG AE
DL EN
AP RL
AG QG
Y HQ
VP PE
TV KL
SL NP
RK IS
RG KL
NL AN
KE QG
H LLE
FL IS
AR P
VT IE
VL VF
IT KK
FG SD
YL AT
YG EE
VPG S
SL SN
SL ND
R ARE
Q ILG
NS NN
KT IK
GL AN
FL ED
EL YE
D TG
D TE
AP KP
W YP
VS VR
VL RT
VI VK
VD IR
TT AT
SE DD
RI IP
R L</w>
LL RH
IT IN
IT ED
F MH
AL HG
VR AS
VI NE
VE PE
RF VQ
KS QL
II IN
FS DD
AV QG
AF SP
VG YG
Q CQ
M AKE
K VLS
IE P
FE KG
EE AS
AR ET
AK EI
AI AY
VV DP
VE FD
HL EG
FE RL
EL TT
TS LL
TL PD
TL HP
SL SQ
R APL
KV LL
ID KK
FL AS
K I</w>
DE NE
AY RR
SV AE
RI VN
PS RS
KI AS
II PL
AV NE
AI TD
VY DL
PE AE
KI AN
IE Y
GG TT
DG SD
VL DK
QL TG
AR H
AI EQ
AG RE
SL TN
RP AP
QL SP
NL IR
IQ KL
W NI
VK PL
TL QG
Q CR
KK KN
IS AM
GL VF
DD VV
CG KG
VP KG
VP DS
VE AS
V QLG
TI AS
TD VL
SL IF
RV FE
QL KR
ME RL
KL AT
IL SK
I NLG
FGG AGVG
C CF
AP LL
AI FL
AA AH
VD IP
TT SD
M VIG
M HK
IN PL
IG KL
GL ET
YL QE
RV AF
QV SS
Q ASS
N MR
KE FE
DL R
DI AK
AM EG
TL ER
SL YG
RS AS
QV AT
PP PE
NL ND
NL DG
II PG
DE VD
DE P
AI VL
AG VN
TP GL
Q SLG
PSAVG YQP
ND EE
MV NG
MGG MGG
K GGG
IE VT
ID AT
FD PS
AA PE
QR RL
QL RG
NL YG
KI IP
IG VP
IG SS
IG IT
DL NG
VAN S
TG Y
SI EE
RV PE
QL GS
IS NN
DL DK
AL ST
AA RN
YL GS
VE GD
TL FS
TGL FLAMHY
T ELE
NK VD
LL DN
II FS
FD EL
AG ED
VS IP
N MY
N ELG
KG IE
DD IL
AL RF
AA QS
TL IP
RT TG
R ILD
IS Y
IR TG
AR Y
AL YE
VD AS
SL NN
PL TD
NE TL
KY VP
KK SE
KD KL
K VDG
IR KE
H CR
GL EN
AQ GG
ANGASMFF ICL
AI TT
AH PD
AAAA AAAA
YD IE
VSS E
VN EG
VL RP
RK QL
RE RD
QR AQ
AV FL
AI EN
AA NS
QG VV
Q EEE
PGD VF
NK KL
KK IN
IT IT
IG NL
EL RP
C YF
TL KT
SL QP
Q QLE
KE ID
K WD
I ARE
DI RR
C YK
QV AK
N APG
KE Y
DE SE
AP RR
AL TN
AF VL
YE AR
VD SG
ML SP
KG SS
GL TF
AL KY
W TH
VK AD
VF RS
SG DP
RT SS
GG TG
C YQ
NP AT
N ADL
KS Y
KE NE
IS KN
IE IP
I VSE
I REG
FP DG
AE VY
VP TT
VP IS
VN ID
RR KS
RL VT
NL VV
KR IL
KL VR
KE AR
DV AG
AV VN
AV IN
AL ME
W AC
T ARG
RG RS
KS KN
IS RD
F AEG
DV AA
DS AR
AQ VL
AE IK
RN GG
QD W
NS FL
ML TG
KP VI
FT AN
VGE RTREG
TL IL
TK AV
QL RQ
Q WR
NL ER
NI ID
N MQ
KI VS
KI RL
KI FS
FI SS
EE NE
DE AT
VD HP
V YLG
TI FS
R PEG
PS RL
NL VT
NG TS
ID ED
GL KT
DE AS
AM AR
AL AAL
AA RY
VR AD
VI RS
SL IQ
RQ RR
QE GE
N CT
KV RG
I KKE
FD KL
DI VE
C AW
R EEE
NI IK
KV SL
IL IE
IG PG
IG EL
H SLG
GL F
FV AR
D VN
AA KS
VV GD
NG HL
M YI
IE KR
DT SG
DL VF
DK AR
AV TF
VL SI
PE GS
KK AE
FL PD
EL DP
VI VS
TR DG
TI KG
TE VG
TE KG
T KKG
SG RS
RV FG
PL PT
KK VE
IR P
ID GS
GL ST
DL PD
VL QR
V I</w>
RF GG
NL PG
KV RL
I AGS
EE DD
DE TL
AP SL
YL PS
VT AT
VI KR
RV QG
Q AGG
IE AN
IDE AR
H WE
AQ PG
AD AK
VG AE
SG AI
RF TT
R TLG
LL YS
H AVL
NL RD
NL AD
KP SS
IS RE
IL ID
II EL
H CP
EE ER
DL RN
AM AT
VT PD
VP TS
VI TL
VI PP
VG AN
M ANL
I GLD
D RG
TG RL
RQ AG
RK RG
Q ELE
KKP NS
KG KS
IT PD
FE VV
DS P
DI VV
AD SD
TI AN
TF SS
SL TQ
IQ P
IL ST
II PS
IG ED
EL RN
AK GS
YL QQ
VT VL
RE VV
QI KE
KL DG
KK PL
K AQL
IK VE
DG NG
AL RAL
AL DY
AF YG
YL TS
VG AP
TN SS
RR GS
RR GG
RL TD
PG VS
IT SE
IR TP
IL FF
AI GD
AA KT
VG ED
TL VN
RI SS
KD VK
IV FL
H LLS
FV SS
AI VN
AG DS
AA AAG
W FQ
VP FL
VI TT
TPGHVD FT
TL ST
SG SQ
QMNE PPG
QG IT
C CR
AN EG
SL RT
RLL ER
RL FE
R SLE
LL IF
KK VK
DE IN
D VY
C NV
AG RP
AD PS
AD F
W PF
RG VS
PV TG
NL Y
FG ES
DL DP
DG KS
AT NP
TR NG
R EEG
K SSG
IL SN
IL FG
IK AT
GL PI
ET AE
EE FE
DE SD
AV GD
RS AR
RL ER
RG VE
M LLE
LL SV
KS TG
EL NS
AD AN
AA KQ
YL QS
R AIG
QV KK
QM DG
KE SE
KD PS
IS VD
I SGE
GL NE
AS Y
W KM
PL HL
IG VK
DG VF
D YE
D IN
C MP
AV TR
AR YL
W NQ
VV TT
TL KQ
PG YG
LL II
IE DS
I ANG
AV TT
A F</w>
VS QG
VE RS
VE AK
TP AQ
TG IT
QL TL
KR KS
IG FD
AA RP
YY GG
TS DS
T KKE
T DLG
IK VT
IK AK
ID FS
I ASE
AY ED
AS AN
AP RG
V AYG
TV GL
PS FS
NL AL
IDE ID
DV KK
DE AY
AH VD
YL NE
VS KT
SE SD
QL QL
PS AE
KK YG
IL NP
IG QS
IE RF
GG VM
GG SP
C VW
AP VT
VF EE
VE RE
VD AE
TL RQ
SL TF
RT RR
RL IR
NL ED
M SSS
KE VT
KD AI
GL EI
FG AT
AS F
AF EG
AD TT
VL SN
T PLE
SS PP
RI IS
QI KK
QE AQ
PL SQ
KP FL
IP P
I AFG
AS KT
AA HL
VAS E
RD FL
QV AV
M AGG
DI VI
AA RF
RL NG
RL FS
RAR S
KL DS
KE KS
IN PD
FT GG
DL F
VP AV
RL PP
RD P
N SLS
HIC RDVNYG
AS PP
AG FL
VV TP
VV AQ
VQ EL
VF TD
TR VY
TR EE
SG SP
N AKL
MS KE
ML AE
KL QQ
K NLG
IS TD
IG SL
APG FGD
AL TQ
YL PG
YG EG
VR QS
VL FD
VF DE
TL IK
TF SE
RT RE
K QLG
DT VI
AR FS
AP VG
AN AR
SL KP
RT LL
IV QG
IF DS
IAD S
FL ND
DG VD
AWL G
AS AD
AN IN
AI QL
R AVE
PE IS
IP VY
IL VK
DL VY
YG SG
VS PD
SS NG
RL RK
R AIE
IE NE
DD LL
C RV
AL IQ
AE KI
TL QL
QV FS
Q AFL
NG KG
N ASS
KR AK
KG IT
KE VL
K QLE
F RLG
AS IN
AP TG
AI TP
Y WS
VE TE
TF EE
ML AG
M MV
IL QR
ER MG
YL DD
TG ID
SL FN
RL AF
RF ID
PR GG
PE SP
KL ME
KL GD
KI VI
KD P
DL KP
AN PD
TS DT
RY IE
PL VK
IGE PG
FP EE
EE VG
DV AV
DG VR
AG NL
AD FS
VV AF
VS HL
TV VL
TI ID
T IDG
T FLG
T AKG
II NL
FG FR
AG AM
AA NL
VV IN
VD LL
TS KG
TE DS
SS NE
SL HL
Q ASG
PE SS
IL PP
IK SE
FL TD
DL QD
W AW
RK EL
PG VR
PE NF
KK SG
IT NE
IG GL
IF SG
D FG
AR AF
AF SD
VQ VI
TP LL
RI FG
PL SR
PE Y
KR VK
IS FS
IQ EE
DL HL
DG ID
AT VR
VV SL
VIS ITDG
T ANL
PE PP
IT VT
EE EEE
AD IR
Q AQL
KK SS
IT RE
FT SG
AN KE
AF SE
VR AE
VN PE
TL QD
TL EN
SL SI
RR AA
PL AQ
IT QH
IL KP
IE AQ
GL RT
AVD PL
VS VI
VP AT
VF PD
RF RG
QV EE
NI LL
IT KD
IT EG
IK NN
H SSG
D VP
D TD
AY AD
VK VW
VD RD
TP GG
TL FD
R KEG
IY SG
IL VE
FHPYY TIKD
VD IE
TI PS
T VKS
RL QL
QE LL
PPG S
KI VQ
IK AR
FL AQ
DL QS
AI KL
AG KD
VV SD
VT RE
VI SP
VG DD
T VIE
RL DS
ML AR
MG QK
KT VN
KG KR
IV PS
IG TG
FE QL
DR DG
DE TT
AG QE
YL AV
VG RD
TL FL
T AKE
SE SS
M LLS
IR QL
IL QL
GG IS
FG EE
AS QS
YL DE
W QI
VS ALLG
VD PG
TS PS
RR VR
RL TT
R VVD
IP VP
IE SS
I GLE
I AHG
AP AD
AE QG
AD EI
W YN
W PQ
W PN
VV ER
VI VI
TE KK
R VLD
NL QS
NG VT
M AFL
KT SS
KN KL
KE KY
II FF
IG VT
DS PL
AY AT
AP KE
VD VR
I SGS
HP EL
AT QL
AA VAA
VD AR
T DGS
SL ER
PL FE
NI EE
K AIG
IT VE
IK NE
DV LL
VG SD
VG KR
VD VP
RL KQ
RE RG
R GLS
NG VS
IS AR
IG AL
I YEG
FV AK
FR KL
FG DY
EE GG
DG VL
DG EL
AK RP
AE TT
VN KL
VG NL
TW NIG
TR AE
TK AE
RL IQ
PG AT
IT AR
FD IE
AV RQ
AT QG
AS QE
AD DE
VT VH
VP AR
TL VF
TE IK
T DLE
P APS
KV GG
KN KN
GL QL
GL QE
FL GD
FG Y
AM KK
W CE
VH LLFLHETGSNN
PG RR
PG AG
KK VR
KI Y
KE SS
IT DS
IS VV
HR SGE
DV AE
DL QG
D NG
VV AN
VT PS
VS AN
VEW IWGGFSVDKATLTRFFAFHF
TD PE
RE RS
R TLE
QF MD
Q ARG
PL AI
M AQL
GL FF
DG TG
AD II
VL DT
VK VF
SG Y
ML KG
KK EL
AT FS
AK RF
AK AY
AA FE
VV VL
RG TG
NL N
M ALS
KL EN
KE QL
FL ET
AG KTT
SL QN
PE DE
KF SS
ID PS
AV EQ
W RI
W KW
T TLS
RV AD
RL RN
RL QR
QS QS
PK VL
LL TF
KN P
IN SS
FK DD
DG VE
DE IS
AT KL
VE KT
TK AQ
RE RF
N ALS
KV FE
IV KL
IR AL
ID VL
EI AR
VK PD
VI AI
ST SG
PG EE
MT EE
ML SG
ID VS
ID FD
HL EE
DV AF
C HI
TG IP
RE VL
RE VI
RE KS
KI AP
C CY
RP IS
QS VV
PL KS
NV SS
KR AA
IV FG
GL HQ
FS KD
FL SP
DV SG
AR EK
AQ QQ
VAT E
NN IN
M SSG
IK SD
AQ VK
VS F
T ANG
R ADG
PS IS
PE TS
NP DL
N AEL
KI AL
ID DE
HL VE
H MK
FL KQ
AI RN
AG TF
VK IN
VEE S
TG TL
TG DS
TG DL
SL DN
RP DG
QL TS
KV QE
KK KY
IQ AL
GL SR
DI IN
C HN
AA VF
Y CR
VR PD
VL QD
SE VE
RVK DLPG
PR PP
PL EL
PKG RN
D RD
C YI
AT PL
AN VI
AA TQ
VT LL
TK TT
RG AR
NP KG
IG DS
I SLP
FS DL
D NS
W MP
VV NG
VR YH
VG LL
VG AR
RL IK
RK SG
R FLG
PG ES
N LLS
KE IN
IR SG
IL H
IE VK
IE GS
GG DS
AK SL
VQ AL
TQ HS
T PLS
SI SS
RG EL
PV GL
MS KR
KN VL
KL VL
K V</w>
IL VI
ID IN
F ASS
D KR
AG VF
VT SE
VS QE
VQ PG
SSSS SS
RQ PE
R RLE
KD KD
H VAL
GG AF
AWL E
VE P
TV FL
TG ED
RR P
RK EE
R AKG
NL PD
NL NN
KT RR
HGGG EG
FD EE
AQ IR
AE RD
VD AF
TK AT
SL YE
PV EE
NL VF
IE PL
H MR
DL VP
D AY
AV AP
VD PD
TL VI
N GGG
IT VG
HL KK
FL QQ
AR EE
AE VF
AD PD
TE RG
RL SL
RG GL
QL VT
Q MM
KH KS
IT LL
IP KE
IL QN
HKL GE
FG EG
F K</w>
VS VN
VR KG
VE IK
Q VVE
LL AM
FI EE
FD QL
AE NN
YL RR
W YT
VQ AG
TP RE
T VEG
RL KT
RG YG
QP AA
PG AF
NL VL
NL EL
N AEE
FS DE
DD VL
D ML
TR TF
TM AE
TI AQ
RR VS
RK AG
RHG NKG
R RLS
IP TS
GL QR
VT PP
TI IS
TE SS
SL DR
RL EL
PL ED
KN GG
ID IS
H AGL
DI SS
AS DP
AG YG
VV VR
VP AQ
VG AK
TD AA
QI ID
NV EE
M AVG
KE AF
IL ER
IG DD
C HY
AE DS
VK VR
VD DD
TP VV
RG YE
Q ILE
PR EL
ME VW
KV FG
HL DG
GG RS
AE PE
YL KT
VPL S
VD TD
TE LL
N L</w>
FL FS
F AEE
AN GG
YR EL
VK Y
Q AEG
DG KD
D KT
C IM
VK SS
SV LL
NI AE
MS EE
KS VS
KK ID
IE KF
I AVS
FG TG
DI FG
YG SS
VI ER
QS AV
QG VP
PG SD
NL QE
N SLG
KN VV
KK TG
IE NS
HG SG
H ADL
FDG DQ
AML S
VY GG
VQ P
VI EN
VI ED
VG AY
VF SS
VAE S
TL ET
SV AS
RL IL
QL KL
PE SE
NL TT
IK VD
HL TE
GL FE
VI SR
TV FG
TF FE
RI INE
QE RE
PG AP
ML AK
IV FS
IL SR
IG Y
F ANL
AR ER
AM SS
AF AL
VV KD
VH PD
TL AH
QV AS
Q CD
PS LL
PL DG
PE AL
NN SS
IT IP
IT EK
ID SG
FD P
DEG KG
AI AH
TP VF
T GLD
QN VN
Q VLD
NL EN
II YD
ID VE
F WN
DE KN
AV KL
Y MR
VS AD
T ELS
RQ QL
RL GD
ML AS
KK K</w>
IR QG
ID FE
I TGE
GL PT
AA M
VQ QL
VP QE
VG KD
V G</w>
TL ND
SG ST
SE KL
RK VL
PL NE
NG IP
IT AN
IS EN
IE AT
I ALD
H TLG
DL AF
AR AD
ACL G
VK DS
VI RD
T VIS
QV GG
PS TE
MS RR
KI AG
IT TD
FD RY
EL DK
AE AT
VG RS
VE RF
TS VL
RLE RE
RK TG
RI IN
R VVE
QG AG
N AKK
IG YG
HNL QEHS
FT VT
AQ SG
AP VN
AA DP
VD FE
QK KL
ML RR
MG IP
K IDG
IS VL
H AAG
EL ME
AR FE
AG YE
AE Y
YL TG
VV PL
RL NS
QL SR
PQ SE
NN P
LL VM
KV FS
IID S
GG SL
F SSS
D IF
AA ET
VP TP
T VVE
RV AS
RD NG
PG Y
N TLG
N CR
KG ES
ID RR
ID FG
AV QE
AK VK
AK SS
AA YG
W CP
VS VK
TS VS
RK IG
MV GG
IK Y
FV KE
FD GL
DS VI
D TT
VP SL
VN SE
VAT S
TL VK
RI AT
RG AV
RE IS
QL DD
Q WF
NL VS
KP VT
KL IR
KAI S
K DGE
FG AG
AP IH
A T</w>
W HR
VE DD
TL YS
RE QG
RE DL
PE RR
IL EQ
ID VV
F GLS
DI EE
AV FE
VE PL
TT SP
TD AE
NP EE
NL FD
ML DE
KV FL
KK GG
KI RG
IS TE
IE RR
DE QG
AL MD
VT RR
SG SL
RL PD
QL PG
NI IG
LLGD PDNY
KV EL
AK IK
VQG S
VG TS
V KLS
RV EL
NL TR
NI PL
IP ML
IL RN
DL ST
DD IT
YG KD
V D</w>
TG SE
TG AE
T TGE
QL PS
I ILE
DL QR
AF KD
AE QQ
YL EL
VR AI
VN VV
VK TE
TP VD
T ILE
M WS
M ALE
KV AQ
KE IP
IL TF
IF EE
I AHL
VK GS
TL RN
TE AQ
PV PS
ND DG
K SGS
IL QD
IE KY
ID YD
IAE S
YL DS
VN DL
TP ANPL
SG AT
RI AG
RE FG
PG VF
PG PP
LL HE
C TV
W LLL
RN RR
RL VP
PS VF
PS KP
ML KK
M GLE
K PLG
FT PE
DE KS
AY AE
AF SL
AF RE
AE RY
VT IP
VT FE
VL QF
RL VQ
QQQQQQQQ QQQQQQQQ
QL DL
LL NR
KL YD
KD EE
IT AV
IQ SG
DG VM
AYG E
AS RP
YL AN
VL YG
RI AD
RG KK
R GGS
QE AL
PP Y
NK IK
NI KE
HP VLL
FS FL
FS DS
EE KS
W QF
TL VQ
PS ES
PG KY
K APL
DG TF
DG AS
VI FG
RV AT
RH RH
RG VN
RG AS
NP SS
NK VI
NE KK
KT SL
IL AH
IE H
I ANS
FE VT
DI IT
AT HG
AL SI
VN TG
VISITDG QI
TS RS
TE AT
T VLD
NG Y
K D</w>
FS RS
FL EL
AV AH
AK VR
SL FP
R ITS
R AAS
QV AN
PV AP
II VV
DG RT
DD AR
AV RN
AR IS
VQ VF
TP AS
RL ND
QL NS
PG DN
NV KK
KG VE
IL SF
I RRG
FS VG
FS HL
FI SG
VN P
RF FL
R KLS
R GLE
QS GG
PL H
N SSG
MD IK
KV PE
IQ AA
GL DK
FR KG
C IV
VLG D
VL SQ
VL NP
VG EP
VE KP
VD GG
TS DE
RL FL
RH RR
DG IT
YS LL
VI PG
TR P
TG AN
NK PE
NG TF
M RLG
KP TT
KL EL
IR TT
FL PS
DL SQ
AV QQ
AI VR
A Y</w>
W TW
TS DD
TG LL
QL YE
NK IL
IR KQ
FS AG
FR EL
DI IS
AS DE
YS VS
W HF
VV PP
VD IS
TR IG
TD DE
SE KG
RY KG
RK VR
RG SD
NP SG
NG RR
KV AT
IK YL
IK IT
FS DG
D IY
AQ P
AE FL
W FV
VK IE
Q AIE
LL QT
K AGE
I TLS
H MY
GL KQ
F EEG
DL AY
AV QS
VI SL
TI TL
TG AD
TD TE
RL RT
QL VV
IK AL
GL DY
AF EK
VN SG
VE NS
VE AY
VD DS
RT HL
QG EG
PS YS
NTP PHIKPEWYFLFAYAILRSIPNKL
NG IL
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """
from transformers.configuration_utils import PretrainedConfig
# Map from pretrained-model shortcut name to the location of its config.json.
# Used when resolving configurations by name; the "BertAffinity" entry points
# at this project's local config file rather than the HuggingFace hub.
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
    "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
    "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
    "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
    "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
    "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
    "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
    "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
    "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
    "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
    "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
    "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
    "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
    "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
    "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
    "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
    "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
    "BertAffinity": "./config/config.json"
    # See all BERT models at https://huggingface.co/models?filter=bert
}
class BertConfig(PretrainedConfig):
    r"""Configuration class for :class:`~transformers.BertModel` and
    :class:`~transformers.TFBertModel`.

    Instantiating this configuration with the defaults yields an architecture
    similar to the BERT `bert-base-uncased
    <https://huggingface.co/bert-base-uncased>`__ checkpoint. Configuration
    objects inherit from :class:`~transformers.PretrainedConfig`, which
    provides generic loading/saving and output-control behaviour.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size: number of distinct tokens representable by the
            :obj:`inputs_ids` passed to the model.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads per encoder layer.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the feed-forward ("intermediate") layer.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            Non-linear activation in the encoder and pooler. If a string, one
            of :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu_new"`.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            Maximum sequence length the model supports.
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            Vocabulary size of the :obj:`token_type_ids`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            Stddev of the truncated-normal initializer for weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            Epsilon used by the layer-normalization layers.
        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense
            of a slower backward pass.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            One of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`. See `Shaw et al.
            <https://arxiv.org/abs/1803.02155>`__ and `Huang et al.
            <https://arxiv.org/abs/2009.13658>`__ for the relative variants.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether the model returns the last key/value attentions; only
            relevant when ``config.is_decoder=True``.

    Examples::

        >>> from transformers import BertModel, BertConfig
        >>> configuration = BertConfig()   # bert-base-uncased style defaults
        >>> model = BertModel(configuration)
        >>> configuration = model.config
    """
    # Registry key that AutoConfig/AutoModel use to select this class.
    model_type = "bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs
    ):
        # pad_token_id and any extra kwargs are consumed by PretrainedConfig.
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.gradient_checkpointing = gradient_checkpointing
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
from subword_nmt.apply_bpe import BPE
import codecs
import json
import numpy as np
from tqdm import tqdm
import math
import random
def get_tokenzie_seq(file, save, mask=False):
    """Tokenize aligned SMILES/protein/affinity files with BPE and write JSON lines.

    Args:
        file: dict with keys 'seq', 'smile', 'affinity' mapping to input file
            paths; the three files are aligned line-by-line (one record each).
        save: output path; each line is a JSON object with keys "seq"
            ("[CLS] drug-tokens [SEP] protein-tokens [SEP]") and "affinity".
        mask: when True, randomly mask ~15% of the drug and protein tokens
            (BERT-style pre-training input) via ``random_mask``.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    with open(file['seq'], 'r') as f:
        seq = f.readlines()
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    with open(file["affinity"], 'r') as f:
        affinity = f.readlines()
    # Build the drug/protein BPE tokenizers. BPE reads the codes file during
    # construction, so the handles can (and should) be closed right after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    with open(save, "w") as f:
        # Walk the three aligned files in lockstep.
        for seq_line, smile_line, affinity_line in tqdm(zip(seq, smile, affinity), total=len(seq)):
            d = dbpe.process_line(smile_line.strip()).split()
            p = pbpe.process_line(seq_line.strip()).split()
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
                "affinity": affinity_line.strip()
            }
            f.write(json.dumps(item) + '\n')
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly mask ~``mask_proportion`` of the tokens in ``input_seq`` in place.

    About 15% of the positions are sampled (without replacement, so the
    intended number of positions is always considered); each sampled token is
    replaced by "[MASK]" with probability 0.8, BERT-style, otherwise left
    unchanged.

    Args:
        input_seq: list of string tokens; mutated in place.
        mask_proportion: fraction of positions to consider for masking.

    Returns:
        The same list object, with some tokens replaced by "[MASK]".
    """
    if not input_seq:
        return input_seq
    mask_len = math.ceil(len(input_seq) * mask_proportion)
    # replace=False: sampling with replacement could pick the same position
    # twice and silently mask fewer tokens than intended.
    mask_token_posi = np.random.choice(len(input_seq), mask_len, replace=False)
    for i in mask_token_posi:
        if random.random() < 0.8:
            input_seq[i] = "[MASK]"
    return input_seq
def get_tokenzie_seq_case(file, save, mask=False):
    """Tokenize one fixed protein against many SMILES strings (case study).

    The whole protein file is concatenated into a single sequence and paired
    with every SMILES line; each pair is written as a JSON line with key "seq"
    ("[CLS] drug-tokens [SEP] protein-tokens [SEP]"). No affinity label is
    written.

    Args:
        file: dict with keys 'seq' (protein file) and 'smile' (SMILES file).
        save: output path for the JSON-lines file.
        mask: when True, randomly mask ~15% of the tokens in each sequence.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    # Concatenate all protein lines into one continuous sequence.
    with open(file['seq'], 'r') as f:
        seq = "".join(line.strip() for line in f)
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    # BPE reads the codes file during construction; close the handles after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    # The protein is the same for every record: tokenize it once, not per loop.
    protein_tokens = pbpe.process_line(seq).split()
    with open(save, "w") as f:
        for smile_line in tqdm(smile):
            d = dbpe.process_line(smile_line.strip()).split()
            # Copy because random_mask mutates its argument in place.
            p = list(protein_tokens)
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
            }
            f.write(json.dumps(item) + '\n')
if __name__ == '__main__':
    # -------------------------------------------------------------------
    # Driver script for building tokenized JSON-lines datasets. Each df_*
    # dict lists the raw input files for one dataset; the commented-out
    # get_tokenzie_seq(...) calls record which datasets have already been
    # generated. As committed, only the BIOSNAP full-data splits are built.
    # -------------------------------------------------------------------
    # file_train = {"sps": './data/train/train_sps',
    #               'seq': './data/train/train_protein_seq',
    #               "smile": './data/train/train_smile',
    #               "affinity": './data/train/train_ic50',
    #               }
    # save = "./data/tokenize_data/train.tokenize"
    # save_mask = "./data/tokenize_data/train.tokenize.mask"
    # Affinity (regression) datasets, grouped by protein family.
    df_test = {"sps": './data/test/test_sps',
               'seq': './data/test/test_protein_seq',
               "smile": './data/test/test_smile',
               "affinity": './data/test/test_ic50',
               }
    df_ER = {"sps": './data/ER/ER_sps',
             'seq': './data/ER/ER_protein_seq',
             "smile": './data/ER/ER_smile',
             "affinity": './data/ER/ER_ic50',
             }
    df_GPCR = {"sps": './data/GPCR/GPCR_sps',
               'seq': './data/GPCR/GPCR_protein_seq',
               "smile": './data/GPCR/GPCR_smile',
               "affinity": './data/GPCR/GPCR_ic50',
               }
    df_Ion_channel = {"sps": './data/Ion_channel/channel_sps',
                      'seq': './data/Ion_channel/channel_protein_seq',
                      "smile": './data/Ion_channel/channel_smile',
                      "affinity": './data/Ion_channel/channel_ic50',
                      }
    df_Tyrosine_kinase = {"sps": './data/Tyrosine_kinase/kinase_sps',
                          'seq': './data/Tyrosine_kinase/kinase_protein_seq',
                          "smile": './data/Tyrosine_kinase/kinase_smile',
                          "affinity": './data/Tyrosine_kinase/kinase_ic50',
                          }
    # save = "./data/tokenize_data/test.tokenize"
    # save = "./data/tokenize_data/test.tokenize.mask"
    # get_tokenzie_seq(df_test, save)
    # get_tokenzie_seq(file_train, save_mask, mask=True)
    # save_er = "./data/tokenize_data/er.tokenize"
    # save_GPCR = "./data/tokenize_data/gpcr.tokenize"
    # save_channel = "./data/tokenize_data/channel.tokenize"
    # save_kinase = "./data/tokenize_data/kinase.tokenize"
    # Output paths for the masked (pre-training style) variants.
    save_er_mask = "./data/tokenize_data/er.tokenize.mask"
    save_GPCR_mask = "./data/tokenize_data/gpcr.tokenize.mask"
    save_channel_mask = "./data/tokenize_data/channel.tokenize.mask"
    save_kinase_mask = "./data/tokenize_data/kinase.tokenize.mask"
    # get_tokenzie_seq(df_ER, save_er)
    # get_tokenzie_seq(df_GPCR, save_GPCR)
    # get_tokenzie_seq(df_Ion_channel, save_channel)
    # get_tokenzie_seq(df_Tyrosine_kinase, save_kinase)
    # get_tokenzie_seq(df_ER, save_er_mask, mask=True)
    # get_tokenzie_seq(df_GPCR, save_GPCR_mask, mask=True)
    # get_tokenzie_seq(df_Ion_channel, save_channel_mask, mask=True)
    # get_tokenzie_seq(df_Tyrosine_kinase, save_kinase_mask, mask=True)
    # Case study: one protein (SARS-CoV-2 spike file) against all test SMILES.
    df_case = {'seq': './case_study/data/spike.txt',
               "smile": './data/test/test_smile',
               # "affinity": './data/Tyrosine_kinase/kinase_ic50',
               }
    save_case = "./case_study/spike.tokenize"
    # get_tokenzie_seq_case(df_case, save_case)
    #interaction datasets including the train, valide, test
    ## bindingbd dataset
    df_bindingbd_train = {'seq':'./data/interaction/dataset/BindingDB/train/protein',
                          'smile':'./data/interaction/dataset/BindingDB/train/smile',
                          'affinity':'./data/interaction/dataset/BindingDB/train/label'}
    save_bindingbd_train = './data/tokenize_data/bindingdb_train.tokenize'
    # get_tokenzie_seq(df_bindingbd_train, save_bindingbd_train)
    df_bindingbd_valid = {'seq':'./data/interaction/dataset/BindingDB/validate/protein',
                          'smile':'./data/interaction/dataset/BindingDB/validate/smile',
                          'affinity':'./data/interaction/dataset/BindingDB/validate/label'}
    save_bindingbd_valid = './data/tokenize_data/bindingdb_valid.tokenize'
    # get_tokenzie_seq(df_bindingbd_valid, save_bindingbd_valid)
    df_bindingbd_test = {'seq':'./data/interaction/dataset/BindingDB/test/protein',
                         'smile':'./data/interaction/dataset/BindingDB/test/smile',
                         'affinity':'./data/interaction/dataset/BindingDB/test/label'}
    save_bindingbd_test = './data/tokenize_data/bindingdb_test.tokenize'
    # get_tokenzie_seq(df_bindingbd_test, save_bindingbd_test)
    ## biosnap
    # NOTE: these three calls are the only ones active in this script.
    df_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/train/protein',
                        'smile':'./data/interaction/dataset/BIOSNAP/full_data/train/smile',
                        'affinity':'./data/interaction/dataset/BIOSNAP/full_data/train/label'}
    save_biosnap_train = './data/tokenize_data/biosnap_train.tokenize'
    get_tokenzie_seq(df_biosnap_train, save_biosnap_train)
    df_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/validate/protein',
                        'smile':'./data/interaction/dataset/BIOSNAP/full_data/validate/smile',
                        'affinity':'./data/interaction/dataset/BIOSNAP/full_data/validate/label'}
    save_biosnap_valid = './data/tokenize_data/biosnap_valid.tokenize'
    get_tokenzie_seq(df_biosnap_valid, save_biosnap_valid)
    df_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/test/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/full_data/test/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/full_data/test/label'}
    save_biosnap_test = './data/tokenize_data/biosnap_test.tokenize'
    get_tokenzie_seq(df_biosnap_test, save_biosnap_test)
    ## davis
    df_davis_train = {'seq':'./data/interaction/dataset/DAVIS/train/protein',
                      'smile':'./data/interaction/dataset/DAVIS/train/smile',
                      'affinity':'./data/interaction/dataset/DAVIS/train/label'}
    save_davis_train = './data/tokenize_data/davis_train.tokenize'
    # get_tokenzie_seq(df_davis_train, save_davis_train)
    df_davis_valid = {'seq':'./data/interaction/dataset/DAVIS/validate/protein',
                      'smile':'./data/interaction/dataset/DAVIS/validate/smile',
                      'affinity':'./data/interaction/dataset/DAVIS/validate/label'}
    save_davis_valid = './data/tokenize_data/davis_valid.tokenize'
    # get_tokenzie_seq(df_davis_valid, save_davis_valid)
    df_davis_test = {'seq':'./data/interaction/dataset/DAVIS/test/protein',
                     'smile':'./data/interaction/dataset/DAVIS/test/smile',
                     'affinity':'./data/interaction/dataset/DAVIS/test/label'}
    save_davis_test = './data/tokenize_data/davis_test.tokenize'
    # get_tokenzie_seq(df_davis_test, save_davis_test)
    ## biosnap for unseen protein
    df_up_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/label'}
    save_up_biosnap_train = './data/tokenize_data/biosnap_unseen_protein_train.tokenize'
    # get_tokenzie_seq(df_up_biosnap_train, save_up_biosnap_train)
    df_up_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/label'}
    save_up_biosnap_valid = './data/tokenize_data/biosnap_unseen_protein_valid.tokenize'
    # get_tokenzie_seq(df_up_biosnap_valid, save_up_biosnap_valid)
    df_up_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/protein',
                          'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/smile',
                          'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/label'}
    save_up_biosnap_test = './data/tokenize_data/biosnap_unseen_protein_test.tokenize'
    # get_tokenzie_seq(df_up_biosnap_test, save_up_biosnap_test)
    ## biosnap for unseen drug
    df_ud_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/label'}
    save_ud_biosnap_train = './data/tokenize_data/biosnap_unseen_drug_train.tokenize'
    # get_tokenzie_seq(df_ud_biosnap_train, save_ud_biosnap_train)
    df_ud_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/label'}
    save_ud_biosnap_valid = './data/tokenize_data/biosnap_unseen_drug_valid.tokenize'
    # get_tokenzie_seq(df_ud_biosnap_valid, save_ud_biosnap_valid)
    df_ud_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/protein',
                          'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/smile',
                          'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/label'}
    save_ud_biosnap_test = './data/tokenize_data/biosnap_unseen_drug_test.tokenize'
    # get_tokenzie_seq(df_ud_biosnap_test, save_ud_biosnap_test)
\ No newline at end of file
from subword_nmt.apply_bpe import BPE
import codecs
import json
import numpy as np
from tqdm import tqdm
import math
import random
def get_tokenzie_seq(file, save, mask=False):
    """Tokenize aligned SMILES/protein/affinity files with BPE and write JSON lines.

    Args:
        file: dict with keys 'seq', 'smile', 'affinity' mapping to input file
            paths; the three files are aligned line-by-line (one record each).
        save: output path; each line is a JSON object with keys "seq"
            ("[CLS] drug-tokens [SEP] protein-tokens [SEP]") and "affinity".
        mask: when True, randomly mask ~15% of the drug and protein tokens
            (BERT-style pre-training input) via ``random_mask``.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    with open(file['seq'], 'r') as f:
        seq = f.readlines()
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    with open(file["affinity"], 'r') as f:
        affinity = f.readlines()
    # Build the drug/protein BPE tokenizers. BPE reads the codes file during
    # construction, so the handles can (and should) be closed right after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    with open(save, "w") as f:
        # Walk the three aligned files in lockstep.
        for seq_line, smile_line, affinity_line in tqdm(zip(seq, smile, affinity), total=len(seq)):
            d = dbpe.process_line(smile_line.strip()).split()
            p = pbpe.process_line(seq_line.strip()).split()
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
                "affinity": affinity_line.strip()
            }
            f.write(json.dumps(item) + '\n')
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly mask ~``mask_proportion`` of the tokens in ``input_seq`` in place.

    About 15% of the positions are sampled (without replacement, so the
    intended number of positions is always considered); each sampled token is
    replaced by "[MASK]" with probability 0.8, BERT-style, otherwise left
    unchanged.

    Args:
        input_seq: list of string tokens; mutated in place.
        mask_proportion: fraction of positions to consider for masking.

    Returns:
        The same list object, with some tokens replaced by "[MASK]".
    """
    if not input_seq:
        return input_seq
    mask_len = math.ceil(len(input_seq) * mask_proportion)
    # replace=False: sampling with replacement could pick the same position
    # twice and silently mask fewer tokens than intended.
    mask_token_posi = np.random.choice(len(input_seq), mask_len, replace=False)
    for i in mask_token_posi:
        if random.random() < 0.8:
            input_seq[i] = "[MASK]"
    return input_seq
def get_tokenzie_seq_case(file, save, mask=False):
    """Tokenize one fixed protein against many SMILES strings (case study).

    The whole protein file is concatenated into a single sequence and paired
    with every SMILES line; each pair is written as a JSON line with key "seq"
    ("[CLS] drug-tokens [SEP] protein-tokens [SEP]"). No affinity label is
    written.

    Args:
        file: dict with keys 'seq' (protein file) and 'smile' (SMILES file).
        save: output path for the JSON-lines file.
        mask: when True, randomly mask ~15% of the tokens in each sequence.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    # Concatenate all protein lines into one continuous sequence.
    with open(file['seq'], 'r') as f:
        seq = "".join(line.strip() for line in f)
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    # BPE reads the codes file during construction; close the handles after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    # The protein is the same for every record: tokenize it once, not per loop.
    protein_tokens = pbpe.process_line(seq).split()
    with open(save, "w") as f:
        for smile_line in tqdm(smile):
            d = dbpe.process_line(smile_line.strip()).split()
            # Copy because random_mask mutates its argument in place.
            p = list(protein_tokens)
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
            }
            f.write(json.dumps(item) + '\n')
if __name__ == '__main__':
# file_train = {"sps": './data/train/train_sps',
# 'seq': './data/train/train_protein_seq',
# "smile": './data/train/train_smile',
# "affinity": './data/train/train_ic50',
# }
# save = "./data/tokenize_data/train.tokenize"
# save_mask = "./data/tokenize_data/train.tokenize.mask"
df_test = {"sps": './data/test/test_sps',
'seq': './data/test/test_protein_seq',
"smile": './data/test/test_smile',
"affinity": './data/test/test_ic50',
}
# ---------------------------------------------------------------------------
# Per-target evaluation splits: each dict maps logical roles (sps tokens,
# raw protein sequence, SMILES, IC50 labels) to raw text-file paths.
# ---------------------------------------------------------------------------
df_ER = {"sps": './data/ER/ER_sps',
         'seq': './data/ER/ER_protein_seq',
         "smile": './data/ER/ER_smile',
         "affinity": './data/ER/ER_ic50',
         }
df_GPCR = {"sps": './data/GPCR/GPCR_sps',
           'seq': './data/GPCR/GPCR_protein_seq',
           "smile": './data/GPCR/GPCR_smile',
           "affinity": './data/GPCR/GPCR_ic50',
           }
df_Ion_channel = {"sps": './data/Ion_channel/channel_sps',
                  'seq': './data/Ion_channel/channel_protein_seq',
                  "smile": './data/Ion_channel/channel_smile',
                  "affinity": './data/Ion_channel/channel_ic50',
                  }
df_Tyrosine_kinase = {"sps": './data/Tyrosine_kinase/kinase_sps',
                      'seq': './data/Tyrosine_kinase/kinase_protein_seq',
                      "smile": './data/Tyrosine_kinase/kinase_smile',
                      "affinity": './data/Tyrosine_kinase/kinase_ic50',
                      }
# Output paths for the tokenized (and optionally masked) variants of the
# splits above; the commented-out calls record which conversions were run.
# save = "./data/tokenize_data/test.tokenize"
# save = "./data/tokenize_data/test.tokenize.mask"
# get_tokenzie_seq(df_test, save)
# get_tokenzie_seq(file_train, save_mask, mask=True)
# save_er = "./data/tokenize_data/er.tokenize"
# save_GPCR = "./data/tokenize_data/gpcr.tokenize"
# save_channel = "./data/tokenize_data/channel.tokenize"
# save_kinase = "./data/tokenize_data/kinase.tokenize"
save_er_mask = "./data/tokenize_data/er.tokenize.mask"
save_GPCR_mask = "./data/tokenize_data/gpcr.tokenize.mask"
save_channel_mask = "./data/tokenize_data/channel.tokenize.mask"
save_kinase_mask = "./data/tokenize_data/kinase.tokenize.mask"
# get_tokenzie_seq(df_ER, save_er)
# get_tokenzie_seq(df_GPCR, save_GPCR)
# get_tokenzie_seq(df_Ion_channel, save_channel)
# get_tokenzie_seq(df_Tyrosine_kinase, save_kinase)
# get_tokenzie_seq(df_ER, save_er_mask, mask=True)
# get_tokenzie_seq(df_GPCR, save_GPCR_mask, mask=True)
# get_tokenzie_seq(df_Ion_channel, save_channel_mask, mask=True)
# get_tokenzie_seq(df_Tyrosine_kinase, save_kinase_mask, mask=True)
# Case study: COVID spike protein paired with the test-split SMILES
# (no affinity labels available for this set).
df_case = {'seq': './case_study/data/spike.txt',
           "smile": './data/test/test_smile',
           # "affinity": './data/Tyrosine_kinase/kinase_ic50',
           }
save_case = "./case_study/spike.tokenize"
# get_tokenzie_seq_case(df_case, save_case)
#interaction datasets including the train, valide, test
## bindingbd dataset
df_bindingbd_train = {'seq':'./data/interaction/dataset/BindingDB/train/protein',
                      'smile':'./data/interaction/dataset/BindingDB/train/smile',
                      'affinity':'./data/interaction/dataset/BindingDB/train/label'}
save_bindingbd_train = './data/tokenize_data/bindingdb_train.tokenize'
# get_tokenzie_seq(df_bindingbd_train, save_bindingbd_train)
df_bindingbd_valid = {'seq':'./data/interaction/dataset/BindingDB/validate/protein',
                      'smile':'./data/interaction/dataset/BindingDB/validate/smile',
                      'affinity':'./data/interaction/dataset/BindingDB/validate/label'}
save_bindingbd_valid = './data/tokenize_data/bindingdb_valid.tokenize'
# get_tokenzie_seq(df_bindingbd_valid, save_bindingbd_valid)
df_bindingbd_test = {'seq':'./data/interaction/dataset/BindingDB/test/protein',
                     'smile':'./data/interaction/dataset/BindingDB/test/smile',
                     'affinity':'./data/interaction/dataset/BindingDB/test/label'}
save_bindingbd_test = './data/tokenize_data/bindingdb_test.tokenize'
# get_tokenzie_seq(df_bindingbd_test, save_bindingbd_test)
## biosnap
# NOTE: the three BIOSNAP full_data conversions below are the only ones
# currently active; all other datasets are commented out.
df_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/train/protein',
                    'smile':'./data/interaction/dataset/BIOSNAP/full_data/train/smile',
                    'affinity':'./data/interaction/dataset/BIOSNAP/full_data/train/label'}
save_biosnap_train = './data/tokenize_data/biosnap_train.tokenize'
get_tokenzie_seq(df_biosnap_train, save_biosnap_train)
df_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/validate/protein',
                    'smile':'./data/interaction/dataset/BIOSNAP/full_data/validate/smile',
                    'affinity':'./data/interaction/dataset/BIOSNAP/full_data/validate/label'}
save_biosnap_valid = './data/tokenize_data/biosnap_valid.tokenize'
get_tokenzie_seq(df_biosnap_valid, save_biosnap_valid)
df_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/test/protein',
                   'smile':'./data/interaction/dataset/BIOSNAP/full_data/test/smile',
                   'affinity':'./data/interaction/dataset/BIOSNAP/full_data/test/label'}
save_biosnap_test = './data/tokenize_data/biosnap_test.tokenize'
get_tokenzie_seq(df_biosnap_test, save_biosnap_test)
## davis
df_davis_train = {'seq':'./data/interaction/dataset/DAVIS/train/protein',
                  'smile':'./data/interaction/dataset/DAVIS/train/smile',
                  'affinity':'./data/interaction/dataset/DAVIS/train/label'}
save_davis_train = './data/tokenize_data/davis_train.tokenize'
# get_tokenzie_seq(df_davis_train, save_davis_train)
df_davis_valid = {'seq':'./data/interaction/dataset/DAVIS/validate/protein',
                  'smile':'./data/interaction/dataset/DAVIS/validate/smile',
                  'affinity':'./data/interaction/dataset/DAVIS/validate/label'}
save_davis_valid = './data/tokenize_data/davis_valid.tokenize'
# get_tokenzie_seq(df_davis_valid, save_davis_valid)
df_davis_test = {'seq':'./data/interaction/dataset/DAVIS/test/protein',
                 'smile':'./data/interaction/dataset/DAVIS/test/smile',
                 'affinity':'./data/interaction/dataset/DAVIS/test/label'}
save_davis_test = './data/tokenize_data/davis_test.tokenize'
# get_tokenzie_seq(df_davis_test, save_davis_test)
## biosnap for unseen protein
df_up_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/label'}
save_up_biosnap_train = './data/tokenize_data/biosnap_unseen_protein_train.tokenize'
# get_tokenzie_seq(df_up_biosnap_train, save_up_biosnap_train)
df_up_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/label'}
save_up_biosnap_valid = './data/tokenize_data/biosnap_unseen_protein_valid.tokenize'
# get_tokenzie_seq(df_up_biosnap_valid, save_up_biosnap_valid)
df_up_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/protein',
                      'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/smile',
                      'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/label'}
save_up_biosnap_test = './data/tokenize_data/biosnap_unseen_protein_test.tokenize'
# get_tokenzie_seq(df_up_biosnap_test, save_up_biosnap_test)
## biosnap for unseen drug
df_ud_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/label'}
save_ud_biosnap_train = './data/tokenize_data/biosnap_unseen_drug_train.tokenize'
# get_tokenzie_seq(df_ud_biosnap_train, save_ud_biosnap_train)
df_ud_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/label'}
save_ud_biosnap_valid = './data/tokenize_data/biosnap_unseen_drug_valid.tokenize'
# get_tokenzie_seq(df_ud_biosnap_valid, save_ud_biosnap_valid)
df_ud_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/protein',
                      'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/smile',
                      'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/label'}
save_ud_biosnap_test = './data/tokenize_data/biosnap_unseen_drug_test.tokenize'
# get_tokenzie_seq(df_ud_biosnap_test, save_ud_biosnap_test)
\ No newline at end of file
import numpy as np
import pandas as pd
import torch
from torch.utils import data
import json
import collections
from torch.utils.data import DataLoader
from subword_nmt.apply_bpe import BPE
import codecs
from collections import Counter
from tqdm import tqdm
import math
import random
from torch.nn.utils.rnn import pad_sequence
import pickle, csv
import os
# vocab_path = './ESPF/protein_codes_uniprot.txt'
# bpe_codes_protein = codecs.open(vocab_path)
# pbpe = BPE(bpe_codes_protein, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_uniprot.csv')
#
# idx2word_p = sub_csv['index'].values
# words2idx_p = dict(zip(idx2word_p, range(0, len(idx2word_p))))
# vocab_path = './ESPF/drug_codes_chembl.txt'
# bpe_codes_drug = codecs.open(vocab_path)
# dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_chembl.csv')
#
# idx2word_d = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
# max_d = 205
# max_p = 545
def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered token->index map."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as handle:
        for position, line in enumerate(handle):
            # Only the trailing newline is stripped; other whitespace is
            # part of the token.
            vocab[line.rstrip("\n")] = position
    return vocab
# def protein2emb_encoder(x, words2idx_p):
# max_p = 152
# # t1 = pbpe.process_line(x).split() # split
# t1 = x.split(',')
# try:
# i1 = np.asarray([words2idx_p[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
#
# if l < max_p:
# i = np.pad(i1, (0, max_p - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_p - l))
# else:
# i = i1[:max_p]
# input_mask = [1] * max_p
#
# return i, np.asarray(input_mask)
# def drug2emb_encoder(x, dbpe, words2idx_d):
# max_d = 50
# # max_d = 100
# t1 = dbpe.process_line(x)
# t1 = t1.split() # split
# try:
# i1 = np.asarray([words2idx_d[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
# print(i1)
#
# if l < max_d:
# i = np.pad(i1, (0, max_d - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_d - l))
#
# else:
# i = i1[:max_d]
# input_mask = [1] * max_d
#
# return i, np.asarray(input_mask)
def seq2emb_encoder(input_seq, max_len, vocab):
    """Encode a token sequence into fixed-length ids plus an attention mask.

    Parameters
    ----------
    input_seq : iterable of str
        Tokens to look up in ``vocab``.
    max_len : int
        Output length; shorter inputs are zero-padded, longer ones truncated.
    vocab : dict
        Token -> integer-id mapping.

    Returns
    -------
    (ids, input_mask)
        Two numpy arrays of length ``max_len``; the mask is 1 over real
        tokens and 0 over padding.

    NOTE: if ANY token is missing from ``vocab`` the whole sequence collapses
    to a single ``[0]`` id — preserved for backward compatibility.
    """
    try:
        ids = np.asarray([vocab[i] for i in input_seq])
    except KeyError:
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
        # and genuine programming errors; only a missing token is expected.
        ids = np.array([0])
    n = len(ids)
    if n < max_len:
        ids = np.pad(ids, (0, max_len - n), 'constant', constant_values=0)
        input_mask = np.array(([1] * n) + ([0] * (max_len - n)))
    else:
        ids = ids[:max_len]
        input_mask = np.array([1] * max_len)
    return ids, input_mask
def seq2emb_encoder_simple(input_seq, max_len, vocab):
    """Look up token ids for ``input_seq`` without padding or truncation.

    ``max_len`` is accepted only for signature compatibility with
    ``seq2emb_encoder`` and is unused. Falls back to a single ``[0]`` id
    when any token is missing from ``vocab`` (whole-sequence collapse,
    preserved for backward compatibility).
    """
    try:
        return np.asarray([vocab[i] for i in input_seq])
    except KeyError:
        # Fix: was a bare `except:`; only a missing vocab entry is expected.
        return np.array([0])
class Data_Encoder(data.Dataset):
    """Dataset of (SPS protein tokens, BPE-tokenized SMILES, IC50 affinity).

    Each item is encoded as a single ``[CLS] drug [SEP] protein [SEP]``
    id sequence with BERT-style token-type ids and an attention mask.
    """

    def __init__(self, train_file, tokenizer_config):
        """Load the raw text files and build the drug BPE tokenizer.

        train_file: dict with "sps", "smile" and "affinity" file paths.
        tokenizer_config: dict with "begin_id", "separate_id", "max_len",
            "vocab_file" and "vocab_pair" entries.
        """
        with open(train_file["sps"], 'r') as f:
            self.sps = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')

    def __len__(self):
        """Total number of samples."""
        return len(self.sps)

    def __getitem__(self, index):
        """Return (input ids, token-type ids, attention mask, affinity)."""
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        # SPS tokens are stored comma-separated.
        p = self.sps[index].strip().split(',')
        # Fix: np.float was removed in NumPy 1.24; builtin float is identical.
        y = float(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Fix: np.int was also removed in NumPy 1.24; use np.int64 explicitly.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64),
                                         np.ones((len(p) + 1), dtype=np.int64)))
        if len(input_seq) > self.max_len:
            # Fix: the original padded unconditionally, so an over-long sample
            # crashed np.pad with a negative pad width. Truncate like
            # Data_Encoder_mol does, keeping a trailing separator token.
            input_seq = input_seq[:self.max_len - 1] + [self.sep_id]
            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)),
                                    'constant', constant_values=0)
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return (torch.from_numpy(input).long(),
                torch.from_numpy(token_type_ids).long(),
                torch.from_numpy(input_mask).long(),
                y)
class Data_Encoder_mol(data.Dataset):
    """Dataset pairing full protein sequences (BPE) with SMILES (BPE) and affinity.

    Items are encoded as ``[CLS] drug [SEP] protein [SEP]`` with token-type
    ids distinguishing the drug segment (0) from the protein segment (1).
    """

    def __init__(self, train_file, tokenizer_config):
        """Load seq/smile/affinity files and build drug and protein BPE tokenizers.

        train_file: dict with "seq", "smile" and "affinity" file paths.
        tokenizer_config: dict with "begin_id", "separate_id", "max_len",
            "vocab_file", "vocab_pair" and "vocab_pair_p" entries.
        """
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
        bpe_codes_prot = codecs.open(tokenizer_config["vocab_pair_p"])
        self.pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    def __len__(self):
        """Total number of samples."""
        return len(self.smile)

    def __getitem__(self, index):
        """Return (input ids, token-type ids, attention mask, affinity)."""
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.pbpe.process_line(self.seq[index].strip()).split()
        y = np.float64(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Fix: np.int was removed in NumPy 1.24; np.int64 keeps the same dtype.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64),
                                         np.ones((len(p) + 1), dtype=np.int64)))
        if len(input_seq) > self.max_len:
            # Truncate over-long samples, keeping a trailing separator token.
            input_seq = input_seq[:self.max_len - 1] + [self.sep_id]
            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)),
                                    'constant', constant_values=0)
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return (torch.from_numpy(input).long(),
                torch.from_numpy(token_type_ids).long(),
                torch.from_numpy(input_mask).long(),
                y)
class Data_Encoder_LM(data.Dataset):
    """Dataset that yields BPE-tokenized drug/protein strings plus the affinity.

    Unlike Data_Encoder_mol, items stay as space-joined token strings; the
    downstream collate step is expected to do the id conversion.
    """

    def __init__(self, train_file, tokenizer_config):
        """Load seq/smile/affinity files and build both BPE tokenizers."""
        with open(train_file['seq'], 'r') as handle:
            self.seq = handle.readlines()
        with open(train_file["smile"], 'r') as handle:
            self.smile = handle.readlines()
        with open(train_file["affinity"], 'r') as handle:
            self.affinity = handle.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        self.dbpe = BPE(codecs.open(tokenizer_config["vocab_pair"]),
                        merges=-1, separator='')
        self.pbpe = BPE(codecs.open(tokenizer_config["vocab_pair_p"]),
                        merges=-1, separator='')

    def __len__(self):
        """Total number of drug/protein pairs."""
        return len(self.smile)

    def __getitem__(self, index):
        """Return (drug token string, protein token string, affinity)."""
        drug_tokens = self.dbpe.process_line(self.smile[index].strip()).split()
        prot_tokens = self.pbpe.process_line(self.seq[index].strip()).split()
        label = np.float64(self.affinity[index].strip())
        return " ".join(drug_tokens), " ".join(prot_tokens), label
class Data_Provide(data.Dataset):
    """Dataset over a tokenized JSON-lines file paired with its masked twin.

    Each line of both files is a JSON object with a "seq" field; the
    unmasked file additionally carries an "affinity" label.
    """

    def __init__(self, train_file, mask_file, tokenizer):
        """Read both JSON-lines files into memory and keep the tokenizer."""
        with open(train_file, 'r') as handle:
            self.seq = handle.readlines()
        with open(mask_file, 'r') as handle:
            self.seq_mask = handle.readlines()
        self.tokenizer = tokenizer

    def __len__(self):
        """Total number of samples."""
        return len(self.seq)

    def __getitem__(self, index):
        """Return (sequence, masked sequence, affinity) for one sample."""
        record = json.loads(self.seq[index])
        masked_record = json.loads(self.seq_mask[index])
        label = np.float64(record["affinity"])
        return record["seq"], masked_record["seq"], label
class Data_Gen(data.Dataset):
    """Dataset over a tokenized JSON-lines file where the label is optional.

    Each line is a JSON object with a "seq" field and, when labelled,
    an "affinity" field.
    """

    def __init__(self, train_file):
        """Read the JSON-lines file into memory."""
        with open(train_file, 'r') as handle:
            self.seq = handle.readlines()

    def __len__(self):
        """Total number of samples."""
        return len(self.seq)

    def __getitem__(self, index):
        """Return the sequence, or (sequence, affinity) when a label exists."""
        record = json.loads(self.seq[index])
        if "affinity" in record:
            return record["seq"], np.float64(record["affinity"])
        return record["seq"]
def get_task(task_name):
    """Resolve a task name (case-insensitive) to dataset file(s) and tokenizer config.

    Returns
    -------
    (data, tokenizer_config)
        For the raw SPS tasks, pre-tokenized "mol" tasks and the case study.
    (data, mask_data, tokenizer_config)
        For the pre-training tasks, which also need the randomly-masked file.
    None
        For an unknown task name (matching the original if/elif chain,
        which fell through without returning).
    """
    name = task_name.lower()

    # Raw SPS tasks use the SPS-token vocabulary.
    sps_config = {"vocab_file": './config/vocab.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }
    # Tokenized ("mol") tasks use the merged molecule/protein BPE vocabulary.
    mol_config = {"vocab_file": './config/vocab_mol.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "vocab_pair_p": './config/protein_codes_uniprot.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }

    # task -> (sps, smile, affinity) raw file triples.
    sps_tasks = {
        'train': ('./data/train/train_sps', './data/train/train_smile',
                  './data/train/train_ic50'),
        'test': ('./data/test/test_sps', './data/test/test_smile',
                 './data/test/test_ic50'),
        'test_ori_er': ('./data/ER/ER_sps', './data/ER/ER_smile',
                        './data/ER/ER_ic50'),
        'test_ori_gpcr': ('./data/GPCR/GPCR_sps', './data/GPCR/GPCR_smile',
                          './data/GPCR/GPCR_ic50'),
        'test_ori_channel': ('./data/Ion_channel/channel_sps',
                             './data/Ion_channel/channel_smile',
                             './data/Ion_channel/channel_ic50'),
        'test_ori_kinase': ('./data/Tyrosine_kinase/kinase_sps',
                            './data/Tyrosine_kinase/kinase_smile',
                            './data/Tyrosine_kinase/kinase_ic50'),
    }
    if name in sps_tasks:
        sps, smile, affinity = sps_tasks[name]
        return {"sps": sps, "smile": smile, "affinity": affinity}, sps_config

    # task -> pre-tokenized data file.
    mol_tasks = {
        'train_mol': "data/tokenize_data/train.tokenize",
        'test_mol': "data/tokenize_data/test.tokenize",
        'test_er': "data/tokenize_data/er.tokenize",
        'test_gpcr': "data/tokenize_data/gpcr.tokenize",
        'test_channel': "data/tokenize_data/channel.tokenize",
        'test_kinase': "data/tokenize_data/kinase.tokenize",
        'case_study': "case_study/spike.tokenize",
    }
    if name in mol_tasks:
        return mol_tasks[name], mol_config

    # Pre-training tasks pair the tokenized file with its ".mask" variant.
    masked_tasks = {
        'pre-train': "train",
        'test-pre-train': "test",
        'test-pre-train-er': "er",
        'test-pre-train-gpcr': "gpcr",
        'test-pre-train-channel': "channel",
        'test-pre-train-kinase': "kinase",
    }
    if name in masked_tasks:
        stem = "data/tokenize_data/%s.tokenize" % masked_tasks[name]
        return stem, stem + ".mask", mol_config
    # Unknown task name: fall through, returning None like the original chain.
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly replace a fraction of tokens in each sequence with "[MASK]".

    Parameters
    ----------
    input_seq : list of str
        Whitespace-joined token sequences.
    mask_proportion : float
        Fraction of each sequence selected for masking (ceil-rounded).

    Returns
    -------
    list of list of str
        Tokenized sequences with selected tokens masked.

    NOTE(review): only the 80% -> "[MASK]" branch of the usual BERT scheme is
    implemented; the 10% random-token / 10% keep branches were left
    unfinished in the original (it ended in a dangling commented `elif`).
    """
    tokenized = [seq.split() for seq in input_seq]
    budgets = [math.ceil(len(tokens) * mask_proportion) for tokens in tokenized]
    # Fix: replace=False guarantees `budget` distinct positions; the original
    # sampled with replacement and could waste part of the mask budget on
    # duplicate positions.
    positions = [np.random.choice(len(tokens), budget, replace=False)
                 for tokens, budget in zip(tokenized, budgets)]
    for seq_idx, chosen in enumerate(positions):
        for token_idx in chosen:
            if random.random() < 0.8:
                tokenized[seq_idx][token_idx] = "[MASK]"
    return tokenized
class Tokenizer(object):
    """Token/id conversion plus TF-IDF scoring for tokenized drug-protein sequences."""

    def __init__(self, tokenizer_config):
        """Load vocabularies and the pre-computed token-frequency table.

        tokenizer_config: dict with "begin_id", "separate_id", "max_len"
        and "vocab_file" entries.
        """
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        # Read the token list from vocab_mol.txt and map token -> id.
        # NOTE(review): this path is hard-coded and may disagree with
        # tokenizer_config["vocab_file"] — confirm they always match.
        vocab = './config/vocab_mol.txt'
        with open(vocab, 'r', encoding='utf-8') as f:
            tokens = [token.strip() for token in f.readlines()]
        self.token_to_id = {token: i for i, token in enumerate(tokens)}
        # Load the pre-computed token frequency dictionary (used for IDF).
        token_frequency_file = './config/token_frequency.pickle'
        with open(token_frequency_file, 'rb') as f:
            self.token_frequency = pickle.load(f)
        self.total_tokens = sum(self.token_frequency.values())

    def tokenize_sequence(self, sequence):
        """Convert a space-separated sequence into ids ([UNK] for unknowns)."""
        # Fix: the original wrapped the split in an extra list
        # (`tokens = [sequence.split(' ')]`), so each `token` was a list and
        # the dict lookup raised TypeError. Split directly instead.
        tokens = sequence.split(' ')
        token_ids = [self.token_to_id.get(token, self.token_to_id['[UNK]'])
                     for token in tokens]
        return token_ids

    def seq2emb_encoder_simple(self, input_seq, vocab):
        """Look up ids token-by-token, substituting [UNK] for unknown tokens."""
        all_ids = []
        for token in input_seq:
            try:
                all_ids.append(vocab[token])
            except KeyError:
                # Fix: was a bare `except:`; only a missing entry is expected.
                all_ids.append(vocab["[UNK]"])
        return np.asarray(all_ids)

    def convert_token_to_ids(self, seq):
        """Convert a batch of space-separated sequences into padded id tensors.

        Returns
        -------
        (padded ids, mask)
            A [batch, max_len] long tensor and a boolean mask that is True
            wherever the id is non-zero (i.e. non-padding).
        """
        all_seq = [item.split() for item in seq]
        # Truncate over-long sequences, re-appending the separator token.
        for i, tokens in enumerate(all_seq):
            if len(tokens) > self.max_len:
                all_seq[i] = tokens[:self.max_len - 1] + [self.sep_id]
        all_seq_ids = [torch.from_numpy(self.seq2emb_encoder_simple(tokens, self.vocab)).long()
                       for tokens in all_seq]
        padded_seq_ids = pad_sequence(all_seq_ids, batch_first=True)
        if padded_seq_ids.size(1) < self.max_len:
            # Right-pad the whole batch with zeros up to max_len.
            padded_seq_ids = torch.cat(
                [padded_seq_ids,
                 torch.zeros(padded_seq_ids.size(0),
                             self.max_len - padded_seq_ids.size(1))],
                dim=1)
        else:
            padded_seq_ids = padded_seq_ids[:, :self.max_len]
        input_mask = (padded_seq_ids != 0)
        return padded_seq_ids.long(), input_mask

    def calculate_tf_idf(self, sequences):
        """Compute a per-token TF-IDF tensor, zero-padded/truncated to max_len.

        Returns a float tensor of shape (len(sequences), max_len).
        """
        tf_idfs = []
        for index, sequence in enumerate(sequences):
            seq_ids = self.seq2emb_encoder_simple(sequence.split(), self.vocab)
            token_count = Counter(seq_ids)
            token_tf = [token_count[token] / len(seq_ids) for token in seq_ids]
            # NOTE(review): `enumerate(sequence)` yields (index, char) tuples,
            # so the frequency lookup always misses and falls back to 1, and
            # the zip below truncates to the shorter list. The commented-out
            # line in the original suggests `enumerate(sequence.split())` was
            # intended. Preserved as-is to avoid silently changing scores any
            # trained model may depend on — confirm before fixing.
            token_idf = [np.log(((self.token_frequency.get(token, 1)) + 1) / (token_count[token] + 1)) for token in enumerate(sequence)]
            token_tf_idf = [tf * idf for tf, idf in zip(token_tf, token_idf)]
            tf_idfs.append(torch.tensor(token_tf_idf))
        padded_tf_idfs = torch.zeros((len(tf_idfs), self.max_len))
        for i, tf_idf in enumerate(tf_idfs):
            if tf_idf.size(0) < self.max_len:
                padded_tf_idf = torch.cat([tf_idf, torch.zeros(self.max_len - len(tf_idf))])
            else:
                padded_tf_idf = tf_idf[:self.max_len]
            padded_tf_idfs[i] = padded_tf_idf
        return padded_tf_idfs
def collate_fn(batch):
    """Collate a batch of (seq, freq_score?, y) items into tensors.

    NOTE(review): this function looks unfinished/buggy and is likely dead code:
      * `seqs` and `seq_masks` are never appended to, so the lists returned
        are always empty;
      * `max_len` is measured on item[2] but used to pad item[1] — the two
        indices disagree, and `len(item[2])` fails if item[2] is the float
        affinity that Data_Provide returns;
      * `freq_scores` is built and tensorized but not returned.
    Left byte-identical pending confirmation of the intended item layout.
    """
    # Get the maximum length of freq_score in the batch
    max_len = max(len(item[2]) for item in batch)
    # Initialize empty lists for seq, seq_mask, freq_score, and y
    seqs, seq_masks, freq_scores, ys = [], [], [], []
    # Pad freq_score and concatenate seq and seq_mask for each item in the batch
    for item in batch:
        freq_score = item[1]
        freq_score += [0] * (max_len - len(freq_score))
        freq_scores.append(freq_score)
        ys.append(item[2])
    # Convert lists to tensors
    seqs = [torch.tensor(seq) for seq in seqs]
    seq_masks = [torch.tensor(seq_mask) for seq_mask in seq_masks]
    freq_scores = torch.tensor(freq_scores)
    ys = torch.tensor(ys)
    # Return the batch
    return seqs, seq_masks, ys
if __name__ == "__main__":
    # Ad-hoc local smoke tests; most experiments are kept commented out.
    # local test
    # dataFolder = './IC50/SPS/train_smile'
    # with open(dataFolder, 'r') as f:
    #     train_smi = f.readlines()
    # drug_smi = train_smi[0]
    # d_v, input_mask_d = drug2emb_encoder(drug_smi)
    # test load vocab
    # vocab_file = './ESPF/vocab.txt'
    # vocab = load_vocab(vocab_file)
    # test train
    # NOTE(review): the triple-quoted block below is dead code disabled by
    # turning it into an unused string literal.
    '''
task = 'pre-train'
data_file, data_mask, tokenizer_config = get_task(task)
dataset = Data_Provide(data_file, data_mask)
tokenizer = Tokenizer(tokenizer_config)
data_loder_para = {'batch_size': 2,
'shuffle': False,
'num_workers': 0,
}
data_generator = DataLoader(dataset, **data_loder_para)
all_len = []
m = 0
for i, (seq, seq_mask, affinity) in enumerate(tqdm(data_generator)):
input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)
label, _ = tokenizer.convert_token_to_ids(seq)
posi = torch.where(input_random_mask == 1)
target = label[posi]
a = input_random_mask == 4
if torch.sum(a) > 2:
print(torch.sum(a))
'''
    # a = seq[0].split()
    # b = seq_mask[0].split()
    # all_len.append(len(a))
    # if len(a) > 512:
    #     m += 1
    # if len(a) != len(b):
    #     print(seq)
    #     print(i)
    # all_len = np.array(all_len)
    # print(np.max(all_len))
    # print(np.mean(all_len))
    # print(m)
    #test for tokenizer and count frequency
    # sequence = '[CLS] CC1=CC=C (O 1)C2=N C(=CC(=N [SEP] MP VRRG H VAP QN'
    output_file_path = "tf_idf_values.txt"
    sequence = ['[CLS]', ')cn1', '(O', '1)C2=N', 'C(=CC(=N', '[SEP]', 'MP', 'VRRG', 'H', 'VAP', 'MP', 'VRRG', 'QN']
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    tf_idf_values = tokenizer.calculate_tf_idf(sequence)
    print(tf_idf_values.size())
    # Output file path.
    output_file_path = "tf_idf_values.txt"
    # Open the file and write tf_idf_values.
    with open(output_file_path, "w") as file:
        # Convert tf_idf_values to string form.
        tf_idf_str = "\n".join(str(value) for value in tf_idf_values)
        # Write to the file.
        file.write(tf_idf_str)
    print("tf_idf_values已成功写入到文件:", output_file_path)
    # for i, token in enumerate(sequence.split()):
    #     print(f"Token: {token}, TF-IDF value: {tf_idf_values[i]}")
    # task = 'pre-train'
    # data_file, data_mask, tokenizer_config = get_task(task)
    # tokenizer = Tokenizer(tokenizer_config)
    # dataset = Data_Provide(data_file, data_mask, tokenizer)
    # data_loder_para = {'batch_size': 2,
    #                    'shuffle': False,
    #                    'num_workers': 0,
    #                    }
    # data_generator = DataLoader(dataset, **data_loder_para)
    # for idx, inputs in enumerate(data_generator):
    #     x,y1,y2 = inputs
    #     print(f"Batch {idx}: Inputs shape: {x.dtype}")
    #     print(f"Batch {idx}: Targets shape: {y1.dtype}")
    #     # print(f"Batch {idx}: Targets shape: {fre.dtype}")
    #     print(f"Batch {idx}: Targets shape: {y2.dtype}")
    #     # print(f"Batch {idx}: Targets shape: {fre.shape}")
    #     if idx == 2:
    #         break
import numpy as np
import pandas as pd
import torch
from torch.utils import data
import json
import collections
from torch.utils.data import DataLoader
from subword_nmt.apply_bpe import BPE
import codecs
from collections import Counter
from tqdm import tqdm
import math
import random
from torch.nn.utils.rnn import pad_sequence
import pickle, csv
import os
# vocab_path = './ESPF/protein_codes_uniprot.txt'
# bpe_codes_protein = codecs.open(vocab_path)
# pbpe = BPE(bpe_codes_protein, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_uniprot.csv')
#
# idx2word_p = sub_csv['index'].values
# words2idx_p = dict(zip(idx2word_p, range(0, len(idx2word_p))))
# vocab_path = './ESPF/drug_codes_chembl.txt'
# bpe_codes_drug = codecs.open(vocab_path)
# dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_chembl.csv')
#
# idx2word_d = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
# max_d = 205
# max_p = 545
def load_vocab(vocab_file):
    """Read a vocabulary file and map each token to its 0-based line index.

    One token per line; trailing newlines are stripped. Returns an
    OrderedDict so iteration order matches file order.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab
# def protein2emb_encoder(x, words2idx_p):
# max_p = 152
# # t1 = pbpe.process_line(x).split() # split
# t1 = x.split(',')
# try:
# i1 = np.asarray([words2idx_p[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
#
# if l < max_p:
# i = np.pad(i1, (0, max_p - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_p - l))
# else:
# i = i1[:max_p]
# input_mask = [1] * max_p
#
# return i, np.asarray(input_mask)
# def drug2emb_encoder(x, dbpe, words2idx_d):
# max_d = 50
# # max_d = 100
# t1 = dbpe.process_line(x)
# t1 = t1.split() # split
# try:
# i1 = np.asarray([words2idx_d[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
# print(i1)
#
# if l < max_d:
# i = np.pad(i1, (0, max_d - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_d - l))
#
# else:
# i = i1[:max_d]
# input_mask = [1] * max_d
#
# return i, np.asarray(input_mask)
def seq2emb_encoder(input_seq, max_len, vocab):
    """Encode a token sequence into a fixed-length id array plus attention mask.

    Parameters
    ----------
    input_seq : iterable of str
        Tokens to look up in ``vocab``.
    max_len : int
        Output length; shorter sequences are zero-padded, longer truncated.
    vocab : mapping str -> int
        Token-to-id mapping.

    Returns ``(ids, input_mask)``, both 1-D numpy arrays of length ``max_len``;
    the mask is 1 over real tokens and 0 over padding.
    """
    try:
        ids = np.asarray([vocab[token] for token in input_seq])
    # FIX: narrowed from a bare ``except:`` which would also swallow
    # unrelated errors (e.g. TypeError) and even KeyboardInterrupt.
    except KeyError:
        # Original fallback preserved: any OOV token collapses the whole
        # sequence to a single [0].
        ids = np.array([0])
    n = len(ids)
    if n < max_len:
        ids = np.pad(ids, (0, max_len - n), 'constant', constant_values=0)
        input_mask = np.array(([1] * n) + ([0] * (max_len - n)))
    else:
        ids = ids[:max_len]
        input_mask = np.array([1] * max_len)
    return ids, input_mask
def seq2emb_encoder_simple(input_seq, max_len, vocab):
    """Encode tokens to a variable-length id array (no padding, no mask).

    ``max_len`` is accepted for signature compatibility with
    ``seq2emb_encoder`` but is intentionally unused here.
    """
    try:
        ids = np.asarray([vocab[token] for token in input_seq])
    # FIX: narrowed from a bare ``except:`` to the only expected failure.
    except KeyError:
        # Original fallback preserved: any OOV token collapses the output to [0].
        ids = np.array([0])
    return ids
class Data_Encoder(data.Dataset):
    """Dataset pairing BPE-encoded drug SMILES with SPS protein tokens.

    Each item is ``(input_ids, token_type_ids, attention_mask, affinity)``
    for the joint sequence ``[CLS] drug [SEP] protein [SEP]`` padded to
    ``max_len``.
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Load raw per-line text data.
        with open(train_file["sps"], 'r') as f:
            self.sps = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.sps)

    def __getitem__(self, index):
        'Generates one sample of data'
        # BPE-tokenize the SMILES; the SPS protein line is comma-delimited.
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.sps[index].strip().split(',')
        # FIX: np.float / np.int were removed from NumPy (>= 1.24); use
        # np.float64 / np.int64 (consistent with Data_Encoder_mol).
        y = np.float64(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Segment ids: 0 over [CLS]+drug+[SEP], 1 over protein+[SEP]; pad with 0.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64), np.ones((len(p) + 1), dtype=np.int64)))
        token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        input_ids, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return torch.from_numpy(input_ids).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(
            input_mask).long(), y
class Data_Encoder_mol(data.Dataset):
    """Dataset of BPE-encoded drug SMILES + protein sequences with affinity.

    Items are ``(input_ids, token_type_ids, attention_mask, affinity)`` for
    the joint sequence ``[CLS] drug [SEP] protein [SEP]``, truncated or
    padded to ``max_len``.
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Load raw per-line text data.
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
        bpe_codes_prot = codecs.open(tokenizer_config["vocab_pair_p"])
        self.pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.smile)

    def __getitem__(self, index):
        'Generates one sample of data'
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.pbpe.process_line(self.seq[index].strip()).split()
        y = np.float64(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Segment ids: 0 over [CLS]+drug+[SEP], 1 over protein+[SEP].
        # FIX: np.int was removed from NumPy (>= 1.24); use np.int64.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64), np.ones((len(p) + 1), dtype=np.int64)))
        if len(input_seq) > self.max_len:
            # Truncate but keep a trailing [SEP].
            input_seq = input_seq[:self.max_len - 1] + [self.sep_id]
            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        input_ids, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return torch.from_numpy(input_ids).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y
class Data_Encoder_LM(data.Dataset):
    """Dataset yielding BPE-tokenized (drug, protein, affinity) text triples.

    Unlike Data_Encoder_mol this does not convert tokens to ids: items are
    the space-joined token strings, so masking/encoding can be applied
    downstream (e.g. by a Tokenizer).
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Raw per-line data files.
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings (kept for interface parity even though
        # __getitem__ only uses the two BPE models).
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        self.dbpe = BPE(codecs.open(tokenizer_config["vocab_pair"]), merges=-1, separator='')
        self.pbpe = BPE(codecs.open(tokenizer_config["vocab_pair_p"]), merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.smile)

    def __getitem__(self, index):
        'Generates one sample of data'
        drug_tokens = self.dbpe.process_line(self.smile[index].strip()).split()
        prot_tokens = self.pbpe.process_line(self.seq[index].strip()).split()
        affinity = np.float64(self.affinity[index].strip())
        return " ".join(drug_tokens), " ".join(prot_tokens), affinity
class Data_Provide(data.Dataset):
    """Paired dataset of original and masked tokenized sequences (JSONL).

    ``train_file`` and ``mask_file`` are line-aligned JSON-lines files; each
    item is ``(seq, masked_seq, affinity)``.
    """

    def __init__(self, train_file, mask_file, tokenizer):
        'Initialization'
        # One JSON record per line; the mask file mirrors the train file.
        with open(train_file, 'r') as f:
            self.seq = f.readlines()
        with open(mask_file, 'r') as f:
            self.seq_mask = f.readlines()
        self.tokenizer = tokenizer

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.seq)

    def __getitem__(self, index):
        'Generates one sample of data'
        record = json.loads(self.seq[index])
        masked_record = json.loads(self.seq_mask[index])
        affinity = np.float64(record["affinity"])
        return record["seq"], masked_record["seq"], affinity
class Data_Gen(data.Dataset):
    """JSONL dataset yielding a sequence, plus its affinity when present.

    Inference files without an "affinity" field yield only the sequence
    string; labeled files yield ``(seq, affinity)``.
    """

    def __init__(self, train_file):
        'Initialization'
        # One JSON record per line.
        with open(train_file, 'r') as f:
            self.seq = f.readlines()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.seq)

    def __getitem__(self, index):
        'Generates one sample of data'
        record = json.loads(self.seq[index])
        sequence = record["seq"]
        try:
            affinity = np.float64(record["affinity"])
        except KeyError:
            # Unlabeled (inference) record.
            return sequence
        return sequence, affinity
def get_task(task_name):
    """Resolve a task name to its data file(s) and tokenizer configuration.

    Returns, depending on the task family:
      * SPS tasks (train/test/test_ori_*):  ({"sps","smile","affinity"} dict, config)
      * tokenized tasks (*_mol/test_*/case_study):  (tokenize file path, config)
      * masked-LM tasks (pre-train*/test-pre-train*):  (tokenize path, mask path, config)
      * unknown names:  None (implicitly, matching the original behavior)

    The original repeated an identical "mol" tokenizer config a dozen times;
    this version builds each config once and dispatches via lookup tables.
    """
    task = task_name.lower()
    # Config for the raw SPS/SMILES pipeline.
    sps_config = {"vocab_file": './config/vocab.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }
    # Config shared by every pre-tokenized ("mol") task.
    mol_config = {"vocab_file": './config/vocab_mol.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "vocab_pair_p": './config/protein_codes_uniprot.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }
    # task -> (sps, smile, affinity) file paths.
    sps_tasks = {
        'train': ('./data/train/train_sps', './data/train/train_smile', './data/train/train_ic50'),
        'test': ('./data/test/test_sps', './data/test/test_smile', './data/test/test_ic50'),
        'test_ori_er': ('./data/ER/ER_sps', './data/ER/ER_smile', './data/ER/ER_ic50'),
        'test_ori_gpcr': ('./data/GPCR/GPCR_sps', './data/GPCR/GPCR_smile', './data/GPCR/GPCR_ic50'),
        'test_ori_channel': ('./data/Ion_channel/channel_sps', './data/Ion_channel/channel_smile', './data/Ion_channel/channel_ic50'),
        'test_ori_kinase': ('./data/Tyrosine_kinase/kinase_sps', './data/Tyrosine_kinase/kinase_smile', './data/Tyrosine_kinase/kinase_ic50'),
    }
    if task in sps_tasks:
        sps, smile, affinity = sps_tasks[task]
        return {"sps": sps, "smile": smile, "affinity": affinity}, sps_config
    # task -> pre-tokenized data file.
    tokenize_tasks = {
        'train_mol': "data/tokenize_data/train.tokenize",
        'test_mol': "data/tokenize_data/test.tokenize",
        'test_er': "data/tokenize_data/er.tokenize",
        'test_gpcr': "data/tokenize_data/gpcr.tokenize",
        'test_channel': "data/tokenize_data/channel.tokenize",
        'test_kinase': "data/tokenize_data/kinase.tokenize",
        'case_study': "case_study/spike.tokenize",
    }
    if task in tokenize_tasks:
        return tokenize_tasks[task], mol_config
    # task -> dataset stem for the masked-LM (pre-train style) tasks.
    masked_tasks = {
        'pre-train': 'train',
        'test-pre-train': 'test',
        'test-pre-train-er': 'er',
        'test-pre-train-gpcr': 'gpcr',
        'test-pre-train-channel': 'channel',
        'test-pre-train-kinase': 'kinase',
    }
    if task in masked_tasks:
        tokenize_file = "data/tokenize_data/{}.tokenize".format(masked_tasks[task])
        return tokenize_file, tokenize_file + ".mask", mol_config
    # Unknown task: fall through to None, as the original's if/elif chain did.
    return None
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly mask ~``mask_proportion`` of the tokens in each sequence.

    Parameters
    ----------
    input_seq : list[str]
        Whitespace-delimited token sequences.
    mask_proportion : float
        Fraction of each sequence's tokens (rounded up) selected for masking.

    Returns the split token lists with selected tokens replaced by "[MASK]".
    Each selected position is actually replaced with probability 0.8
    (BERT-style); NOTE(review): the 10% random-token / 10% keep branches of
    the BERT scheme were left commented out in the original and are not
    implemented here either.
    """
    tokenized = [seq.split() for seq in input_seq]
    mask_counts = [math.ceil(len(tokens) * mask_proportion) for tokens in tokenized]
    # FIX: sample positions WITHOUT replacement. The original drew with
    # replacement, so duplicate picks silently masked fewer tokens than the
    # requested proportion.
    mask_positions = [np.random.choice(len(tokens), count, replace=False)
                      for tokens, count in zip(tokenized, mask_counts)]
    for seq_idx, positions in enumerate(mask_positions):
        for pos in positions:
            if random.random() < 0.8:
                tokenized[seq_idx][pos] = "[MASK]"
    return tokenized
class Tokenizer(object):
    """Converts whitespace-tokenized sequences to id tensors and TF-IDF weights."""

    def __init__(self, tokenizer_config):
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        # Token list from vocab_mol.txt; a token's id is its line number.
        vocab = './config/vocab_mol.txt'
        with open(vocab, 'r', encoding='utf-8') as f:
            tokens = [token.strip() for token in f.readlines()]
        self.token_to_id = {token: i for i, token in enumerate(tokens)}
        # Corpus-level token frequency table. NOTE(review): usage in
        # calculate_tf_idf suggests it is keyed by token id -- confirm with
        # the pickle's producer.
        token_frequency_file = './config/token_frequency.pickle'
        with open(token_frequency_file, 'rb') as f:
            self.token_frequency = pickle.load(f)
        self.total_tokens = sum(self.token_frequency.values())

    def tokenize_sequence(self, sequence):
        """Map a space-delimited sequence to token ids ([UNK] for unknowns).

        BUG FIX: the original wrapped the split token list in ANOTHER list
        (``tokens = [sequence.split(' ')]``), so the dict lookup received an
        unhashable list and raised TypeError on every call.
        """
        tokens = sequence.split(' ')
        token_ids = [self.token_to_id.get(token, self.token_to_id['[UNK]']) for token in tokens]
        return token_ids

    def seq2emb_encoder_simple(self, input_seq, vocab):
        """Encode tokens to a 1-D numpy id array, substituting [UNK] per miss."""
        all_ids = []
        for token in input_seq:
            try:
                all_ids.append(vocab[token])
            except KeyError:
                all_ids.append(vocab["[UNK]"])
        return np.asarray(all_ids)

    def convert_token_to_ids(self, seq):
        """Encode a batch of space-delimited sequences to padded id tensors.

        Sequences longer than ``max_len`` are truncated with a trailing
        [SEP]; shorter ones are zero-padded up to ``max_len``.

        Returns ``(ids [B, max_len] long tensor, boolean attention mask)``.
        """
        all_seq = [s.split() for s in seq]
        for i, tokens in enumerate(all_seq):
            if len(tokens) > self.max_len:
                # Keep the final [SEP] when truncating.
                all_seq[i] = tokens[:self.max_len - 1] + [self.sep_id]
        all_seq_ids = [torch.from_numpy(self.seq2emb_encoder_simple(tokens, self.vocab)).long()
                       for tokens in all_seq]
        padded_seq_ids = pad_sequence(all_seq_ids, batch_first=True)
        if padded_seq_ids.size(1) < self.max_len:
            pad_block = torch.zeros(padded_seq_ids.size(0), self.max_len - padded_seq_ids.size(1))
            padded_seq_ids = torch.cat([padded_seq_ids, pad_block], dim=1)
        else:
            padded_seq_ids = padded_seq_ids[:, :self.max_len]
        # Mask is True wherever there is a real (non-pad) token id.
        input_mask = (padded_seq_ids != 0)
        return padded_seq_ids.long(), input_mask

    def calculate_tf_idf(self, sequences):
        """Compute a per-token TF-IDF tensor of shape [len(sequences), max_len].

        TF is the within-sequence relative count of a token id; IDF is
        ``log((corpus_frequency + 1) / (in-sequence count + 1))``.
        """
        tf_idfs = []
        for sequence in sequences:
            seq_ids = self.seq2emb_encoder_simple(sequence.split(), self.vocab)
            token_count = Counter(seq_ids)
            token_tf = [token_count[token] / len(seq_ids) for token in seq_ids]
            # BUG FIX: the original iterated ``enumerate(sequence)``, yielding
            # (index, char) tuples, so token_frequency.get() always missed and
            # the list length disagreed with token_tf. Iterate the token ids so
            # the (id-keyed) frequency table is actually consulted.
            token_idf = [np.log((self.token_frequency.get(token, 1) + 1) / (token_count[token] + 1))
                         for token in seq_ids]
            token_tf_idf = [tf * idf for tf, idf in zip(token_tf, token_idf)]
            tf_idfs.append(torch.tensor(token_tf_idf))
        padded_tf_idfs = torch.zeros((len(tf_idfs), self.max_len))
        for i, tf_idf in enumerate(tf_idfs):
            if tf_idf.size(0) < self.max_len:
                padded_tf_idfs[i] = torch.cat([tf_idf, torch.zeros(self.max_len - len(tf_idf))])
            else:
                padded_tf_idfs[i] = tf_idf[:self.max_len]
        return padded_tf_idfs
def collate_fn(batch):
    # NOTE(review): this collate function looks stale relative to
    # Data_Provide.__getitem__, which returns (seq, seq_mask, y): here
    # item[2] (the float affinity) is passed to len(), which would raise
    # TypeError, and `seqs` / `seq_masks` are never populated before the
    # tensor conversion. It appears written for an earlier
    # (seq, freq_score, y) item layout -- confirm before wiring it into a
    # DataLoader.
    # Get the maximum length of freq_score in the batch
    max_len = max(len(item[2]) for item in batch)
    # Initialize empty lists for seq, seq_mask, freq_score, and y
    seqs, seq_masks, freq_scores, ys = [], [], [], []
    # Pad freq_score and concatenate seq and seq_mask for each item in the batch
    for item in batch:
        freq_score = item[1]
        # Right-pad each freq_score to the longest one in the batch.
        freq_score += [0] * (max_len - len(freq_score))
        freq_scores.append(freq_score)
        ys.append(item[2])
    # Convert lists to tensors
    seqs = [torch.tensor(seq) for seq in seqs]
    seq_masks = [torch.tensor(seq_mask) for seq_mask in seq_masks]
    freq_scores = torch.tensor(freq_scores)
    ys = torch.tensor(ys)
    # Return the batch
    return seqs, seq_masks, ys
if __name__ == "__main__":
    # Local smoke tests for the tokenizer / dataset classes above.
    # local test
    # dataFolder = './IC50/SPS/train_smile'
    # with open(dataFolder, 'r') as f:
    #     train_smi = f.readlines()
    # drug_smi = train_smi[0]
    # d_v, input_mask_d = drug2emb_encoder(drug_smi)
    # test load vocab
    # vocab_file = './ESPF/vocab.txt'
    # vocab = load_vocab(vocab_file)
    # test train
    # Disabled pre-train data pipeline walkthrough (kept as a string literal).
    '''
    task = 'pre-train'
    data_file, data_mask, tokenizer_config = get_task(task)
    dataset = Data_Provide(data_file, data_mask)
    tokenizer = Tokenizer(tokenizer_config)
    data_loder_para = {'batch_size': 2,
                       'shuffle': False,
                       'num_workers': 0,
                       }
    data_generator = DataLoader(dataset, **data_loder_para)
    all_len = []
    m = 0
    for i, (seq, seq_mask, affinity) in enumerate(tqdm(data_generator)):
        input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)
        label, _ = tokenizer.convert_token_to_ids(seq)
        posi = torch.where(input_random_mask == 1)
        target = label[posi]
        a = input_random_mask == 4
        if torch.sum(a) > 2:
            print(torch.sum(a))
    '''
    # test for tokenizer and count frequency
    # sequence = '[CLS] CC1=CC=C (O 1)C2=N C(=CC(=N [SEP] MP VRRG H VAP QN'
    # Sample pre-tokenized sequence (each list element is one token).
    sequence = ['[CLS]', ')cn1', '(O', '1)C2=N', 'C(=CC(=N', '[SEP]', 'MP', 'VRRG', 'H', 'VAP', 'MP', 'VRRG', 'QN']
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    # Each element of `sequence` is treated as its own one-token "sequence".
    tf_idf_values = tokenizer.calculate_tf_idf(sequence)
    print(tf_idf_values)
# for i, token in enumerate(sequence.split()):
# print(f"Token: {token}, TF-IDF value: {tf_idf_values[i]}")
# task = 'pre-train'
# data_file, data_mask, tokenizer_config = get_task(task)
# tokenizer = Tokenizer(tokenizer_config)
# dataset = Data_Provide(data_file, data_mask, tokenizer)
# data_loder_para = {'batch_size': 2,
# 'shuffle': False,
# 'num_workers': 0,
# }
# data_generator = DataLoader(dataset, **data_loder_para)
# for idx, inputs in enumerate(data_generator):
# x,y1,y2 = inputs
# print(f"Batch {idx}: Inputs shape: {x.dtype}")
# print(f"Batch {idx}: Targets shape: {y1.dtype}")
# # print(f"Batch {idx}: Targets shape: {fre.dtype}")
# print(f"Batch {idx}: Targets shape: {y2.dtype}")
# # print(f"Batch {idx}: Targets shape: {fre.shape}")
# if idx == 2:
# break
import numpy as np
import re
def eval_result(pred, label):
    """Return (RMSE, Pearson correlation matrix) for two score lists.

    ``pred`` and ``label`` are equal-length sequences of floats; the second
    return value is the full 2x2 np.corrcoef matrix (r is at [0, 1]).
    """
    pred_arr = np.array(pred)
    label_arr = np.array(label)
    residual = pred_arr - label_arr
    mean_sq_err = np.sum(np.power(residual, 2)) / len(pred_arr)
    root_mse = np.sqrt(mean_sq_err)
    pearson_co = np.corrcoef(pred_arr, label_arr)
    return root_mse, pearson_co
def eval(pred_path, label_path):
    """Score a prediction file against a label file and persist the summary.

    Both files contain one float per line. Writes an 'eval_results' file in
    the prediction file's directory and echoes the same summary to stdout.

    NOTE(review): shadows the builtin ``eval``; name kept for caller
    compatibility.
    """
    def _read_floats(path):
        # One float per line; whitespace-tolerant.
        with open(path, 'r') as fh:
            return [float(line.strip()) for line in fh]

    pred = _read_floats(pred_path)
    label = _read_floats(label_path)
    rmse, corr = eval_result(pred, label)
    r = corr[0, 1]
    # Save next to the prediction file, swapping the filename for 'eval_results'.
    filename = pred_path.split("/")[-1]
    save_path = pred_path.replace(filename, 'eval_results')
    summary = 'RMSE : {} ; Pearson Correlation Coefficient : {}'.format(rmse, r)
    with open(save_path, 'w') as fh:
        fh.write(summary)
    print(summary)
if __name__ == '__main__':
    # Evaluate saved prediction files against ground-truth IC50 labels for the
    # main test split plus the four protein-family splits.
    # with open('pre_test.sh', 'r') as f:
    #     pred_dir = f.readline()
    # pred_dir = pred_dir.split()[5].split('/')[-1]
    # pred_result = './predict/{}/test.txt'.format(pred_dir)
    # pred_result = './predict/add_pretrain_1019-s-329480_v2/test_mol.txt'
    # pred_result = './predict/add_pretrain_1019-s-329480-er/test_mol.txt'
    # eval single file
    # pred_file = "./predict/without-pre-train-layer-6-1021-s-988440-test/test_mol.txt"
    # test_label_path = './data/test/test_ic50'
    # eval(pred_file, test_label_path)
    # eval all
    # Ground-truth label files, one per evaluation split.
    test_label_path = './data/test/test_ic50'
    test_label_path_ER = './data/ER/ER_ic50'
    test_label_path_GPCR = './data/GPCR/GPCR_ic50'
    test_label_path_Ion_channel = './data/Ion_channel/channel_ic50'
    test_label_path_Tyrosine_kinase = './data/Tyrosine_kinase/kinase_ic50'
    # test mol
    # pred_test = "./predict/without-pre-train-layer-6-1021-s-988440-test/test_mol.txt"
    # er = "./predict/without-pre-train-layer-6-1021-s-988440-er/test_er.txt"
    # gpcr = "./predict/without-pre-train-layer-6-1021-s-988440-gpcr/test_gpcr.txt"
    # channel = "./predict/without-pre-train-layer-6-1021-s-988440-channel/test_channel.txt"
    # kinase = "./predict/without-pre-train-layer-6-1021-s-988440-kinase/test_kinase.txt"
    # test
    # pred_test = "predict/train_ori_1217-s-296532/test.txt"
    # er = "predict/train_ori_1217-s-296532/test_ori_er.txt"
    # gpcr = "predict/train_ori_1217-s-296532/test_ori_gpcr.txt"
    # channel = "predict/train_ori_1217-s-296532/test_ori_channel.txt"
    # kinase = "predict/train_ori_1217-s-296532/test_ori_kinase.txt"
    # deepdta
    # pred_test = "baselines/DeepDTA/source/output/test/results.txt"
    # er = "baselines/DeepDTA/source/output/ER/results.txt"
    # gpcr = "baselines/DeepDTA/source/output/GPCR/results.txt"
    # channel = "baselines/DeepDTA/source/output/Ion_channel/results.txt"
    # kinase = "baselines/DeepDTA/source/output/Tyrosine_kinase/results.txt"
    # attentiondta
    # pred_test = "baselines/AttentionDTA_BIBM/results/test/test.txt"
    # er = "baselines/AttentionDTA_BIBM/results/ER/test.txt"
    # gpcr = "baselines/AttentionDTA_BIBM/results/GPCR/test.txt"
    # channel = "baselines/AttentionDTA_BIBM/results/channel/test.txt"
    # kinase = "baselines/AttentionDTA_BIBM/results/kinase/test.txt"
    # test_mol test_2
    # pred_test = "predict/pre-train-layer-6-1021/test_mol.txt"
    # er = "predict/pre-train-layer-6-1021/test_er.txt"
    # gpcr = "predict/pre-train-layer-6-1021/test_gpcr.txt"
    # channel = "predict/pre-train-layer-6-1021/test_channel.txt"
    # kinase = "predict/pre-train-layer-6-1021/test_kinase.txt"
    # frequency embedding /notebook/our_model-new/predict/pre-train-layer-6-1021-freq
    # Prediction files for the currently selected model variant.
    pred_test = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_mol.txt"
    er = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_er.txt"
    gpcr = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_gpcr.txt"
    channel = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_channel.txt"
    kinase = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_kinase.txt"
    pred_list = [pred_test, er, gpcr, channel, kinase]
    label_list = [test_label_path, test_label_path_ER, test_label_path_GPCR, test_label_path_Ion_channel, test_label_path_Tyrosine_kinase]
    # Score each prediction file against its matching label file.
    for i, j in zip(pred_list, label_list):
        print(i)
        eval(i, j)
# Launch drug-target interaction training on GPU 1, initialized from a
# pre-trained checkpoint.
# NOTE(review): `-batch_size` uses a single dash while every other long option
# uses `--`; confirm run_interaction.py's argument parser accepts this spelling.
# NOTE(review): batch_size=4 but the savedir name says "batch-64" -- verify
# which value is intended.
CUDA_VISIBLE_DEVICES=1 python run_interaction.py \
-batch_size=4 --task=train_mol --epochs=30 --lr=1e-5 \
--savedir=lr-1e-5-batch-64-e-30-layer6-1125-new \
--config=./config/config_layer_6_mol.json \
--output='./predict/test_new' \
--pre_train=True \
--init='./saved_model/train/epoch-23-step-790752-loss-0.12734022736549377.pth'
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data_train_file_1 = '/notebook/our_model/data/interaction/dataset/BindingDB/train.csv'\n",
"data_train_file_2 = '/notebook/our_model/data/interaction/dataset/DAVIS/train.csv'\n",
"data_train_file_3 = '/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train.csv'\n",
"\n",
"data_val_file_1 = '/notebook/our_model/data/interaction/dataset/BindingDB/val.csv'\n",
"data_val_file_2 = '/notebook/our_model/data/interaction/dataset/DAVIS/val.csv'\n",
"data_val_file_3 = '/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/val.csv'\n",
"\n",
"data_test_file_1 = '/notebook/our_model/data/interaction/dataset/BindingDB/test.csv'\n",
"data_test_file_2 = '/notebook/our_model/data/interaction/dataset/DAVIS/test.csv'\n",
"data_test_file_3 = '/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(data_train_file_3)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(data_val_file_3)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/validate/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/validate/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/validate/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(data_test_file_3)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test/label',header=None,index=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# unseen protein"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"up_train_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train.csv'\n",
"up_val_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/val.csv'\n",
"up_test_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(up_train_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(up_val_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/validate/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/validate/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/validate/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(up_test_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test/label',header=None,index=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# unseen drug"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"ud_train_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train.csv'\n",
"ud_val_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/val.csv'\n",
"ud_test_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(ud_train_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(ud_val_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/validate/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/validate/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/validate/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(ud_test_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test/label',header=None,index=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 测试词汇分割效率"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from subword_nmt.apply_bpe import BPE\n",
"import codecs\n",
"import json\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import math\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_tokenzie_seq(file, save, mask=False):\n",
" begin_token = '[CLS]'\n",
" separate_token = \"[SEP]\"\n",
" with open(file['seq'], 'r') as f:\n",
" seq = f.readlines()\n",
" with open(file[\"smile\"], 'r') as f:\n",
" smile = f.readlines()\n",
" with open(file[\"affinity\"], 'r') as f:\n",
" affinity = f.readlines()\n",
" \n",
" bpe_codes_drug = codecs.open('./config/drug_codes_chembl.txt')\n",
" dbpe = BPE(bpe_codes_drug, merges=-1, separator='')\n",
" bpe_codes_prot = codecs.open('./config/protein_codes_uniprot.txt')\n",
" pbpe = BPE(bpe_codes_prot, merges=-1, separator='')\n",
"\n",
" with open(save, \"w\") as f:\n",
" for i in tqdm(range(len(seq))):\n",
" d = dbpe.process_line(smile[i].strip()).split()\n",
" p = pbpe.process_line(seq[i].strip()).split()\n",
" if mask == True:\n",
" d = random_mask(d)\n",
" p = random_mask(p)\n",
" final_seq = [begin_token] + d + [separate_token] + p + [separate_token]\n",
" affinity_num = affinity[i].strip()\n",
" item = {\n",
" \"seq\": \" \".join(final_seq),\n",
" \"affinity\": affinity_num\n",
" }\n",
" new_item = json.dumps(item)\n",
" f.write(new_item + '\\n')\n",
"\n",
"def get_tokenzie_seq_case(file, save, mask=False):\n",
" begin_token = '[CLS]'\n",
" separate_token = \"[SEP]\"\n",
" with open(file['seq'], 'r') as f:\n",
" seq = f.readlines()\n",
" seq = [i.strip() for i in seq]\n",
" seq = \"\".join(seq)\n",
" with open(file[\"smile\"], 'r') as f:\n",
" smile = f.readlines()\n",
" # with open(file[\"affinity\"], 'r') as f:\n",
" # affinity = f.readlines()\n",
"\n",
" bpe_codes_drug = codecs.open('./config/drug_codes_chembl.txt')\n",
" dbpe = BPE(bpe_codes_drug, merges=-1, separator='')\n",
" bpe_codes_prot = codecs.open('./config/protein_codes_uniprot.txt')\n",
" pbpe = BPE(bpe_codes_prot, merges=-1, separator='')\n",
"\n",
" with open(save, \"w\") as f:\n",
" for i in tqdm(range(len(smile))):\n",
" d = dbpe.process_line(smile[i].strip()).split()\n",
" p = pbpe.process_line(seq).split()\n",
" if mask == True:\n",
" d = random_mask(d)\n",
" p = random_mask(p)\n",
" final_seq = [begin_token] + d + [separate_token] + p + [separate_token]\n",
" # affinity_num = affinity[i].strip()\n",
" item = {\n",
" \"seq\": \" \".join(final_seq),\n",
" # \"affinity\": affinity_num\n",
" }\n",
" new_item = json.dumps(item)\n",
" f.write(new_item + '\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.12 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
from yaml import load
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
def load_embedding(data_file):
    """Return per-sample drug and protein embedding matrices for ``data_file``.

    For every sample, the BERT embedding-layer output is split at the first
    [SEP] token: tokens between [CLS] and that [SEP] are the drug span, and
    tokens after it (up to the final token) are the protein span.

    Args:
        data_file: path understood by the project's ``Data_Gen`` dataset class.

    Returns:
        (all_drug, all_protein): two lists of numpy arrays of shape
        (num_tokens, hidden_size), one entry per sample, in dataset order.
    """
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    sep_id = 3  # vocabulary id of the [SEP] token
    dataset = Data_Gen(data_file)
    data_generator = DataLoader(dataset, batch_size=1, shuffle=False)
    config = BertConfig.from_pretrained('./config/config_layer_6_mol.json')
    model = BertAffinityModel(config)
    model.load_state_dict(torch.load('./model/add_pretrain_1019/epoch-9-step-329480-loss-0.736057146887367.pth'), strict=True)
    # Fix: switch to eval mode so the embedding-layer dropout is disabled --
    # otherwise the extracted embeddings are randomly perturbed on every call.
    model.eval()
    all_drug = []
    all_protein = []
    with torch.no_grad():  # inference only; avoid building autograd graphs
        for i, (sample, affinity) in enumerate(data_generator):
            input_ids, attention_mask = tokenizer.convert_token_to_ids(sample)
            input_embs = model.embeddings(input_ids)
            # Index of the first [SEP] in the sequence (the trailing [SEP] is
            # excluded by the [:, :-1] slice).
            # NOTE(review): assumes exactly one [SEP] occurs in that slice;
            # multiple matches would break the slicing below -- confirm.
            sep_index = torch.where(input_ids[:, :-1] == sep_id)[-1]
            drug_emb = input_embs[:, 1:sep_index].squeeze(0).numpy()
            protein_embs = input_embs[:, sep_index + 1:-1].squeeze(0).numpy()
            all_drug.append(drug_emb)
            all_protein.append(protein_embs)
    return all_drug, all_protein
def plot_drug_protein(save):
    """Save a t-SNE scatter contrasting drug-token and protein-token embeddings."""
    drug_embs, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    drug_points = np.concatenate(drug_embs)
    # Truncate the protein cloud to the same size as the drug cloud so the
    # two classes are balanced in the plot.
    protein_points = np.concatenate(protein_embs)[:len(drug_points)]
    points = np.concatenate((drug_points, protein_points))
    labels = np.array([0] * len(drug_points) + [1] * len(protein_points))
    # Reduce to 2-D with t-SNE (fixed seed for reproducibility).
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(points)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[labels == 0, 0], embedded[labels == 0, 1], c="darkcyan", s=5, label="Drug", marker='^')
    plt.scatter(embedded[labels == 1, 0], embedded[labels == 1, 1], c="deepskyblue", s=5, label="Protein", marker="s")
    plt.legend(labels=["Drug", "Protein"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_protein_sub(save):
    """Save a t-SNE scatter of token embeddings for three sample proteins."""
    _, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    groups = protein_embs[:3]  # PTPH1, mGluRs, EZH2
    labels = np.array([g_idx for g_idx, grp in enumerate(groups) for _ in range(len(grp))])
    points = np.concatenate(groups)
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(points)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[labels == 0, 0], embedded[labels == 0, 1], c="darkcyan", s=5, label="PTPH1", marker='^')
    plt.scatter(embedded[labels == 1, 0], embedded[labels == 1, 1], c="deepskyblue", s=5, label="mGluRs", marker="s")
    plt.scatter(embedded[labels == 2, 0], embedded[labels == 2, 1], c="salmon", s=5, label="EZH2")
    plt.legend(labels=["PTPH1", "mGluRs", "EZH2"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_drug_sub(save):
    """Save a t-SNE scatter of token embeddings for three sample drugs."""
    drug_embs, _ = load_embedding("add_figure/sample_data/test_sample")
    groups = drug_embs[:3]
    labels = np.array([g_idx for g_idx, grp in enumerate(groups) for _ in range(len(grp))])
    points = np.concatenate(groups)
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(points)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[labels == 0, 0], embedded[labels == 0, 1], c="darkcyan", s=5, label="Drug_1", marker='^')
    plt.scatter(embedded[labels == 1, 0], embedded[labels == 1, 1], c="deepskyblue", s=5, label="Drug_2", marker="s")
    plt.scatter(embedded[labels == 2, 0], embedded[labels == 2, 1], c="salmon", s=5, label="Drug_3")
    plt.legend(labels=["Drug_1", "Drug_2", "Drug_3"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
if __name__ == '__main__':
    # Generate the drug-vs-protein embedding t-SNE figure; uncomment below to
    # produce the per-drug / per-protein variants instead.
    plot_drug_protein("drug_and_protein_sub")
    # plot_drug_sub("three_drug_sub")
    # plot_protein_sub("three_protein_sub")
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from transformers.activations import ACT2FN
from transformers.file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from transformers.modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from transformers.utils import logging
from configuration_bert import BertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "bert-base-uncased"
_CONFIG_FOR_DOC = "BertConfig"
_TOKENIZER_FOR_DOC = "BertTokenizer"
# Checkpoint identifiers published by the upstream HuggingFace BERT code.
# NOTE(review): this list is not referenced anywhere in the visible code;
# presumably kept for parity with the original modeling_bert.py.
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bert-base-uncased",
    "bert-large-uncased",
    "bert-base-cased",
    "bert-large-cased",
    "bert-base-multilingual-uncased",
    "bert-base-multilingual-cased",
    "bert-base-chinese",
    "bert-base-german-cased",
    "bert-large-uncased-whole-word-masking",
    "bert-large-cased-whole-word-masking",
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    "bert-large-cased-whole-word-masking-finetuned-squad",
    "bert-base-cased-finetuned-mrpc",
    "bert-base-german-dbmdz-cased",
    "bert-base-german-dbmdz-uncased",
    "cl-tohoku/bert-base-japanese",
    "cl-tohoku/bert-base-japanese-whole-word-masking",
    "cl-tohoku/bert-base-japanese-char",
    "cl-tohoku/bert-base-japanese-char-whole-word-masking",
    "TurkuNLP/bert-base-finnish-cased-v1",
    "TurkuNLP/bert-base-finnish-uncased-v1",
    "wietsedv/bert-base-dutch-cased",
    # See all BERT models at https://huggingface.co/models?filter=bert
]
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model.

    Maps TensorFlow variable names (slash-separated scopes) onto the PyTorch
    module tree by attribute lookup, transposing dense-layer kernels, and
    copies the values in place. Returns the same ``model`` instance.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            # Scope names like "layer_3" carry an index into a ModuleList.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            # TF naming -> PyTorch attribute naming for common variable kinds.
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # NOTE(review): this `continue` only skips the current scope
                    # segment (inner loop), not the whole variable -- upstream
                    # HuggingFace behavior; confirm this is intended.
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            # TF stores dense kernels as (in, out); PyTorch expects (out, in).
            array = np.transpose(array)
        try:
            assert (
                pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
class BertEmbeddings(nn.Module):
    """Sum word, position and token-type embeddings, then LayerNorm + dropout."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # The attribute is named "LayerNorm" (not snake_case) so TensorFlow
        # checkpoints can be loaded without any variable-name remapping.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Registered as a buffer so position ids are serialized with the module
        # and follow it across devices.
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Build the combined input embeddings from ids or precomputed embeds."""
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            input_shape = input_ids.size()
        seq_length = input_shape[1]

        # Default position ids start after any cached (past) key/values.
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
        # Default token types are all-zero (single-segment input).
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            embeddings = embeddings + self.position_embeddings(position_ids)
        return self.dropout(self.LayerNorm(embeddings))
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Supports plain self-attention, cross-attention over encoder states, cached
    past key/value states for decoding, and the absolute / relative_key /
    relative_key_query position-embedding variants.
    """

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            # One learned embedding per possible (query - key) distance.
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x):
        # (batch, seq, all_head_size) -> (batch, heads, seq, head_size)
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Return (context, [attention_probs], [past_key_value]) depending on flags."""
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            # Add learned relative-position terms to the raw scores.
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
class BertSelfOutput(nn.Module):
    """Project attention output, then residual-add the input and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertAttention(nn.Module):
    """Attention sub-layer: BertSelfAttention followed by BertSelfOutput."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given heads and shrink the q/k/v and output projections."""
        if not heads:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Drop the pruned rows/columns from the linear layers.
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Record the reduced head count and remember which heads are gone.
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        # Prepend the processed output; propagate attentions/cache entries if present.
        return (attention_output,) + self_outputs[1:]
class BertIntermediate(nn.Module):
    """Feed-forward expansion: hidden_size -> intermediate_size with activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string selects a registered activation; a callable is used as-is.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        return self.intermediate_act_fn(self.dense(hidden_states))
class BertOutput(nn.Module):
    """Feed-forward contraction back to hidden_size, residual-add + LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
class BertLayer(nn.Module):
    """A single transformer block: self-attention, optional cross-attention
    (decoder configurations only), and a chunked feed-forward sub-layer."""
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Feed-forward chunking is applied along the sequence dimension (dim 1).
        self.seq_len_dim = 1
        self.attention = BertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
            # Only instantiated when cross-attention is requested; forward asserts on it.
            self.crossattention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run self-attention, optional cross-attention, then the feed-forward
        sub-layer. Returns ``(layer_output, <attention weights...>)`` and, for
        decoder configs, appends the present key/value cache as the last item.
        """
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            # NOTE: `present_key_value` is only bound on this branch; the later
            # uses at the bottom of forward are likewise guarded by `self.is_decoder`.
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            assert hasattr(
                self, "crossattention"
            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value
        # Apply the feed-forward sub-layer in sequence-dimension chunks to bound memory.
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)
        return outputs
    def feed_forward_chunk(self, attention_output):
        # One chunk of the intermediate -> output feed-forward computation.
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` :class:`BertLayer` modules.

    Handles optional gradient checkpointing, per-layer head masks, cached
    key/values for decoding, and collecting hidden states / attention maps
    when requested.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Run every layer in sequence over ``hidden_states``.

        Returns a :class:`BaseModelOutputWithPastAndCrossAttentions` when
        ``return_dict`` is true, otherwise a tuple of the non-None values among
        (last hidden state, decoder cache, all hidden states, self-attentions,
        cross-attentions).
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the input to this layer (embeddings for i == 0).
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None
            if getattr(self.config, "gradient_checkpointing", False) and self.training:
                if use_cache:
                    # Fix: `logger.warn` is a deprecated alias in the stdlib
                    # logging module; use `logger.warning` instead.
                    logger.warning(
                        "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                        "`use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # Close over the non-tensor arguments; checkpoint only
                        # forwards tensors through its argument list.
                        return module(*inputs, past_key_value, output_attentions)
                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            hidden_states = layer_outputs[0]
            if use_cache:
                # For decoder layers the cache tuple is always the last output.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
        if output_hidden_states:
            # Also record the final layer's output.
            all_hidden_states = all_hidden_states + (hidden_states,)
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPooler(nn.Module):
    """Pools a sequence into one vector: dense + tanh over the first token."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # Pooling uses only the hidden state of the first token of each sequence.
        cls_state = hidden_states[:, 0]
        return self.activation(self.dense(cls_state))
class BertPredictionHeadTransform(nn.Module):
    """Dense + activation + layer norm applied before the LM decoder projection."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # A string activation name is resolved through ACT2FN; callables are used as-is.
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        transformed = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(transformed)
class BertLMPredictionHead(nn.Module):
    """Transform + vocabulary projection used for masked/causal LM prediction."""

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)
        # The decoder weight is tied to the input embeddings elsewhere; only the
        # per-token output bias is an independent parameter of this head.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Link the two so the bias is correctly resized with `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        return self.decoder(self.transform(hidden_states))
class BertOnlyMLMHead(nn.Module):
    """MLM-only head: a thin wrapper around a single BertLMPredictionHead."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        return self.predictions(sequence_output)
class BertOnlyNSPHead(nn.Module):
    """Next-sentence-prediction head: a binary classifier over the pooled output."""

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        return self.seq_relationship(pooled_output)
class BertPreTrainingHeads(nn.Module):
    """Joint pretraining heads: MLM token predictions plus NSP classification."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        lm_scores = self.predictions(sequence_output)
        nsp_scores = self.seq_relationship(pooled_output)
        return lm_scores, nsp_scores
class BertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights of a single submodule in place."""
        if isinstance(module, nn.LayerNorm):
            # LayerNorm starts out as the identity transform.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version, which uses truncated_normal
            # for initialization; cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
        elif isinstance(module, nn.Embedding) and module.padding_idx is not None:
            # Keep the padding embedding at exactly zero.
            module.weight.data[module.padding_idx].zero_()
# Structured output container returned by `BertForPreTraining.forward` when `return_dict` is true.
@dataclass
class BertForPreTrainingOutput(ModelOutput):
    """
    Output type of :class:`~transformers.BertForPreTraining`.
    Args:
        loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
    # `loss` is populated only when both `labels` and `next_sentence_label` are given to forward.
    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
BERT_START_DOCSTRING = r"""
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
BERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
class BertModel(BertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
    input to the forward pass.
    """
    def __init__(self, config, add_pooling_layer=True):
        # `add_pooling_layer=False` lets token-level heads (e.g. BertForMaskedLM) skip the unused pooler.
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config) if add_pooling_layer else None
        self.init_weights()
    def get_input_embeddings(self):
        # The word-embedding table doubles as the model's input embedding matrix.
        return self.embeddings.word_embeddings
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=BaseModelOutputWithPoolingAndCrossAttentions,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Key/value caching is only meaningful for decoder configurations.
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        # past_key_values_length
        # Number of previously cached positions, read from the key tensor's sequence dim.
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        if attention_mask is None:
            # Default mask covers both cached and current positions.
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
        if not return_dict:
            # Tuple form: (sequence_output, pooled_output, <extras from the encoder>).
            return (sequence_output, pooled_output) + encoder_outputs[1:]
        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
@add_start_docstrings(
    """
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """,
    BERT_START_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)
        self.init_weights()
    def get_output_embeddings(self):
        # The MLM decoder projection acts as the output embedding matrix.
        return self.cls.predictions.decoder
    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertForPreTraining
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.prediction_logits
            >>> seq_relationship_logits = outputs.seq_relationship_logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
        total_loss = None
        # The combined loss is only computed when BOTH label tensors are supplied.
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
        if not return_dict:
            # Tuple form: optional loss, MLM logits, NSP logits, then any extra encoder outputs.
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output
        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING
)
class BertLMHeadModel(BertPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
    def __init__(self, config):
        super().__init__(config)
        if not config.is_decoder:
            logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")
        # No pooler: this head only consumes per-token hidden states.
        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)
        self.init_weights()
    def get_output_embeddings(self):
        return self.cls.predictions.decoder
    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
            ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> config.is_decoder = True
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Caching is pointless (and disabled) when training with labels.
        if labels is not None:
            use_cache = False
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output
        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
        """Build the model inputs for one generation step (used by `generate`)."""
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)
        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]
        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past}
    def _reorder_cache(self, past, beam_idx):
        # Reorder each layer's cached tensors along the batch dim to follow beam search.
        reordered_past = ()
        for layer_past in past:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# @add_code_sample_docstrings(
# tokenizer_class=_TOKENIZER_FOR_DOC,
# checkpoint=_CHECKPOINT_FOR_DOC,
# output_type=MaskedLMOutput,
# config_class=_CONFIG_FOR_DOC,
# )
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
    """Append one dummy PAD token (masked out) to every sequence for generation.

    Returns:
        dict with the extended ``input_ids`` and ``attention_mask``.
    """
    assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
    batch = input_ids.shape[0]
    # A zero column keeps the dummy token out of attention.
    zero_column = attention_mask.new_zeros((attention_mask.shape[0], 1))
    extended_mask = torch.cat([attention_mask, zero_column], dim=-1)
    # One PAD id per sequence, appended at the end.
    pad_column = torch.full(
        (batch, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
    )
    extended_ids = torch.cat([input_ids, pad_column], dim=1)
    return {"input_ids": extended_ids, "attention_mask": extended_mask}
@add_start_docstrings(
    """Bert Model with a `next sentence prediction (classification)` head on top. """,
    BERT_START_DOCSTRING,
)
class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # Full BERT backbone (with pooler) plus the binary NSP head.
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertForNextSentencePrediction
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
            >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
            >>> logits = outputs.logits
            >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
        """
        # Backwards compatibility: map the deprecated `next_sentence_label`
        # kwarg onto `labels` (with a warning).
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # outputs[1]: pooled [CLS] representation of shape (batch, hidden).
        pooled_output = outputs[1]
        # (batch, 2) logits: is-next vs. random-next.
        seq_relationship_scores = self.cls(pooled_output)
        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
        if not return_dict:
            # Legacy tuple output: (loss?, logits, *extras).
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Backbone encoder, dropout on the pooled [CLS] vector, then a
        # single linear classification/regression layer.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=SequenceClassifierOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Pooled [CLS] vector -> dropout -> linear head.
        logits = self.classifier(self.dropout(encoder_out[1]))
        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # Single output unit: treat as regression.
                loss = MSELoss()(logits.view(-1), labels.view(-1))
            else:
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )
        # Legacy tuple output: (loss?, logits, *extras).
        tail = (logits,) + encoder_out[2:]
        return tail if loss is None else ((loss,) + tail)
@add_start_docstrings(
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForMultipleChoice(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # One scalar score per choice; softmax over choices happens in the loss.
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=MultipleChoiceModelOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Inputs arrive as (batch, num_choices, seq_len); flatten the choice
        # dimension into the batch so a single encoder pass scores all choices.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # outputs[1]: pooled [CLS] vector per (batch * num_choices) row.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Un-flatten back to (batch, num_choices) so CE runs over choices.
        reshaped_logits = logits.view(-1, num_choices)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
        if not return_dict:
            # Legacy tuple output: (loss?, logits, *extras).
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
    # The pooler is not used for token-level tasks, so ignore its weights
    # when loading checkpoints that contain one.
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=TokenClassifierOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Per-token hidden states -> dropout -> per-token label logits.
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                # Replace labels at padded positions with ignore_index (-100)
                # so they do not contribute to the cross-entropy.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            # Legacy tuple output: (loss?, logits, *extras).
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BERT_START_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
    # The pooler is not used for span prediction; ignore its weights on load.
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        # TF-IDF embedding layer. NOTE(review): defined here but never used in
        # forward(); it also requires a non-standard `config.max_len` field —
        # confirm whether this layer is still needed.
        self.tfidf_emb = nn.Embedding(config.max_len, config.hidden_size)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=QuestionAnsweringModelOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        # Two logits per token: split the last dim into start/end scores.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            # Total loss is the mean of the start- and end-position losses.
            total_loss = (start_loss + end_loss) / 2
        if not return_dict:
            # Legacy tuple output: (loss?, start_logits, end_logits, *extras).
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class Multilayer_perceptron(nn.Module):
    """Regression head mapping an encoder hidden vector to a single scalar.

    Despite the name, this is currently a single linear layer
    (hidden_size -> 1); the deeper variants were removed as dead
    commented-out code. The attribute name ``layer_1`` is kept so existing
    checkpoints still load.
    """

    def __init__(self, config):
        """Args: config: object exposing ``hidden_size``."""
        super(Multilayer_perceptron, self).__init__()
        self.layer_1 = nn.Linear(config.hidden_size, 1)

    def forward(self, bert_output):
        """Project ``bert_output`` of shape (..., hidden_size) to (..., 1)."""
        return self.layer_1(bert_output)
class BertAffinityModel(BertPreTrainedModel):
    """
    BERT encoder with a scalar regression head: the final hidden state of the
    first ([CLS]) token is fed through a small MLP (``Multilayer_perceptron``)
    to predict one affinity value per sequence.

    Token embeddings can optionally be augmented with a learned TF-IDF
    embedding looked up from ``tfidf_values``. The encoder can behave as a
    decoder (cross-attention) when the config has :obj:`is_decoder` set,
    mirroring the stock :class:`BertModel` wiring.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.mlp = Multilayer_perceptron(config)
        # NOTE(review): lm_head is unused in this forward; kept so existing
        # checkpoints (state_dict keys) remain loadable.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        # TF-IDF embedding table. NOTE(review): sized by vocab_size here but
        # by max_len in BertAffinityModel_MaskLM — confirm which is intended.
        self.tfidf_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        tfidf_values=None,
    ):
        r"""
        tfidf_values (:obj:`torch.LongTensor`, `optional`):
            Indices into the TF-IDF embedding table; when given, the looked-up
            embeddings are added elementwise to the token embeddings
            (assumes shape (batch_size, sequence_length) — TODO confirm).

        Returns:
            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, 1)` with the
            predicted affinity per sequence.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # Broadcast the 2D/3D attention mask to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
        # Cross-attention mask, only relevant in decoder mode.
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # head_mask: [num_heads] or [num_hidden_layers x num_heads] expanded
        # to [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        # Add TF-IDF embeddings only when provided; previously a None value
        # crashed inside nn.Embedding because the lookup was unconditional.
        if tfidf_values is not None:
            embedding_output = embedding_output + self.tfidf_emb(tfidf_values)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        # First-token ([CLS]) representation -> scalar affinity. Calling the
        # module (not .forward) so registered hooks run.
        cls_state = sequence_output[:, 0, :]
        pred_affinity = self.mlp(cls_state)
        return pred_affinity
class BertAffinityModel_MaskLM(BertPreTrainedModel):
    """
    BERT encoder with a language-modeling head: per-token hidden states are
    projected through ``lm_head`` to vocabulary logits (used for masked-LM
    style pre-training of the affinity model).

    Token embeddings can optionally be augmented with a learned TF-IDF
    embedding looked up from ``tfidf_values``. The encoder can behave as a
    decoder (cross-attention) when the config has :obj:`is_decoder` set,
    mirroring the stock :class:`BertModel` wiring.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        # NOTE(review): mlp is unused in this forward; kept so existing
        # checkpoints (state_dict keys) remain loadable.
        self.mlp = Multilayer_perceptron(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        # TF-IDF embedding table. NOTE(review): sized by max_len here but by
        # vocab_size in BertAffinityModel — confirm which is intended.
        self.tfidf_emb = nn.Embedding(config.max_len, config.hidden_size)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        tfidf_values=None,
    ):
        r"""
        tfidf_values (:obj:`torch.LongTensor`, `optional`):
            Indices into the TF-IDF embedding table; when given, the looked-up
            embeddings are added elementwise to the token embeddings
            (assumes shape (batch_size, sequence_length) — TODO confirm).

        Returns:
            :obj:`torch.FloatTensor` of shape
            :obj:`(batch_size, sequence_length, vocab_size)` with the
            language-modeling logits.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # Broadcast the 2D/3D attention mask to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
        # Cross-attention mask, only relevant in decoder mode.
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # head_mask: [num_heads] or [num_hidden_layers x num_heads] expanded
        # to [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        # Add TF-IDF embeddings only when provided; previously a None value
        # crashed inside nn.Embedding because the lookup was unconditional.
        if tfidf_values is not None:
            embedding_output = embedding_output + self.tfidf_emb(tfidf_values)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        # Per-token vocabulary logits for the masked-LM objective.
        logits = self.lm_head(sequence_output)
        return logits
# Select GPU 4. This must be exported: a bare `CUDA_VISIBLE_DEVICES=4` on its
# own line only sets a shell-local variable, which the `python` child process
# does NOT inherit.
export CUDA_VISIBLE_DEVICES=4
python run_prediction.py \
    --batch_size=56 \
    --task=train_mol \
    --epochs=100 \
    --lr=1e-5 \
    --savedir=pre-train-yzh \
    --config=./config/config_layer_6_mol.json
\ No newline at end of file
from argparse import ArgumentParser
import numpy as np
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
from torch.utils.tensorboard import SummaryWriter
import os
from tqdm import tqdm
# torch.set_default_tensor_type(torch.DoubleTensor)
def train(args, model, dataset, tokenizer, pre_train=False):
    """Train the affinity model on pre-encoded batches.

    Args:
        args: parsed CLI namespace (batch_size, workers, epochs, lr, savedir, init).
        model: BertAffinityModel to optimize in place.
        dataset: Dataset yielding (input_ids, token_type_ids, attention_mask, affinity).
        tokenizer: unused here (batches arrive already tokenized); kept so all
            train/test entry points share one call signature.
        pre_train: when True, warm-start weights from the checkpoint at args.init.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): with fewer than 10 total steps the original `num_step // 10`
    # was 0 and `step % save_step` raised ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): batch tensors are moved with .cuda() unconditionally below,
    # so a GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        for i, (input_ids, token_type_ids, attention_mask, affinity) in enumerate(data_generator):
            pred_affinity = model(input_ids=input_ids.cuda(),
                                  token_type_ids=token_type_ids.cuda(),
                                  attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.cuda().float().unsqueeze(-1))
            step += 1
            # .item() detaches and logs a plain float rather than a live tensor.
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs(exist_ok=True): os.mkdir failed when './model' was
                # missing and raced when concurrent runs created the directory.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test(args, model, dataset, tokenizer):
    """Run inference over a pre-encoded dataset and write one prediction per
    line to <args.output>/<args.task>.txt.

    Loads the checkpoint at args.init; `tokenizer` is unused (batches are
    already tokenized) and kept only for signature symmetry with test_mol.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): a GPU is required — inputs are moved with .cuda() below.
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            for input_ids, token_type_ids, attention_mask, affinity in tqdm(data_generator):
                pred_affinity = model(input_ids=input_ids.cuda(),
                                      token_type_ids=token_type_ids.cuda(),
                                      attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
def train_mol(args, model, dataset, tokenizer, pre_train=False):
    """Train on raw (sequence, affinity) pairs, tokenizing each batch on the fly.

    Args mirror train(); here `tokenizer.convert_token_to_ids` turns the raw
    string batch into (input_ids, attention_mask) tensors inside the loop.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): `num_step // 10` could be 0 for very short runs, making
    # `step % save_step` raise ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): tensors are moved with .cuda() unconditionally below, so a
    # GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        # renamed from `input`, which shadowed the builtin
        for i, (batch_input, affinity) in enumerate(data_generator):
            input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.to(torch.float32).cuda().unsqueeze(-1))
            step += 1
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs: os.mkdir failed when './model' was missing, and
                # exist_ok avoids the check-then-create race.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test_mol(args, model, dataset, tokenizer):
    """Predict affinities for raw-string batches, tokenizing on the fly.

    Writes one prediction per line to <args.output>/<args.task>.txt; loads the
    checkpoint at args.init first.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): GPU required — batch tensors are moved with .cuda().
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            # `batch_input` renamed from `input`, which shadowed the builtin
            for batch_input, affinity in tqdm(data_generator):
                input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
                pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
def main(args):
    """Build dataset/model/tokenizer for args.task and dispatch to the
    matching train/test routine.

    Raises:
        ValueError: if args.task matches none of the known task names.
    """
    data_file, tokenizer_config = get_task(args.task)
    # Molecule-style tasks use the generator dataset (raw strings, tokenized
    # inside the training/eval loop); everything else is pre-encoded.
    if args.task in ['train_mol', 'test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        dataset = Data_Gen(data_file)
    else:
        dataset = Data_Encoder(data_file, tokenizer_config)
    print('------------------creat model---------------------------')
    config = BertConfig.from_pretrained(args.config)
    model = BertAffinityModel(config)
    tokenizer = Tokenizer(tokenizer_config)
    print('model name : BertAffinity')
    print('task name : {}'.format(args.task))
    if args.task in ['train_mol']:
        train_mol(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        test_mol(args, model, dataset, tokenizer)
    elif args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100']:
        train(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test', 'test_ori_er', 'test_ori_gpcr', 'test_ori_channel', 'test_ori_kinase']:
        test(args, model, dataset, tokenizer)
    else:
        # Previously an unknown task fell through silently after the model had
        # already been built; fail loudly so misspelled tasks are caught.
        raise ValueError('unknown task: {}'.format(args.task))
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a boolean CLI value. The original used `type=bool`, which
        treats any non-empty string — including 'False' — as True."""
        return str(value).lower() in ('true', '1', 'yes', 'y')

    parser = ArgumentParser(description='BertAffinity')
    parser.add_argument('-b', '--batch-size', default=8, type=int,
                        metavar='N',
                        help='mini-batch size (default: 16), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                        help='number of data loading workers (default: 0)')
    parser.add_argument('--epochs', default=50, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--task', default='train', type=str, metavar='TASK',
                        help='Task name. Could be train, test, channel, ER, GPCR, kinase or else.')
    parser.add_argument('--lr', '--learning-rate', default=1e-5, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--config', default='./config/config.json', type=str, help='model config file path')
    parser.add_argument('--savedir', default='train', type=str, help='log and model save path')
    parser.add_argument('--init', default='model', type=str, help='init checkpoint')
    parser.add_argument('--output', default='predict', type=str, help='result save path')
    parser.add_argument('--pre_train', default=False, type=_str2bool, help='use pre-train')
    args = parser.parse_args()
    # BUG fixed: the script previously overwrote the parsed arguments with two
    # stacks of hard-coded experiment settings ("yzh new train" followed by
    # "yzh new test"), so every CLI flag was silently ignored and the process
    # always ran task='test_kinase' with a fixed checkpoint — the training
    # override block was dead code. Supply settings via CLI flags instead.
    main(args)
from argparse import ArgumentParser
import sys
import numpy as np
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
from torch.utils.tensorboard import SummaryWriter
import os
from tqdm import tqdm
# torch.set_default_tensor_type(torch.DoubleTensor)
# Make the repository root importable when the script is run from this file's
# own directory (so sibling packages resolve without installing the project).
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(ROOT_DIR)
def train(args, model, dataset, tokenizer, pre_train=False):
    """Train the affinity model on pre-encoded batches.

    Args:
        args: parsed CLI namespace (batch_size, workers, epochs, lr, savedir, init).
        model: BertAffinityModel to optimize in place.
        dataset: Dataset yielding (input_ids, token_type_ids, attention_mask, affinity).
        tokenizer: unused here (batches arrive already tokenized); kept so all
            train/test entry points share one call signature.
        pre_train: when True, warm-start weights from the checkpoint at args.init.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): with fewer than 10 total steps the original `num_step // 10`
    # was 0 and `step % save_step` raised ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): batch tensors are moved with .cuda() unconditionally below,
    # so a GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        for i, (input_ids, token_type_ids, attention_mask, affinity) in enumerate(data_generator):
            pred_affinity = model(input_ids=input_ids.cuda(),
                                  token_type_ids=token_type_ids.cuda(),
                                  attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.cuda().float().unsqueeze(-1))
            step += 1
            # .item() detaches and logs a plain float rather than a live tensor.
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs(exist_ok=True): os.mkdir failed when './model' was
                # missing and raced when concurrent runs created the directory.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test(args, model, dataset, tokenizer):
    """Run inference over a pre-encoded dataset and write one prediction per
    line to <args.output>/<args.task>.txt.

    Loads the checkpoint at args.init; `tokenizer` is unused (batches are
    already tokenized) and kept only for signature symmetry with test_mol.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): a GPU is required — inputs are moved with .cuda() below.
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            for input_ids, token_type_ids, attention_mask, affinity in tqdm(data_generator):
                pred_affinity = model(input_ids=input_ids.cuda(),
                                      token_type_ids=token_type_ids.cuda(),
                                      attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
def train_mol(args, model, dataset, tokenizer, pre_train=False):
    """Train on raw (sequence, affinity) pairs, tokenizing each batch on the fly.

    Args mirror train(); here `tokenizer.convert_token_to_ids` turns the raw
    string batch into (input_ids, attention_mask) tensors inside the loop.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): `num_step // 10` could be 0 for very short runs, making
    # `step % save_step` raise ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): tensors are moved with .cuda() unconditionally below, so a
    # GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        # renamed from `input`, which shadowed the builtin
        for i, (batch_input, affinity) in enumerate(data_generator):
            input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.to(torch.float32).cuda().unsqueeze(-1))
            step += 1
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs: os.mkdir failed when './model' was missing, and
                # exist_ok avoids the check-then-create race.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test_mol(args, model, dataset, tokenizer):
    """Predict affinities for raw-string batches, tokenizing on the fly.

    Writes one prediction per line to <args.output>/<args.task>.txt; loads the
    checkpoint at args.init first. Optionally runs eval.py afterwards.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): GPU required — batch tensors are moved with .cuda().
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            # `batch_input` renamed from `input`, which shadowed the builtin
            for batch_input, affinity in tqdm(data_generator):
                input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
                pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
    # BUG fixed: '--do_eval' is commented out of the argparse definition, so
    # `args.do_eval` did not exist and the original raised AttributeError
    # right after writing all predictions. getattr defaults to False.
    if getattr(args, 'do_eval', False):
        os.system('python eval.py')
def main(args):
    """Build dataset/model/tokenizer for args.task and dispatch to the
    matching train/test routine.

    Raises:
        ValueError: if args.task matches none of the known task names.
    """
    data_file, tokenizer_config = get_task(args.task)
    # Molecule-style tasks use the generator dataset (raw strings, tokenized
    # inside the training/eval loop); everything else is pre-encoded.
    if args.task in ['train_mol', 'test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        dataset = Data_Gen(data_file)
    else:
        dataset = Data_Encoder(data_file, tokenizer_config)
    print('------------------creat model---------------------------')
    config = BertConfig.from_pretrained(args.config)
    model = BertAffinityModel(config)
    tokenizer = Tokenizer(tokenizer_config)
    print('model name : BertAffinity')
    print('task name : {}'.format(args.task))
    if args.task in ['train_mol']:
        train_mol(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        test_mol(args, model, dataset, tokenizer)
    elif args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100']:
        train(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test', 'test_ori_er', 'test_ori_gpcr', 'test_ori_channel', 'test_ori_kinase']:
        test(args, model, dataset, tokenizer)
    else:
        # Previously an unknown task fell through silently after the model had
        # already been built; fail loudly so misspelled tasks are caught.
        raise ValueError('unknown task: {}'.format(args.task))
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a boolean CLI value. The original used `type=bool`, which
        treats any non-empty string — including 'False' — as True."""
        return str(value).lower() in ('true', '1', 'yes', 'y')

    parser = ArgumentParser(description='BertAffinity')
    parser.add_argument('-batch_size', default=8, type=int,
                        metavar='N',
                        help='mini-batch size (default: 16), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                        help='number of data loading workers (default: 0)')
    parser.add_argument('--epochs', default=50, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--task', default='train', type=str, metavar='TASK',
                        help='Task name. Could be train, test, channel, ER, GPCR, kinase or else.')
    parser.add_argument('--lr', '--learning-rate', default=1e-5, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--config', default='./config/config.json', type=str, help='model config file path')
    parser.add_argument('--savedir', default='train', type=str, help='log and model save path')
    parser.add_argument('--init', default='model', type=str, help='init checkpoint')
    parser.add_argument('--output', default='predict', type=str, help='result save path')
    # re-added: test_mol reads args.do_eval, but this flag had been commented
    # out, leaving the attribute undefined at runtime.
    parser.add_argument('--do_eval', default=False, type=_str2bool, help='run eval.py after prediction')
    parser.add_argument('--pre_train', default=False, type=_str2bool, help='use pre-train')
    args = parser.parse_args()
    main(args)
import re, collections
def get_stats(vocab):
    """Count the frequency of every adjacent symbol pair in the vocabulary.

    `vocab` maps a space-separated symbol string to its corpus frequency.
    Returns a defaultdict mapping (left, right) symbol pairs to summed counts.
    """
    pair_counts = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[(left, right)] += freq
    return pair_counts
def merge_vocab(pair, v_in):
    """Merge one symbol pair into a single symbol across the vocabulary.

    Every standalone occurrence of "a b" (the pair joined by a space) is
    rewritten as "ab"; word frequencies are carried over unchanged.
    """
    merged_symbol = ''.join(pair)
    # (?<!\S) / (?!\S): only match the pair when it is whitespace-delimited,
    # so symbols that merely contain the pair's text are left alone.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(merged_symbol, word): freq for word, freq in v_in.items()}
# Toy byte-pair-encoding demo: repeatedly merge the most frequent adjacent
# symbol pair until no pairs remain, printing each merge rule in order.
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
num_merges = 1000
for _ in range(num_merges):
    pair_stats = get_stats(vocab)
    if not pair_stats:
        # every word has collapsed to a single symbol — nothing left to merge
        break
    best_pair = max(pair_stats, key=pair_stats.get)
    vocab = merge_vocab(best_pair, vocab)
    print(best_pair)
# printed merge sequence:
# ('e', 's'), ('es', 't'), ('est', '</w>'), ('l', 'o'), ('lo', 'w'),
# ('n', 'e'), ('ne', 'w'), ('new', 'est</w>'), ('low', '</w>'),
# ('w', 'i'), ('wi', 'd'), ('wid', 'est</w>'),
# ('low', 'e'), ('lowe', 'r'), ('lower', '</w>')
\ No newline at end of file
# Run test_mol inference with the pre-trained checkpoint on GPU 1;
# predictions are written to ./predict/test/test_mol.txt.
CUDA_VISIBLE_DEVICES=1 python run_prediction.py \
--task=test_mol \
-batch_size=64 \
--output=./predict/test \
--config=./config/config_layer_6_mol.json \
--init=/notebook/our_model/model/pre-train-layer-6-1021/epoch-29-step-494220-loss-0.23760947585105896.pth
\ No newline at end of file
# Evaluate the masked-LM pre-training checkpoint on GPU 1.
# BUG fixed: a commented-out option sat in the middle of the backslash
# continuation chain; the `#` swallowed the rest of that joined line, so the
# trailing --init line was executed as a separate (failing) command. The
# retired option is preserved here, outside the command:
#   --output='model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073'
CUDA_VISIBLE_DEVICES=1 python run_pretraining.py \
  --batch-size=16 \
  --task=test-pre-train \
  --config=./config/config_layer_6_mol.json \
  --init='./model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073.pth'
\ No newline at end of file
# Fine-tune the interaction model on the biosnap task (GPU 0).
# BUG fixed: `--pre_train=False` was removed — the python side declares the
# flag with argparse `type=bool`, and bool('False') is True in Python, so
# passing the flag actually *enabled* pre-training. Omitting it keeps the
# intended default (False); --init is then ignored by the training code.
CUDA_VISIBLE_DEVICES=0 \
python run_interaction.py \
  --epochs=50 \
  --lr=1e-5 \
  --task=train_biosnap \
  --batch_size=4 \
  --config=./config/config_layer_6_mol.json \
  --init=/notebook/our_model/model/pre-train-new-100epochs-config_layer_6_mol/epoch-99-step-3294800-loss-0.0736498162150383.pth
\ No newline at end of file
6.339674062480976
1.4751794034241978
from subword_nmt.apply_bpe import BPE
import codecs
import collections
# Byte-pair encoders for drug SMILES strings and protein sequences, built
# from pre-computed merge tables. merges=-1 applies every merge rule in the
# codes file; separator='' joins sub-tokens without a boundary marker.
bpe_codes_drug = codecs.open('../config/drug_codes_chembl.txt')
dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
bpe_codes_prot = codecs.open('../config/protein_codes_uniprot.txt')
pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
def load_file(file):
    """Read a text file and return its lines with trailing newlines removed."""
    with open(file, 'r') as handle:
        return [line.strip('\n') for line in handle.readlines()]
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary mapping token -> line index."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, token in enumerate(reader.readlines()):
            vocab[token.rstrip("\n")] = index
    return vocab
def seq2vec(protein, drug):
    """BPE-tokenize paired protein/drug files and print, per pair, the number
    of protein sub-tokens.

    NOTE(review): `tokens` is assembled as [CLS] drug [SEP] protein [SEP] but
    never returned or stored — presumably a leftover from a vectorization
    step; confirm before relying on this function beyond the printed counts.
    """
    start_token = '[CLS]'
    sep_token = '[SEP]'
    protein_lines = load_file(protein)
    drug_lines = load_file(drug)
    for prot_line, drug_line in zip(protein_lines, drug_lines):
        drug_tokens = dbpe.process_line(drug_line).split()
        prot_tokens = pbpe.process_line(prot_line).split()
        tokens = [start_token] + drug_tokens + [sep_token] + prot_tokens + [sep_token]
        print(len(prot_tokens))
if __name__ == '__main__':
    # NOTE(review): this pairs the *test* protein file with the *train*
    # SMILES file (and `simle` looks like a typo for `smile`) — confirm the
    # pairing is intentional before trusting the printed token counts.
    seq = '../data/test/test_protein_seq'
    simle = '../data/train/train_smile'
    vocab = '../config/vocab_mol.txt'
    seq2vec(seq, simle)
\ No newline at end of file
import pandas as pd
import numpy as np
# Build the combined vocabulary file: special tokens first, then the protein
# sub-word units, then the drug sub-word units, one token per line.
drug_units = pd.read_csv('../config/subword_units_map_chembl.csv')['index'].values
protein_units = pd.read_csv('../config/subword_units_map_uniprot.csv')['index'].values
special_tokens = np.array(['[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]',
                           '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]'])
all_tokens = np.concatenate((special_tokens, protein_units, drug_units))
with open('../config/vocab_mol.txt', 'w') as vocab_file:
    for token in all_tokens:
        vocab_file.write(str(token) + '\n')
import numpy as np
from tqdm import tqdm
def z_score(data, save, enlarge):
    """Standardize the affinities in `data` (one float per line) and write the
    z-scores, multiplied by `enlarge`, to `save` (one value per line)."""
    with open(data, 'r') as source:
        values = np.array([np.float64(line.strip()) for line in source.readlines()])
    standardized = (values - np.mean(values)) / np.std(values)
    standardized = standardized * enlarge
    with open(save, 'w') as sink:
        for value in tqdm(list(standardized)):
            sink.write(str(value) + '\n')
def reform(input_file_path, result_save_path, average, std, enlarge):
    """Invert the z-score transform: map each standardized prediction back to
    the original affinity scale via the stored mean/std and enlarge factor."""
    with open(input_file_path, 'r') as source:
        predictions = source.readlines()
    with open(result_save_path, 'w') as sink:
        for line in tqdm(predictions):
            restored = ((float(line.strip()) / enlarge) * std) + average
            sink.write(str(restored) + '\n')
if __name__ == '__main__':
    # Mean/std of the training affinities, used to undo the z-score transform.
    # NOTE(review): these constants appear to be precomputed from the training
    # set (they match the values stored alongside the data) — confirm they
    # match the run whose predictions are being reformed.
    average = 6.339674062480976
    std = 1.4751794034241978
    # Generate a z-scored dataset (kept for reference):
    # data = '../data/train_ic50'
    # save = '../data/train_z_1_ic50'
    # enlarge = 1
    # z_score(data, save, enlarge)
    # Map standardized predictions back to the original affinity scale.
    result = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test_1.txt'
    save = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test.txt'
    reform(result, save, average, std, 1)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论