提交 074f30dc 作者: 杨志辉

Initial commit

上级
model/
model0/
tfrecord/
train/
logs/
log/
visualize_attention/attention_mat.npy
baselines/
case_study/
experment_result/
predict/
doc/
utils/train_data_analyse.csv
utils/test_data_analyse.csv
utils/data_analyse.xlsx
*.ipynb_checkpoints/
.idea/
.DS_Store
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
{
// 使用 IntelliSense 了解相关属性。
// 悬停以查看现有属性的描述。
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: 当前文件",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"cwd": "${fileDirname}"
}
]
}
\ No newline at end of file
## Title ##
Advancing Drug-Target Interaction Prediction with BERT and Subsequence Embedding
## Abstract ##
Exploring the relationship between proteins and drugs plays a significant role in discovering new synthetic drugs. Drug-Target Interaction (DTI) prediction is a fundamental task in modeling the relationship between proteins and drugs. Unlike encoding proteins by individual amino acids, we use amino acid subsequences to encode proteins, which better simulates the biological process of DTI. For this purpose, we propose a novel deep learning framework based on Bidirectional Encoder Representations from Transformers (BERT), which integrates high-frequency subsequence embedding and transfer learning to complete the DTI prediction task. As the first key module, subsequence embedding allows us to explore the functional interaction units in drug and protein sequences, which in turn contributes to finding DTI modules. As the second key module, transfer learning helps the model learn common DTI features from protein and drug sequences in a large dataset. Overall, the BERT-based model can learn two kinds of features through the multi-head self-attention mechanism: internal features of each sequence, and interaction features between proteins and drugs. Compared with other methods, BERT-based methods enable more DTI-related features to be discovered from the general features of proteins and drugs through transfer learning. We conducted extensive experiments on the DTI prediction task on three different benchmark datasets. The experimental results show that the model achieves average prediction metrics higher than most baseline methods. To verify the importance of transfer learning, we conducted an ablation study on the datasets, and the results show the superiority of transfer learning. In addition, we tested the scalability of the model on datasets with unseen drugs and proteins, and the experimental results show that its scalability is acceptable.
## How to use ##
Use the following command to pretrain the model:
```shell
sh pretrain.sh
```
Use the following command to fine-tune the model:
```shell
sh fine_tune.sh
```
Use the following command to predict the binding affinity score:
```shell
sh test.sh
```
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"max_len": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 9,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "00ef6494-43ce-43de-b91c-a8039d19fdcb",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "84cdc51a-91e9-4af5-8404-5ec2b5059044",
"metadata": {},
"outputs": [],
"source": [
"# 读取分子token字典\n",
"with open('subword_units_map_chembl.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" chembl_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" chembl_token_dict[token] = frequency\n",
" # print(token, frequency)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "82e79dba-ec91-463e-821e-778197f240d7",
"metadata": {},
"outputs": [],
"source": [
"# 读取蛋白质token字典\n",
"with open('subword_units_map_uniprot.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" uniprot_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" uniprot_token_dict[token] = frequency\n",
" # print(token, frequency) "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "5ed22d0d-87e5-47d6-9f1f-914f904150c2",
"metadata": {},
"outputs": [],
"source": [
"#创建一个special token字典\n",
"special_token_dict = {\n",
" '[PAD]': 1,\n",
" '[MASK]': 1,\n",
" '[CLS]': 1,\n",
" '[SEP]': 1,\n",
" '[UNK]': 1,\n",
" '[unused1]': 1,\n",
" '[unused2]': 1,\n",
" '[unused3]': 1,\n",
" '[unused4]': 1,\n",
" '[unused5]': 1\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "8e0472c9-1cdb-4fd5-8483-72acc0308e58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L\n",
"V\n",
"S\n",
"I\n",
"T\n",
"A\n",
"R\n",
"M\n",
"P\n",
"H\n",
"Z\n",
"F\n",
"K\n",
"O\n",
"B\n",
"C\n",
"X\n",
"N\n",
"SS\n",
"NN\n",
"CS\n",
"FC\n",
"CN\n",
"NC\n",
"CC\n",
"CCS\n",
"CCN\n"
]
}
],
"source": [
"token_frequency = {}\n",
"for token, frequency in chembl_token_dict.items():\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in uniprot_token_dict.items():\n",
" if token in token_frequency:\n",
" print(token)\n",
" token_frequency[token] += frequency\n",
" else:\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in special_token_dict.items():\n",
" token_frequency[token] = frequency"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a6e2d04c-900b-4446-844a-4cd9e8d1cfa2",
"metadata": {},
"outputs": [],
"source": [
"#存储到pickle中\n",
"with open('token_frequency.pickle', 'wb') as f:\n",
" pickle.dump(token_frequency, f)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "64bc73cc-aae2-4741-a98b-aae147c705fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"字典中的item数量为: 40208\n"
]
}
],
"source": [
"# 从pickle文件中读取字典\n",
"with open('token_frequency.pickle', 'rb') as f:\n",
" token_frequency = pickle.load(f)\n",
"\n",
"# 获取字典中的item数量\n",
"num_items = len(token_frequency.items())\n",
"\n",
"# 输出item数量\n",
"print(\"字典中的item数量为:\", num_items)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "33d4a5e1-4a03-41ae-a0ee-f60059d571c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_frequency[')N7']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b55735c-e495-4d7b-bf9f-600e665bf8db",
"metadata": {},
"outputs": [],
"source": [
"###注意:蛋白质token有16,693个,分子token有23,532个,special token有10个,共计40,235个\n",
"###创建的pickle文件中有分子和蛋白质交叉的字符,所以合并后有40208个\n",
"#L\n",
"# V\n",
"# S\n",
"# I\n",
"# T\n",
"# A\n",
"# R\n",
"# M\n",
"# P\n",
"# H\n",
"# Z\n",
"# F\n",
"# K\n",
"# O\n",
"# B\n",
"# C\n",
"# X\n",
"# N\n",
"# SS\n",
"# NN\n",
"# CS\n",
"# FC\n",
"# CN\n",
"# NC\n",
"# CC\n",
"# CCS\n",
"# CCN"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import csv
import pickle
# In[24]:
# Read the molecule (drug) token dictionary from the ChEMBL subword-unit map.
# Column layout assumed: index 2 = token, index 3 = frequency -- confirm
# against the CSV header if the file format changes.
# (Removed dead assignment `index = token`, which was never used.)
with open('subword_units_map_chembl.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    chembl_token_dict = {}
    for row in reader:
        token = row[2]
        frequency = int(row[3])
        chembl_token_dict[token] = frequency
# In[25]:
# Read the protein token dictionary from the UniProt subword-unit map.
# Same column layout as the ChEMBL file: index 2 = token, index 3 = frequency.
# (Removed dead assignment `index = token`, which was never used.)
with open('subword_units_map_uniprot.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    uniprot_token_dict = {}
    for row in reader:
        token = row[2]
        frequency = int(row[3])
        uniprot_token_dict[token] = frequency
# In[26]:
# Special tokens required by the BERT-style vocabulary; each one gets a
# placeholder frequency of 1.
_special_tokens = (
    '[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]',
    '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]',
)
special_token_dict = dict.fromkeys(_special_tokens, 1)
# In[28]:
# Merge the molecule, protein and special token tables into one frequency
# dictionary.  Tokens present in both the ChEMBL and UniProt vocabularies
# have their frequencies summed; each overlap is printed for inspection.
token_frequency = dict(chembl_token_dict)  # idiomatic copy replaces the manual loop

for token, frequency in uniprot_token_dict.items():
    if token in token_frequency:
        print(token)
        token_frequency[token] += frequency
    else:
        token_frequency[token] = frequency

# Special tokens are assigned last (frequency 1 each), matching the original
# per-item assignment loop.
token_frequency.update(special_token_dict)
# In[20]:
# Persist the merged token-frequency table to a pickle file for later use.
with open('token_frequency.pickle', 'wb') as f:
    pickle.dump(token_frequency, f)
# In[21]:
# Reload the dictionary from the pickle file as a round-trip sanity check.
with open('token_frequency.pickle', 'rb') as f:
    token_frequency = pickle.load(f)
# Count the number of (token, frequency) items.
num_items = len(token_frequency.items())
# Report the item count (the printed message is intentionally in Chinese).
print("字典中的item数量为:", num_items)
# In[23]:
# Notebook artifact: this bare expression displayed the frequency of the
# token ")N7" in Jupyter; as a script it evaluates and discards the value.
token_frequency[')N7']
# In[ ]:
### NOTE: there are 16,693 protein tokens, 23,532 molecule tokens and 10
### special tokens, 40,235 in total.
### The pickle ends up with 40,208 entries because the following 27 tokens
### occur in both the molecule and the protein vocabularies:
#L
# V
# S
# I
# T
# A
# R
# M
# P
# H
# Z
# F
# K
# O
# B
# C
# X
# N
# SS
# NN
# CS
# FC
# CN
# NC
# CC
# CCS
# CCN
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """
from transformers.configuration_utils import PretrainedConfig
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
"bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
"bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
"bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
"bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
"bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
"bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
"bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
"bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
"bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
"bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
"bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
"bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
"cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
"wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
"BertAffinity": "./config/config.json"
# See all BERT models at https://huggingface.co/models?filter=bert
}
class BertConfig(PretrainedConfig):
    r"""Configuration for :class:`~transformers.BertModel` /
    :class:`~transformers.TFBertModel`.

    Instantiating with the defaults yields a configuration similar to the
    BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__
    architecture.  Configuration objects inherit from
    :class:`~transformers.PretrainedConfig`, which documents the common
    options and is used to control the model outputs.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Size of the token vocabulary accepted via :obj:`inputs_ids`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads per attention layer.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (feed-forward) layer.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            Non-linear activation in encoder and pooler; as a string one of
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu_new"`.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout probability for all fully connected layers in the
            embeddings, encoder and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            Maximum sequence length the model might ever see; typically set
            large just in case (e.g. 512, 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            Vocabulary size of :obj:`token_type_ids`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            Stddev of the truncated-normal initializer for weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            Epsilon used by the layer-normalization layers.
        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, trade a slower backward pass for lower memory use.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            One of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`; see `Shaw et al.
            <https://arxiv.org/abs/1803.02155>`__ and `Huang et al.
            <https://arxiv.org/abs/2009.13658>`__ for the relative variants.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to return the last key/value attentions; only relevant
            when ``config.is_decoder=True``.

    Examples::

        >>> from transformers import BertModel, BertConfig
        >>> configuration = BertConfig()   # bert-base-uncased style defaults
        >>> model = BertModel(configuration)
        >>> configuration = model.config
    """
    model_type = "bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        # Vocabulary and embedding table sizes.
        self.vocab_size = vocab_size
        self.type_vocab_size = type_vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.position_embedding_type = position_embedding_type
        # Transformer geometry.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        # Regularisation and numerics.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        # Runtime behaviour.
        self.gradient_checkpointing = gradient_checkpointing
        self.use_cache = use_cache
import numpy as np
import re
def eval_result(pred, label):
    """Compute RMSE and the Pearson correlation matrix of two sequences.

    Args:
        pred: sequence of predicted affinity values.
        label: sequence of ground-truth affinity values (same length).

    Returns:
        Tuple ``(rmse, pearson_co)`` where ``rmse`` is a float and
        ``pearson_co`` is the 2x2 matrix from ``np.corrcoef``; the
        correlation coefficient itself is at ``pearson_co[0, 1]``.
    """
    pred = np.asarray(pred, dtype=float)
    label = np.asarray(label, dtype=float)
    # np.mean replaces the manual sum/len bookkeeping; ** replaces np.power.
    rmse = np.sqrt(np.mean((pred - label) ** 2))
    pearson_co = np.corrcoef(pred, label)
    return rmse, pearson_co
def eval(pred_path, label_path):
    """Evaluate predictions in *pred_path* against labels in *label_path*.

    Both files contain one float per line.  Writes the metrics to an
    ``eval_results`` file in the prediction file's directory and echoes
    them to stdout.

    NOTE(review): the name shadows the ``eval`` builtin; kept unchanged so
    existing callers keep working.
    """
    # Iterating the file object directly replaces readlines() + list comp.
    with open(pred_path, 'r') as f:
        pred = [float(line.strip()) for line in f]
    with open(label_path, 'r') as f:
        label = [float(line.strip()) for line in f]
    rmse, r_mat = eval_result(pred, label)  # fixed local-name typo "remse"
    r = r_mat[0, 1]
    # Save next to the prediction file by swapping the file name component.
    file = pred_path.split("/")[-1]
    save_path = pred_path.replace(file, 'eval_results')
    # Build the message once instead of duplicating the format string.
    message = 'RMSE : {} ; Pearson Correlation Coefficient : {}'.format(rmse, r)
    with open(save_path, 'w') as f:
        f.write(message)
    print(message)
if __name__ == '__main__':
    # Ground-truth binding-affinity (IC50) label files: the generic test set
    # plus the four protein-family test sets.
    test_label_path = './data/test/test_ic50'
    test_label_path_ER = './data/ER/ER_ic50'
    test_label_path_GPCR = './data/GPCR/GPCR_ic50'
    test_label_path_Ion_channel = './data/Ion_channel/channel_ic50'
    test_label_path_Tyrosine_kinase = './data/Tyrosine_kinase/kinase_ic50'

    # Prediction files for the frequency-embedding run
    # (pre-train-layer-6-1021-freq).  Swap these five paths to evaluate a
    # different model run; removed the stale commented-out path sets for the
    # earlier runs and the DeepDTA / AttentionDTA baselines.
    pred_test = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_mol.txt"
    er = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_er.txt"
    gpcr = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_gpcr.txt"
    channel = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_channel.txt"
    kinase = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_kinase.txt"

    pred_list = [pred_test, er, gpcr, channel, kinase]
    label_list = [test_label_path, test_label_path_ER, test_label_path_GPCR,
                  test_label_path_Ion_channel, test_label_path_Tyrosine_kinase]
    # Evaluate every (prediction, label) pair and report RMSE / Pearson r.
    for i, j in zip(pred_list, label_list):
        print(i)
        eval(i, j)
# Fine-tune the interaction model from a pre-trained checkpoint on GPU 1.
# Fixed single-dash "-batch_size=4" -> "--batch_size=4" (argparse long
# options require the double-dash prefix, as used everywhere else here).
# NOTE(review): savedir says "batch-64" but batch_size is 4 -- confirm which
# is intended.
CUDA_VISIBLE_DEVICES=1 python run_interaction.py \
--batch_size=4 --task=train_mol --epochs=30 --lr=1e-5 \
--savedir=lr-1e-5-batch-64-e-30-layer6-1125-new \
--config=./config/config_layer_6_mol.json \
--output='./predict/test_new' \
--pre_train=True \
--init='./saved_model/train/epoch-23-step-790752-loss-0.12734022736549377.pth'
from yaml import load
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets
import os
# Restrict this script to GPU 5 by limiting CUDA device visibility.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
def load_embedding(data_file):
    """Load per-sample subsequence embeddings from *data_file*.

    Feeds each sample through the model's embedding layer and splits the
    embedded sequence at the [SEP] token into a drug part and a protein part.

    Returns:
        Tuple ``(all_drug, all_protein)`` -- two lists with one numpy array
        per sample (drug and protein subsequence embeddings respectively).
    """
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    sep_id = 3  # assumed vocabulary id of [SEP] -- TODO confirm against vocab_mol.txt
    dataset = Data_Gen(data_file)
    # batch_size=1 so every iteration yields exactly one sample.
    data_generator = DataLoader(dataset, batch_size=1, shuffle=False)
    config = BertConfig.from_pretrained('./config/config_layer_6_mol.json')
    model = BertAffinityModel(config)
    # Hard-coded checkpoint of the fine-tuned affinity model.
    model.load_state_dict(torch.load('./model/add_pretrain_1019/epoch-9-step-329480-loss-0.736057146887367.pth'), strict=True)
    all_drug = []
    all_protein = []
    for i, (input, affinity) in enumerate(data_generator):
        # input = input[1:]
        input_ids, attention_mask = tokenizer.convert_token_to_ids(input)
        input_embs = model.embeddings(input_ids)
        # Locate [SEP]; the last position is excluded so only the separator
        # between drug and protein is matched.
        sep_index = torch.where(input_ids[:, :-1] == sep_id)[-1]
        # Sequence layout presumably [CLS] drug... [SEP] protein... [SEP]
        # -- TODO confirm against the tokenizer.
        drug_emb = input_embs[:, 1:sep_index].squeeze(0).detach().numpy()
        protein_embs = input_embs[:, sep_index+1:-1].squeeze(0).detach().numpy()
        all_drug.append(drug_emb)
        all_protein.append(protein_embs)
    return all_drug, all_protein
def plot_drug_protein(save):
    """t-SNE scatter comparing drug vs. protein subsequence embeddings.

    Saves the figure to *save*: drugs as teal triangles, proteins as blue
    squares.
    """
    drug_embs, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    drug_matrix = np.concatenate(drug_embs)
    # Truncate the protein points to the same count as the drug points.
    protein_matrix = np.concatenate(protein_embs)[:len(drug_matrix)]
    combined = np.concatenate((drug_matrix, protein_matrix))
    group_ids = np.array([0] * len(drug_matrix) + [1] * len(protein_matrix))
    # Project to 2-D with t-SNE (fixed seed for reproducibility).
    projected = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(combined)
    # Plot without axes; high DPI for publication quality.
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    is_drug = group_ids == 0
    is_protein = group_ids == 1
    plt.scatter(projected[is_drug, 0], projected[is_drug, 1], c="darkcyan", s=5, label="Drug", marker='^')
    plt.scatter(projected[is_protein, 0], projected[is_protein, 1], c="deepskyblue", s=5, label="Protein", marker="s")
    plt.legend(labels=["Drug", "Protein"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_protein_sub(save):
    """t-SNE scatter of subsequence embeddings for three sample proteins
    (PTPH1, mGluRs, EZH2), saved to *save*."""
    _, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    # First three proteins from the sample file: PTPH1, mGluRs, EZH2.
    groups = protein_embs[:3]
    stacked = np.concatenate(groups)
    group_ids = np.concatenate([np.full(len(g), k) for k, g in enumerate(groups)])
    # 2-D projection with a fixed seed for reproducibility.
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(stacked)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[group_ids == 0, 0], embedded[group_ids == 0, 1], c="darkcyan", s=5, label="PTPH1", marker='^')
    plt.scatter(embedded[group_ids == 1, 0], embedded[group_ids == 1, 1], c="deepskyblue", s=5, label="mGluRs", marker="s")
    plt.scatter(embedded[group_ids == 2, 0], embedded[group_ids == 2, 1], c="salmon", s=5, label="EZH2")
    plt.legend(labels=["PTPH1", "mGluRs", "EZH2"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_drug_sub(save):
    """t-SNE scatter of subsequence embeddings for three sample drugs,
    saved to *save*."""
    drug_embs, _ = load_embedding("add_figure/sample_data/test_sample")
    # First three drugs from the sample file.
    groups = drug_embs[:3]
    stacked = np.concatenate(groups)
    group_ids = np.concatenate([np.full(len(g), k) for k, g in enumerate(groups)])
    # 2-D projection with a fixed seed for reproducibility.
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(stacked)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[group_ids == 0, 0], embedded[group_ids == 0, 1], c="darkcyan", s=5, label="Drug_1", marker='^')
    plt.scatter(embedded[group_ids == 1, 0], embedded[group_ids == 1, 1], c="deepskyblue", s=5, label="Drug_2", marker="s")
    plt.scatter(embedded[group_ids == 2, 0], embedded[group_ids == 2, 1], c="salmon", s=5, label="Drug_3")
    plt.legend(labels=["Drug_1", "Drug_2", "Drug_3"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
if __name__ == '__main__':
    # Generate the drug-vs-protein t-SNE figure; the per-drug / per-protein
    # variants can be re-enabled below.
    plot_drug_protein("drug_and_protein_sub")
    # plot_drug_sub("three_drug_sub")
    # plot_protein_sub("three_protein_sub")
# Pre-train the prediction model on GPU 4.
# The GPU selection must be on the same line as the command (or exported):
# a bare `CUDA_VISIBLE_DEVICES=4` line only sets a shell-local variable
# that the python process never sees.
CUDA_VISIBLE_DEVICES=4 python run_prediction.py \
--batch_size=56 \
--task=train_mol \
--epochs=100 \
--lr=1e-5 \
--savedir=pre-train-yzh \
--config=./config/config_layer_6_mol.json
import re, collections
def get_stats(vocab):
pairs = collections.defaultdict(int)
for word, freq in vocab.items():
symbols = word.split()
for i in range(len(symbols)-1):
pairs[symbols[i],symbols[i+1]] += freq
return pairs
def merge_vocab(pair, v_in):
    """Return a copy of *v_in* with every standalone occurrence of *pair* merged.

    The two symbols of *pair* (separated by a space in the vocab keys) are
    fused into a single symbol; frequencies are carried over unchanged.
    """
    merged_symbol = ''.join(pair)
    # Lookaround anchors ensure the pair is bounded by whitespace or
    # string edges, so partial symbol matches are not merged.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(merged_symbol, word): freq for word, freq in v_in.items()}
# Toy corpus for the BPE demo: each key is a space-separated symbol
# sequence ending in the end-of-word marker </w>; values are frequencies.
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
num_merges = 1000
for _ in range(num_merges):
    pair_counts = get_stats(vocab)
    if not pair_counts:
        # Every word has collapsed into a single symbol — nothing left to merge.
        break
    best_pair = max(pair_counts, key=pair_counts.get)
    vocab = merge_vocab(best_pair, vocab)
    print(best_pair)
# Expected printed output:
# ('e', 's')
# ('es', 't')
# ('est', '</w>')
# ('l', 'o')
# ('lo', 'w')
# ('n', 'e')
# ('ne', 'w')
# ('new', 'est</w>')
# ('low', '</w>')
# ('w', 'i')
# ('wi', 'd')
# ('wid', 'est</w>')
# ('low', 'e')
# ('lowe', 'r')
# ('lower', '</w>')
\ No newline at end of file
# Run molecular test-set inference from a trained checkpoint on GPU 1.
# BUG FIX: "-batch_size=64" used a single leading dash; the sibling
# run_prediction.py invocations all pass "--batch_size", so the single-dash
# form would be rejected as an unrecognized argument.
CUDA_VISIBLE_DEVICES=1 python run_prediction.py \
    --task=test_mol \
    --batch_size=64 \
    --output=./predict/test \
    --config=./config/config_layer_6_mol.json \
    --init=/notebook/our_model/model/pre-train-layer-6-1021/epoch-29-step-494220-loss-0.23760947585105896.pth
\ No newline at end of file
# Evaluate the masked-LM pre-training checkpoint on GPU 1.
# BUG FIX: a commented-out "--output=..." line sat in the middle of the
# backslash-continued command. The trailing "\" on the line above spliced
# the comment into the command line, the comment then swallowed its own
# trailing "\", and "--init=..." was executed as a standalone (failing)
# command while python ran without --init. The disabled option now lives
# after the command where a comment is harmless.
CUDA_VISIBLE_DEVICES=1 python run_pretraining.py \
    --batch-size=16 \
    --task=test-pre-train \
    --config=./config/config_layer_6_mol.json \
    --init='./model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073.pth'
# --output='model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073'
\ No newline at end of file
# Fine-tune the drug-target interaction model on the BioSNAP task,
# initialized from the 100-epoch pre-trained checkpoint, on GPU 0.
# NOTE(review): "--pre_train=False" reaches argparse as the non-empty string
# "False", which is truthy under a plain bool() type converter — confirm
# run_interaction.py parses this flag explicitly.
CUDA_VISIBLE_DEVICES=0 \
python run_interaction.py \
--epochs=50 \
--lr=1e-5 \
--task=train_biosnap \
--batch_size=4 \
--config=./config/config_layer_6_mol.json \
--pre_train=False \
--init=/notebook/our_model/model/pre-train-new-100epochs-config_layer_6_mol/epoch-99-step-3294800-loss-0.0736498162150383.pth
\ No newline at end of file
6.339674062480976
1.4751794034241978
from subword_nmt.apply_bpe import BPE
import codecs
import collections

# Module-level BPE tokenizers: drug SMILES strings use ChEMBL-derived merge
# codes, protein sequences use UniProt-derived codes. merges=-1 applies every
# rule in the codes file; separator='' joins sub-words without a marker.
# NOTE(review): the codecs.open handles are never closed — presumably BPE()
# consumes them eagerly at construction time, but confirm.
bpe_codes_drug = codecs.open('../config/drug_codes_chembl.txt')
dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
bpe_codes_prot = codecs.open('../config/protein_codes_uniprot.txt')
pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
def load_file(file):
    """Read *file* and return its lines with surrounding newlines stripped."""
    with open(file, 'r') as handle:
        # strip('\n') mirrors the original behavior: only newline characters
        # are removed, other whitespace (including '\r') is preserved.
        return [line.strip('\n') for line in handle.readlines()]
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary.

    Each line of *vocab_file* is one token; the returned OrderedDict maps
    token -> zero-based line index.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, raw_token in enumerate(reader.readlines()):
            vocab[raw_token.rstrip("\n")] = index
    return vocab
def seq2vec(protein, drug):
    """BPE-tokenize paired protein/drug files and print each protein's token count.

    protein / drug: paths to line-aligned files of protein sequences and
    SMILES strings; pairs are formed line-by-line.
    """
    start_token = '[CLS]'
    sep_token = '[SEP]'
    prots = load_file(protein)
    drugs = load_file(drug)
    for prot_seq, smile in zip(prots, drugs):
        drug_tokens = dbpe.process_line(smile).split()
        prot_tokens = pbpe.process_line(prot_seq).split()
        # BERT-style layout: [CLS] drug [SEP] protein [SEP].
        # NOTE(review): tokens is built but otherwise unused here.
        tokens = [start_token] + drug_tokens + [sep_token] + prot_tokens + [sep_token]
        print(len(prot_tokens))
if __name__ == '__main__':
    # NOTE(review): the protein path points at the *test* split while the
    # SMILES path points at the *train* split — confirm this pairing is
    # intentional; "simle" is presumably a typo for "smile".
    seq = '../data/test/test_protein_seq'
    simle = '../data/train/train_smile'
    # vocab path is defined but unused in this entry point.
    vocab = '../config/vocab_mol.txt'
    seq2vec(seq, simle)
\ No newline at end of file
import pandas as pd
import numpy as np

# Build the combined vocabulary file: special tokens first, then the
# UniProt protein sub-words, then the ChEMBL drug sub-words, one per line.
drug_subwords = pd.read_csv('../config/subword_units_map_chembl.csv')['index'].values
protein_subwords = pd.read_csv('../config/subword_units_map_uniprot.csv')['index'].values
special_tokens = np.array(['[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]'])
all_tokens = np.concatenate((special_tokens, protein_subwords, drug_subwords))
save = '../config/vocab_mol.txt'
with open(save, 'w') as f:
    for token in all_tokens:
        f.write(str(token) + '\n')
import numpy as np
from tqdm import tqdm
def z_score(data, save, enlarge):
    """Z-score-normalize affinity values and write the scaled result.

    data: path to a file with one numeric affinity per line.
    save: output path; one normalized value per line.
    enlarge: scale factor applied after standardization.
    """
    with open(data, 'r') as f:
        affinities = np.array([np.float64(line.strip()) for line in f.readlines()])
    # Standardize to zero mean / unit variance, then scale.
    normalized = (affinities - np.mean(affinities)) / np.std(affinities) * enlarge
    with open(save, 'w') as f:
        for aff in tqdm(list(normalized)):
            f.write(str(aff) + '\n')
def reform(input_file_path, result_save_path, average, std, enlarge):
    """Invert z-score scaling: map predictions back to the original affinity scale.

    input_file_path: file of scaled predictions, one per line.
    result_save_path: output file of de-normalized values, one per line.
    average / std: statistics used for the forward z-score transform.
    enlarge: scale factor that was applied after standardization.
    """
    with open(input_file_path, 'r') as f:
        predictions = f.readlines()
    with open(result_save_path, 'w') as out:
        for line in tqdm(predictions):
            # Undo the forward transform: value = ((x - avg) / std) * enlarge.
            original = ((float(line.strip()) / enlarge) * std) + average
            out.write(str(original) + '\n')
if __name__ == '__main__':
    # Dataset-wide affinity statistics used for (de)normalization.
    # NOTE(review): these constants must match the split the z-scored data
    # was generated from — confirm against the data pipeline.
    average = 6.339674062480976
    std = 1.4751794034241978
    # Generate z-score dataset:
    # data = '../data/train_ic50'
    # save = '../data/train_z_1_ic50'
    # enlarge = 1
    # z_score(data, save, enlarge)
    # Reform result: map model predictions back to the original affinity scale.
    result = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test_1.txt'
    save = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test.txt'
    reform(result, save, average, std, 1)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论