提交 074f30dc 作者: 杨志辉

Initial commit

上级
model/
model0/
tfrecord/
train/
logs/
log/
visualize_attention/attention_mat.npy
baselines/
case_study/
experment_result/
predict/
doc/
utils/train_data_analyse.csv
utils/test_data_analyse.csv
utils/data_analyse.xlsx
*.ipynb_checkpoints/
.idea/
.DS_Store
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
{
// 使用 IntelliSense 了解相关属性。
// 悬停以查看现有属性的描述。
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: 当前文件",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"cwd": "${fileDirname}"
}
]
}
\ No newline at end of file
## Title ##
Advancing Drug-Target Interaction Prediction with BERT and Subsequence Embedding
## Abstract ##
Exploring the relationship between proteins and drugs plays a significant role in discovering new synthetic drugs. Drug-Target Interaction (DTI) prediction is a fundamental task in modeling the relationship between proteins and drugs. Unlike encoding proteins by individual amino acids, we use amino acid subsequences to encode proteins, which better simulates the biological process of DTI. For this purpose, we propose a novel deep learning framework based on Bidirectional Encoder Representations from Transformers (BERT), which integrates high-frequency subsequence embedding and transfer learning to complete the DTI prediction task. As the first key module, subsequence embedding allows us to explore the functional interaction units in drug and protein sequences, which in turn contributes to finding DTI modules. As the second key module, transfer learning helps the model learn common DTI features from protein and drug sequences in a large dataset. Overall, the BERT-based model can learn two kinds of features through the multi-head self-attention mechanism: internal features of each sequence, and interaction features between proteins and drugs. Compared with other methods, BERT-based methods enable more DTI-related features to be discovered from the general features of proteins and drugs through transfer learning. We conducted extensive experiments on the DTI prediction task on three different benchmark datasets. The experimental results show that the model achieves average prediction metrics higher than most baseline methods. To verify the importance of transfer learning, we conducted an ablation study on the datasets, and the results show the superiority of transfer learning. In addition, we tested the scalability of the model on datasets with unseen drugs and proteins, and the experimental results show that its scalability is acceptable.
## How to use ##
Use the following command to pretrain the model:
```shell
sh pretrain.sh
```
Use the following command to fine-tune the model:
```shell
sh fine_tune.sh
```
Use the following command to predict the binding affinity score:
```shell
sh test.sh
```
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"max_len": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 9,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "00ef6494-43ce-43de-b91c-a8039d19fdcb",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "84cdc51a-91e9-4af5-8404-5ec2b5059044",
"metadata": {},
"outputs": [],
"source": [
"# 读取分子token字典\n",
"with open('subword_units_map_chembl.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" chembl_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" chembl_token_dict[token] = frequency\n",
" # print(token, frequency)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "82e79dba-ec91-463e-821e-778197f240d7",
"metadata": {},
"outputs": [],
"source": [
"# 读取蛋白质token字典\n",
"with open('subword_units_map_uniprot.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" uniprot_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" uniprot_token_dict[token] = frequency\n",
" # print(token, frequency) "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "5ed22d0d-87e5-47d6-9f1f-914f904150c2",
"metadata": {},
"outputs": [],
"source": [
"#创建一个special token字典\n",
"special_token_dict = {\n",
" '[PAD]': 1,\n",
" '[MASK]': 1,\n",
" '[CLS]': 1,\n",
" '[SEP]': 1,\n",
" '[UNK]': 1,\n",
" '[unused1]': 1,\n",
" '[unused2]': 1,\n",
" '[unused3]': 1,\n",
" '[unused4]': 1,\n",
" '[unused5]': 1\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "8e0472c9-1cdb-4fd5-8483-72acc0308e58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L\n",
"V\n",
"S\n",
"I\n",
"T\n",
"A\n",
"R\n",
"M\n",
"P\n",
"H\n",
"Z\n",
"F\n",
"K\n",
"O\n",
"B\n",
"C\n",
"X\n",
"N\n",
"SS\n",
"NN\n",
"CS\n",
"FC\n",
"CN\n",
"NC\n",
"CC\n",
"CCS\n",
"CCN\n"
]
}
],
"source": [
"token_frequency = {}\n",
"for token, frequency in chembl_token_dict.items():\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in uniprot_token_dict.items():\n",
" if token in token_frequency:\n",
" print(token)\n",
" token_frequency[token] += frequency\n",
" else:\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in special_token_dict.items():\n",
" token_frequency[token] = frequency"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a6e2d04c-900b-4446-844a-4cd9e8d1cfa2",
"metadata": {},
"outputs": [],
"source": [
"#存储到pickle中\n",
"with open('token_frequency.pickle', 'wb') as f:\n",
" pickle.dump(token_frequency, f)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "64bc73cc-aae2-4741-a98b-aae147c705fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"字典中的item数量为: 40208\n"
]
}
],
"source": [
"# 从pickle文件中读取字典\n",
"with open('token_frequency.pickle', 'rb') as f:\n",
" token_frequency = pickle.load(f)\n",
"\n",
"# 获取字典中的item数量\n",
"num_items = len(token_frequency.items())\n",
"\n",
"# 输出item数量\n",
"print(\"字典中的item数量为:\", num_items)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "33d4a5e1-4a03-41ae-a0ee-f60059d571c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_frequency[')N7']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b55735c-e495-4d7b-bf9f-600e665bf8db",
"metadata": {},
"outputs": [],
"source": [
"###注意:蛋白质token有16,693个,分子token有23,532个,special token有10个,共计40,235个\n",
"###创建的pickle文件中有分子和蛋白质交叉的字符,所以合并后有40208个\n",
"#L\n",
"# V\n",
"# S\n",
"# I\n",
"# T\n",
"# A\n",
"# R\n",
"# M\n",
"# P\n",
"# H\n",
"# Z\n",
"# F\n",
"# K\n",
"# O\n",
"# B\n",
"# C\n",
"# X\n",
"# N\n",
"# SS\n",
"# NN\n",
"# CS\n",
"# FC\n",
"# CN\n",
"# NC\n",
"# CC\n",
"# CCS\n",
"# CCN"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import csv
import pickle
# In[24]:
# Read the molecule (drug) token dictionary from the ChEMBL subword-unit map.
# Column layout assumed: index 2 = token, index 3 = frequency -- confirm
# against the CSV header if the file format changes.
# (Removed dead assignment `index = token`, which was never used.)
with open('subword_units_map_chembl.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    chembl_token_dict = {}
    for row in reader:
        token = row[2]
        frequency = int(row[3])
        chembl_token_dict[token] = frequency
# In[25]:
# Read the protein token dictionary from the UniProt subword-unit map.
# Same column layout as the ChEMBL file: index 2 = token, index 3 = frequency.
# (Removed dead assignment `index = token`, which was never used.)
with open('subword_units_map_uniprot.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    uniprot_token_dict = {}
    for row in reader:
        token = row[2]
        frequency = int(row[3])
        uniprot_token_dict[token] = frequency
# In[26]:
# Special tokens required by the BERT-style vocabulary; each one gets a
# placeholder frequency of 1.
_special_tokens = (
    '[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]',
    '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]',
)
special_token_dict = dict.fromkeys(_special_tokens, 1)
# In[28]:
# Merge the molecule, protein and special token tables into one frequency
# dictionary.  Tokens present in both the ChEMBL and UniProt vocabularies
# have their frequencies summed; each overlap is printed for inspection.
token_frequency = dict(chembl_token_dict)  # idiomatic copy replaces the manual loop

for token, frequency in uniprot_token_dict.items():
    if token in token_frequency:
        print(token)
        token_frequency[token] += frequency
    else:
        token_frequency[token] = frequency

# Special tokens are assigned last (frequency 1 each), matching the original
# per-item assignment loop.
token_frequency.update(special_token_dict)
# In[20]:
# Persist the merged token-frequency table to a pickle file for later use.
with open('token_frequency.pickle', 'wb') as f:
    pickle.dump(token_frequency, f)
# In[21]:
# Reload the dictionary from the pickle file as a round-trip sanity check.
with open('token_frequency.pickle', 'rb') as f:
    token_frequency = pickle.load(f)
# Count the number of (token, frequency) items.
num_items = len(token_frequency.items())
# Report the item count (the printed message is intentionally in Chinese).
print("字典中的item数量为:", num_items)
# In[23]:
# Notebook artifact: this bare expression displayed the frequency of the
# token ")N7" in Jupyter; as a script it evaluates and discards the value.
token_frequency[')N7']
# In[ ]:
### NOTE: there are 16,693 protein tokens, 23,532 molecule tokens and 10
### special tokens, 40,235 in total.
### The pickle ends up with 40,208 entries because the following 27 tokens
### occur in both the molecule and the protein vocabularies:
#L
# V
# S
# I
# T
# A
# R
# M
# P
# H
# Z
# F
# K
# O
# B
# C
# X
# N
# SS
# NN
# CS
# FC
# CN
# NC
# CC
# CCS
# CCN
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """
from transformers.configuration_utils import PretrainedConfig
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
"bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
"bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
"bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
"bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
"bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
"bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
"bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
"bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
"bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
"bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
"bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
"bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
"cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
"cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
"wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
"BertAffinity": "./config/config.json"
# See all BERT models at https://huggingface.co/models?filter=bert
}
class BertConfig(PretrainedConfig):
    r"""Configuration for :class:`~transformers.BertModel` /
    :class:`~transformers.TFBertModel`.

    Instantiating with the defaults yields a configuration similar to the
    BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__
    architecture.  Configuration objects inherit from
    :class:`~transformers.PretrainedConfig`, which documents the common
    options and is used to control the model outputs.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Size of the token vocabulary accepted via :obj:`inputs_ids`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads per attention layer.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (feed-forward) layer.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            Non-linear activation in encoder and pooler; as a string one of
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu_new"`.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout probability for all fully connected layers in the
            embeddings, encoder and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            Maximum sequence length the model might ever see; typically set
            large just in case (e.g. 512, 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            Vocabulary size of :obj:`token_type_ids`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            Stddev of the truncated-normal initializer for weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            Epsilon used by the layer-normalization layers.
        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, trade a slower backward pass for lower memory use.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            One of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`; see `Shaw et al.
            <https://arxiv.org/abs/1803.02155>`__ and `Huang et al.
            <https://arxiv.org/abs/2009.13658>`__ for the relative variants.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to return the last key/value attentions; only relevant
            when ``config.is_decoder=True``.

    Examples::

        >>> from transformers import BertModel, BertConfig
        >>> configuration = BertConfig()   # bert-base-uncased style defaults
        >>> model = BertModel(configuration)
        >>> configuration = model.config
    """
    model_type = "bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        # Vocabulary and embedding table sizes.
        self.vocab_size = vocab_size
        self.type_vocab_size = type_vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.position_embedding_type = position_embedding_type
        # Transformer geometry.
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        # Regularisation and numerics.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        # Runtime behaviour.
        self.gradient_checkpointing = gradient_checkpointing
        self.use_cache = use_cache
import numpy as np
import re
def eval_result(pred, label):
    """Compute RMSE and the Pearson correlation matrix of two sequences.

    Args:
        pred: sequence of predicted affinity values.
        label: sequence of ground-truth affinity values (same length).

    Returns:
        Tuple ``(rmse, pearson_co)`` where ``rmse`` is a float and
        ``pearson_co`` is the 2x2 matrix from ``np.corrcoef``; the
        correlation coefficient itself is at ``pearson_co[0, 1]``.
    """
    pred = np.asarray(pred, dtype=float)
    label = np.asarray(label, dtype=float)
    # np.mean replaces the manual sum/len bookkeeping; ** replaces np.power.
    rmse = np.sqrt(np.mean((pred - label) ** 2))
    pearson_co = np.corrcoef(pred, label)
    return rmse, pearson_co
def eval(pred_path, label_path):
    """Evaluate predictions in *pred_path* against labels in *label_path*.

    Both files contain one float per line.  Writes the metrics to an
    ``eval_results`` file in the prediction file's directory and echoes
    them to stdout.

    NOTE(review): the name shadows the ``eval`` builtin; kept unchanged so
    existing callers keep working.
    """
    # Iterating the file object directly replaces readlines() + list comp.
    with open(pred_path, 'r') as f:
        pred = [float(line.strip()) for line in f]
    with open(label_path, 'r') as f:
        label = [float(line.strip()) for line in f]
    rmse, r_mat = eval_result(pred, label)  # fixed local-name typo "remse"
    r = r_mat[0, 1]
    # Save next to the prediction file by swapping the file name component.
    file = pred_path.split("/")[-1]
    save_path = pred_path.replace(file, 'eval_results')
    # Build the message once instead of duplicating the format string.
    message = 'RMSE : {} ; Pearson Correlation Coefficient : {}'.format(rmse, r)
    with open(save_path, 'w') as f:
        f.write(message)
    print(message)
if __name__ == '__main__':
    # Ground-truth binding-affinity (IC50) label files: the generic test set
    # plus the four protein-family test sets.
    test_label_path = './data/test/test_ic50'
    test_label_path_ER = './data/ER/ER_ic50'
    test_label_path_GPCR = './data/GPCR/GPCR_ic50'
    test_label_path_Ion_channel = './data/Ion_channel/channel_ic50'
    test_label_path_Tyrosine_kinase = './data/Tyrosine_kinase/kinase_ic50'

    # Prediction files for the frequency-embedding run
    # (pre-train-layer-6-1021-freq).  Swap these five paths to evaluate a
    # different model run; removed the stale commented-out path sets for the
    # earlier runs and the DeepDTA / AttentionDTA baselines.
    pred_test = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_mol.txt"
    er = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_er.txt"
    gpcr = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_gpcr.txt"
    channel = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_channel.txt"
    kinase = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_kinase.txt"

    pred_list = [pred_test, er, gpcr, channel, kinase]
    label_list = [test_label_path, test_label_path_ER, test_label_path_GPCR,
                  test_label_path_Ion_channel, test_label_path_Tyrosine_kinase]
    # Evaluate every (prediction, label) pair and report RMSE / Pearson r.
    for i, j in zip(pred_list, label_list):
        print(i)
        eval(i, j)
# Fine-tune the interaction model from a pre-trained checkpoint on GPU 1.
# Fixed single-dash "-batch_size=4" -> "--batch_size=4" (argparse long
# options require the double-dash prefix, as used everywhere else here).
# NOTE(review): savedir says "batch-64" but batch_size is 4 -- confirm which
# is intended.
CUDA_VISIBLE_DEVICES=1 python run_interaction.py \
--batch_size=4 --task=train_mol --epochs=30 --lr=1e-5 \
--savedir=lr-1e-5-batch-64-e-30-layer6-1125-new \
--config=./config/config_layer_6_mol.json \
--output='./predict/test_new' \
--pre_train=True \
--init='./saved_model/train/epoch-23-step-790752-loss-0.12734022736549377.pth'
from yaml import load
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets
import os
# Restrict this script to GPU 5 by limiting CUDA device visibility.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
def load_embedding(data_file):
    """Load per-sample subsequence embeddings from *data_file*.

    Feeds each sample through the model's embedding layer and splits the
    embedded sequence at the [SEP] token into a drug part and a protein part.

    Returns:
        Tuple ``(all_drug, all_protein)`` -- two lists with one numpy array
        per sample (drug and protein subsequence embeddings respectively).
    """
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    sep_id = 3  # assumed vocabulary id of [SEP] -- TODO confirm against vocab_mol.txt
    dataset = Data_Gen(data_file)
    # batch_size=1 so every iteration yields exactly one sample.
    data_generator = DataLoader(dataset, batch_size=1, shuffle=False)
    config = BertConfig.from_pretrained('./config/config_layer_6_mol.json')
    model = BertAffinityModel(config)
    # Hard-coded checkpoint of the fine-tuned affinity model.
    model.load_state_dict(torch.load('./model/add_pretrain_1019/epoch-9-step-329480-loss-0.736057146887367.pth'), strict=True)
    all_drug = []
    all_protein = []
    for i, (input, affinity) in enumerate(data_generator):
        # input = input[1:]
        input_ids, attention_mask = tokenizer.convert_token_to_ids(input)
        input_embs = model.embeddings(input_ids)
        # Locate [SEP]; the last position is excluded so only the separator
        # between drug and protein is matched.
        sep_index = torch.where(input_ids[:, :-1] == sep_id)[-1]
        # Sequence layout presumably [CLS] drug... [SEP] protein... [SEP]
        # -- TODO confirm against the tokenizer.
        drug_emb = input_embs[:, 1:sep_index].squeeze(0).detach().numpy()
        protein_embs = input_embs[:, sep_index+1:-1].squeeze(0).detach().numpy()
        all_drug.append(drug_emb)
        all_protein.append(protein_embs)
    return all_drug, all_protein
def plot_drug_protein(save):
    """t-SNE scatter comparing drug vs. protein subsequence embeddings.

    Saves the figure to *save*: drugs as teal triangles, proteins as blue
    squares.
    """
    drug_embs, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    drug_matrix = np.concatenate(drug_embs)
    # Truncate the protein points to the same count as the drug points.
    protein_matrix = np.concatenate(protein_embs)[:len(drug_matrix)]
    combined = np.concatenate((drug_matrix, protein_matrix))
    group_ids = np.array([0] * len(drug_matrix) + [1] * len(protein_matrix))
    # Project to 2-D with t-SNE (fixed seed for reproducibility).
    projected = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(combined)
    # Plot without axes; high DPI for publication quality.
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    is_drug = group_ids == 0
    is_protein = group_ids == 1
    plt.scatter(projected[is_drug, 0], projected[is_drug, 1], c="darkcyan", s=5, label="Drug", marker='^')
    plt.scatter(projected[is_protein, 0], projected[is_protein, 1], c="deepskyblue", s=5, label="Protein", marker="s")
    plt.legend(labels=["Drug", "Protein"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_protein_sub(save):
    """t-SNE scatter of subsequence embeddings for three sample proteins
    (PTPH1, mGluRs, EZH2), saved to *save*."""
    _, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    # First three proteins from the sample file: PTPH1, mGluRs, EZH2.
    groups = protein_embs[:3]
    stacked = np.concatenate(groups)
    group_ids = np.concatenate([np.full(len(g), k) for k, g in enumerate(groups)])
    # 2-D projection with a fixed seed for reproducibility.
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(stacked)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[group_ids == 0, 0], embedded[group_ids == 0, 1], c="darkcyan", s=5, label="PTPH1", marker='^')
    plt.scatter(embedded[group_ids == 1, 0], embedded[group_ids == 1, 1], c="deepskyblue", s=5, label="mGluRs", marker="s")
    plt.scatter(embedded[group_ids == 2, 0], embedded[group_ids == 2, 1], c="salmon", s=5, label="EZH2")
    plt.legend(labels=["PTPH1", "mGluRs", "EZH2"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_drug_sub(save):
    """t-SNE scatter of subsequence embeddings for three sample drugs,
    saved to *save*."""
    drug_embs, _ = load_embedding("add_figure/sample_data/test_sample")
    # First three drugs from the sample file.
    groups = drug_embs[:3]
    stacked = np.concatenate(groups)
    group_ids = np.concatenate([np.full(len(g), k) for k, g in enumerate(groups)])
    # 2-D projection with a fixed seed for reproducibility.
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(stacked)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[group_ids == 0, 0], embedded[group_ids == 0, 1], c="darkcyan", s=5, label="Drug_1", marker='^')
    plt.scatter(embedded[group_ids == 1, 0], embedded[group_ids == 1, 1], c="deepskyblue", s=5, label="Drug_2", marker="s")
    plt.scatter(embedded[group_ids == 2, 0], embedded[group_ids == 2, 1], c="salmon", s=5, label="Drug_3")
    plt.legend(labels=["Drug_1", "Drug_2", "Drug_3"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
if __name__ == '__main__':
    # Generate the drug-vs-protein t-SNE figure; the per-drug / per-protein
    # variants can be re-enabled below.
    plot_drug_protein("drug_and_protein_sub")
    # plot_drug_sub("three_drug_sub")
    # plot_protein_sub("three_protein_sub")
# Pre-train the prediction model on GPU 4.
# The GPU selection must be on the same line as the command (or exported):
# a bare `CUDA_VISIBLE_DEVICES=4` line only sets a shell-local variable
# that the python process never sees.
CUDA_VISIBLE_DEVICES=4 python run_prediction.py \
--batch_size=56 \
--task=train_mol \
--epochs=100 \
--lr=1e-5 \
--savedir=pre-train-yzh \
--config=./config/config_layer_6_mol.json
import re, collections
def get_stats(vocab):
pairs = collections.defaultdict(int)
for word, freq in vocab.items():
symbols = word.split()
for i in range(len(symbols)-1):
pairs[symbols[i],symbols[i+1]] += freq
return pairs
def merge_vocab(pair, v_in):
    """Return a copy of *v_in* with every standalone occurrence of *pair* merged.

    The two symbols of *pair* (separated by a space in the vocab keys) are
    fused into a single symbol; frequencies are carried over unchanged.
    """
    merged_symbol = ''.join(pair)
    # Lookaround anchors ensure the pair is bounded by whitespace or
    # string edges, so partial symbol matches are not merged.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(merged_symbol, word): freq for word, freq in v_in.items()}
# Toy corpus for the BPE demo: each key is a space-separated symbol
# sequence ending in the end-of-word marker </w>; values are frequencies.
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
num_merges = 1000
for _ in range(num_merges):
    pair_counts = get_stats(vocab)
    if not pair_counts:
        # Every word has collapsed into a single symbol — nothing left to merge.
        break
    best_pair = max(pair_counts, key=pair_counts.get)
    vocab = merge_vocab(best_pair, vocab)
    print(best_pair)
# Expected printed output:
# ('e', 's')
# ('es', 't')
# ('est', '</w>')
# ('l', 'o')
# ('lo', 'w')
# ('n', 'e')
# ('ne', 'w')
# ('new', 'est</w>')
# ('low', '</w>')
# ('w', 'i')
# ('wi', 'd')
# ('wid', 'est</w>')
# ('low', 'e')
# ('lowe', 'r')
# ('lower', '</w>')
\ No newline at end of file
# Run molecular test-set inference from a trained checkpoint on GPU 1.
# BUG FIX: "-batch_size=64" used a single leading dash; the sibling
# run_prediction.py invocations all pass "--batch_size", so the single-dash
# form would be rejected as an unrecognized argument.
CUDA_VISIBLE_DEVICES=1 python run_prediction.py \
    --task=test_mol \
    --batch_size=64 \
    --output=./predict/test \
    --config=./config/config_layer_6_mol.json \
    --init=/notebook/our_model/model/pre-train-layer-6-1021/epoch-29-step-494220-loss-0.23760947585105896.pth
\ No newline at end of file
# Evaluate the masked-LM pre-training checkpoint on GPU 1.
# BUG FIX: a commented-out "--output=..." line sat in the middle of the
# backslash-continued command. The trailing "\" on the line above spliced
# the comment into the command line, the comment then swallowed its own
# trailing "\", and "--init=..." was executed as a standalone (failing)
# command while python ran without --init. The disabled option now lives
# after the command where a comment is harmless.
CUDA_VISIBLE_DEVICES=1 python run_pretraining.py \
    --batch-size=16 \
    --task=test-pre-train \
    --config=./config/config_layer_6_mol.json \
    --init='./model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073.pth'
# --output='model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073'
\ No newline at end of file
# Fine-tune the drug-target interaction model on the BioSNAP task,
# initialized from the 100-epoch pre-trained checkpoint, on GPU 0.
# NOTE(review): "--pre_train=False" reaches argparse as the non-empty string
# "False", which is truthy under a plain bool() type converter — confirm
# run_interaction.py parses this flag explicitly.
CUDA_VISIBLE_DEVICES=0 \
python run_interaction.py \
--epochs=50 \
--lr=1e-5 \
--task=train_biosnap \
--batch_size=4 \
--config=./config/config_layer_6_mol.json \
--pre_train=False \
--init=/notebook/our_model/model/pre-train-new-100epochs-config_layer_6_mol/epoch-99-step-3294800-loss-0.0736498162150383.pth
\ No newline at end of file
6.339674062480976
1.4751794034241978
from subword_nmt.apply_bpe import BPE
import codecs
import collections

# Module-level BPE tokenizers: drug SMILES strings use ChEMBL-derived merge
# codes, protein sequences use UniProt-derived codes. merges=-1 applies every
# rule in the codes file; separator='' joins sub-words without a marker.
# NOTE(review): the codecs.open handles are never closed — presumably BPE()
# consumes them eagerly at construction time, but confirm.
bpe_codes_drug = codecs.open('../config/drug_codes_chembl.txt')
dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
bpe_codes_prot = codecs.open('../config/protein_codes_uniprot.txt')
pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
def load_file(file):
    """Read *file* and return its lines with surrounding newlines stripped."""
    with open(file, 'r') as handle:
        # strip('\n') mirrors the original behavior: only newline characters
        # are removed, other whitespace (including '\r') is preserved.
        return [line.strip('\n') for line in handle.readlines()]
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary.

    Each line of *vocab_file* is one token; the returned OrderedDict maps
    token -> zero-based line index.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, raw_token in enumerate(reader.readlines()):
            vocab[raw_token.rstrip("\n")] = index
    return vocab
def seq2vec(protein, drug):
    """BPE-tokenize paired protein/drug files and print each protein's token count.

    protein / drug: paths to line-aligned files of protein sequences and
    SMILES strings; pairs are formed line-by-line.
    """
    start_token = '[CLS]'
    sep_token = '[SEP]'
    prots = load_file(protein)
    drugs = load_file(drug)
    for prot_seq, smile in zip(prots, drugs):
        drug_tokens = dbpe.process_line(smile).split()
        prot_tokens = pbpe.process_line(prot_seq).split()
        # BERT-style layout: [CLS] drug [SEP] protein [SEP].
        # NOTE(review): tokens is built but otherwise unused here.
        tokens = [start_token] + drug_tokens + [sep_token] + prot_tokens + [sep_token]
        print(len(prot_tokens))
if __name__ == '__main__':
    # NOTE(review): the protein path points at the *test* split while the
    # SMILES path points at the *train* split — confirm this pairing is
    # intentional; "simle" is presumably a typo for "smile".
    seq = '../data/test/test_protein_seq'
    simle = '../data/train/train_smile'
    # vocab path is defined but unused in this entry point.
    vocab = '../config/vocab_mol.txt'
    seq2vec(seq, simle)
\ No newline at end of file
import pandas as pd
import numpy as np

# Build the combined vocabulary file: special tokens first, then the
# UniProt protein sub-words, then the ChEMBL drug sub-words, one per line.
drug_subwords = pd.read_csv('../config/subword_units_map_chembl.csv')['index'].values
protein_subwords = pd.read_csv('../config/subword_units_map_uniprot.csv')['index'].values
special_tokens = np.array(['[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]'])
all_tokens = np.concatenate((special_tokens, protein_subwords, drug_subwords))
save = '../config/vocab_mol.txt'
with open(save, 'w') as f:
    for token in all_tokens:
        f.write(str(token) + '\n')
import numpy as np
from tqdm import tqdm
def z_score(data, save, enlarge):
    """Z-score-normalize affinity values and write the scaled result.

    data: path to a file with one numeric affinity per line.
    save: output path; one normalized value per line.
    enlarge: scale factor applied after standardization.
    """
    with open(data, 'r') as f:
        affinities = np.array([np.float64(line.strip()) for line in f.readlines()])
    # Standardize to zero mean / unit variance, then scale.
    normalized = (affinities - np.mean(affinities)) / np.std(affinities) * enlarge
    with open(save, 'w') as f:
        for aff in tqdm(list(normalized)):
            f.write(str(aff) + '\n')
def reform(input_file_path, result_save_path, average, std, enlarge):
    """Invert z-score scaling: map predictions back to the original affinity scale.

    input_file_path: file of scaled predictions, one per line.
    result_save_path: output file of de-normalized values, one per line.
    average / std: statistics used for the forward z-score transform.
    enlarge: scale factor that was applied after standardization.
    """
    with open(input_file_path, 'r') as f:
        predictions = f.readlines()
    with open(result_save_path, 'w') as out:
        for line in tqdm(predictions):
            # Undo the forward transform: value = ((x - avg) / std) * enlarge.
            original = ((float(line.strip()) / enlarge) * std) + average
            out.write(str(original) + '\n')
if __name__ == '__main__':
    # Dataset-wide affinity statistics used for (de)normalization.
    # NOTE(review): these constants must match the split the z-scored data
    # was generated from — confirm against the data pipeline.
    average = 6.339674062480976
    std = 1.4751794034241978
    # Generate z-score dataset:
    # data = '../data/train_ic50'
    # save = '../data/train_z_1_ic50'
    # enlarge = 1
    # z_score(data, save, enlarge)
    # Reform result: map model predictions back to the original affinity scale.
    result = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test_1.txt'
    save = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test.txt'
    reform(result, save, average, std, 1)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论