提交 074f30dc 作者: 杨志辉

Initial commit

上级
model/
model0/
tfrecord/
train/
logs/
log/
visualize_attention/attention_mat.npy
baselines/
case_study/
experment_result/
predict/
doc/
utils/train_data_analyse.csv
utils/test_data_analyse.csv
utils/data_analyse.xlsx
*.ipynb_checkpoints/
.idea/
.DS_Store
data/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
{
// 使用 IntelliSense 了解相关属性。
// 悬停以查看现有属性的描述。
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: 当前文件",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"cwd": "${fileDirname}"
}
]
}
\ No newline at end of file
## Title ##
Advancing Drug-Target Interaction Prediction with BERT and Subsequence Embedding
## Abstract ##
Exploring the relationship between proteins and drugs plays a significant role in discovering new synthetic drugs. Drug-Target Interaction (DTI) prediction is a fundamental task in studying the relationship between proteins and drugs. Unlike encoding proteins by individual amino acids, we use amino-acid subsequences to encode proteins, which better simulates the biological process of DTI. For this purpose, we propose a novel deep learning framework based on Bidirectional Encoder Representations from Transformers (BERT), which integrates high-frequency subsequence embedding and transfer learning to perform the DTI prediction task. As the first key module, subsequence embedding makes it possible to explore the functional interaction units in drug and protein sequences, which in turn helps to identify DTI modules. As the second key module, transfer learning helps the model learn common DTI features from protein and drug sequences in a large dataset. Overall, the BERT-based model can learn two kinds of features through the multi-head self-attention mechanism: internal features of individual sequences, and interaction features between proteins and drugs. Compared with other methods, BERT-based methods enable more DTI-related features to be discovered from the general features of proteins and drugs through transfer learning. We conducted extensive experiments on the DTI prediction task using three different benchmark datasets. The experimental results show that the model achieves average prediction metrics higher than those of most baseline methods. To verify the importance of transfer learning, we conducted an ablation study on the datasets, and the results demonstrate the superiority of transfer learning. In addition, we tested the scalability of the model on datasets with unseen drugs and proteins, and the experimental results show that its scalability is acceptable.
## How to use ##
Use the following command to pretrain the model:
```shell
sh pretrain.sh
```
Use the following command to fine-tune the model:
```shell
sh fine_tune.sh
```
Use the following command to predict the binding affinity score:
```shell
sh test.sh
```
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23614
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 3,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 595,
"max_len": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 40235
}
\ No newline at end of file
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 384,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 384,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 9,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 23615
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "00ef6494-43ce-43de-b91c-a8039d19fdcb",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "84cdc51a-91e9-4af5-8404-5ec2b5059044",
"metadata": {},
"outputs": [],
"source": [
"# 读取分子token字典\n",
"with open('subword_units_map_chembl.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" chembl_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" chembl_token_dict[token] = frequency\n",
" # print(token, frequency)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "82e79dba-ec91-463e-821e-778197f240d7",
"metadata": {},
"outputs": [],
"source": [
"# 读取蛋白质token字典\n",
"with open('subword_units_map_uniprot.csv', 'r', encoding='utf-8') as f:\n",
" reader = csv.reader(f)\n",
" next(reader) # 跳过标题行\n",
" uniprot_token_dict = {}\n",
" for row in reader:\n",
" token = row[2]\n",
" index = token\n",
" frequency = int(row[3])\n",
" uniprot_token_dict[token] = frequency\n",
" # print(token, frequency) "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "5ed22d0d-87e5-47d6-9f1f-914f904150c2",
"metadata": {},
"outputs": [],
"source": [
"#创建一个special token字典\n",
"special_token_dict = {\n",
" '[PAD]': 1,\n",
" '[MASK]': 1,\n",
" '[CLS]': 1,\n",
" '[SEP]': 1,\n",
" '[UNK]': 1,\n",
" '[unused1]': 1,\n",
" '[unused2]': 1,\n",
" '[unused3]': 1,\n",
" '[unused4]': 1,\n",
" '[unused5]': 1\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "8e0472c9-1cdb-4fd5-8483-72acc0308e58",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L\n",
"V\n",
"S\n",
"I\n",
"T\n",
"A\n",
"R\n",
"M\n",
"P\n",
"H\n",
"Z\n",
"F\n",
"K\n",
"O\n",
"B\n",
"C\n",
"X\n",
"N\n",
"SS\n",
"NN\n",
"CS\n",
"FC\n",
"CN\n",
"NC\n",
"CC\n",
"CCS\n",
"CCN\n"
]
}
],
"source": [
"token_frequency = {}\n",
"for token, frequency in chembl_token_dict.items():\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in uniprot_token_dict.items():\n",
" if token in token_frequency:\n",
" print(token)\n",
" token_frequency[token] += frequency\n",
" else:\n",
" token_frequency[token] = frequency\n",
" \n",
"for token, frequency in special_token_dict.items():\n",
" token_frequency[token] = frequency"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a6e2d04c-900b-4446-844a-4cd9e8d1cfa2",
"metadata": {},
"outputs": [],
"source": [
"#存储到pickle中\n",
"with open('token_frequency.pickle', 'wb') as f:\n",
" pickle.dump(token_frequency, f)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "64bc73cc-aae2-4741-a98b-aae147c705fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"字典中的item数量为: 40208\n"
]
}
],
"source": [
"# 从pickle文件中读取字典\n",
"with open('token_frequency.pickle', 'rb') as f:\n",
" token_frequency = pickle.load(f)\n",
"\n",
"# 获取字典中的item数量\n",
"num_items = len(token_frequency.items())\n",
"\n",
"# 输出item数量\n",
"print(\"字典中的item数量为:\", num_items)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "33d4a5e1-4a03-41ae-a0ee-f60059d571c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"token_frequency[')N7']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b55735c-e495-4d7b-bf9f-600e665bf8db",
"metadata": {},
"outputs": [],
"source": [
"###注意:蛋白质token有16,693个,分子token有23,532个,special token有10个,共计40,235个\n",
"###创建的pickle文件中有分子和蛋白质交叉的字符,所以合并后有40208个\n",
"#L\n",
"# V\n",
"# S\n",
"# I\n",
"# T\n",
"# A\n",
"# R\n",
"# M\n",
"# P\n",
"# H\n",
"# Z\n",
"# F\n",
"# K\n",
"# O\n",
"# B\n",
"# C\n",
"# X\n",
"# N\n",
"# SS\n",
"# NN\n",
"# CS\n",
"# FC\n",
"# CN\n",
"# NC\n",
"# CC\n",
"# CCS\n",
"# CCN"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
#!/usr/bin/env python
# coding: utf-8
"""Build a merged subword-token frequency table.

Reads the molecule (ChEMBL) and protein (UniProt) subword-unit CSVs,
merges their token frequencies together with a small set of special
tokens, and saves the combined dictionary to ``token_frequency.pickle``.

Converted from a Jupyter notebook; the original ``# In[..]`` cell
markers are kept for traceability.

NOTE(review): per the trailing notes in the original notebook, there are
16,693 protein tokens, 23,532 molecule tokens and 10 special tokens
(40,235 in total); 27 tokens occur in both vocabularies, so the merged
dictionary holds 40,208 items.
"""

# In[1]:
import csv
import pickle


def _load_token_frequencies(path):
    """Return a {token: frequency} dict read from a subword-units CSV.

    The CSV is expected to have a header row, with the token string in
    column index 2 and its integer frequency in column index 3.
    """
    token_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            token_dict[row[2]] = int(row[3])
    return token_dict


# In[24]:
# Read the molecule (ChEMBL) token dictionary.
chembl_token_dict = _load_token_frequencies('subword_units_map_chembl.csv')

# In[25]:
# Read the protein (UniProt) token dictionary.
uniprot_token_dict = _load_token_frequencies('subword_units_map_uniprot.csv')

# In[26]:
# Special tokens, each assigned a nominal frequency of 1.
special_token_dict = {
    '[PAD]': 1,
    '[MASK]': 1,
    '[CLS]': 1,
    '[SEP]': 1,
    '[UNK]': 1,
    '[unused1]': 1,
    '[unused2]': 1,
    '[unused3]': 1,
    '[unused4]': 1,
    '[unused5]': 1
}

# In[28]:
# Merge the three dictionaries.  Tokens shared between the molecule and
# protein vocabularies have their frequencies summed; each shared token
# is printed for inspection (27 such tokens, e.g. L, V, S, ..., CC, CCS,
# CCN -- single amino-acid letters and short SMILES fragments collide).
token_frequency = dict(chembl_token_dict)

for token, frequency in uniprot_token_dict.items():
    if token in token_frequency:
        print(token)  # token present in both vocabularies
        token_frequency[token] += frequency
    else:
        token_frequency[token] = frequency

token_frequency.update(special_token_dict)

# In[20]:
# Persist the merged table.
with open('token_frequency.pickle', 'wb') as f:
    pickle.dump(token_frequency, f)

# In[21]:
# Reload the pickle and report the item count as a sanity check.
with open('token_frequency.pickle', 'rb') as f:
    token_frequency = pickle.load(f)

num_items = len(token_frequency)
print("字典中的item数量为:", num_items)

# In[23]:
# Spot-check one merged token (this displayed 100 in the original
# notebook); raises KeyError if the token is unexpectedly missing.
token_frequency[')N7']

# In[ ]:
This source diff could not be displayed because it is too large. You can view the blob instead.
#version: 0.2
L L
A A
A L
V L
G L
E L
S L
G G
S S
E E
T L
D L
R L
I L
A V
K L
A G
V V
A E
K K
S G
A I
P L
A R
A D
A S
Q L
T G
A K
V E
N L
F L
V I
V G
A T
K E
R R
V D
V S
P G
I E
P E
I G
I D
V T
R E
I S
A Q
D G
V K
D E
P S
Y L
R G
I T
A F
N G
K G
A P
V R
T T
I K
F G
S E
A N
V P
H L
I N
M L
I R
T S
T E
V N
Q Q
F S
D D
I P
F E
K S
Q E
Q G
D S
R S
Y G
T P
K R
K N
K D
V F
A Y
P P
K T
N N
A M
R D
V Q
N E
I I
F D
H G
Y E
N S
T D
P D
Q S
Y S
A H
M G
K P
I F
T R
Q R
M E
V Y
M S
K I
K Q
C L
N P
I Q
W L
E G
T F
V H
R P
Y D
N D
C G
R F
K F
V M
Q D
I Y
H S
R T
K Y
R Q
R N
Q P
T N
S D
R I
F F
R K
T I
H E
M D
T Q
A C
R Y
H P
A LL
D P
F P
S P
G E
T K
T Y
AA L
V C
F N
M P
I H
E S
A W
Q N
E D
I V
T V
LL L
F T
Q K
R H
Y P
G S
R V
E K
M N
C S
F Y
Q T
K V
D F
D I
N F
N I
K H
Q I
N Y
Q F
M T
D Y
N K
A GL
N T
D T
D R
M K
E R
Q Y
D K
P T
F R
W G
A VL
S T
N R
M R
F I
A EL
Q H
AA G
S R
C E
F K
D N
V W
S K
P R
Y R
LL G
T H
E N
A SL
AL G
S F
P F
Q V
A GG
S N
P N
S I
M I
Y Y
E I
A DL
H D
P K
E T
Y F
A RL
P I
H H
Y N
D V
P V
A IL
P Q
Y T
GG G
A TL
H R
S Q
E Q
H F
A EE
K M
A KL
A SS
I C
P Y
M Q
S Y
D Q
N Q
M F
S V
GL G
LL E
VL G
A SG
W E
H T
Y K
M V
C D
SL G
H I
V AA
A KK
E P
H Y
C P
W S
V LL
EL G
H Q
F Q
H N
I M
Y Q
AL E
R M
F V
G D
N V
T M
A TG
RL G
LL S
C R
A QL
AV G
Y I
H K
IL G
AA E
W D
A PL
T C
M M
SS S
E F
A FL
P H
EL E
VL E
EE E
TL G
AE G
AL S
VV G
LL D
T W
SS G
DL G
Q M
KL G
AD G
F H
W R
V AL
F C
SL S
A PG
VI G
D M
N H
A NL
AI G
C T
D H
E V
GL S
M Y
I W
R C
S H
R LL
K C
EE G
R W
AR G
VL S
AA S
N M
V AV
K W
VD G
E Y
VE G
K LL
A PE
AK G
T LL
V GG
V VL
C F
V AG
C N
D W
M H
V SL
Y H
K AL
K EL
R AL
K KL
GL P
A PS
GL D
A NG
PE G
SL E
W Q
KE G
V TL
AK E
V EL
FL G
GL E
P M
V AE
IL E
DL E
DL S
AV E
PL G
C Q
A YL
F M
ID G
W N
I LL
I AA
IL S
C Y
AR E
T AL
V EE
VL D
TG E
KK G
VV E
C K
C I
EL S
F W
T AA
RL E
RE G
V SG
AL D
AI E
RL S
GG S
V SS
R AA
TL S
A ML
V DL
TL E
LL P
RR G
NL G
VK G
QL G
AG E
KL E
AL R
IE G
V GL
K AA
SG S
SL P
W F
Q LL
ID E
AV S
V PL
W T
I AL
D LL
R EL
AQ G
AF G
AG S
A HL
W P
Q AL
V TG
V AD
R VL
V IL
KL S
K VL
W I
VV S
V KE
VD E
V AS
D AL
N C
I GG
V AI
K SL
K TL
D C
V AT
V AR
Y M
V KK
VL P
I AE
PL S
I EE
AD E
AS E
I AG
AA R
N W
C H
K IL
K EE
VG E
QL E
T SS
S M
VI E
SG E
C C
AA D
KK E
K RL
I SS
V RL
VS E
SL D
K AV
V RE
Q W
VV D
T GG
R RL
AA AA
TG S
N LL
V KL
VI S
T VL
PL P
K GL
YL G
V AK
K DL
EL D
NN NN
P LL
Q C
R GG
A YG
Y C
I DL
I VG
FL S
Q AA
V PG
T GL
P W
F LL
IS E
P C
R SL
V RR
N AL
K AI
FL E
NL S
IL D
I AR
I SL
Y W
I KE
QL S
VI D
H C
Q RL
P AA
T SL
AG D
H M
NL E
I AV
I GL
SE G
IG E
DG S
M AA
DE G
I SG
AT S
SS E
AI S
FE G
AL P
W Y
I VL
RG E
R AV
T AE
R GL
R IL
VT E
PL E
I IG
I AS
K VV
K AG
VG S
I VE
HL G
T EE
T TL
DL D
AV D
P PG
I KK
R KL
R VV
KG E
I AD
Q EL
LL N
AT E
K AK
V NG
R DL
F GG
K AE
R TL
QQ QQ
VG D
R AR
A HG
I TL
R AG
I EL
T AV
Q VL
T VE
RR E
DG E
I AK
D EE
DL P
H LL
VT S
GG E
V PE
PG S
T EL
K NL
IT S
V PS
AE S
AQ E
P EL
K SS
ML G
V RG
T VV
V NL
Q KL
D VL
Q TL
M AL
T AG
IT E
IG S
VD S
Q QL
T AT
T PE
I VS
I ID
T AS
V FL
I VV
I PL
AI D
PG E
V AQ
I IE
Q AV
W K
I IL
P SS
I AT
T PL
Q SL
TE G
P AL
I TG
VK S
N SL
T VT
K DG
IK S
K QL
I RE
AR S
FG S
K VI
VE S
KK S
IN E
AA P
R EE
N KL
R AE
SS SS
D EL
H W
K GG
M AK
PS P
I VD
D IL
V QL
K SG
IE S
I IS
T IL
R QL
I KL
NG E
QE G
TL P
T SG
N IL
F GL
AD S
AF S
R SG
AM G
N VL
M AE
K PL
RL P
V AN
K VE
T DL
RR S
AK S
T AI
VR S
F AA
I AI
F SL
T VS
T KL
EL P
N AA
N SS
D AA
PP P
V AP
F SS
K TT
Q IL
R AI
A WL
DE D
TL D
R SS
IE D
R PL
M LL
IK D
R DG
VK D
F SG
Y LL
R PG
VN E
T RL
M C
I RL
PS E
Q AE
AS D
V FG
W H
YE G
T PG
R TG
AS P
Q GL
F EE
YL E
VS D
K TG
V AF
ID S
YL S
R FL
Q AG
PE S
I VT
I RR
I NL
F AG
K AF
ML E
K IS
P VL
T AR
N TL
F AL
P EE
TT S
R VG
VD D
T AD
IR S
Q EE
K AR
T TG
M AR
P GG
M KK
R KK
PE D
I AQ
Q AI
KL D
K RR
M AG
K VG
K IE
T FL
N EL
K VT
I PG
M AT
T VG
Q VV
SE S
T KE
KL P
AN S
N AV
A CL
V FS
T IG
EE D
K VK
T IS
K IK
T VI
I IT
T IT
QQ G
VE D
M AV
P AG
R VI
N GL
VR D
K PG
R IG
AE D
IL P
K IT
EE S
NG S
KG S
RL D
M SL
VT D
GG D
RG S
K PE
P SL
R IE
W M
P GL
IN S
P SG
IS D
T KK
HL E
T AK
VL N
T IE
M AS
FG E
V QG
A CG
I FL
K AS
V FE
K VS
I VK
R VE
AN E
F DL
K IG
K AD
KE S
R NL
Q DL
AF E
T DG
F DE
AL N
I KG
N DL
F VL
Q AR
AK D
I VI
T PS
N RL
IR D
T VR
K NG
K RG
I IN
QG E
TG D
P DG
FL D
T SE
T QL
N AG
F VG
T VD
R AS
F AV
M W
P AV
N VV
P VV
ID D
T VK
R KE
V QE
SS D
V YL
P AP
K DE
I IK
Q PL
I PE
HL S
R AD
K AQ
DD D
K AT
K VD
R YL
I NG
M GL
Y GL
M VL
K TS
K SE
NE G
VN S
R AK
K RE
R VS
T NL
LL Q
N AI
F AE
H AL
I AN
M SS
M KL
IG D
T DE
IT D
F TL
K YL
R VK
M EE
Y AA
I FG
K ID
AD D
I PS
N PL
N KK
V YG
AQ S
H PL
I VR
VT P
H SL
K DS
R VR
NL D
K FL
F DG
N IS
NL P
AT D
I VN
M GG
K TE
R DE
N VG
T RE
TT E
N GG
I AP
N SG
M SG
F AR
AL Q
H GL
VV P
F VS
N EE
M SE
AA Q
I RG
AY E
R FG
SS P
R VT
R KR
ME G
F AD
T AP
T ID
DS D
F IL
R IT
DE S
Y SL
PG D
AR D
Y VL
F AS
F EL
DD S
P TG
Q VI
N PE
T VP
R IS
K YG
F SE
PL D
R KG
F KE
Y AL
ML S
Q SS
V AY
M AD
VF D
F VE
SG D
N VI
N IG
N AS
N VS
M VE
F ID
I IP
F VD
V QQ
F IG
ALL L
T AQ
T DS
F IS
P DL
M EL
M KE
T RR
LL H
K AN
R AQ
F TG
KK D
RE S
N QL
YL D
P AS
I QE
YG S
M AI
R AT
K IR
T KG
N IE
F VV
Y DL
AF D
I FS
K IN
K VR
TE S
T IK
Q AQ
V AH
R IR
W C
CL G
Y GG
I QL
WL G
N AE
N NL
DG D
F IE
Q KK
Q NL
H AA
M TL
N ID
YG E
VS P
V ML
AM E
AT P
H TL
T AF
W W
Q RR
H GG
I AF
EE EE
Q VE
Q KE
Q IE
N VE
F AK
T FG
R VD
K RS
Q GG
VQ S
K PS
R SE
F KK
FS D
H VD
M RL
I IR
F KL
Y RL
QG S
Y EE
Q AK
KG D
N AK
V AM
T IR
FE D
N KE
F PG
K AP
RG D
QL D
C W
Q AS
N VT
TS D
K II
FG D
VN D
T IN
QQ E
R PS
K TP
C LL
I VP
Q PG
R KS
AV P
Y AG
N FL
Y EL
M DL
Q SG
T RS
PP PP
K IP
FE S
N AT
H RL
T FS
Y SG
Q VG
Y SS
KN E
K VP
QL P
Y IL
N PD
F PE
T RG
F FL
H PE
N IK
V YE
V YS
N AR
V HL
KN S
R PE
N IT
AY S
M IE
KE D
R DS
T KS
R ID
N AD
N VD
P TL
I YG
H VL
F VT
Y VG
R VP
P VT
Y TL
F NL
F AI
K VN
IN D
H AV
F TS
F AT
N IP
M AQ
Q AT
F DD
M DE
RE D
Q AD
Q VS
K QE
R HL
F RL
ML D
M VV
M VG
EG E
Q IS
Y AE
IT P
F AP
K VF
F FG
T IP
F RR
R QG
Q FL
M IL
R AP
M TE
H IL
VI P
GG GG
T VN
V MG
I YL
NG D
ID P
K DD
R AF
M TT
H AG
T YL
HG E
Q IG
N SE
T AN
R PD
T KR
Q IK
H EL
AA AL
K QG
M KR
PS D
Q RE
C GG
F KG
P AE
R QR
HG S
R VF
R IK
AD Y
IS P
HL D
VD P
I AY
TS P
SE D
T NG
R DD
F VK
M RE
F TT
F IN
Q VT
N VK
M IG
T DD
N VP
M PL
Q AF
F VI
M AN
K FG
T KD
Y AS
N IN
M NL
F PL
Q VK
H SS
H PD
F DS
Q VR
R TE
T FD
Q YL
N TT
AP P
T QE
I VF
F VP
Y AR
Y FL
M VD
AL AA
R TR
N YL
I QS
VG P
AG P
TP D
AP D
AN P
Q KG
K HL
Y KL
F IT
N FS
K ML
F TE
F RE
F VR
W LL
T VQ
RR D
NN S
Q RG
T RD
P KG
F KD
R TS
M TG
P VI
K AY
Y KE
M TD
VP D
R QE
ALL G
M VS
Y AD
P VS
K IF
V CG
I AH
N PS
M RR
Y AV
R FS
H KL
R IP
I FE
N TG
I QG
Y DE
I VQ
TE D
I AM
R FE
Q PS
T VF
P IL
R NG
Q SE
Y VV
Q PE
F IK
T NS
F FS
M VK
F AN
LL LL
N KN
Y RR
K VY
H DL
I YE
N PG
R TP
Q DG
K QQ
N TS
QE S
AD P
H SG
T NE
T KT
V ME
R KT
Y TG
MG S
H VV
K QR
I QQ
N KS
FL P
C M
N KG
C SS
I FD
R YG
T YG
M TS
T QG
R NE
Y ID
MG E
F NE
M QL
H NL
Y VE
Y VS
AM S
R ML
P RG
N TP
AAL G
Q VD
CL S
Q IR
Y RE
M PG
T HG
Q TG
R PP
F RG
AH E
VY D
F KS
V HG
Y IE
I YS
R IN
F AQ
K FE
N RR
Q ID
N DE
Q VP
R KN
R KD
P QL
AN D
K PD
Y PL
T KN
Q RS
R II
N II
Y KK
Y AK
Y RG
T NP
CG S
IG P
AL H
F TP
C GL
F VN
IP D
M KS
R TT
Y VD
Y SE
M IK
P VG
K FS
R QQ
I QR
R FD
Y PG
F KN
C SL
M VT
K AM
I VH
YG D
Y AT
Y IS
M DG
F TD
P RL
F QL
Q KR
M VR
R VQ
NN E
P KL
N KD
N AQ
F RS
R HG
P AT
N DD
AA AV
P PL
M AF
M KN
H PG
M AP
F IR
T FE
H VE
T AY
N DG
Q IT
R AN
T HL
H AI
Y DG
CL E
H AE
F KR
F FD
R NS
P VE
M KD
ME S
P VD
N KT
KS D
T II
NE S
P AR
F RD
F PD
Y IG
F YL
K VQ
H QL
N VN
Q AP
H AD
RS D
N IR
YE S
I HL
Y YL
M KT
H VG
N AN
AA LL
P AI
M VI
Q TE
AH S
Y QL
T ML
KK P
R IF
Q KS
Q DE
F NS
Q ML
T QS
T IF
Q HL
M ID
C SG
Y AI
K QS
P FL
AA GL
K FD
R AM
QQ S
Q PP
WL E
P AD
MS D
AQ D
R AY
H TG
VQ D
Q DS
Q TT
F FE
Y VP
K YS
Q NG
N DS
N VR
Y KG
IQ D
M IS
Q TS
N AF
F VF
P RR
LLL G
N RE
N NG
P PS
I YD
H VI
N RS
AA AG
T YS
T KP
F PS
H RR
P TS
I ML
TT D
Y NL
N IF
C AL
QE D
Y FS
R KI
T AH
F YG
R KP
R NP
N TE
N KI
N AP
P DE
Y KD
H FG
M IT
H VS
V CL
R VN
H AR
Y VT
AY D
H EE
C AA
AL AL
C VV
Y RD
F KT
M TP
F NN
T YE
H ID
T NN
P NG
N HL
N FG
M DS
NS D
Q TP
M IR
V MS
VH E
K PP
H FL
T ND
P TP
T IQ
QG D
F NG
R KQ
H PS
Q KT
I VY
NE D
Q VF
P KK
KR D
M PS
MG D
K YE
R TD
Y FG
N FE
M FL
P VP
Y VI
F ND
F IP
M FG
M RS
NNNN NNNN
P TT
F AF
VH S
T AM
AE AL
YS D
WL S
M KG
V AC
Q IP
Q VQ
H IS
Y VK
R TF
Q IN
EG S
C VL
T QQ
Q FG
H TS
H IG
R YE
I HS
P NL
R KF
Q VN
AA AR
Q AN
Y KN
F QG
Q KD
M RG
H SE
Y DD
HG D
F QE
R QS
TT TT
R VY
H IE
H HL
R ND
M PE
N KR
AL AE
N RF
Y PD
Y TS
Y RS
C AV
Y QG
I CL
Y VR
F AY
Y PE
Y IT
Y IK
M DD
V WL
M KQ
M QE
T VY
AAL E
AA C
F II
AW G
Q KQ
Y AQ
Y IN
P AK
Y TE
P KE
T KI
N RG
Q NS
M TR
Y DS
AGL G
N TD
KT D
AV AA
N VF
V LLL
GG P
H AS
Y AP
K HG
H IT
R IQ
VM D
Q AM
F RN
H AP
R AH
H YL
YE D
H EG
M RD
M IN
P AQ
Q FE
P DS
H RS
N FD
F TR
V AAL
QS D
C DL
ALL E
N KP
H VT
P VK
P IG
F IF
M KI
H DG
R YD
Y FE
H RG
LL C
N TF
Q YG
AL C
Y TD
M NP
H AK
F KP
T IY
T KF
H KK
Y QE
Y YG
P RS
I HE
Q II
AE AA
H VP
Y TT
R MG
SL C
N YG
N YS
M TN
M VN
Q KI
Q NE
H DE
R IY
CG E
T VM
Y QQ
AM D
Q DD
K YD
H NG
Y VN
AGG G
I ME
T QD
T KQ
P QG
H AT
N IY
Q PD
Y IP
M NS
Y TP
R NN
AG LL
F VQ
AC E
H RD
P VR
M NE
P RE
P IS
Q FS
C RL
Y IR
H IP
VC S
M IP
F YE
Y KS
Q KN
F QS
F TF
C AG
Y NG
R WL
K AH
W AA
AL AG
N IQ
C SE
N QE
H VR
T QR
M PP
EL AA
P AF
C EE
R YS
Y NS
P TE
T MG
AA GG
F YS
N QS
F KQ
A A</w>
M AH
Q IQ
H KE
N RD
I HG
ME D
Y PS
P PE
LL W
P IT
M QQ
P KS
P VF
M QR
LL AL
Q HG
M NN
EL AK
Q RD
N PP
C AS
KN D
F QQ
DS DS
WL D
H QR
M VF
Q TR
Q NN
Q IF
I MG
Q KP
M FS
F NP
HS D
W SL
N KF
AIL G
Y AN
T YD
F QR
C KE
N QQ
M AM
W AL
M AY
AR AL
AVL G
I VM
Y AF
T KY
KK KK
W GG
M VQ
F HL
N RN
M VP
H FS
K MG
C VG
LL GL
C EL
P IE
Q AY
Y NE
N RT
C TG
T VH
F TN
R VM
H IK
N ML
AA AE
R VH
P DD
W GL
H KG
EL AE
K VM
SG P
K MS
H VF
N VQ
W KE
N TN
T QP
VL AL
Q FD
W DL
VC E
F IQ
N AY
M NG
M FE
M YL
N TR
AG AG
EG D
W KK
AL RE
H RE
P NS
F HP
AC S
F PP
AE EL
SL SS
H TE
AE LL
H IR
KL KE
C VS
I CG
P KD
W VL
N RP
P YL
F QN
C IL
W RE
P AN
Y YD
CL D
K VH
H TT
F HG
AK EL
SL EE
C TL
Y KT
V AAG
R KY
V ALL
VL AA
T HS
P RP
C KK
LL SL
AL GL
AS AS
W RL
K ME
AE RL
H VK
H TD
Q TD
M RQ
LL EE
T MS
Y FD
N QG
H QQ
I HP
P FS
AG AA
C PG
Y NN
Q MS
N RI
PP D
N YE
M KF
HE S
H AF
N AM
AL AR
TL AA
Q NP
AI AA
F TQ
F ML
Y TR
H VN
F YD
W VE
I MS
F RQ
LL AA
P FG
C AI
H TP
N VY
M KP
N YD
W IL
Y YS
W TL
EL EE
M QG
H IN
F IY
EL VE
AV LL
W SS
N KQ
Q KF
K MD
W RR
LLL E
EL KK
M FD
KS TL
AL AD
C DG
AS GG
EL AR
M TQ
Y QP
GL GL
M ML
N FF
M ND
F QD
Y RP
VL C
R QD
H FE
H FD
T MD
Y VQ
AEL G
P TR
F VY
Y VF
R HP
W EE
AW E
M PD
H QG
C FL
Q VY
C VE
R HS
Y QS
C DE
F RP
C PS
AEL E
RR RR
M VM
C VI
P TD
Q TQ
VV AA
R CL
N MG
AD AV
AE KL
H AQ
H DD
Y AH
W KL
Q YS
N QR
W RG
Q AH
P IP
Y QR
AL GG
Y II
SL GL
Y NP
AL EE
K K</w>
P ID
C RE
R ME
T ME
Q IY
N TI
T CL
LLL S
P QS
AD AL
Y KR
I MD
AS LL
M RK
W AG
AA VL
N QD
DD DD
C PL
Y RK
W VS
Y RQ
Q TF
C RR
EE LL
C KL
SL SL
P KP
F CG
SG SG
N IH
AA RL
C IS
Q WL
TG KT
AR RL
V AW
P KR
AD LL
GE S
P VQ
KQ D
W VV
AA AS
Y AY
VE EL
T HP
H KR
H YG
W EL
AA EL
Y HL
M RF
AL AS
ASL G
M QS
M IF
K HS
ARL G
VL AG
F TK
W AS
ALL D
Y QD
IE EL
P RD
P HL
AE AE
C IG
ALL S
W SE
I AC
N KY
KE KL
W AR
AH D
Q KY
P VN
N IV
VG AG
P QQ
Y V
F QP
P KT
M IQ
AA AK
C PE
M TI
EE EL
AE AV
AL SL
RS RS
W AV
TGE KP
F RF
H KD
H AN
F AM
P KN
SS SL
M TK
GL SG
H YE
T WL
F RT
W EG
K WL
I AAL
M TF
Q YE
AV AV
C NL
HE D
W VT
C AE
Y IF
F KI
W RD
N FP
H DS
C VD
F RK
W DE
Y YE
C SD
Y RN
LL AE
Q FP
P QE
Y ND
C AR
H RF
W AE
LL AG
AW S
LL DE
Q VM
F HS
VL VG
W SG
H AH
AA AI
F VH
F CL
X X
AL VE
ATL G
M RN
QQQQ QQQQ
H KS
M KY
W IE
C PP
LL TL
AL RR
C DP
AL RL
H ML
LL KE
M QD
VL GL
GE D
W KG
AD VV
N RY
LL K
F MG
A K</w>
H V
H TF
Q TN
H HP
Q MG
P IR
H AY
Q TI
P IK
GL PG
Y IQ
AP AP
AL EL
TL EE
IG AG
AL KE
C QL
GL AV
Y RF
M ME
M II
W AI
K CL
RL AA
NN D
LL KK
Y KP
LL EL
W QL
LL SS
H NP
C RS
IL AA
F TI
I MP
H DP
Q CL
RE KL
Q VH
RL AE
R HE
N VH
Y HG
N QN
N FN
A E</w>
W NG
VK P
T AC
W IG
N TY
H QE
Y ML
LL AR
W NL
M RP
C AT
AL VL
AL AQ
VE KL
M RT
N QP
VK EL
AA AP
ASS G
AV AR
V AGL
ADL G
C VP
H RP
F RY
KL EE
AS SL
W RS
T HE
H TR
AL AV
AAL S
T VC
C AD
F AH
RL GL
M YE
AK AA
K HE
H NE
C RG
TG SG
V AEL
AE AI
M RI
M HL
AVL E
K AC
VW S
ASS S
P IF
LL R
AG VD
C EG
I VC
H II
EL IE
H VQ
H KT
PG P
Y KQ
AR AA
Y VH
C IE
C TS
T CG
H IF
VE RL
C SP
VV AG
N RQ
W VK
GL AL
M FF
F HE
N CL
P II
R MS
Y KI
P AY
N WL
VL AK
R ALL
W QE
N HS
AG AV
Y FP
RL ID
AA K
Y TN
P VY
AL K
Y PP
DL AA
H KN
Y IY
H RQ
GL AA
AR LL
H HG
P QP
C TE
V ADL
T CS
M YS
VL GG
AG VP
P IN
Y RT
SS LL
SL AA
KE KE
AG AS
M MG
W GS
VE AL
SG SS
LLL D
M MS
H QS
VE W
AI AG
N ME
T AW
Q ND
N YP
KL AE
LL FL
N TQ
AS IL
I CS
W VD
VE AA
Y TF
EL KE
VI AA
QQ D
W SP
TP EE
W PE
Q ME
F IH
AGL S
AD AA
Q FF
Q TY
P TF
W PL
M YG
M VH
C VK
F KY
AS AA
C ID
C DS
VL EL
AT AA
Y RY
P FE
H YS
N VM
W DG
AV AE
LL KL
AD AI
N RK
M RV
LL SG
F TY
C EK
GG GS
N TK
IE KL
AV AG
C VR
I LLL
AR EL
P RF
RL EE
W AK
Q ALL
KT TL
AL AI
GL SL
VL VV
I WL
C YG
Y FN
Y KY
C KS
AR GL
AT VI
AI VG
AL AK
C VN
M RH
AG VG
LL DL
V AVL
M VY
TL H
AL IG
LL PL
F QK
W GE
GL PL
LL RE
W AD
C FS
D VV
C TT
Q TK
W VR
P AH
KG KG
P NP
F ME
AA SL
C NE
N FT
H PP
AG RG
C AK
Y VY
W DD
AA RR
Q HS
W KS
V ALG
VL SG
VH D
I AAG
ASL E
W KR
KL KK
IL SG
H IQ
Q IV
VD SS
AG TG
W TG
LL AK
GG AA
Y RI
H IH
AGL E
Q YD
W SD
TL AR
Q HP
M MD
F KV
AA KL
VV VG
LL VL
W KD
Q FY
F CS
IK EL
C HG
F TV
GG SS
W QG
R AC
K VC
N QF
VL AV
VD AL
V AGG
LL AQ
IL IL
CG D
C YL
C IK
N HP
AA GE
VV AV
N QI
C IT
AL Y
F VM
W AT
AD EL
AA AQ
C KN
SL Y
P FD
C IN
T LLL
N MS
W PG
M QN
C TP
RL R
H FP
IH D
W VP
W FL
W VG
TL AG
F VC
AL VG
P YS
AK KL
SG KS
DL VL
ASL S
F IV
P ML
H KP
LL RR
C RD
AV VV
AL TE
Y FF
V LLG
VL DE
R VC
Q HE
W IS
C ES
VE AG
AL AT
SS SG
H FF
C NS
AK RL
AI AE
W ID
I CE
AKL G
TV EE
AL F
M QP
W DS
C AQ
W IT
RL RE
DL VE
AR AR
C NG
N AH
GL R
AV EL
AE QL
I AW
C KG
AI RE
F WG
AA KK
M FP
W IR
Q FN
RG IT
C AF
H TI
C FG
AL KK
H ND
V GGG
W QR
LL GG
I ALL
VL AE
W DP
F YP
LL AS
AG AD
AAL D
DL RR
AEE E
N FY
AA AT
C AP
F RV
C ED
F KF
VS SL
AL T
C VT
AA SG
K HP
T MR
IS SL
KE EL
C IR
VP VG
N QT
IE RL
Q TV
H QD
Y QN
EL GL
C QE
IE TL
C KD
I AGL
C TD
F QT
LL RL
T MP
H MG
C HL
M QK
N MD
M TY
H RH
AL VD
W NS
D AV
LL AV
AG SG
VV EG
LL SE
DL KK
AI AR
VL AD
N RH
W IK
W NE
EL AG
P RT
K LLE
SL SE
VL RL
EE AL
P ND
Q IH
DL AR
SL ED
Y KF
TL SG
VS GG
VV GL
TL AE
N HG
KL GL
LL IL
P YG
AV VG
P IY
IE AA
P NE
EL RE
K LLL
Y FT
H VY
AV AD
AS TS
R CG
H RI
PL PP
M MN
SL AL
VL AT
W FD
SL TL
SL SG
M RY
F RH
ATL E
AIL E
LL NL
W TP
GL GG
N QY
Y TI
VD EL
VL DL
AA EE
W VN
P VH
AK AL
T ALL
VL EE
H HS
W RQ
P QT
W AP
VL TG
R MN
Q YP
W FG
AG AT
LL AT
AG KS
Y TH
RL AD
H YD
Y AM
W QQ
LL TG
H VH
VL SL
C PD
P QD
K MK
KL VE
AK AG
AL VV
W VI
I AEL
DE DE
V EEG
SL AE
AL TG
C RF
SG KT
GL AG
V WG
V ARL
IL K
R MD
AGG S
M HG
N HE
KL RE
AS GL
VL SS
Y FR
KL IE
H QP
AL TL
T AAL
DL AK
AG EL
W TE
VP VV
F RI
VS LL
AA TL
H NN
AR R
W KN
P FF
AV VD
AEE G
VC D
K MP
IL SL
LL AD
KL AK
KL AD
GL TG
A L</w>
VR EL
AA Y
AC D
AA RE
N KV
AE VL
DE EL
DL AG
C KR
LL IG
P QR
A S</w>
W ED
M IY
LLG D
EL PE
C AN
K ALL
V ATL
M CG
H NS
AA VV
C VF
ADL E
M YD
IE AG
Y TY
RL AR
F WL
SG AS
GG FS
AKL E
W KT
V GLG
R LLL
EL AQ
Y RH
LL VS
LL PE
IL DE
C RQ
P AM
Y IH
EL VK
AL SS
GL TL
VE AV
W TT
H DF
Q AC
IL KK
DL SS
AR RR
P KI
V AVG
N TV
P RY
K MN
AV IG
GL IL
R AAL
H KI
F MS
AE AG
W TD
VT GG
M KH
R AW
KG KK
C RN
C DD
H IY
Y TK
AL DL
Y NF
H RT
D DL
AE AR
ID EL
H RK
H KF
P NN
I MK
VD DL
AI AD
Y TQ
KE AL
M FN
C FD
TG AG
IE DL
W IN
AA VG
KE LL
VV GG
SL GG
SL PL
P FP
Y KV
P RN
RG RG
Y CL
Y CG
V ASG
SL AS
IP VI
AL AF
RE EL
AA EG
VL EG
M AC
AL RD
AD DL
LL VV
GL GE
VL AS
GL T
VL VE
Q CG
H CL
C FE
II SG
ANL G
P YE
P TK
AVL S
W YL
VE QL
RL AK
AD IL
C QS
P RQ
EE IE
AP AA
AL SG
LL T
TL TE
LL VG
H RY
AS VS
Y QK
C QG
VT SS
VI AG
ATL S
VL KK
LL AI
AL EG
K CG
W PD
KL AA
KE IE
P TQ
M KV
AD RL
I MN
W TS
C NP
LL DS
GL VL
AV AK
RE RE
AV AL
AA IL
W FS
LL EG
LL PG
EE VE
Y HS
IV AG
VP GG
F KH
VK KL
EL RR
W ES
C KT
VI GG
AV K
ID DL
C IP
TT SS
V VLG
IE AI
GL EL
F MD
SL VL
N QK
AL KL
TL TL
LL GE
K SLG
AE VV
AD VI
LL ES
NL KK
IV GG
V ASL
AL IL
Y NI
AA TG
V AIL
IE QL
C KP
F MP
AL SE
W AQ
T AAG
PL SL
IL KE
VG AT
TD EE
P VM
AE GL
VS EL
Y MG
R MK
P HS
P KF
LL VD
SL AR
SL AG
PE EL
W NP
W EK
VL AR
N KH
K AW
RL VD
LL TE
C QQ
LL EK
VW E
Y FY
QL AE
GL DL
AL VS
TL KE
AV AS
Y HE
Y NY
TG TG
M TV
RL RL
AV AT
AE VI
VL H
IL EE
AV VE
SL KK
AV SS
Q MD
Y QT
I MT
EL IK
AG GL
VS AA
SL KE
EL SE
I AGG
EE KL
EL VR
DL VV
R CS
AV AN
N CG
C TF
LL TS
KL SG
AD VL
GL SS
AK AK
VV VV
M IH
D AG
VE DL
A R</w>
VG Y
TL AS
R ALG
H KQ
V AIG
AL IE
N MP
SL AV
AQ AA
AG VL
AEL S
ID RL
TG AV
AL VT
GL VV
DE VI
P TI
Y ME
M AKK
AL PG
AG T
AK DL
P HG
DG VV
SL RR
RL KE
IL GG
LL GS
EE AE
LL KN
SL VS
P TN
IV AA
VL KE
AE VE
DL IE
Y YP
EL K
IL AS
ASG S
AE IE
AK VV
AI VE
VD VV
GL K
SL RE
RL RR
AI GG
A V</w>
I ARL
GL N
QL EE
KE KK
H HE
H AM
VL K
VT VT
SS TG
LL KS
IG SG
QE RL
C IY
W ND
AA VE
LL DG
DE DD
R MP
II GG
H RV
KL AG
AS AV
AE TL
SL PS
EL AD
P YP
KE IK
AQL G
AL VR
AL AN
Y FK
VL GE
DL DL
W AF
H RN
SS SE
DL VK
DL PG
KL SE
ID AG
KE TL
VL SE
SL TE
TG AA
P NI
EL VD
AL GE
C PN
VL F
EE AV
T AEE
IK PE
ADL S
N RV
SS GS
AVL D
SL TG
N CS
ATG E
KL ID
R LLE
RL AG
APE G
VI DG
LL SD
IL AE
AL KD
AA KE
AL ES
IL GL
C II
AFL G
C VQ
VG SG
H TN
R MR
N LLG
C VY
C CL
P MP
P IQ
I DLG
I ASL
EL EG
Y MD
KK EE
EL VS
VG AA
GL AI
H IC
RL AS
H FN
SL IL
P KY
AI EE
VK AG
GL VE
TV AA
DE IL
VAA E
RL AQ
AA SS
Y NT
TL SS
AT GG
C PF
W PS
C QR
VT VE
LL KD
AV KE
SL DL
F LLL
VL DG
IL SE
K AKK
H TQ
EL ED
AI AL
Q FT
GG AG
VL AQ
M QT
P MG
H YP
VV IG
SL DE
DL AS
DL PL
VV EE
NL TL
D VI
D GL
Y QY
SL AD
I AIL
AA VS
K MI
AA EK
Y NK
SL SD
DL AE
EL KD
AT AT
AI IG
RL RG
LL QL
EL EK
AP PP
W RN
T LLG
AL PL
AA IG
EL SS
R WG
KL SS
F AC
AV EE
RE IL
M YP
M HP
H DY
AV TG
M FT
H GGG
GL AR
P RK
PS PS
AA AD
C RP
ARL E
I ALG
P KQ
AL QE
VL GS
TS TS
W VQ
T MN
VD K
CG KAF
VL TL
VE VV
C TR
PL AG
AI KE
V ELG
DL EE
N MI
GL VG
GL AE
AQ LL
M IV
AL VK
KL K
VP VI
IL TL
GL KE
GG RG
P TY
GG SG
VL ES
VG RG
AD GS
AA KR
F QH
AV RE
F AW
C NN
AQ AE
TL RE
TG SS
SL K
IG RG
PE DL
SL ES
NG KL
M HE
C AY
AG IG
W FE
VK RL
LL SP
SL W
KL SD
I WG
Y CS
AR QL
VL VD
AV ID
TL R
H DI
IL SS
I LLG
Y RV
D KL
VL IG
IS GG
VE AI
II AA
AV RR
V AEG
Y VC
VL RE
Y QF
K CS
C CS
AL EK
C FR
VV SG
VL DS
EL DE
QL RE
IK RL
C DF
TG EG
P TV
AV GG
AG AR
EL AS
AKL S
C ER
TL DE
RR R
IL VG
I ADL
AQL E
VK PG
H DR
IP GL
C IF
AL KN
RS RG
IL TG
W QS
H FY
M HS
LL QE
C KQ
C CG
AG SE
AE DL
PL GG
KL VD
W IP
M CL
LL RD
AV SG
AV IE
AI AS
TL RL
F QF
IL AD
N AC
PG VG
AG VT
PE PE
AL KG
VE GE
M MP
C YS
KE IL
SL AT
DE VR
AE K
VV VD
VV DL
AA DG
AGL D
T ALG
I GGG
Y QI
VE TL
SS SP
GL VD
AL NL
V ATG
EL R
LL RG
SG KK
VG VG
T CE
II AG
DG TT
VS GL
Q MP
W AN
QL AR
EE AI
V ADG
SL R
P YD
ARL S
AA VR
VK TL
SS AS
AL SD
VK DL
VE GL
VL AN
PG PG
AL KR
Y FI
RL VG
N VC
AS VV
DE NG
AV KG
AA DL
Y NR
GG VL
VE LL
AV GL
W YG
VS R
TT TP
AQ TL
Y VM
IL AG
K MR
VP EL
VK VV
KL KL
HH HH
AE P
T GGG
Q AAL
C KI
TG P
VR GL
VL VT
D AI
IL TE
EE RL
AQ AV
Y WL
AQ RL
SS TL
W KQ
V K</w>
RE RL
PE GL
VS EE
SL VD
H QN
H FT
AL AP
AV KK
N MN
DD EE
AS AG
SL KD
SL IE
R AVL
KL AR
KG VL
VV DG
SL PG
AV VT
QL KE
AIL S
V LLE
LL IS
DL SE
DL VD
W NN
AEL D
Q AW
PS PP
C ML
SL IG
IK KL
DL AD
T PEG
AT AF
A G</w>
Y AC
Q VC
H NF
VV EL
V ASS
EE AK
VT PL
PE LL
IL R
GG PG
KE AI
N LLL
VLL S
TL AT
IL EG
AL RK
AA RG
F CE
QL VE
H WL
H KY
SS VS
FS GL
DL SG
DL GL
AL RS
AA GS
QR EL
P HP
T MT
EL TE
K MT
V AKL
H FK
SL VG
Y KH
APL G
H QT
C RK
KD AG
IS EL
W TR
VL PP
AK GL
LL AN
FL GL
LL QQ
DG KL
AL QL
AA IE
KT VL
AS VL
SL GS
C PT
VE VE
AA PG
SL IS
PL TL
NL SS
AL FG
AA VT
VV AT
SL VT
AE RE
H TH
P QN
M ALL
SL KL
AG IE
W QP
PL PG
W KP
VV SE
C PK
RL SE
AA VK
RS IP
AW D
AKK G
PE TL
LL ED
IP VV
VS TL
AN PL
F MN
AL DG
GL AD
GG EL
GG KG
H YR
GG TL
C VC
P FT
LL AP
AV VL
AG F
Y MS
AK AI
SL RL
SL FS
KE RL
H DN
LL NE
IT DG
PL SE
AL GS
LL VE
DL VS
AL RG
LL RS
DG TL
SL SP
VG EG
DL RD
VF AG
C HS
T AGL
KL KS
IL AV
ID GL
C RT
IT VP
AQ AQ
V AEE
LL PS
AL TT
AD SG
PL AA
SL KS
PL SS
I MR
H ME
EL EL
AL ER
W HL
VW D
RE QL
VS RL
CG KS
RS P
H YT
VS EG
TL AK
VL R
TL SE
H MP
PG SG
TL RG
M FR
LL KQ
AS AL
VD AA
SL VE
IL N
Q LLL
AG VE
C QD
VI VV
SL AK
H QF
TL EG
RL SG
K ELG
AG ID
RV AE
EL KS
H CG
GL IG
RL GG
IL RE
VV AE
LL TT
SL TS
N QH
F QI
Y YR
LL KR
AT AS
AI AN
AR KL
C YE
AL QG
T MK
EL KR
PL DL
GL TE
AL NE
H TY
H KH
SS VT
GL KL
AG KG
D GG
AR VL
I MQ
AK AR
QL AA
IL AT
IL AI
EE VK
VV VT
EL IN
AD AG
KI IK
C DK
AP SG
AL ID
F HD
M QI
AE AD
W QD
M FY
H IV
VG TG
RL VE
I AVL
IL DL
QL AK
AG TT
N AAL
W DT
GL RR
EL SD
DE DL
KL KD
Q WG
P NT
K ALE
TL TG
H TV
EE AA
AR EG
VD GE
VL VL
KE AE
EL IS
N ALL
R LLG
QL QQ
KL KG
T EEE
SL NL
R MI
AL KS
KI KE
AD TL
Y HP
AR AG
SS GG
AK EG
SS W
VL KN
DL VI
C IQ
AT LL
AE GE
EL ID
KI AD
D AD
C FN
EE DL
H TK
SS DS
M AAL
GL SE
VD AI
VV T
W RT
P RI
AA TT
P NF
IG QL
SL IN
VL VS
SL QE
RL IG
KL AQ
EE AR
C AM
P QF
N MF
AV RD
AV EG
V SLG
H VM
ASG E
V L</w>
SL DD
P WL
VI AS
IK DL
AG VV
VL Q
R GGG
LL NS
IG KG
M AW
KK IL
H QH
SL PP
K ALG
DL DG
C DI
AT TT
AL TR
GLG D
AK SG
VE IL
SS AA
K AAL
AL AH
SL F
EL RD
H MD
EL IR
IS GL
II VD
C DR
W SR
RH D
KI IE
K ELE
VG VV
RL AT
EE KE
AI KK
VG GL
H DT
TV AE
SL DS
I VLG
P HE
AG AE
DL IS
AL IS
AT VT
AG VS
SL PE
VL SD
VI SG
TL KK
SL GE
K AAG
AV RL
V SSG
RV AR
AA QL
LL KG
FL DE
Q AVL
QP VE
AI GL
VL PG
AF GG
VD GL
KL QE
SS TS
EL QE
C RY
LL F
LL DD
VS QL
VL T
I WS
SS SD
C KF
C ND
ID AV
W KF
VS AS
VI AE
H NY
IK AG
W RK
IV SG
VK AA
TL AD
TI EE
SL RS
SL FL
DL KE
T WG
VI VG
VV AI
R AAG
D RL
TL ED
KI AE
T WE
VS IL
Q VW
D AT
TL GL
EE EG
TG KS
VL TS
RD VN
EL KN
KI KK
C FP
AD GL
NL SE
H FR
M QF
IL KN
AG SS
AA AN
AI VV
C QP
W FF
H QK
FL EE
EL IG
P FI
H NI
AV RG
AS AR
VI EG
LL ER
AT AL
SL RG
KV KE
VL KD
AS RL
IE AL
AF SG
AI AK
AA FL
H QI
PL PE
KL KN
W IF
R MT
N AW
KH D
AT VE
GL ID
PL VR
P FY
IG DG
QL QE
C YD
W FN
TL TS
TI AA
SS PS
FL AM
AE VR
M VC
KK IS
AV IL
AG EG
IE K
TK EE
M FK
LL FG
H CS
DL IK
VGG S
RL TE
M YR
M ASL
KK AK
AG AL
SL VV
EL NE
N QV
AP GG
VD VL
VP EE
NL SG
I AKL
AA VQ
VG KL
RL SS
P CL
C DY
GL PE
VE VR
IR PL
EL GE
F AAL
TP PP
R VLG
H QY
EL GG
AE RG
F QY
DE VV
DE LL
AG DL
W ML
TL AL
AV VS
AFL S
AD KL
V TLG
V KEG
FL SS
DG SG
EL DL
Y QH
VI GE
KI IG
GL SP
AP VV
K LLG
AV AI
PL VV
VP P
VK DG
I AEE
VL AI
PL IG
LL TD
ID AA
AG IL
IS AS
I ASG
DL IL
SL T
PL GS
KI EE
H YF
H HD
RV AK
AFL E
AG AI
TL DG
IS AL
RL IE
FL GG
D SG
RR RG
KR KK
K RLG
IT GG
F MT
TL AQ
PL TG
IV GL
C FF
VI AT
R MQ
KV AE
KI VE
RL KS
IL NL
AG FE
TL T
RL KL
EL SG
C TN
AA T
DL TE
DE VL
AV KN
VS VG
SL EG
ID AI
W ME
FL KK
DL PE
RL KR
Q MN
KL TG
IL KD
GG LL
C PR
K R</w>
IL GS
DE GG
VV GE
VK AL
AV PG
EL KG
AV SL
AE AQ
W RP
AYL G
VT AI
R CE
NL EE
VE TP
SL KG
FL KE
VK EE
PL PS
AR RG
EL ES
TL KL
LL Y
II IG
IE KE
GL TP
VK VG
N WG
RL VR
DL KL
AT VL
AT SS
AE RR
QL AD
VI EE
P YY
K AEE
AV VR
VT AA
D KS
TR FF
KL GE
EL PL
AG KT
SL VN
TL KD
QL AQ
AE AN
TL SD
VE Y
SL VR
IV DL
VG IG
P MS
EE ED
KS KK
RE AA
GL SD
GG DG
AS EG
A Q</w>
RL RD
VL PL
TL PL
AI IS
V ANL
VL FS
GL DE
RL QE
M MK
K ASL
GL W
KI VD
K CE
AT AR
VL NL
AV EK
F MR
IL KL
P RH
AS AE
QL AG
P IH
LL QS
IL T
VV ID
ASS E
IT AD
IK EE
AI SS
VD KL
GL VS
IL EK
F GGG
RL VV
VI DL
VG AV
V AQL
TV SS
R MV
R ALE
H MS
VI AR
R ARL
VT AG
RL GE
QQQQ QQ
AI AQ
RL ED
IR EL
AK TL
VL VI
AP EE
KK LL
VV DS
VE VT
V AKK
PE KL
SS IS
YL EG
TL VG
GL IS
DE IE
AV SE
APG S
AG IP
VL EK
IK AA
IF GG
GL Y
ATG S
AML G
T ATL
Q LLE
N MT
GL KK
II KK
D TL
AK GG
VL FL
RK RR
PG SS
KG VV
VP AD
IK NL
H KV
RE LL
M QH
KL EK
RI VE
SL RD
PL PL
H DK
Y HD
VL TD
NNNNNNNN NNNNNNNN
IL AK
EE DE
AQ AL
C QN
AV KR
AS KL
APL S
IR DL
VS KK
SL VK
F MI
AL PS
VL RR
C AH
TV SG
RL RS
M QY
AV T
AS DL
R LLD
YL GG
P RV
Y IV
RL W
IK TL
I ATL
AA SE
LL AF
KL DL
F HF
AL DP
AG IS
QL SE
AI RR
AP EL
VK VT
T SSS
FE EL
C WL
C RI
W VF
AK VI
PS TP
II EE
DE VE
DE AR
AE IG
H NK
TV EL
R AGL
AI VD
Y IC
SL Q
II AD
GL VT
AK VL
IE NL
DG SS
AE NL
AE ID
DL KG
AV SD
AL ED
GL VR
FL FL
DL VG
AS AI
W AH
AL RQ
AG KL
IL DS
EL KL
DSDS DSDS
I K</w>
D IE
VV AD
R R</w>
P CG
LL RQ
WY FL
VS TS
II DL
IL IG
IL ES
VS AG
SG TE
I AVG
EE KK
AS VG
AE VD
EL RK
W RF
DS TS
W IQ
QL GG
AN VV
VI AD
TL PE
K ARL
P KH
EL VN
N VLL
GG VI
AR Q
AE IL
SL AI
P QK
KV AD
AA KG
VR RL
LL EQ
TL RR
R AGG
C RH
TG RR
SL KN
SL ID
K A</w>
SE SE
II LL
EL AN
AT AG
VT EG
KE VI
EE VV
GL RL
AP AS
TG VG
EL I
TT PS
AV IS
AL DE
GG VV
VAA S
Q CS
LL QR
EL T
AR TL
FL SG
AQL S
VL TE
M MT
LL KT
TT TG
SS EE
KI ID
K WG
H FI
GG PL
EL TG
AK EE
AD GE
VL KR
AI AI
W YD
PL IT
NG SG
AK RR
VL KG
V DLG
TL KN
AK GE
V APG
VL VN
SL NS
NL AA
P NR
II KE
AT VV
AL EN
Y WG
II VG
AL TD
TV AS
GL EG
W HG
K AVL
IV AV
IL SD
AL VI
VS VS
DE VK
IS KL
V AKG
SL KR
MS KS
AP PG
M YF
GL DS
AL QQ
PL SP
M WL
IS LL
II RE
I TLG
VG KS
IR KL
AR GE
VV AS
VE VK
SS KK
SL AQ
IT RD
I WD
AR AV
AI IE
Q MT
VD AV
RL KK
AP AG
VL RS
I ATG
AT AV
W YS
RE TL
AF AA
VE KG
I GLG
AL PP
VG RL
KR AR
K AGG
D ID
VE AE
RP GG
F VW
W MG
VF GG
I WE
AL IR
V RLG
I ADG
IL VD
C HP
VS N
II GL
RL QG
NL IE
KE GL
AA ID
W ER
IL FL
IL DG
C TK
AG VK
T GLG
NK ID
DL RE
I GGS
DE ID
AL IT
VL Y
C KV
RL PL
D FL
VL VR
VL KL
M AEL
VT VG
SL N
SG KL
RS VI
QL VS
PG GL
TV AV
IS EE
AR DL
VE VI
RL Q
XX XX
V VLE
TV TL
AG AK
AA QG
W EQ
SL AP
QL KK
IE KG
AI LL
D YL
C HE
AT SG
AN AV
IL PG
IL KG
V ELE
PE P
KV EE
SL IT
AS PS
C PQ
C PI
SG KG
R ARG
PL VG
VV SS
R ELG
MG Y
IV KE
D DE
DL SD
T CP
IT EE
C TI
W EN
VI DE
VE AT
NL AR
M YN
FG AP
DL VR
AKK E
DL PS
RL EG
AV GE
YL RD
W QK
TE AA
LL KP
IS VS
AL TP
IG LL
VT EE
V APL
TL PG
GL IN
AD IE
TL RD
RG VD
KK RR
K AKL
KG TG
GL PD
W SF
QT RE
AI ID
VR AG
AL PE
GL AS
W ET
C DV
AR GG
VK VI
AI VT
SG IL
RV PL
V ILG
SL AF
IS RL
EL QG
YL KE
VD RL
KV KK
AV AQ
QE KL
IR AG
IP VG
VT FD
II SS
TL VE
DG TP
QQ QL
DL DD
VT VV
AL FS
VP AG
TV AG
TG KP
SL EK
SG VV
LL NG
IL RR
VI GL
PE AA
P VC
M AKL
KL DE
DG KT
KL TE
SG AV
FD VV
AF AG
VI KE
RL KG
IY P
W YE
RS AA
RR AR
PV AG
LL PP
IL DP
IG ID
W DK
VL DD
LL VT
T SSG
T ASS
LL DR
IG QG
I AKK
EL ER
RG TL
NL KE
I AEG
EL FE
AL TS
AF AR
TL IE
RL SD
VL ED
RE AI
F VLL
VL KS
RS GG
NL KL
ASL D
QV AA
H YK
AS AT
GL KG
AS IS
VS AT
QL SS
Q MR
Q ALG
EL KT
AL KQ
V ARG
VL PE
VG ID
KR KL
IE GE
DL IT
AS ES
TL VP
P FN
DE DG
AGG E
T WS
RL TG
IL PE
VG DG
IP AD
AA VD
VL RD
VI ES
C KY
AK LL
V SSS
KL VK
AG IT
W IY
VL ID
P QY
K AGL
EE TL
DE AA
AS TL
VV VS
I VVG
AR VE
KR VD
TV TG
GL DG
AK QL
VT PE
VI SS
IR GG
VLP WG
SL QL
KK IG
IE VE
Q MI
TS TT
PL RD
F HH
SL NE
N CE
H NR
VP AA
M MQ
IL KS
IK VV
EL RS
AP AE
AI TG
RI IE
HVD HG
F HN
AK AE
W SK
VI IG
K VLE
C DN
VL RG
KN KK
IV AT
F HQ
VR GG
T LLS
QL VG
IV KK
AL QR
C TH
AA YL
RV GG
PP AP
N HD
AD AD
RL VS
RL TL
SS NS
R RLG
IS IL
F HR
M FI
KV RR
DL TL
SL TD
FE GG
VS AE
GL FL
AL KT
V PEG
T VVG
LL DP
IS KE
AI AT
DL KD
C TY
SL IR
RL FG
RG Y
PL TS
EL DD
EE IK
QL KS
PL SG
IL NS
AS VT
C QK
VV VE
K AIL
V ALE
KL TD
II TG
EL TR
AI KG
EL AV
DL ID
VL FG
TT LL
QL VR
W RH
VD AG
IG FG
DG KK
C NK
C DT
VAV S
KE VE
I VIG
AK VT
V KKG
IE VI
GL GS
FL KS
VF TG
N VW
N HF
AHL G
R AEL
PG TP
K VLG
DE AI
AV TE
R ASG
PE VV
P NY
Y HF
Y CE
R K</w>
KL GG
IP AG
H YY
EL TD
TL KS
T AVL
SP EE
H VC
AT IE
R MF
RI AE
GL VP
P QH
KQ KL
IE KD
SL TP
LL PD
GL RG
P AW
M HD
RS AL
IR RL
D SS
D SE
VT IT
VLPWG QMS
AG IK
AD IG
AD AE
VL TT
RV AA
PS TS
KL AS
GL Q
W ST
W SQ
VR IG
TL GG
DL GG
AD SL
Y KM
LL ID
KN IL
AI PY
PL AV
NL ID
DL KR
IP EE
KL ED
TV RG
TP TP
TG SL
SG IG
RR K
APG E
KG VI
DG VT
AS FL
AL DS
D VT
TL ES
TL AV
P YR
LL FS
W DR
AF SS
GL RS
DL RS
VL PS
NNNN NN
KK GE
IV EL
D VE
TE EL
P YF
IR NL
AL FL
TL DD
RL SP
KI IN
RD KL
DE AV
AI EK
TL NL
EL VG
AE Q
GL KD
DG EE
D AP
TL SL
RI AA
IS NL
AF AD
V ALS
IL GE
AI AV
R SLG
PE AV
KKL S
D NL
P QI
KI RE
IS AG
AP Y
AL VN
AI RL
GL AT
GG VG
VT SG
LL VK
PG VL
AL FE
AS EE
AK VE
KL IK
RE AG
M MR
H NT
C TC
KL Y
IS PL
H TGEKP
D IG
TS AA
TI SS
I MV
SL IP
RR RL
M AGL
KL VS
DL VT
AD QL
IG VV
AQ AG
AE TG
TPG HVD
AD VD
VR SG
MS GG
I SSG
GL IT
C YT
AD AT
A D</w>
C YR
IG AT
AP SS
V A</w>
I LLS
GL VI
GL EE
FL KD
EL VQ
DG DG
AS RR
VR EE
KV AA
IS AA
AQ QL
AK AV
TG VV
PG VP
PE QL
IV EE
IV AL
EL IT
AS VE
V VLS
VK AV
R AIL
KL IS
IP VS
AE ML
AE KK
M YY
GG AV
VI LL
V KLG
PD VV
IK N
F HY
Y MP
R WE
P ME
P AC
K GLE
IL ED
C TQ
AQ VV
AE IT
VT PG
LL SR
KE AA
ID EE
TV AT
GL KN
AL QD
AG RS
NL TD
AD VS
VI KK
QL SG
IL VV
AG VH
W HS
VT VK
RE AR
DL EG
T MF
II SL
DL DE
DG RL
YL KK
AL VP
VV TE
VG SS
R VLE
DL KN
GG RT
D KK
AG IR
H KM
DT PG
IL TS
AS AP
PS AP
KL RS
VK AT
VE IE
VE NL
Q MK
VE IG
SL VP
DL DS
AD YL
DL T
AQ AR
VI NG
RL GS
LL SK
KE VV
IK SL
WN FGS
VQ AA
TG AP
KD KK
ANG AS
AK N
AA AY
VS Y
KE KG
ID Y
C VH
AG KK
AE VT
TG TP
SS FS
C NY
AG AP
AD FL
M QV
KT KK
IV LL
RSIP NKL
EL N
VS KG
RV LL
RD GL
K AEL
TL VV
SL TT
RD DL
NG VV
I VDG
GL KS
FE KL
AK PG
VL TP
T LLE
PE AI
KR KR
P FK
IN KL
DE N
W RY
V ANG
VL QE
VI DS
SS IG
SG AA
ID QL
AF AV
EL Q
DL KS
VS AV
VS AL
T VLG
VR AA
VL PD
VD VI
IL Q
F GLG
EL AT
C NT
AG RR
VK AI
SL H
KE AV
EE RE
DL TG
W TQ
TGS NN
IV RR
GL PV
AD VE
QH D
KV AV
AS RG
VT VD
P FR
Y HR
VSG S
SS TT
RL AV
PL VE
N PLG
K KLG
HL RS
GL DP
GG IL
M MF
K GLG
IE AV
YL HS
VV RR
KI AA
VS IG
SS ST
QL TE
W QN
NL TS
KE RE
FWG ATVI
FL ES
EL QD
AK TG
SL FG
RK KL
AL SP
VV TG
VG KG
V TGE
N ALG
KI IS
AQ GE
R ILG
IL NG
APL E
AL W
Q CE
VK SG
I MI
D AE
VT VS
M AVL
VP VE
VP VD
P NK
P MD
P LLL
LL EN
IL VT
AL DD
AK NL
N HH
KL NL
AE RF
SE DE
QL ID
C FT
VV AR
TL SP
PP PG
NS NS
DL SL
F QV
AD PE
NG TL
KP EE
IS VG
EL AI
EL AL
P YT
M CS
KE IT
KE GE
AI SL
TL NE
SK EE
RG IP
R AKL
KS KS
VL EN
TT IP
KL ES
ID ID
I ASS
W DF
TI AT
PL AE
LL TP
FL EG
V DLS
RG VV
C FY
W HP
VK NL
TL VS
RD RD
IV AE
TS SL
T LLD
R LLS
PL AR
II AE
WYFL FAY
W AY
VTL E
RV TG
VR KL
VI GS
KK IE
K AAE
DE AL
C NR
GL FG
VI VD
H CE
FS TL
VK GE
KP KK
IG AE
VS TG
VL NS
R ASL
GL NL
AE H
T ARL
GL RE
YL SP
VL AP
PG VV
IV SS
FL SE
C NI
SL IK
RV RR
IL PL
F LLG
YL TR
Y QV
VLPWGQMS FWGATVI
VG VE
K L</w>
IS QL
II RR
H AC
DL N
VT AS
SL AN
R ADL
LL QG
DS EE
D PE
AG TE
LL TR
IL KR
GG FG
RL IT
NE KL
IS TL
ANL S
VL NG
V REG
RL AL
F ALL
EL RQ
VL ND
RS RR
Q AEL
PD AV
W CG
VR VV
VR PG
IE DG
AK RQ
AD ID
VL ER
R GLG
EL TL
AS QG
RE AV
I EEG
GL AQ
DL IN
M ASS
KL IT
DG EG
YL KS
Y LLL
T MI
M YT
I MF
H LLL
VR DF
VP VS
KL PE
IE AE
FL IG
AI AP
WYFLFAY AIL
IS EG
AN AA
MG EL
II AS
F AGL
VL IT
VI DD
IL QE
FL RR
VK VE
SL KT
C DQ
RG ID
N HN
IS KK
C YP
Y TV
EE IR
DL EK
DE IK
AD VG
RG SG
NL IK
NG EL
TL GE
N HT
AG FD
IG IL
IE KK
I AQL
C PV
KL NE
FS SL
C MG
AS PE
RI AR
M HR
LL RT
LL ND
AV GS
AG EE
IL SP
VK QL
NI VG
TL KR
I IDG
ER IL
C AC
W TF
DL AL
W PP
QM NE
N AGL
KL FG
KE IS
VE GG
GL TS
AT AE
Q AAE
PL EE
NG NG
IE VG
D IT
C VM
VP SS
M HY
VI RE
T AGG
DL ES
AD AR
W II
SG VG
PP PS
K LLD
IV RE
DL Y
W KY
TG TS
IT SS
ID GE
AT PE
AL IK
YL SS
VL NE
VGG E
VL QG
TT AA
II AR
IE TP
TG EI
QE AV
KK IK
PL AS
AK VS
VP VT
V DLE
QL AI
KK IT
IV AS
IE GL
FL DD
AF AE
VD Y
T AEL
PG IG
FL AG
RG RR
IL VN
F AEL
C ME
SL DG
I VEG
HL GG
DG VI
AV AY
AQ EL
V EEE
RG RL
IE RE
FL NL
DL VN
AL RN
VL QL
IL IS
F ASL
KK KS
IT PG
GG EG
Q ARL
C PY
W DY
SG AR
KG KT
F MK
I SLG
GL PS
AL SK
TG KL
RI RE
KG VQ
IL TD
IG RR
AIL D
IG AV
GL AF
A N</w>
IV RG
IP SS
AT TG
Y MN
VK RR
VG AD
IG KN
K WE
IL RL
DG TE
W SN
TV AD
TL ID
SL PD
RV KE
M LLL
KR VV
IC D
H QV
H CD
AS TT
AG VR
VP PP
VL SP
NG IK
F VAA
TS PL
TL AI
SSSS SSSS
KL IN
W CL
VL KT
LL RP
AP AT
VD FL
V AFL
GL DD
AP KG
DL FE
AA FG
SD EE
EL QL
EL PP
AR AI
YG TG
T MQ
KKL E
AE KE
AA TE
VQ AV
TL Q
NL PE
YL EE
VG IN
KTT TT
AG VI
WYFLFAYAIL RSIPNKL
W NK
TS EE
II AV
NL NL
IT PE
GG RL
VP AS
VL VK
TN LLS
P CS
VD SL
SL QD
N MK
AI SE
TV RE
TG TT
AG RL
W IH
V AKE
TL TR
PS RR
NL DE
IE SL
I IEG
GG TS
IK PG
C LLL
IL RG
EL RL
AL IP
TH D
KI AK
W MS
HR DL
SG SD
EL QR
D PG
C NF
VD VS
PS AS
KT VK
IV AI
IK DG
I AKG
DL GE
AT AK
AE AK
RE AY
Q ALE
KP LL
I SSS
Q LLG
AA IR
W FP
VR AV
R AAE
M VW
DD DE
PL IL
KI GL
DG SL
C KH
VI AV
KL RG
II AT
VP LL
VL DP
RV AV
IG EG
FS GG
VS DG
TL QE
TG KG
KL VN
KL EG
F HK
W WL
W TN
P QV
KL RL
F HT
VP VL
VE KE
EL DR
VF VG
RL VN
PL GL
IS TS
AG Y
W TY
IK AI
F AVL
AA SF
P HF
KR RL
QL RR
IT GL
SG TT
R ELE
QL IE
KT VV
KS LL
AG TP
AF LL
VF AA
VD PE
TL PS
FL AE
EL FG
AG K
W TK
V RRG
TG VI
RI EE
AQ AI
V FLG
SL TR
PL RL
P WG
IG HG
EE RR
T ASG
NL KG
M HQ
GL VK
EL VT
IL AL
DS AV
AR AK
M ADL
H YN
IE RG
I ILG
VF DL
TR TR
TP SS
TE KL
P KV
KI VK
DE RL
C IC
VV DE
Q MV
TT NS
RL KD
RK LL
KK AI
D RE
AL RP
IE ID
FL AA
C QT
T CD
RV SG
RG KG
DL EL
TG FG
T AAE
QG VD
PG VT
EE ID
Y HH
VP AP
PL TT
AV ES
VS ES
VR AR
SS IL
PS VV
IG VI
TL N
SS VG
GL AK
C FK
VK SL
QE QE
Q MQ
Q ELG
NL PL
IV PL
IP TN
I EEE
I ANL
VI VT
V APE
SG AG
GL PP
GL IR
AS KK
SS RS
RR GE
Q ALR
KQ RS
AE VS
N K</w>
IV AR
D VR
AE TE
VD SE
VAG S
NG IG
IL AN
EE KR
AL QS
VE VS
R V</w>
QL RD
IG AY
AT VG
V PLG
IL RD
F VLG
W DI
V SLS
P VW
RG AA
K TLG
AI DG
TG KK
EL SL
AL IN
W VY
T ASL
I RLG
I KEG
DT VV
TL TD
QL VN
QL EK
PE GE
AE PL
TG KR
RV KK
M ATL
AS PG
AL AY
VT VP
TL VD
IL VL
DL NL
D KE
AL EI
AA IT
AA IQ
TL EK
RG VT
IR SL
IK NG
TG RS
T K</w>
EL NL
AK DG
AI SG
AI GE
AA RQ
QL GL
LL VR
IP VT
F CD
AE SE
RI ID
Y VW
VE GS
TL TP
TL IS
TL IG
QL VD
LL VP
IG KS
V ARE
LL QD
AK IL
AG PG
QQ RL
PS PL
AP TP
AA AF
YL AE
VV EK
TL VT
QL R
EL RG
IR N
AML E
AA ER
SS AT
IV NG
H VW
GG GGG
VL RQ
VE K
RI RR
ID NL
GG AS
AK AF
PS SL
KE NL
IK AV
AE SL
VR DG
VE VL
VD EE
TV AR
KL DP
W AM
VS AF
IK SG
FL TG
AL HL
KI AR
VE N
C KC
VS PE
V GLS
PG SP
KL IG
IT AI
IS DE
LL VN
KK GS
AT DL
TE DG
PS GL
PG VY
IR AA
PL VI
KL TR
KL N
KE IG
EL VV
AS KG
W FT
VS AI
VL AF
EL FS
Y AW
RP RR
RI AQ
M HH
IP SG
AD AF
VR DL
NL FL
AH GG
AG SL
VG AI
R ALS
KL SL
EL VL
AE DG
W NT
TL RS
RD RR
PT PS
I AAE
RP NVG
IG EE
VI VE
LL DK
VQ VV
VI RR
TG EE
IQ AV
DD FD
AA AM
YD AI
TI AR
M AAG
KR EE
DE AE
D IK
AS TE
AS IT
AQ RQ
AL VF
VT SP
VN TT
VL ST
IP KG
EL IP
AT VS
AE AM
RI KE
N AVL
KP DL
ID PE
TI AG
RG VP
II NG
AT VK
Y ALL
VI TS
V NLG
TL K
IE GG
DS SL
VT IG
M ARL
QI AA
NL KS
N ATL
IS TT
AP PE
W DN
VE Q
SS GL
SS GE
RV RE
LL NN
EE GL
DL TS
D IP
AS IG
AF HF
SG TS
IL TT
AL EQ
KL FS
IV AD
DG SE
TV RR
KI VV
I AIG
C MS
TV KE
HE TGSNN
VE RR
RI VV
QL IG
DD AI
AY RE
VL IS
K LLS
IV AN
IN PE
AR VV
AP AV
C NQ
AR QG
ER AG
EE GE
D VD
AE KF
SS PG
PL AT
PD NY
NL AS
AE GG
VV PS
SG VT
SG SE
Q A</w>
LL IE
IL F
I APL
VS VD
VEL S
TW IGG
RI VD
Q AAG
K ALS
FS P
AT IT
AG TS
VV RS
KS GG
FL KN
DL QL
R VIG
IV TG
AR RE
AL FD
VE SL
N HI
FL SL
AI P
VS PS
M YK
LL RK
IK P
AT GL
TP EL
RL PE
PL VT
I AYL
AE GS
VE RG
T ADL
KL VV
IN EL
H NQ
IS PE
DS DD
VD EG
SG VI
RV RG
IS TG
DL SK
D RQ
AK RG
VLPWGQMSFWGATVI TNLLS
KL KQ
QL DE
KL QL
IL ND
EL KQ
AS TG
AL AM
KK KE
IK VI
AE KG
T AIL
LL QN
KK PE
IG VG
I ELG
PG H
NG SS
LL ST
VKK E
GG ML
EL DG
AS AQ
VL AY
IL FS
DL AT
AP GL
VV KG
VG VN
TL HL
RDVN YG
IS KP
DL AN
TL AN
R A</w>
ME AL
K S</w>
AI IP
AG FG
RG VR
R G</w>
NL PS
KL RD
IT KE
IR KT
I KKG
AI TR
A P</w>
W HE
VG EE
TL TT
IH TGEKP
DL AQ
AV ER
AT TF
AG AY
VV FG
VI AF
TI SG
RI IG
R SSG
QL KQ
W EI
TP AE
KL RR
IL NE
VQ AR
IS ES
DL PP
D VS
AD ML
TP RG
PL RG
M YQ
KV RE
AR KR
YL DG
SS TE
SL VI
M ALG
LL AH
GG IG
F AAG
W DQ
ID AL
DG AA
AI EL
VTL S
NL RS
AF VS
W VM
TT TS
SS RE
R ILE
MK KL
M HT
M CE
YL RE
VV ES
AT EE
AR RD
AK EK
W VH
TL GS
M AEE
IQ EL
GL RD
F WS
DL ER
AR GS
AL RT
VE VP
RL DE
RG EE
RF RQ
QL AS
IS N
D PL
AK P
VV AL
VD IL
R VVG
R CP
DL TD
AK IE
W SI
VY SG
RS PS
IT VN
FL RE
DV VL
VV LL
R CR
PE VL
KL Q
KK AG
IT ID
VE SG
IP AN
AS PL
VAG E
TI TG
IF DE
FL SD
DL AV
Y GGG
VS FG
VG AL
AG SD
VT VR
TR KG
TL QR
TL FG
R AKK
P CP
K AKE
AA SV
V APS
NL KN
NG KK
KD IE
FL GS
AA ES
VG SL
QL IR
NL SD
IQ AF
IK GL
F WD
D FS
VS KL
VI TG
V FEG
T ELG
QL VK
QL N
EE IL
AA QR
YF RD
VE VG
TS PE
PL TP
NG KS
IS KS
VT TT
VE VD
VAV E
RR EG
NL TG
NL AK
NG KP
K AIE
H AW
EL PD
VV KE
VP VP
VE AQ
TV AL
ST SS
QP QP
QL EG
KD KE
FL KR
Q LLS
I LLE
FG FK
EL QQ
AS RS
AR IE
AL SR
AH ID
AG DG
KD IL
DN VV
DL IR
VL IE
RV RL
RR RE
PL RE
AD IS
VS VP
VE AN
T TLG
R WR
IT TS
IN KK
AP AR
LL YL
EL EN
DL NE
DE KK
Q WE
NL IS
I PEG
D SL
W YR
W QT
VV RE
VS AK
VI SE
VG KT
VD ES
TE AE
T AEG
AI ES
AF GL
T MV
SG RR
F SLG
F ALG
DL RL
RS RE
R CD
KG VN
IL DD
VT SL
IN EE
VS KE
VS FE
VI GD
SV EE
PE RL
M MI
IT SG
AS KR
AL VQ
AD KK
TT GG
NI SS
DL NS
DE AG
AD VP
Y HN
V E</w>
TG EL
PL IS
KL DD
KI SS
KI FG
K EEE
IW GGFS
AT TS
W SY
TG KE
IL AR
FL VG
AD EE
R ATL
C HT
AG AN
VI SD
VF SG
SL SK
LL SN
DL IG
AA RD
V AYL
KV AK
DL TP
AV VK
AL DR
VI AN
KG KL
FL DS
ANL E
Y AAL
TR SG
T SLG
SL QS
RG EG
NL TE
GL QG
FS RL
T SLS
GL KR
FS EL
AL NG
VP SG
RR SS
FL RD
AS EL
AS AK
AA TS
RI AK
IL VS
AP AK
V ILE
RL IN
PP SS
NL IG
KG ID
AE IS
W IC
VD AT
KT IE
KEL S
IP AV
GL TR
V GLE
T IKD
H LLG
D PS
RG ED
KI VG
C IH
YL KD
DL ED
VV KK
R WS
IR Y
AE AF
AA QQ
SL ST
KL GS
IL KQ
II AI
AT AI
AG KP
PL VL
ID TL
AS QL
AE PG
QE EL
D FE
C QI
VP PG
IT TL
GL AP
SL FD
RI AN
LL SI
LL QP
IT TT
FL DL
AA SI
W MD
VK N
SS KS
RR RS
Q AEE
PL EG
NL IN
NI KK
ID ES
D RS
D IS
KL AL
VS VV
KR ML
AT SE
VS DE
K ILG
IE IG
TG IS
TG DG
RE AF
P HH
IT DE
F MQ
AI AF
VI KS
VF VD
P MT
M HF
KL FE
IT AS
IG IG
GL ED
AT SL
AL YD
V GLD
NE EL
GL KP
AE KS
VK PE
I AFL
FL N
C FI
VK RG
QE QL
KG Y
II AF
VL SK
VL HE
T GGS
T AKL
RE GL
DD IE
VS PL
QL IK
PE VQ
LL QK
KL KT
KL H
AS TR
VR Q
VR NL
KV SG
AR PG
AA HG
VV PG
VP IG
TP DG
RV AG
R ASS
AD VT
TL VL
T AVG
RI KK
KV TG
KI H
FG IG
VT AV
VI PL
T VIG
PL NL
LL HL
GL IP
C QF
LL IN
IE KS
EL FD
DG AE
AT IG
VE KD
PV PG
NR PL
VK AR
IE Q
DL KT
C QH
AS VR
TWIGG QPVE
LL DT
KQ RL
K ARG
I LLD
FG AF
AT PG
RI VK
PE VT
KK RL
KD KG
II AK
EE EK
DG AV
AG QL
QE IL
NL RE
I L</w>
GG QQ
AP PS
LL IR
K VIG
K ILE
DE KG
YL AK
VL IP
K CD
IE SG
EI AK
AG AQ
SS ES
R AQL
QQ AQ
PL KG
KD VV
GL VN
FL AR
DE AQ
SL EL
KP GG
VS VE
NG EE
KL KR
FL RS
ANG E
T VLS
QL RS
GL YG
AT SP
AA PS
VI PE
TG RP
T CR
LL DY
AA F
SG DS
R SSS
LL YG
IL Y
VI KG
KI KG
VS DL
VR TL
NS SL
NL RR
LL PF
K E</w>
AF KN
AYL S
AS FS
AR DG
VT ED
VI AQ
RL ES
IP GG
DV AK
AD IT
VV RG
VG FG
RE ML
KR AV
IS AV
IP TT
TS TL
P ALL
KL F
CG KT
AS KS
VR VL
SL EN
KI TG
IP VD
GL AY
FD AI
VK IG
TV GG
QL AV
N VLG
KE AG
K VVG
IT ES
FL TL
D AR
VR NG
LL RF
KT VT
F APL
RG NP
KK VI
IL PS
I APS
TE KE
NK YG
IR IG
C QY
AD GG
TP AP
T APS
SG TG
IR KK
IE TS
GL TT
Q ASL
P AAL
I AKE
FG DG
C FQ
VSL E
VI ID
RP SS
RI AV
PS PG
KT VE
FL EN
FG AD
AT IL
AE AS
VR SS
RL VL
DV IL
DE RR
AT RG
AL PD
AAAA AA
VV GS
RL TR
Q AIL
IR TL
DI VL
DI IK
Y AAG
YL DL
TI AD
NG IS
N ASL
KR VL
K ADL
H AAL
EL QK
EL IL
VV AK
VR SE
SG VD
PS PE
KT KE
K WS
I DLE
H WG
VQ RD
SG RG
IK AQ
IG AI
F WE
AV TL
ATL TRFF
VK TG
RL DG
NL EK
KR GG
IR EE
YL SE
VT DE
T RLG
KL VG
KK EG
KF KE
FHP YY
AE YG
W KI
TV PL
ML AT
I KLG
AG SP
V AFG
T ILG
RK AA
LL TN
LL RN
KK VL
F LLE
AE VG
YL VG
VV PE
VS TT
NL KD
EL PG
D AS
TS IS
P HR
LL ET
IP VF
VS AY
VE KS
SL YS
NN NS
KV GL
AT SD
VT AL
VI R
SL RQ
RR ID
KL AN
IS DS
Y IM
VLPWGQMSFWGATVITNLLS AIPY
TR IR
SL RN
KI AQ
IT TE
IE NG
DL AP
VN AG
RV EE
LL TQ
FD RG
AA RS
Y MT
TI RE
RL AN
QL SD
QE AI
Q MF
LL FE
VR VD
T ALS
SS RR
RL IS
RL DD
II VR
EL TS
AG TD
AG AF
VK VK
VE NG
VD FS
LL LLL
IG KK
I PLG
V RLS
TT EE
TE SE
RE KE
PL FL
K SLE
IL AQ
DL H
D AQ
VT TG
RF IE
FD AL
AI RD
VG VP
TR VG
TI RG
RI SG
I WR
AE N
TK KL
RK AR
IT VD
IE N
D AK
C MD
A I</w>
TP AV
PL AL
KT EE
ID YL
AI DL
AE VK
VE SS
KV IL
IV TL
D TS
C HR
AI ER
VQ KL
VD TG
RG IR
RD AV
M LLG
D DG
AT DP
AP VS
W NY
NL SL
K G</w>
IP VE
IL RS
FD AV
VP PS
VL AH
TP VT
W TI
IV SL
IK VN
GG AL
AS HL
VR P
TS PG
RI VG
QL Q
EL QS
AI SD
AF GE
Y CD
V AHL
RL QQ
RG FS
QL IS
PE TT
IN P
II VN
EL TQ
VE SE
TP SG
VR AF
TG ST
RR SG
NL VK
KS KL
KS IS
II ID
EL F
VS ED
IE IE
W NF
IL QG
EL SP
EL DS
VV TS
VD ID
RK VV
IP TP
I CLG
DE VT
AS KE
YL KN
T ATG
NL AE
IL IN
CT GG
VS VT
TL NS
T VGD
RR QL
PL TE
N LLE
KI IT
K DLG
K CP
C CE
YL H
VL VP
VF DG
P HK
NL VE
N A</w>
LL VI
LL AY
IS FL
FE KE
VF VN
TP AD
PS TT
LL VQ
II FG
GL FS
FE GE
VL HG
VI VR
SS SN
IT AA
ID KL
F IDE
AT AQ
AD VN
VR VG
RL VK
RL KN
QL ES
P MN
F ADL
W FY
RV SS
NL KR
I VDE
AT TP
AK RE
Y HY
VE DG
RG SS
LL IP
FE TL
DV FS
DT AG
VI EK
RL AM
RG FG
QV AE
PG TS
K SLS
IT AE
AI KN
AI GS
IL AF
EL PS
AG DP
VL QS
VK SE
RL TS
IE VV
AK AQ
RV AL
RE RR
QI EE
PL RR
PG TT
KI QE
K ATG
GL YE
GGG TFD
C HF
IL PD
IK AM
F ATL
ANGAS MFF
AG ES
H ALL
EL IQ
D PD
AR AE
YD PN
M HN
LLFL HETGSNN
K N</w>
C YY
AV DS
AS VD
QI AE
KI KL
GG GE
FL EK
VL SR
TG IG
N HR
GG GL
DL QE
PL AN
KE GS
IE AR
GG VF
F AGG
AI KR
AG FS
AD AP
AA SP
AA IS
AA IN
VG MG
VE AR
SV SS
Q LLD
IR AV
DL VQ
AI DE
VL HL
VK VN
QL GE
NI AK
MR GL
DD VI
AI KD
TS TG
TI VV
Q G</w>
IL EN
FL QS
DD ID
AV IT
AA VI
AA ML
RI TG
KK VV
K ASS
DL SP
AG EI
TV RL
T ALE
KK KL
DG IG
AS RD
APE S
VT GL
SS DD
R S</w>
DI AG
AL ET
VL RN
VL NN
K AVE
NG SL
KV AR
II FL
DL RP
VL IN
VK GL
RE DG
RD RL
PG TG
PG AV
NL EG
ME KL
LL KY
KK KG
IWGGFS VDK
AV DE
AM GG
AA SD
AA KN
RD SG
QL ED
QE KE
TL IT
TL DS
RK RL
PR GL
PQ GG
KK AR
KD IK
IL IT
IKPE WYFLFAYAILRSIPNKL
ID AR
FS EE
AG PS
VV FL
RG IS
RE GE
C CP
AL QK
VE QG
TL DL
PL PD
KI AV
FE AR
AS TP
AL DK
TR PE
KQ LL
IL TP
FLAM HY
AP VP
W KH
T EEG
I SLS
GG AR
FE TF
AYL E
YL VN
TD VV
KR IR
IS PD
I AQG
GG KL
EL TP
AP VI
Q VLE
N WS
MV KE
DE GE
AI RQ
YL ES
YL EK
VL QQ
QE TT
PL GE
M HI
KT AV
KE IR
FE DL
AF AQ
AA DE
RG ER
II AQ
DL IP
AG IN
AA KP
KK VG
IQ QL
FL KG
AS VP
AI ED
AD EG
VT VN
TT P
SG EL
RP GL
QE HS
M GGG
IQ RL
I DLS
AK VG
A H</w>
YL PE
QI AR
PV VL
KI VN
H VAA
AI IN
AA DD
TV AN
TI PG
RG RE
Q AKK
PV AV
EE ES
DS VT
AA ED
T KLG
SS SSS
QL IN
II KL
DL AI
AQ KR
AP AL
R AEE
NG KT
LL NP
KI SG
K VVE
IK SS
EL EQ
AE IR
VK NG
V RGE
TV AQ
TL KG
T AKK
Q AGL
KE TE
IE KN
F CP
F APS
DL IF
DG KP
AL QN
VS RS
KG RL
IT AT
IG KD
GL NP
FG KG
AT ES
VV VN
SE AE
LL ML
H MT
AL NN
VP AE
T L</w>
Q RLG
PE VI
NL GS
NL ES
ID SL
FL FG
AN IG
AD SS
PPPP PP
GD VV
AT RL
SG PS
QL PL
ID TS
F MF
N HY
KK AV
IT SL
IS DL
VNG E
VD TS
RE IE
M WE
LL IK
I ILS
D IR
C YN
AT IN
AE YL
VV DD
TL EQ
R WD
R SLS
M ASG
KK VS
IG NG
FL TE
Y HI
IS PS
FG VE
AI EG
VL IL
VD VT
N CP
GG VLAL
EL SK
EL AP
DG RG
AS VI
QL KN
NL RL
AI KS
VSG E
V ILS
SL QQ
QL EQ
FG SG
DE IR
NG KE
KI TL
IV PE
IL QQ
H VPL
AA TP
TI KE
IG VS
FR VT
ATLTRFF AFHF
W NR
RT PL
NL DP
IL NN
F ARL
AT PS
AT AD
VV FD
VD VE
SS TP
Q GLG
NL GE
LL SQ
KV AP
KI RR
AI PG
R AVG
IL QS
FS IG
EE AQ
AT KE
AN FL
VG DS
TL PP
SS PL
SG KP
SE KE
R KLG
P HD
NE IK
I APG
H ADY
GG N
EL VP
AP DG
VL KQ
T APG
KL VI
IN NN
GG NG
FL RG
FL AD
W PT
VL HS
KS KG
IS ED
AL KP
YE CL
KV SS
IG VL
AA SK
T A</w>
VG VS
RT AR
M AAS
KL SK
IE VP
C GGG
AI FG
YL AQ
Y HK
VD VD
TE VE
QG SG
KR PL
K RLS
IG TL
I TLE
GL NG
AT AN
SL ET
QL KG
IP TL
I ARG
AP TT
VT DL
VR AL
TK VT
T SGS
SG TP
RL ST
I FLG
I ALS
F AKL
EE IT
AS DG
AI VS
TV AK
TL TN
TI AV
QL QS
LL DF
II VS
DI AR
DE AD
AT KG
AA QE
SL YL
QL AL
NL GG
KT PL
EL AY
DE KL
DE DS
AG KR
W PI
VI VN
V AQG
RL PS
AK AD
VR VR
T SLP
SL NG
NL FE
FL NE
D VG
AS AF
W QY
VS FD
P YN
M AIL
IG IS
DI KE
Y LLG
VL TR
TL FE
RD AL
IG VE
FS KE
AV SP
V AML
T ADG
NL SP
MK IG
IF SS
QL NE
EE TE
AS DS
RG DD
QL RL
IT SD
FY TG
VR N
NL GL
FS TS
FG VT
R ELS
IP PP
II IS
ID TG
FS AT
AV IR
AL TF
YE VS
W YF
W FK
VV RT
VI KN
SL QG
ML EE
IL YG
II VT
II QE
EL SR
AQ RR
AI TS
VF TT
PS EE
NL IP
KG IK
K ILS
FL AK
VQ TL
VG RR
ML SS
ML KE
KL AV
IT PL
IE AK
H VLL
AL FT
VV TD
VK GG
RV AQ
PS VE
N HQ
ID SS
C TM
APS E
AD VR
W QH
VS RD
MAT AF
IN KE
GL RF
AA PP
AA EQ
RL PG
QV AR
PL AP
II TL
I APE
FL DG
EL GS
DL FD
AV ED
AR RS
YL KG
TI AK
I GLS
C FV
VV IT
VF SE
LL EI
VV IS
TT VT
Q AKL
PS KK
PE AP
NR AP
DT IE
AF AK
RE AL
NS KL
NL DD
N CI
KT AE
YL P
VK VL
N VAA
N SSS
N GLG
K ATL
HL AR
AI IL
VK VD
MP VG
HL PE
GG RR
DV TG
DI IE
C HD
AI Y
AI NE
VK AE
TS TP
R DLG
H MN
FG SS
DE F
AQ GS
TK DG
SV AV
RG QR
N CD
IS VP
AL SQ
AL SN
YL TE
PL AK
M AKG
KD AV
GG FL
EE VL
DE IT
D KG
VK AK
TV FE
AEE S
VPE S
NL PP
II AN
IG AR
GL SI
GG SD
VS FS
SS DG
RL YD
RE KG
PPG PPG
PE PG
LL IT
AT TL
VS RG
VS IT
VG DL
TE TE
PL RS
PL IE
F LLS
AQ VI
YL ED
YD KL
WNFGS LLG
VT AK
V R</w>
TI IN
SL QR
SL AY
SG TL
QE AE
FL KL
AM ED
AD ES
VS TP
PI IE
NL IL
DI KK
AG RD
VT ID
TI KK
TI AE
RG VI
QV AD
Q VLG
IS ID
TV NG
RQ RL
K VEG
IS NS
VT IN
TG AR
VN AA
VD DE
TR AG
PS TL
ML P
M GLG
K EEG
IP AE
I VLE
C FC
AV TD
VV KN
VL TQ
VI NR
TG AS
SS IN
SG ES
GL RN
GG RF
AV KS
T VKG
SL FE
NL VG
K DLE
IF VN
EE VI
DD EL
W FR
VD GS
SS AG
RD EL
D RR
D QL
VS AR
VP FS
SL YD
SL RP
Q WD
NL VN
N AAG
M VLG
IK QL
DR VV
DL FN
AT RE
AT EL
RV KL
PL TR
KV AI
VL IF
SS PE
PP TP
K TLS
IK GG
IG AK
I IDE
AN PE
AD FE
YL AG
VS ID
KI IR
K TLE
K AKG
II KG
IG KE
HL AA
DE HG
AT IS
AT EG
W PV
VD VN
TV KK
TE R
T VLE
RS KS
NL DS
NL DL
M CD
K VLD
IP AS
IK PD
AT RR
AD SE
TQ EE
Q ALS
N ARL
KV AG
KL PL
IL KT
IK NS
AQ VG
VQ AI
RD VE
RD SE
PG AR
IS DG
FL PL
AT AP
VT TL
SG EG
QI NG
KS TT
GL ES
AM AA
RR GD
RR AK
I ADE
EE QL
AV FG
VD ED
TV FS
TD SE
SG FG
N HK
LL SF
IS DD
IE KT
F HI
AN RL
AKK S
YL SD
YL RS
TG ES
RE KK
QG RL
PE IL
NI IN
LL VF
KL TS
TE GE
KG EE
H WD
AR SR
AN IL
AI DD
VP GL
Q KLG
PE VR
K AFL
IE KQ
FL RL
DS VV
AN AS
AL YG
AL NS
W SV
T AIG
LL TK
IV AF
FE DG
DL SR
VS RE
VAD E
RL RQ
PS AV
P HT
NL KQ
KE HL
D AM
AE KR
AD ED
NG DT
IS KD
DL TR
DL RG
DK VV
DK VK
DG R
AK SD
W DV
VL EQ
RT REG
QK QR
IF GL
ID SE
ID PD
D VF
AI RG
AE HL
AA FS
YL SG
W VC
W PR
VTG S
TL EL
T WD
PP EE
V KLE
TN KY
PK HL
IR DG
GG VP
VT TS
VS DS
KG FG
K ASG
AV TP
AG H
AA PL
V S</w>
SL AH
R FLS
M ATG
KV KL
GG DL
FL GE
AL VH
AA RK
TS AK
TL SK
RL Y
PL AD
KE KD
IG IP
ID AE
EL AH
AF VE
RI GL
KE AK
IK DS
II RG
II AL
DGTT TAT
AN AI
TE SG
SS VL
Q ADL
FL AL
AV VI
VN KK
T AQL
T AAS
RI AS
NL FG
LL HS
KR IS
IWGGFSVDK ATLTRFFAFHF
II VE
GL H
AV Y
AV RS
AF AS
VV KS
TY GG
NL VP
KE N
FI FG
DV AR
AP VE
VN SP
VG EL
SR KS
RP VV
Q CP
IL GD
FD AA
VD DG
NL NS
KD SG
ITVP AY
F CR
AV RT
VV TL
SS AQ
QR QR
PS AVG
LL FP
KV AL
ID IE
FL RN
FL QR
DL FS
DE SG
TK EL
T PLG
SS RL
SE EL
NG VL
KL NS
KE VK
AP VL
AA TR
SL VQ
IL PF
IK AD
HL AE
GL IE
CG SG
AI TE
VT AE
SL SR
RI AF
PG EG
KK KR
KD GL
FG EK
AA EI
VP DG
TI IG
M DLE
KK IR
IK PS
FL QE
AV TS
AV KD
AS RE
VV KR
TG AT
SL KQ
RL NE
QL DS
NL VQ
NI AR
K WR
EE EI
AS ED
RG TF
NT VL
IT AL
IS IG
AI VK
TK AG
R VLS
LL CL
IR KG
IP VL
IL C
YE KL
QT PL
QI RE
K SSS
DS VS
DL YL
AS YL
AA VP
AA H
RL QD
QL AN
NS TS
N LLD
IN SP
GL TD
FL TS
AV VQ
ANG S
W WG
VT KE
QP QQ
PL DP
F WQ
AL ATL
AD IN
VI TD
VD PS
QE IE
PG KG
IS KG
DG AR
AQ KL
AM EK
VN VG
TT SG
T AFL
SS EL
SL DP
NL KT
DG KG
AF EE
YL NS
YL AD
VV ED
VS AP
TP FF
RV TL
QL QR
Q K</w>
N ILG
IL EL
GL RP
GL IF
GL EK
FS KL
FL Y
DL GS
D VK
VS DD
RV AI
RL H
QE ML
KI VR
YL KR
VI KD
P GLG
LL PR
I VLD
FR TT
FE DS
DE PL
TV PS
TGE RP
T APL
NL IT
N AIL
IR KS
AV DD
AP TS
VN SS
T GLS
ML KL
IQ VE
I AMG
DK IP
DE KE
AS PD
AL TH
YG SD
VP KE
VK IP
TI RR
QG KL
PL IR
HL GD
GG AQ
EE SE
DG RR
VI AL
SS FL
RL EK
QL PP
KT IT
K CR
I VLS
SS IT
RL SR
M VVG
K RLE
IE DD
DN VG
AQ SL
AE NE
VS IS
TL DP
PV SS
PH IKPEWYFLFAYAILRSIPNKL
FG ID
F LLD
DV AD
AE SG
VK PS
TL IN
QV RE
QL TD
NL QR
NL NG
KL SP
IR NE
FD PE
F AAE
AS ID
AAL AA
YL ID
Y MK
VR PE
TT VV
QL PE
NG LL
HL RE
DL TT
AV DG
AS H
AP IG
TP P
SL EQ
II Y
I VVE
FG FD
AR KG
YL GE
SL SF
Q AAS
P LLG
NG AG
KL IL
VT ES
VR VE
VI EQ
TV PG
KE KN
IK AN
FS KK
DE AK
D KD
AI RS
R KLE
ML RE
KG ND
GL DF
FR PG
FL QG
D AN
AS DD
VR Y
TV LL
Q GGG
PL SD
KE TG
K VIE
GL IY
FD RL
EE AG
AF GS
VL FF
LLL RR
K AVG
K ANL
IG RL
I SLE
FS PE
VE IP
TL Y
QL KD
KT LL
IK TG
AP TR
AN SG
AL MG
AD FG
T VDG
N MV
IT AG
FL PE
VT KG
VR LL
TL QS
IV AQ
FT KD
F ADG
AE NG
VI TE
VI IS
VG PG
VE IT
N WE
ML DG
LL DQ
KN KD
KL ND
ED VE
AE RQ
AA FD
YL AR
VT EL
VS RR
VS KS
V GLP
RE IT
R AEG
PS AT
NL VD
FS PD
D AF
AG TY
TI IE
RS PL
NP EL
KT KS
I AAS
EEEE EEEE
AL SF
VS VL
VS PG
VAD S
SS VV
PS GG
IK VG
I ELE
AL ND
AL DN
AI QE
VLPWGQMSFWGATVITNLLSAIPY IG
TP DE
RI GG
Q WS
PL KL
KK EK
K AEG
IV AK
IE IL
DL IQ
V QEG
TK SG
M WG
I ALE
DS QR
C GLP
AV AF
VE AF
VD AD
V AIE
TL QQ
SG EE
RP DL
RE AE
R AFL
PD P
KR VR
KP VV
GL FD
FG AE
DL EN
AP RL
AG QG
Y HQ
VP PE
TV KL
SL NP
RK IS
RG KL
NL AN
KE QG
H LLE
FL IS
AR P
VT IE
VL VF
IT KK
FG SD
YL AT
YG EE
VPG S
SL SN
SL ND
R ARE
Q ILG
NS NN
KT IK
GL AN
FL ED
EL YE
D TG
D TE
AP KP
W YP
VS VR
VL RT
VI VK
VD IR
TT AT
SE DD
RI IP
R L</w>
LL RH
IT IN
IT ED
F MH
AL HG
VR AS
VI NE
VE PE
RF VQ
KS QL
II IN
FS DD
AV QG
AF SP
VG YG
Q CQ
M AKE
K VLS
IE P
FE KG
EE AS
AR ET
AK EI
AI AY
VV DP
VE FD
HL EG
FE RL
EL TT
TS LL
TL PD
TL HP
SL SQ
R APL
KV LL
ID KK
FL AS
K I</w>
DE NE
AY RR
SV AE
RI VN
PS RS
KI AS
II PL
AV NE
AI TD
VY DL
PE AE
KI AN
IE Y
GG TT
DG SD
VL DK
QL TG
AR H
AI EQ
AG RE
SL TN
RP AP
QL SP
NL IR
IQ KL
W NI
VK PL
TL QG
Q CR
KK KN
IS AM
GL VF
DD VV
CG KG
VP KG
VP DS
VE AS
V QLG
TI AS
TD VL
SL IF
RV FE
QL KR
ME RL
KL AT
IL SK
I NLG
FGG AGVG
C CF
AP LL
AI FL
AA AH
VD IP
TT SD
M VIG
M HK
IN PL
IG KL
GL ET
YL QE
RV AF
QV SS
Q ASS
N MR
KE FE
DL R
DI AK
AM EG
TL ER
SL YG
RS AS
QV AT
PP PE
NL ND
NL DG
II PG
DE VD
DE P
AI VL
AG VN
TP GL
Q SLG
PSAVG YQP
ND EE
MV NG
MGG MGG
K GGG
IE VT
ID AT
FD PS
AA PE
QR RL
QL RG
NL YG
KI IP
IG VP
IG SS
IG IT
DL NG
VAN S
TG Y
SI EE
RV PE
QL GS
IS NN
DL DK
AL ST
AA RN
YL GS
VE GD
TL FS
TGL FLAMHY
T ELE
NK VD
LL DN
II FS
FD EL
AG ED
VS IP
N MY
N ELG
KG IE
DD IL
AL RF
AA QS
TL IP
RT TG
R ILD
IS Y
IR TG
AR Y
AL YE
VD AS
SL NN
PL TD
NE TL
KY VP
KK SE
KD KL
K VDG
IR KE
H CR
GL EN
AQ GG
ANGASMFF ICL
AI TT
AH PD
AAAA AAAA
YD IE
VSS E
VN EG
VL RP
RK QL
RE RD
QR AQ
AV FL
AI EN
AA NS
QG VV
Q EEE
PGD VF
NK KL
KK IN
IT IT
IG NL
EL RP
C YF
TL KT
SL QP
Q QLE
KE ID
K WD
I ARE
DI RR
C YK
QV AK
N APG
KE Y
DE SE
AP RR
AL TN
AF VL
YE AR
VD SG
ML SP
KG SS
GL TF
AL KY
W TH
VK AD
VF RS
SG DP
RT SS
GG TG
C YQ
NP AT
N ADL
KS Y
KE NE
IS KN
IE IP
I VSE
I REG
FP DG
AE VY
VP TT
VP IS
VN ID
RR KS
RL VT
NL VV
KR IL
KL VR
KE AR
DV AG
AV VN
AV IN
AL ME
W AC
T ARG
RG RS
KS KN
IS RD
F AEG
DV AA
DS AR
AQ VL
AE IK
RN GG
QD W
NS FL
ML TG
KP VI
FT AN
VGE RTREG
TL IL
TK AV
QL RQ
Q WR
NL ER
NI ID
N MQ
KI VS
KI RL
KI FS
FI SS
EE NE
DE AT
VD HP
V YLG
TI FS
R PEG
PS RL
NL VT
NG TS
ID ED
GL KT
DE AS
AM AR
AL AAL
AA RY
VR AD
VI RS
SL IQ
RQ RR
QE GE
N CT
KV RG
I KKE
FD KL
DI VE
C AW
R EEE
NI IK
KV SL
IL IE
IG PG
IG EL
H SLG
GL F
FV AR
D VN
AA KS
VV GD
NG HL
M YI
IE KR
DT SG
DL VF
DK AR
AV TF
VL SI
PE GS
KK AE
FL PD
EL DP
VI VS
TR DG
TI KG
TE VG
TE KG
T KKG
SG RS
RV FG
PL PT
KK VE
IR P
ID GS
GL ST
DL PD
VL QR
V I</w>
RF GG
NL PG
KV RL
I AGS
EE DD
DE TL
AP SL
YL PS
VT AT
VI KR
RV QG
Q AGG
IE AN
IDE AR
H WE
AQ PG
AD AK
VG AE
SG AI
RF TT
R TLG
LL YS
H AVL
NL RD
NL AD
KP SS
IS RE
IL ID
II EL
H CP
EE ER
DL RN
AM AT
VT PD
VP TS
VI TL
VI PP
VG AN
M ANL
I GLD
D RG
TG RL
RQ AG
RK RG
Q ELE
KKP NS
KG KS
IT PD
FE VV
DS P
DI VV
AD SD
TI AN
TF SS
SL TQ
IQ P
IL ST
II PS
IG ED
EL RN
AK GS
YL QQ
VT VL
RE VV
QI KE
KL DG
KK PL
K AQL
IK VE
DG NG
AL RAL
AL DY
AF YG
YL TS
VG AP
TN SS
RR GS
RR GG
RL TD
PG VS
IT SE
IR TP
IL FF
AI GD
AA KT
VG ED
TL VN
RI SS
KD VK
IV FL
H LLS
FV SS
AI VN
AG DS
AA AAG
W FQ
VP FL
VI TT
TPGHVD FT
TL ST
SG SQ
QMNE PPG
QG IT
C CR
AN EG
SL RT
RLL ER
RL FE
R SLE
LL IF
KK VK
DE IN
D VY
C NV
AG RP
AD PS
AD F
W PF
RG VS
PV TG
NL Y
FG ES
DL DP
DG KS
AT NP
TR NG
R EEG
K SSG
IL SN
IL FG
IK AT
GL PI
ET AE
EE FE
DE SD
AV GD
RS AR
RL ER
RG VE
M LLE
LL SV
KS TG
EL NS
AD AN
AA KQ
YL QS
R AIG
QV KK
QM DG
KE SE
KD PS
IS VD
I SGE
GL NE
AS Y
W KM
PL HL
IG VK
DG VF
D YE
D IN
C MP
AV TR
AR YL
W NQ
VV TT
TL KQ
PG YG
LL II
IE DS
I ANG
AV TT
A F</w>
VS QG
VE RS
VE AK
TP AQ
TG IT
QL TL
KR KS
IG FD
AA RP
YY GG
TS DS
T KKE
T DLG
IK VT
IK AK
ID FS
I ASE
AY ED
AS AN
AP RG
V AYG
TV GL
PS FS
NL AL
IDE ID
DV KK
DE AY
AH VD
YL NE
VS KT
SE SD
QL QL
PS AE
KK YG
IL NP
IG QS
IE RF
GG VM
GG SP
C VW
AP VT
VF EE
VE RE
VD AE
TL RQ
SL TF
RT RR
RL IR
NL ED
M SSS
KE VT
KD AI
GL EI
FG AT
AS F
AF EG
AD TT
VL SN
T PLE
SS PP
RI IS
QI KK
QE AQ
PL SQ
KP FL
IP P
I AFG
AS KT
AA HL
VAS E
RD FL
QV AV
M AGG
DI VI
AA RF
RL NG
RL FS
RAR S
KL DS
KE KS
IN PD
FT GG
DL F
VP AV
RL PP
RD P
N SLS
HIC RDVNYG
AS PP
AG FL
VV TP
VV AQ
VQ EL
VF TD
TR VY
TR EE
SG SP
N AKL
MS KE
ML AE
KL QQ
K NLG
IS TD
IG SL
APG FGD
AL TQ
YL PG
YG EG
VR QS
VL FD
VF DE
TL IK
TF SE
RT RE
K QLG
DT VI
AR FS
AP VG
AN AR
SL KP
RT LL
IV QG
IF DS
IAD S
FL ND
DG VD
AWL G
AS AD
AN IN
AI QL
R AVE
PE IS
IP VY
IL VK
DL VY
YG SG
VS PD
SS NG
RL RK
R AIE
IE NE
DD LL
C RV
AL IQ
AE KI
TL QL
QV FS
Q AFL
NG KG
N ASS
KR AK
KG IT
KE VL
K QLE
F RLG
AS IN
AP TG
AI TP
Y WS
VE TE
TF EE
ML AG
M MV
IL QR
ER MG
YL DD
TG ID
SL FN
RL AF
RF ID
PR GG
PE SP
KL ME
KL GD
KI VI
KD P
DL KP
AN PD
TS DT
RY IE
PL VK
IGE PG
FP EE
EE VG
DV AV
DG VR
AG NL
AD FS
VV AF
VS HL
TV VL
TI ID
T IDG
T FLG
T AKG
II NL
FG FR
AG AM
AA NL
VV IN
VD LL
TS KG
TE DS
SS NE
SL HL
Q ASG
PE SS
IL PP
IK SE
FL TD
DL QD
W AW
RK EL
PG VR
PE NF
KK SG
IT NE
IG GL
IF SG
D FG
AR AF
AF SD
VQ VI
TP LL
RI FG
PL SR
PE Y
KR VK
IS FS
IQ EE
DL HL
DG ID
AT VR
VV SL
VIS ITDG
T ANL
PE PP
IT VT
EE EEE
AD IR
Q AQL
KK SS
IT RE
FT SG
AN KE
AF SE
VR AE
VN PE
TL QD
TL EN
SL SI
RR AA
PL AQ
IT QH
IL KP
IE AQ
GL RT
AVD PL
VS VI
VP AT
VF PD
RF RG
QV EE
NI LL
IT KD
IT EG
IK NN
H SSG
D VP
D TD
AY AD
VK VW
VD RD
TP GG
TL FD
R KEG
IY SG
IL VE
FHPYY TIKD
VD IE
TI PS
T VKS
RL QL
QE LL
PPG S
KI VQ
IK AR
FL AQ
DL QS
AI KL
AG KD
VV SD
VT RE
VI SP
VG DD
T VIE
RL DS
ML AR
MG QK
KT VN
KG KR
IV PS
IG TG
FE QL
DR DG
DE TT
AG QE
YL AV
VG RD
TL FL
T AKE
SE SS
M LLS
IR QL
IL QL
GG IS
FG EE
AS QS
YL DE
W QI
VS ALLG
VD PG
TS PS
RR VR
RL TT
R VVD
IP VP
IE SS
I GLE
I AHG
AP AD
AE QG
AD EI
W YN
W PQ
W PN
VV ER
VI VI
TE KK
R VLD
NL QS
NG VT
M AFL
KT SS
KN KL
KE KY
II FF
IG VT
DS PL
AY AT
AP KE
VD VR
I SGS
HP EL
AT QL
AA VAA
VD AR
T DGS
SL ER
PL FE
NI EE
K AIG
IT VE
IK NE
DV LL
VG SD
VG KR
VD VP
RL KQ
RE RG
R GLS
NG VS
IS AR
IG AL
I YEG
FV AK
FR KL
FG DY
EE GG
DG VL
DG EL
AK RP
AE TT
VN KL
VG NL
TW NIG
TR AE
TK AE
RL IQ
PG AT
IT AR
FD IE
AV RQ
AT QG
AS QE
AD DE
VT VH
VP AR
TL VF
TE IK
T DLE
P APS
KV GG
KN KN
GL QL
GL QE
FL GD
FG Y
AM KK
W CE
VH LLFLHETGSNN
PG RR
PG AG
KK VR
KI Y
KE SS
IT DS
IS VV
HR SGE
DV AE
DL QG
D NG
VV AN
VT PS
VS AN
VEW IWGGFSVDKATLTRFFAFHF
TD PE
RE RS
R TLE
QF MD
Q ARG
PL AI
M AQL
GL FF
DG TG
AD II
VL DT
VK VF
SG Y
ML KG
KK EL
AT FS
AK RF
AK AY
AA FE
VV VL
RG TG
NL N
M ALS
KL EN
KE QL
FL ET
AG KTT
SL QN
PE DE
KF SS
ID PS
AV EQ
W RI
W KW
T TLS
RV AD
RL RN
RL QR
QS QS
PK VL
LL TF
KN P
IN SS
FK DD
DG VE
DE IS
AT KL
VE KT
TK AQ
RE RF
N ALS
KV FE
IV KL
IR AL
ID VL
EI AR
VK PD
VI AI
ST SG
PG EE
MT EE
ML SG
ID VS
ID FD
HL EE
DV AF
C HI
TG IP
RE VL
RE VI
RE KS
KI AP
C CY
RP IS
QS VV
PL KS
NV SS
KR AA
IV FG
GL HQ
FS KD
FL SP
DV SG
AR EK
AQ QQ
VAT E
NN IN
M SSG
IK SD
AQ VK
VS F
T ANG
R ADG
PS IS
PE TS
NP DL
N AEL
KI AL
ID DE
HL VE
H MK
FL KQ
AI RN
AG TF
VK IN
VEE S
TG TL
TG DS
TG DL
SL DN
RP DG
QL TS
KV QE
KK KY
IQ AL
GL SR
DI IN
C HN
AA VF
Y CR
VR PD
VL QD
SE VE
RVK DLPG
PR PP
PL EL
PKG RN
D RD
C YI
AT PL
AN VI
AA TQ
VT LL
TK TT
RG AR
NP KG
IG DS
I SLP
FS DL
D NS
W MP
VV NG
VR YH
VG LL
VG AR
RL IK
RK SG
R FLG
PG ES
N LLS
KE IN
IR SG
IL H
IE VK
IE GS
GG DS
AK SL
VQ AL
TQ HS
T PLS
SI SS
RG EL
PV GL
MS KR
KN VL
KL VL
K V</w>
IL VI
ID IN
F ASS
D KR
AG VF
VT SE
VS QE
VQ PG
SSSS SS
RQ PE
R RLE
KD KD
H VAL
GG AF
AWL E
VE P
TV FL
TG ED
RR P
RK EE
R AKG
NL PD
NL NN
KT RR
HGGG EG
FD EE
AQ IR
AE RD
VD AF
TK AT
SL YE
PV EE
NL VF
IE PL
H MR
DL VP
D AY
AV AP
VD PD
TL VI
N GGG
IT VG
HL KK
FL QQ
AR EE
AE VF
AD PD
TE RG
RL SL
RG GL
QL VT
Q MM
KH KS
IT LL
IP KE
IL QN
HKL GE
FG EG
F K</w>
VS VN
VR KG
VE IK
Q VVE
LL AM
FI EE
FD QL
AE NN
YL RR
W YT
VQ AG
TP RE
T VEG
RL KT
RG YG
QP AA
PG AF
NL VL
NL EL
N AEE
FS DE
DD VL
D ML
TR TF
TM AE
TI AQ
RR VS
RK AG
RHG NKG
R RLS
IP TS
GL QR
VT PP
TI IS
TE SS
SL DR
RL EL
PL ED
KN GG
ID IS
H AGL
DI SS
AS DP
AG YG
VV VR
VP AQ
VG AK
TD AA
QI ID
NV EE
M AVG
KE AF
IL ER
IG DD
C HY
AE DS
VK VR
VD DD
TP VV
RG YE
Q ILE
PR EL
ME VW
KV FG
HL DG
GG RS
AE PE
YL KT
VPL S
VD TD
TE LL
N L</w>
FL FS
F AEE
AN GG
YR EL
VK Y
Q AEG
DG KD
D KT
C IM
VK SS
SV LL
NI AE
MS EE
KS VS
KK ID
IE KF
I AVS
FG TG
DI FG
YG SS
VI ER
QS AV
QG VP
PG SD
NL QE
N SLG
KN VV
KK TG
IE NS
HG SG
H ADL
FDG DQ
AML S
VY GG
VQ P
VI EN
VI ED
VG AY
VF SS
VAE S
TL ET
SV AS
RL IL
QL KL
PE SE
NL TT
IK VD
HL TE
GL FE
VI SR
TV FG
TF FE
RI INE
QE RE
PG AP
ML AK
IV FS
IL SR
IG Y
F ANL
AR ER
AM SS
AF AL
VV KD
VH PD
TL AH
QV AS
Q CD
PS LL
PL DG
PE AL
NN SS
IT IP
IT EK
ID SG
FD P
DEG KG
AI AH
TP VF
T GLD
QN VN
Q VLD
NL EN
II YD
ID VE
F WN
DE KN
AV KL
Y MR
VS AD
T ELS
RQ QL
RL GD
ML AS
KK K</w>
IR QG
ID FE
I TGE
GL PT
AA M
VQ QL
VP QE
VG KD
V G</w>
TL ND
SG ST
SE KL
RK VL
PL NE
NG IP
IT AN
IS EN
IE AT
I ALD
H TLG
DL AF
AR AD
ACL G
VK DS
VI RD
T VIS
QV GG
PS TE
MS RR
KI AG
IT TD
FD RY
EL DK
AE AT
VG RS
VE RF
TS VL
RLE RE
RK TG
RI IN
R VVE
QG AG
N AKK
IG YG
HNL QEHS
FT VT
AQ SG
AP VN
AA DP
VD FE
QK KL
ML RR
MG IP
K IDG
IS VL
H AAG
EL ME
AR FE
AG YE
AE Y
YL TG
VV PL
RL NS
QL SR
PQ SE
NN P
LL VM
KV FS
IID S
GG SL
F SSS
D IF
AA ET
VP TP
T VVE
RV AS
RD NG
PG Y
N TLG
N CR
KG ES
ID RR
ID FG
AV QE
AK VK
AK SS
AA YG
W CP
VS VK
TS VS
RK IG
MV GG
IK Y
FV KE
FD GL
DS VI
D TT
VP SL
VN SE
VAT S
TL VK
RI AT
RG AV
RE IS
QL DD
Q WF
NL VS
KP VT
KL IR
KAI S
K DGE
FG AG
AP IH
A T</w>
W HR
VE DD
TL YS
RE QG
RE DL
PE RR
IL EQ
ID VV
F GLS
DI EE
AV FE
VE PL
TT SP
TD AE
NP EE
NL FD
ML DE
KV FL
KK GG
KI RG
IS TE
IE RR
DE QG
AL MD
VT RR
SG SL
RL PD
QL PG
NI IG
LLGD PDNY
KV EL
AK IK
VQG S
VG TS
V KLS
RV EL
NL TR
NI PL
IP ML
IL RN
DL ST
DD IT
YG KD
V D</w>
TG SE
TG AE
T TGE
QL PS
I ILE
DL QR
AF KD
AE QQ
YL EL
VR AI
VN VV
VK TE
TP VD
T ILE
M WS
M ALE
KV AQ
KE IP
IL TF
IF EE
I AHL
VK GS
TL RN
TE AQ
PV PS
ND DG
K SGS
IL QD
IE KY
ID YD
IAE S
YL DS
VN DL
TP ANPL
SG AT
RI AG
RE FG
PG VF
PG PP
LL HE
C TV
W LLL
RN RR
RL VP
PS VF
PS KP
ML KK
M GLE
K PLG
FT PE
DE KS
AY AE
AF SL
AF RE
AE RY
VT IP
VT FE
VL QF
RL VQ
QQQQQQQQ QQQQQQQQ
QL DL
LL NR
KL YD
KD EE
IT AV
IQ SG
DG VM
AYG E
AS RP
YL AN
VL YG
RI AD
RG KK
R GGS
QE AL
PP Y
NK IK
NI KE
HP VLL
FS FL
FS DS
EE KS
W QF
TL VQ
PS ES
PG KY
K APL
DG TF
DG AS
VI FG
RV AT
RH RH
RG VN
RG AS
NP SS
NK VI
NE KK
KT SL
IL AH
IE H
I ANS
FE VT
DI IT
AT HG
AL SI
VN TG
VISITDG QI
TS RS
TE AT
T VLD
NG Y
K D</w>
FS RS
FL EL
AV AH
AK VR
SL FP
R ITS
R AAS
QV AN
PV AP
II VV
DG RT
DD AR
AV RN
AR IS
VQ VF
TP AS
RL ND
QL NS
PG DN
NV KK
KG VE
IL SF
I RRG
FS VG
FS HL
FI SG
VN P
RF FL
R KLS
R GLE
QS GG
PL H
N SSG
MD IK
KV PE
IQ AA
GL DK
FR KG
C IV
VLG D
VL SQ
VL NP
VG EP
VE KP
VD GG
TS DE
RL FL
RH RR
DG IT
YS LL
VI PG
TR P
TG AN
NK PE
NG TF
M RLG
KP TT
KL EL
IR TT
FL PS
DL SQ
AV QQ
AI VR
A Y</w>
W TW
TS DD
TG LL
QL YE
NK IL
IR KQ
FS AG
FR EL
DI IS
AS DE
YS VS
W HF
VV PP
VD IS
TR IG
TD DE
SE KG
RY KG
RK VR
RG SD
NP SG
NG RR
KV AT
IK YL
IK IT
FS DG
D IY
AQ P
AE FL
W FV
VK IE
Q AIE
LL QT
K AGE
I TLS
H MY
GL KQ
F EEG
DL AY
AV QS
VI SL
TI TL
TG AD
TD TE
RL RT
QL VV
IK AL
GL DY
AF EK
VN SG
VE NS
VE AY
VD DS
RT HL
QG EG
PS YS
NTP PHIKPEWYFLFAYAILRSIPNKL
NG IL
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """
from transformers.configuration_utils import PretrainedConfig
# Map from pretrained-model shortcut name to the location of its config.json.
# Used when resolving configurations by name; the "BertAffinity" entry points
# at this project's local config file rather than the HuggingFace hub.
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
    "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
    "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
    "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
    "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
    "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
    "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
    "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
    "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
    "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
    "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
    "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
    "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
    "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
    "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
    "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
    "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
    "BertAffinity": "./config/config.json"
    # See all BERT models at https://huggingface.co/models?filter=bert
}
class BertConfig(PretrainedConfig):
    r"""Configuration class for :class:`~transformers.BertModel` and
    :class:`~transformers.TFBertModel`.

    Instantiating this configuration with the defaults yields an architecture
    similar to the BERT `bert-base-uncased
    <https://huggingface.co/bert-base-uncased>`__ checkpoint. Configuration
    objects inherit from :class:`~transformers.PretrainedConfig`, which
    provides generic loading/saving and output-control behaviour.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size: number of distinct tokens representable by the
            :obj:`inputs_ids` passed to the model.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads per encoder layer.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the feed-forward ("intermediate") layer.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            Non-linear activation in the encoder and pooler. If a string, one
            of :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu_new"`.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout probability for all fully connected layers in the
            embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            Dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            Maximum sequence length the model supports.
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            Vocabulary size of the :obj:`token_type_ids`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            Stddev of the truncated-normal initializer for weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            Epsilon used by the layer-normalization layers.
        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense
            of a slower backward pass.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            One of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`. See `Shaw et al.
            <https://arxiv.org/abs/1803.02155>`__ and `Huang et al.
            <https://arxiv.org/abs/2009.13658>`__ for the relative variants.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether the model returns the last key/value attentions; only
            relevant when ``config.is_decoder=True``.

    Examples::

        >>> from transformers import BertModel, BertConfig
        >>> configuration = BertConfig()   # bert-base-uncased style defaults
        >>> model = BertModel(configuration)
        >>> configuration = model.config
    """
    # Registry key that AutoConfig/AutoModel use to select this class.
    model_type = "bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        gradient_checkpointing=False,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs
    ):
        # pad_token_id and any extra kwargs are consumed by PretrainedConfig.
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.gradient_checkpointing = gradient_checkpointing
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
from subword_nmt.apply_bpe import BPE
import codecs
import json
import numpy as np
from tqdm import tqdm
import math
import random
def get_tokenzie_seq(file, save, mask=False):
    """Tokenize aligned SMILES/protein/affinity files with BPE and write JSON lines.

    Args:
        file: dict with keys 'seq', 'smile', 'affinity' mapping to input file
            paths; the three files are aligned line-by-line (one record each).
        save: output path; each line is a JSON object with keys "seq"
            ("[CLS] drug-tokens [SEP] protein-tokens [SEP]") and "affinity".
        mask: when True, randomly mask ~15% of the drug and protein tokens
            (BERT-style pre-training input) via ``random_mask``.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    with open(file['seq'], 'r') as f:
        seq = f.readlines()
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    with open(file["affinity"], 'r') as f:
        affinity = f.readlines()
    # Build the drug/protein BPE tokenizers. BPE reads the codes file during
    # construction, so the handles can (and should) be closed right after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    with open(save, "w") as f:
        # Walk the three aligned files in lockstep.
        for seq_line, smile_line, affinity_line in tqdm(zip(seq, smile, affinity), total=len(seq)):
            d = dbpe.process_line(smile_line.strip()).split()
            p = pbpe.process_line(seq_line.strip()).split()
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
                "affinity": affinity_line.strip()
            }
            f.write(json.dumps(item) + '\n')
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly mask ~``mask_proportion`` of the tokens in ``input_seq`` in place.

    About 15% of the positions are sampled (without replacement, so the
    intended number of positions is always considered); each sampled token is
    replaced by "[MASK]" with probability 0.8, BERT-style, otherwise left
    unchanged.

    Args:
        input_seq: list of string tokens; mutated in place.
        mask_proportion: fraction of positions to consider for masking.

    Returns:
        The same list object, with some tokens replaced by "[MASK]".
    """
    if not input_seq:
        return input_seq
    mask_len = math.ceil(len(input_seq) * mask_proportion)
    # replace=False: sampling with replacement could pick the same position
    # twice and silently mask fewer tokens than intended.
    mask_token_posi = np.random.choice(len(input_seq), mask_len, replace=False)
    for i in mask_token_posi:
        if random.random() < 0.8:
            input_seq[i] = "[MASK]"
    return input_seq
def get_tokenzie_seq_case(file, save, mask=False):
    """Tokenize one fixed protein against many SMILES strings (case study).

    The whole protein file is concatenated into a single sequence and paired
    with every SMILES line; each pair is written as a JSON line with key "seq"
    ("[CLS] drug-tokens [SEP] protein-tokens [SEP]"). No affinity label is
    written.

    Args:
        file: dict with keys 'seq' (protein file) and 'smile' (SMILES file).
        save: output path for the JSON-lines file.
        mask: when True, randomly mask ~15% of the tokens in each sequence.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    # Concatenate all protein lines into one continuous sequence.
    with open(file['seq'], 'r') as f:
        seq = "".join(line.strip() for line in f)
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    # BPE reads the codes file during construction; close the handles after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    # The protein is the same for every record: tokenize it once, not per loop.
    protein_tokens = pbpe.process_line(seq).split()
    with open(save, "w") as f:
        for smile_line in tqdm(smile):
            d = dbpe.process_line(smile_line.strip()).split()
            # Copy because random_mask mutates its argument in place.
            p = list(protein_tokens)
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
            }
            f.write(json.dumps(item) + '\n')
if __name__ == '__main__':
    # -------------------------------------------------------------------
    # Driver script for building tokenized JSON-lines datasets. Each df_*
    # dict lists the raw input files for one dataset; the commented-out
    # get_tokenzie_seq(...) calls record which datasets have already been
    # generated. As committed, only the BIOSNAP full-data splits are built.
    # -------------------------------------------------------------------
    # file_train = {"sps": './data/train/train_sps',
    #               'seq': './data/train/train_protein_seq',
    #               "smile": './data/train/train_smile',
    #               "affinity": './data/train/train_ic50',
    #               }
    # save = "./data/tokenize_data/train.tokenize"
    # save_mask = "./data/tokenize_data/train.tokenize.mask"
    # Affinity (regression) datasets, grouped by protein family.
    df_test = {"sps": './data/test/test_sps',
               'seq': './data/test/test_protein_seq',
               "smile": './data/test/test_smile',
               "affinity": './data/test/test_ic50',
               }
    df_ER = {"sps": './data/ER/ER_sps',
             'seq': './data/ER/ER_protein_seq',
             "smile": './data/ER/ER_smile',
             "affinity": './data/ER/ER_ic50',
             }
    df_GPCR = {"sps": './data/GPCR/GPCR_sps',
               'seq': './data/GPCR/GPCR_protein_seq',
               "smile": './data/GPCR/GPCR_smile',
               "affinity": './data/GPCR/GPCR_ic50',
               }
    df_Ion_channel = {"sps": './data/Ion_channel/channel_sps',
                      'seq': './data/Ion_channel/channel_protein_seq',
                      "smile": './data/Ion_channel/channel_smile',
                      "affinity": './data/Ion_channel/channel_ic50',
                      }
    df_Tyrosine_kinase = {"sps": './data/Tyrosine_kinase/kinase_sps',
                          'seq': './data/Tyrosine_kinase/kinase_protein_seq',
                          "smile": './data/Tyrosine_kinase/kinase_smile',
                          "affinity": './data/Tyrosine_kinase/kinase_ic50',
                          }
    # save = "./data/tokenize_data/test.tokenize"
    # save = "./data/tokenize_data/test.tokenize.mask"
    # get_tokenzie_seq(df_test, save)
    # get_tokenzie_seq(file_train, save_mask, mask=True)
    # save_er = "./data/tokenize_data/er.tokenize"
    # save_GPCR = "./data/tokenize_data/gpcr.tokenize"
    # save_channel = "./data/tokenize_data/channel.tokenize"
    # save_kinase = "./data/tokenize_data/kinase.tokenize"
    # Output paths for the masked (pre-training style) variants.
    save_er_mask = "./data/tokenize_data/er.tokenize.mask"
    save_GPCR_mask = "./data/tokenize_data/gpcr.tokenize.mask"
    save_channel_mask = "./data/tokenize_data/channel.tokenize.mask"
    save_kinase_mask = "./data/tokenize_data/kinase.tokenize.mask"
    # get_tokenzie_seq(df_ER, save_er)
    # get_tokenzie_seq(df_GPCR, save_GPCR)
    # get_tokenzie_seq(df_Ion_channel, save_channel)
    # get_tokenzie_seq(df_Tyrosine_kinase, save_kinase)
    # get_tokenzie_seq(df_ER, save_er_mask, mask=True)
    # get_tokenzie_seq(df_GPCR, save_GPCR_mask, mask=True)
    # get_tokenzie_seq(df_Ion_channel, save_channel_mask, mask=True)
    # get_tokenzie_seq(df_Tyrosine_kinase, save_kinase_mask, mask=True)
    # Case study: one protein (SARS-CoV-2 spike file) against all test SMILES.
    df_case = {'seq': './case_study/data/spike.txt',
               "smile": './data/test/test_smile',
               # "affinity": './data/Tyrosine_kinase/kinase_ic50',
               }
    save_case = "./case_study/spike.tokenize"
    # get_tokenzie_seq_case(df_case, save_case)
    #interaction datasets including the train, valide, test
    ## bindingbd dataset
    df_bindingbd_train = {'seq':'./data/interaction/dataset/BindingDB/train/protein',
                          'smile':'./data/interaction/dataset/BindingDB/train/smile',
                          'affinity':'./data/interaction/dataset/BindingDB/train/label'}
    save_bindingbd_train = './data/tokenize_data/bindingdb_train.tokenize'
    # get_tokenzie_seq(df_bindingbd_train, save_bindingbd_train)
    df_bindingbd_valid = {'seq':'./data/interaction/dataset/BindingDB/validate/protein',
                          'smile':'./data/interaction/dataset/BindingDB/validate/smile',
                          'affinity':'./data/interaction/dataset/BindingDB/validate/label'}
    save_bindingbd_valid = './data/tokenize_data/bindingdb_valid.tokenize'
    # get_tokenzie_seq(df_bindingbd_valid, save_bindingbd_valid)
    df_bindingbd_test = {'seq':'./data/interaction/dataset/BindingDB/test/protein',
                         'smile':'./data/interaction/dataset/BindingDB/test/smile',
                         'affinity':'./data/interaction/dataset/BindingDB/test/label'}
    save_bindingbd_test = './data/tokenize_data/bindingdb_test.tokenize'
    # get_tokenzie_seq(df_bindingbd_test, save_bindingbd_test)
    ## biosnap
    # NOTE: these three calls are the only ones active in this script.
    df_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/train/protein',
                        'smile':'./data/interaction/dataset/BIOSNAP/full_data/train/smile',
                        'affinity':'./data/interaction/dataset/BIOSNAP/full_data/train/label'}
    save_biosnap_train = './data/tokenize_data/biosnap_train.tokenize'
    get_tokenzie_seq(df_biosnap_train, save_biosnap_train)
    df_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/validate/protein',
                        'smile':'./data/interaction/dataset/BIOSNAP/full_data/validate/smile',
                        'affinity':'./data/interaction/dataset/BIOSNAP/full_data/validate/label'}
    save_biosnap_valid = './data/tokenize_data/biosnap_valid.tokenize'
    get_tokenzie_seq(df_biosnap_valid, save_biosnap_valid)
    df_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/test/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/full_data/test/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/full_data/test/label'}
    save_biosnap_test = './data/tokenize_data/biosnap_test.tokenize'
    get_tokenzie_seq(df_biosnap_test, save_biosnap_test)
    ## davis
    df_davis_train = {'seq':'./data/interaction/dataset/DAVIS/train/protein',
                      'smile':'./data/interaction/dataset/DAVIS/train/smile',
                      'affinity':'./data/interaction/dataset/DAVIS/train/label'}
    save_davis_train = './data/tokenize_data/davis_train.tokenize'
    # get_tokenzie_seq(df_davis_train, save_davis_train)
    df_davis_valid = {'seq':'./data/interaction/dataset/DAVIS/validate/protein',
                      'smile':'./data/interaction/dataset/DAVIS/validate/smile',
                      'affinity':'./data/interaction/dataset/DAVIS/validate/label'}
    save_davis_valid = './data/tokenize_data/davis_valid.tokenize'
    # get_tokenzie_seq(df_davis_valid, save_davis_valid)
    df_davis_test = {'seq':'./data/interaction/dataset/DAVIS/test/protein',
                     'smile':'./data/interaction/dataset/DAVIS/test/smile',
                     'affinity':'./data/interaction/dataset/DAVIS/test/label'}
    save_davis_test = './data/tokenize_data/davis_test.tokenize'
    # get_tokenzie_seq(df_davis_test, save_davis_test)
    ## biosnap for unseen protein
    df_up_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/label'}
    save_up_biosnap_train = './data/tokenize_data/biosnap_unseen_protein_train.tokenize'
    # get_tokenzie_seq(df_up_biosnap_train, save_up_biosnap_train)
    df_up_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/label'}
    save_up_biosnap_valid = './data/tokenize_data/biosnap_unseen_protein_valid.tokenize'
    # get_tokenzie_seq(df_up_biosnap_valid, save_up_biosnap_valid)
    df_up_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/protein',
                          'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/smile',
                          'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/label'}
    save_up_biosnap_test = './data/tokenize_data/biosnap_unseen_protein_test.tokenize'
    # get_tokenzie_seq(df_up_biosnap_test, save_up_biosnap_test)
    ## biosnap for unseen drug
    df_ud_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/label'}
    save_ud_biosnap_train = './data/tokenize_data/biosnap_unseen_drug_train.tokenize'
    # get_tokenzie_seq(df_ud_biosnap_train, save_ud_biosnap_train)
    df_ud_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/protein',
                           'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/smile',
                           'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/label'}
    save_ud_biosnap_valid = './data/tokenize_data/biosnap_unseen_drug_valid.tokenize'
    # get_tokenzie_seq(df_ud_biosnap_valid, save_ud_biosnap_valid)
    df_ud_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/protein',
                          'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/smile',
                          'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/label'}
    save_ud_biosnap_test = './data/tokenize_data/biosnap_unseen_drug_test.tokenize'
    # get_tokenzie_seq(df_ud_biosnap_test, save_ud_biosnap_test)
\ No newline at end of file
from subword_nmt.apply_bpe import BPE
import codecs
import json
import numpy as np
from tqdm import tqdm
import math
import random
def get_tokenzie_seq(file, save, mask=False):
    """Tokenize aligned SMILES/protein/affinity files with BPE and write JSON lines.

    Args:
        file: dict with keys 'seq', 'smile', 'affinity' mapping to input file
            paths; the three files are aligned line-by-line (one record each).
        save: output path; each line is a JSON object with keys "seq"
            ("[CLS] drug-tokens [SEP] protein-tokens [SEP]") and "affinity".
        mask: when True, randomly mask ~15% of the drug and protein tokens
            (BERT-style pre-training input) via ``random_mask``.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    with open(file['seq'], 'r') as f:
        seq = f.readlines()
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    with open(file["affinity"], 'r') as f:
        affinity = f.readlines()
    # Build the drug/protein BPE tokenizers. BPE reads the codes file during
    # construction, so the handles can (and should) be closed right after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    with open(save, "w") as f:
        # Walk the three aligned files in lockstep.
        for seq_line, smile_line, affinity_line in tqdm(zip(seq, smile, affinity), total=len(seq)):
            d = dbpe.process_line(smile_line.strip()).split()
            p = pbpe.process_line(seq_line.strip()).split()
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
                "affinity": affinity_line.strip()
            }
            f.write(json.dumps(item) + '\n')
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly mask ~``mask_proportion`` of the tokens in ``input_seq`` in place.

    About 15% of the positions are sampled (without replacement, so the
    intended number of positions is always considered); each sampled token is
    replaced by "[MASK]" with probability 0.8, BERT-style, otherwise left
    unchanged.

    Args:
        input_seq: list of string tokens; mutated in place.
        mask_proportion: fraction of positions to consider for masking.

    Returns:
        The same list object, with some tokens replaced by "[MASK]".
    """
    if not input_seq:
        return input_seq
    mask_len = math.ceil(len(input_seq) * mask_proportion)
    # replace=False: sampling with replacement could pick the same position
    # twice and silently mask fewer tokens than intended.
    mask_token_posi = np.random.choice(len(input_seq), mask_len, replace=False)
    for i in mask_token_posi:
        if random.random() < 0.8:
            input_seq[i] = "[MASK]"
    return input_seq
def get_tokenzie_seq_case(file, save, mask=False):
    """Tokenize one fixed protein against many SMILES strings (case study).

    The whole protein file is concatenated into a single sequence and paired
    with every SMILES line; each pair is written as a JSON line with key "seq"
    ("[CLS] drug-tokens [SEP] protein-tokens [SEP]"). No affinity label is
    written.

    Args:
        file: dict with keys 'seq' (protein file) and 'smile' (SMILES file).
        save: output path for the JSON-lines file.
        mask: when True, randomly mask ~15% of the tokens in each sequence.
    """
    begin_token = '[CLS]'
    separate_token = "[SEP]"
    # Concatenate all protein lines into one continuous sequence.
    with open(file['seq'], 'r') as f:
        seq = "".join(line.strip() for line in f)
    with open(file["smile"], 'r') as f:
        smile = f.readlines()
    # BPE reads the codes file during construction; close the handles after.
    with codecs.open('./config/drug_codes_chembl.txt') as bpe_codes_drug:
        dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
    with codecs.open('./config/protein_codes_uniprot.txt') as bpe_codes_prot:
        pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
    # The protein is the same for every record: tokenize it once, not per loop.
    protein_tokens = pbpe.process_line(seq).split()
    with open(save, "w") as f:
        for smile_line in tqdm(smile):
            d = dbpe.process_line(smile_line.strip()).split()
            # Copy because random_mask mutates its argument in place.
            p = list(protein_tokens)
            if mask:
                d = random_mask(d)
                p = random_mask(p)
            final_seq = [begin_token] + d + [separate_token] + p + [separate_token]
            item = {
                "seq": " ".join(final_seq),
            }
            f.write(json.dumps(item) + '\n')
if __name__ == '__main__':
# file_train = {"sps": './data/train/train_sps',
# 'seq': './data/train/train_protein_seq',
# "smile": './data/train/train_smile',
# "affinity": './data/train/train_ic50',
# }
# save = "./data/tokenize_data/train.tokenize"
# save_mask = "./data/tokenize_data/train.tokenize.mask"
df_test = {"sps": './data/test/test_sps',
'seq': './data/test/test_protein_seq',
"smile": './data/test/test_smile',
"affinity": './data/test/test_ic50',
}
# ---------------------------------------------------------------------------
# Per-target evaluation splits: each dict maps logical roles (sps tokens,
# raw protein sequence, SMILES, IC50 labels) to raw text-file paths.
# ---------------------------------------------------------------------------
df_ER = {"sps": './data/ER/ER_sps',
         'seq': './data/ER/ER_protein_seq',
         "smile": './data/ER/ER_smile',
         "affinity": './data/ER/ER_ic50',
         }
df_GPCR = {"sps": './data/GPCR/GPCR_sps',
           'seq': './data/GPCR/GPCR_protein_seq',
           "smile": './data/GPCR/GPCR_smile',
           "affinity": './data/GPCR/GPCR_ic50',
           }
df_Ion_channel = {"sps": './data/Ion_channel/channel_sps',
                  'seq': './data/Ion_channel/channel_protein_seq',
                  "smile": './data/Ion_channel/channel_smile',
                  "affinity": './data/Ion_channel/channel_ic50',
                  }
df_Tyrosine_kinase = {"sps": './data/Tyrosine_kinase/kinase_sps',
                      'seq': './data/Tyrosine_kinase/kinase_protein_seq',
                      "smile": './data/Tyrosine_kinase/kinase_smile',
                      "affinity": './data/Tyrosine_kinase/kinase_ic50',
                      }
# Output paths for the tokenized (and optionally masked) variants of the
# splits above; the commented-out calls record which conversions were run.
# save = "./data/tokenize_data/test.tokenize"
# save = "./data/tokenize_data/test.tokenize.mask"
# get_tokenzie_seq(df_test, save)
# get_tokenzie_seq(file_train, save_mask, mask=True)
# save_er = "./data/tokenize_data/er.tokenize"
# save_GPCR = "./data/tokenize_data/gpcr.tokenize"
# save_channel = "./data/tokenize_data/channel.tokenize"
# save_kinase = "./data/tokenize_data/kinase.tokenize"
save_er_mask = "./data/tokenize_data/er.tokenize.mask"
save_GPCR_mask = "./data/tokenize_data/gpcr.tokenize.mask"
save_channel_mask = "./data/tokenize_data/channel.tokenize.mask"
save_kinase_mask = "./data/tokenize_data/kinase.tokenize.mask"
# get_tokenzie_seq(df_ER, save_er)
# get_tokenzie_seq(df_GPCR, save_GPCR)
# get_tokenzie_seq(df_Ion_channel, save_channel)
# get_tokenzie_seq(df_Tyrosine_kinase, save_kinase)
# get_tokenzie_seq(df_ER, save_er_mask, mask=True)
# get_tokenzie_seq(df_GPCR, save_GPCR_mask, mask=True)
# get_tokenzie_seq(df_Ion_channel, save_channel_mask, mask=True)
# get_tokenzie_seq(df_Tyrosine_kinase, save_kinase_mask, mask=True)
# Case study: COVID spike protein paired with the test-split SMILES
# (no affinity labels available for this set).
df_case = {'seq': './case_study/data/spike.txt',
           "smile": './data/test/test_smile',
           # "affinity": './data/Tyrosine_kinase/kinase_ic50',
           }
save_case = "./case_study/spike.tokenize"
# get_tokenzie_seq_case(df_case, save_case)
#interaction datasets including the train, valide, test
## bindingbd dataset
df_bindingbd_train = {'seq':'./data/interaction/dataset/BindingDB/train/protein',
                      'smile':'./data/interaction/dataset/BindingDB/train/smile',
                      'affinity':'./data/interaction/dataset/BindingDB/train/label'}
save_bindingbd_train = './data/tokenize_data/bindingdb_train.tokenize'
# get_tokenzie_seq(df_bindingbd_train, save_bindingbd_train)
df_bindingbd_valid = {'seq':'./data/interaction/dataset/BindingDB/validate/protein',
                      'smile':'./data/interaction/dataset/BindingDB/validate/smile',
                      'affinity':'./data/interaction/dataset/BindingDB/validate/label'}
save_bindingbd_valid = './data/tokenize_data/bindingdb_valid.tokenize'
# get_tokenzie_seq(df_bindingbd_valid, save_bindingbd_valid)
df_bindingbd_test = {'seq':'./data/interaction/dataset/BindingDB/test/protein',
                     'smile':'./data/interaction/dataset/BindingDB/test/smile',
                     'affinity':'./data/interaction/dataset/BindingDB/test/label'}
save_bindingbd_test = './data/tokenize_data/bindingdb_test.tokenize'
# get_tokenzie_seq(df_bindingbd_test, save_bindingbd_test)
## biosnap
# NOTE: the three BIOSNAP full_data conversions below are the only ones
# currently active; all other datasets are commented out.
df_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/train/protein',
                    'smile':'./data/interaction/dataset/BIOSNAP/full_data/train/smile',
                    'affinity':'./data/interaction/dataset/BIOSNAP/full_data/train/label'}
save_biosnap_train = './data/tokenize_data/biosnap_train.tokenize'
get_tokenzie_seq(df_biosnap_train, save_biosnap_train)
df_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/validate/protein',
                    'smile':'./data/interaction/dataset/BIOSNAP/full_data/validate/smile',
                    'affinity':'./data/interaction/dataset/BIOSNAP/full_data/validate/label'}
save_biosnap_valid = './data/tokenize_data/biosnap_valid.tokenize'
get_tokenzie_seq(df_biosnap_valid, save_biosnap_valid)
df_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/full_data/test/protein',
                   'smile':'./data/interaction/dataset/BIOSNAP/full_data/test/smile',
                   'affinity':'./data/interaction/dataset/BIOSNAP/full_data/test/label'}
save_biosnap_test = './data/tokenize_data/biosnap_test.tokenize'
get_tokenzie_seq(df_biosnap_test, save_biosnap_test)
## davis
df_davis_train = {'seq':'./data/interaction/dataset/DAVIS/train/protein',
                  'smile':'./data/interaction/dataset/DAVIS/train/smile',
                  'affinity':'./data/interaction/dataset/DAVIS/train/label'}
save_davis_train = './data/tokenize_data/davis_train.tokenize'
# get_tokenzie_seq(df_davis_train, save_davis_train)
df_davis_valid = {'seq':'./data/interaction/dataset/DAVIS/validate/protein',
                  'smile':'./data/interaction/dataset/DAVIS/validate/smile',
                  'affinity':'./data/interaction/dataset/DAVIS/validate/label'}
save_davis_valid = './data/tokenize_data/davis_valid.tokenize'
# get_tokenzie_seq(df_davis_valid, save_davis_valid)
df_davis_test = {'seq':'./data/interaction/dataset/DAVIS/test/protein',
                 'smile':'./data/interaction/dataset/DAVIS/test/smile',
                 'affinity':'./data/interaction/dataset/DAVIS/test/label'}
save_davis_test = './data/tokenize_data/davis_test.tokenize'
# get_tokenzie_seq(df_davis_test, save_davis_test)
## biosnap for unseen protein
df_up_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/train/label'}
save_up_biosnap_train = './data/tokenize_data/biosnap_unseen_protein_train.tokenize'
# get_tokenzie_seq(df_up_biosnap_train, save_up_biosnap_train)
df_up_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/validate/label'}
save_up_biosnap_valid = './data/tokenize_data/biosnap_unseen_protein_valid.tokenize'
# get_tokenzie_seq(df_up_biosnap_valid, save_up_biosnap_valid)
df_up_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/protein',
                      'smile':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/smile',
                      'affinity':'./data/interaction/dataset/BIOSNAP/unseen_protein/test/label'}
save_up_biosnap_test = './data/tokenize_data/biosnap_unseen_protein_test.tokenize'
# get_tokenzie_seq(df_up_biosnap_test, save_up_biosnap_test)
## biosnap for unseen drug
df_ud_biosnap_train = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/train/label'}
save_ud_biosnap_train = './data/tokenize_data/biosnap_unseen_drug_train.tokenize'
# get_tokenzie_seq(df_ud_biosnap_train, save_ud_biosnap_train)
df_ud_biosnap_valid = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/protein',
                       'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/smile',
                       'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/validate/label'}
save_ud_biosnap_valid = './data/tokenize_data/biosnap_unseen_drug_valid.tokenize'
# get_tokenzie_seq(df_ud_biosnap_valid, save_ud_biosnap_valid)
df_ud_biosnap_test = {'seq':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/protein',
                      'smile':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/smile',
                      'affinity':'./data/interaction/dataset/BIOSNAP/unseen_drug/test/label'}
save_ud_biosnap_test = './data/tokenize_data/biosnap_unseen_drug_test.tokenize'
# get_tokenzie_seq(df_ud_biosnap_test, save_ud_biosnap_test)
\ No newline at end of file
import numpy as np
import pandas as pd
import torch
from torch.utils import data
import json
import collections
from torch.utils.data import DataLoader
from subword_nmt.apply_bpe import BPE
import codecs
from collections import Counter
from tqdm import tqdm
import math
import random
from torch.nn.utils.rnn import pad_sequence
import pickle, csv
import os
# vocab_path = './ESPF/protein_codes_uniprot.txt'
# bpe_codes_protein = codecs.open(vocab_path)
# pbpe = BPE(bpe_codes_protein, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_uniprot.csv')
#
# idx2word_p = sub_csv['index'].values
# words2idx_p = dict(zip(idx2word_p, range(0, len(idx2word_p))))
# vocab_path = './ESPF/drug_codes_chembl.txt'
# bpe_codes_drug = codecs.open(vocab_path)
# dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_chembl.csv')
#
# idx2word_d = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
# max_d = 205
# max_p = 545
def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered token->index map."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as handle:
        for position, line in enumerate(handle):
            # Only the trailing newline is stripped; other whitespace is
            # part of the token.
            vocab[line.rstrip("\n")] = position
    return vocab
# def protein2emb_encoder(x, words2idx_p):
# max_p = 152
# # t1 = pbpe.process_line(x).split() # split
# t1 = x.split(',')
# try:
# i1 = np.asarray([words2idx_p[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
#
# if l < max_p:
# i = np.pad(i1, (0, max_p - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_p - l))
# else:
# i = i1[:max_p]
# input_mask = [1] * max_p
#
# return i, np.asarray(input_mask)
# def drug2emb_encoder(x, dbpe, words2idx_d):
# max_d = 50
# # max_d = 100
# t1 = dbpe.process_line(x)
# t1 = t1.split() # split
# try:
# i1 = np.asarray([words2idx_d[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
# print(i1)
#
# if l < max_d:
# i = np.pad(i1, (0, max_d - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_d - l))
#
# else:
# i = i1[:max_d]
# input_mask = [1] * max_d
#
# return i, np.asarray(input_mask)
def seq2emb_encoder(input_seq, max_len, vocab):
    """Encode a token sequence into fixed-length ids plus an attention mask.

    Parameters
    ----------
    input_seq : iterable of str
        Tokens to look up in ``vocab``.
    max_len : int
        Output length; shorter inputs are zero-padded, longer ones truncated.
    vocab : dict
        Token -> integer-id mapping.

    Returns
    -------
    (ids, input_mask)
        Two numpy arrays of length ``max_len``; the mask is 1 over real
        tokens and 0 over padding.

    NOTE: if ANY token is missing from ``vocab`` the whole sequence collapses
    to a single ``[0]`` id — preserved for backward compatibility.
    """
    try:
        ids = np.asarray([vocab[i] for i in input_seq])
    except KeyError:
        # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
        # and genuine programming errors; only a missing token is expected.
        ids = np.array([0])
    n = len(ids)
    if n < max_len:
        ids = np.pad(ids, (0, max_len - n), 'constant', constant_values=0)
        input_mask = np.array(([1] * n) + ([0] * (max_len - n)))
    else:
        ids = ids[:max_len]
        input_mask = np.array([1] * max_len)
    return ids, input_mask
def seq2emb_encoder_simple(input_seq, max_len, vocab):
    """Look up token ids for ``input_seq`` without padding or truncation.

    ``max_len`` is accepted only for signature compatibility with
    ``seq2emb_encoder`` and is unused. Falls back to a single ``[0]`` id
    when any token is missing from ``vocab`` (whole-sequence collapse,
    preserved for backward compatibility).
    """
    try:
        return np.asarray([vocab[i] for i in input_seq])
    except KeyError:
        # Fix: was a bare `except:`; only a missing vocab entry is expected.
        return np.array([0])
class Data_Encoder(data.Dataset):
    """Dataset of (SPS protein tokens, BPE-tokenized SMILES, IC50 affinity).

    Each item is encoded as a single ``[CLS] drug [SEP] protein [SEP]``
    id sequence with BERT-style token-type ids and an attention mask.
    """

    def __init__(self, train_file, tokenizer_config):
        """Load the raw text files and build the drug BPE tokenizer.

        train_file: dict with "sps", "smile" and "affinity" file paths.
        tokenizer_config: dict with "begin_id", "separate_id", "max_len",
            "vocab_file" and "vocab_pair" entries.
        """
        with open(train_file["sps"], 'r') as f:
            self.sps = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')

    def __len__(self):
        """Total number of samples."""
        return len(self.sps)

    def __getitem__(self, index):
        """Return (input ids, token-type ids, attention mask, affinity)."""
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        # SPS tokens are stored comma-separated.
        p = self.sps[index].strip().split(',')
        # Fix: np.float was removed in NumPy 1.24; builtin float is identical.
        y = float(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Fix: np.int was also removed in NumPy 1.24; use np.int64 explicitly.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64),
                                         np.ones((len(p) + 1), dtype=np.int64)))
        if len(input_seq) > self.max_len:
            # Fix: the original padded unconditionally, so an over-long sample
            # crashed np.pad with a negative pad width. Truncate like
            # Data_Encoder_mol does, keeping a trailing separator token.
            input_seq = input_seq[:self.max_len - 1] + [self.sep_id]
            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)),
                                    'constant', constant_values=0)
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return (torch.from_numpy(input).long(),
                torch.from_numpy(token_type_ids).long(),
                torch.from_numpy(input_mask).long(),
                y)
class Data_Encoder_mol(data.Dataset):
    """Dataset pairing full protein sequences (BPE) with SMILES (BPE) and affinity.

    Items are encoded as ``[CLS] drug [SEP] protein [SEP]`` with token-type
    ids distinguishing the drug segment (0) from the protein segment (1).
    """

    def __init__(self, train_file, tokenizer_config):
        """Load seq/smile/affinity files and build drug and protein BPE tokenizers.

        train_file: dict with "seq", "smile" and "affinity" file paths.
        tokenizer_config: dict with "begin_id", "separate_id", "max_len",
            "vocab_file", "vocab_pair" and "vocab_pair_p" entries.
        """
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
        bpe_codes_prot = codecs.open(tokenizer_config["vocab_pair_p"])
        self.pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    def __len__(self):
        """Total number of samples."""
        return len(self.smile)

    def __getitem__(self, index):
        """Return (input ids, token-type ids, attention mask, affinity)."""
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.pbpe.process_line(self.seq[index].strip()).split()
        y = np.float64(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Fix: np.int was removed in NumPy 1.24; np.int64 keeps the same dtype.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64),
                                         np.ones((len(p) + 1), dtype=np.int64)))
        if len(input_seq) > self.max_len:
            # Truncate over-long samples, keeping a trailing separator token.
            input_seq = input_seq[:self.max_len - 1] + [self.sep_id]
            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)),
                                    'constant', constant_values=0)
        input, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return (torch.from_numpy(input).long(),
                torch.from_numpy(token_type_ids).long(),
                torch.from_numpy(input_mask).long(),
                y)
class Data_Encoder_LM(data.Dataset):
    """Dataset that yields BPE-tokenized drug/protein strings plus the affinity.

    Unlike Data_Encoder_mol, items stay as space-joined token strings; the
    downstream collate step is expected to do the id conversion.
    """

    def __init__(self, train_file, tokenizer_config):
        """Load seq/smile/affinity files and build both BPE tokenizers."""
        with open(train_file['seq'], 'r') as handle:
            self.seq = handle.readlines()
        with open(train_file["smile"], 'r') as handle:
            self.smile = handle.readlines()
        with open(train_file["affinity"], 'r') as handle:
            self.affinity = handle.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        self.dbpe = BPE(codecs.open(tokenizer_config["vocab_pair"]),
                        merges=-1, separator='')
        self.pbpe = BPE(codecs.open(tokenizer_config["vocab_pair_p"]),
                        merges=-1, separator='')

    def __len__(self):
        """Total number of drug/protein pairs."""
        return len(self.smile)

    def __getitem__(self, index):
        """Return (drug token string, protein token string, affinity)."""
        drug_tokens = self.dbpe.process_line(self.smile[index].strip()).split()
        prot_tokens = self.pbpe.process_line(self.seq[index].strip()).split()
        label = np.float64(self.affinity[index].strip())
        return " ".join(drug_tokens), " ".join(prot_tokens), label
class Data_Provide(data.Dataset):
    """Dataset over a tokenized JSON-lines file paired with its masked twin.

    Each line of both files is a JSON object with a "seq" field; the
    unmasked file additionally carries an "affinity" label.
    """

    def __init__(self, train_file, mask_file, tokenizer):
        """Read both JSON-lines files into memory and keep the tokenizer."""
        with open(train_file, 'r') as handle:
            self.seq = handle.readlines()
        with open(mask_file, 'r') as handle:
            self.seq_mask = handle.readlines()
        self.tokenizer = tokenizer

    def __len__(self):
        """Total number of samples."""
        return len(self.seq)

    def __getitem__(self, index):
        """Return (sequence, masked sequence, affinity) for one sample."""
        record = json.loads(self.seq[index])
        masked_record = json.loads(self.seq_mask[index])
        label = np.float64(record["affinity"])
        return record["seq"], masked_record["seq"], label
class Data_Gen(data.Dataset):
    """Dataset over a tokenized JSON-lines file where the label is optional.

    Each line is a JSON object with a "seq" field and, when labelled,
    an "affinity" field.
    """

    def __init__(self, train_file):
        """Read the JSON-lines file into memory."""
        with open(train_file, 'r') as handle:
            self.seq = handle.readlines()

    def __len__(self):
        """Total number of samples."""
        return len(self.seq)

    def __getitem__(self, index):
        """Return the sequence, or (sequence, affinity) when a label exists."""
        record = json.loads(self.seq[index])
        if "affinity" in record:
            return record["seq"], np.float64(record["affinity"])
        return record["seq"]
def get_task(task_name):
    """Resolve a task name (case-insensitive) to dataset file(s) and tokenizer config.

    Returns
    -------
    (data, tokenizer_config)
        For the raw SPS tasks, pre-tokenized "mol" tasks and the case study.
    (data, mask_data, tokenizer_config)
        For the pre-training tasks, which also need the randomly-masked file.
    None
        For an unknown task name (matching the original if/elif chain,
        which fell through without returning).
    """
    name = task_name.lower()

    # Raw SPS tasks use the SPS-token vocabulary.
    sps_config = {"vocab_file": './config/vocab.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }
    # Tokenized ("mol") tasks use the merged molecule/protein BPE vocabulary.
    mol_config = {"vocab_file": './config/vocab_mol.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "vocab_pair_p": './config/protein_codes_uniprot.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }

    # task -> (sps, smile, affinity) raw file triples.
    sps_tasks = {
        'train': ('./data/train/train_sps', './data/train/train_smile',
                  './data/train/train_ic50'),
        'test': ('./data/test/test_sps', './data/test/test_smile',
                 './data/test/test_ic50'),
        'test_ori_er': ('./data/ER/ER_sps', './data/ER/ER_smile',
                        './data/ER/ER_ic50'),
        'test_ori_gpcr': ('./data/GPCR/GPCR_sps', './data/GPCR/GPCR_smile',
                          './data/GPCR/GPCR_ic50'),
        'test_ori_channel': ('./data/Ion_channel/channel_sps',
                             './data/Ion_channel/channel_smile',
                             './data/Ion_channel/channel_ic50'),
        'test_ori_kinase': ('./data/Tyrosine_kinase/kinase_sps',
                            './data/Tyrosine_kinase/kinase_smile',
                            './data/Tyrosine_kinase/kinase_ic50'),
    }
    if name in sps_tasks:
        sps, smile, affinity = sps_tasks[name]
        return {"sps": sps, "smile": smile, "affinity": affinity}, sps_config

    # task -> pre-tokenized data file.
    mol_tasks = {
        'train_mol': "data/tokenize_data/train.tokenize",
        'test_mol': "data/tokenize_data/test.tokenize",
        'test_er': "data/tokenize_data/er.tokenize",
        'test_gpcr': "data/tokenize_data/gpcr.tokenize",
        'test_channel': "data/tokenize_data/channel.tokenize",
        'test_kinase': "data/tokenize_data/kinase.tokenize",
        'case_study': "case_study/spike.tokenize",
    }
    if name in mol_tasks:
        return mol_tasks[name], mol_config

    # Pre-training tasks pair the tokenized file with its ".mask" variant.
    masked_tasks = {
        'pre-train': "train",
        'test-pre-train': "test",
        'test-pre-train-er': "er",
        'test-pre-train-gpcr': "gpcr",
        'test-pre-train-channel': "channel",
        'test-pre-train-kinase': "kinase",
    }
    if name in masked_tasks:
        stem = "data/tokenize_data/%s.tokenize" % masked_tasks[name]
        return stem, stem + ".mask", mol_config
    # Unknown task name: fall through, returning None like the original chain.
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly replace a fraction of tokens in each sequence with "[MASK]".

    Parameters
    ----------
    input_seq : list of str
        Whitespace-joined token sequences.
    mask_proportion : float
        Fraction of each sequence selected for masking (ceil-rounded).

    Returns
    -------
    list of list of str
        Tokenized sequences with selected tokens masked.

    NOTE(review): only the 80% -> "[MASK]" branch of the usual BERT scheme is
    implemented; the 10% random-token / 10% keep branches were left
    unfinished in the original (it ended in a dangling commented `elif`).
    """
    tokenized = [seq.split() for seq in input_seq]
    budgets = [math.ceil(len(tokens) * mask_proportion) for tokens in tokenized]
    # Fix: replace=False guarantees `budget` distinct positions; the original
    # sampled with replacement and could waste part of the mask budget on
    # duplicate positions.
    positions = [np.random.choice(len(tokens), budget, replace=False)
                 for tokens, budget in zip(tokenized, budgets)]
    for seq_idx, chosen in enumerate(positions):
        for token_idx in chosen:
            if random.random() < 0.8:
                tokenized[seq_idx][token_idx] = "[MASK]"
    return tokenized
class Tokenizer(object):
    """Token/id conversion plus TF-IDF scoring for tokenized drug-protein sequences."""

    def __init__(self, tokenizer_config):
        """Load vocabularies and the pre-computed token-frequency table.

        tokenizer_config: dict with "begin_id", "separate_id", "max_len"
        and "vocab_file" entries.
        """
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        # Read the token list from vocab_mol.txt and map token -> id.
        # NOTE(review): this path is hard-coded and may disagree with
        # tokenizer_config["vocab_file"] — confirm they always match.
        vocab = './config/vocab_mol.txt'
        with open(vocab, 'r', encoding='utf-8') as f:
            tokens = [token.strip() for token in f.readlines()]
        self.token_to_id = {token: i for i, token in enumerate(tokens)}
        # Load the pre-computed token frequency dictionary (used for IDF).
        token_frequency_file = './config/token_frequency.pickle'
        with open(token_frequency_file, 'rb') as f:
            self.token_frequency = pickle.load(f)
        self.total_tokens = sum(self.token_frequency.values())

    def tokenize_sequence(self, sequence):
        """Convert a space-separated sequence into ids ([UNK] for unknowns)."""
        # Fix: the original wrapped the split in an extra list
        # (`tokens = [sequence.split(' ')]`), so each `token` was a list and
        # the dict lookup raised TypeError. Split directly instead.
        tokens = sequence.split(' ')
        token_ids = [self.token_to_id.get(token, self.token_to_id['[UNK]'])
                     for token in tokens]
        return token_ids

    def seq2emb_encoder_simple(self, input_seq, vocab):
        """Look up ids token-by-token, substituting [UNK] for unknown tokens."""
        all_ids = []
        for token in input_seq:
            try:
                all_ids.append(vocab[token])
            except KeyError:
                # Fix: was a bare `except:`; only a missing entry is expected.
                all_ids.append(vocab["[UNK]"])
        return np.asarray(all_ids)

    def convert_token_to_ids(self, seq):
        """Convert a batch of space-separated sequences into padded id tensors.

        Returns
        -------
        (padded ids, mask)
            A [batch, max_len] long tensor and a boolean mask that is True
            wherever the id is non-zero (i.e. non-padding).
        """
        all_seq = [item.split() for item in seq]
        # Truncate over-long sequences, re-appending the separator token.
        for i, tokens in enumerate(all_seq):
            if len(tokens) > self.max_len:
                all_seq[i] = tokens[:self.max_len - 1] + [self.sep_id]
        all_seq_ids = [torch.from_numpy(self.seq2emb_encoder_simple(tokens, self.vocab)).long()
                       for tokens in all_seq]
        padded_seq_ids = pad_sequence(all_seq_ids, batch_first=True)
        if padded_seq_ids.size(1) < self.max_len:
            # Right-pad the whole batch with zeros up to max_len.
            padded_seq_ids = torch.cat(
                [padded_seq_ids,
                 torch.zeros(padded_seq_ids.size(0),
                             self.max_len - padded_seq_ids.size(1))],
                dim=1)
        else:
            padded_seq_ids = padded_seq_ids[:, :self.max_len]
        input_mask = (padded_seq_ids != 0)
        return padded_seq_ids.long(), input_mask

    def calculate_tf_idf(self, sequences):
        """Compute a per-token TF-IDF tensor, zero-padded/truncated to max_len.

        Returns a float tensor of shape (len(sequences), max_len).
        """
        tf_idfs = []
        for index, sequence in enumerate(sequences):
            seq_ids = self.seq2emb_encoder_simple(sequence.split(), self.vocab)
            token_count = Counter(seq_ids)
            token_tf = [token_count[token] / len(seq_ids) for token in seq_ids]
            # NOTE(review): `enumerate(sequence)` yields (index, char) tuples,
            # so the frequency lookup always misses and falls back to 1, and
            # the zip below truncates to the shorter list. The commented-out
            # line in the original suggests `enumerate(sequence.split())` was
            # intended. Preserved as-is to avoid silently changing scores any
            # trained model may depend on — confirm before fixing.
            token_idf = [np.log(((self.token_frequency.get(token, 1)) + 1) / (token_count[token] + 1)) for token in enumerate(sequence)]
            token_tf_idf = [tf * idf for tf, idf in zip(token_tf, token_idf)]
            tf_idfs.append(torch.tensor(token_tf_idf))
        padded_tf_idfs = torch.zeros((len(tf_idfs), self.max_len))
        for i, tf_idf in enumerate(tf_idfs):
            if tf_idf.size(0) < self.max_len:
                padded_tf_idf = torch.cat([tf_idf, torch.zeros(self.max_len - len(tf_idf))])
            else:
                padded_tf_idf = tf_idf[:self.max_len]
            padded_tf_idfs[i] = padded_tf_idf
        return padded_tf_idfs
def collate_fn(batch):
    """Collate a batch of (seq, freq_score?, y) items into tensors.

    NOTE(review): this function looks unfinished/buggy and is likely dead code:
      * `seqs` and `seq_masks` are never appended to, so the lists returned
        are always empty;
      * `max_len` is measured on item[2] but used to pad item[1] — the two
        indices disagree, and `len(item[2])` fails if item[2] is the float
        affinity that Data_Provide returns;
      * `freq_scores` is built and tensorized but not returned.
    Left byte-identical pending confirmation of the intended item layout.
    """
    # Get the maximum length of freq_score in the batch
    max_len = max(len(item[2]) for item in batch)
    # Initialize empty lists for seq, seq_mask, freq_score, and y
    seqs, seq_masks, freq_scores, ys = [], [], [], []
    # Pad freq_score and concatenate seq and seq_mask for each item in the batch
    for item in batch:
        freq_score = item[1]
        freq_score += [0] * (max_len - len(freq_score))
        freq_scores.append(freq_score)
        ys.append(item[2])
    # Convert lists to tensors
    seqs = [torch.tensor(seq) for seq in seqs]
    seq_masks = [torch.tensor(seq_mask) for seq_mask in seq_masks]
    freq_scores = torch.tensor(freq_scores)
    ys = torch.tensor(ys)
    # Return the batch
    return seqs, seq_masks, ys
if __name__ == "__main__":
    # Ad-hoc local smoke tests; most experiments are kept commented out.
    # local test
    # dataFolder = './IC50/SPS/train_smile'
    # with open(dataFolder, 'r') as f:
    #     train_smi = f.readlines()
    # drug_smi = train_smi[0]
    # d_v, input_mask_d = drug2emb_encoder(drug_smi)
    # test load vocab
    # vocab_file = './ESPF/vocab.txt'
    # vocab = load_vocab(vocab_file)
    # test train
    # NOTE(review): the triple-quoted block below is dead code disabled by
    # turning it into an unused string literal.
    '''
task = 'pre-train'
data_file, data_mask, tokenizer_config = get_task(task)
dataset = Data_Provide(data_file, data_mask)
tokenizer = Tokenizer(tokenizer_config)
data_loder_para = {'batch_size': 2,
'shuffle': False,
'num_workers': 0,
}
data_generator = DataLoader(dataset, **data_loder_para)
all_len = []
m = 0
for i, (seq, seq_mask, affinity) in enumerate(tqdm(data_generator)):
input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)
label, _ = tokenizer.convert_token_to_ids(seq)
posi = torch.where(input_random_mask == 1)
target = label[posi]
a = input_random_mask == 4
if torch.sum(a) > 2:
print(torch.sum(a))
'''
    # a = seq[0].split()
    # b = seq_mask[0].split()
    # all_len.append(len(a))
    # if len(a) > 512:
    #     m += 1
    # if len(a) != len(b):
    #     print(seq)
    #     print(i)
    # all_len = np.array(all_len)
    # print(np.max(all_len))
    # print(np.mean(all_len))
    # print(m)
    #test for tokenizer and count frequency
    # sequence = '[CLS] CC1=CC=C (O 1)C2=N C(=CC(=N [SEP] MP VRRG H VAP QN'
    output_file_path = "tf_idf_values.txt"
    sequence = ['[CLS]', ')cn1', '(O', '1)C2=N', 'C(=CC(=N', '[SEP]', 'MP', 'VRRG', 'H', 'VAP', 'MP', 'VRRG', 'QN']
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    tf_idf_values = tokenizer.calculate_tf_idf(sequence)
    print(tf_idf_values.size())
    # Output file path.
    output_file_path = "tf_idf_values.txt"
    # Open the file and write tf_idf_values.
    with open(output_file_path, "w") as file:
        # Convert tf_idf_values to string form.
        tf_idf_str = "\n".join(str(value) for value in tf_idf_values)
        # Write to the file.
        file.write(tf_idf_str)
    print("tf_idf_values已成功写入到文件:", output_file_path)
    # for i, token in enumerate(sequence.split()):
    #     print(f"Token: {token}, TF-IDF value: {tf_idf_values[i]}")
    # task = 'pre-train'
    # data_file, data_mask, tokenizer_config = get_task(task)
    # tokenizer = Tokenizer(tokenizer_config)
    # dataset = Data_Provide(data_file, data_mask, tokenizer)
    # data_loder_para = {'batch_size': 2,
    #                    'shuffle': False,
    #                    'num_workers': 0,
    #                    }
    # data_generator = DataLoader(dataset, **data_loder_para)
    # for idx, inputs in enumerate(data_generator):
    #     x,y1,y2 = inputs
    #     print(f"Batch {idx}: Inputs shape: {x.dtype}")
    #     print(f"Batch {idx}: Targets shape: {y1.dtype}")
    #     # print(f"Batch {idx}: Targets shape: {fre.dtype}")
    #     print(f"Batch {idx}: Targets shape: {y2.dtype}")
    #     # print(f"Batch {idx}: Targets shape: {fre.shape}")
    #     if idx == 2:
    #         break
import numpy as np
import pandas as pd
import torch
from torch.utils import data
import json
import collections
from torch.utils.data import DataLoader
from subword_nmt.apply_bpe import BPE
import codecs
from collections import Counter
from tqdm import tqdm
import math
import random
from torch.nn.utils.rnn import pad_sequence
import pickle, csv
import os
# vocab_path = './ESPF/protein_codes_uniprot.txt'
# bpe_codes_protein = codecs.open(vocab_path)
# pbpe = BPE(bpe_codes_protein, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_uniprot.csv')
#
# idx2word_p = sub_csv['index'].values
# words2idx_p = dict(zip(idx2word_p, range(0, len(idx2word_p))))
# vocab_path = './ESPF/drug_codes_chembl.txt'
# bpe_codes_drug = codecs.open(vocab_path)
# dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
# sub_csv = pd.read_csv('./ESPF/subword_units_map_chembl.csv')
#
# idx2word_d = sub_csv['index'].values
# words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))
# max_d = 205
# max_p = 545
def load_vocab(vocab_file):
    """Read a vocabulary file and map each token to its 0-based line index.

    One token per line; trailing newlines are stripped. Returns an
    OrderedDict so iteration order matches file order.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab
# def protein2emb_encoder(x, words2idx_p):
# max_p = 152
# # t1 = pbpe.process_line(x).split() # split
# t1 = x.split(',')
# try:
# i1 = np.asarray([words2idx_p[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
#
# if l < max_p:
# i = np.pad(i1, (0, max_p - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_p - l))
# else:
# i = i1[:max_p]
# input_mask = [1] * max_p
#
# return i, np.asarray(input_mask)
# def drug2emb_encoder(x, dbpe, words2idx_d):
# max_d = 50
# # max_d = 100
# t1 = dbpe.process_line(x)
# t1 = t1.split() # split
# try:
# i1 = np.asarray([words2idx_d[i] for i in t1]) # index
# except:
# i1 = np.array([0])
# # print(x)
#
# l = len(i1)
# print(i1)
#
# if l < max_d:
# i = np.pad(i1, (0, max_d - l), 'constant', constant_values=0)
# input_mask = ([1] * l) + ([0] * (max_d - l))
#
# else:
# i = i1[:max_d]
# input_mask = [1] * max_d
#
# return i, np.asarray(input_mask)
def seq2emb_encoder(input_seq, max_len, vocab):
    """Encode a token sequence into a fixed-length id array plus attention mask.

    Parameters
    ----------
    input_seq : iterable of str
        Tokens to look up in ``vocab``.
    max_len : int
        Output length; shorter sequences are zero-padded, longer truncated.
    vocab : mapping str -> int
        Token-to-id mapping.

    Returns ``(ids, input_mask)``, both 1-D numpy arrays of length ``max_len``;
    the mask is 1 over real tokens and 0 over padding.
    """
    try:
        ids = np.asarray([vocab[token] for token in input_seq])
    # FIX: narrowed from a bare ``except:`` which would also swallow
    # unrelated errors (e.g. TypeError) and even KeyboardInterrupt.
    except KeyError:
        # Original fallback preserved: any OOV token collapses the whole
        # sequence to a single [0].
        ids = np.array([0])
    n = len(ids)
    if n < max_len:
        ids = np.pad(ids, (0, max_len - n), 'constant', constant_values=0)
        input_mask = np.array(([1] * n) + ([0] * (max_len - n)))
    else:
        ids = ids[:max_len]
        input_mask = np.array([1] * max_len)
    return ids, input_mask
def seq2emb_encoder_simple(input_seq, max_len, vocab):
    """Encode tokens to a variable-length id array (no padding, no mask).

    ``max_len`` is accepted for signature compatibility with
    ``seq2emb_encoder`` but is intentionally unused here.
    """
    try:
        ids = np.asarray([vocab[token] for token in input_seq])
    # FIX: narrowed from a bare ``except:`` to the only expected failure.
    except KeyError:
        # Original fallback preserved: any OOV token collapses the output to [0].
        ids = np.array([0])
    return ids
class Data_Encoder(data.Dataset):
    """Dataset pairing BPE-encoded drug SMILES with SPS protein tokens.

    Each item is ``(input_ids, token_type_ids, attention_mask, affinity)``
    for the joint sequence ``[CLS] drug [SEP] protein [SEP]`` padded to
    ``max_len``.
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Load raw per-line text data.
        with open(train_file["sps"], 'r') as f:
            self.sps = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.sps)

    def __getitem__(self, index):
        'Generates one sample of data'
        # BPE-tokenize the SMILES; the SPS protein line is comma-delimited.
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.sps[index].strip().split(',')
        # FIX: np.float / np.int were removed from NumPy (>= 1.24); use
        # np.float64 / np.int64 (consistent with Data_Encoder_mol).
        y = np.float64(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Segment ids: 0 over [CLS]+drug+[SEP], 1 over protein+[SEP]; pad with 0.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64), np.ones((len(p) + 1), dtype=np.int64)))
        token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        input_ids, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return torch.from_numpy(input_ids).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(
            input_mask).long(), y
class Data_Encoder_mol(data.Dataset):
    """Dataset of BPE-encoded drug SMILES + protein sequences with affinity.

    Items are ``(input_ids, token_type_ids, attention_mask, affinity)`` for
    the joint sequence ``[CLS] drug [SEP] protein [SEP]``, truncated or
    padded to ``max_len``.
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Load raw per-line text data.
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings.
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        bpe_codes_drug = codecs.open(tokenizer_config["vocab_pair"])
        self.dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
        bpe_codes_prot = codecs.open(tokenizer_config["vocab_pair_p"])
        self.pbpe = BPE(bpe_codes_prot, merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.smile)

    def __getitem__(self, index):
        'Generates one sample of data'
        d = self.dbpe.process_line(self.smile[index].strip()).split()
        p = self.pbpe.process_line(self.seq[index].strip()).split()
        y = np.float64(self.affinity[index].strip())
        input_seq = [self.begin_id] + d + [self.sep_id] + p + [self.sep_id]
        # Segment ids: 0 over [CLS]+drug+[SEP], 1 over protein+[SEP].
        # FIX: np.int was removed from NumPy (>= 1.24); use np.int64.
        token_type_ids = np.concatenate((np.zeros((len(d) + 2), dtype=np.int64), np.ones((len(p) + 1), dtype=np.int64)))
        if len(input_seq) > self.max_len:
            # Truncate but keep a trailing [SEP].
            input_seq = input_seq[:self.max_len - 1] + [self.sep_id]
            token_type_ids = token_type_ids[:self.max_len]
        else:
            token_type_ids = np.pad(token_type_ids, (0, self.max_len - len(input_seq)), 'constant', constant_values=0)
        input_ids, input_mask = seq2emb_encoder(input_seq, self.max_len, self.vocab)
        return torch.from_numpy(input_ids).long(), torch.from_numpy(token_type_ids).long(), torch.from_numpy(input_mask).long(), y
class Data_Encoder_LM(data.Dataset):
    """Dataset yielding BPE-tokenized (drug, protein, affinity) text triples.

    Unlike Data_Encoder_mol this does not convert tokens to ids: items are
    the space-joined token strings, so masking/encoding can be applied
    downstream (e.g. by a Tokenizer).
    """

    def __init__(self, train_file, tokenizer_config):
        'Initialization'
        # Raw per-line data files.
        with open(train_file['seq'], 'r') as f:
            self.seq = f.readlines()
        with open(train_file["smile"], 'r') as f:
            self.smile = f.readlines()
        with open(train_file["affinity"], 'r') as f:
            self.affinity = f.readlines()
        # Tokenizer settings (kept for interface parity even though
        # __getitem__ only uses the two BPE models).
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        self.dbpe = BPE(codecs.open(tokenizer_config["vocab_pair"]), merges=-1, separator='')
        self.pbpe = BPE(codecs.open(tokenizer_config["vocab_pair_p"]), merges=-1, separator='')

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.smile)

    def __getitem__(self, index):
        'Generates one sample of data'
        drug_tokens = self.dbpe.process_line(self.smile[index].strip()).split()
        prot_tokens = self.pbpe.process_line(self.seq[index].strip()).split()
        affinity = np.float64(self.affinity[index].strip())
        return " ".join(drug_tokens), " ".join(prot_tokens), affinity
class Data_Provide(data.Dataset):
    """Paired dataset of original and masked tokenized sequences (JSONL).

    ``train_file`` and ``mask_file`` are line-aligned JSON-lines files; each
    item is ``(seq, masked_seq, affinity)``.
    """

    def __init__(self, train_file, mask_file, tokenizer):
        'Initialization'
        # One JSON record per line; the mask file mirrors the train file.
        with open(train_file, 'r') as f:
            self.seq = f.readlines()
        with open(mask_file, 'r') as f:
            self.seq_mask = f.readlines()
        self.tokenizer = tokenizer

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.seq)

    def __getitem__(self, index):
        'Generates one sample of data'
        record = json.loads(self.seq[index])
        masked_record = json.loads(self.seq_mask[index])
        affinity = np.float64(record["affinity"])
        return record["seq"], masked_record["seq"], affinity
class Data_Gen(data.Dataset):
    """JSONL dataset yielding a sequence, plus its affinity when present.

    Inference files without an "affinity" field yield only the sequence
    string; labeled files yield ``(seq, affinity)``.
    """

    def __init__(self, train_file):
        'Initialization'
        # One JSON record per line.
        with open(train_file, 'r') as f:
            self.seq = f.readlines()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.seq)

    def __getitem__(self, index):
        'Generates one sample of data'
        record = json.loads(self.seq[index])
        sequence = record["seq"]
        try:
            affinity = np.float64(record["affinity"])
        except KeyError:
            # Unlabeled (inference) record.
            return sequence
        return sequence, affinity
def get_task(task_name):
    """Resolve a task name to its data file(s) and tokenizer configuration.

    Returns, depending on the task family:
      * SPS tasks (train/test/test_ori_*):  ({"sps","smile","affinity"} dict, config)
      * tokenized tasks (*_mol/test_*/case_study):  (tokenize file path, config)
      * masked-LM tasks (pre-train*/test-pre-train*):  (tokenize path, mask path, config)
      * unknown names:  None (implicitly, matching the original behavior)

    The original repeated an identical "mol" tokenizer config a dozen times;
    this version builds each config once and dispatches via lookup tables.
    """
    task = task_name.lower()
    # Config for the raw SPS/SMILES pipeline.
    sps_config = {"vocab_file": './config/vocab.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }
    # Config shared by every pre-tokenized ("mol") task.
    mol_config = {"vocab_file": './config/vocab_mol.txt',
                  "vocab_pair": './config/drug_codes_chembl.txt',
                  "vocab_pair_p": './config/protein_codes_uniprot.txt',
                  "begin_id": '[CLS]',
                  "separate_id": "[SEP]",
                  "max_len": 512
                  }
    # task -> (sps, smile, affinity) file paths.
    sps_tasks = {
        'train': ('./data/train/train_sps', './data/train/train_smile', './data/train/train_ic50'),
        'test': ('./data/test/test_sps', './data/test/test_smile', './data/test/test_ic50'),
        'test_ori_er': ('./data/ER/ER_sps', './data/ER/ER_smile', './data/ER/ER_ic50'),
        'test_ori_gpcr': ('./data/GPCR/GPCR_sps', './data/GPCR/GPCR_smile', './data/GPCR/GPCR_ic50'),
        'test_ori_channel': ('./data/Ion_channel/channel_sps', './data/Ion_channel/channel_smile', './data/Ion_channel/channel_ic50'),
        'test_ori_kinase': ('./data/Tyrosine_kinase/kinase_sps', './data/Tyrosine_kinase/kinase_smile', './data/Tyrosine_kinase/kinase_ic50'),
    }
    if task in sps_tasks:
        sps, smile, affinity = sps_tasks[task]
        return {"sps": sps, "smile": smile, "affinity": affinity}, sps_config
    # task -> pre-tokenized data file.
    tokenize_tasks = {
        'train_mol': "data/tokenize_data/train.tokenize",
        'test_mol': "data/tokenize_data/test.tokenize",
        'test_er': "data/tokenize_data/er.tokenize",
        'test_gpcr': "data/tokenize_data/gpcr.tokenize",
        'test_channel': "data/tokenize_data/channel.tokenize",
        'test_kinase': "data/tokenize_data/kinase.tokenize",
        'case_study': "case_study/spike.tokenize",
    }
    if task in tokenize_tasks:
        return tokenize_tasks[task], mol_config
    # task -> dataset stem for the masked-LM (pre-train style) tasks.
    masked_tasks = {
        'pre-train': 'train',
        'test-pre-train': 'test',
        'test-pre-train-er': 'er',
        'test-pre-train-gpcr': 'gpcr',
        'test-pre-train-channel': 'channel',
        'test-pre-train-kinase': 'kinase',
    }
    if task in masked_tasks:
        tokenize_file = "data/tokenize_data/{}.tokenize".format(masked_tasks[task])
        return tokenize_file, tokenize_file + ".mask", mol_config
    # Unknown task: fall through to None, as the original's if/elif chain did.
    return None
def random_mask(input_seq, mask_proportion=0.15):
    """Randomly mask ~``mask_proportion`` of the tokens in each sequence.

    Parameters
    ----------
    input_seq : list[str]
        Whitespace-delimited token sequences.
    mask_proportion : float
        Fraction of each sequence's tokens (rounded up) selected for masking.

    Returns the split token lists with selected tokens replaced by "[MASK]".
    Each selected position is actually replaced with probability 0.8
    (BERT-style); NOTE(review): the 10% random-token / 10% keep branches of
    the BERT scheme were left commented out in the original and are not
    implemented here either.
    """
    tokenized = [seq.split() for seq in input_seq]
    mask_counts = [math.ceil(len(tokens) * mask_proportion) for tokens in tokenized]
    # FIX: sample positions WITHOUT replacement. The original drew with
    # replacement, so duplicate picks silently masked fewer tokens than the
    # requested proportion.
    mask_positions = [np.random.choice(len(tokens), count, replace=False)
                      for tokens, count in zip(tokenized, mask_counts)]
    for seq_idx, positions in enumerate(mask_positions):
        for pos in positions:
            if random.random() < 0.8:
                tokenized[seq_idx][pos] = "[MASK]"
    return tokenized
class Tokenizer(object):
    """Converts whitespace-tokenized sequences to id tensors and TF-IDF weights."""

    def __init__(self, tokenizer_config):
        self.begin_id = tokenizer_config["begin_id"]
        self.sep_id = tokenizer_config["separate_id"]
        self.max_len = tokenizer_config["max_len"]
        self.vocab = load_vocab(tokenizer_config["vocab_file"])
        # Token list from vocab_mol.txt; a token's id is its line number.
        vocab = './config/vocab_mol.txt'
        with open(vocab, 'r', encoding='utf-8') as f:
            tokens = [token.strip() for token in f.readlines()]
        self.token_to_id = {token: i for i, token in enumerate(tokens)}
        # Corpus-level token frequency table. NOTE(review): usage in
        # calculate_tf_idf suggests it is keyed by token id -- confirm with
        # the pickle's producer.
        token_frequency_file = './config/token_frequency.pickle'
        with open(token_frequency_file, 'rb') as f:
            self.token_frequency = pickle.load(f)
        self.total_tokens = sum(self.token_frequency.values())

    def tokenize_sequence(self, sequence):
        """Map a space-delimited sequence to token ids ([UNK] for unknowns).

        BUG FIX: the original wrapped the split token list in ANOTHER list
        (``tokens = [sequence.split(' ')]``), so the dict lookup received an
        unhashable list and raised TypeError on every call.
        """
        tokens = sequence.split(' ')
        token_ids = [self.token_to_id.get(token, self.token_to_id['[UNK]']) for token in tokens]
        return token_ids

    def seq2emb_encoder_simple(self, input_seq, vocab):
        """Encode tokens to a 1-D numpy id array, substituting [UNK] per miss."""
        all_ids = []
        for token in input_seq:
            try:
                all_ids.append(vocab[token])
            except KeyError:
                all_ids.append(vocab["[UNK]"])
        return np.asarray(all_ids)

    def convert_token_to_ids(self, seq):
        """Encode a batch of space-delimited sequences to padded id tensors.

        Sequences longer than ``max_len`` are truncated with a trailing
        [SEP]; shorter ones are zero-padded up to ``max_len``.

        Returns ``(ids [B, max_len] long tensor, boolean attention mask)``.
        """
        all_seq = [s.split() for s in seq]
        for i, tokens in enumerate(all_seq):
            if len(tokens) > self.max_len:
                # Keep the final [SEP] when truncating.
                all_seq[i] = tokens[:self.max_len - 1] + [self.sep_id]
        all_seq_ids = [torch.from_numpy(self.seq2emb_encoder_simple(tokens, self.vocab)).long()
                       for tokens in all_seq]
        padded_seq_ids = pad_sequence(all_seq_ids, batch_first=True)
        if padded_seq_ids.size(1) < self.max_len:
            pad_block = torch.zeros(padded_seq_ids.size(0), self.max_len - padded_seq_ids.size(1))
            padded_seq_ids = torch.cat([padded_seq_ids, pad_block], dim=1)
        else:
            padded_seq_ids = padded_seq_ids[:, :self.max_len]
        # Mask is True wherever there is a real (non-pad) token id.
        input_mask = (padded_seq_ids != 0)
        return padded_seq_ids.long(), input_mask

    def calculate_tf_idf(self, sequences):
        """Compute a per-token TF-IDF tensor of shape [len(sequences), max_len].

        TF is the within-sequence relative count of a token id; IDF is
        ``log((corpus_frequency + 1) / (in-sequence count + 1))``.
        """
        tf_idfs = []
        for sequence in sequences:
            seq_ids = self.seq2emb_encoder_simple(sequence.split(), self.vocab)
            token_count = Counter(seq_ids)
            token_tf = [token_count[token] / len(seq_ids) for token in seq_ids]
            # BUG FIX: the original iterated ``enumerate(sequence)``, yielding
            # (index, char) tuples, so token_frequency.get() always missed and
            # the list length disagreed with token_tf. Iterate the token ids so
            # the (id-keyed) frequency table is actually consulted.
            token_idf = [np.log((self.token_frequency.get(token, 1) + 1) / (token_count[token] + 1))
                         for token in seq_ids]
            token_tf_idf = [tf * idf for tf, idf in zip(token_tf, token_idf)]
            tf_idfs.append(torch.tensor(token_tf_idf))
        padded_tf_idfs = torch.zeros((len(tf_idfs), self.max_len))
        for i, tf_idf in enumerate(tf_idfs):
            if tf_idf.size(0) < self.max_len:
                padded_tf_idfs[i] = torch.cat([tf_idf, torch.zeros(self.max_len - len(tf_idf))])
            else:
                padded_tf_idfs[i] = tf_idf[:self.max_len]
        return padded_tf_idfs
def collate_fn(batch):
    # NOTE(review): this collate function looks stale relative to
    # Data_Provide.__getitem__, which returns (seq, seq_mask, y): here
    # item[2] (the float affinity) is passed to len(), which would raise
    # TypeError, and `seqs` / `seq_masks` are never populated before the
    # tensor conversion. It appears written for an earlier
    # (seq, freq_score, y) item layout -- confirm before wiring it into a
    # DataLoader.
    # Get the maximum length of freq_score in the batch
    max_len = max(len(item[2]) for item in batch)
    # Initialize empty lists for seq, seq_mask, freq_score, and y
    seqs, seq_masks, freq_scores, ys = [], [], [], []
    # Pad freq_score and concatenate seq and seq_mask for each item in the batch
    for item in batch:
        freq_score = item[1]
        # Right-pad each freq_score to the longest one in the batch.
        freq_score += [0] * (max_len - len(freq_score))
        freq_scores.append(freq_score)
        ys.append(item[2])
    # Convert lists to tensors
    seqs = [torch.tensor(seq) for seq in seqs]
    seq_masks = [torch.tensor(seq_mask) for seq_mask in seq_masks]
    freq_scores = torch.tensor(freq_scores)
    ys = torch.tensor(ys)
    # Return the batch
    return seqs, seq_masks, ys
if __name__ == "__main__":
    # Local smoke tests for the tokenizer / dataset classes above.
    # local test
    # dataFolder = './IC50/SPS/train_smile'
    # with open(dataFolder, 'r') as f:
    #     train_smi = f.readlines()
    # drug_smi = train_smi[0]
    # d_v, input_mask_d = drug2emb_encoder(drug_smi)
    # test load vocab
    # vocab_file = './ESPF/vocab.txt'
    # vocab = load_vocab(vocab_file)
    # test train
    # Disabled pre-train data pipeline walkthrough (kept as a string literal).
    '''
    task = 'pre-train'
    data_file, data_mask, tokenizer_config = get_task(task)
    dataset = Data_Provide(data_file, data_mask)
    tokenizer = Tokenizer(tokenizer_config)
    data_loder_para = {'batch_size': 2,
                       'shuffle': False,
                       'num_workers': 0,
                       }
    data_generator = DataLoader(dataset, **data_loder_para)
    all_len = []
    m = 0
    for i, (seq, seq_mask, affinity) in enumerate(tqdm(data_generator)):
        input_random_mask, attention_mask = tokenizer.convert_token_to_ids(seq_mask)
        label, _ = tokenizer.convert_token_to_ids(seq)
        posi = torch.where(input_random_mask == 1)
        target = label[posi]
        a = input_random_mask == 4
        if torch.sum(a) > 2:
            print(torch.sum(a))
    '''
    # test for tokenizer and count frequency
    # sequence = '[CLS] CC1=CC=C (O 1)C2=N C(=CC(=N [SEP] MP VRRG H VAP QN'
    # Sample pre-tokenized sequence (each list element is one token).
    sequence = ['[CLS]', ')cn1', '(O', '1)C2=N', 'C(=CC(=N', '[SEP]', 'MP', 'VRRG', 'H', 'VAP', 'MP', 'VRRG', 'QN']
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    # Each element of `sequence` is treated as its own one-token "sequence".
    tf_idf_values = tokenizer.calculate_tf_idf(sequence)
    print(tf_idf_values)
# for i, token in enumerate(sequence.split()):
# print(f"Token: {token}, TF-IDF value: {tf_idf_values[i]}")
# task = 'pre-train'
# data_file, data_mask, tokenizer_config = get_task(task)
# tokenizer = Tokenizer(tokenizer_config)
# dataset = Data_Provide(data_file, data_mask, tokenizer)
# data_loder_para = {'batch_size': 2,
# 'shuffle': False,
# 'num_workers': 0,
# }
# data_generator = DataLoader(dataset, **data_loder_para)
# for idx, inputs in enumerate(data_generator):
# x,y1,y2 = inputs
# print(f"Batch {idx}: Inputs shape: {x.dtype}")
# print(f"Batch {idx}: Targets shape: {y1.dtype}")
# # print(f"Batch {idx}: Targets shape: {fre.dtype}")
# print(f"Batch {idx}: Targets shape: {y2.dtype}")
# # print(f"Batch {idx}: Targets shape: {fre.shape}")
# if idx == 2:
# break
import numpy as np
import re
def eval_result(pred, label):
    """Return (RMSE, Pearson correlation matrix) for two score lists.

    ``pred`` and ``label`` are equal-length sequences of floats; the second
    return value is the full 2x2 np.corrcoef matrix (r is at [0, 1]).
    """
    pred_arr = np.array(pred)
    label_arr = np.array(label)
    residual = pred_arr - label_arr
    mean_sq_err = np.sum(np.power(residual, 2)) / len(pred_arr)
    root_mse = np.sqrt(mean_sq_err)
    pearson_co = np.corrcoef(pred_arr, label_arr)
    return root_mse, pearson_co
def eval(pred_path, label_path):
    """Score a prediction file against a label file and persist the summary.

    Both files contain one float per line. Writes an 'eval_results' file in
    the prediction file's directory and echoes the same summary to stdout.

    NOTE(review): shadows the builtin ``eval``; name kept for caller
    compatibility.
    """
    def _read_floats(path):
        # One float per line; whitespace-tolerant.
        with open(path, 'r') as fh:
            return [float(line.strip()) for line in fh]

    pred = _read_floats(pred_path)
    label = _read_floats(label_path)
    rmse, corr = eval_result(pred, label)
    r = corr[0, 1]
    # Save next to the prediction file, swapping the filename for 'eval_results'.
    filename = pred_path.split("/")[-1]
    save_path = pred_path.replace(filename, 'eval_results')
    summary = 'RMSE : {} ; Pearson Correlation Coefficient : {}'.format(rmse, r)
    with open(save_path, 'w') as fh:
        fh.write(summary)
    print(summary)
if __name__ == '__main__':
    # Evaluate saved prediction files against ground-truth IC50 labels for the
    # main test split plus the four protein-family splits.
    # with open('pre_test.sh', 'r') as f:
    #     pred_dir = f.readline()
    # pred_dir = pred_dir.split()[5].split('/')[-1]
    # pred_result = './predict/{}/test.txt'.format(pred_dir)
    # pred_result = './predict/add_pretrain_1019-s-329480_v2/test_mol.txt'
    # pred_result = './predict/add_pretrain_1019-s-329480-er/test_mol.txt'
    # eval single file
    # pred_file = "./predict/without-pre-train-layer-6-1021-s-988440-test/test_mol.txt"
    # test_label_path = './data/test/test_ic50'
    # eval(pred_file, test_label_path)
    # eval all
    # Ground-truth label files, one per evaluation split.
    test_label_path = './data/test/test_ic50'
    test_label_path_ER = './data/ER/ER_ic50'
    test_label_path_GPCR = './data/GPCR/GPCR_ic50'
    test_label_path_Ion_channel = './data/Ion_channel/channel_ic50'
    test_label_path_Tyrosine_kinase = './data/Tyrosine_kinase/kinase_ic50'
    # test mol
    # pred_test = "./predict/without-pre-train-layer-6-1021-s-988440-test/test_mol.txt"
    # er = "./predict/without-pre-train-layer-6-1021-s-988440-er/test_er.txt"
    # gpcr = "./predict/without-pre-train-layer-6-1021-s-988440-gpcr/test_gpcr.txt"
    # channel = "./predict/without-pre-train-layer-6-1021-s-988440-channel/test_channel.txt"
    # kinase = "./predict/without-pre-train-layer-6-1021-s-988440-kinase/test_kinase.txt"
    # test
    # pred_test = "predict/train_ori_1217-s-296532/test.txt"
    # er = "predict/train_ori_1217-s-296532/test_ori_er.txt"
    # gpcr = "predict/train_ori_1217-s-296532/test_ori_gpcr.txt"
    # channel = "predict/train_ori_1217-s-296532/test_ori_channel.txt"
    # kinase = "predict/train_ori_1217-s-296532/test_ori_kinase.txt"
    # deepdta
    # pred_test = "baselines/DeepDTA/source/output/test/results.txt"
    # er = "baselines/DeepDTA/source/output/ER/results.txt"
    # gpcr = "baselines/DeepDTA/source/output/GPCR/results.txt"
    # channel = "baselines/DeepDTA/source/output/Ion_channel/results.txt"
    # kinase = "baselines/DeepDTA/source/output/Tyrosine_kinase/results.txt"
    # attentiondta
    # pred_test = "baselines/AttentionDTA_BIBM/results/test/test.txt"
    # er = "baselines/AttentionDTA_BIBM/results/ER/test.txt"
    # gpcr = "baselines/AttentionDTA_BIBM/results/GPCR/test.txt"
    # channel = "baselines/AttentionDTA_BIBM/results/channel/test.txt"
    # kinase = "baselines/AttentionDTA_BIBM/results/kinase/test.txt"
    # test_mol test_2
    # pred_test = "predict/pre-train-layer-6-1021/test_mol.txt"
    # er = "predict/pre-train-layer-6-1021/test_er.txt"
    # gpcr = "predict/pre-train-layer-6-1021/test_gpcr.txt"
    # channel = "predict/pre-train-layer-6-1021/test_channel.txt"
    # kinase = "predict/pre-train-layer-6-1021/test_kinase.txt"
    # frequency embedding /notebook/our_model-new/predict/pre-train-layer-6-1021-freq
    # Prediction files for the currently selected model variant.
    pred_test = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_mol.txt"
    er = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_er.txt"
    gpcr = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_gpcr.txt"
    channel = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_channel.txt"
    kinase = "/notebook/our_model-new/predict/pre-train-layer-6-1021-freq/test_kinase.txt"
    pred_list = [pred_test, er, gpcr, channel, kinase]
    label_list = [test_label_path, test_label_path_ER, test_label_path_GPCR, test_label_path_Ion_channel, test_label_path_Tyrosine_kinase]
    # Score each prediction file against its matching label file.
    for i, j in zip(pred_list, label_list):
        print(i)
        eval(i, j)
# Launch drug-target interaction training on GPU 1, initialized from a
# pre-trained checkpoint.
# NOTE(review): `-batch_size` uses a single dash while every other long option
# uses `--`; confirm run_interaction.py's argument parser accepts this spelling.
# NOTE(review): batch_size=4 but the savedir name says "batch-64" -- verify
# which value is intended.
CUDA_VISIBLE_DEVICES=1 python run_interaction.py \
-batch_size=4 --task=train_mol --epochs=30 --lr=1e-5 \
--savedir=lr-1e-5-batch-64-e-30-layer6-1125-new \
--config=./config/config_layer_6_mol.json \
--output='./predict/test_new' \
--pre_train=True \
--init='./saved_model/train/epoch-23-step-790752-loss-0.12734022736549377.pth'
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data_train_file_1 = '/notebook/our_model/data/interaction/dataset/BindingDB/train.csv'\n",
"data_train_file_2 = '/notebook/our_model/data/interaction/dataset/DAVIS/train.csv'\n",
"data_train_file_3 = '/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train.csv'\n",
"\n",
"data_val_file_1 = '/notebook/our_model/data/interaction/dataset/BindingDB/val.csv'\n",
"data_val_file_2 = '/notebook/our_model/data/interaction/dataset/DAVIS/val.csv'\n",
"data_val_file_3 = '/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/val.csv'\n",
"\n",
"data_test_file_1 = '/notebook/our_model/data/interaction/dataset/BindingDB/test.csv'\n",
"data_test_file_2 = '/notebook/our_model/data/interaction/dataset/DAVIS/test.csv'\n",
"data_test_file_3 = '/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(data_train_file_3)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/train/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(data_val_file_3)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/validate/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/validate/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/validate/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(data_test_file_3)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/full_data/test/label',header=None,index=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# unseen protein"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"up_train_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train.csv'\n",
"up_val_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/val.csv'\n",
"up_test_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(up_train_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/train/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(up_val_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/validate/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/validate/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/validate/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(up_test_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_protein/test/label',header=None,index=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# unseen drug"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"ud_train_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train.csv'\n",
"ud_val_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/val.csv'\n",
"ud_test_file = '/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test.csv'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(ud_train_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/train/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(ud_val_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/validate/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/validate/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/validate/label',header=None,index=None)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(ud_test_file)\n",
"data['Target Sequence'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test/protein',header=None,index=None)\n",
"data['SMILES'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test/smile',header=None,index=None)\n",
"data['Label'].to_csv('/notebook/our_model/data/interaction/dataset/BIOSNAP/unseen_drug/test/label',header=None,index=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 测试词汇分割效率"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from subword_nmt.apply_bpe import BPE\n",
"import codecs\n",
"import json\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import math\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def get_tokenzie_seq(file, save, mask=False):\n",
" begin_token = '[CLS]'\n",
" separate_token = \"[SEP]\"\n",
" with open(file['seq'], 'r') as f:\n",
" seq = f.readlines()\n",
" with open(file[\"smile\"], 'r') as f:\n",
" smile = f.readlines()\n",
" with open(file[\"affinity\"], 'r') as f:\n",
" affinity = f.readlines()\n",
" \n",
" bpe_codes_drug = codecs.open('./config/drug_codes_chembl.txt')\n",
" dbpe = BPE(bpe_codes_drug, merges=-1, separator='')\n",
" bpe_codes_prot = codecs.open('./config/protein_codes_uniprot.txt')\n",
" pbpe = BPE(bpe_codes_prot, merges=-1, separator='')\n",
"\n",
" with open(save, \"w\") as f:\n",
" for i in tqdm(range(len(seq))):\n",
" d = dbpe.process_line(smile[i].strip()).split()\n",
" p = pbpe.process_line(seq[i].strip()).split()\n",
" if mask == True:\n",
" d = random_mask(d)\n",
" p = random_mask(p)\n",
" final_seq = [begin_token] + d + [separate_token] + p + [separate_token]\n",
" affinity_num = affinity[i].strip()\n",
" item = {\n",
" \"seq\": \" \".join(final_seq),\n",
" \"affinity\": affinity_num\n",
" }\n",
" new_item = json.dumps(item)\n",
" f.write(new_item + '\\n')\n",
"\n",
"def get_tokenzie_seq_case(file, save, mask=False):\n",
" begin_token = '[CLS]'\n",
" separate_token = \"[SEP]\"\n",
" with open(file['seq'], 'r') as f:\n",
" seq = f.readlines()\n",
" seq = [i.strip() for i in seq]\n",
" seq = \"\".join(seq)\n",
" with open(file[\"smile\"], 'r') as f:\n",
" smile = f.readlines()\n",
" # with open(file[\"affinity\"], 'r') as f:\n",
" # affinity = f.readlines()\n",
"\n",
" bpe_codes_drug = codecs.open('./config/drug_codes_chembl.txt')\n",
" dbpe = BPE(bpe_codes_drug, merges=-1, separator='')\n",
" bpe_codes_prot = codecs.open('./config/protein_codes_uniprot.txt')\n",
" pbpe = BPE(bpe_codes_prot, merges=-1, separator='')\n",
"\n",
" with open(save, \"w\") as f:\n",
" for i in tqdm(range(len(smile))):\n",
" d = dbpe.process_line(smile[i].strip()).split()\n",
" p = pbpe.process_line(seq).split()\n",
" if mask == True:\n",
" d = random_mask(d)\n",
" p = random_mask(p)\n",
" final_seq = [begin_token] + d + [separate_token] + p + [separate_token]\n",
" # affinity_num = affinity[i].strip()\n",
" item = {\n",
" \"seq\": \" \".join(final_seq),\n",
" # \"affinity\": affinity_num\n",
" }\n",
" new_item = json.dumps(item)\n",
" f.write(new_item + '\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.12 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
from yaml import load
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
def load_embedding(data_file):
    """Return per-sample drug and protein embedding matrices for ``data_file``.

    For every sample, the BERT embedding-layer output is split at the first
    [SEP] token: tokens between [CLS] and that [SEP] are the drug span, and
    tokens after it (up to the final token) are the protein span.

    Args:
        data_file: path understood by the project's ``Data_Gen`` dataset class.

    Returns:
        (all_drug, all_protein): two lists of numpy arrays of shape
        (num_tokens, hidden_size), one entry per sample, in dataset order.
    """
    tokenizer_config = {"vocab_file": './config/vocab_mol.txt',
                        "vocab_pair": './config/drug_codes_chembl.txt',
                        "vocab_pair_p": './config/protein_codes_uniprot.txt',
                        "begin_id": '[CLS]',
                        "separate_id": "[SEP]",
                        "max_len": 512
                        }
    tokenizer = Tokenizer(tokenizer_config)
    sep_id = 3  # vocabulary id of the [SEP] token
    dataset = Data_Gen(data_file)
    data_generator = DataLoader(dataset, batch_size=1, shuffle=False)
    config = BertConfig.from_pretrained('./config/config_layer_6_mol.json')
    model = BertAffinityModel(config)
    model.load_state_dict(torch.load('./model/add_pretrain_1019/epoch-9-step-329480-loss-0.736057146887367.pth'), strict=True)
    # Fix: switch to eval mode so the embedding-layer dropout is disabled --
    # otherwise the extracted embeddings are randomly perturbed on every call.
    model.eval()
    all_drug = []
    all_protein = []
    with torch.no_grad():  # inference only; avoid building autograd graphs
        for i, (sample, affinity) in enumerate(data_generator):
            input_ids, attention_mask = tokenizer.convert_token_to_ids(sample)
            input_embs = model.embeddings(input_ids)
            # Index of the first [SEP] in the sequence (the trailing [SEP] is
            # excluded by the [:, :-1] slice).
            # NOTE(review): assumes exactly one [SEP] occurs in that slice;
            # multiple matches would break the slicing below -- confirm.
            sep_index = torch.where(input_ids[:, :-1] == sep_id)[-1]
            drug_emb = input_embs[:, 1:sep_index].squeeze(0).numpy()
            protein_embs = input_embs[:, sep_index + 1:-1].squeeze(0).numpy()
            all_drug.append(drug_emb)
            all_protein.append(protein_embs)
    return all_drug, all_protein
def plot_drug_protein(save):
    """Save a t-SNE scatter contrasting drug-token and protein-token embeddings."""
    drug_embs, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    drug_points = np.concatenate(drug_embs)
    # Truncate the protein cloud to the same size as the drug cloud so the
    # two classes are balanced in the plot.
    protein_points = np.concatenate(protein_embs)[:len(drug_points)]
    points = np.concatenate((drug_points, protein_points))
    labels = np.array([0] * len(drug_points) + [1] * len(protein_points))
    # Reduce to 2-D with t-SNE (fixed seed for reproducibility).
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(points)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[labels == 0, 0], embedded[labels == 0, 1], c="darkcyan", s=5, label="Drug", marker='^')
    plt.scatter(embedded[labels == 1, 0], embedded[labels == 1, 1], c="deepskyblue", s=5, label="Protein", marker="s")
    plt.legend(labels=["Drug", "Protein"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_protein_sub(save):
    """Save a t-SNE scatter of token embeddings for three sample proteins."""
    _, protein_embs = load_embedding("add_figure/sample_data/test_sample")
    groups = protein_embs[:3]  # PTPH1, mGluRs, EZH2
    labels = np.array([g_idx for g_idx, grp in enumerate(groups) for _ in range(len(grp))])
    points = np.concatenate(groups)
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(points)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[labels == 0, 0], embedded[labels == 0, 1], c="darkcyan", s=5, label="PTPH1", marker='^')
    plt.scatter(embedded[labels == 1, 0], embedded[labels == 1, 1], c="deepskyblue", s=5, label="mGluRs", marker="s")
    plt.scatter(embedded[labels == 2, 0], embedded[labels == 2, 1], c="salmon", s=5, label="EZH2")
    plt.legend(labels=["PTPH1", "mGluRs", "EZH2"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
def plot_drug_sub(save):
    """Save a t-SNE scatter of token embeddings for three sample drugs."""
    drug_embs, _ = load_embedding("add_figure/sample_data/test_sample")
    groups = drug_embs[:3]
    labels = np.array([g_idx for g_idx, grp in enumerate(groups) for _ in range(len(grp))])
    points = np.concatenate(groups)
    embedded = manifold.TSNE(n_components=2, init='random', random_state=5, verbose=1).fit_transform(points)
    fig, ax = plt.subplots(dpi=600)
    plt.axis("off")
    plt.scatter(embedded[labels == 0, 0], embedded[labels == 0, 1], c="darkcyan", s=5, label="Drug_1", marker='^')
    plt.scatter(embedded[labels == 1, 0], embedded[labels == 1, 1], c="deepskyblue", s=5, label="Drug_2", marker="s")
    plt.scatter(embedded[labels == 2, 0], embedded[labels == 2, 1], c="salmon", s=5, label="Drug_3")
    plt.legend(labels=["Drug_1", "Drug_2", "Drug_3"], loc=1)
    plt.savefig(save, dpi=fig.dpi, pad_inches=0, bbox_inches="tight")
if __name__ == '__main__':
    # Generate the drug-vs-protein embedding t-SNE figure; uncomment below to
    # produce the per-drug / per-protein variants instead.
    plot_drug_protein("drug_and_protein_sub")
    # plot_drug_sub("three_drug_sub")
    # plot_protein_sub("three_protein_sub")
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model. """
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from transformers.activations import ACT2FN
from transformers.file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from transformers.modeling_utils import (
PreTrainedModel,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from transformers.utils import logging
from configuration_bert import BertConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "bert-base-uncased"
_CONFIG_FOR_DOC = "BertConfig"
_TOKENIZER_FOR_DOC = "BertTokenizer"
# Checkpoint identifiers published by the upstream HuggingFace BERT code.
# NOTE(review): this list is not referenced anywhere in the visible code;
# presumably kept for parity with the original modeling_bert.py.
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bert-base-uncased",
    "bert-large-uncased",
    "bert-base-cased",
    "bert-large-cased",
    "bert-base-multilingual-uncased",
    "bert-base-multilingual-cased",
    "bert-base-chinese",
    "bert-base-german-cased",
    "bert-large-uncased-whole-word-masking",
    "bert-large-cased-whole-word-masking",
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    "bert-large-cased-whole-word-masking-finetuned-squad",
    "bert-base-cased-finetuned-mrpc",
    "bert-base-german-dbmdz-cased",
    "bert-base-german-dbmdz-uncased",
    "cl-tohoku/bert-base-japanese",
    "cl-tohoku/bert-base-japanese-whole-word-masking",
    "cl-tohoku/bert-base-japanese-char",
    "cl-tohoku/bert-base-japanese-char-whole-word-masking",
    "TurkuNLP/bert-base-finnish-cased-v1",
    "TurkuNLP/bert-base-finnish-uncased-v1",
    "wietsedv/bert-base-dutch-cased",
    # See all BERT models at https://huggingface.co/models?filter=bert
]
def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model.

    Maps TensorFlow variable names (slash-separated scopes) onto the PyTorch
    module tree by attribute lookup, transposing dense-layer kernels, and
    copies the values in place. Returns the same ``model`` instance.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            # Scope names like "layer_3" carry an index into a ModuleList.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            # TF naming -> PyTorch attribute naming for common variable kinds.
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # NOTE(review): this `continue` only skips the current scope
                    # segment (inner loop), not the whole variable -- upstream
                    # HuggingFace behavior; confirm this is intended.
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            # TF stores dense kernels as (in, out); PyTorch expects (out, in).
            array = np.transpose(array)
        try:
            assert (
                pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
class BertEmbeddings(nn.Module):
    """Sum word, position and token-type embeddings, then LayerNorm + dropout."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # The attribute is named "LayerNorm" (not snake_case) so TensorFlow
        # checkpoints can be loaded without any variable-name remapping.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Registered as a buffer so position ids are serialized with the module
        # and follow it across devices.
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Build the combined input embeddings from ids or precomputed embeds."""
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            input_shape = input_ids.size()
        seq_length = input_shape[1]

        # Default position ids start after any cached (past) key/values.
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
        # Default token types are all-zero (single-segment input).
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            embeddings = embeddings + self.position_embeddings(position_ids)
        return self.dropout(self.LayerNorm(embeddings))
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Supports plain self-attention, cross-attention over encoder states, cached
    past key/value states for decoding, and the absolute / relative_key /
    relative_key_query position-embedding variants.
    """

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            # One learned embedding per possible (query - key) distance.
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x):
        # (batch, seq, all_head_size) -> (batch, heads, seq, head_size)
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Return (context, [attention_probs], [past_key_value]) depending on flags."""
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            # Add learned relative-position terms to the raw scores.
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
class BertSelfOutput(nn.Module):
    """Project attention output, then residual-add the input and LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertAttention(nn.Module):
    """Attention sub-layer: BertSelfAttention followed by BertSelfOutput."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given heads and shrink the q/k/v and output projections."""
        if not heads:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Drop the pruned rows/columns from the linear layers.
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Record the reduced head count and remember which heads are gone.
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        # Prepend the processed output; propagate attentions/cache entries if present.
        return (attention_output,) + self_outputs[1:]
class BertIntermediate(nn.Module):
    """Feed-forward expansion: hidden_size -> intermediate_size with activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # A string selects a registered activation; a callable is used as-is.
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        return self.intermediate_act_fn(self.dense(hidden_states))
class BertOutput(nn.Module):
    """Feed-forward contraction back to hidden_size, residual-add + LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
class BertLayer(nn.Module):
    """A single transformer block: self-attention, optional cross-attention
    (decoder configurations only), and a chunked feed-forward sub-layer."""
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Feed-forward chunking is applied along the sequence dimension (dim 1).
        self.seq_len_dim = 1
        self.attention = BertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
            # Only instantiated when cross-attention is requested; forward asserts on it.
            self.crossattention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run self-attention, optional cross-attention, then the feed-forward
        sub-layer. Returns ``(layer_output, <attention weights...>)`` and, for
        decoder configs, appends the present key/value cache as the last item.
        """
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            # NOTE: `present_key_value` is only bound on this branch; the later
            # uses at the bottom of forward are likewise guarded by `self.is_decoder`.
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            assert hasattr(
                self, "crossattention"
            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value
        # Apply the feed-forward sub-layer in sequence-dimension chunks to bound memory.
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)
        return outputs
    def feed_forward_chunk(self, attention_output):
        # One chunk of the intermediate -> output feed-forward computation.
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` :class:`BertLayer` modules.

    Handles optional gradient checkpointing, per-layer head masks, cached
    key/values for decoding, and collecting hidden states / attention maps
    when requested.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Run every layer in sequence over ``hidden_states``.

        Returns a :class:`BaseModelOutputWithPastAndCrossAttentions` when
        ``return_dict`` is true, otherwise a tuple of the non-None values among
        (last hidden state, decoder cache, all hidden states, self-attentions,
        cross-attentions).
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the input to this layer (embeddings for i == 0).
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None
            if getattr(self.config, "gradient_checkpointing", False) and self.training:
                if use_cache:
                    # Fix: `logger.warn` is a deprecated alias in the stdlib
                    # logging module; use `logger.warning` instead.
                    logger.warning(
                        "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                        "`use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # Close over the non-tensor arguments; checkpoint only
                        # forwards tensors through its argument list.
                        return module(*inputs, past_key_value, output_attentions)
                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            hidden_states = layer_outputs[0]
            if use_cache:
                # For decoder layers the cache tuple is always the last output.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
        if output_hidden_states:
            # Also record the final layer's output.
            all_hidden_states = all_hidden_states + (hidden_states,)
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPooler(nn.Module):
    """Pools a sequence into one vector: dense + tanh over the first token."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # Pooling uses only the hidden state of the first token of each sequence.
        cls_state = hidden_states[:, 0]
        return self.activation(self.dense(cls_state))
class BertPredictionHeadTransform(nn.Module):
    """Dense + activation + layer norm applied before the LM decoder projection."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # A string activation name is resolved through ACT2FN; callables are used as-is.
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        transformed = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(transformed)
class BertLMPredictionHead(nn.Module):
    """Transform + vocabulary projection used for masked/causal LM prediction."""

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)
        # The decoder weight is tied to the input embeddings elsewhere; only the
        # per-token output bias is an independent parameter of this head.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Link the two so the bias is correctly resized with `resize_token_embeddings`.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        return self.decoder(self.transform(hidden_states))
class BertOnlyMLMHead(nn.Module):
    """MLM-only head: a thin wrapper around a single BertLMPredictionHead."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        return self.predictions(sequence_output)
class BertOnlyNSPHead(nn.Module):
    """Next-sentence-prediction head: a binary classifier over the pooled output."""

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        return self.seq_relationship(pooled_output)
class BertPreTrainingHeads(nn.Module):
    """Joint pretraining heads: MLM token predictions plus NSP classification."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        lm_scores = self.predictions(sequence_output)
        nsp_scores = self.seq_relationship(pooled_output)
        return lm_scores, nsp_scores
class BertPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BertConfig
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights of a single submodule in place."""
        if isinstance(module, nn.LayerNorm):
            # LayerNorm starts out as the identity transform.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version, which uses truncated_normal
            # for initialization; cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
        elif isinstance(module, nn.Embedding) and module.padding_idx is not None:
            # Keep the padding embedding at exactly zero.
            module.weight.data[module.padding_idx].zero_()
# Structured output container returned by `BertForPreTraining.forward` when `return_dict` is true.
@dataclass
class BertForPreTrainingOutput(ModelOutput):
    """
    Output type of :class:`~transformers.BertForPreTraining`.
    Args:
        loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
    # `loss` is populated only when both `labels` and `next_sentence_label` are given to forward.
    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
BERT_START_DOCSTRING = r"""
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
general usage and behavior.
Parameters:
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
"""
BERT_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""
@add_start_docstrings(
    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
    BERT_START_DOCSTRING,
)
class BertModel(BertPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
    set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder`
    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
    input to the forward pass.
    """
    def __init__(self, config, add_pooling_layer=True):
        # `add_pooling_layer=False` lets token-level heads (e.g. BertForMaskedLM) skip the unused pooler.
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config) if add_pooling_layer else None
        self.init_weights()
    def get_input_embeddings(self):
        # The word-embedding table doubles as the model's input embedding matrix.
        return self.embeddings.word_embeddings
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=BaseModelOutputWithPoolingAndCrossAttentions,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Key/value caching is only meaningful for decoder configurations.
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        # past_key_values_length
        # Number of previously cached positions, read from the key tensor's sequence dim.
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        if attention_mask is None:
            # Default mask covers both cached and current positions.
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
        if not return_dict:
            # Tuple form: (sequence_output, pooled_output, <extras from the encoder>).
            return (sequence_output, pooled_output) + encoder_outputs[1:]
        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
@add_start_docstrings(
    """
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """,
    BERT_START_DOCSTRING,
)
class BertForPreTraining(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config)
        self.init_weights()
    def get_output_embeddings(self):
        # The MLM decoder projection acts as the output embedding matrix.
        return self.cls.predictions.decoder
    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertForPreTraining
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.prediction_logits
            >>> seq_relationship_logits = outputs.seq_relationship_logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
        total_loss = None
        # The combined loss is only computed when BOTH label tensors are supplied.
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
        if not return_dict:
            # Tuple form: optional loss, MLM logits, NSP logits, then any extra encoder outputs.
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output
        return BertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING
)
class BertLMHeadModel(BertPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
    def __init__(self, config):
        super().__init__(config)
        if not config.is_decoder:
            logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`")
        # No pooler: this head only consumes per-token hidden states.
        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)
        self.init_weights()
    def get_output_embeddings(self):
        return self.cls.predictions.decoder
    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
            ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> config.is_decoder = True
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Caching is pointless (and disabled) when training with labels.
        if labels is not None:
            use_cache = False
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)
        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output
        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
        """Build the model inputs for one generation step (used by `generate`)."""
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)
        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]
        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past}
    def _reorder_cache(self, past, beam_idx):
        # Reorder each layer's cached tensors along the batch dim to follow beam search.
        reordered_past = ()
        for layer_past in past:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
def __init__(self, config):
super().__init__(config)
if config.is_decoder:
logger.warning(
"If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.bert = BertModel(config, add_pooling_layer=False)
self.cls = BertOnlyMLMHead(config)
self.init_weights()
def get_output_embeddings(self):
return self.cls.predictions.decoder
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# @add_code_sample_docstrings(
# tokenizer_class=_TOKENIZER_FOR_DOC,
# checkpoint=_CHECKPOINT_FOR_DOC,
# output_type=MaskedLMOutput,
# config_class=_CONFIG_FOR_DOC,
# )
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
    """Append one dummy PAD token (masked out) to every sequence for generation.

    Returns:
        dict with the extended ``input_ids`` and ``attention_mask``.
    """
    assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
    batch = input_ids.shape[0]
    # A zero column keeps the dummy token out of attention.
    zero_column = attention_mask.new_zeros((attention_mask.shape[0], 1))
    extended_mask = torch.cat([attention_mask, zero_column], dim=-1)
    # One PAD id per sequence, appended at the end.
    pad_column = torch.full(
        (batch, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
    )
    extended_ids = torch.cat([input_ids, pad_column], dim=1)
    return {"input_ids": extended_ids, "attention_mask": extended_mask}
@add_start_docstrings(
    """Bert Model with a `next sentence prediction (classification)` head on top. """,
    BERT_START_DOCSTRING,
)
class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # Full BERT backbone (with pooler) plus the binary NSP head.
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        Returns:
        Example::
            >>> from transformers import BertTokenizer, BertForNextSentencePrediction
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
            >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
            >>> logits = outputs.logits
            >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
        """
        # Backwards compatibility: map the deprecated `next_sentence_label`
        # kwarg onto `labels` (with a warning).
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # outputs[1]: pooled [CLS] representation of shape (batch, hidden).
        pooled_output = outputs[1]
        # (batch, 2) logits: is-next vs. random-next.
        seq_relationship_scores = self.cls(pooled_output)
        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
        if not return_dict:
            # Legacy tuple output: (loss?, logits, *extras).
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Backbone encoder, dropout on the pooled [CLS] vector, then a
        # single linear classification/regression layer.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=SequenceClassifierOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Pooled [CLS] vector -> dropout -> linear head.
        logits = self.classifier(self.dropout(encoder_out[1]))
        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # Single output unit: treat as regression.
                loss = MSELoss()(logits.view(-1), labels.view(-1))
            else:
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )
        # Legacy tuple output: (loss?, logits, *extras).
        tail = (logits,) + encoder_out[2:]
        return tail if loss is None else ((loss,) + tail)
@add_start_docstrings(
    """
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForMultipleChoice(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # One scalar score per choice; softmax over choices happens in the loss.
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=MultipleChoiceModelOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Inputs arrive as (batch, num_choices, seq_len); flatten the choice
        # dimension into the batch so a single encoder pass scores all choices.
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # outputs[1]: pooled [CLS] vector per (batch * num_choices) row.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Un-flatten back to (batch, num_choices) so CE runs over choices.
        reshaped_logits = logits.view(-1, num_choices)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
        if not return_dict:
            # Legacy tuple output: (loss?, logits, *extras).
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BERT_START_DOCSTRING,
)
class BertForTokenClassification(BertPreTrainedModel):
    # The pooler is not used for token-level tasks, so ignore its weights
    # when loading checkpoints that contain one.
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=TokenClassifierOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Per-token hidden states -> dropout -> per-token label logits.
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                # Replace labels at padded positions with ignore_index (-100)
                # so they do not contribute to the cross-entropy.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            # Legacy tuple output: (loss?, logits, *extras).
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    BERT_START_DOCSTRING,
)
class BertForQuestionAnswering(BertPreTrainedModel):
    # The pooler is not used for span prediction; ignore its weights on load.
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        # TF-IDF embedding layer. NOTE(review): defined here but never used in
        # forward(); it also requires a non-standard `config.max_len` field —
        # confirm whether this layer is still needed.
        self.tfidf_emb = nn.Embedding(config.max_len, config.hidden_size)
        self.init_weights()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # @add_code_sample_docstrings(
    #     tokenizer_class=_TOKENIZER_FOR_DOC,
    #     checkpoint=_CHECKPOINT_FOR_DOC,
    #     output_type=QuestionAnsweringModelOutput,
    #     config_class=_CONFIG_FOR_DOC,
    # )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        # Two logits per token: split the last dim into start/end scores.
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            # Total loss is the mean of the start- and end-position losses.
            total_loss = (start_loss + end_loss) / 2
        if not return_dict:
            # Legacy tuple output: (loss?, start_logits, end_logits, *extras).
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class Multilayer_perceptron(nn.Module):
    """Regression head mapping an encoder hidden vector to a single scalar.

    Despite the name, this is currently a single linear layer
    (hidden_size -> 1); the deeper variants were removed as dead
    commented-out code. The attribute name ``layer_1`` is kept so existing
    checkpoints still load.
    """

    def __init__(self, config):
        """Args: config: object exposing ``hidden_size``."""
        super(Multilayer_perceptron, self).__init__()
        self.layer_1 = nn.Linear(config.hidden_size, 1)

    def forward(self, bert_output):
        """Project ``bert_output`` of shape (..., hidden_size) to (..., 1)."""
        return self.layer_1(bert_output)
class BertAffinityModel(BertPreTrainedModel):
    """
    BERT encoder with a scalar regression head: the final hidden state of the
    first ([CLS]) token is fed through a small MLP (``Multilayer_perceptron``)
    to predict one affinity value per sequence.

    Token embeddings can optionally be augmented with a learned TF-IDF
    embedding looked up from ``tfidf_values``. The encoder can behave as a
    decoder (cross-attention) when the config has :obj:`is_decoder` set,
    mirroring the stock :class:`BertModel` wiring.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.mlp = Multilayer_perceptron(config)
        # NOTE(review): lm_head is unused in this forward; kept so existing
        # checkpoints (state_dict keys) remain loadable.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        # TF-IDF embedding table. NOTE(review): sized by vocab_size here but
        # by max_len in BertAffinityModel_MaskLM — confirm which is intended.
        self.tfidf_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        tfidf_values=None,
    ):
        r"""
        tfidf_values (:obj:`torch.LongTensor`, `optional`):
            Indices into the TF-IDF embedding table; when given, the looked-up
            embeddings are added elementwise to the token embeddings
            (assumes shape (batch_size, sequence_length) — TODO confirm).

        Returns:
            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, 1)` with the
            predicted affinity per sequence.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # Broadcast the 2D/3D attention mask to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
        # Cross-attention mask, only relevant in decoder mode.
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # head_mask: [num_heads] or [num_hidden_layers x num_heads] expanded
        # to [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        # Add TF-IDF embeddings only when provided; previously a None value
        # crashed inside nn.Embedding because the lookup was unconditional.
        if tfidf_values is not None:
            embedding_output = embedding_output + self.tfidf_emb(tfidf_values)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        # First-token ([CLS]) representation -> scalar affinity. Calling the
        # module (not .forward) so registered hooks run.
        cls_state = sequence_output[:, 0, :]
        pred_affinity = self.mlp(cls_state)
        return pred_affinity
class BertAffinityModel_MaskLM(BertPreTrainedModel):
    """
    BERT encoder with a language-modeling head: per-token hidden states are
    projected through ``lm_head`` to vocabulary logits (used for masked-LM
    style pre-training of the affinity model).

    Token embeddings can optionally be augmented with a learned TF-IDF
    embedding looked up from ``tfidf_values``. The encoder can behave as a
    decoder (cross-attention) when the config has :obj:`is_decoder` set,
    mirroring the stock :class:`BertModel` wiring.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        # NOTE(review): mlp is unused in this forward; kept so existing
        # checkpoints (state_dict keys) remain loadable.
        self.mlp = Multilayer_perceptron(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
        # TF-IDF embedding table. NOTE(review): sized by max_len here but by
        # vocab_size in BertAffinityModel — confirm which is intended.
        self.tfidf_emb = nn.Embedding(config.max_len, config.hidden_size)
        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        tfidf_values=None,
    ):
        r"""
        tfidf_values (:obj:`torch.LongTensor`, `optional`):
            Indices into the TF-IDF embedding table; when given, the looked-up
            embeddings are added elementwise to the token embeddings
            (assumes shape (batch_size, sequence_length) — TODO confirm).

        Returns:
            :obj:`torch.FloatTensor` of shape
            :obj:`(batch_size, sequence_length, vocab_size)` with the
            language-modeling logits.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        # Broadcast the 2D/3D attention mask to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
        # Cross-attention mask, only relevant in decoder mode.
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # head_mask: [num_heads] or [num_hidden_layers x num_heads] expanded
        # to [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        # Add TF-IDF embeddings only when provided; previously a None value
        # crashed inside nn.Embedding because the lookup was unconditional.
        if tfidf_values is not None:
            embedding_output = embedding_output + self.tfidf_emb(tfidf_values)
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        # Per-token vocabulary logits for the masked-LM objective.
        logits = self.lm_head(sequence_output)
        return logits
# Select GPU 4. This must be exported: a bare `CUDA_VISIBLE_DEVICES=4` on its
# own line only sets a shell-local variable, which the `python` child process
# does NOT inherit.
export CUDA_VISIBLE_DEVICES=4
python run_prediction.py \
    --batch_size=56 \
    --task=train_mol \
    --epochs=100 \
    --lr=1e-5 \
    --savedir=pre-train-yzh \
    --config=./config/config_layer_6_mol.json
\ No newline at end of file
from argparse import ArgumentParser
import numpy as np
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
from torch.utils.tensorboard import SummaryWriter
import os
from tqdm import tqdm
# torch.set_default_tensor_type(torch.DoubleTensor)
def train(args, model, dataset, tokenizer, pre_train=False):
    """Train the affinity model on pre-encoded batches.

    Args:
        args: parsed CLI namespace (batch_size, workers, epochs, lr, savedir, init).
        model: BertAffinityModel to optimize in place.
        dataset: Dataset yielding (input_ids, token_type_ids, attention_mask, affinity).
        tokenizer: unused here (batches arrive already tokenized); kept so all
            train/test entry points share one call signature.
        pre_train: when True, warm-start weights from the checkpoint at args.init.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): with fewer than 10 total steps the original `num_step // 10`
    # was 0 and `step % save_step` raised ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): batch tensors are moved with .cuda() unconditionally below,
    # so a GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        for i, (input_ids, token_type_ids, attention_mask, affinity) in enumerate(data_generator):
            pred_affinity = model(input_ids=input_ids.cuda(),
                                  token_type_ids=token_type_ids.cuda(),
                                  attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.cuda().float().unsqueeze(-1))
            step += 1
            # .item() detaches and logs a plain float rather than a live tensor.
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs(exist_ok=True): os.mkdir failed when './model' was
                # missing and raced when concurrent runs created the directory.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test(args, model, dataset, tokenizer):
    """Run inference over a pre-encoded dataset and write one prediction per
    line to <args.output>/<args.task>.txt.

    Loads the checkpoint at args.init; `tokenizer` is unused (batches are
    already tokenized) and kept only for signature symmetry with test_mol.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): a GPU is required — inputs are moved with .cuda() below.
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            for input_ids, token_type_ids, attention_mask, affinity in tqdm(data_generator):
                pred_affinity = model(input_ids=input_ids.cuda(),
                                      token_type_ids=token_type_ids.cuda(),
                                      attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
def train_mol(args, model, dataset, tokenizer, pre_train=False):
    """Train on raw (sequence, affinity) pairs, tokenizing each batch on the fly.

    Args mirror train(); here `tokenizer.convert_token_to_ids` turns the raw
    string batch into (input_ids, attention_mask) tensors inside the loop.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): `num_step // 10` could be 0 for very short runs, making
    # `step % save_step` raise ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): tensors are moved with .cuda() unconditionally below, so a
    # GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        # renamed from `input`, which shadowed the builtin
        for i, (batch_input, affinity) in enumerate(data_generator):
            input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.to(torch.float32).cuda().unsqueeze(-1))
            step += 1
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs: os.mkdir failed when './model' was missing, and
                # exist_ok avoids the check-then-create race.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test_mol(args, model, dataset, tokenizer):
    """Predict affinities for raw-string batches, tokenizing on the fly.

    Writes one prediction per line to <args.output>/<args.task>.txt; loads the
    checkpoint at args.init first.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): GPU required — batch tensors are moved with .cuda().
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            # `batch_input` renamed from `input`, which shadowed the builtin
            for batch_input, affinity in tqdm(data_generator):
                input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
                pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
def main(args):
    """Build dataset/model/tokenizer for args.task and dispatch to the
    matching train/test routine.

    Raises:
        ValueError: if args.task matches none of the known task names.
    """
    data_file, tokenizer_config = get_task(args.task)
    # Molecule-style tasks use the generator dataset (raw strings, tokenized
    # inside the training/eval loop); everything else is pre-encoded.
    if args.task in ['train_mol', 'test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        dataset = Data_Gen(data_file)
    else:
        dataset = Data_Encoder(data_file, tokenizer_config)
    print('------------------creat model---------------------------')
    config = BertConfig.from_pretrained(args.config)
    model = BertAffinityModel(config)
    tokenizer = Tokenizer(tokenizer_config)
    print('model name : BertAffinity')
    print('task name : {}'.format(args.task))
    if args.task in ['train_mol']:
        train_mol(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        test_mol(args, model, dataset, tokenizer)
    elif args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100']:
        train(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test', 'test_ori_er', 'test_ori_gpcr', 'test_ori_channel', 'test_ori_kinase']:
        test(args, model, dataset, tokenizer)
    else:
        # Previously an unknown task fell through silently after the model had
        # already been built; fail loudly so misspelled tasks are caught.
        raise ValueError('unknown task: {}'.format(args.task))
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a boolean CLI value. The original used `type=bool`, which
        treats any non-empty string — including 'False' — as True."""
        return str(value).lower() in ('true', '1', 'yes', 'y')

    parser = ArgumentParser(description='BertAffinity')
    parser.add_argument('-b', '--batch-size', default=8, type=int,
                        metavar='N',
                        help='mini-batch size (default: 16), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                        help='number of data loading workers (default: 0)')
    parser.add_argument('--epochs', default=50, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--task', default='train', type=str, metavar='TASK',
                        help='Task name. Could be train, test, channel, ER, GPCR, kinase or else.')
    parser.add_argument('--lr', '--learning-rate', default=1e-5, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--config', default='./config/config.json', type=str, help='model config file path')
    parser.add_argument('--savedir', default='train', type=str, help='log and model save path')
    parser.add_argument('--init', default='model', type=str, help='init checkpoint')
    parser.add_argument('--output', default='predict', type=str, help='result save path')
    parser.add_argument('--pre_train', default=False, type=_str2bool, help='use pre-train')
    args = parser.parse_args()
    # BUG fixed: the script previously overwrote the parsed arguments with two
    # stacks of hard-coded experiment settings ("yzh new train" followed by
    # "yzh new test"), so every CLI flag was silently ignored and the process
    # always ran task='test_kinase' with a fixed checkpoint — the training
    # override block was dead code. Supply settings via CLI flags instead.
    main(args)
from argparse import ArgumentParser
import sys
import numpy as np
from dataset import Data_Encoder, get_task, Data_Encoder_mol, Data_Gen, Tokenizer
import torch
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
from torch.utils.tensorboard import SummaryWriter
import os
from tqdm import tqdm
# torch.set_default_tensor_type(torch.DoubleTensor)
# Make the repository root importable when the script is run from this file's
# own directory (so sibling packages resolve without installing the project).
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(ROOT_DIR)
def train(args, model, dataset, tokenizer, pre_train=False):
    """Train the affinity model on pre-encoded batches.

    Args:
        args: parsed CLI namespace (batch_size, workers, epochs, lr, savedir, init).
        model: BertAffinityModel to optimize in place.
        dataset: Dataset yielding (input_ids, token_type_ids, attention_mask, affinity).
        tokenizer: unused here (batches arrive already tokenized); kept so all
            train/test entry points share one call signature.
        pre_train: when True, warm-start weights from the checkpoint at args.init.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): with fewer than 10 total steps the original `num_step // 10`
    # was 0 and `step % save_step` raised ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): batch tensors are moved with .cuda() unconditionally below,
    # so a GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        for i, (input_ids, token_type_ids, attention_mask, affinity) in enumerate(data_generator):
            pred_affinity = model(input_ids=input_ids.cuda(),
                                  token_type_ids=token_type_ids.cuda(),
                                  attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.cuda().float().unsqueeze(-1))
            step += 1
            # .item() detaches and logs a plain float rather than a live tensor.
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs(exist_ok=True): os.mkdir failed when './model' was
                # missing and raced when concurrent runs created the directory.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test(args, model, dataset, tokenizer):
    """Run inference over a pre-encoded dataset and write one prediction per
    line to <args.output>/<args.task>.txt.

    Loads the checkpoint at args.init; `tokenizer` is unused (batches are
    already tokenized) and kept only for signature symmetry with test_mol.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): a GPU is required — inputs are moved with .cuda() below.
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            for input_ids, token_type_ids, attention_mask, affinity in tqdm(data_generator):
                pred_affinity = model(input_ids=input_ids.cuda(),
                                      token_type_ids=token_type_ids.cuda(),
                                      attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
def train_mol(args, model, dataset, tokenizer, pre_train=False):
    """Train on raw (sequence, affinity) pairs, tokenizing each batch on the fly.

    Args mirror train(); here `tokenizer.convert_token_to_ids` turns the raw
    string batch into (input_ids, attention_mask) tensors inside the loop.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': True,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    if pre_train:
        model.load_state_dict(torch.load(args.init), strict=True)
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fct = torch.nn.MSELoss()
    writer = SummaryWriter('./log/' + args.savedir)
    num_step = args.epochs * len(data_generator)
    step = 0
    # max(1, ...): `num_step // 10` could be 0 for very short runs, making
    # `step % save_step` raise ZeroDivisionError.
    save_step = max(1, num_step // 10)
    # NOTE(review): tensors are moved with .cuda() unconditionally below, so a
    # GPU is effectively required despite this conditional.
    if torch.cuda.is_available():
        model.cuda()
    print('epoch num : {}'.format(args.epochs))
    print('step num : {}'.format(num_step))
    print('batch size : {}'.format(args.batch_size))
    print('learning rate : {}'.format(args.lr))
    print('begin training')
    for epoch in range(args.epochs):
        # renamed from `input`, which shadowed the builtin
        for i, (batch_input, affinity) in enumerate(data_generator):
            input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
            pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
            loss = loss_fct(pred_affinity, affinity.to(torch.float32).cuda().unsqueeze(-1))
            step += 1
            writer.add_scalar('loss', loss.item(), global_step=step)
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('Training at Epoch ' + str(epoch + 1) + ' step ' + str(step) + ' with loss ' + str(
                loss.cpu().detach().numpy()))
            # periodic checkpointing, skipped during the first (warm-up) epoch
            if epoch >= 1 and step % save_step == 0:
                save_path = './model/' + args.savedir + '/'
                # makedirs: os.mkdir failed when './model' was missing, and
                # exist_ok avoids the check-then-create race.
                os.makedirs(save_path, exist_ok=True)
                torch.save(model.state_dict(),
                           save_path + 'epoch-{}-step-{}-loss-{}.pth'.format(epoch, step, loss.item()))
    print('training over')
    writer.close()
def test_mol(args, model, dataset, tokenizer):
    """Predict affinities for raw-string batches, tokenizing on the fly.

    Writes one prediction per line to <args.output>/<args.task>.txt; loads the
    checkpoint at args.init first. Optionally runs eval.py afterwards.
    """
    loader_kwargs = {'batch_size': args.batch_size,
                     'shuffle': False,
                     'num_workers': args.workers,
                     }
    data_generator = DataLoader(dataset, **loader_kwargs)
    with torch.no_grad():
        model.load_state_dict(torch.load(args.init), strict=True)
        # NOTE(review): GPU required — batch tensors are moved with .cuda().
        model.cuda()
        model.eval()
        # makedirs(exist_ok=True): os.mkdir raised when the parent of
        # args.output was missing, and raced on concurrent creation.
        os.makedirs(args.output, exist_ok=True)
        result = args.output + '/' + '{}.txt'.format(args.task)
        print('begin predicting')
        with open(result, 'w') as f:
            # `batch_input` renamed from `input`, which shadowed the builtin
            for batch_input, affinity in tqdm(data_generator):
                input_ids, attention_mask = tokenizer.convert_token_to_ids(batch_input)
                pred_affinity = model(input_ids=input_ids.cuda(), attention_mask=attention_mask.cuda())
                for res in pred_affinity.cpu().numpy().squeeze(-1):
                    f.write(str(res) + '\n')
    # BUG fixed: '--do_eval' is commented out of the argparse definition, so
    # `args.do_eval` did not exist and the original raised AttributeError
    # right after writing all predictions. getattr defaults to False.
    if getattr(args, 'do_eval', False):
        os.system('python eval.py')
def main(args):
    """Build dataset/model/tokenizer for args.task and dispatch to the
    matching train/test routine.

    Raises:
        ValueError: if args.task matches none of the known task names.
    """
    data_file, tokenizer_config = get_task(args.task)
    # Molecule-style tasks use the generator dataset (raw strings, tokenized
    # inside the training/eval loop); everything else is pre-encoded.
    if args.task in ['train_mol', 'test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        dataset = Data_Gen(data_file)
    else:
        dataset = Data_Encoder(data_file, tokenizer_config)
    print('------------------creat model---------------------------')
    config = BertConfig.from_pretrained(args.config)
    model = BertAffinityModel(config)
    tokenizer = Tokenizer(tokenizer_config)
    print('model name : BertAffinity')
    print('task name : {}'.format(args.task))
    if args.task in ['train_mol']:
        train_mol(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test_mol', "test_er", "test_gpcr", "test_channel", "test_kinase"]:
        test_mol(args, model, dataset, tokenizer)
    elif args.task in ['train', 'train_z_1', 'train_z_10', 'train_z_100']:
        train(args, model, dataset, tokenizer, pre_train=args.pre_train)
    elif args.task in ['test', 'test_ori_er', 'test_ori_gpcr', 'test_ori_channel', 'test_ori_kinase']:
        test(args, model, dataset, tokenizer)
    else:
        # Previously an unknown task fell through silently after the model had
        # already been built; fail loudly so misspelled tasks are caught.
        raise ValueError('unknown task: {}'.format(args.task))
if __name__ == '__main__':
    def _str2bool(value):
        """Parse a boolean CLI value. The original used `type=bool`, which
        treats any non-empty string — including 'False' — as True."""
        return str(value).lower() in ('true', '1', 'yes', 'y')

    parser = ArgumentParser(description='BertAffinity')
    parser.add_argument('-batch_size', default=8, type=int,
                        metavar='N',
                        help='mini-batch size (default: 16), this is the total '
                             'batch size of all GPUs on the current node when '
                             'using Data Parallel or Distributed Data Parallel')
    parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                        help='number of data loading workers (default: 0)')
    parser.add_argument('--epochs', default=50, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--task', default='train', type=str, metavar='TASK',
                        help='Task name. Could be train, test, channel, ER, GPCR, kinase or else.')
    parser.add_argument('--lr', '--learning-rate', default=1e-5, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--config', default='./config/config.json', type=str, help='model config file path')
    parser.add_argument('--savedir', default='train', type=str, help='log and model save path')
    parser.add_argument('--init', default='model', type=str, help='init checkpoint')
    parser.add_argument('--output', default='predict', type=str, help='result save path')
    # re-added: test_mol reads args.do_eval, but this flag had been commented
    # out, leaving the attribute undefined at runtime.
    parser.add_argument('--do_eval', default=False, type=_str2bool, help='run eval.py after prediction')
    parser.add_argument('--pre_train', default=False, type=_str2bool, help='use pre-train')
    args = parser.parse_args()
    main(args)
import re, collections
def get_stats(vocab):
    """Count the frequency of every adjacent symbol pair in the vocabulary.

    `vocab` maps a space-separated symbol string to its corpus frequency.
    Returns a defaultdict mapping (left, right) symbol pairs to summed counts.
    """
    pair_counts = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[(left, right)] += freq
    return pair_counts
def merge_vocab(pair, v_in):
    """Merge one symbol pair into a single symbol across the vocabulary.

    Every standalone occurrence of "a b" (the pair joined by a space) is
    rewritten as "ab"; word frequencies are carried over unchanged.
    """
    merged_symbol = ''.join(pair)
    # (?<!\S) / (?!\S): only match the pair when it is whitespace-delimited,
    # so symbols that merely contain the pair's text are left alone.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(merged_symbol, word): freq for word, freq in v_in.items()}
# Toy byte-pair-encoding demo: repeatedly merge the most frequent adjacent
# symbol pair until no pairs remain, printing each merge rule in order.
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
num_merges = 1000
for _ in range(num_merges):
    pair_stats = get_stats(vocab)
    if not pair_stats:
        # every word has collapsed to a single symbol — nothing left to merge
        break
    best_pair = max(pair_stats, key=pair_stats.get)
    vocab = merge_vocab(best_pair, vocab)
    print(best_pair)
# printed merge sequence:
# ('e', 's'), ('es', 't'), ('est', '</w>'), ('l', 'o'), ('lo', 'w'),
# ('n', 'e'), ('ne', 'w'), ('new', 'est</w>'), ('low', '</w>'),
# ('w', 'i'), ('wi', 'd'), ('wid', 'est</w>'),
# ('low', 'e'), ('lowe', 'r'), ('lower', '</w>')
\ No newline at end of file
# Run test_mol inference with the pre-trained checkpoint on GPU 1;
# predictions are written to ./predict/test/test_mol.txt.
CUDA_VISIBLE_DEVICES=1 python run_prediction.py \
--task=test_mol \
-batch_size=64 \
--output=./predict/test \
--config=./config/config_layer_6_mol.json \
--init=/notebook/our_model/model/pre-train-layer-6-1021/epoch-29-step-494220-loss-0.23760947585105896.pth
\ No newline at end of file
# Evaluate the masked-LM pre-training checkpoint on GPU 1.
# BUG fixed: a commented-out option sat in the middle of the backslash
# continuation chain; the `#` swallowed the rest of that joined line, so the
# trailing --init line was executed as a separate (failing) command. The
# retired option is preserved here, outside the command:
#   --output='model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073'
CUDA_VISIBLE_DEVICES=1 python run_pretraining.py \
  --batch-size=16 \
  --task=test-pre-train \
  --config=./config/config_layer_6_mol.json \
  --init='./model/mask-LM-layer-6-dobule-1020/epoch-11-step-395376-loss-0.06246088946244073.pth'
\ No newline at end of file
# Fine-tune the interaction model on the biosnap task (GPU 0).
# BUG fixed: `--pre_train=False` was removed — the python side declares the
# flag with argparse `type=bool`, and bool('False') is True in Python, so
# passing the flag actually *enabled* pre-training. Omitting it keeps the
# intended default (False); --init is then ignored by the training code.
CUDA_VISIBLE_DEVICES=0 \
python run_interaction.py \
  --epochs=50 \
  --lr=1e-5 \
  --task=train_biosnap \
  --batch_size=4 \
  --config=./config/config_layer_6_mol.json \
  --init=/notebook/our_model/model/pre-train-new-100epochs-config_layer_6_mol/epoch-99-step-3294800-loss-0.0736498162150383.pth
\ No newline at end of file
6.339674062480976
1.4751794034241978
from subword_nmt.apply_bpe import BPE
import codecs
import collections
# Byte-pair encoders for drug SMILES strings and protein sequences, built
# from pre-computed merge tables. merges=-1 applies every merge rule in the
# codes file; separator='' joins sub-tokens without a boundary marker.
bpe_codes_drug = codecs.open('../config/drug_codes_chembl.txt')
dbpe = BPE(bpe_codes_drug, merges=-1, separator='')
bpe_codes_prot = codecs.open('../config/protein_codes_uniprot.txt')
pbpe = BPE(bpe_codes_prot, merges=-1, separator='')
def load_file(file):
    """Read a text file and return its lines with trailing newlines removed."""
    with open(file, 'r') as handle:
        return [line.strip('\n') for line in handle.readlines()]
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary mapping token -> line index."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, token in enumerate(reader.readlines()):
            vocab[token.rstrip("\n")] = index
    return vocab
def seq2vec(protein, drug):
    """BPE-tokenize paired protein/drug files and print, per pair, the number
    of protein sub-tokens.

    NOTE(review): `tokens` is assembled as [CLS] drug [SEP] protein [SEP] but
    never returned or stored — presumably a leftover from a vectorization
    step; confirm before relying on this function beyond the printed counts.
    """
    start_token = '[CLS]'
    sep_token = '[SEP]'
    protein_lines = load_file(protein)
    drug_lines = load_file(drug)
    for prot_line, drug_line in zip(protein_lines, drug_lines):
        drug_tokens = dbpe.process_line(drug_line).split()
        prot_tokens = pbpe.process_line(prot_line).split()
        tokens = [start_token] + drug_tokens + [sep_token] + prot_tokens + [sep_token]
        print(len(prot_tokens))
if __name__ == '__main__':
    # NOTE(review): this pairs the *test* protein file with the *train*
    # SMILES file (and `simle` looks like a typo for `smile`) — confirm the
    # pairing is intentional before trusting the printed token counts.
    seq = '../data/test/test_protein_seq'
    simle = '../data/train/train_smile'
    vocab = '../config/vocab_mol.txt'
    seq2vec(seq, simle)
\ No newline at end of file
import pandas as pd
import numpy as np
# Build the combined vocabulary file: special tokens first, then the protein
# sub-word units, then the drug sub-word units, one token per line.
drug_units = pd.read_csv('../config/subword_units_map_chembl.csv')['index'].values
protein_units = pd.read_csv('../config/subword_units_map_uniprot.csv')['index'].values
special_tokens = np.array(['[PAD]', '[MASK]', '[CLS]', '[SEP]', '[UNK]',
                           '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]'])
all_tokens = np.concatenate((special_tokens, protein_units, drug_units))
with open('../config/vocab_mol.txt', 'w') as vocab_file:
    for token in all_tokens:
        vocab_file.write(str(token) + '\n')
import numpy as np
from tqdm import tqdm
def z_score(data, save, enlarge):
    """Standardize the affinities in `data` (one float per line) and write the
    z-scores, multiplied by `enlarge`, to `save` (one value per line)."""
    with open(data, 'r') as source:
        values = np.array([np.float64(line.strip()) for line in source.readlines()])
    standardized = (values - np.mean(values)) / np.std(values)
    standardized = standardized * enlarge
    with open(save, 'w') as sink:
        for value in tqdm(list(standardized)):
            sink.write(str(value) + '\n')
def reform(input_file_path, result_save_path, average, std, enlarge):
    """Invert the z-score transform: map each standardized prediction back to
    the original affinity scale via the stored mean/std and enlarge factor."""
    with open(input_file_path, 'r') as source:
        predictions = source.readlines()
    with open(result_save_path, 'w') as sink:
        for line in tqdm(predictions):
            restored = ((float(line.strip()) / enlarge) * std) + average
            sink.write(str(restored) + '\n')
if __name__ == '__main__':
    # Mean/std of the training affinities, used to undo the z-score transform.
    # NOTE(review): these constants appear to be precomputed from the training
    # set (they match the values stored alongside the data) — confirm they
    # match the run whose predictions are being reformed.
    average = 6.339674062480976
    std = 1.4751794034241978
    # Generate a z-scored dataset (kept for reference):
    # data = '../data/train_ic50'
    # save = '../data/train_z_1_ic50'
    # enlarge = 1
    # z_score(data, save, enlarge)
    # Map standardized predictions back to the original affinity scale.
    result = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test_1.txt'
    save = '../predict/lr-1e-5-batch-32-e-10-layer3-0503-z-1-step-82370/test.txt'
    reform(result, save, average, std, 1)
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论