Commit e37fe169 by 朱学凯

add trainer

Parent 8f39b5be
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6 (py3.6)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="subword-nmt" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (code)" project-jdk-type="Python SDK" />
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (py3.6)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CPI.iml" filepath="$PROJECT_DIR$/.idea/CPI.iml" />
</modules>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CPI.iml" filepath="$PROJECT_DIR$/.idea/CPI.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="7d3a4caa-6c7d-4ed2-9017-145cec64d9a3" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/.idea/inspectionProfiles/Project_Default.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/.gitignore" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/CPI.iml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/CPI.iml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/misc.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/modules.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/modules.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/vcs.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/vcs.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/activations.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/configuration_bert.py" beforeDir="false" afterPath="$PROJECT_DIR$/configuration_bert.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/configuration_utils.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/dataset.py" beforeDir="false" afterPath="$PROJECT_DIR$/dataset.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/file_utils.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/generation_beam_search.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/generation_logits_process.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/generation_stopping_criteria.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/generation_utils.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/modeling_bert.py" beforeDir="false" afterPath="$PROJECT_DIR$/modeling_bert.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/modeling_outputs.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/modeling_utils.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/run_interaction.py" beforeDir="false" afterPath="$PROJECT_DIR$/run_interaction.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/utils/hf_api.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/utils/logging.py" beforeDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="ProjectId" id="1qEzgr8swJnGyU4GzuvJCGLLPSb" />
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">
<property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
<property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
<property name="SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
<property name="WebServerToolWindowFactoryState" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="restartRequiresConfirmation" value="false" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="E:\CPI\project\CPI" />
</key>
</component>
<component name="RunManager" selected="Python.run_interaction">
<configuration name="dataset" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="CPI" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/dataset.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="run_interaction" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="CPI" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/run_interaction.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.run_interaction" />
<item itemvalue="Python.dataset" />
</list>
</recent_temporary>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="7d3a4caa-6c7d-4ed2-9017-145cec64d9a3" name="Default Changelist" comment="" />
<created>1616659651684</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1616659651684</updated>
<workItem from="1616659659765" duration="24696000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="Vcs.Log.Tabs.Properties">
<option name="TAB_STATES">
<map>
<entry key="MAIN">
<value>
<State />
</value>
</entry>
</map>
</option>
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/run_interaction.py</url>
<line>57</line>
<option name="timeStamp" value="16" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/CPI$run_interaction.coverage" NAME="run_interaction Coverage Results" MODIFIED="1616929902995" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/CPI$dataset.coverage" NAME="dataset Coverage Results" MODIFIED="1616847821413" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
</component>
</project>
\ No newline at end of file
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torch.nn.functional as F
from packaging import version
from utils import logging
logger = logging.get_logger(__name__)
def _gelu_python(x):
"""
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def gelu_new(x):
"""
Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
if version.parse(torch.__version__) < version.parse("1.4"):
gelu = _gelu_python
else:
gelu = F.gelu
def gelu_fast(x):
return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
def _silu_python(x):
"""
See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
later.
"""
return x * torch.sigmoid(x)
if version.parse(torch.__version__) < version.parse("1.7"):
silu = _silu_python
else:
silu = F.silu
def mish(x):
return x * torch.tanh(torch.nn.functional.softplus(x))
def linear_act(x):
return x
ACT2FN = {
"relu": F.relu,
"silu": silu,
"swish": silu,
"gelu": gelu,
"tanh": torch.tanh,
"gelu_new": gelu_new,
"gelu_fast": gelu_fast,
"mish": mish,
"linear": linear_act,
"sigmoid": torch.sigmoid,
}
def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
@@ -15,11 +15,7 @@
# limitations under the License.
""" BERT model configuration """
from configuration_utils import PretrainedConfig
from file_utils import logging
logger = logging.get_logger(__name__)
from transformers.configuration_utils import PretrainedConfig
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
@@ -44,6 +40,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
"TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
"wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
"BertAffinity": "./config/config.json"
# See all BERT models at https://huggingface.co/models?filter=bert
}
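# Hedged usage sketch, assuming this file still defines BertConfig as in upstream
# transformers (not shown in this hunk): the new "BertAffinity" entry points at a local
# JSON file, so the configuration can be loaded from that path instead of the hub, e.g.
#     config = BertConfig.from_pretrained(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP["BertAffinity"])
#     # equivalent to BertConfig.from_pretrained("./config/config.json")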
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Configuration base class and utilities."""
import copy
import json
import os
from typing import Any, Dict, Tuple, Union
from file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_offline_mode, is_remote_url
from utils import logging
__version__ = "4.4.2"
logger = logging.get_logger(__name__)
class PretrainedConfig(object):
r"""
Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
methods for loading/downloading/saving configurations.
Note: A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
initialize a model does **not** load the model weights. It only affects the model's configuration.
Class attributes (overridden by derived classes)
- **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
recreate the correct object in :class:`~transformers.AutoConfig`.
- **is_composition** (:obj:`bool`): Whether the config class is composed of multiple sub-configs. In this case
the config has to be initialized from two or more configs of type :class:`~transformers.PretrainedConfig`
like: :class:`~transformers.EncoderDecoderConfig` or :class:`~RagConfig`.
- **keys_to_ignore_at_inference** (:obj:`List[str]`): A list of keys to ignore by default when looking at
dictionary outputs of the model during inference.
Args:
name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`):
Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or
:func:`~transformers.TFPreTrainedModel.from_pretrained` as ``pretrained_model_name_or_path`` if the
configuration was created with such a method.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all hidden-states.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return all attentions.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain
tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as a decoder or not (if not, it is used as an encoder).
add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
that can be used as decoder models within the :class:`~transformers.EncoderDecoderModel` class, which
consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``.
tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
and decoder model to have the exact same parameter names.
prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`):
Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
heads to prune in said layer.
For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`):
The chunk size of all feed forward layers in the residual attention blocks. A chunk size of :obj:`0` means
that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes
:obj:`n` < sequence_length embeddings at a time. For more information on feed forward chunking, see `How
does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
Parameters for sequence generation
- **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by default in the
:obj:`generate` method of the model.
- **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by default in the
:obj:`generate` method of the model.
- **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in the
:obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
- **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default
in the :obj:`generate` method of the model. Whether to stop the beam search when at least ``num_beams``
sentences are finished per batch or not.
- **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be used by
default in the :obj:`generate` method of the model. 1 means no beam search.
- **num_beam_groups** (:obj:`int`, `optional`, defaults to 1) -- Number of groups to divide :obj:`num_beams`
into in order to ensure diversity among different groups of beams that will be used by default in the
:obj:`generate` method of the model. 1 means no group beam search.
- **diversity_penalty** (:obj:`float`, `optional`, defaults to 0.0) -- Value to control diversity for group
beam search that will be used by default in the :obj:`generate` method of the model. 0 means no diversity
penalty. The higher the penalty, the more diverse are the outputs.
- **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly
positive.
- **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to keep
for top-k-filtering that will be used by default in the :obj:`generate` method of the model.
- **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the
:obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens with
probabilities that add up to ``top_p`` or higher are kept for generation.
- **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty that
will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty.
- **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that will
be used by default in the :obj:`generate` method of the model.
- **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default in the
:obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of that size
can only occur once.
- **encoder_no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by
default in the :obj:`generate` method of the model for ``encoder_no_repeat_ngram_size``. If set to int > 0,
all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the ``decoder_input_ids``.
- **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be generated
that will be used by default in the :obj:`generate` method of the model. In order to get the tokens of the
words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word,
add_prefix_space=True)`.
- **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed returned
sequences for each element in the batch that will be used by default in the :obj:`generate` method of the
model.
- **output_scores** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should return the
logits when used for generation.
- **return_dict_in_generate** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should
return a :class:`~transformers.file_utils.ModelOutput` instead of a :obj:`torch.LongTensor`.
- **forced_bos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the first generated token
after the :obj:`decoder_start_token_id`. Useful for multilingual models like :doc:`mBART
<../model_doc/mbart>` where the first generated token needs to be the target language token.
- **forced_eos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the last generated token
when :obj:`max_length` is reached.
Parameters for fine-tuning tasks
- **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the model
pretrained weights.
- **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be
used when converting from an original (TensorFlow or PyTorch) checkpoint.
- **id2label** (:obj:`Dict[int, str]`, `optional`) -- A map from index (for instance prediction index, or
target index) to label.
- **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model.
- **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model,
typically for a classification task.
- **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for the
current task.
Parameters linked to the tokenizer
- **tokenizer_class** (:obj:`str`, `optional`) -- The name of the associated tokenizer class to use (if none is
set, will use the tokenizer associated to the model by default).
- **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each text
before calling the model.
- **bos_token_id** (:obj:`int`, `optional`) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`) -- If an encoder-decoder model starts decoding with a
different token than `bos`, the id of that token.
- **sep_token_id** (:obj:`int`, `optional`) -- The id of the `separation` token.
PyTorch specific parameters
- **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be
used with Torchscript.
- **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and
output word embeddings should be tied. Note that this is only relevant if the model has an output word
embedding layer.
TensorFlow specific parameters
- **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should use
BFloat16 scalars (only used by some TensorFlow models).
"""
model_type: str = ""
is_composition: bool = False
def __init__(self, **kwargs):
# Attributes with defaults
self.return_dict = kwargs.pop("return_dict", True)
self.output_hidden_states = kwargs.pop("output_hidden_states", False)
self.output_attentions = kwargs.pop("output_attentions", False)
self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models
self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
self.pruned_heads = kwargs.pop("pruned_heads", {})
self.tie_word_embeddings = kwargs.pop(
"tie_word_embeddings", True
) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models.
# Is decoder is used in encoder-decoder models to differentiate encoder from decoder
self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
self.is_decoder = kwargs.pop("is_decoder", False)
self.add_cross_attention = kwargs.pop("add_cross_attention", False)
self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False)
# Parameters for sequence generation
self.max_length = kwargs.pop("max_length", 20)
self.min_length = kwargs.pop("min_length", 0)
self.do_sample = kwargs.pop("do_sample", False)
self.early_stopping = kwargs.pop("early_stopping", False)
self.num_beams = kwargs.pop("num_beams", 1)
self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
self.temperature = kwargs.pop("temperature", 1.0)
self.top_k = kwargs.pop("top_k", 50)
self.top_p = kwargs.pop("top_p", 1.0)
self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
self.length_penalty = kwargs.pop("length_penalty", 1.0)
self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
self.bad_words_ids = kwargs.pop("bad_words_ids", None)
self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0)
self.output_scores = kwargs.pop("output_scores", False)
self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
# Fine-tuning task arguments
self.architectures = kwargs.pop("architectures", None)
self.finetuning_task = kwargs.pop("finetuning_task", None)
self.id2label = kwargs.pop("id2label", None)
self.label2id = kwargs.pop("label2id", None)
if self.id2label is not None:
kwargs.pop("num_labels", None)
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
# Keys are always strings in JSON so convert ids to int here.
else:
self.num_labels = kwargs.pop("num_labels", 2)
# Tokenizer arguments TODO: eventually tokenizer and models should share the same config
self.tokenizer_class = kwargs.pop("tokenizer_class", None)
self.prefix = kwargs.pop("prefix", None)
self.bos_token_id = kwargs.pop("bos_token_id", None)
self.pad_token_id = kwargs.pop("pad_token_id", None)
self.eos_token_id = kwargs.pop("eos_token_id", None)
self.sep_token_id = kwargs.pop("sep_token_id", None)
self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
# task specific arguments
self.task_specific_params = kwargs.pop("task_specific_params", None)
# TPU arguments
if kwargs.pop("xla_device", None) is not None:
logger.warn(
"The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can "
"safely remove it from your `config.json` file."
)
# Name or path to the pretrained checkpoint
self._name_or_path = str(kwargs.pop("name_or_path", ""))
# Drop the transformers version info
kwargs.pop("transformers_version", None)
# Additional attributes without default values
for key, value in kwargs.items():
try:
setattr(self, key, value)
except AttributeError as err:
logger.error("Can't set {} with value {} for {}".format(key, value, self))
raise err
@property
def name_or_path(self) -> str:
return self._name_or_path
@name_or_path.setter
def name_or_path(self, value):
self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding)
@property
def use_return_dict(self) -> bool:
"""
:obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples.
"""
# If torchscript is set, force `return_dict=False` to avoid jit errors
return self.return_dict and not self.torchscript
@property
def num_labels(self) -> int:
"""
:obj:`int`: The number of labels for classification models.
"""
return len(self.id2label)
@num_labels.setter
def num_labels(self, num_labels: int):
if self.id2label is None or len(self.id2label) != num_labels:
self.id2label = {i: "LABEL_{}".format(i) for i in range(num_labels)}
self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
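# Hedged illustration, not part of the original file: setting num_labels regenerates
# id2label/label2id with placeholder names, which is why an explicit id2label passed to
# __init__ above takes precedence over num_labels.
#     cfg = PretrainedConfig(num_labels=3)
#     cfg.id2label  == {0: "LABEL_0", 1: "LABEL_1", 2: "LABEL_2"}
#     cfg.label2id  == {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}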
def save_pretrained(self, save_directory: Union[str, os.PathLike]):
"""
Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the
:func:`~transformers.PretrainedConfig.from_pretrained` class method.
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`):
Directory where the configuration JSON file will be saved (will be created if it does not exist).
"""
if os.path.isfile(save_directory):
raise AssertionError("Provided path ({}) should be a directory, not a file".format(save_directory))
os.makedirs(save_directory, exist_ok=True)
# If we save using the predefined names, we can load using `from_pretrained`
output_config_file = os.path.join(save_directory, CONFIG_NAME)
self.to_json_file(output_config_file, use_diff=True)
logger.info(f"Configuration saved in {output_config_file}")
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
r"""
Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pretrained model
configuration.
Args:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
This can be either:
- a string, the `model id` of a pretrained model configuration hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the
:func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.,
``./my_model_directory/configuration.json``.
cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to force to (re-)download the configuration files and override the cached versions if
they exist.
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to delete incompletely received file. Attempts to resume the download if such a file
exists.
proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
identifier allowed by git.
return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
If :obj:`False`, then this function returns just the final configuration object.
If :obj:`True`, then this function returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs`
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e.,
the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored.
kwargs (:obj:`Dict[str, Any]`, `optional`):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the ``return_unused_kwargs`` keyword parameter.
.. note::
Passing :obj:`use_auth_token=True` is required when you want to use a private model.
Returns:
:class:`PretrainedConfig`: The configuration object instantiated from this pretrained model.
Examples::
# We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
# derived class: BertConfig
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache.
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
assert config.output_attentions == True
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
foo=False, return_unused_kwargs=True)
assert config.output_attentions == True
assert unused_kwargs == {'foo': False}
"""
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_dict(config_dict, **kwargs)
@classmethod
def get_config_dict(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""
From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
:class:`~transformers.PretrainedConfig` using ``from_dict``.
Parameters:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns:
:obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
use_auth_token = kwargs.pop("use_auth_token", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
else:
config_file = hf_bucket_url(
pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None
)
try:
# Load from URL or cache if already cached
resolved_config_file = cached_path(
config_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
)
# Load config dict
config_dict = cls._dict_from_json_file(resolved_config_file)
except EnvironmentError as err:
logger.error(err)
msg = (
f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n"
)
raise EnvironmentError(msg)
except json.JSONDecodeError:
msg = (
"Couldn't reach server at '{}' to download configuration file or "
"configuration file is not a valid JSON file. "
"Please check network or file content here: {}.".format(config_file, resolved_config_file)
)
raise EnvironmentError(msg)
if resolved_config_file == config_file:
logger.info("loading configuration file {}".format(config_file))
else:
logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file))
return config_dict, kwargs
@classmethod
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
"""
Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters.
Args:
config_dict (:obj:`Dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
retrieved from a pretrained checkpoint by leveraging the
:func:`~transformers.PretrainedConfig.get_config_dict` method.
kwargs (:obj:`Dict[str, Any]`):
Additional parameters from which to initialize the configuration object.
Returns:
:class:`PretrainedConfig`: The configuration object instantiated from those parameters.
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
config = cls(**config_dict)
if hasattr(config, "pruned_heads"):
config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
# Update config with kwargs if needed
to_remove = []
for key, value in kwargs.items():
if hasattr(config, key):
setattr(config, key, value)
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
logger.info("Model config %s", str(config))
if return_unused_kwargs:
return config, kwargs
else:
return config
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig":
"""
Instantiates a :class:`~transformers.PretrainedConfig` from the path to a JSON file of parameters.
Args:
json_file (:obj:`str` or :obj:`os.PathLike`):
Path to the JSON file containing the parameters.
Returns:
:class:`PretrainedConfig`: The configuration object instantiated from that JSON file.
"""
config_dict = cls._dict_from_json_file(json_file)
return cls(**config_dict)
@classmethod
def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
return json.loads(text)
def __eq__(self, other):
return self.__dict__ == other.__dict__
def __repr__(self):
return "{} {}".format(self.__class__.__name__, self.to_json_string())
def to_diff_dict(self) -> Dict[str, Any]:
"""
Removes all attributes from config which correspond to the default config attributes for better readability and
serializes to a Python dictionary.
Returns:
:obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
config_dict = self.to_dict()
# get the default config dict
default_config_dict = PretrainedConfig().to_dict()
# get class specific config dict
class_config_dict = self.__class__().to_dict() if not self.is_composition else {}
serializable_config_dict = {}
# only serialize values that differ from the default config
for key, value in config_dict.items():
if (
key not in default_config_dict
or key == "transformers_version"
or value != default_config_dict[key]
or (key in class_config_dict and value != class_config_dict[key])
):
serializable_config_dict[key] = value
return serializable_config_dict
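# Hedged illustration, not part of the original file: only values that differ from the
# defaults survive to_diff_dict, which keeps the saved config.json small, e.g.
#     cfg = PretrainedConfig(num_beams=4)
#     "num_beams" in cfg.to_diff_dict()      # True  (non-default)
#     "temperature" in cfg.to_diff_dict()    # False (still the default 1.0)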
def to_dict(self) -> Dict[str, Any]:
"""
Serializes this instance to a Python dictionary.
Returns:
:obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
# Transformers version when serializing the model
output["transformers_version"] = __version__
return output
def to_json_string(self, use_diff: bool = True) -> str:
"""
Serializes this instance to a JSON string.
Args:
use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, only the difference between the config instance and the default
``PretrainedConfig()`` is serialized to JSON string.
Returns:
:obj:`str`: String containing all the attributes that make up this configuration instance in JSON format.
"""
if use_diff is True:
config_dict = self.to_diff_dict()
else:
config_dict = self.to_dict()
return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True):
"""
Save this instance to a JSON file.
Args:
json_file_path (:obj:`str` or :obj:`os.PathLike`):
Path to the JSON file in which this configuration instance's parameters will be saved.
use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to ``True``, only the difference between the config instance and the default
``PretrainedConfig()`` is serialized to JSON file.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string(use_diff=use_diff))
def update(self, config_dict: Dict[str, Any]):
"""
Updates attributes of this class with attributes from ``config_dict``.
Args:
config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that shall be updated for this class.
"""
for key, value in config_dict.items():
setattr(self, key, value)
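if __name__ == "__main__":
    # Hedged usage sketch, not part of the original file: update() overwrites attributes in
    # place without any validation, e.g. to tweak generation defaults after loading a config.
    example_cfg = PretrainedConfig(num_beams=4)
    example_cfg.update({"max_length": 64})
    assert example_cfg.max_length == 64 and example_cfg.num_beams == 4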
@@ -173,6 +173,6 @@ if __name__ == "__main__":
'num_workers': 0,
'drop_last': True}
trainset = Data_Encoder(df_train, tokenizer_config)
training_generator = data.DataLoader(trainset)
training_generator = data.DataLoader(trainset, **params)
for i, (input, affinity) in tqdm(enumerate(training_generator)):
print('')
print(input.size())
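# Hedged sketch, not part of the diff: the head of `params` is elided by the hunk above, so
# the values below are hypothetical; only 'num_workers' and 'drop_last' are shown in the
# source. The fix matters because data.DataLoader(trainset) alone ignores `params` and falls
# back to batch_size=1 with no shuffling or worker processes.
#     params = {'batch_size': 32,   # hypothetical
#               'shuffle': True,    # hypothetical
#               'num_workers': 0,
#               'drop_last': True}
#     training_generator = data.DataLoader(trainset, **params)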
# Copyright 2020 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache. Parts of this file are adapted from the AllenNLP library at
https://github.com/allenai/allennlp.
"""
import copy
import fnmatch
import importlib.util
import io
import json
import os
import re
import shutil
import sys
import tarfile
import tempfile
from collections import OrderedDict, UserDict
from contextlib import contextmanager
from dataclasses import fields
from enum import Enum
from functools import partial, wraps
from hashlib import sha256
from pathlib import Path
from types import ModuleType
from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile
import numpy as np
from packaging import version
from tqdm.auto import tqdm
import requests
from filelock import FileLock
__version__ = '4.4.2'
from utils.hf_api import HfFolder
from utils import logging
# The package importlib_metadata is in a different place, depending on the python version.
if sys.version_info < (3, 8):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
_torch_available = importlib.util.find_spec("torch") is not None
if _torch_available:
try:
_torch_version = importlib_metadata.version("torch")
logger.info(f"PyTorch version {_torch_version} available.")
except importlib_metadata.PackageNotFoundError:
_torch_available = False
else:
logger.info("Disabling PyTorch because USE_TF is set")
_torch_available = False
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
_tf_available = importlib.util.find_spec("tensorflow") is not None
if _tf_available:
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
try:
_tf_version = importlib_metadata.version("tensorflow")
except importlib_metadata.PackageNotFoundError:
try:
_tf_version = importlib_metadata.version("tensorflow-cpu")
except importlib_metadata.PackageNotFoundError:
try:
_tf_version = importlib_metadata.version("tensorflow-gpu")
except importlib_metadata.PackageNotFoundError:
try:
_tf_version = importlib_metadata.version("tf-nightly")
except importlib_metadata.PackageNotFoundError:
try:
_tf_version = importlib_metadata.version("tf-nightly-cpu")
except importlib_metadata.PackageNotFoundError:
try:
_tf_version = importlib_metadata.version("tf-nightly-gpu")
except importlib_metadata.PackageNotFoundError:
# Support for intel-tensorflow version
try:
_tf_version = importlib_metadata.version("intel-tensorflow")
except importlib_metadata.PackageNotFoundError:
_tf_version = None
_tf_available = False
if _tf_available:
if version.parse(_tf_version) < version.parse("2"):
logger.info(f"TensorFlow found but with version {_tf_version}. Transformers requires version 2 minimum.")
_tf_available = False
else:
logger.info(f"TensorFlow version {_tf_version} available.")
else:
logger.info("Disabling Tensorflow because USE_TORCH is set")
_tf_available = False
if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
_flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
if _flax_available:
try:
_jax_version = importlib_metadata.version("jax")
_flax_version = importlib_metadata.version("flax")
logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.")
except importlib_metadata.PackageNotFoundError:
_flax_available = False
else:
_flax_available = False
_datasets_available = importlib.util.find_spec("datasets") is not None
try:
# Check we're not importing a "datasets" directory somewhere but the actual library by trying to grab the version
# AND checking it has an author field in the metadata that is HuggingFace.
_ = importlib_metadata.version("datasets")
_datasets_metadata = importlib_metadata.metadata("datasets")
if _datasets_metadata.get("author", "") != "HuggingFace Inc.":
_datasets_available = False
except importlib_metadata.PackageNotFoundError:
_datasets_available = False
_faiss_available = importlib.util.find_spec("faiss") is not None
try:
_faiss_version = importlib_metadata.version("faiss")
logger.debug(f"Successfully imported faiss version {_faiss_version}")
except importlib_metadata.PackageNotFoundError:
try:
_faiss_version = importlib_metadata.version("faiss-cpu")
logger.debug(f"Successfully imported faiss version {_faiss_version}")
except importlib_metadata.PackageNotFoundError:
_faiss_available = False
_onnx_available = (
importlib.util.find_spec("keras2onnx") is not None and importlib.util.find_spec("onnxruntime") is not None
)
try:
_onnx_version = importlib_metadata.version("onnx")
logger.debug(f"Successfully imported onnx version {_onnx_version}")
except importlib_metadata.PackageNotFoundError:
_onnx_available = False
_scatter_available = importlib.util.find_spec("torch_scatter") is not None
try:
_scatter_version = importlib_metadata.version("torch_scatter")
logger.debug(f"Successfully imported torch-scatter version {_scatter_version}")
except importlib_metadata.PackageNotFoundError:
_scatter_available = False
_soundfile_available = importlib.util.find_spec("soundfile") is not None
try:
_soundfile_version = importlib_metadata.version("soundfile")
logger.debug(f"Successfully imported soundfile version {_soundfile_version}")
except importlib_metadata.PackageNotFoundError:
_soundfile_available = False
_torchaudio_available = importlib.util.find_spec("torchaudio") is not None
try:
_torchaudio_version = importlib_metadata.version("torchaudio")
logger.debug(f"Successfully imported soundfile version {_torchaudio_version}")
except importlib_metadata.PackageNotFoundError:
_torchaudio_available = False
torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
old_default_cache_path = os.path.join(torch_cache_home, "transformers")
# New default cache, shared with the Datasets library
hf_cache_home = os.path.expanduser(
os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
default_cache_path = os.path.join(hf_cache_home, "transformers")
# Onetime move from the old location to the new one if no ENV variable has been set.
if (
os.path.isdir(old_default_cache_path)
and not os.path.isdir(default_cache_path)
and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
and "TRANSFORMERS_CACHE" not in os.environ
):
logger.warn(
"In Transformers v4.0.0, the default path to cache downloaded models changed from "
"'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden "
"and '~/.cache/torch/transformers' is a directory that exists, we're moving it to "
"'~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should "
"only see this message once."
)
shutil.move(old_default_cache_path, default_cache_path)
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False)
WEIGHTS_NAME = "pytorch_model.bin"
TF2_WEIGHTS_NAME = "tf_model.h5"
TF_WEIGHTS_NAME = "model.ckpt"
FLAX_WEIGHTS_NAME = "flax_model.msgpack"
CONFIG_NAME = "config.json"
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
MODEL_CARD_NAME = "modelcard.json"
SENTENCEPIECE_UNDERLINE = "▁"
SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE # Kept for backward compatibility
MULTIPLE_CHOICE_DUMMY_INPUTS = [
[[0, 1, 0, 1], [1, 0, 0, 1]]
] * 2 # Needs to have 0s and 1s only since XLM uses it for langs too.
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
HUGGINGFACE_CO_PREFIX = "https://huggingface.co/{model_id}/resolve/{revision}/{filename}"
PRESET_MIRROR_DICT = {
"tuna": "https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models",
"bfsu": "https://mirrors.bfsu.edu.cn/hugging-face-models",
}
_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False
def is_offline_mode():
return _is_offline_mode
def is_torch_available():
return _torch_available
def is_torch_cuda_available():
if is_torch_available():
import torch
return torch.cuda.is_available()
else:
return False
def is_tf_available():
return _tf_available
def is_onnx_available():
return _onnx_available
def is_flax_available():
return _flax_available
def is_torch_tpu_available():
if not _torch_available:
return False
# This test is probably enough, but just in case, we unpack a bit.
if importlib.util.find_spec("torch_xla") is None:
return False
if importlib.util.find_spec("torch_xla.core") is None:
return False
return importlib.util.find_spec("torch_xla.core.xla_model") is not None
def is_datasets_available():
return _datasets_available
def is_psutil_available():
return importlib.util.find_spec("psutil") is not None
def is_py3nvml_available():
return importlib.util.find_spec("py3nvml") is not None
def is_apex_available():
return importlib.util.find_spec("apex") is not None
def is_faiss_available():
return _faiss_available
def is_sklearn_available():
if importlib.util.find_spec("sklearn") is None:
return False
if importlib.util.find_spec("scipy") is None:
return False
return importlib.util.find_spec("sklearn.metrics") and importlib.util.find_spec("scipy.stats")
def is_sentencepiece_available():
return importlib.util.find_spec("sentencepiece") is not None
def is_protobuf_available():
if importlib.util.find_spec("google") is None:
return False
return importlib.util.find_spec("google.protobuf") is not None
def is_tokenizers_available():
return importlib.util.find_spec("tokenizers") is not None
def is_in_notebook():
try:
# Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
get_ipython = sys.modules["IPython"].get_ipython
if "IPKernelApp" not in get_ipython().config:
raise ImportError("console")
if "VSCODE_PID" in os.environ:
raise ImportError("vscode")
return importlib.util.find_spec("IPython") is not None
except (AttributeError, ImportError, KeyError):
return False
def is_scatter_available():
return _scatter_available
def is_pandas_available():
return importlib.util.find_spec("pandas") is not None
def is_sagemaker_distributed_available():
# Get the sagemaker specific env variable.
sagemaker_params = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
try:
# Parse it and check the field "sagemaker_distributed_dataparallel_enabled".
sagemaker_params = json.loads(sagemaker_params)
if not sagemaker_params.get("sagemaker_distributed_dataparallel_enabled", False):
return False
except json.JSONDecodeError:
return False
# Lastly, check if the `smdistributed` module is present.
return importlib.util.find_spec("smdistributed") is not None
def is_training_run_on_sagemaker():
return "SAGEMAKER_JOB_NAME" in os.environ and not DISABLE_TELEMETRY
def is_soundfile_availble():
return _soundfile_available
def is_torchaudio_available():
return _torchaudio_available
def torch_only_method(fn):
def wrapper(*args, **kwargs):
if not _torch_available:
raise ImportError(
"You need to install pytorch to use this method or class, "
"or activate it with environment variables USE_TORCH=1 and USE_TF=0."
)
else:
return fn(*args, **kwargs)
return wrapper
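# Hedged usage sketch, not part of the original file: torch_only_method wraps functions that
# need PyTorch so a missing backend fails fast with a clear ImportError. The helper below is
# illustrative only and is never called here.
@torch_only_method
def _example_count_parameters(model):
    return sum(p.numel() for p in model.parameters())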
# docstyle-ignore
DATASETS_IMPORT_ERROR = """
{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
```
pip install datasets
```
In a notebook or a colab, you can install it by executing a cell with
```
!pip install datasets
```
then restarting your kernel.
Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current
working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or
that python file if that's the case.
"""
# docstyle-ignore
TOKENIZERS_IMPORT_ERROR = """
{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
```
pip install tokenizers
```
In a notebook or a colab, you can install it by executing a cell with
```
!pip install tokenizers
```
"""
# docstyle-ignore
SENTENCEPIECE_IMPORT_ERROR = """
{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment.
"""
# docstyle-ignore
PROTOBUF_IMPORT_ERROR = """
{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment.
"""
# docstyle-ignore
FAISS_IMPORT_ERROR = """
{0} requires the faiss library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
that match your environment.
"""
# docstyle-ignore
PYTORCH_IMPORT_ERROR = """
{0} requires the PyTorch library but it was not found in your environment. Check out the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
"""
# docstyle-ignore
SKLEARN_IMPORT_ERROR = """
{0} requires the scikit-learn library but it was not found in your environment. You can install it with:
```
pip install -U scikit-learn
```
In a notebook or a colab, you can install it by executing a cell with
```
!pip install -U scikit-learn
```
"""
# docstyle-ignore
TENSORFLOW_IMPORT_ERROR = """
{0} requires the TensorFlow library but it was not found in your environment. Check out the instructions on the
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
"""
# docstyle-ignore
FLAX_IMPORT_ERROR = """
{0} requires the Flax library but it was not found in your environment. Check out the instructions on the
installation page: https://github.com/google/flax and follow the ones that match your environment.
"""
# docstyle-ignore
SCATTER_IMPORT_ERROR = """
{0} requires the torch-scatter library but it was not found in your environment. You can install it with pip as
explained here: https://github.com/rusty1s/pytorch_scatter.
"""
# docstyle-ignore
PANDAS_IMPORT_ERROR = """
{0} requires the pandas library but it was not found in your environment. You can install it with pip as
explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html.
"""
def requires_datasets(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_datasets_available():
raise ImportError(DATASETS_IMPORT_ERROR.format(name))
def requires_faiss(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_faiss_available():
raise ImportError(FAISS_IMPORT_ERROR.format(name))
def requires_pytorch(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_torch_available():
raise ImportError(PYTORCH_IMPORT_ERROR.format(name))
def requires_sklearn(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_sklearn_available():
raise ImportError(SKLEARN_IMPORT_ERROR.format(name))
def requires_tf(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_tf_available():
raise ImportError(TENSORFLOW_IMPORT_ERROR.format(name))
def requires_flax(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_flax_available():
raise ImportError(FLAX_IMPORT_ERROR.format(name))
def requires_tokenizers(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_tokenizers_available():
raise ImportError(TOKENIZERS_IMPORT_ERROR.format(name))
def requires_sentencepiece(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_sentencepiece_available():
raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))
def requires_protobuf(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_protobuf_available():
raise ImportError(PROTOBUF_IMPORT_ERROR.format(name))
def requires_pandas(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_pandas_available():
raise ImportError(PANDAS_IMPORT_ERROR.format(name))
def requires_scatter(obj):
name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
if not is_scatter_available():
raise ImportError(SCATTER_IMPORT_ERROR.format(name))
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
return docstring_decorator
def add_start_docstrings_to_model_forward(*docstr):
def docstring_decorator(fn):
class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
note = r"""
.. note::
Although the recipe for forward pass needs to be defined within this function, one should call the
:class:`Module` instance afterwards instead of this since the former takes care of running the pre and post
processing steps while the latter silently ignores them.
"""
fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + "".join(docstr)
return fn
return docstring_decorator
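# Illustrative sketch (not part of the original module): these decorators only concatenate strings onto
# ``__doc__``, so shared argument documentation can be written once and reused across many functions:
#
#   >>> COMMON_ARGS = "\n    Args:\n        x (int): an integer input.\n"
#   >>> @add_start_docstrings("Shared introduction.", COMMON_ARGS)
#   ... def f(x):
#   ...     """Details specific to ``f``."""
#   >>> f.__doc__.startswith("Shared introduction.")
#   True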
PT_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` (if
``return_dict=True`` is passed or when ``config.return_dict=True``) or a tuple of :obj:`torch.FloatTensor`
comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
"""
TF_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`: A :class:`~{full_output_type}` (if
``return_dict=True`` is passed or when ``config.return_dict=True``) or a tuple of :obj:`tf.Tensor` comprising
various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
"""
def _get_indent(t):
"""Returns the indentation in the first line of t"""
search = re.search(r"^(\s*)\S", t)
return "" if search is None else search.groups()[0]
def _convert_output_args_doc(output_args_doc):
"""Convert output_args_doc to display properly."""
# Split output_arg_doc in blocks argument/description
indent = _get_indent(output_args_doc)
blocks = []
current_block = ""
for line in output_args_doc.split("\n"):
# If the indent is the same as the beginning, the line is the name of new arg.
if _get_indent(line) == indent:
if len(current_block) > 0:
blocks.append(current_block[:-1])
current_block = f"{line}\n"
else:
# Otherwise it's part of the description of the current arg.
            # We need to remove 2 spaces from the indentation.
current_block += f"{line[2:]}\n"
blocks.append(current_block[:-1])
# Format each block for proper rendering
for i in range(len(blocks)):
blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i])
blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i])
return "\n".join(blocks)
def _prepare_output_docstrings(output_type, config_class):
"""
Prepares the return part of the docstring using `output_type`.
"""
docstrings = output_type.__doc__
# Remove the head of the docstring to keep the list of args only
lines = docstrings.split("\n")
i = 0
while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None:
i += 1
if i < len(lines):
docstrings = "\n".join(lines[(i + 1) :])
docstrings = _convert_output_args_doc(docstrings)
# Add the return introduction
full_output_type = f"{output_type.__module__}.{output_type.__name__}"
intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION
intro = intro.format(full_output_type=full_output_type, config_class=config_class)
return intro + docstrings
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
PT_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='pt')
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
>>> start_scores = outputs.start_logits
>>> end_scores = outputs.end_logits
"""
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
PT_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
PT_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
PT_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
PT_CAUSAL_LM_SAMPLE = r"""
Example::
>>> import torch
>>> from transformers import {tokenizer_class}, {model_class}
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
>>> outputs = model(inputs)
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
TF_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf')
>>> outputs = model(input_dict)
>>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0]+1])
"""
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs)
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
TF_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
>>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
>>> outputs = model(inputs)
>>> loss = outputs.loss
>>> logits = outputs.logits
"""
TF_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> last_hidden_states = outputs.last_hidden_state
"""
TF_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> logits = outputs.logits
"""
TF_CAUSAL_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> logits = outputs.logits
"""
def add_code_sample_docstrings(
*docstr, tokenizer_class=None, checkpoint=None, output_type=None, config_class=None, mask=None
):
def docstring_decorator(fn):
model_class = fn.__qualname__.split(".")[0]
is_tf_class = model_class[:2] == "TF"
doc_kwargs = dict(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
if "SequenceClassification" in model_class:
code_sample = TF_SEQUENCE_CLASSIFICATION_SAMPLE if is_tf_class else PT_SEQUENCE_CLASSIFICATION_SAMPLE
elif "QuestionAnswering" in model_class:
code_sample = TF_QUESTION_ANSWERING_SAMPLE if is_tf_class else PT_QUESTION_ANSWERING_SAMPLE
elif "TokenClassification" in model_class:
code_sample = TF_TOKEN_CLASSIFICATION_SAMPLE if is_tf_class else PT_TOKEN_CLASSIFICATION_SAMPLE
elif "MultipleChoice" in model_class:
code_sample = TF_MULTIPLE_CHOICE_SAMPLE if is_tf_class else PT_MULTIPLE_CHOICE_SAMPLE
elif "MaskedLM" in model_class or model_class in ["FlaubertWithLMHeadModel", "XLMWithLMHeadModel"]:
doc_kwargs["mask"] = "[MASK]" if mask is None else mask
code_sample = TF_MASKED_LM_SAMPLE if is_tf_class else PT_MASKED_LM_SAMPLE
elif "LMHead" in model_class or "CausalLM" in model_class:
code_sample = TF_CAUSAL_LM_SAMPLE if is_tf_class else PT_CAUSAL_LM_SAMPLE
elif "Model" in model_class or "Encoder" in model_class:
code_sample = TF_BASE_MODEL_SAMPLE if is_tf_class else PT_BASE_MODEL_SAMPLE
else:
raise ValueError(f"Docstring can't be built for model {model_class}")
output_doc = _prepare_output_docstrings(output_type, config_class) if output_type is not None else ""
built_doc = code_sample.format(**doc_kwargs)
fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + output_doc + built_doc
return fn
return docstring_decorator
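# Illustrative usage sketch (toy class and placeholder checkpoint names, not the real models): the sample
# template is picked from the decorated method's qualified name -- a "TF" prefix selects the TensorFlow
# samples, the task suffix ("SequenceClassification", "QuestionAnswering", ...) selects the task -- and is
# then formatted with ``tokenizer_class`` and ``checkpoint``:
#
#   >>> class ToyForSequenceClassification:
#   ...     @add_code_sample_docstrings(tokenizer_class="BertTokenizer", checkpoint="bert-base-uncased")
#   ...     def forward(self):
#   ...         """Existing docstring."""
#   >>> "bert-base-uncased" in ToyForSequenceClassification.forward.__doc__
#   True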
def replace_return_docstrings(output_type=None, config_class=None):
def docstring_decorator(fn):
docstrings = fn.__doc__
lines = docstrings.split("\n")
i = 0
while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None:
i += 1
if i < len(lines):
lines[i] = _prepare_output_docstrings(output_type, config_class)
docstrings = "\n".join(lines)
else:
raise ValueError(
f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}"
)
fn.__doc__ = docstrings
return fn
return docstring_decorator
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")
def hf_bucket_url(
model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None
) -> str:
"""
Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting
to Cloudfront (a Content Delivery Network, or CDN) for large files.
Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
bandwidth costs).
Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here
because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront
in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache
can't ever be stale.
    In terms of client-side caching from this library, we base our caching on the objects' ETag. An object's ETag is
    its sha1 if it is stored in git, or its sha256 if it is stored in git-lfs. Files cached locally from transformers
    before v3.5.0 are not shared with these new files, because the cached file's name contains a hash of the url
    (which changed).
"""
if subfolder is not None:
filename = f"{subfolder}/{filename}"
if mirror:
endpoint = PRESET_MIRROR_DICT.get(mirror, mirror)
legacy_format = "/" not in model_id
if legacy_format:
return f"{endpoint}/{model_id}-{filename}"
else:
return f"{endpoint}/{model_id}/{filename}"
if revision is None:
revision = "main"
return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
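# Illustrative sketch (hypothetical repo and file names), assuming the default ``HUGGINGFACE_CO_PREFIX``
# defined earlier in this module: the resolved URL has the shape
# "<endpoint>/<model_id>/resolve/<revision>/<filename>", e.g.
#
#   >>> hf_bucket_url("bert-base-uncased", "config.json")
#   'https://huggingface.co/bert-base-uncased/resolve/main/config.json'
#   >>> hf_bucket_url("username/my-model", "pytorch_model.bin", revision="v1.0")
#   'https://huggingface.co/username/my-model/resolve/v1.0/pytorch_model.bin'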
def url_to_filename(url: str, etag: Optional[str] = None) -> str:
"""
Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
    delimited by a period. If the url ends with .h5 (Keras HDF5 weights), '.h5' is appended to the name so that TF 2.0
    can identify it as an HDF5 file (see
https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
"""
url_bytes = url.encode("utf-8")
filename = sha256(url_bytes).hexdigest()
if etag:
etag_bytes = etag.encode("utf-8")
filename += "." + sha256(etag_bytes).hexdigest()
if url.endswith(".h5"):
filename += ".h5"
return filename
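# Illustrative sketch: the cache filename is sha256(url), plus "." + sha256(etag) when an ETag is known, so
# a new upstream version (new ETag) never clobbers an older cached file:
#
#   >>> name = url_to_filename("https://example.com/weights.h5", etag='"abc"')   # hypothetical url and etag
#   >>> name.endswith(".h5"), name.count(".")
#   (True, 2)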
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`. Raise ``EnvironmentError`` if `filename` or
its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + ".json"
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata["url"]
etag = metadata["etag"]
return url, etag
def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]:
"""
Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape
    :obj:`(model_url, etag, size_MB)`. Filenames in :obj:`cache_dir` are used to get the metadata for each model; only
    urls ending with `.bin` are added.
Args:
cache_dir (:obj:`Union[str, Path]`, `optional`):
The cache directory to search for models within. Will default to the transformers cache if unset.
Returns:
List[Tuple]: List of tuples each with shape :obj:`(model_url, etag, size_MB)`
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
elif isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cached_models = []
for file in os.listdir(cache_dir):
if file.endswith(".json"):
meta_path = os.path.join(cache_dir, file)
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata["url"]
etag = metadata["etag"]
if url.endswith(".bin"):
                    size_MB = os.path.getsize(meta_path[: -len(".json")]) / 1e6  # strip the ".json" suffix, not a character set
cached_models.append((url, etag, size_MB))
return cached_models
def cached_path(
url_or_filename,
cache_dir=None,
force_download=False,
proxies=None,
resume_download=False,
user_agent: Union[Dict, str, None] = None,
extract_compressed_file=False,
force_extract=False,
use_auth_token: Union[bool, str, None] = None,
local_files_only=False,
) -> Optional[str]:
"""
Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file
and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and
    then return the path.
Args:
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
force_download: if True, re-download the file even if it's already cached in the cache dir.
resume_download: if True, resume the download if incompletely received file is found.
user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True,
will get token from ~/.huggingface.
        extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
            file in a folder alongside the archive.
        force_extract: if True, when extract_compressed_file is True and the archive was already extracted,
            re-extract the archive and overwrite the folder where it was extracted.
Return:
Local path (string) of file or if networking is off, last version of file cached on disk.
Raises:
In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
if is_remote_url(url_or_filename):
# URL, so get it from the cache (downloading if necessary)
output_path = get_from_cache(
url_or_filename,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
user_agent=user_agent,
use_auth_token=use_auth_token,
local_files_only=local_files_only,
)
elif os.path.exists(url_or_filename):
# File, and it exists.
output_path = url_or_filename
elif urlparse(url_or_filename).scheme == "":
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
if extract_compressed_file:
if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
return output_path
# Path where we extract compressed archives
# We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
output_dir, output_file = os.path.split(output_path)
output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
return output_path_extracted
# Prevent parallel extractions
lock_path = output_path + ".lock"
with FileLock(lock_path):
shutil.rmtree(output_path_extracted, ignore_errors=True)
os.makedirs(output_path_extracted)
if is_zipfile(output_path):
with ZipFile(output_path, "r") as zip_file:
zip_file.extractall(output_path_extracted)
zip_file.close()
elif tarfile.is_tarfile(output_path):
tar_file = tarfile.open(output_path)
tar_file.extractall(output_path_extracted)
tar_file.close()
else:
raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
return output_path_extracted
return output_path
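# Illustrative usage sketch (hypothetical names): the usual flow in this module is to build the remote URL
# with ``hf_bucket_url`` and hand it to ``cached_path``, which downloads on the first call, reuses the
# cached copy afterwards, and returns a local filesystem path:
#
#   >>> url = hf_bucket_url("bert-base-uncased", "config.json")
#   >>> local_config = cached_path(url)
#   >>> local_dir = cached_path("model.tar.gz", extract_compressed_file=True)   # returns the extracted dir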
def define_sagemaker_information():
try:
instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json()
dlc_container_used = instance_data["Image"]
dlc_tag = instance_data["Image"].split(":")[1]
except Exception:
dlc_container_used = None
dlc_tag = None
sagemaker_params = json.loads(os.getenv("SM_FRAMEWORK_PARAMS", "{}"))
runs_distributed_training = True if "sagemaker_distributed_dataparallel_enabled" in sagemaker_params else False
account_id = os.getenv("TRAINING_JOB_ARN").split(":")[4] if "TRAINING_JOB_ARN" in os.environ else None
sagemaker_object = {
"sm_framework": os.getenv("SM_FRAMEWORK_MODULE", None),
"sm_region": os.getenv("AWS_REGION", None),
"sm_number_gpu": os.getenv("SM_NUM_GPUS", 0),
"sm_number_cpu": os.getenv("SM_NUM_CPUS", 0),
"sm_distributed_training": runs_distributed_training,
"sm_deep_learning_container": dlc_container_used,
"sm_deep_learning_container_tag": dlc_tag,
"sm_account_id": account_id,
}
return sagemaker_object
def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
"""
Formats a user-agent string with basic info about a request.
"""
ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
if is_torch_available():
ua += f"; torch/{_torch_version}"
if is_tf_available():
ua += f"; tensorflow/{_tf_version}"
if is_training_run_on_sagemaker():
ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items())
if isinstance(user_agent, dict):
ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
elif isinstance(user_agent, str):
ua += "; " + user_agent
return ua
def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None):
"""
    Download a remote file. Do not gobble up errors.
"""
    headers = copy.deepcopy(headers) if headers is not None else {}
if resume_size > 0:
headers["Range"] = "bytes=%d-" % (resume_size,)
r = requests.get(url, stream=True, proxies=proxies, headers=headers)
r.raise_for_status()
content_length = r.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
progress = tqdm(
unit="B",
unit_scale=True,
total=total,
initial=resume_size,
desc="Downloading",
disable=bool(logging.get_verbosity() == logging.NOTSET),
)
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
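# Illustrative sketch: resuming works by requesting only the missing byte range, so a caller appends to the
# partially downloaded file instead of restarting (mirroring what ``get_from_cache`` does below with its
# ".incomplete" files; the file and url names here are hypothetical):
#
#   >>> resume_size = os.stat("model.bin.incomplete").st_size
#   >>> with open("model.bin.incomplete", "ab") as f:
#   ...     http_get("https://example.com/model.bin", f, resume_size=resume_size)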
def get_from_cache(
url: str,
cache_dir=None,
force_download=False,
proxies=None,
etag_timeout=10,
resume_download=False,
user_agent: Union[Dict, str, None] = None,
use_auth_token: Union[bool, str, None] = None,
local_files_only=False,
) -> Optional[str]:
"""
Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
path to the cached file.
Return:
Local path (string) of file or if networking is off, last version of file cached on disk.
Raises:
In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
headers = {"user-agent": http_user_agent(user_agent)}
if isinstance(use_auth_token, str):
headers["authorization"] = "Bearer {}".format(use_auth_token)
elif use_auth_token:
token = HfFolder.get_token()
if token is None:
raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
headers["authorization"] = "Bearer {}".format(token)
url_to_download = url
etag = None
if not local_files_only:
try:
r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
r.raise_for_status()
etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
# We favor a custom header indicating the etag of the linked resource, and
# we fallback to the regular etag header.
# If we don't have any of those, raise an error.
if etag is None:
raise OSError(
"Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
)
# In case of a redirect,
# save an extra redirect on the request.get call,
# and ensure we download the exact atomic version even if it changed
# between the HEAD and the GET (unlikely, but hey).
if 300 <= r.status_code <= 399:
url_to_download = r.headers["Location"]
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
# etag is already None
pass
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
# etag is None == we don't have a connection or we passed local_files_only.
# try to get the last downloaded one
if etag is None:
if os.path.exists(cache_path):
return cache_path
else:
matching_files = [
file
for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
if not file.endswith(".json") and not file.endswith(".lock")
]
if len(matching_files) > 0:
return os.path.join(cache_dir, matching_files[-1])
else:
# If files cannot be found and local_files_only=True,
# the models might've been found if local_files_only=False
# Notify the user about that
if local_files_only:
raise FileNotFoundError(
"Cannot find the requested files in the cached path and outgoing traffic has been"
" disabled. To enable model look-ups and downloads online, set 'local_files_only'"
" to False."
)
else:
raise ValueError(
"Connection error, and we cannot find the requested files in the cached path."
" Please try again or make sure your Internet connection is on."
)
# From now on, etag is not None.
if os.path.exists(cache_path) and not force_download:
return cache_path
# Prevent parallel downloads of the same file with a lock.
lock_path = cache_path + ".lock"
with FileLock(lock_path):
# If the download just completed while the lock was activated.
if os.path.exists(cache_path) and not force_download:
# Even if returning early like here, the lock will be released.
return cache_path
if resume_download:
incomplete_path = cache_path + ".incomplete"
@contextmanager
def _resumable_file_manager() -> "io.BufferedWriter":
with open(incomplete_path, "ab") as f:
yield f
temp_file_manager = _resumable_file_manager
if os.path.exists(incomplete_path):
resume_size = os.stat(incomplete_path).st_size
else:
resume_size = 0
else:
temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
resume_size = 0
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers)
logger.info("storing %s in cache at %s", url, cache_path)
os.replace(temp_file.name, cache_path)
logger.info("creating metadata file for %s", cache_path)
meta = {"url": url, "etag": etag}
meta_path = cache_path + ".json"
with open(meta_path, "w") as meta_file:
json.dump(meta, meta_file)
return cache_path
class cached_property(property):
"""
Descriptor that mimics @property but caches output in member variable.
    Adapted from tensorflow_datasets. Built into functools from Python 3.8.
"""
def __get__(self, obj, objtype=None):
# See docs.python.org/3/howto/descriptor.html#properties
if obj is None:
return self
if self.fget is None:
raise AttributeError("unreadable attribute")
attr = "__cached_" + self.fget.__name__
cached = getattr(obj, attr, None)
if cached is None:
cached = self.fget(obj)
setattr(obj, attr, cached)
return cached
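# Illustrative sketch: the computed value is stored on the instance under "__cached_<name>", so the getter
# body runs at most once per object (a ``None`` result is not cached):
#
#   >>> class Dataset:
#   ...     @cached_property
#   ...     def size(self):
#   ...         print("computing")
#   ...         return 42
#   >>> d = Dataset()
#   >>> d.size
#   computing
#   42
#   >>> d.size          # second access hits the cache, nothing is printed
#   42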
def torch_required(func):
# Chose a different decorator name than in tests so it's clear they are not the same.
@wraps(func)
def wrapper(*args, **kwargs):
if is_torch_available():
return func(*args, **kwargs)
else:
raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
return wrapper
def tf_required(func):
# Chose a different decorator name than in tests so it's clear they are not the same.
@wraps(func)
def wrapper(*args, **kwargs):
if is_tf_available():
return func(*args, **kwargs)
else:
raise ImportError(f"Method `{func.__name__}` requires TF.")
return wrapper
def is_tensor(x):
""" Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`. """
if is_torch_available():
import torch
if isinstance(x, torch.Tensor):
return True
if is_tf_available():
import tensorflow as tf
if isinstance(x, tf.Tensor):
return True
return isinstance(x, np.ndarray)
def _is_numpy(x):
return isinstance(x, np.ndarray)
def _is_torch(x):
import torch
return isinstance(x, torch.Tensor)
def _is_torch_device(x):
import torch
return isinstance(x, torch.device)
def _is_tensorflow(x):
import tensorflow as tf
return isinstance(x, tf.Tensor)
def _is_jax(x):
import jax.numpy as jnp # noqa: F811
return isinstance(x, jnp.ndarray)
def to_py_obj(obj):
"""
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
"""
if isinstance(obj, (dict, UserDict)):
return {k: to_py_obj(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple)):
return [to_py_obj(o) for o in obj]
elif is_tf_available() and _is_tensorflow(obj):
return obj.numpy().tolist()
elif is_torch_available() and _is_torch(obj):
return obj.detach().cpu().tolist()
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj
class ModelOutput(OrderedDict):
"""
    Base class for all model outputs as dataclasses. Has a ``__getitem__`` that allows indexing by integer or slice
    (like a tuple) or by string (like a dictionary), while ignoring the ``None`` attributes. Otherwise behaves like a
    regular python dictionary.
.. warning::
You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
method to convert it to a tuple before.
"""
def __post_init__(self):
class_fields = fields(self)
# Safety and consistency checks
assert len(class_fields), f"{self.__class__.__name__} has no fields."
assert all(
field.default is None for field in class_fields[1:]
), f"{self.__class__.__name__} should not have more than one required field."
first_field = getattr(self, class_fields[0].name)
other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
if other_fields_are_none and not is_tensor(first_field):
try:
iterator = iter(first_field)
first_field_iterator = True
except TypeError:
first_field_iterator = False
# if we provided an iterator as first field and the iterator is a (key, value) iterator
# set the associated fields
if first_field_iterator:
for element in iterator:
if (
not isinstance(element, (list, tuple))
or not len(element) == 2
or not isinstance(element[0], str)
):
break
setattr(self, element[0], element[1])
if element[1] is not None:
self[element[0]] = element[1]
elif first_field is not None:
self[class_fields[0].name] = first_field
else:
for field in class_fields:
v = getattr(self, field.name)
if v is not None:
self[field.name] = v
def __delitem__(self, *args, **kwargs):
raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
def setdefault(self, *args, **kwargs):
raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
def pop(self, *args, **kwargs):
raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
def update(self, *args, **kwargs):
raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
def __getitem__(self, k):
if isinstance(k, str):
inner_dict = {k: v for (k, v) in self.items()}
return inner_dict[k]
else:
return self.to_tuple()[k]
def __setattr__(self, name, value):
if name in self.keys() and value is not None:
# Don't call self.__setitem__ to avoid recursion errors
super().__setitem__(name, value)
super().__setattr__(name, value)
def __setitem__(self, key, value):
# Will raise a KeyException if needed
super().__setitem__(key, value)
# Don't call self.__setattr__ to avoid recursion errors
super().__setattr__(key, value)
def to_tuple(self) -> Tuple[Any]:
"""
Convert self to a tuple containing all the attributes/keys that are not ``None``.
"""
return tuple(self[k] for k in self.keys())
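# Illustrative sketch (toy output class): subclasses are dataclasses whose ``None`` fields are skipped, so
# string keys, integer indexing and ``to_tuple`` stay consistent with each other:
#
#   >>> from dataclasses import dataclass
#   >>> from typing import Optional
#   >>> @dataclass
#   ... class ToyOutput(ModelOutput):
#   ...     loss: Optional[float] = None
#   ...     logits: Optional[list] = None
#   >>> out = ToyOutput(logits=[0.1, 0.9])
#   >>> out["logits"], out[0], out.to_tuple()
#   ([0.1, 0.9], [0.1, 0.9], ([0.1, 0.9],))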
class ExplicitEnum(Enum):
"""
Enum with more explicit error message for missing values.
"""
@classmethod
def _missing_(cls, value):
raise ValueError(
"%r is not a valid %s, please select one of %s"
% (value, cls.__name__, str(list(cls._value2member_map_.keys())))
)
class PaddingStrategy(ExplicitEnum):
"""
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
in an IDE.
"""
LONGEST = "longest"
MAX_LENGTH = "max_length"
DO_NOT_PAD = "do_not_pad"
class TensorType(ExplicitEnum):
"""
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
tab-completion in an IDE.
"""
PYTORCH = "pt"
TENSORFLOW = "tf"
NUMPY = "np"
JAX = "jax"
class _BaseLazyModule(ModuleType):
"""
Module class that surfaces all objects but only performs associated imports when the objects are requested.
"""
# Very heavily inspired by optuna.integration._IntegrationModule
# https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
def __init__(self, name, import_structure):
super().__init__(name)
self._modules = set(import_structure.keys())
self._class_to_module = {}
for key, values in import_structure.items():
for value in values:
self._class_to_module[value] = key
# Needed for autocompletion in an IDE
self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), [])
# Needed for autocompletion in an IDE
def __dir__(self):
return super().__dir__() + self.__all__
def __getattr__(self, name: str) -> Any:
if name in self._modules:
value = self._get_module(name)
elif name in self._class_to_module.keys():
module = self._get_module(self._class_to_module[name])
value = getattr(module, name)
else:
raise AttributeError(f"module {self.__name__} has no attribute {name}")
setattr(self, name, value)
return value
def _get_module(self, module_name: str) -> ModuleType:
raise NotImplementedError
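# Illustrative sketch (hypothetical package layout): a concrete subclass only has to turn a submodule name
# into the imported module; attribute access then triggers the import on first use, which is the pattern a
# lazy package ``__init__`` can use to keep the top-level import cheap:
#
#   >>> class _LazyModule(_BaseLazyModule):
#   ...     def _get_module(self, module_name):
#   ...         return importlib.import_module("." + module_name, self.__name__)
#   >>> sys.modules[__name__] = _LazyModule(__name__, {"tokenization_bert": ["BertTokenizer"]})
#   >>> # accessing ``BertTokenizer`` on the package now imports ``tokenization_bert`` lazily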
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from collections import UserDict
from typing import Optional, Tuple
import torch
from file_utils import add_start_docstrings
PROCESS_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses.
next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
:obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses.
next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond.
pad_token_id (:obj:`int`, `optional`):
The id of the `padding` token.
eos_token_id (:obj:`int`, `optional`):
The id of the `end-of-sequence` token.
Return:
:obj:`UserDict`: A dictionary composed of the fields as defined above:
- **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated
scores of all non-finished beams.
- **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens
to be added to the non-finished beam_hypotheses.
- **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices
indicating to which beam the next tokens shall be added.
"""
FINALIZE_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using any class inheriting from :class:`~transformers.PretrainedTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
The final scores of all non-finished beams.
final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
The last tokens to be added to the non-finished beam_hypotheses.
final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added.
pad_token_id (:obj:`int`, `optional`):
The id of the `padding` token.
eos_token_id (:obj:`int`, `optional`):
The id of the `end-of-sequence` token.
Return:
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
batches finished early due to the :obj:`eos_token_id`.
"""
class BeamScorer(ABC):
"""
Abstract base class for all beam scorers that are used for :meth:`~transformers.PretrainedModel.beam_search` and
:meth:`~transformers.PretrainedModel.beam_sample`.
"""
@abstractmethod
@add_start_docstrings(PROCESS_INPUTS_DOCSTRING)
def process(
self,
input_ids: torch.LongTensor,
next_scores: torch.FloatTensor,
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
**kwargs
) -> Tuple[torch.Tensor]:
raise NotImplementedError("This is an abstract method.")
@abstractmethod
@add_start_docstrings(FINALIZE_INPUTS_DOCSTRING)
def finalize(
self,
input_ids: torch.LongTensor,
next_scores: torch.FloatTensor,
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
**kwargs
) -> torch.LongTensor:
raise NotImplementedError("This is an abstract method.")
class BeamSearchScorer(BeamScorer):
r"""
:class:`transformers.BeamScorer` implementing standard beam search decoding.
Adapted in part from `Facebook's XLM beam search code
<https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__.
Reference for the diverse beam search algorithm and implementation `Ashwin Kalyan's DBS implementation
<https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua>`__
Args:
batch_size (:obj:`int`):
Batch Size of :obj:`input_ids` for which standard beam search decoding is run in parallel.
max_length (:obj:`int`):
The maximum length of the sequence to be generated.
num_beams (:obj:`int`):
Number of beams for beam search.
device (:obj:`torch.device`):
Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of
:obj:`BeamSearchScorer` will be allocated.
length_penalty (:obj:`float`, `optional`, defaults to 1.0):
Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
sequences.
do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1):
The number of beam hypotheses that shall be returned upon calling
            :meth:`~transformers.BeamSearchScorer.finalize`.
num_beam_groups (:obj:`int`):
Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
"""
def __init__(
self,
batch_size: int,
max_length: int,
num_beams: int,
device: torch.device,
length_penalty: Optional[float] = 1.0,
do_early_stopping: Optional[bool] = False,
num_beam_hyps_to_keep: Optional[int] = 1,
num_beam_groups: Optional[int] = 1,
):
self.max_length = max_length
self.num_beams = num_beams
self.device = device
self.length_penalty = length_penalty
self.do_early_stopping = do_early_stopping
self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
self.num_beam_groups = num_beam_groups
self.group_size = self.num_beams // self.num_beam_groups
self._is_init = False
self._beam_hyps = [
BeamHypotheses(
num_beams=self.num_beams,
max_length=self.max_length,
length_penalty=self.length_penalty,
early_stopping=self.do_early_stopping,
)
for _ in range(batch_size)
]
self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device)
if not isinstance(num_beams, int) or num_beams <= 1:
raise ValueError(
f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1, one should make use of `greedy_search` instead."
)
if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0):
raise ValueError(
f"`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` "
f"has to be divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}."
)
@property
def is_done(self) -> bool:
return self._done.all()
def process(
self,
input_ids: torch.LongTensor,
next_scores: torch.FloatTensor,
next_tokens: torch.LongTensor,
next_indices: torch.LongTensor,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
) -> Tuple[torch.Tensor]:
cur_len = input_ids.shape[-1]
batch_size = len(self._beam_hyps)
assert batch_size == (input_ids.shape[0] // self.group_size)
device = input_ids.device
next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device)
next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device)
next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device)
for batch_idx, beam_hyp in enumerate(self._beam_hyps):
if self._done[batch_idx]:
assert (
len(beam_hyp) >= self.num_beams
), "Batch can only be done if at least {} beams have been generated".format(self.num_beams)
assert (
eos_token_id is not None and pad_token_id is not None
), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined"
# pad the batch
next_beam_scores[batch_idx, :] = 0
next_beam_tokens[batch_idx, :] = pad_token_id
next_beam_indices[batch_idx, :] = 0
continue
# next tokens for this sentence
beam_idx = 0
for beam_token_rank, (next_token, next_score, next_index) in enumerate(
zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
):
batch_beam_idx = batch_idx * self.group_size + next_index
# add to generated hypotheses if end of sentence
if (eos_token_id is not None) and (next_token.item() == eos_token_id):
# if beam_token does not belong to top num_beams tokens, it should not be added
is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
if is_beam_token_worse_than_top_num_beams:
continue
beam_hyp.add(
input_ids[batch_beam_idx].clone(),
next_score.item(),
)
else:
# add next predicted token since it is not eos_token
next_beam_scores[batch_idx, beam_idx] = next_score
next_beam_tokens[batch_idx, beam_idx] = next_token
next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
beam_idx += 1
# once the beam for next step is full, don't add more tokens to it.
if beam_idx == self.group_size:
break
if beam_idx < self.group_size:
raise ValueError(
f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected."
)
# Check if we are done so that we can save a pad step if all(done)
self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
next_scores[batch_idx].max().item(), cur_len
)
return UserDict(
{
"next_beam_scores": next_beam_scores.view(-1),
"next_beam_tokens": next_beam_tokens.view(-1),
"next_beam_indices": next_beam_indices.view(-1),
}
)
def finalize(
self,
input_ids: torch.LongTensor,
final_beam_scores: torch.FloatTensor,
final_beam_tokens: torch.LongTensor,
final_beam_indices: torch.LongTensor,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
) -> Tuple[torch.LongTensor]:
batch_size = len(self._beam_hyps)
# finalize all open beam hypotheses and add to generated hypotheses
for batch_idx, beam_hyp in enumerate(self._beam_hyps):
if self._done[batch_idx]:
continue
# all open beam hypotheses are added to the beam hypothesis
# beam hypothesis class automatically keeps the best beams
for beam_id in range(self.num_beams):
batch_beam_idx = batch_idx * self.num_beams + beam_id
final_score = final_beam_scores[batch_beam_idx].item()
final_tokens = input_ids[batch_beam_idx]
beam_hyp.add(final_tokens, final_score)
# select the best hypotheses
sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
best = []
best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32)
# retrieve best hypotheses
for i, beam_hyp in enumerate(self._beam_hyps):
sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
for j in range(self.num_beam_hyps_to_keep):
best_hyp_tuple = sorted_hyps.pop()
best_score = best_hyp_tuple[0]
best_hyp = best_hyp_tuple[1]
sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
# append to lists
best.append(best_hyp)
best_scores[i * self.num_beam_hyps_to_keep + j] = best_score
# prepare for adding eos
sent_max_len = min(sent_lengths.max().item() + 1, self.max_length)
decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len)
# shorter batches are padded if needed
if sent_lengths.min().item() != sent_lengths.max().item():
assert pad_token_id is not None, "`pad_token_id` has to be defined"
decoded.fill_(pad_token_id)
# fill with hypotheses and eos_token_id if the latter fits in
for i, hypo in enumerate(best):
decoded[i, : sent_lengths[i]] = hypo
if sent_lengths[i] < self.max_length:
decoded[i, sent_lengths[i]] = eos_token_id
return UserDict(
{
"sequences": decoded,
"sequence_scores": best_scores,
}
)
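# Illustrative interaction sketch (shapes only; in practice these calls sit inside the beam search loop of
# ``generate``): at every step the caller passes the top ``2 * num_beams`` candidates per batch element,
# ``process`` keeps ``num_beams`` of them, and ``finalize`` pads and returns the best finished sequences.
#
#   >>> scorer = BeamSearchScorer(batch_size=2, max_length=20, num_beams=4, device=torch.device("cpu"))
#   >>> # next_scores, next_tokens, next_indices: shape (batch_size, 2 * num_beams)
#   >>> # out = scorer.process(input_ids, next_scores, next_tokens, next_indices,
#   >>> #                      pad_token_id=0, eos_token_id=2)
#   >>> # out["next_beam_scores"].shape -> (batch_size * num_beams,)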
class BeamHypotheses:
def __init__(self, num_beams: int, max_length: int, length_penalty: float, early_stopping: bool):
"""
Initialize n-best list of hypotheses.
"""
self.max_length = max_length - 1 # ignoring bos_token
self.length_penalty = length_penalty
self.early_stopping = early_stopping
self.num_beams = num_beams
self.beams = []
self.worst_score = 1e9
def __len__(self):
"""
Number of hypotheses in the list.
"""
return len(self.beams)
def add(self, hyp: torch.LongTensor, sum_logprobs: float):
"""
Add a new hypothesis to the list.
"""
score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
if len(self) < self.num_beams or score > self.worst_score:
self.beams.append((score, hyp))
if len(self) > self.num_beams:
sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
del self.beams[sorted_next_scores[0][1]]
self.worst_score = sorted_next_scores[1][0]
else:
self.worst_score = min(score, self.worst_score)
def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
"""
        If there are enough hypotheses and none of the hypotheses being generated can become better than the worst one
        in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
return False
elif self.early_stopping:
return True
else:
cur_score = best_sum_logprobs / cur_len ** self.length_penalty
ret = self.worst_score >= cur_score
return ret
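# Worked example of the ranking rule above: a hypothesis is scored as sum_logprobs / len(hyp) ** length_penalty,
# so with length_penalty=1.0 a 10-token hypothesis with total log-probability -5.0 scores -0.5, while a
# 20-token hypothesis with total -8.0 scores -0.4 and therefore ranks higher:
#
#   >>> hyps = BeamHypotheses(num_beams=2, max_length=20, length_penalty=1.0, early_stopping=False)
#   >>> hyps.add(torch.ones(10, dtype=torch.long), sum_logprobs=-5.0)
#   >>> hyps.add(torch.ones(20, dtype=torch.long), sum_logprobs=-8.0)
#   >>> sorted(score for score, _ in hyps.beams)
#   [-0.5, -0.4]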
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import math
from abc import ABC
from typing import Callable, Iterable, List
import numpy as np
import torch
from file_utils import add_start_docstrings
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax.
kwargs:
Additional logits processor specific kwargs.
Return:
:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores.
"""
class LogitsProcessor(ABC):
"""Abstract base class for all logit processors that can be applied during generation."""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
"""Torch method for processing logits."""
raise NotImplementedError(
f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
)
class LogitsWarper(ABC):
"""Abstract base class for all logit warpers that can be applied during generation with multinomial sampling."""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
"""Torch method for warping logits."""
raise NotImplementedError(
f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
)
class LogitsProcessorList(list):
"""
This class can be used to create a list of :class:`~transformers.LogitsProcessor` or
:class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from
    list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or
    :class:`~transformers.LogitsWarper` to the inputs.
"""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
for processor in self:
function_args = inspect.signature(processor.__call__).parameters
if len(function_args) > 2:
assert all(
arg in kwargs for arg in list(function_args.keys())[2:]
), f"Make sure that all the required parameters: {list(function_args.keys())} for {processor.__class__} are passed to the logits processor."
scores = processor(input_ids, scores, **kwargs)
else:
scores = processor(input_ids, scores)
return scores
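# A minimal usage sketch with illustrative values: processors are applied in order, and extra
# keyword arguments are forwarded only to processors whose __call__ declares them.
# MinLengthLogitsProcessor and TemperatureLogitsWarper are defined further down in this module.
def _example_logits_processor_list():
    processors = LogitsProcessorList(
        [MinLengthLogitsProcessor(min_length=5, eos_token_id=3), TemperatureLogitsWarper(0.7)]
    )
    input_ids = torch.tensor([[0, 1]])   # (batch_size=1, cur_len=2)
    scores = torch.randn(1, 4)           # (batch_size, vocab_size)
    return processors(input_ids, scores) # EOS masked first, then logits rescaled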
class MinLengthLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.
Args:
min_length (:obj:`int`):
The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
eos_token_id (:obj:`int`):
The id of the `end-of-sequence` token.
"""
def __init__(self, min_length: int, eos_token_id: int):
if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
if not isinstance(eos_token_id, int) or eos_token_id < 0:
raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
self.min_length = min_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
cur_len = input_ids.shape[-1]
if cur_len < self.min_length:
scores[:, self.eos_token_id] = -float("inf")
return scores
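# A minimal usage sketch with a toy 4-token vocabulary: EOS (id 3) is masked to -inf because the
# current length (2) is still below min_length (5). All token ids and shapes are illustrative.
def _example_min_length_processor():
    processor = MinLengthLogitsProcessor(min_length=5, eos_token_id=3)
    input_ids = torch.tensor([[0, 1], [2, 1]])  # (batch_size=2, cur_len=2)
    scores = torch.zeros(2, 4)                  # (batch_size, vocab_size)
    scores = processor(input_ids, scores)
    assert torch.isinf(scores[:, 3]).all()      # EOS column is -inf
    return scores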
class TemperatureLogitsWarper(LogitsWarper):
r"""
:class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution).
Args:
temperature (:obj:`float`):
The value used to module the logits distribution.
"""
def __init__(self, temperature: float):
if not isinstance(temperature, float) or not (temperature > 0):
raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
self.temperature = temperature
def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
scores = scores / self.temperature
return scores
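# A minimal usage sketch: a temperature below 1 sharpens the distribution by scaling up the
# logits before softmax. The values below are illustrative only.
def _example_temperature_warper():
    warper = TemperatureLogitsWarper(temperature=0.5)
    scores = torch.tensor([[1.0, 2.0, 3.0]])
    return warper(input_ids=torch.tensor([[0]]), scores=scores)  # -> [[2.0, 4.0, 6.0]]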
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` enforcing an exponential penalty on repeated sequences.
Args:
repetition_penalty (:obj:`float`):
The parameter for repetition penalty. 1.0 means no penalty. See `this paper
<https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
"""
def __init__(self, penalty: float):
if not isinstance(penalty, float) or not (penalty > 0):
raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
self.penalty = penalty
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
score = torch.gather(scores, 1, input_ids)
# if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
score = torch.where(score < 0, score * self.penalty, score / self.penalty)
scores.scatter_(1, input_ids, score)
return scores
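# A minimal usage sketch: tokens already present in input_ids have their logits divided by the
# penalty when positive (or multiplied when negative), discouraging repetition. Values illustrative.
def _example_repetition_penalty():
    processor = RepetitionPenaltyLogitsProcessor(penalty=2.0)
    input_ids = torch.tensor([[1]])              # token 1 was already generated
    scores = torch.tensor([[1.0, 4.0, -2.0]])
    return processor(input_ids, scores)          # -> [[1.0, 2.0, -2.0]]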
class TopPLogitsWarper(LogitsWarper):
"""
:class:`transformers.LogitsWarper` that performs top-p (nucleus) filtering, i.e. restricting generation to the
smallest set of most probable tokens whose cumulative probability reaches :obj:`top_p`.
Args:
top_p (:obj:`float`):
If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are
kept for generation.
filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
All filtered values will be set to this float value.
min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if not isinstance(top_p, float) or (top_p < 0 or top_p > 1.0):
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
self.top_p = top_p
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
sorted_logits, sorted_indices = torch.sort(scores, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
# Remove tokens with cumulative probability above the threshold (tokens with 0 probability are kept)
sorted_indices_to_remove = cumulative_probs > self.top_p
if self.min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
sorted_indices_to_remove[..., : self.min_tokens_to_keep - 1] = 0
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# scatter sorted tensors to original indexing
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
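# A minimal usage sketch: with top_p=0.9, only the smallest set of tokens whose cumulative
# probability reaches 0.9 keeps finite logits; the remaining tokens are filtered to -inf.
def _example_top_p_warper():
    warper = TopPLogitsWarper(top_p=0.9)
    scores = torch.tensor([[8.0, 7.0, 1.0, 0.0]])  # two dominant tokens
    return warper(input_ids=torch.tensor([[0]]), scores=scores)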
class TopKLogitsWarper(LogitsWarper):
r"""
:class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements.
Args:
top_k (:obj:`int`):
The number of highest probability vocabulary tokens to keep for top-k-filtering.
filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
All filtered values will be set to this float value.
min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
Minimum number of tokens that cannot be filtered.
"""
def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
self.top_k = top_k
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
top_k = min(max(self.top_k, self.min_tokens_to_keep), scores.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None]
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
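# A minimal usage sketch: only the k highest-scoring tokens keep their logits, all others are
# filtered to -inf. Values are illustrative.
def _example_top_k_warper():
    warper = TopKLogitsWarper(top_k=2)
    scores = torch.tensor([[0.5, 3.0, 1.0, 2.0]])
    return warper(input_ids=torch.tensor([[0]]), scores=scores)  # keeps 3.0 and 2.0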
def _get_ngrams(ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int):
generated_ngrams = [{} for _ in range(num_hypos)]
for idx in range(num_hypos):
gen_tokens = prev_input_ids[idx].tolist()
generated_ngram = generated_ngrams[idx]
for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]):
prev_ngram_tuple = tuple(ngram[:-1])
generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
return generated_ngrams
def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
# Before decoding the next token, prevent decoding of ngrams that have already appeared
start_idx = cur_len + 1 - ngram_size
ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist())
return banned_ngrams.get(ngram_idx, [])
def _calc_banned_ngram_tokens(
ngram_size: int, prev_input_ids: torch.Tensor, num_hypos: int, cur_len: int
) -> List[Iterable[int]]:
"""Copied from fairseq for no_repeat_ngram in beam_search"""
if cur_len + 1 < ngram_size:
# return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
return [[] for _ in range(num_hypos)]
generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos)
banned_tokens = [
_get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len)
for hypo_idx in range(num_hypos)
]
return banned_tokens
class NoRepeatNGramLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq
<https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__.
Args:
ngram_size (:obj:`int`):
All ngrams of size :obj:`ngram_size` can only occur once.
"""
def __init__(self, ngram_size: int):
if not isinstance(ngram_size, int) or ngram_size <= 0:
raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
self.ngram_size = ngram_size
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
num_batch_hypotheses = scores.shape[0]
cur_len = input_ids.shape[-1]
banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len)
for i, banned_tokens in enumerate(banned_batch_tokens):
scores[i, banned_tokens] = -float("inf")
return scores
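# A minimal usage sketch: with ngram_size=2 and the bigram (1, 2) already generated, token 2 is
# banned as the continuation of the current prefix ending in 1. Token ids are illustrative.
def _example_no_repeat_ngram():
    processor = NoRepeatNGramLogitsProcessor(ngram_size=2)
    input_ids = torch.tensor([[1, 2, 3, 1]])  # bigram (1, 2) was seen earlier
    scores = torch.zeros(1, 5)
    scores = processor(input_ids, scores)
    assert torch.isinf(scores[0, 2])          # generating 2 after 1 again is banned
    return scores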
class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids.
See `ParlAI <https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350>`__.
Args:
encoder_ngram_size (:obj:`int`):
All ngrams of size :obj:`ngram_size` can only occur within the encoder input ids.
encoder_input_ids (:obj:`torch.LongTensor`):
The encoder_input_ids that should not be repeated within the decoder ids.
"""
def __init__(self, encoder_ngram_size: int, encoder_input_ids: torch.LongTensor):
if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0:
raise ValueError(
f"`encoder_ngram_size` has to be a strictly positive integer, but is {encoder_ngram_size}"
)
self.ngram_size = encoder_ngram_size
if len(encoder_input_ids.shape) == 1:
encoder_input_ids = encoder_input_ids.unsqueeze(0)
self.batch_size = encoder_input_ids.shape[0]
self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
# B x num_beams
num_hypos = scores.shape[0]
num_beams = num_hypos // self.batch_size
cur_len = input_ids.shape[-1]
banned_batch_tokens = [
_get_generated_ngrams(
self.generated_ngrams[hypo_idx // num_beams], input_ids[hypo_idx], self.ngram_size, cur_len
)
for hypo_idx in range(num_hypos)
]
for i, banned_tokens in enumerate(banned_batch_tokens):
scores[i, banned_tokens] = -float("inf")
return scores
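# A minimal usage sketch: bigrams from the encoder input (here (7, 8)) may not be repeated by the
# decoder, so token 8 is masked when the decoder prefix ends in 7. Token ids are illustrative.
def _example_encoder_no_repeat_ngram():
    encoder_input_ids = torch.tensor([[7, 8, 9]])
    processor = EncoderNoRepeatNGramLogitsProcessor(encoder_ngram_size=2, encoder_input_ids=encoder_input_ids)
    input_ids = torch.tensor([[1, 7]])        # decoder prefix ends in 7
    scores = torch.zeros(1, 10)
    scores = processor(input_ids, scores)
    assert torch.isinf(scores[0, 8])
    return scores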
class NoBadWordsLogitsProcessor(LogitsProcessor):
"""
:class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled.
Args:
bad_words_ids (:obj:`List[List[int]]`):
List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
that should not appear in the generated text, use :obj:`tokenizer(bad_word,
add_prefix_space=True).input_ids`.
eos_token_id (:obj:`int`):
The id of the `end-of-sequence` token.
"""
def __init__(self, bad_words_ids: Iterable[Iterable[int]], eos_token_id: int):
if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0:
raise ValueError(f"`bad_words_ids` has to be a non-emtpy list, but is {bad_words_ids}.")
if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids):
raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.")
if any(
any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids)
for bad_word_ids in bad_words_ids
):
raise ValueError(
f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}."
)
self.bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids))
for banned_token_seq in self.bad_words_ids:
assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format(
bad_words_ids
)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
banned_tokens = self._calc_banned_bad_words_ids(input_ids)
scores = self._set_scores_to_inf_for_banned_tokens(scores, banned_tokens)
return scores
def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
if len(tokens) == 0:
# if bad word tokens is just one token always ban it
return True
elif len(tokens) > len(prev_tokens):
# if bad word tokens are longer than prev input_ids they can't be equal
return False
elif prev_tokens[-len(tokens) :].tolist() == tokens:
# if tokens match
return True
else:
return False
def _calc_banned_bad_words_ids(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
banned_tokens = []
for prev_input_ids_slice in prev_input_ids:
banned_tokens_slice = []
for banned_token_seq in self.bad_words_ids:
if self._tokens_match(prev_input_ids_slice, banned_token_seq[:-1]) is False:
# if tokens do not match continue
continue
banned_tokens_slice.append(banned_token_seq[-1])
banned_tokens.append(banned_tokens_slice)
return banned_tokens
def _set_scores_to_inf_for_banned_tokens(self, scores: torch.Tensor, banned_tokens: List[List[int]]) -> torch.Tensor:
"""
Returns the scores with the banned token positions set to `-inf`. Internally the banned tokens are turned into
a sparse mask of [batch index, vocabulary position] coordinates before being applied.
Args:
scores: logits distribution of shape (batch size, vocabulary size)
banned_tokens: list (of length batch_size) of lists of token ids to ban for each batch entry
"""
banned_mask_list = []
for idx, batch_banned_tokens in enumerate(banned_tokens):
for token in batch_banned_tokens:
banned_mask_list.append([idx, token])
if not banned_mask_list:
return scores
banned_mask = torch.LongTensor(banned_mask_list)
indices = torch.ones(len(banned_mask))
# A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. A conversion to dense tensor generates:
# [ 0 1 1 ]
# [ 0 0 0 ]
# [ 1 0 0 ]
banned_mask = (
torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()).to(scores.device).to_dense().bool()
)
scores = scores.masked_fill(banned_mask, -float("inf"))
return scores
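# A minimal usage sketch with illustrative token ids: the single-token bad word [4] is always
# banned, and the two-token sequence [1, 2] bans token 2 only when the previous token is 1.
def _example_no_bad_words():
    processor = NoBadWordsLogitsProcessor(bad_words_ids=[[4], [1, 2]], eos_token_id=0)
    input_ids = torch.tensor([[3, 1]])        # prefix ends in 1
    scores = torch.zeros(1, 5)
    scores = processor(input_ids, scores)
    assert torch.isinf(scores[0, 4]) and torch.isinf(scores[0, 2])
    return scores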
class PrefixConstrainedLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned
constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more
information.
Args:
prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`):
This function constrains the beam search to allowed tokens only at each step. It takes 2
arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed
tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and
the batch ID :obj:`batch_id`.
"""
def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
self._num_beams = num_beams
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
mask = torch.full_like(scores, -math.inf)
for batch_id, beam_sent in enumerate(input_ids.view(-1, self._num_beams, input_ids.shape[-1])):
for beam_id, sent in enumerate(beam_sent):
mask[batch_id * self._num_beams + beam_id, self._prefix_allowed_tokens_fn(batch_id, sent)] = 0
return scores + mask
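# A minimal usage sketch: a prefix_allowed_tokens_fn that only ever allows tokens 0 and 1 masks
# every other vocabulary entry with -inf; num_beams=1 keeps the beam reshape trivial.
def _example_prefix_constrained():
    processor = PrefixConstrainedLogitsProcessor(
        prefix_allowed_tokens_fn=lambda batch_id, sent: [0, 1], num_beams=1
    )
    input_ids = torch.tensor([[2, 3]])
    scores = torch.zeros(1, 5)
    return processor(input_ids, scores)       # columns 2-4 become -inf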
class HammingDiversityLogitsProcessor(LogitsProcessor):
r"""
:class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only
effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse
Solutions from Neural Sequence Models <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
Args:
diversity_penalty (:obj:`float`):
This value is subtracted from a beam's score if it generates a token that has already been chosen by any beam
from another group at the same time step. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled.
num_beams (:obj:`int`):
Number of beams used for group beam search. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for
more details.
num_beam_groups (:obj:`int`):
Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
"""
def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0):
raise ValueError("`diversity_penalty` should be a float strictly larger than 0.")
self._diversity_penalty = diversity_penalty
if not isinstance(num_beams, int) or num_beams < 2:
raise ValueError("`num_beams` should be an integer strictly larger than 1.")
self._num_beams = num_beams
if not isinstance(num_beam_groups, int) or num_beam_groups < 2:
raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.")
if num_beam_groups > num_beams:
raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.")
self._num_sub_beams = num_beams // num_beam_groups
def __call__(
self,
input_ids: torch.LongTensor,
scores: torch.FloatTensor,
current_tokens: torch.LongTensor,
beam_group_idx: int,
) -> torch.FloatTensor:
# hamming diversity: penalise using same token in current group which was used in previous groups at
# the same time step
batch_size = current_tokens.shape[0] // self._num_beams
group_start_idx = beam_group_idx * self._num_sub_beams
group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams)
group_size = group_end_idx - group_start_idx
vocab_size = scores.shape[-1]
if group_start_idx == 0:
return scores
for batch_idx in range(batch_size):
# predicted tokens of last time step of previous groups
previous_group_tokens = current_tokens[
batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx
]
token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device)
scores[batch_idx * group_size : (batch_idx + 1) * group_size] -= self._diversity_penalty * token_frequency
return scores
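# A minimal usage sketch: with 2 beam groups of 2 beams each, tokens already chosen by the first
# group at this step (here 2 and 3) are penalised for the second group (beam_group_idx=1).
def _example_hamming_diversity():
    processor = HammingDiversityLogitsProcessor(diversity_penalty=1.0, num_beams=4, num_beam_groups=2)
    current_tokens = torch.tensor([2, 3, 0, 0])  # first group picked tokens 2 and 3
    input_ids = torch.zeros(2, 3, dtype=torch.long)
    scores = torch.zeros(2, 5)                   # scores for the 2 beams of the second group
    return processor(input_ids, scores, current_tokens=current_tokens, beam_group_idx=1)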
class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
r"""
:class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token.
Args:
bos_token_id (:obj:`int`):
The id of the token to force as the first generated token.
"""
def __init__(self, bos_token_id: int):
self.bos_token_id = bos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
cur_len = input_ids.shape[-1]
if cur_len == 1:
num_tokens = scores.shape[1]
scores[:, [i for i in range(num_tokens) if i != self.bos_token_id]] = -float("inf")
scores[:, self.bos_token_id] = 0
return scores
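# A minimal usage sketch: at the first generation step (cur_len == 1) every token except
# bos_token_id is masked, so the forced BOS token is always selected. Ids are illustrative.
def _example_forced_bos():
    processor = ForcedBOSTokenLogitsProcessor(bos_token_id=1)
    input_ids = torch.tensor([[0]])           # only the decoder start token so far
    scores = torch.zeros(1, 4)
    return processor(input_ids, scores)       # only column 1 stays finite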
class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
r"""
:class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when
:obj:`max_length` is reached.
Args:
max_length (:obj:`int`):
The maximum length of the sequence to be generated.
eos_token_id (:obj:`int`):
The id of the token to force as the last generated token when :obj:`max_length` is reached.
"""
def __init__(self, max_length: int, eos_token_id: int):
self.max_length = max_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
cur_len = input_ids.shape[-1]
if cur_len == self.max_length - 1:
num_tokens = scores.shape[1]
scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] = -float("inf")
scores[:, self.eos_token_id] = 0
return scores
import time
import warnings
from abc import ABC
from typing import Optional
import torch
from file_utils import add_start_docstrings
LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax.
kwargs:
Additional stopping criteria specific kwargs.
Return:
:obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop.
"""
class StoppingCriteria(ABC):
"""Abstract base class for all stopping criteria that can be applied during generation."""
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
raise NotImplementedError("StoppingCriteria needs to be subclassed")
class MaxLengthCriteria(StoppingCriteria):
"""
This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`.
Keep in mind that for decoder-only transformers this length includes the initial prompt tokens.
Args:
max_length (:obj:`int`):
The maximum length that the output sequence can have in number of tokens.
"""
def __init__(self, max_length: int):
self.max_length = max_length
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids.shape[-1] > self.max_length
class MaxTimeCriteria(StoppingCriteria):
"""
This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
time will start being counted when this criterion is initialized. You can override this by passing an
:obj:`initial_timestamp`.
Args:
max_time (:obj:`float`):
The maximum allowed time in seconds for the generation.
initial_timestamp (:obj:`float`, `optional`, defaults to :obj:`time.time()`):
The start of the generation allowed time.
"""
def __init__(self, max_time: float, initial_timestamp: Optional[float] = None):
self.max_time = max_time
self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return time.time() - self.initial_timestamp > self.max_time
class StoppingCriteriaList(list):
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return any(criteria(input_ids, scores) for criteria in self)
def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int):
found = False
for stopping_criterium in stopping_criteria:
if isinstance(stopping_criterium, MaxLengthCriteria):
found = True
if stopping_criterium.max_length != max_length:
warnings.warn(
"You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning
)
if not found:
stopping_criteria.append(MaxLengthCriteria(max_length=max_length))
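# A minimal usage sketch: a maximum-length criterion is combined with a wall-clock limit;
# generation stops as soon as any criterion in the list returns True. Values are illustrative.
def _example_stopping_criteria():
    criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=8), MaxTimeCriteria(max_time=30.0)])
    input_ids = torch.zeros(1, 9, dtype=torch.long)  # already longer than max_length
    scores = torch.zeros(1, 4)
    return criteria(input_ids, scores)               # -> True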
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from file_utils import ModelOutput
@dataclass
class BaseModelOutput(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
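# A minimal usage sketch, assuming ModelOutput behaves like a standard dataclass for attribute
# access: fields can be read by name, and optional fields stay None unless the corresponding
# output flag was enabled on the model. Shapes below are illustrative.
def _example_base_model_output():
    hidden = torch.randn(2, 5, 8)                    # (batch, seq_len, hidden_size)
    output = BaseModelOutput(last_hidden_state=hidden)
    return output.last_hidden_state.shape            # torch.Size([2, 5, 8])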
@dataclass
class BaseModelOutputWithPooling(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class BaseModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
1, hidden_size)` is output.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class BaseModelOutputWithCrossAttentions(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) further processed by a
Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
prediction (classification) objective during pretraining.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
1, hidden_size)` is output.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class Seq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
decoding.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
1, hidden_size)` is output.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class CausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class CausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class CausalLMOutputWithCrossAttentions(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Cross attentions weights after the attention softmax, used to compute the weighted average in the
cross-attention heads.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`torch.FloatTensor` tuples of length :obj:`config.n_layers`, with each tuple containing the
cached key, value states of the self-attention and the cross-attention layers if model is used in
encoder-decoder setting. Only relevant if ``config.is_decoder = True``.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class SequenceClassifierOutputWithPast(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
:obj:`past_key_values` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class MaskedLMOutput(ModelOutput):
"""
Base class for masked language models outputs.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Masked language modeling (MLM) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class Seq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language models outputs.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class NextSentencePredictorOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided):
Next sequence prediction (classification) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class SequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class Seq2SeqSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
            of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class MultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Classification scores (before SoftMax). `num_choices` is the second dimension of the input tensors
            (see `input_ids` above).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Classification loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class QuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Total span extraction loss: the sum of the cross-entropy losses for the start and end positions.
start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence question answering models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Total span extraction loss: the sum of the cross-entropy losses for the start and end positions.
start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors
            of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import os
import re
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
import torch
from torch import Tensor, device, dtype, nn
from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from activations import get_activation
from configuration_utils import PretrainedConfig
from file_utils import (
DUMMY_INPUTS,
TF2_WEIGHTS_NAME,
TF_WEIGHTS_NAME,
WEIGHTS_NAME,
ModelOutput,
cached_path,
hf_bucket_url,
is_offline_mode,
is_remote_url,
replace_return_docstrings,
)
from generation_utils import GenerationMixin
from utils import logging
logger = logging.get_logger(__name__)
try:
from torch.nn import Identity
except ImportError:
# Older PyTorch compatibility
class Identity(nn.Module):
r"""A placeholder identity operator that is argument-insensitive."""
def __init__(self, *args, **kwargs):
super().__init__()
def forward(self, input):
return input
def find_pruneable_heads_and_indices(
heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int]
) -> Tuple[Set[int], torch.LongTensor]:
"""
Finds the heads and their indices taking :obj:`already_pruned_heads` into account.
Args:
heads (:obj:`List[int]`): List of the indices of heads to prune.
n_heads (:obj:`int`): The number of heads in the model.
head_size (:obj:`int`): The size of each head.
already_pruned_heads (:obj:`Set[int]`): A set of already pruned heads.
Returns:
:obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices.
"""
mask = torch.ones(n_heads, head_size)
heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads
for head in heads:
# Compute how many pruned heads are before the head and move the index accordingly
head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index: torch.LongTensor = torch.arange(len(mask))[mask].long()
return heads, index
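# A minimal usage sketch with hypothetical sizes: pruning heads 0 and 2 of a 12-head attention
# module whose heads are 64-dimensional. `heads` is the set of heads still to prune and `index`
# selects the rows/columns of the projection matrices that survive the pruning.
def _example_find_pruneable_heads_and_indices():
    heads, index = find_pruneable_heads_and_indices(
        heads=[0, 2], n_heads=12, head_size=64, already_pruned_heads=set()
    )
    assert heads == {0, 2}
    assert index.shape == torch.Size([(12 - 2) * 64])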
def get_parameter_device(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).device
except StopIteration:
# For nn.DataParallel compatibility in PyTorch 1.5
def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
return tuples
gen = parameter._named_members(get_members_fn=find_tensor_attributes)
first_tuple = next(gen)
return first_tuple[1].device
def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).dtype
except StopIteration:
# For nn.DataParallel compatibility in PyTorch 1.5
def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
return tuples
gen = parameter._named_members(get_members_fn=find_tensor_attributes)
first_tuple = next(gen)
return first_tuple[1].dtype
class ModuleUtilsMixin:
"""
A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin.
"""
@staticmethod
def _hook_rss_memory_pre_forward(module, *args, **kwargs):
try:
import psutil
        except ImportError:
raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
process = psutil.Process(os.getpid())
mem = process.memory_info()
module.mem_rss_pre_forward = mem.rss
return None
@staticmethod
def _hook_rss_memory_post_forward(module, *args, **kwargs):
try:
import psutil
        except ImportError:
raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
process = psutil.Process(os.getpid())
mem = process.memory_info()
module.mem_rss_post_forward = mem.rss
mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward
module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0)
return None
def add_memory_hooks(self):
"""
Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
Increase in memory consumption is stored in a :obj:`mem_rss_diff` attribute for each module and can be reset to
zero with :obj:`model.reset_memory_hooks_state()`.
"""
for module in self.modules():
module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
module.register_forward_hook(self._hook_rss_memory_post_forward)
self.reset_memory_hooks_state()
def reset_memory_hooks_state(self):
"""
Reset the :obj:`mem_rss_diff` attribute of each module (see
:func:`~transformers.modeling_utils.ModuleUtilsMixin.add_memory_hooks`).
"""
for module in self.modules():
module.mem_rss_diff = 0
module.mem_rss_post_forward = 0
module.mem_rss_pre_forward = 0
@property
def device(self) -> device:
"""
:obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
device).
"""
return get_parameter_device(self)
@property
def dtype(self) -> dtype:
"""
:obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
"""
return get_parameter_dtype(self)
def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor:
"""
Invert an attention mask (e.g., switches 0. and 1.).
Args:
encoder_attention_mask (:obj:`torch.Tensor`): An attention mask.
Returns:
:obj:`torch.Tensor`: The inverted attention mask.
"""
if encoder_attention_mask.dim() == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
# T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
# Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
# /transformer/transformer_layers.py#L270
# encoder_extended_attention_mask = (encoder_extended_attention_mask ==
# encoder_extended_attention_mask.transpose(-1, -2))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
if self.dtype == torch.float16:
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4
elif self.dtype == torch.float32:
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
else:
raise ValueError(
"{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format(
self.dtype
)
)
return encoder_extended_attention_mask
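    # A minimal usage sketch (assuming `self` is a loaded fp32 model): a 2D padding mask of shape
    # (batch, src_len) becomes a (batch, 1, 1, src_len) additive mask in which attended positions
    # map to 0 and padded positions to a large negative value.
    def _example_invert_attention_mask(self):
        encoder_attention_mask = torch.tensor([[1, 1, 0]])
        inverted = self.invert_attention_mask(encoder_attention_mask)
        assert inverted.shape == (1, 1, 1, 3)
        assert inverted[0, 0, 0, -1] < 0  # the padded position is pushed to a large negative value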
def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device) -> Tensor:
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (:obj:`torch.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (:obj:`Tuple[int]`):
The shape of the input to the model.
            device (:obj:`torch.device`):
The device of the input to the model.
Returns:
            :obj:`torch.Tensor`: The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
"""
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
# in case past_key_values are used we need to add a prefix ones mask to the causal mask
# causal and attention masks must have same type with pytorch version < 1.3
causal_mask = causal_mask.to(attention_mask.dtype)
if causal_mask.shape[1] < attention_mask.shape[1]:
prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
causal_mask = torch.cat(
[
torch.ones(
(batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
),
causal_mask,
],
axis=-1,
)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
input_shape, attention_mask.shape
)
)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
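    # A minimal usage sketch (assuming `self` is an encoder-style model, i.e. config.is_decoder is
    # False): a (batch, seq_len) padding mask is broadcast to (batch, 1, 1, seq_len) with 0.0 at
    # attended positions and -10000.0 at masked positions, ready to be added to attention scores.
    def _example_get_extended_attention_mask(self):
        attention_mask = torch.tensor([[1, 1, 1, 0]])
        extended = self.get_extended_attention_mask(attention_mask, attention_mask.shape, attention_mask.device)
        assert extended.shape == (1, 1, 1, 4)
        assert extended[0, 0, 0, -1] == -10000.0  # the padded position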
def get_head_mask(
self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False
) -> Tensor:
"""
Prepare the head mask if needed.
Args:
head_mask (:obj:`torch.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`):
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (:obj:`int`):
The number of hidden layers in the model.
            is_attention_chunked (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the attention scores are computed by chunks.
Returns:
:obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
list with :obj:`[None]` for each layer.
"""
if head_mask is not None:
head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
if is_attention_chunked is True:
head_mask = head_mask.unsqueeze(-1)
else:
head_mask = [None] * num_hidden_layers
return head_mask
def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
"""-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]"""
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}"
        head_mask = head_mask.to(dtype=self.dtype)  # switch to float if needed + fp16 compatibility
return head_mask
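    # A minimal usage sketch with hypothetical sizes: a per-head mask of shape [num_heads] is
    # broadcast to one 5D mask per layer, while passing None yields a list with one None entry per
    # layer, which downstream attention code treats as "no head masking".
    def _example_get_head_mask(self):
        head_mask = torch.ones(12)
        expanded = self.get_head_mask(head_mask, num_hidden_layers=6)
        assert expanded.shape == (6, 1, 12, 1, 1)
        assert self.get_head_mask(None, num_hidden_layers=6) == [None] * 6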
def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
"""
Get number of (optionally, trainable or non-embeddings) parameters in the module.
Args:
only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return only the number of trainable parameters.
            exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to return only the number of non-embedding parameters.
Returns:
:obj:`int`: The number of parameters.
"""
        if exclude_embeddings:
            # Embedding weight parameters are matched by name so they can be left out of the count.
            embedding_param_names = {
                f"{name}.weight" for name, module in self.named_modules() if isinstance(module, nn.Embedding)
            }
            params = [p for name, p in self.named_parameters() if name not in embedding_param_names]
        else:
            params = list(self.parameters())
        return sum(p.numel() for p in params if p.requires_grad or not only_trainable)
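    # A minimal usage sketch (assuming `self` is any instantiated model): the two keyword arguments
    # cover the common cases of counting only trainable parameters and of leaving out the (often
    # very large) embedding matrices.
    def _example_num_parameters(self):
        total = self.num_parameters()
        trainable = self.num_parameters(only_trainable=True)
        without_embeddings = self.num_parameters(exclude_embeddings=True)
        assert trainable <= total and without_embeddings <= total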
def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]) -> int:
"""
Helper function to estimate the total number of tokens from the model inputs.
Args:
            input_dict (:obj:`dict`): The model inputs.
Returns:
:obj:`int`: The total number of tokens.
"""
token_inputs = [tensor for key, tensor in input_dict.items() if "input" in key]
if token_inputs:
return sum([token_input.numel() for token_input in token_inputs])
else:
warnings.warn(
"Could not estimate the number of tokens of the input, floating-point operations will not be computed"
)
return 0
def floating_point_ops(
self, input_dict: Dict[str, Union[torch.Tensor, Any]], exclude_embeddings: bool = True
) -> int:
"""
Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
        tokens (valid if :obj:`sequence_length << 12 * d_model`) as laid out in `this paper
<https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
Args:
            input_dict (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The model inputs, used to estimate the total number of tokens in the batch.
            exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to exclude the embedding and softmax operations from the count.
Returns:
:obj:`int`: The number of floating-point operations.
"""
return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)
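    # A minimal usage sketch with a hypothetical batch of 8 sequences of 128 tokens: the estimate is
    # 6 * tokens * parameters, i.e. roughly 2 * tokens * parameters for the forward pass plus twice
    # that again for the backward pass, following the approximation cited in the docstring above.
    def _example_floating_point_ops(self):
        input_dict = {"input_ids": torch.ones(8, 128, dtype=torch.long)}
        flops = self.floating_point_ops(input_dict)
        assert flops == 6 * 8 * 128 * self.num_parameters(exclude_embeddings=True)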
class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
r"""
Base class for all models.
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods
for loading, downloading and saving models as well as a few methods common to all models to:
* resize the input embeddings,
* prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
:class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch
model, taking as arguments:
- **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
TensorFlow checkpoint.
- **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to
the model.
- **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.
- **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
derived classes of the same architecture adding modules on top of the base model.
- **is_parallelizable** (:obj:`bool`) -- A flag indicating whether this model supports model parallelization.
"""
config_class = None
base_model_prefix = ""
    # a list of regex patterns of tensor names to ignore from the model when loading the model weights
# (and avoid unnecessary warnings).
_keys_to_ignore_on_load_missing = None
    # a list of regex patterns of tensor names to ignore from the weights when loading the model weights
# (and avoid unnecessary warnings).
_keys_to_ignore_on_load_unexpected = None
    # a list of tensor names to ignore when saving the model (useful for keys that aren't
# trained, but which are deterministic)
_keys_to_ignore_on_save = None
is_parallelizable = False
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
"""
:obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
"""
return {"input_ids": torch.tensor(DUMMY_INPUTS)}
def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
super().__init__()
if not isinstance(config, PretrainedConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
"To create a model from a pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
)
)
# Save config and origin of the pretrained weights if given in model
self.config = config
self.name_or_path = config.name_or_path
@property
def base_model(self) -> nn.Module:
"""
:obj:`torch.nn.Module`: The main body of the model.
"""
return getattr(self, self.base_model_prefix, self)
def get_input_embeddings(self) -> nn.Module:
"""
Returns the model's input embeddings.
Returns:
:obj:`nn.Module`: A torch module mapping vocabulary to hidden states.
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
return base_model.get_input_embeddings()
else:
raise NotImplementedError
def set_input_embeddings(self, value: nn.Module):
"""
Set model's input embeddings.
Args:
value (:obj:`nn.Module`): A module mapping vocabulary to hidden states.
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
base_model.set_input_embeddings(value)
else:
raise NotImplementedError
def get_output_embeddings(self) -> nn.Module:
"""
Returns the model's output embeddings.
Returns:
:obj:`nn.Module`: A torch module mapping hidden states to vocabulary.
"""
return None # Overwrite for models with output embeddings
def tie_weights(self):
"""
Tie the weights between the input embeddings and the output embeddings.
        If the :obj:`torchscript` flag is set in the configuration, TorchScript can't handle parameter sharing, so we
        clone the weights instead.
"""
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None and self.config.tie_word_embeddings:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
if self.config.is_encoder_decoder and self.config.tie_encoder_decoder:
if hasattr(self, self.base_model_prefix):
self = getattr(self, self.base_model_prefix)
self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
@staticmethod
def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str):
uninitialized_encoder_weights: List[str] = []
if decoder.__class__ != encoder.__class__:
logger.info(
f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
)
def tie_encoder_to_decoder_recursively(
decoder_pointer: nn.Module,
encoder_pointer: nn.Module,
module_name: str,
uninitialized_encoder_weights: List[str],
depth=0,
):
assert isinstance(decoder_pointer, nn.Module) and isinstance(
encoder_pointer, nn.Module
), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
if hasattr(decoder_pointer, "weight"):
assert hasattr(encoder_pointer, "weight")
encoder_pointer.weight = decoder_pointer.weight
if hasattr(decoder_pointer, "bias"):
assert hasattr(encoder_pointer, "bias")
encoder_pointer.bias = decoder_pointer.bias
return
encoder_modules = encoder_pointer._modules
decoder_modules = decoder_pointer._modules
if len(decoder_modules) > 0:
assert (
len(encoder_modules) > 0
), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
encoder_layer_pos = 0
for name, module in decoder_modules.items():
if name.isdigit():
encoder_name = str(int(name) + encoder_layer_pos)
decoder_name = name
if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
encoder_modules
) != len(decoder_modules):
                            # this can happen if the name corresponds to the position in a module list of layers
# in this case the decoder has added a cross-attention that the encoder does not have
# thus skip this step and subtract one layer pos from encoder
encoder_layer_pos -= 1
continue
elif name not in encoder_modules:
continue
elif depth > 500:
raise ValueError(
"Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
)
else:
decoder_name = encoder_name = name
tie_encoder_to_decoder_recursively(
decoder_modules[decoder_name],
encoder_modules[encoder_name],
module_name + "/" + name,
uninitialized_encoder_weights,
depth=depth + 1,
)
all_encoder_weights.remove(module_name + "/" + encoder_name)
uninitialized_encoder_weights += list(all_encoder_weights)
# tie weights recursively
tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights)
if len(uninitialized_encoder_weights) > 0:
logger.warning(
f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}"
)
    def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
        """Tie or clone module weights depending on whether we are using TorchScript or not."""
if self.config.torchscript:
output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else:
output_embeddings.weight = input_embeddings.weight
if getattr(output_embeddings, "bias", None) is not None:
output_embeddings.bias.data = torch.nn.functional.pad(
output_embeddings.bias.data,
(
0,
output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],
),
"constant",
0,
)
if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
output_embeddings.out_features = input_embeddings.num_embeddings
def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
"""
Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
Arguments:
new_num_tokens (:obj:`int`, `optional`):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model without doing
anything.
Return:
:obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
"""
model_embeds = self._resize_token_embeddings(new_num_tokens)
if new_num_tokens is None:
return model_embeds
# Update base model and current model config
self.config.vocab_size = new_num_tokens
self.vocab_size = new_num_tokens
# Tie weights again if needed
self.tie_weights()
return model_embeds
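    # A minimal usage sketch (assuming `self` is a loaded model): growing the vocabulary by two
    # entries, e.g. after adding special tokens to a tokenizer. The number of added tokens is
    # hypothetical; the config and the (possibly tied) output embeddings are updated as well.
    def _example_resize_token_embeddings(self):
        old_vocab_size = self.config.vocab_size
        embeddings = self.resize_token_embeddings(old_vocab_size + 2)
        assert embeddings.weight.shape[0] == old_vocab_size + 2
        assert self.config.vocab_size == old_vocab_size + 2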
def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
self.set_input_embeddings(new_embeddings)
# if word embeddings are not tied, make sure that lm head is resized as well
if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
old_lm_head = self.get_output_embeddings()
new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens)
self.set_output_embeddings(new_lm_head)
return self.get_input_embeddings()
def _get_resized_embeddings(
self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None
) -> torch.nn.Embedding:
"""
Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
        initialized vectors at the end. Reducing the size will remove vectors from the end.
Args:
old_embeddings (:obj:`torch.nn.Embedding`):
Old embeddings to be resized.
new_num_tokens (:obj:`int`, `optional`):
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
                :obj:`torch.nn.Embedding` module of the model without doing anything.
Return:
:obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
:obj:`new_num_tokens` is :obj:`None`
"""
if new_num_tokens is None:
return old_embeddings
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
if old_num_tokens == new_num_tokens:
return old_embeddings
if not isinstance(old_embeddings, nn.Embedding):
raise TypeError(
f"Old embeddings are of type {type(old_embeddings)}, which is not an instance of {nn.Embedding}."
f"You should either use a different resize function or make sure that `old_embeddings` are an instance of {nn.Embedding}."
)
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim).to(self.device)
# initialize all new embeddings (in particular added tokens)
self._init_weights(new_embeddings)
# Copy token embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
return new_embeddings
def _get_resized_lm_head(
self, old_lm_head: torch.nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False
) -> torch.nn.Linear:
"""
Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end.
Args:
old_lm_head (:obj:`torch.nn.Linear`):
                Old lm head linear layer to be resized.
new_num_tokens (:obj:`int`, `optional`):
New number of tokens in the linear matrix.
Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
                :obj:`torch.nn.Linear` module of the model without doing anything.
transposed (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ``old_lm_head`` is transposed or not. If True ``old_lm_head.size()`` is ``lm_head_dim,
vocab_size`` else ``vocab_size, lm_head_dim``.
Return:
:obj:`torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if
:obj:`new_num_tokens` is :obj:`None`
"""
if new_num_tokens is None:
return old_lm_head
old_num_tokens, old_lm_head_dim = (
old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size()
)
if old_num_tokens == new_num_tokens:
return old_lm_head
if not isinstance(old_lm_head, nn.Linear):
raise TypeError(
f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}."
f"You should either use a different resize function or make sure that `old_embeddings` are an instance of {nn.Linear}."
)
# Build new lm head
new_lm_head_shape = (old_lm_head_dim, new_num_tokens) if not transposed else (new_num_tokens, old_lm_head_dim)
has_new_lm_head_bias = old_lm_head.bias is not None
new_lm_head = nn.Linear(*new_lm_head_shape, bias=has_new_lm_head_bias).to(self.device)
# initialize new lm head (in particular added tokens)
self._init_weights(new_lm_head)
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
# Copy old lm head weights to new lm head
if not transposed:
new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :]
else:
new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy]
# Copy bias weights to new lm head
if has_new_lm_head_bias:
new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy]
return new_lm_head
def init_weights(self):
"""
Initializes and prunes weights if needed.
"""
# Initialize weights
self.apply(self._init_weights)
# Prune heads if needed
if self.config.pruned_heads:
self.prune_heads(self.config.pruned_heads)
# Tie weights if needed
self.tie_weights()
def prune_heads(self, heads_to_prune: Dict[int, List[int]]):
"""
Prunes heads of the base model.
Arguments:
heads_to_prune (:obj:`Dict[int, List[int]]`):
Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
0 and 2 on layer 1 and heads 2 and 3 on layer 2.
"""
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items():
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
self.base_model._prune_heads(heads_to_prune)
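    # A minimal usage sketch reusing the docstring example (assuming `self` is a concrete model whose
    # attention layers support pruning): prune heads 0 and 2 of layer 1 and heads 2 and 3 of layer 2.
    # The pruned heads are recorded in `config.pruned_heads`, so a model reloaded from a saved
    # configuration is pruned the same way.
    def _example_prune_heads(self):
        self.prune_heads({1: [0, 2], 2: [2, 3]})
        assert set(self.config.pruned_heads[1]) == {0, 2}
        assert set(self.config.pruned_heads[2]) == {2, 3}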
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
save_config: bool = True,
state_dict: Optional[dict] = None,
save_function: Callable = torch.save,
):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
        :func:`~transformers.PreTrainedModel.from_pretrained` class method.
Arguments:
save_directory (:obj:`str` or :obj:`os.PathLike`):
Directory to which to save. Will be created if it doesn't exist.
save_config (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to save the config of the model. Useful in distributed training (e.g., on TPUs) when
                this function needs to be called on all processes; in that case, set :obj:`save_config=True` only on
                the main process to avoid race conditions.
state_dict (nested dictionary of :obj:`torch.Tensor`):
The state dictionary of the model to save. Will default to :obj:`self.state_dict()`, but can be used to
only save parts of the model or if special precautions need to be taken when recovering the state
dictionary of a model (like when using model parallelism).
save_function (:obj:`Callable`):
                The function to use to save the state dictionary. Useful in distributed training (e.g., on TPUs) when
                one needs to replace :obj:`torch.save` with another method.
"""
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
# Only save the model itself if we are using distributed training
model_to_save = unwrap_model(self)
# Attach architecture to the config
model_to_save.config.architectures = [model_to_save.__class__.__name__]
# Save the config
if save_config:
model_to_save.config.save_pretrained(save_directory)
# Save the model
if state_dict is None:
state_dict = model_to_save.state_dict()
# Handle the case where some state_dict keys shouldn't be saved
if self._keys_to_ignore_on_save is not None:
state_dict = {k: v for k, v in state_dict.items() if k not in self._keys_to_ignore_on_save}
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
save_function(state_dict, output_model_file)
logger.info("Model weights saved in {}".format(output_model_file))
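    # A minimal usage sketch of the save / reload round trip; `./my_model_directory/` is a
    # hypothetical path and `type(self)` stands in for the concrete model class.
    def _example_save_and_reload(self):
        self.save_pretrained("./my_model_directory/")
        reloaded = type(self).from_pretrained("./my_model_directory/")
        assert type(reloaded) is type(self)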
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
r"""
Instantiate a pretrained pytorch model from a pre-trained model configuration.
The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To
train the model, you should first set it back in training mode with ``model.train()``.
The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
task.
The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
weights are discarded.
Parameters:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, `optional`):
Can be either:
- A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
a user or organization name, like ``dbmdz/bert-base-german-cased``.
- A path to a `directory` containing model weights saved using
:func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
- A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
arguments ``config`` and ``state_dict``).
model_args (sequence of positional arguments, `optional`):
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`):
Can be either:
- an instance of a class derived from :class:`~transformers.PretrainedConfig`,
- a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the `model id` string of a pretrained
model).
- The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
by supplying the save directory.
- The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`):
A state dictionary to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own
weights. In this case though, you should check if using
:func:`~transformers.PreTrainedModel.save_pretrained` and
:func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
cache_dir (:obj:`Union[str, os.PathLike]`, `optional`):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`):
Load the model weights from a TensorFlow checkpoint save file (see docstring of
``pretrained_model_name_or_path`` argument).
force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
            proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (i.e., do not try to download the model).
use_auth_token (:obj:`str` or `bool`, `optional`):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
identifier allowed by git.
mirror(:obj:`str`, `optional`, defaults to :obj:`None`):
Mirror source to accelerate downloads in China. If you are from China and have an accessibility
problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
Please refer to the mirror site for more information.
kwargs (remaining dictionary of keyword arguments, `optional`):
                Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
:obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
attribute will be passed to the underlying model's ``__init__`` function.
.. note::
Passing :obj:`use_auth_token=True` is required when you want to use a private model.
Examples::
>>> from transformers import BertConfig, BertModel
>>> # Download model and configuration from huggingface.co and cache.
>>> model = BertModel.from_pretrained('bert-base-uncased')
>>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
>>> model = BertModel.from_pretrained('./test/saved_model/')
>>> # Update configuration during loading.
>>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
>>> assert model.config.output_attentions == True
>>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
>>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
>>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
config = kwargs.pop("config", None)
state_dict = kwargs.pop("state_dict", None)
cache_dir = kwargs.pop("cache_dir", None)
from_tf = kwargs.pop("from_tf", False)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
output_loading_info = kwargs.pop("output_loading_info", False)
local_files_only = kwargs.pop("local_files_only", False)
use_auth_token = kwargs.pop("use_auth_token", None)
revision = kwargs.pop("revision", None)
mirror = kwargs.pop("mirror", None)
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
# Load config if we don't provide a configuration
if not isinstance(config, PretrainedConfig):
config_path = config if config is not None else pretrained_model_name_or_path
config, model_kwargs = cls.config_class.from_pretrained(
config_path,
*model_args,
cache_dir=cache_dir,
return_unused_kwargs=True,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
revision=revision,
**kwargs,
)
else:
model_kwargs = kwargs
# Load model
if pretrained_model_name_or_path is not None:
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if os.path.isdir(pretrained_model_name_or_path):
if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
# Load from a TF 1.0 checkpoint in priority if from_tf
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
# Load from a TF 2.0 checkpoint in priority if from_tf
archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
# Load from a PyTorch checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
raise EnvironmentError(
"Error no file named {} found in directory {} or `from_tf` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
pretrained_model_name_or_path,
)
)
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
assert (
from_tf
), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
pretrained_model_name_or_path + ".index"
)
archive_file = pretrained_model_name_or_path + ".index"
else:
archive_file = hf_bucket_url(
pretrained_model_name_or_path,
filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME),
revision=revision,
mirror=mirror,
)
try:
# Load from URL or cache if already cached
resolved_archive_file = cached_path(
archive_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
)
except EnvironmentError as err:
logger.error(err)
msg = (
f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n"
)
raise EnvironmentError(msg)
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
else:
logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
else:
resolved_archive_file = None
config.name_or_path = pretrained_model_name_or_path
# Instantiate model.
model = cls(config, *model_args, **model_kwargs)
if state_dict is None and not from_tf:
try:
state_dict = torch.load(resolved_archive_file, map_location="cpu")
except Exception:
                raise OSError(
                    f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
                    f"at '{resolved_archive_file}'. "
"If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "
)
missing_keys = []
unexpected_keys = []
error_msgs = []
if from_tf:
if resolved_archive_file.endswith(".index"):
# Load from a TensorFlow 1.X checkpoint - provided by original authors
model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
else:
# Load from our TensorFlow 2.0 checkpoints
try:
                    from modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model
model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
except ImportError:
                    logger.error(
                        "Loading a TensorFlow model in PyTorch requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
)
raise
else:
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if "gamma" in key:
new_key = key.replace("gamma", "weight")
if "beta" in key:
new_key = key.replace("beta", "bias")
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, "_metadata", None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
# so we need to apply the function recursively.
def load(module: nn.Module, prefix=""):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict,
prefix,
local_metadata,
True,
missing_keys,
unexpected_keys,
error_msgs,
)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + ".")
# Make sure we are able to load base models as well as derived models (with heads)
start_prefix = ""
model_to_load = model
has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
start_prefix = cls.base_model_prefix + "."
if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
model_to_load = getattr(model, cls.base_model_prefix)
load(model_to_load, prefix=start_prefix)
if model.__class__.__name__ != model_to_load.__class__.__name__:
base_model_state_dict = model_to_load.state_dict().keys()
head_model_state_dict_without_base_prefix = [
key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
]
missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict)
# Some models may have keys that are not in the state by design, removing them before needlessly warning
# the user.
if cls._keys_to_ignore_on_load_missing is not None:
for pat in cls._keys_to_ignore_on_load_missing:
missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
if cls._keys_to_ignore_on_load_unexpected is not None:
for pat in cls._keys_to_ignore_on_load_unexpected:
unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
if len(unexpected_keys) > 0:
logger.warning(
f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when "
f"initializing {model.__class__.__name__}: {unexpected_keys}\n"
f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task "
f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n"
f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect "
f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
)
else:
logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
if len(missing_keys) > 0:
logger.warning(
f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} "
f"and are newly initialized: {missing_keys}\n"
f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."
)
else:
logger.info(
f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n"
f"If your task is similar to the task the model of the checkpoint was trained on, "
f"you can already use {model.__class__.__name__} for predictions without further training."
)
if len(error_msgs) > 0:
raise RuntimeError(
"Error(s) in loading state_dict for {}:\n\t{}".format(
model.__class__.__name__, "\n\t".join(error_msgs)
)
)
# make sure token embedding weights are still tied if needed
model.tie_weights()
# Set model in evaluation mode to deactivate DropOut modules by default
model.eval()
if output_loading_info:
loading_info = {
"missing_keys": missing_keys,
"unexpected_keys": unexpected_keys,
"error_msgs": error_msgs,
}
return model, loading_info
return model
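# Illustrative sketch (not part of the original file): the loader above silently renames legacy
# checkpoint keys, mapping "gamma" -> "weight" and "beta" -> "bias", before feeding the state dict
# to `_load_from_state_dict`. The helper below mirrors that renaming on a plain dict so the rule can
# be exercised in isolation; the helper name is an assumption, not an existing API.
def _rename_legacy_keys_example(state_dict):
    renamed = {}
    for key, value in state_dict.items():
        new_key = key
        if "gamma" in key:
            new_key = new_key.replace("gamma", "weight")
        if "beta" in key:
            new_key = new_key.replace("beta", "bias")
        renamed[new_key] = value
    return renamed
# e.g. {"encoder.LayerNorm.gamma": 1, "encoder.LayerNorm.beta": 2}
#   -> {"encoder.LayerNorm.weight": 1, "encoder.LayerNorm.bias": 2}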
class Conv1D(nn.Module):
"""
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
Basically works like a linear layer but the weights are transposed.
Args:
nf (:obj:`int`): The number of output features.
nx (:obj:`int`): The number of input features.
"""
def __init__(self, nf, nx):
super().__init__()
self.nf = nf
w = torch.empty(nx, nf)
nn.init.normal_(w, std=0.02)
self.weight = nn.Parameter(w)
self.bias = nn.Parameter(torch.zeros(nf))
def forward(self, x):
size_out = x.size()[:-1] + (self.nf,)
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
x = x.view(*size_out)
return x
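# Illustrative sketch (not part of the original file): Conv1D behaves like nn.Linear with a
# transposed weight matrix, so the equivalence check below should hold up to floating-point error.
# The shapes are arbitrary example values.
def _conv1d_equivalence_example():
    import torch
    from torch import nn
    torch.manual_seed(0)
    nx, nf = 8, 4                              # input features, output features
    conv = Conv1D(nf, nx)
    linear = nn.Linear(nx, nf)
    with torch.no_grad():                      # copy the transposed Conv1D weight into the Linear layer
        linear.weight.copy_(conv.weight.t())
        linear.bias.copy_(conv.bias)
    x = torch.randn(2, 5, nx)                  # (batch, seq_len, nx)
    assert torch.allclose(conv(x), linear(x), atol=1e-6)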
class PoolerStartLogits(nn.Module):
"""
Compute SQuAD start logits from sequence hidden states.
Args:
config (:class:`~transformers.PretrainedConfig`):
The config used by the model, will be used to grab the :obj:`hidden_size` of the model.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dense = nn.Linear(config.hidden_size, 1)
def forward(
self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
) -> torch.FloatTensor:
"""
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
Returns:
:obj:`torch.FloatTensor`: The start logits for SQuAD.
"""
x = self.dense(hidden_states).squeeze(-1)
if p_mask is not None:
if get_parameter_dtype(self) == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
class PoolerEndLogits(nn.Module):
"""
Compute SQuAD end logits from sequence hidden states.
Args:
config (:class:`~transformers.PretrainedConfig`):
The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the
:obj:`layer_norm_eps` to use.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.activation = nn.Tanh()
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dense_1 = nn.Linear(config.hidden_size, 1)
def forward(
self,
hidden_states: torch.FloatTensor,
start_states: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
"""
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`):
The hidden states of the first tokens for the labeled span.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
The position of the first token for the labeled span.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
.. note::
One of ``start_states`` or ``start_positions`` should not be :obj:`None`. If both are set,
``start_positions`` overrides ``start_states``.
Returns:
:obj:`torch.FloatTensor`: The end logits for SQuAD.
"""
assert (
start_states is not None or start_positions is not None
), "One of start_states, start_positions should be not None"
if start_positions is not None:
slen, hsz = hidden_states.shape[-2:]
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
x = self.activation(x)
x = self.LayerNorm(x)
x = self.dense_1(x).squeeze(-1)
if p_mask is not None:
if get_parameter_dtype(self) == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
class PoolerAnswerClass(nn.Module):
"""
Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
Args:
config (:class:`~transformers.PretrainedConfig`):
The config used by the model, will be used to grab the :obj:`hidden_size` of the model.
"""
def __init__(self, config):
super().__init__()
self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
self.activation = nn.Tanh()
self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
def forward(
self,
hidden_states: torch.FloatTensor,
start_states: Optional[torch.FloatTensor] = None,
start_positions: Optional[torch.LongTensor] = None,
cls_index: Optional[torch.LongTensor] = None,
) -> torch.FloatTensor:
"""
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
The final hidden states of the model.
start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`):
The hidden states of the first tokens for the labeled span.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
The position of the first token for the labeled span.
cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
.. note::
One of ``start_states`` or ``start_positions`` should not be :obj:`None`. If both are set,
``start_positions`` overrides ``start_states``.
Returns:
:obj:`torch.FloatTensor`: The SQuAD 2.0 answer class.
"""
# No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
hsz = hidden_states.shape[-1]
assert (
start_states is not None or start_positions is not None
), "One of start_states, start_positions should be not None"
if start_positions is not None:
start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
if cls_index is not None:
cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
else:
cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
x = self.activation(x)
x = self.dense_1(x).squeeze(-1)
return x
@dataclass
class SquadHeadOutput(ModelOutput):
"""
Base class for outputs of question answering models using a :class:`~transformers.modeling_utils.SQuADHead`.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
(beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
"""
loss: Optional[torch.FloatTensor] = None
start_top_log_probs: Optional[torch.FloatTensor] = None
start_top_index: Optional[torch.LongTensor] = None
end_top_log_probs: Optional[torch.FloatTensor] = None
end_top_index: Optional[torch.LongTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
class SQuADHead(nn.Module):
r"""
A SQuAD head inspired by XLNet.
Args:
config (:class:`~transformers.PretrainedConfig`):
The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the
:obj:`layer_norm_eps` to use.
"""
def __init__(self, config):
super().__init__()
self.start_n_top = config.start_n_top
self.end_n_top = config.end_n_top
self.start_logits = PoolerStartLogits(config)
self.end_logits = PoolerEndLogits(config)
self.answer_class = PoolerAnswerClass(config)
@replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig)
def forward(
self,
hidden_states: torch.FloatTensor,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
cls_index: Optional[torch.LongTensor] = None,
is_impossible: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
return_dict: bool = False,
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
"""
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
Final hidden states of the model on the sequence tokens.
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the first token for the labeled span.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Positions of the last token for the labeled span.
cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Whether the question has a possible answer in the paragraph or not.
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means the
token should be masked.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Returns:
"""
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, let's remove the dimension added by batch splitting
for x in (start_positions, end_positions, cls_index, is_impossible):
if x is not None and x.dim() > 1:
x.squeeze_(-1)
# during training, compute the end logits based on the ground truth of the start position
end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
loss_fct = CrossEntropyLoss()
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if cls_index is not None and is_impossible is not None:
# Predict answerability from the representation of CLS and START
cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
loss_fct_cls = nn.BCEWithLogitsLoss()
cls_loss = loss_fct_cls(cls_logits, is_impossible)
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5
return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
else:
# during inference, compute the end logits based on beam search
bsz, slen, hsz = hidden_states.size()
start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
start_top_log_probs, start_top_index = torch.topk(
start_log_probs, self.start_n_top, dim=-1
) # shape (bsz, start_n_top)
start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
start_states
) # shape (bsz, slen, start_n_top, hsz)
p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
end_top_log_probs, end_top_index = torch.topk(
end_log_probs, self.end_n_top, dim=1
) # shape (bsz, end_n_top, start_n_top)
end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
if not return_dict:
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
else:
return SquadHeadOutput(
start_top_log_probs=start_top_log_probs,
start_top_index=start_top_index,
end_top_log_probs=end_top_log_probs,
end_top_index=end_top_index,
cls_logits=cls_logits,
)
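# Illustrative sketch (not part of the original file): calling SQuADHead without start/end positions
# runs the beam-search inference branch and returns the tuple documented above. The config values are
# minimal assumptions chosen only so the head can be instantiated.
def _squad_head_inference_example():
    import torch
    cfg = PretrainedConfig(hidden_size=16, layer_norm_eps=1e-12, start_n_top=2, end_n_top=2)
    head = SQuADHead(cfg)
    hidden = torch.randn(3, 10, 16)            # (bsz, slen, hidden_size)
    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = head(hidden)
    # start_top_*: (3, 2), end_top_*: (3, 2 * 2), cls_logits: (3,)
    return start_top_log_probs.shape, end_top_index.shape, cls_logits.shape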
class SequenceSummary(nn.Module):
r"""
Compute a single vector summary of a sequence hidden states.
Args:
config (:class:`~transformers.PretrainedConfig`):
The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
config class of your model for the default values it uses):
- **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
- :obj:`"last"` -- Take the last token hidden state (like XLNet)
- :obj:`"first"` -- Take the first token hidden state (like Bert)
- :obj:`"mean"` -- Take the mean of all tokens hidden states
- :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
- :obj:`"attn"` -- Not implemented now, use multi-head attention
- **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
- **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
:obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
- **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
output, another string or :obj:`None` will add no activation.
- **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
activation.
- **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and
activation.
"""
def __init__(self, config: PretrainedConfig):
super().__init__()
self.summary_type = getattr(config, "summary_type", "last")
if self.summary_type == "attn":
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
raise NotImplementedError
self.summary = Identity()
if hasattr(config, "summary_use_proj") and config.summary_use_proj:
if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
num_classes = config.num_labels
else:
num_classes = config.hidden_size
self.summary = nn.Linear(config.hidden_size, num_classes)
activation_string = getattr(config, "summary_activation", None)
self.activation: Callable = get_activation(activation_string) if activation_string else Identity()
self.first_dropout = Identity()
if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
self.first_dropout = nn.Dropout(config.summary_first_dropout)
self.last_dropout = Identity()
if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
self.last_dropout = nn.Dropout(config.summary_last_dropout)
def forward(
self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
) -> torch.FloatTensor:
"""
Compute a single vector summary of a sequence hidden states.
Args:
hidden_states (:obj:`torch.FloatTensor` of shape :obj:`[batch_size, seq_len, hidden_size]`):
The hidden states of the last layer.
cls_index (:obj:`torch.LongTensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`):
Used if :obj:`summary_type == "cls_index"` and takes the last token of the sequence as classification
token.
Returns:
:obj:`torch.FloatTensor`: The summary of the sequence hidden states.
"""
if self.summary_type == "last":
output = hidden_states[:, -1]
elif self.summary_type == "first":
output = hidden_states[:, 0]
elif self.summary_type == "mean":
output = hidden_states.mean(dim=1)
elif self.summary_type == "cls_index":
if cls_index is None:
cls_index = torch.full_like(
hidden_states[..., :1, :],
hidden_states.shape[-2] - 1,
dtype=torch.long,
)
else:
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
# shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
elif self.summary_type == "attn":
raise NotImplementedError
output = self.first_dropout(output)
output = self.summary(output)
output = self.activation(output)
output = self.last_dropout(output)
return output
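# Illustrative sketch (not part of the original file): a "first"-token summary projected down to two
# labels, which is how classification heads typically use this module. The config values are example
# assumptions only.
def _sequence_summary_example():
    import torch
    cfg = PretrainedConfig(
        hidden_size=16,
        num_labels=2,
        summary_type="first",
        summary_use_proj=True,
        summary_proj_to_labels=True,
        summary_activation="tanh",
        summary_first_dropout=0.0,
        summary_last_dropout=0.0,
    )
    summary = SequenceSummary(cfg)
    hidden = torch.randn(4, 10, 16)            # (batch, seq_len, hidden_size)
    return summary(hidden).shape               # expected: torch.Size([4, 2])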
def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
"""
Recursively unwraps a model from potential containers (as used in distributed training).
Args:
model (:obj:`torch.nn.Module`): The model to unwrap.
"""
# since there could be multiple levels of wrapping, unwrap recursively
if hasattr(model, "module"):
return unwrap_model(model.module)
else:
return model
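# Illustrative sketch (not part of the original file): wrappers such as torch.nn.DataParallel expose
# the wrapped model as a `.module` attribute, which is exactly what `unwrap_model` peels off.
def _unwrap_model_example():
    import torch
    inner = torch.nn.Linear(4, 2)
    wrapped = torch.nn.DataParallel(inner)
    assert unwrap_model(wrapped) is inner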
def prune_linear_layer(layer: torch.nn.Linear, index: torch.LongTensor, dim: int = 0) -> torch.nn.Linear:
"""
Prune a linear layer to keep only entries in index.
Used to remove heads.
Args:
layer (:obj:`torch.nn.Linear`): The layer to prune.
index (:obj:`torch.LongTensor`): The indices to keep in the layer.
dim (:obj:`int`, `optional`, defaults to 0): The dimension on which to keep the indices.
Returns:
:obj:`torch.nn.Linear`: The pruned layer as a new layer with :obj:`requires_grad=True`.
"""
index = index.to(layer.weight.device)
W = layer.weight.index_select(dim, index).clone().detach()
if layer.bias is not None:
if dim == 1:
b = layer.bias.clone().detach()
else:
b = layer.bias[index].clone().detach()
new_size = list(layer.weight.size())
new_size[dim] = len(index)
new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
new_layer.weight.requires_grad = False
new_layer.weight.copy_(W.contiguous())
new_layer.weight.requires_grad = True
if layer.bias is not None:
new_layer.bias.requires_grad = False
new_layer.bias.copy_(b.contiguous())
new_layer.bias.requires_grad = True
return new_layer
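# Illustrative sketch (not part of the original file): pruning along dim=0 keeps only the selected
# output units, so the pruned layer agrees with the corresponding rows of the original output.
def _prune_linear_layer_example():
    import torch
    layer = torch.nn.Linear(8, 4)
    index = torch.tensor([0, 2], dtype=torch.long)     # keep output units 0 and 2
    pruned = prune_linear_layer(layer, index, dim=0)
    x = torch.randn(3, 8)
    assert torch.allclose(pruned(x), layer(x)[:, index], atol=1e-6)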
def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> Conv1D:
"""
Prune a Conv1D layer to keep only entries in index. A Conv1D layer works like a Linear layer (see e.g. GPT-2) but
with transposed weights.
Used to remove heads.
Args:
layer (:class:`~transformers.modeling_utils.Conv1D`): The layer to prune.
index (:obj:`torch.LongTensor`): The indices to keep in the layer.
dim (:obj:`int`, `optional`, defaults to 1): The dimension on which to keep the indices.
Returns:
:class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with :obj:`requires_grad=True`.
"""
index = index.to(layer.weight.device)
W = layer.weight.index_select(dim, index).clone().detach()
if dim == 0:
b = layer.bias.clone().detach()
else:
b = layer.bias[index].clone().detach()
new_size = list(layer.weight.size())
new_size[dim] = len(index)
new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
new_layer.weight.requires_grad = False
new_layer.weight.copy_(W.contiguous())
new_layer.weight.requires_grad = True
new_layer.bias.requires_grad = False
new_layer.bias.copy_(b.contiguous())
new_layer.bias.requires_grad = True
return new_layer
def prune_layer(
layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None
) -> Union[torch.nn.Linear, Conv1D]:
"""
Prune a Conv1D or linear layer to keep only entries in index.
Used to remove heads.
Args:
layer (:obj:`Union[torch.nn.Linear, Conv1D]`): The layer to prune.
index (:obj:`torch.LongTensor`): The indices to keep in the layer.
dim (:obj:`int`, `optional`): The dimension on which to keep the indices.
Returns:
:obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with
:obj:`requires_grad=True`.
"""
if isinstance(layer, nn.Linear):
return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
elif isinstance(layer, Conv1D):
return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
else:
raise ValueError("Can't prune layer of class {}".format(layer.__class__))
def apply_chunking_to_forward(
forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors
) -> torch.Tensor:
"""
This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the
dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory.
If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as
directly applying :obj:`forward_fn` to :obj:`input_tensors`.
Args:
forward_fn (:obj:`Callable[..., torch.Tensor]`):
The forward function of the model.
chunk_size (:obj:`int`):
The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`.
chunk_dim (:obj:`int`):
The dimension over which the :obj:`input_tensors` should be chunked.
input_tensors (:obj:`Tuple[torch.Tensor]`):
The input tensors of ``forward_fn`` which will be chunked
Returns:
:obj:`torch.Tensor`: A tensor with the same shape as :obj:`forward_fn` would have given if applied directly.
Examples::
# rename the usual forward() fn to forward_chunk()
def forward_chunk(self, hidden_states):
hidden_states = self.decoder(hidden_states)
return hidden_states
# implement a chunked forward function
def forward(self, hidden_states):
return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
"""
assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors)
tensor_shape = input_tensors[0].shape[chunk_dim]
assert all(
input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors
), "All input tenors have to be of the same shape"
# inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility
num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
assert num_args_in_forward_chunk_fn == len(
input_tensors
), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format(
num_args_in_forward_chunk_fn, len(input_tensors)
)
if chunk_size > 0:
assert (
input_tensors[0].shape[chunk_dim] % chunk_size == 0
), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format(
input_tensors[0].shape[chunk_dim], chunk_size
)
num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size
# chunk input tensor into tuples
input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
# apply forward fn to every tuple
output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
# concatenate output at same dimension
return torch.cat(output_chunks, dim=chunk_dim)
return forward_fn(*input_tensors)
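# Illustrative sketch (not part of the original file): a position-wise feed-forward layer is
# independent across the sequence dimension, so chunking it with `apply_chunking_to_forward` gives
# the same result as applying it once. Shapes and chunk size are arbitrary example values.
def _apply_chunking_example():
    import torch
    ff = torch.nn.Linear(8, 8)
    def forward_chunk(hidden_states):
        return ff(hidden_states)
    hidden_states = torch.randn(2, 12, 8)              # (batch, seq_len, hidden)
    chunked = apply_chunking_to_forward(forward_chunk, 4, 1, hidden_states)   # chunk_size=4, chunk_dim=1
    assert torch.allclose(chunked, ff(hidden_states), atol=1e-6)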
from argparse import ArgumentParser
from dataset import Data_Encoder
import torch
from transformers import BertModel, BertTokenizer, BertLayer
from transformers import AutoTokenizer, AutoModel
# from modeling_bert import BertModel
parser = ArgumentParser(description='BertAff Training.')
parser.add_argument('-b', '--batch-size', default=16, type=int,
metavar='N',
help='mini-batch size (default: 16), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
help='number of data loading workers (default: 0)')
parser.add_argument('--epochs', default=50, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--task', choices=['train', 'test', 'channel', 'ER', 'GPCR', 'kinase'],
default='', type=str, metavar='TASK',
help='Task name. Could be train, test, channel, ER, GPCR, kinase.')
parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,
metavar='LR', help='initial learning rate', dest='lr')
from torch.utils.data import DataLoader
from configuration_bert import BertConfig
from modeling_bert import BertAffinityModel
def get_task(task_name):
@@ -39,11 +19,7 @@ def get_task(task_name):
"separate_id": "[SEP]",
"max_len": 256
}
params = {'batch_size': 5,
'shuffle': True,
'num_workers': 0,
'drop_last': True}
return df_train, tokenizer_config, params
return df_train, tokenizer_config
elif task_name.lower() == 'test':
df_test = {"sps": './data/test_sps',
"smile": './data/test_smile',
@@ -55,25 +31,63 @@ def get_task(task_name):
"separate_id": "[SEP]",
"max_len": 256
}
params = {'batch_size': 5,
'shuffle': True,
'num_workers': 0,
'drop_last': True}
return df_test, tokenizer_config, params
return df_test, tokenizer_config
def main(args):
# load data
data_file, tokenizer_config = get_task(args.task)
dataset = Data_Encoder(data_file, tokenizer_config)
data_loader_params = {'batch_size': args.batch_size,
'shuffle': True,
'num_workers': args.workers,
'drop_last': True
}
data_generator = DataLoader(dataset, **data_loader_params)
# create model
config = BertConfig.from_pretrained(args.config)
model = BertAffinityModel(config)
if args.task == 'train':
opt = torch.optim.Adam(model.parameters(), lr=args.lr)
loss_fct = torch.nn.MSELoss()
for epoch in range(args.epochs):
for i, (input, affinity) in enumerate(data_generator):
pred_affinity = model(input.long())
loss = loss_fct(pred_affinity, affinity)
opt.zero_grad()
loss.backward()
opt.step()
print('------------------')
def main():
args = parser.parse_args()
dataset, config, params = get_task("train")
model = BertModel.from_pretrained("bert-base-uncased")
if __name__ == '__main__':
# main()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = BertModel('./config/config.json')
model = BertModel.from_pretrained("bert-base-uncased")
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
# get parameter
parser = ArgumentParser(description='BertAffinity')
parser.add_argument('-b', '--batch-size', default=1, type=int,
metavar='N',
help='mini-batch size (default: 1), this is the total '
'batch size of all GPUs on the current node when '
'using Data Parallel or Distributed Data Parallel')
parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
help='number of data loading workers (default: 0)')
parser.add_argument('--epochs', default=50, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--task', choices=['train', 'test', 'channel', 'ER', 'GPCR', 'kinase'],
default='train', type=str, metavar='TASK',
help='Task name. Could be train, test, channel, ER, GPCR, kinase.')
parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,
metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--config', default='./config/config.json', type=str, help='model config file path')
args = parser.parse_args()
# input dict
main(args)
# load_config(args.config)
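# Illustrative usage sketch (not part of the original file): the training entry point above can also
# be driven programmatically by building the argument namespace by hand instead of going through the
# parser. The config path matches this commit's default; the data files under ./data/ are assumed to
# exist.
#
#   from argparse import Namespace
#   main(Namespace(task='train', batch_size=16, workers=0, epochs=50, lr=1e-4,
#                  config='./config/config.json'))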
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
from os.path import expanduser
from typing import Dict, List, Optional, Tuple
from tqdm import tqdm
import requests
ENDPOINT = "https://huggingface.co"
class RepoObj:
"""
HuggingFace git-based system, data structure that represents a file belonging to the current user.
"""
def __init__(self, filename: str, lastModified: str, commit: str, size: int, **kwargs):
self.filename = filename
self.lastModified = lastModified
self.commit = commit
self.size = size
class ModelSibling:
"""
Data structure that represents a public file inside a model, accessible from huggingface.co
"""
def __init__(self, rfilename: str, **kwargs):
self.rfilename = rfilename # filename relative to the model root
for k, v in kwargs.items():
setattr(self, k, v)
class ModelInfo:
"""
Info about a public model accessible from huggingface.co
"""
def __init__(
self,
modelId: Optional[str] = None, # id of model
tags: List[str] = [],
pipeline_tag: Optional[str] = None,
siblings: Optional[List[Dict]] = None, # list of files that constitute the model
**kwargs
):
self.modelId = modelId
self.tags = tags
self.pipeline_tag = pipeline_tag
self.siblings = [ModelSibling(**x) for x in siblings] if siblings is not None else None
for k, v in kwargs.items():
setattr(self, k, v)
class HfApi:
def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else ENDPOINT
def login(self, username: str, password: str) -> str:
"""
Call HF API to sign in a user and get a token if credentials are valid.
Outputs: token if credentials are valid
Throws: requests.exceptions.HTTPError if credentials are invalid
"""
path = "{}/api/login".format(self.endpoint)
r = requests.post(path, json={"username": username, "password": password})
r.raise_for_status()
d = r.json()
return d["token"]
def whoami(self, token: str) -> Tuple[str, List[str]]:
"""
Call HF API to know "whoami"
"""
path = "{}/api/whoami".format(self.endpoint)
r = requests.get(path, headers={"authorization": "Bearer {}".format(token)})
r.raise_for_status()
d = r.json()
return d["user"], d["orgs"]
def logout(self, token: str) -> None:
"""
Call HF API to log out.
"""
path = "{}/api/logout".format(self.endpoint)
r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
r.raise_for_status()
def model_list(self) -> List[ModelInfo]:
"""
Get the public list of all the models on huggingface.co
"""
path = "{}/api/models".format(self.endpoint)
r = requests.get(path)
r.raise_for_status()
d = r.json()
return [ModelInfo(**x) for x in d]
def list_repos_objs(self, token: str, organization: Optional[str] = None) -> List[RepoObj]:
"""
HuggingFace git-based system, used for models.
Call HF API to list all stored files for user (or one of their organizations).
"""
path = "{}/api/repos/ls".format(self.endpoint)
params = {"organization": organization} if organization is not None else None
r = requests.get(path, params=params, headers={"authorization": "Bearer {}".format(token)})
r.raise_for_status()
d = r.json()
return [RepoObj(**x) for x in d]
def create_repo(
self,
token: str,
name: str,
organization: Optional[str] = None,
private: Optional[bool] = None,
exist_ok=False,
lfsmultipartthresh: Optional[int] = None,
) -> str:
"""
HuggingFace git-based system, used for models.
Call HF API to create a whole repo.
Params:
private: Whether the model repo should be private (requires a paid huggingface.co account)
exist_ok: Do not raise an error if repo already exists
lfsmultipartthresh: Optional: internal param for testing purposes.
"""
path = "{}/api/repos/create".format(self.endpoint)
json = {"name": name, "organization": organization, "private": private}
if lfsmultipartthresh is not None:
json["lfsmultipartthresh"] = lfsmultipartthresh
r = requests.post(
path,
headers={"authorization": "Bearer {}".format(token)},
json=json,
)
if exist_ok and r.status_code == 409:
return ""
r.raise_for_status()
d = r.json()
return d["url"]
def delete_repo(self, token: str, name: str, organization: Optional[str] = None):
"""
HuggingFace git-based system, used for models.
Call HF API to delete a whole repo.
CAUTION(this is irreversible).
"""
path = "{}/api/repos/delete".format(self.endpoint)
r = requests.delete(
path,
headers={"authorization": "Bearer {}".format(token)},
json={"name": name, "organization": organization},
)
r.raise_for_status()
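# Illustrative sketch (not part of the original file): a typical HfApi round trip against the live
# endpoint. The credentials are placeholders; model_list() alone needs no token.
def _hf_api_example():
    api = HfApi()
    models = api.model_list()                                          # public model listing
    token = api.login(username="my-username", password="my-password")  # placeholder credentials
    user, orgs = api.whoami(token)
    api.logout(token)
    return user, orgs, len(models)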
class TqdmProgressFileReader:
"""
Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) and override `f.read()` so as to display a
tqdm progress bar.
see github.com/huggingface/transformers/pull/2078#discussion_r354739608 for implementation details.
"""
def __init__(self, f: io.BufferedReader):
self.f = f
self.total_size = os.fstat(f.fileno()).st_size
self.pbar = tqdm(total=self.total_size, leave=False)
self.read = f.read
f.read = self._read
def _read(self, n=-1):
self.pbar.update(n)
return self.read(n)
def close(self):
self.pbar.close()
class HfFolder:
path_token = expanduser("~/.huggingface/token")
@classmethod
def save_token(cls, token):
"""
Save token, creating folder as needed.
"""
os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
with open(cls.path_token, "w+") as f:
f.write(token)
@classmethod
def get_token(cls):
"""
Get token or None if not existent.
"""
try:
with open(cls.path_token, "r") as f:
return f.read()
except FileNotFoundError:
pass
@classmethod
def delete_token(cls):
"""
Delete token. Do not fail if token does not exist.
"""
try:
os.remove(cls.path_token)
except FileNotFoundError:
pass
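# Illustrative sketch (not part of the original file): HfFolder persists the auth token under
# ~/.huggingface/token. The token string is a made-up placeholder; running this would overwrite any
# real token already saved there.
def _hf_folder_example():
    HfFolder.save_token("dummy-token")         # writes ~/.huggingface/token
    assert HfFolder.get_token() == "dummy-token"
    HfFolder.delete_token()
    assert HfFolder.get_token() is None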
# coding=utf-8
# Copyright 2020 Optuna, Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Logging utilities. """
import logging
import os
import sys
import threading
from logging import CRITICAL # NOQA
from logging import DEBUG # NOQA
from logging import ERROR # NOQA
from logging import FATAL # NOQA
from logging import INFO # NOQA
from logging import NOTSET # NOQA
from logging import WARN # NOQA
from logging import WARNING # NOQA
from typing import Optional
_lock = threading.Lock()
_default_handler: Optional[logging.Handler] = None
log_levels = {
"debug": logging.DEBUG,
"info": logging.INFO,
"warning": logging.WARNING,
"error": logging.ERROR,
"critical": logging.CRITICAL,
}
_default_log_level = logging.WARNING
def _get_default_logging_level():
"""
If the TRANSFORMERS_VERBOSITY env var is set to one of the valid choices, return that as the new default level.
Otherwise, fall back to ``_default_log_level``.
"""
env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
if env_level_str:
if env_level_str in log_levels:
return log_levels[env_level_str]
else:
logging.getLogger().warning(
f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, "
f"has to be one of: { ', '.join(log_levels.keys()) }"
)
return _default_log_level
def _get_library_name() -> str:
return __name__.split(".")[0]
def _get_library_root_logger() -> logging.Logger:
return logging.getLogger(_get_library_name())
def _configure_library_root_logger() -> None:
global _default_handler
with _lock:
if _default_handler:
# This library has already configured the library root logger.
return
_default_handler = logging.StreamHandler() # Set sys.stderr as stream.
_default_handler.flush = sys.stderr.flush
# Apply our default configuration to the library root logger.
library_root_logger = _get_library_root_logger()
library_root_logger.addHandler(_default_handler)
library_root_logger.setLevel(_get_default_logging_level())
library_root_logger.propagate = False
def _reset_library_root_logger() -> None:
global _default_handler
with _lock:
if not _default_handler:
return
library_root_logger = _get_library_root_logger()
library_root_logger.removeHandler(_default_handler)
library_root_logger.setLevel(logging.NOTSET)
_default_handler = None
def get_logger(name: Optional[str] = None) -> logging.Logger:
"""
Return a logger with the specified name.
This function is not supposed to be directly accessed unless you are writing a custom transformers module.
"""
if name is None:
name = _get_library_name()
_configure_library_root_logger()
return logging.getLogger(name)
def get_verbosity() -> int:
"""
Return the current level for the 🤗 Transformers's root logger as an int.
Returns:
:obj:`int`: The logging level.
.. note::
🤗 Transformers has the following logging levels:
- 50: ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL``
- 40: ``transformers.logging.ERROR``
- 30: ``transformers.logging.WARNING`` or ``transformers.logging.WARN``
- 20: ``transformers.logging.INFO``
- 10: ``transformers.logging.DEBUG``
"""
_configure_library_root_logger()
return _get_library_root_logger().getEffectiveLevel()
def set_verbosity(verbosity: int) -> None:
"""
Set the verbosity level for the 🤗 Transformers's root logger.
Args:
verbosity (:obj:`int`):
Logging level, e.g., one of:
- ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL``
- ``transformers.logging.ERROR``
- ``transformers.logging.WARNING`` or ``transformers.logging.WARN``
- ``transformers.logging.INFO``
- ``transformers.logging.DEBUG``
"""
_configure_library_root_logger()
_get_library_root_logger().setLevel(verbosity)
def set_verbosity_info():
"""Set the verbosity to the :obj:`INFO` level."""
return set_verbosity(INFO)
def set_verbosity_warning():
"""Set the verbosity to the :obj:`WARNING` level."""
return set_verbosity(WARNING)
def set_verbosity_debug():
"""Set the verbosity to the :obj:`DEBUG` level."""
return set_verbosity(DEBUG)
def set_verbosity_error():
"""Set the verbosity to the :obj:`ERROR` level."""
return set_verbosity(ERROR)
def disable_default_handler() -> None:
"""Disable the default handler of the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert _default_handler is not None
_get_library_root_logger().removeHandler(_default_handler)
def enable_default_handler() -> None:
"""Enable the default handler of the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert _default_handler is not None
_get_library_root_logger().addHandler(_default_handler)
def add_handler(handler: logging.Handler) -> None:
"""adds a handler to the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert handler is not None
_get_library_root_logger().addHandler(handler)
def remove_handler(handler: logging.Handler) -> None:
"""removes given handler from the HuggingFace Transformers's root logger."""
_configure_library_root_logger()
assert handler is not None and handler not in _get_library_root_logger().handlers
_get_library_root_logger().removeHandler(handler)
def disable_propagation() -> None:
"""
Disable propagation of the library log outputs. Note that log propagation is disabled by default.
"""
_configure_library_root_logger()
_get_library_root_logger().propagate = False
def enable_propagation() -> None:
"""
Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to
prevent double logging if the root logger has been configured.
"""
_configure_library_root_logger()
_get_library_root_logger().propagate = True
def enable_explicit_format() -> None:
"""
Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows:
::
[LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
All handlers currently bound to the root logger are affected by this method.
"""
handlers = _get_library_root_logger().handlers
for handler in handlers:
formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
handler.setFormatter(formatter)
def reset_format() -> None:
"""
Resets the formatting for HuggingFace Transformers's loggers.
All handlers currently bound to the root logger are affected by this method.
"""
handlers = _get_library_root_logger().handlers
for handler in handlers:
handler.setFormatter(None)
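# Illustrative sketch (not part of the original file): typical use of the helpers above, raising the
# library verbosity and switching on the explicit handler format for debugging.
def _logging_example():
    set_verbosity_info()
    enable_explicit_format()
    logger = get_logger()                      # the library root logger
    logger.info("current verbosity: %d", get_verbosity())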