Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/transformers/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@
PushToHubMixin,
cached_file,
copy_func,
download_url,
extract_commit_hash,
is_remote_url,
is_torch_available,
logging,
)
Expand Down Expand Up @@ -659,9 +657,6 @@ def _get_config_dict(
# Special case when pretrained_model_name_or_path is a local file
resolved_config_file = pretrained_model_name_or_path
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
configuration_file = pretrained_model_name_or_path if gguf_file is None else gguf_file
resolved_config_file = download_url(pretrained_model_name_or_path)
else:
configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME) if gguf_file is None else gguf_file

Expand Down
6 changes: 0 additions & 6 deletions src/transformers/feature_extraction_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,8 @@
PushToHubMixin,
TensorType,
copy_func,
download_url,
is_numpy_array,
is_offline_mode,
is_remote_url,
is_torch_available,
is_torch_device,
is_torch_dtype,
Expand Down Expand Up @@ -430,10 +428,6 @@ def get_feature_extractor_dict(
resolved_feature_extractor_file = pretrained_model_name_or_path
resolved_processor_file = None
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
feature_extractor_file = pretrained_model_name_or_path
resolved_processor_file = None
resolved_feature_extractor_file = download_url(pretrained_model_name_or_path)
else:
feature_extractor_file = FEATURE_EXTRACTOR_NAME
try:
Expand Down
5 changes: 0 additions & 5 deletions src/transformers/generation/configuration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@
ExplicitEnum,
PushToHubMixin,
cached_file,
download_url,
extract_commit_hash,
is_remote_url,
is_torch_available,
logging,
)
Expand Down Expand Up @@ -872,9 +870,6 @@ def from_pretrained(
# Special case when config_path is a local file
resolved_config_file = config_path
is_local = True
elif is_remote_url(config_path):
configuration_file = config_path
resolved_config_file = download_url(config_path)
else:
configuration_file = config_file_name
try:
Expand Down
6 changes: 0 additions & 6 deletions src/transformers/image_processing_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@
PROCESSOR_NAME,
PushToHubMixin,
copy_func,
download_url,
is_offline_mode,
is_remote_url,
logging,
safe_load_json_file,
)
Expand Down Expand Up @@ -283,10 +281,6 @@ def get_image_processor_dict(
resolved_image_processor_file = pretrained_model_name_or_path
resolved_processor_file = None
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
image_processor_file = pretrained_model_name_or_path
resolved_processor_file = None
resolved_image_processor_file = download_url(pretrained_model_name_or_path)
else:
image_processor_file = image_processor_filename
try:
Expand Down
5 changes: 0 additions & 5 deletions src/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,12 @@
cached_file,
check_torch_load_is_safe,
copy_func,
download_url,
has_file,
is_accelerate_available,
is_flash_attn_2_available,
is_flash_attn_3_available,
is_kernels_available,
is_offline_mode,
is_remote_url,
is_torch_flex_attn_available,
is_torch_greater_or_equal,
is_torch_mlu_available,
Expand Down Expand Up @@ -531,9 +529,6 @@ def _get_resolved_checkpoint_files(
elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
archive_file = pretrained_model_name_or_path
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
filename = pretrained_model_name_or_path
resolved_archive_file = download_url(pretrained_model_name_or_path)
else:
# set correct filename
if transformers_explicit_filename is not None:
Expand Down
9 changes: 0 additions & 9 deletions src/transformers/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,7 @@
cached_file,
copy_func,
direct_transformers_import,
download_url,
is_offline_mode,
is_remote_url,
is_torch_available,
list_repo_templates,
logging,
Expand Down Expand Up @@ -940,13 +938,6 @@ def get_processor_dict(
resolved_raw_chat_template_file = None
resolved_audio_tokenizer_file = None
is_local = True
elif is_remote_url(pretrained_model_name_or_path):
processor_file = pretrained_model_name_or_path
resolved_processor_file = download_url(pretrained_model_name_or_path)
# can't load chat-template and audio tokenizer when given a file url as pretrained_model_name_or_path
resolved_chat_template_file = None
resolved_raw_chat_template_file = None
resolved_audio_tokenizer_file = None
else:
if is_local:
template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR)
Expand Down
157 changes: 68 additions & 89 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@
add_end_docstrings,
cached_file,
copy_func,
download_url,
extract_commit_hash,
is_mlx_available,
is_numpy_array,
is_offline_mode,
is_protobuf_available,
is_remote_url,
is_tokenizers_available,
is_torch_available,
is_torch_device,
Expand Down Expand Up @@ -2010,94 +2008,77 @@ def from_pretrained(

is_local = os.path.isdir(pretrained_model_name_or_path)
single_file_id = None
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
if len(cls.vocab_files_names) > 1 and not gguf_file:
raise ValueError(
f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
"supported for this tokenizer. Use a model identifier or the path to a directory instead."
)
warnings.warn(
f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and "
"won't be possible anymore in v5. Use a model identifier or the path to a directory instead.",
FutureWarning,
)
file_id = list(cls.vocab_files_names.keys())[0]

vocab_files[file_id] = pretrained_model_name_or_path
single_file_id = file_id
if gguf_file:
vocab_files["vocab_file"] = gguf_file
else:
if gguf_file:
vocab_files["vocab_file"] = gguf_file
else:
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE, # kept only for legacy
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, # kept only for legacy
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
# tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
"tokenizer_file": FULL_TOKENIZER_FILE,
"chat_template_file": CHAT_TEMPLATE_FILE,
}

vocab_files = {**cls.vocab_files_names, **additional_files_names}
if "tokenizer_file" in vocab_files:
# Try to get the tokenizer config to see if there are versioned tokenizer files.
fast_tokenizer_file = FULL_TOKENIZER_FILE

try:
resolved_config_file = cached_file(
pretrained_model_name_or_path,
TOKENIZER_CONFIG_FILE,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
token=token,
revision=revision,
local_files_only=local_files_only,
subfolder=subfolder,
user_agent=user_agent,
_raise_exceptions_for_missing_entries=False,
_commit_hash=commit_hash,
)
except OSError:
# Re-raise any error raised by cached_file in order to get a helpful error message
raise
except Exception:
# For any other exception, we throw a generic error.
raise OSError(
f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
"'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
f"containing all relevant files for a {cls.__name__} tokenizer."
)
# At this point pretrained_model_name_or_path is either a directory or a model identifier name
additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE, # kept only for legacy
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, # kept only for legacy
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
# tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
"tokenizer_file": FULL_TOKENIZER_FILE,
"chat_template_file": CHAT_TEMPLATE_FILE,
}

commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
if resolved_config_file is not None:
with open(resolved_config_file, encoding="utf-8") as reader:
tokenizer_config = json.load(reader)
if "fast_tokenizer_files" in tokenizer_config:
fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
vocab_files["tokenizer_file"] = fast_tokenizer_file

# This block looks for any extra chat template files
if is_local:
template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR)
if template_dir.is_dir():
for template_file in template_dir.glob("*.jinja"):
template_name = template_file.name.removesuffix(".jinja")
vocab_files[f"chat_template_{template_name}"] = (
f"{CHAT_TEMPLATE_DIR}/{template_file.name}"
)
else:
for template in list_repo_templates(
pretrained_model_name_or_path,
local_files_only=local_files_only,
revision=revision,
cache_dir=cache_dir,
token=token,
):
template = template.removesuffix(".jinja")
vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"
vocab_files = {**cls.vocab_files_names, **additional_files_names}
if "tokenizer_file" in vocab_files:
# Try to get the tokenizer config to see if there are versioned tokenizer files.
fast_tokenizer_file = FULL_TOKENIZER_FILE

try:
resolved_config_file = cached_file(
pretrained_model_name_or_path,
TOKENIZER_CONFIG_FILE,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
token=token,
revision=revision,
local_files_only=local_files_only,
subfolder=subfolder,
user_agent=user_agent,
_raise_exceptions_for_missing_entries=False,
_commit_hash=commit_hash,
)
except OSError:
# Re-raise any error raised by cached_file in order to get a helpful error message
raise
except Exception:
# For any other exception, we throw a generic error.
raise OSError(
f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from "
"'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
f"containing all relevant files for a {cls.__name__} tokenizer."
)

commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
if resolved_config_file is not None:
with open(resolved_config_file, encoding="utf-8") as reader:
tokenizer_config = json.load(reader)
if "fast_tokenizer_files" in tokenizer_config:
fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
vocab_files["tokenizer_file"] = fast_tokenizer_file

# This block looks for any extra chat template files
if is_local:
template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR)
if template_dir.is_dir():
for template_file in template_dir.glob("*.jinja"):
template_name = template_file.name.removesuffix(".jinja")
vocab_files[f"chat_template_{template_name}"] = f"{CHAT_TEMPLATE_DIR}/{template_file.name}"
else:
for template in list_repo_templates(
pretrained_model_name_or_path,
local_files_only=local_files_only,
revision=revision,
cache_dir=cache_dir,
token=token,
):
template = template.removesuffix(".jinja")
vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

if not is_local and not local_files_only:
try:
Expand All @@ -2121,8 +2102,6 @@ def from_pretrained(
elif single_file_id == file_id:
if os.path.isfile(file_path):
resolved_vocab_files[file_id] = file_path
elif is_remote_url(file_path):
resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
else:
try:
resolved_vocab_files[file_id] = cached_file(
Expand Down
2 changes: 0 additions & 2 deletions src/transformers/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,10 @@
cached_file,
default_cache_path,
define_sagemaker_information,
download_url,
extract_commit_hash,
has_file,
http_user_agent,
is_offline_mode,
is_remote_url,
list_repo_templates,
try_to_load_from_cache,
)
Expand Down
Loading