From 82f7f8a2374aaa5c2ce345c63eb12f5b4735379f Mon Sep 17 00:00:00 2001 From: "TF.Text Team" Date: Thu, 30 Oct 2025 11:38:03 -0700 Subject: [PATCH] 1. Ignore SentencePiece::BYTE during encoding instead of throwing error 2. Early exit from DecodePrecompiledCharsmap when precompiled_charsmap is empty PiperOrigin-RevId: 826120506 --- tensorflow_text/core/kernels/sentencepiece/BUILD | 1 + tensorflow_text/core/kernels/sentencepiece/model_converter.cc | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/tensorflow_text/core/kernels/sentencepiece/BUILD b/tensorflow_text/core/kernels/sentencepiece/BUILD index 473dc1c6e..db2792090 100644 --- a/tensorflow_text/core/kernels/sentencepiece/BUILD +++ b/tensorflow_text/core/kernels/sentencepiece/BUILD @@ -10,6 +10,7 @@ licenses(["notice"]) # Visibility rules package(default_visibility = [ + "//java/com/google/android/apps/pixel/psi:__subpackages__", "//visibility:public", ]) diff --git a/tensorflow_text/core/kernels/sentencepiece/model_converter.cc b/tensorflow_text/core/kernels/sentencepiece/model_converter.cc index 6eb7b4b05..61814ce33 100644 --- a/tensorflow_text/core/kernels/sentencepiece/model_converter.cc +++ b/tensorflow_text/core/kernels/sentencepiece/model_converter.cc @@ -46,6 +46,9 @@ DecodePrecompiledCharsmap( const ::sentencepiece::NormalizerSpec& normalizer_spec) { // This function "undoes" encoding done by // sentencepiece::normalizer::Normalizer::EncodePrecompiledCharsMap. + if (normalizer_spec.precompiled_charsmap().empty()) { + return std::make_tuple(std::vector(), std::vector()); + } const char* precompiled_map = normalizer_spec.precompiled_charsmap().data(); const uint32_t trie_size = *reinterpret_cast(precompiled_map); @@ -89,6 +92,7 @@ absl::StatusOr ConvertSentencepieceModelToFlatBuffer( break; case ::sentencepiece::ModelProto::SentencePiece::UNKNOWN: case ::sentencepiece::ModelProto::SentencePiece::CONTROL: + case ::sentencepiece::ModelProto::SentencePiece::BYTE: // Ignore unknown and control codes. break; default: