diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index ef8bf4414..8a73e9cd6 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -16,7 +16,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -52,6 +52,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest]
+        python: ["3.13", "3.13t"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -72,7 +73,7 @@
       - name: Install Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.13
+          python-version: ${{ matrix.python }}
           architecture: "x64"
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 28f6b38d4..e0b546288 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -557,6 +557,44 @@ def test_multiprocessing_with_parallelism(self):
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
 
+    def test_multithreaded_concurrency(self):
+
+        # Thread worker functions
+        def encode_batch(batch):
+            tokenizer = Tokenizer(BPE())
+            return tokenizer.encode_batch(batch)
+
+        def encode_batch_fast(batch):
+            tokenizer = Tokenizer(BPE())
+            return tokenizer.encode_batch_fast(batch)
+
+        # Create some significant workload
+        batches = [
+            ["my name is john " * 50] * 20,
+            ["my name is paul " * 50] * 20,
+            ["my name is ringo " * 50] * 20,
+        ]
+
+        # Many encoding operations to run concurrently
+        tasks = [
+            (encode_batch, batches[0]),
+            (encode_batch_fast, batches[1]),
+            (encode_batch, batches[2]),
+        ] * 10
+
+        executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
+
+        futures = []
+        for task in tasks:
+            futures.append(executor.submit(*task))
+
+        # All tasks should complete successfully
+        results = [f.result() for f in futures]
+
+        # Verify results
+        assert len(results) == 30
+        assert all(len(result) == 20 for result in results)
+
     def test_from_pretrained(self):
         tokenizer = Tokenizer.from_pretrained("bert-base-cased")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)