From 3de70fa6b6312c274ab9026bd38e85b6dc3a5939 Mon Sep 17 00:00:00 2001
From: Tim Lodemann <timalodemann@gmail.com>
Date: Mon, 23 Jun 2025 22:54:56 +0200
Subject: [PATCH] Fix splitting algorithm for unknown characters

Fix #10
---
 wordninja.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/wordninja.py b/wordninja.py
index 4d7ce15..68773f7 100644
--- a/wordninja.py
+++ b/wordninja.py
@@ -51,7 +51,28 @@ def _split(self, s):
     # Returns a pair (match_cost, match_length).
     def best_match(i):
       candidates = enumerate(reversed(cost[max(0, i-self._maxword):i]))
-      return min((c + self._wordcost.get(s[i-k-1:i].lower(), 9e999), k+1) for k,c in candidates)
+      min_cost = float('inf')
+      best_k = 0
+
+      for k, c in candidates:
+          word = s[i-k-1:i].lower()
+          word_cost = self._wordcost.get(word)
+
+          if word_cost is None:
+              if len(word) == 1:
+                  # Use a high (but not infinite) penalty for unknown single characters to allow continuation of the algorithm.
+                  word_cost = 25
+              else:
+                  # Use a very high penalty for unknown longer words to force splitting into known words.
+                  word_cost = 9e999
+
+          current_total_cost = c + word_cost
+
+          if current_total_cost < min_cost:
+              min_cost = current_total_cost
+              best_k = k + 1
+
+      return min_cost, best_k
 
     # Build the cost array.
     cost = [0]