diff --git a/UnicodeBasic.lean b/UnicodeBasic.lean index eca1c62aa..c8eead4e6 100644 --- a/UnicodeBasic.lean +++ b/UnicodeBasic.lean @@ -2,9 +2,11 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module +public import UnicodeBasic.Types +public import UnicodeBasic.TableLookup -import UnicodeBasic.Types -import UnicodeBasic.TableLookup +public section /-! # General API # diff --git a/UnicodeBasic/CharacterDatabase.lean b/UnicodeBasic/CharacterDatabase.lean index b587127b6..fa22df1ed 100644 --- a/UnicodeBasic/CharacterDatabase.lean +++ b/UnicodeBasic/CharacterDatabase.lean @@ -2,6 +2,9 @@ Copyright © 2023 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module + +public section namespace Unicode @@ -61,7 +64,3 @@ protected def next? (stream : UCDStream) : Option (Array String.Slice × UCDStre instance : Std.Stream UCDStream (Array String.Slice) where next? := UCDStream.next? - -end UCDStream - -end Unicode diff --git a/UnicodeBasic/Hangul.lean b/UnicodeBasic/Hangul.lean index e1b7f9e18..b16b276dc 100644 --- a/UnicodeBasic/Hangul.lean +++ b/UnicodeBasic/Hangul.lean @@ -2,33 +2,36 @@ Copyright © 2024 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module + +public section namespace Unicode.Hangul -private def shortNameL : Array String := +def shortNameL : Array String := #["G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"] -private def shortNameV : Array String := +def shortNameV : Array String := #["A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"] -private def shortNameT : Array String := +def shortNameT : Array String := #["", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"] -private abbrev sizeL := shortNameL.size -- 19 -private abbrev sizeV := shortNameV.size -- 21 -private abbrev sizeT := shortNameT.size -- 28 -private abbrev sizeLV := sizeL * sizeV -- 399 -private abbrev sizeVT := sizeV * sizeT -- 588 -private abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172 +abbrev sizeL := shortNameL.size -- 19 +abbrev sizeV := shortNameV.size -- 21 +abbrev sizeT := shortNameT.size -- 28 +abbrev sizeLV := sizeL * sizeV -- 399 +abbrev sizeVT := sizeV * sizeT -- 588 +abbrev sizeLVT := sizeL * sizeV * sizeT -- 11172 -private abbrev baseL : UInt32 := 0x1100 -private abbrev baseV : UInt32 := 0x1161 -private abbrev baseT : UInt32 := 0x11A7 +abbrev baseL : UInt32 := 0x1100 +abbrev baseV : UInt32 := 0x1161 +abbrev baseT : UInt32 := 0x11A7 /-- Number of Hangul syllables -/ def Syllable.size := sizeLVT @@ -112,5 +115,3 @@ def getSyllable! (code : UInt32) : Syllable := match getSyllable? code with | some s => s | none => panic! "not a Hangul syllable" - -end Unicode.Hangul diff --git a/UnicodeBasic/TableLookup.lean b/UnicodeBasic/TableLookup.lean index 2733a4bce..0251d7cde 100644 --- a/UnicodeBasic/TableLookup.lean +++ b/UnicodeBasic/TableLookup.lean @@ -2,10 +2,12 @@ Copyright © 2024-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module import UnicodeBasic.CharacterDatabase import UnicodeBasic.Hangul -import UnicodeBasic.Types +public import UnicodeBasic.Types + +public section namespace Unicode @@ -406,5 +408,3 @@ def lookupScriptName (s : Script) : Option String.Slice := where str : String := include_str "../data/table/Script_Name.txt" table : Thunk <| Array (UInt32 × String.Slice) := parseTable str fun _ n => n[0]! - -end Unicode diff --git a/UnicodeBasic/Types.lean b/UnicodeBasic/Types.lean index ee0505525..e665c4a60 100644 --- a/UnicodeBasic/Types.lean +++ b/UnicodeBasic/Types.lean @@ -2,8 +2,10 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ +module +public import Std.Data.HashMap -import Std.Data.HashMap +public section /-- Low-level conversion from `UInt32` to `Char` (*unsafe*) @@ -206,6 +208,7 @@ deriving DecidableEq /-- General category (GC) Unicode property: `General_Category` -/ +@[expose] def GC := UInt32 deriving DecidableEq, Inhabited namespace GC @@ -414,11 +417,12 @@ def toAbbrev! (x : GC) : String := | [a] => a | _ => panic! "invalid general category" -open Std.Format Repr in instance : Repr GC where - reprPrec x := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) +open Std.Format Repr in + def reprPrec (x : GC) := addAppParen (group (joinSep (reprAux x |>.map (text "Unicode.GC." ++ text ·)) (text " |||" ++ line)) .fill) + instance : Repr GC where reprPrec -instance : ToString GC where - toString x := " | ".intercalate (reprAux x) +def toString (x : GC) := " | ".intercalate (reprAux x) +instance : ToString GC where toString def ofAbbrev? (s : String.Slice) : Option GC := match s.chars.take 3 |>.toList with @@ -466,7 +470,7 @@ def ofAbbrev? (s : String.Slice) : Option GC := def ofAbbrev! (s : String.Slice) : GC := match ofAbbrev? s with - | some c => c + | .some c => c | none => panic! "invalid general category" def ofString? (s : String.Slice) : Option GC := do @@ -477,7 +481,7 @@ def ofString? (s : String.Slice) : Option GC := do def ofString! (s : String.Slice) : GC := match ofString? s with - | some c => c + | .some c => c | none => panic! "invalid general category" end GC @@ -1072,5 +1076,3 @@ def ofAbbrev? (abbr : String.Slice) : Option Script := def ofAbbrev! (abbr : String.Slice) : Script := ofAbbrev? abbr |>.get! end Script - -end Unicode diff --git a/UnicodeData.lean b/UnicodeData.lean index 5aeb9a3e2..50fbd1c25 100644 --- a/UnicodeData.lean +++ b/UnicodeData.lean @@ -3,7 +3,8 @@ Copyright © 2023-2026 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ -import UnicodeData.Aliases -import UnicodeData.Basic -import UnicodeData.PropList -import UnicodeData.Scripts +module +public import UnicodeData.Aliases +public import UnicodeData.Basic +public import UnicodeData.PropList +public import UnicodeData.Scripts diff --git a/UnicodeData/Aliases.lean b/UnicodeData/Aliases.lean index 0067ccc13..665396de0 100644 --- a/UnicodeData/Aliases.lean +++ b/UnicodeData/Aliases.lean @@ -2,11 +2,13 @@ Copyright © 2026 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - -import Std.Data.HashMap +module +public import Std.Data.HashMap import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase +public section + namespace Unicode /-- Type for list of aliases -/ diff --git a/UnicodeData/Basic.lean b/UnicodeData/Basic.lean index 3ace8974c..9d443c4b3 100644 --- a/UnicodeData/Basic.lean +++ b/UnicodeData/Basic.lean @@ -2,10 +2,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module import UnicodeBasic.CharacterDatabase import UnicodeBasic.Hangul -import UnicodeBasic.Types +public import UnicodeBasic.Types + +public section namespace Unicode @@ -143,29 +145,8 @@ def UnicodeData.mkTangutIdeograph (c : UInt32) : UnicodeData where protected def UnicodeData.txt := include_str "../data/UnicodeData.txt" /-- Parse `UnicodeData.txt` -/ -private unsafe def UnicodeData.init : IO (Array UnicodeData) := do - let stream := UCDStream.ofString UnicodeData.txt - let mut arr := #[] - for record in stream do - arr := arr.push { - code := ofHexString! record[0]! - name := record[1]! - gc := GC.ofAbbrev! record[2]! - cc := record[3]!.toNat! - bidi := BidiClass.ofAbbrev! record[4]! - decomp := getDecompositionMapping? record[5]! - numeric := getNumericType? record[6]! record[7]! record[8]! - bidiMirrored := record[9]! == "Y" - uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]! - lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]! - titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]! - } - return arr - -where - - /-- Get decomposition mapping -/ - getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do +unsafe initialize UnicodeData.data : Array UnicodeData ← + let getDecompositionMapping? (s : String.Slice) : Option DecompositionMapping := do /- The value of the `Decomposition_Mapping` property for a character is provided in field 5 of `UnicodeData.txt`. This is a string-valued @@ -218,8 +199,10 @@ where some ⟨tag, cs⟩ | [] => unreachable! - /-- Get numeric type -/ - getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do + let getDigitUnsafe (char : Char) : Fin 10 := + unsafeCast (char.val - '0'.val).toNat + + let getNumericType? (s₁ s₂ s₃ : String.Slice) : Option NumericType := do /- If the character has the property value `Numeric_Type=Decimal`, then the `Numeric_Value` of that digit is represented with an integer value @@ -263,14 +246,23 @@ where else return .decimal <| getDigitUnsafe <| s₁.front - /-- Get decimal digit -/ - @[inline] - getDigitUnsafe (char : Char) : Fin 10 := - unsafeCast (char.val - '0'.val).toNat - -/-- Parsed data from `UnicodeData.txt` -/ -@[init UnicodeData.init] -protected def UnicodeData.data : Array UnicodeData := #[] + let stream := UCDStream.ofString UnicodeData.txt + let mut arr := #[] + for record in stream do + arr := arr.push { + code := ofHexString! record[0]! + name := record[1]! + gc := GC.ofAbbrev! record[2]! + cc := record[3]!.toNat! + bidi := BidiClass.ofAbbrev! record[4]! + decomp := getDecompositionMapping? record[5]! + numeric := getNumericType? record[6]! record[7]! record[8]! + bidiMirrored := record[9]! == "Y" + uppercase := if record[12]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[12]! + lowercase := if record[13]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[13]! + titlecase := if record[14]!.isEmpty then none else some <| Char.mkUnsafe <| ofHexString! record[14]! + } + return arr /-- Get code point data from `UnicodeData.txt` -/ partial def getUnicodeData? (code : UInt32) : Option UnicodeData := do @@ -370,12 +362,12 @@ structure UnicodeDataStream where default : UInt32 → UnicodeData := UnicodeData.mkNoncharacter deriving Inhabited -private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do +def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeData × UnicodeDataStream) := do let c := s.code let i := s.index if c > Unicode.max then none - else if h : i < UnicodeData.data.size.toUSize then + else if h : i.toNat < UnicodeData.data.size then let d := UnicodeData.data[i] let n := d.name if c < d.code then @@ -410,5 +402,3 @@ private def UnicodeDataStream.next? (s : UnicodeDataStream) : Option (UnicodeDat instance : Std.Stream UnicodeDataStream UnicodeData where next? := UnicodeDataStream.next? - -end Unicode diff --git a/UnicodeData/PropList.lean b/UnicodeData/PropList.lean index e43cd62e0..8a2133be8 100644 --- a/UnicodeData/PropList.lean +++ b/UnicodeData/PropList.lean @@ -2,10 +2,12 @@ Copyright © 2023-2025 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - +module import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase +public section + namespace Unicode /-- Structure containing supported character properties from `PropList.txt` -/ @@ -35,7 +37,7 @@ deriving Inhabited, Repr /-- Raw string form `PropList.txt` -/ protected def PropList.txt := include_str "../data/PropList.txt" -private unsafe def PropList.init : IO PropList := do +unsafe initialize PropList.data : PropList ← let stream := UCDStream.ofString PropList.txt let mut list : PropList := {} for record in stream do @@ -66,10 +68,6 @@ private unsafe def PropList.init : IO PropList := do list := {list with deprecated := list.deprecated.push val} return list -/-- Parsed data from `PropList.txt` -/ -@[init PropList.init] -protected def PropList.data : PropList := {} - -- TODO: stop reinventing the wheel! /-- Binary search -/ private partial def find (code : UInt32) (data : Array (UInt32 × Option UInt32)) (lo hi : Nat) : Nat := @@ -174,5 +172,3 @@ def PropList.isDeprecated (code : UInt32) : Bool := match data[find code data 0 data.size]! with | (val, none) => code == val | (_, some top) => code <= top - -end Unicode diff --git a/UnicodeData/Scripts.lean b/UnicodeData/Scripts.lean index 52ab9f58f..221d22325 100644 --- a/UnicodeData/Scripts.lean +++ b/UnicodeData/Scripts.lean @@ -2,11 +2,12 @@ Copyright © 2026 François G. Dorais. All rights reserved. Released under Apache 2.0 license as described in the file LICENSE. -/ - -import Std.Data.HashMap +module +public import UnicodeData.Aliases import UnicodeBasic.Types import UnicodeBasic.CharacterDatabase -import UnicodeData.Aliases + +public section namespace Unicode