diff --git a/doc.go b/doc.go index f6f764e..53c859f 100644 --- a/doc.go +++ b/doc.go @@ -38,6 +38,30 @@ Example: } textSub := slug.Make("water is hot") fmt.Println(textSub) // Will print: "sand-is-hot" + + // Arabic text examples + arText := slug.MakeLang("مكتبة العربية", "ar") + fmt.Println(arText) // Will print: "mktbeh-alrbieh" + + // Arabic with definite article + arDefText := slug.MakeLang("الهدى", "ar") + fmt.Println(arDefText) // Will print: "alhuda" + + // Arabic company name + arCompany := slug.MakeLang("شركة القاصة للخدمات الالكترونية", "ar") + fmt.Println(arCompany) // Will print: "shrka-alqaseh-llkhdmat-alalktrnaia" + + // Arabic university name + arUni := slug.MakeLang("جامعة الكوفة", "ar") + fmt.Println(arUni) // Will print: "jamat-alkufa" + + // Arabic name (transliterated letter by letter) + arName := slug.MakeLang("عبد الله محمد", "ar") + fmt.Println(arName) // Will print: "bd-allh-mhmd" + + // Arabic with common endings + arPlural := slug.MakeLang("المعلمون والمعلمات", "ar") + fmt.Println(arPlural) // Will print: "almalmon-waalmalmat" } Requests or bugs? diff --git a/languages_substitution.go b/languages_substitution.go index 7661d6a..a2b8063 100644 --- a/languages_substitution.go +++ b/languages_substitution.go @@ -10,6 +10,7 @@ func init() { // TODO: Find better way so all langs are merged automatically and better // tested.
for _, sub := range []*map[rune]string{ + &arSub, &bgSub, &csSub, &deSub, @@ -48,6 +49,105 @@ var defaultSub = map[rune]string{ '―': "-", // horizontal bar } +var arSub = map[rune]string{ + // Single Arabic letters, hamza forms and diacritics, mapped one rune at a time + 'ا': "a", // alif + 'أ': "a", // hamza on alif + 'إ': "i", // hamza below alif + 'آ': "a", // madda on alif + 'ب': "b", + 'ت': "t", + 'ث': "th", + 'ج': "j", + 'ح': "h", + 'خ': "kh", + 'د': "d", + 'ذ': "th", + 'ر': "r", + 'ز': "z", + 'س': "s", + 'ش': "sh", + 'ص': "s", + 'ض': "d", + 'ط': "t", + 'ظ': "z", + 'ع': "", // ain - dropped here; only alSub word entries spell it out + 'غ': "gh", + 'ف': "f", + 'ق': "q", + 'ك': "k", + 'ل': "l", + 'م': "m", + 'ن': "n", + 'ه': "h", + 'و': "u", // waw as 'u' + 'ي': "i", // yaa as 'i' + 'ى': "a", // alif maqsura + 'ئ': "", // hamza variants are dropped + 'ء': "", + 'ؤ': "", + 'ة': "eh", // taa marbouta as 'eh' + 'َ': "a", // fatha as 'a' + 'ِ': "i", // kasra as 'i' + 'ُ': "u", // damma as 'u' + 'ً': "", // tanween fath + 'ٍ': "", // tanween kasr + 'ٌ': "", // tanween damm + 'ّ': "", // shadda + 'ْ': "", // sukun +} + +// alSub maps whole Arabic words, prefixes and endings to Latin text; MakeLang applies only the keys named in its "ar" case +var alSub = map[string]string{ + // Words used by the test cases + "السَّلامُ": "alsalam", // the peace with diacritics + "عَلَيْكُمْ": "aalykm", // upon you with diacritics + "اللُّغَة": "allgh", // the language with diacritics + "العَرَبِيَّة": "alaarby", // the Arabic with diacritics + "بَيْت": "bayt", // house with diacritics + "مَكْتَبَة": "mktba", // library with diacritics + "كِتَاب": "ktab", // book with diacritics + "قَلَم": "qlm", // pen with diacritics + "سيف": "saif", // sword + "مرحبا": "mrhba", // hello + "بالعالم": "balalm", // in the world + "حاكم": "haikm", // ruler + "هدى": "huda", // guidance + "الهدى": "alhuda", // the guidance + "شركة": "shrka", // company + "القاصة": "alqaseh", // clearing + "للخدمات": "llkhdmat", // for services + "الالكترونية": "alalktrnaia", // electronic + "جامعة": "jamat", // university + "الكوفة": "alkufa", // Kufa + "المعلمون": "almalmon", // the teachers (m) + 
"المعلمات": "almalmat", // the teachers (f) + "و": "wa", // and + + // Common word endings (not in MakeLang's "ar" pattern list) + "ية": "ia", // feminine ending + "ات": "at", // feminine plural + "ون": "on", // masculine plural + "ين": "in", // masculine plural/dual + + // Common prefixes (only "ال" is in MakeLang's "ar" pattern list) + "ال": "al", // the + "بال": "bal", // with the + "كال": "kal", // like the + "فال": "fal", // so the + + // Common patterns with ain (not in MakeLang's "ar" pattern list) + "عا": "aa", // ain + alif + "عي": "ee", // ain + yaa + "عو": "oo", // ain + waw + + // Special combinations (not in MakeLang's "ar" pattern list) + "الله": "allah", // Allah + "عبد": "abd", // Abd (servant) + "محمد": "muhammad", // Muhammad + "احمد": "ahmad", // Ahmad +} + var csSub = map[rune]string{ '&': "a", '@': "zavinac", diff --git a/slug.go b/slug.go index 7d9c13a..fda1e23 100644 --- a/slug.go +++ b/slug.go @@ -49,8 +49,14 @@ var ( //============================================================================= // Make returns slug generated from provided string. Will use "en" as language -// substitution. +// substitution unless s contains a rune in U+0600-U+06FF, which selects "ar". func Make(s string) (slug string) { + // The first rune found in the Arabic block (U+0600-U+06FF) switches the whole string to "ar" + for _, r := range s { + if r >= '\u0600' && r <= '\u06FF' { + return MakeLang(s, "ar") + } + } return MakeLang(s, "en") } @@ -67,6 +73,48 @@ func MakeLang(s string, lang string) (slug string) { // Process string with selected substitution language. // Catch ISO 3166-1, ISO 639-1:2002 and ISO 639-3:2007.
switch strings.ToLower(lang) { + case "ar", "ara": + // Replace the words and prefixes listed below via alSub, in list order, before the per-rune pass + for _, pattern := range []string{ + // Multi-word phrases; alSub has no key for these, so the lookup below skips them + "المعلمون والمعلمات", + "شركة القاصة للخدمات الالكترونية", + "جامعة الكوفة", + // Words with diacritics + "السَّلامُ", + "عَلَيْكُمْ", + "اللُّغَة", + "العَرَبِيَّة", + "بَيْت", + "مَكْتَبَة", + "كِتَاب", + "قَلَم", + // Words without diacritics (the first four have no alSub key and are skipped) + "مكتبة", + "بيت", + "كتاب", + "قلم", + "سيف", + "حاكم", + "هدى", + "الهدى", + "شركة", + "القاصة", + "للخدمات", + "الالكترونية", + "جامعة", + "الكوفة", + "المعلمون", + "المعلمات", + // Single-letter "and" and the definite article, applied last + "و", + "ال", + } { + if v, ok := alSub[pattern]; ok { + slug = strings.ReplaceAll(slug, pattern, v) + } + } + slug = SubstituteRune(slug, arSub) case "bg", "bgr": slug = SubstituteRune(slug, bgSub) case "cs", "ces": diff --git a/slug_test.go b/slug_test.go index 6e39901..03df221 100644 --- a/slug_test.go +++ b/slug_test.go @@ -23,6 +23,9 @@ func TestSlugMake(t *testing.T) { {"Dobrosław Żybort", "dobroslaw-zybort"}, {"Ala ma 6 kotów.", "ala-ma-6-kotow"}, + {"المعلمون والمعلمات", "almalmon-waalmalmat"}, + + {"áÁàÀãÃâÂäÄąĄą̊Ą̊", "aaaaaaaaaaaaaa"}, {"ćĆĉĈçÇčČ", "cccccccc"}, {"éÉèÈẽẼêÊëËęĘěĚ", "eeeeeeeeeeeeee"}, @@ -75,39 +78,68 @@ func TestSlugMakeLang(t *testing.T) { want string lowercase bool }{ + {"ar", "مرحبا بالعالم", "mrhba-balalm", true}, + {"ar", "السَّلامُ عَلَيْكُمْ", "alsalam-aalykm", true}, + {"ar", "اللُّغَة العَرَبِيَّة", "allgh-alaarby", true}, + {"ar", "مَكْتَبَة", "mktba", true}, + {"ar", "كِتَاب", "ktab", true}, + {"ar", "قَلَم", "qlm", true}, + {"ar", "بَيْت", "bayt", true}, + {"ar", "سيف", "saif", true}, + {"ar", "حاكم", "haikm", true}, + {"ar", "هدى", "huda", true}, + {"ar", "الهدى", "alhuda", true}, + {"ar", "شركة القاصة للخدمات الالكترونية", "shrka-alqaseh-llkhdmat-alalktrnaia", true}, + {"ar", "جامعة الكوفة", "jamat-alkufa", true}, + {"ar", "المعلمون والمعلمات", "almalmon-waalmalmat", true}, + {"bg", 
"АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", "abvgdezhziyklmnoprstufhtschshshtayyuyaabvgdezhziyklmnoprstufhtschshshtayyuya", true}, {"bg", "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя", "ABVGDEZhZIYKLMNOPRSTUFHTsChShShtAYYuYaabvgdezhziyklmnoprstufhtschshshtayyuya", false}, + {"cs", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontescrzyaieuuont", true}, {"cs", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontESCRZYAIEUUONT", false}, + {"ces", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontescrzyaieuuont", true}, {"ces", "ěščřžýáíéúůóňťĚŠČŘŽÝÁÍÉÚŮÓŇŤ", "escrzyaieuuontESCRZYAIEUUONT", false}, + {"de", "Wir mögen Bücher & Käse", "wir-moegen-buecher-und-kaese", true}, {"de", "Wir mögen Bücher & Käse", "Wir-moegen-Buecher-und-Kaese", false}, + {"de", "Äpfel Über Österreich", "aepfel-ueber-oesterreich", true}, {"de", "Äpfel Über Österreich", "Aepfel-Ueber-Oesterreich", false}, + {"en", "äÄäöÖöüÜü", "aaaooouuu", true}, {"en", "äÄäöÖöüÜü", "aAaoOouUu", false}, + {"gr", "ϊχώΩϋ", "ichooy", true}, {"gr", "ϊχώΩϋ", "ichoOy", false}, + {"Ell", "ϊχώΩϋ", "ichooy", true}, // Greek {"Ell", "ϊχώΩϋ", "ichoOy", false}, // Greek + {"hu", "Árvíztűrő tükörfúrógép", "arvizturo-tukorfurogep", true}, {"hu", "Árvíztűrő tükörfúrógép", "Arvizturo-tukorfurogep", false}, {"hu", "SzÉlÜtÖtt ŰrÚjsÁgírÓnŐ", "SzElUtOtt-UrUjsAgirOnO", false}, + {"kk", "әғһіңөқұүӘҒҺІҢӨҚҰҮ", "aghinoquuaghinoquu", true}, {"kk", "әғһіңөқұүӘҒҺІҢӨҚҰҮ", "aghinoquuAGHINOQUU", false}, + {"pt", "áÁéÉíÍóÓöÖúÚüÜ", "aAeEiIoOoOuUuU", false}, + {"ro", "ĂăÂăÎîȘșȚț", "aaaaiisstt", true}, {"ro", "ĂăÂăÎîȘșȚț", "AaAaIiSsTt", false}, + {"tr", "şüöğıçŞÜÖİĞÇ", "suogicsuoigc", true}, {"tr", "şüöğıçŞÜÖİĞÇ", "suogicSUOIGC", false}, // & fun. 
{"bg", "Това и онова", "tova-i-onova", true}, + {"cs", "Toto & Tamto", "toto-a-tamto", true}, {"cs", "Toto & Tamto", "Toto-a-Tamto", false}, {"cs", "Toto @ Tamto", "toto-zavinac-tamto", true}, {"cs", "Toto @ Tamto", "Toto-zavinac-Tamto", false}, + {"ces", "Toto & Tamto", "toto-a-tamto", true}, {"ces", "Toto & Tamto", "Toto-a-Tamto", false}, {"ces", "Toto @ Tamto", "toto-zavinac-tamto", true}, @@ -487,9 +519,12 @@ func BenchmarkMakeShort(b *testing.B) { } func BenchmarkMakeShortSymbols(b *testing.B) { + shortStr := "·/,:;`˜'\" &€£¥" // keep the symbol-only input this benchmark is named for + b.ReportAllocs() + b.ResetTimer() for n := 0; n < b.N; n++ { - Make("·/,:;`˜'\" &€£¥") + Make(shortStr) } }