-
-
Notifications
You must be signed in to change notification settings - Fork 14.7k
Add APIs for dealing with titlecase #122668
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -778,7 +778,72 @@ impl char { | |
| pub fn is_alphabetic(self) -> bool { | ||
| match self { | ||
| 'a'..='z' | 'A'..='Z' => true, | ||
| c => c > '\x7f' && unicode::Alphabetic(c), | ||
| '\0'..='\u{A9}' => false, | ||
| _ => unicode::Alphabetic(self), | ||
| } | ||
| } | ||
|
|
||
| /// Returns `true` if this `char` has the `Cased` property. | ||
| /// A character is cased if and only if it is uppercase, lowercase, or titlecase. | ||
| /// | ||
| /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and | ||
| /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | ||
| /// | ||
| /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | ||
| /// [ucd]: https://www.unicode.org/reports/tr44/ | ||
| /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// Basic usage: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// assert!('A'.is_cased()); | ||
| /// assert!('a'.is_cased()); | ||
| /// assert!(!'京'.is_cased()); | ||
| /// ``` | ||
| #[must_use] | ||
| #[unstable(feature = "titlecase", issue = "153892")] | ||
| #[inline] | ||
| pub fn is_cased(self) -> bool { | ||
| match self { | ||
| 'a'..='z' | 'A'..='Z' => true, | ||
| '\0'..='\u{A9}' => false, | ||
| _ => unicode::Cased(self), | ||
| } | ||
| } | ||
|
|
||
| /// Returns the case of this character: | ||
| /// [`Some(CharCase::Upper)`][`CharCase::Upper`] if [`self.is_uppercase()`][`char::is_uppercase`], | ||
| /// [`Some(CharCase::Lower)`][`CharCase::Lower`] if [`self.is_lowercase()`][`char::is_lowercase`], | ||
| /// [`Some(CharCase::Title)`][`CharCase::Title`] if [`self.is_titlecase()`][`char::is_titlecase`], and | ||
| /// `None` if [`!self.is_cased()`][`char::is_cased`]. | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// use core::char::CharCase; | ||
| /// assert_eq!('a'.case(), Some(CharCase::Lower)); | ||
| /// assert_eq!('δ'.case(), Some(CharCase::Lower)); | ||
| /// assert_eq!('A'.case(), Some(CharCase::Upper)); | ||
| /// assert_eq!('Δ'.case(), Some(CharCase::Upper)); | ||
| /// assert_eq!('Dž'.case(), Some(CharCase::Title)); | ||
| /// assert_eq!('中'.case(), None); | ||
| /// ``` | ||
| #[must_use] | ||
| #[unstable(feature = "titlecase", issue = "153892")] | ||
| #[inline] | ||
| pub fn case(self) -> Option<CharCase> { | ||
| match self { | ||
| 'a'..='z' => Some(CharCase::Lower), | ||
| 'A'..='Z' => Some(CharCase::Upper), | ||
| '\0'..='\u{A9}' => None, | ||
| _ if !unicode::Cased(self) => None, | ||
| _ if unicode::Lowercase(self) => Some(CharCase::Lower), | ||
| _ if unicode::Uppercase(self) => Some(CharCase::Upper), | ||
| _ => Some(CharCase::Title), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -819,7 +884,42 @@ impl char { | |
| pub const fn is_lowercase(self) -> bool { | ||
| match self { | ||
| 'a'..='z' => true, | ||
| c => c > '\x7f' && unicode::Lowercase(c), | ||
| '\0'..='\u{A9}' => false, | ||
| _ => unicode::Lowercase(self), | ||
| } | ||
| } | ||
|
|
||
| /// Returns `true` if this `char` has the general category for titlecase letters. | ||
| /// Conceptually, these characters consist of an uppercase portion followed by a lowercase portion. | ||
|
Comment on lines
+892
to
+893
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What does it mean to "have a category"? Something can "have a property" or "be in a category", but the wording used here doesn't seem grammatical to me.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. "Has the Unicode property assignment |
||
| /// | ||
| /// Titlecase letters (code points with the general category of `Lt`) are described in Chapter 4 | ||
| /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character | ||
| /// Database][ucd] [`UnicodeData.txt`]. | ||
| /// | ||
| /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | ||
| /// [ucd]: https://www.unicode.org/reports/tr44/ | ||
| /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// Basic usage: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// assert!('Dž'.is_titlecase()); | ||
| /// assert!('ῼ'.is_titlecase()); | ||
| /// assert!(!'D'.is_titlecase()); | ||
| /// assert!(!'z'.is_titlecase()); | ||
| /// assert!(!'中'.is_titlecase()); | ||
| /// assert!(!' '.is_titlecase()); | ||
| /// ``` | ||
| #[must_use] | ||
| #[unstable(feature = "titlecase", issue = "153892")] | ||
| #[inline] | ||
| pub fn is_titlecase(self) -> bool { | ||
| match self { | ||
| '\0'..='\u{01C4}' => false, | ||
| _ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -860,7 +960,8 @@ impl char { | |
| pub const fn is_uppercase(self) -> bool { | ||
| match self { | ||
| 'A'..='Z' => true, | ||
| c => c > '\x7f' && unicode::Uppercase(c), | ||
| '\0'..='\u{BF}' => false, | ||
| _ => unicode::Uppercase(self), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -893,7 +994,8 @@ impl char { | |
| pub const fn is_whitespace(self) -> bool { | ||
| match self { | ||
| ' ' | '\x09'..='\x0d' => true, | ||
| c => c > '\x7f' && unicode::White_Space(c), | ||
| '\0'..='\u{84}' => false, | ||
| _ => unicode::White_Space(self), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -920,10 +1022,10 @@ impl char { | |
| #[stable(feature = "rust1", since = "1.0.0")] | ||
| #[inline] | ||
| pub fn is_alphanumeric(self) -> bool { | ||
| if self.is_ascii() { | ||
| self.is_ascii_alphanumeric() | ||
| } else { | ||
| unicode::Alphabetic(self) || unicode::N(self) | ||
| match self { | ||
| 'a'..='z' | 'A'..='Z' | '0'..='9' => true, | ||
| '\0'..='\u{A9}' => false, | ||
| _ => unicode::Alphabetic(self) || unicode::N(self), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -969,23 +1071,7 @@ impl char { | |
| #[must_use] | ||
| #[inline] | ||
| pub(crate) fn is_grapheme_extended(self) -> bool { | ||
| !self.is_ascii() && unicode::Grapheme_Extend(self) | ||
| } | ||
|
|
||
| /// Returns `true` if this `char` has the `Cased` property. | ||
| /// | ||
| /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and | ||
| /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | ||
| /// | ||
| /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | ||
| /// [ucd]: https://www.unicode.org/reports/tr44/ | ||
| /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | ||
| #[must_use] | ||
| #[inline] | ||
| #[doc(hidden)] | ||
| #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] | ||
| pub fn is_cased(self) -> bool { | ||
| if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) } | ||
| self > '\u{02FF}' && unicode::Grapheme_Extend(self) | ||
| } | ||
|
|
||
| /// Returns `true` if this `char` has the `Case_Ignorable` property. | ||
|
|
@@ -1047,7 +1133,8 @@ impl char { | |
| pub fn is_numeric(self) -> bool { | ||
| match self { | ||
| '0'..='9' => true, | ||
| c => c > '\x7f' && unicode::N(c), | ||
| '\0'..='\u{B1}' => false, | ||
| _ => unicode::N(self), | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1110,17 +1197,123 @@ impl char { | |
| /// // convert into themselves. | ||
| /// assert_eq!('山'.to_lowercase().to_string(), "山"); | ||
| /// ``` | ||
| #[must_use = "this returns the lowercase character as a new iterator, \ | ||
| #[must_use = "this returns the lowercased character as a new iterator, \ | ||
| without modifying the original"] | ||
| #[stable(feature = "rust1", since = "1.0.0")] | ||
| #[inline] | ||
| pub fn to_lowercase(self) -> ToLowercase { | ||
| ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) | ||
| } | ||
|
|
||
| /// Returns an iterator that yields the titlecase mapping of this `char` as one or more | ||
| /// `char`s. | ||
| /// | ||
| /// This is usually, but not always, equivalent to the uppercase mapping | ||
| /// returned by [`Self::to_uppercase`]. Prefer this method when seeking to capitalize | ||
| /// Only The First Letter of a word, but use [`Self::to_uppercase`] for ALL CAPS. | ||
| /// | ||
| /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`. | ||
Jules-Bertholet marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| /// | ||
| /// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character | ||
| /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. | ||
| /// | ||
| /// [ucd]: https://www.unicode.org/reports/tr44/ | ||
| /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | ||
| /// | ||
| /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields | ||
| /// the `char`(s) given by [`SpecialCasing.txt`]. | ||
| /// | ||
| /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt | ||
| /// | ||
| /// This operation performs an unconditional mapping without tailoring. That is, the conversion | ||
| /// is independent of context and language. | ||
| /// | ||
| /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in | ||
| /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. | ||
| /// | ||
| /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | ||
| /// | ||
| /// # Examples | ||
| /// | ||
| /// As an iterator: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// for c in 'ß'.to_titlecase() { | ||
| /// print!("{c}"); | ||
| /// } | ||
| /// println!(); | ||
| /// ``` | ||
| /// | ||
| /// Using `println!` directly: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// println!("{}", 'ß'.to_titlecase()); | ||
| /// ``` | ||
| /// | ||
| /// Both are equivalent to: | ||
| /// | ||
| /// ``` | ||
| /// println!("Ss"); | ||
| /// ``` | ||
| /// | ||
| /// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string): | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// assert_eq!('c'.to_titlecase().to_string(), "C"); | ||
| /// assert_eq!('dž'.to_titlecase().to_string(), "Dž"); | ||
| /// assert_eq!('ῼ'.to_titlecase().to_string(), "ῼ"); | ||
| /// | ||
| /// // Sometimes the result is more than one character: | ||
| /// assert_eq!('ß'.to_titlecase().to_string(), "Ss"); | ||
| /// | ||
| /// // Characters that do not have separate cased forms | ||
| /// // convert into themselves. | ||
| /// assert_eq!('山'.to_titlecase().to_string(), "山"); | ||
| /// ``` | ||
| /// | ||
| /// # Note on locale | ||
| /// | ||
| /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: | ||
| /// | ||
| /// * 'Dotless': I / ı, sometimes written ï | ||
| /// * 'Dotted': İ / i | ||
| /// | ||
| /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// let upper_i = 'i'.to_titlecase().to_string(); | ||
| /// ``` | ||
| /// | ||
| /// The value of `upper_i` here relies on the language of the text: if we're | ||
| /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should | ||
| /// be `"İ"`. `to_titlecase()` does not take this into account, and so: | ||
| /// | ||
| /// ``` | ||
| /// #![feature(titlecase)] | ||
| /// let upper_i = 'i'.to_titlecase().to_string(); | ||
| /// | ||
| /// assert_eq!(upper_i, "I"); | ||
| /// ``` | ||
| /// | ||
| /// holds across languages. | ||
| #[must_use = "this returns the titlecased character as a new iterator, \ | ||
| without modifying the original"] | ||
| #[unstable(feature = "titlecase", issue = "153892")] | ||
| #[inline] | ||
| pub fn to_titlecase(self) -> ToTitlecase { | ||
| ToTitlecase(CaseMappingIter::new(conversions::to_title(self))) | ||
| } | ||
|
|
||
| /// Returns an iterator that yields the uppercase mapping of this `char` as one or more | ||
| /// `char`s. | ||
| /// | ||
| /// Prefer this method when converting a word into ALL CAPS, but consider [`Self::to_titlecase`] | ||
| /// instead if you seek to capitalize Only The First Letter. | ||
| /// | ||
| /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. | ||
| /// | ||
| /// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character | ||
|
|
@@ -1170,9 +1363,11 @@ impl char { | |
| /// | ||
| /// ``` | ||
| /// assert_eq!('c'.to_uppercase().to_string(), "C"); | ||
| /// assert_eq!('dž'.to_uppercase().to_string(), "DŽ"); | ||
| /// | ||
| /// // Sometimes the result is more than one character: | ||
| /// assert_eq!('ſt'.to_uppercase().to_string(), "ST"); | ||
| /// assert_eq!('ῼ'.to_uppercase().to_string(), "ΩΙ"); | ||
| /// | ||
| /// // Characters that do not have both uppercase and lowercase | ||
| /// // convert into themselves. | ||
|
|
@@ -1181,7 +1376,7 @@ impl char { | |
| /// | ||
| /// # Note on locale | ||
| /// | ||
| /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: | ||
| /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: | ||
| /// | ||
| /// * 'Dotless': I / ı, sometimes written ï | ||
| /// * 'Dotted': İ / i | ||
|
|
@@ -1193,7 +1388,7 @@ impl char { | |
| /// ``` | ||
| /// | ||
| /// The value of `upper_i` here relies on the language of the text: if we're | ||
| /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should | ||
| /// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should | ||
| /// be `"İ"`. `to_uppercase()` does not take this into account, and so: | ||
| /// | ||
| /// ``` | ||
|
|
@@ -1203,7 +1398,7 @@ impl char { | |
| /// ``` | ||
| /// | ||
| /// holds across languages. | ||
| #[must_use = "this returns the uppercase character as a new iterator, \ | ||
| #[must_use = "this returns the uppercased character as a new iterator, \ | ||
| without modifying the original"] | ||
| #[stable(feature = "rust1", since = "1.0.0")] | ||
| #[inline] | ||
|
|
@@ -1446,7 +1641,7 @@ impl char { | |
| #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] | ||
| #[inline] | ||
| pub const fn is_ascii_alphabetic(&self) -> bool { | ||
| matches!(*self, 'A'..='Z' | 'a'..='z') | ||
| matches!(*self, 'a'..='z' | 'A'..='Z') | ||
| } | ||
|
|
||
| /// Checks if the value is an ASCII uppercase character: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is from "Extend ASCII fast paths of `char` methods beyond ASCII"...
Can you confirm we continue to have exhaustive coverage in tests? I'm not sure how much the tests generated by the test generator call the public methods vs. check that `unicode::Alphabetic` (for example) is accurate.
Also, I imagine that flipping the order of capital A-Z vs. lowercase a-z might influence codegen, and lowercase seems more likely to be common. Maybe worth doing something different there?
How did you decide on the particular threshold here (and in other modified functions)? Maybe we can split this out to a separate PR?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point, I fixed it.
I chose the highest value that would work, using https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp to verify. E.g., for this function, https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AAlphabetic%3A%5D-%5B%3AASCII%3A%5D&abb=on says that the first non-ASCII alphabetic character is U+AA.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
They didn't call the public methods, no. But they should now.