From c03a7d724293731e8ea991bd8c72ad60bf8890c7 Mon Sep 17 00:00:00 2001 From: Etgar Perets Date: Wed, 9 Jul 2025 15:20:16 +0300 Subject: [PATCH 1/3] Added Unicode support for unquoted identifiers in MySQL and PostgreSQL; also added a test for that --- src/dialect/mysql.rs | 2 +- src/dialect/postgresql.rs | 2 +- src/dialect/redshift.rs | 2 +- tests/sqlparser_common.rs | 7 +++++++ 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index f69e42436..70cf10c6c 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -51,7 +51,7 @@ impl Dialect for MySqlDialect { } fn is_identifier_part(&self, ch: char) -> bool { - self.is_identifier_start(ch) || ch.is_ascii_digit() + self.is_identifier_start(ch) || ch.is_ascii_digit() || !ch.is_ascii() } fn is_delimited_identifier_start(&self, ch: char) -> bool { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index b2d4014cb..a7f50ce1f 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -72,7 +72,7 @@ impl Dialect for PostgreSqlDialect { } fn is_identifier_part(&self, ch: char) -> bool { - ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' + ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' || !ch.is_ascii() } fn supports_unicode_string_literal(&self) -> bool { diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 8ffed98af..ae237145f 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -88,7 +88,7 @@ impl Dialect for RedshiftSqlDialect { fn is_identifier_part(&self, ch: char) -> bool { // Extends Postgres dialect with sharp and UTF-8 multibyte chars // https://docs.aws.amazon.com/redshift/latest/dg/r_names.html - PostgreSqlDialect {}.is_identifier_part(ch) || ch == '#' || !ch.is_ascii() + PostgreSqlDialect {}.is_identifier_part(ch) || ch == '#' } /// redshift has `CONVERT(type, value)` instead of `CONVERT(value, type)` diff --git a/tests/sqlparser_common.rs 
b/tests/sqlparser_common.rs index 15144479c..559228bb0 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -16133,3 +16133,10 @@ SELECT * FROM tbl2 assert_eq!(stmts.len(), 2); assert!(stmts.iter().all(|s| matches!(s, Statement::Query { .. }))); } + #[test] + fn test_unicode_support() { + let unicode_sql = r#"SELECT phoneǤЖשचᎯ⻩☯♜🦄⚛🀄ᚠ⌛🌀 tbl FROM customers"#; + let dialects_supporting_unicode = TestedDialects::new(vec![Box::new(MySqlDialect {}), Box::new(RedshiftSqlDialect {}), Box::new(PostgreSqlDialect {})]); + let _ = dialects_supporting_unicode.parse_sql_statements(unicode_sql).unwrap(); + } + From a78376632ef043e20e185014c5c06f3d858711e2 Mon Sep 17 00:00:00 2001 From: Etgar Perets Date: Sun, 13 Jul 2025 14:21:05 +0300 Subject: [PATCH 2/3] SGA-11409 Applied changes requested in PR review and fixed conflicts and code style issues --- src/dialect/mysql.rs | 4 +++- src/dialect/postgresql.rs | 4 +++- src/dialect/redshift.rs | 2 +- tests/sqlparser_common.rs | 16 ++++++++++------ 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs index 70cf10c6c..b50c8df50 100644 --- a/src/dialect/mysql.rs +++ b/src/dialect/mysql.rs @@ -51,7 +51,9 @@ impl Dialect for MySqlDialect { } fn is_identifier_part(&self, ch: char) -> bool { - self.is_identifier_start(ch) || ch.is_ascii_digit() || !ch.is_ascii() + self.is_identifier_start(ch) || ch.is_ascii_digit() || + // MySQL supports Unicode characters in identifiers. 
+ !ch.is_ascii() } fn is_delimited_identifier_start(&self, ch: char) -> bool { diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index a7f50ce1f..c1f025574 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -72,7 +72,9 @@ impl Dialect for PostgreSqlDialect { } fn is_identifier_part(&self, ch: char) -> bool { - ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' || !ch.is_ascii() + ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' || + // PostgreSQL supports Unicode characters in identifiers. + !ch.is_ascii() } fn supports_unicode_string_literal(&self) -> bool { diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index ae237145f..3cbdb57c4 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -88,7 +88,7 @@ impl Dialect for RedshiftSqlDialect { fn is_identifier_part(&self, ch: char) -> bool { // Extends Postgres dialect with sharp and UTF-8 multibyte chars // https://docs.aws.amazon.com/redshift/latest/dg/r_names.html - PostgreSqlDialect {}.is_identifier_part(ch) || ch == '#' + PostgreSqlDialect {}.is_identifier_part(ch) || ch == '#' } /// redshift has `CONVERT(type, value)` instead of `CONVERT(value, type)` diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index 559228bb0..777615526 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -16133,10 +16133,14 @@ SELECT * FROM tbl2 assert_eq!(stmts.len(), 2); assert!(stmts.iter().all(|s| matches!(s, Statement::Query { .. 
}))); } - #[test] - fn test_unicode_support() { - let unicode_sql = r#"SELECT phoneǤЖשचᎯ⻩☯♜🦄⚛🀄ᚠ⌛🌀 tbl FROM customers"#; - let dialects_supporting_unicode = TestedDialects::new(vec![Box::new(MySqlDialect {}), Box::new(RedshiftSqlDialect {}), Box::new(PostgreSqlDialect {})]); - let _ = dialects_supporting_unicode.parse_sql_statements(unicode_sql).unwrap(); - } +#[test] +fn test_identifier_unicode_support() { + let sql = r#"SELECT phoneǤЖשचᎯ⻩☯♜🦄⚛🀄ᚠ⌛🌀 AS tbl FROM customers"#; + let dialects = TestedDialects::new(vec![ + Box::new(MySqlDialect {}), + Box::new(RedshiftSqlDialect {}), + Box::new(PostgreSqlDialect {}), + ]); + let _ = dialects.verified_stmt(sql); +} From d22578e0f8bb6b2ec7f9e9eedcec5ff38addd461 Mon Sep 17 00:00:00 2001 From: Etgar Perets Date: Sun, 13 Jul 2025 14:49:48 +0300 Subject: [PATCH 3/3] SGA-11409 Fixed comment in light of changes --- src/dialect/redshift.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 3cbdb57c4..c910e4c77 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -86,7 +86,7 @@ impl Dialect for RedshiftSqlDialect { } fn is_identifier_part(&self, ch: char) -> bool { - // Extends Postgres dialect with sharp and UTF-8 multibyte chars + // Adds `#` to the PostgreSQL rules, which already accept UTF-8 multibyte characters. // https://docs.aws.amazon.com/redshift/latest/dg/r_names.html PostgreSqlDialect {}.is_identifier_part(ch) || ch == '#' }