From e8a5c97ad0c66c049dcfeffe4d9186e653d34640 Mon Sep 17 00:00:00 2001 From: Bertrand Gorge Date: Tue, 25 Feb 2020 12:00:54 +0100 Subject: [PATCH] Added support for utf8mb4 (emoji support) Fixes issue #791 --- qa-config-example.php | 9 +++++++++ qa-include/db/install.php | 12 ++++++++++-- qa-include/db/selects.php | 14 ++++++++++++-- qa-include/qa-db.php | 7 ++++++- qa-include/util/string.php | 3 +++ 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/qa-config-example.php b/qa-config-example.php index 707963069..6d5dd40a6 100644 --- a/qa-config-example.php +++ b/qa-config-example.php @@ -63,6 +63,15 @@ define('QA_MYSQL_TABLE_PREFIX', 'qa_'); +/* + QA_USE_UTF8MB4 makes Q2A use utf8mb4 instead of utf8 - this mainly adds compatibility with emojis. + Note that if you enable this setting after your database has been created, you need to change the + charset of all the tables in your database (using an export and reimport). See this discussion for + more info: https://www.question2answer.org/qa/62412/unicode-10-characters-filtered-out +*/ + + define('QA_USE_UTF8MB4', false); // must be a boolean - the string 'false' is truthy and would enable utf8mb4 + /* If you wish, you can define QA_MYSQL_USERS_PREFIX separately from QA_MYSQL_TABLE_PREFIX. If so, tables containing information about user accounts (not including users' activity and points) diff --git a/qa-include/db/install.php b/qa-include/db/install.php index f7bd7d81b..3312515ef 100644 --- a/qa-include/db/install.php +++ b/qa-include/db/install.php @@ -67,6 +67,11 @@ function qa_db_table_definitions() require_once QA_INCLUDE_DIR . 'db/maxima.php'; require_once QA_INCLUDE_DIR . 
'app/users.php'; + if (defined('QA_USE_UTF8MB4') && QA_USE_UTF8MB4) + $charset = 'utf8mb4'; + else + $charset = 'utf8'; + /* Important note on character encoding in database and PHP connection to MySQL @@ -107,7 +112,7 @@ function qa_db_table_definitions() 'avatarheight' => 'SMALLINT UNSIGNED', // pixel height of stored avatar 'passsalt' => 'BINARY(16)', // salt used to calculate passcheck - null if no password set for direct login 'passcheck' => 'BINARY(20)', // checksum from password and passsalt - null if no password set for direct login - 'passhash' => 'VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL', // password_hash + 'passhash' => 'VARCHAR(255) CHARACTER SET ' . $charset . ' COLLATE ' . $charset . '_bin DEFAULT NULL', // password_hash (collation must belong to the column's character set) 'level' => 'TINYINT UNSIGNED NOT NULL', // basic, editor, admin, etc... 'loggedin' => 'DATETIME NOT NULL', // time of last login 'loginip' => 'VARBINARY(16) NOT NULL', // INET6_ATON of IP address of last login @@ -714,7 +719,10 @@ function qa_db_create_table_sql($rawname, $definition) if (isset($coldef)) $querycols .= (strlen($querycols) ? ', ' : '') . (is_int($colname) ? $coldef : ($colname . ' ' . $coldef)); - return 'CREATE TABLE ^' . $rawname . ' (' . $querycols . ') ENGINE=InnoDB CHARSET=utf8'; + if (defined('QA_USE_UTF8MB4') && QA_USE_UTF8MB4) + return 'CREATE TABLE ^' . $rawname . ' (' . $querycols . ') ENGINE=InnoDB CHARSET=utf8mb4'; + else + return 'CREATE TABLE ^' . $rawname . ' (' . $querycols . 
') ENGINE=InnoDB CHARSET=utf8'; } diff --git a/qa-include/db/selects.php b/qa-include/db/selects.php index 030fb2706..03f7225ee 100644 --- a/qa-include/db/selects.php +++ b/qa-include/db/selects.php @@ -1216,8 +1216,13 @@ function qa_db_tag_recent_qs_selectspec($voteuserid, $tag, $start, $full = false $selectspec = qa_db_posts_basic_selectspec($voteuserid, $full); + if (defined('QA_USE_UTF8MB4') && QA_USE_UTF8MB4) + $collation = 'utf8mb4_bin'; + else + $collation = 'utf8_bin'; + // use two tests here - one which can use the index, and the other which narrows it down exactly - then limit to 1 just in case - $selectspec['source'] .= " JOIN (SELECT postid FROM ^posttags WHERE wordid=(SELECT wordid FROM ^words WHERE word=$ AND word=$ COLLATE utf8_bin LIMIT 1) ORDER BY postcreated DESC LIMIT #,#) y ON ^posts.postid=y.postid"; + $selectspec['source'] .= " JOIN (SELECT postid FROM ^posttags WHERE wordid=(SELECT wordid FROM ^words WHERE word=$ AND word=$ COLLATE $collation LIMIT 1) ORDER BY postcreated DESC LIMIT #,#) y ON ^posts.postid=y.postid"; array_push($selectspec['arguments'], $tag, qa_strtolower($tag), $start, $count); $selectspec['sortdesc'] = 'created'; @@ -1232,9 +1237,14 @@ function qa_db_tag_recent_qs_selectspec($voteuserid, $tag, $start, $full = false */ function qa_db_tag_word_selectspec($tag) { + if (defined('QA_USE_UTF8MB4') && QA_USE_UTF8MB4) + $collation = 'utf8mb4_bin'; + else + $collation = 'utf8_bin'; + return array( 'columns' => array('wordid', 'word', 'tagcount'), - 'source' => '^words WHERE word=$ AND word=$ COLLATE utf8_bin', + 'source' => '^words WHERE word=$ AND word=$ COLLATE ' . 
$collation, 'arguments' => array($tag, qa_strtolower($tag)), 'single' => true, ); diff --git a/qa-include/qa-db.php b/qa-include/qa-db.php index d32576123..be3a29ce4 100644 --- a/qa-include/qa-db.php +++ b/qa-include/qa-db.php @@ -91,7 +91,12 @@ function qa_db_connect($failhandler = null) // From Q2A 1.5, we explicitly set the character encoding of the MySQL connection, instead of using lots of "SELECT BINARY col"-style queries. // Testing showed that overhead is minimal, so this seems worth trading off against the benefit of more straightforward queries, especially // for plugin developers. - if (!$db->set_charset('utf8')) + if (defined('QA_USE_UTF8MB4') && QA_USE_UTF8MB4) + $charset = 'utf8mb4'; + else + $charset = 'utf8'; + + if (!$db->set_charset($charset)) qa_db_fail_error('set_charset', $db->errno, $db->error); qa_report_process_stage('db_connected'); diff --git a/qa-include/util/string.php b/qa-include/util/string.php index a0719a21d..95487612c 100644 --- a/qa-include/util/string.php +++ b/qa-include/util/string.php @@ -601,6 +601,9 @@ function qa_shorten_string_line($string, $length, $ellipsis = ' ... ') */ function qa_remove_utf8mb4($string) { + if (defined('QA_USE_UTF8MB4') && QA_USE_UTF8MB4) + return $string; + return preg_replace('%(?: \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15