diff --git a/.gitmodules b/.gitmodules index 7a2c5600e65b..ea108b6dbe9e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -221,3 +221,6 @@ [submodule "contrib/NuRaft"] path = contrib/NuRaft url = https://github.com/ClickHouse-Extras/NuRaft.git +[submodule "contrib/LucenePlusPlus"] + path = contrib/LucenePlusPlus + url = https://github.com/cloudnativecube/LucenePlusPlus.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9002f1df140c..03975dc7ee27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -490,6 +490,7 @@ include (cmake/find/rapidjson.cmake) include (cmake/find/fastops.cmake) include (cmake/find/odbc.cmake) include (cmake/find/rocksdb.cmake) +include (cmake/find/luceneplusplus.cmake) include (cmake/find/libpqxx.cmake) include (cmake/find/nuraft.cmake) diff --git a/README.md b/README.md index ce2615f7c1c9..3329a98877ff 100644 --- a/README.md +++ b/README.md @@ -13,68 +13,3 @@ ClickHouse® is an open-source column-oriented database management system that a * [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation. * [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any. * You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person. - - -## Neoway Research - -This branch is part of a research where we implemented a proof of concept for full text search using [ClickHouse](https://github.com/ClickHouse/ClickHouse) and [Tantivy](https://github.com/tantivy-search/tantivy). - -Tantivy is a full text search engine library written in Rust. - -The implementation consists in creating the tantivy storage engine and tantivy SQL function. -Because this is just a test, we decided to hard code this three column names in the code so that we don't have to create all the logic behind dynamic column names with different types. It is hard-coded for columns `primary_id`, `secondary_id` and `body`. Then we can create the table using the query - -```sql -CREATE TABLE fulltext_table -( - primary_id UInt64, - secondary_id UInt64, - body String -) -ENGINE = Tantivy('/var/lib/clickhouse/tantivy/fulltext_table') --- Tantivy engine takes as parameter a path to save the data. -``` - -For the [Storage Engine](https://github.com/NeowayLabs/ClickHouse/blob/fulltext-21.3/src/Storages/StorageTantivy.cpp) it has to be able to receive data from the INSERT query and index into tantivy. For the SELECT queries we need to push the full text WHERE clause to tantivy and create a Clickhouse column with the result. - -Because the full text search query needs to be sent to tantivy we created an SQL function named tantivy, so the syntax for making queries is the following -```sql -SELECT primary_id -FROM fulltext_table -WHERE tantivy('full text query here') -``` -The `tantivy` SQL function doesn't return anything and has no logic inside. Its only purpose is to validade the input and generate the `ASTSelectQuery`. -Inside the storage engine we take the AST parameters and push the query to the Rust implementation inside the folder [contrib/tantivysearch](https://github.com/NeowayLabs/ClickHouse/tree/fulltext-21.3/contrib/tantivysearch). - -When data is indexed in tantivy it needs to be commited. That's an expensive job to do every insert so we decided to call it when optimize table is called -```sql -OPTIMIZE TABLE fulltext_table FINAL -``` -After the optimization the data is available for queries. - -## Results -We inserted 39 million texts with an average of 4895 characters, also all the texts were unique. Our testing machine is a n2d-standard-16, 16 CPU, 62.8G Mem, 2 Local SSD 375 GB in RAID 0, on Google Cloud. - -In our implementation we were not interested in retrieving the actual text from the search result. That means we chose to return only the ID columns and don't return the text. It would be easy to return the text, but for our use case we just want to have statistics on the data. An example would be to answer how many rows match with the phrase 'covid 19' ? The result for that is a query that runs at the same speed tantivy would run with a little increment of time to copy the result to a Clickhouse column. For the majority of searches we could get the result in milliseconds. Queries using OR operator and matching almost all the texts were slower and could time more than 1 second. - -Another use case is that we have a table with dozens of columns that is related to our fulltext_table by an ID. So we would have a query like this -```sql -SELECT * -FROM a_very_big_table -WHERE - -- many_filters_here - AND primary_id IN ( - SELECT primary_id - FROM fulltext_table - WHERE tantivy('full text query here') - ) -``` -Also we wanted to do many different queries, all with the same text filter, and running in parallel. Instead of doing the same query on tantivy at the same time, with the same result, we implemented a concurrent bounded cache mechanism that we can set a TTL and perform a single computation for multiple parallel queries on the same input resolving the same result to all once done. We noticed that the speed of those queries were fast making this solution very promising. - - -## Alternatives -Other alternatives to this is to use [data skipping indexes](https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/#table_engine-mergetree-data_skipping-indexes) or implement something akin to an [inverted index](https://hannes.muehleisen.org/SIGIR2014-column-stores-ir-prototyping.pdf) on SQL directly. - -Data skipping indexes requires a lot of parameter tuning and it is very tricky to make they work with the SQL functions. Even with all that tuning we got very poor performance. - -Inverted index is an interesting solution, but it is very complex to implement and requires an external tokenizer and big complicated queries to search the data. The performance is better than data skipping indexes but still too slow for a real scenario. diff --git a/cmake/find/luceneplusplus.cmake b/cmake/find/luceneplusplus.cmake new file mode 100644 index 000000000000..c4e3da27f03a --- /dev/null +++ b/cmake/find/luceneplusplus.cmake @@ -0,0 +1,49 @@ +option(ENABLE_LUCENE "Enable LUCENE" ${ENABLE_LIBRARIES}) + +if (NOT ENABLE_LUCENE) + if (USE_INTERNAL_LUCENE_LIBRARY) + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use internal lucene library with ENABLE_LUCENE=OFF") + endif() + return() +endif() + +option(USE_INTERNAL_LUCENE_LIBRARY "Set to FALSE to use system LUCENE library instead of bundled" ${NOT_UNBUNDLED}) + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/LucenePlusPlus/CMakeLists.txt") + if (USE_INTERNAL_LUCENE_LIBRARY) + message (WARNING "submodule contrib is missing. to fix try run: \n git submodule update --init --recursive") + message(${RECONFIGURE_MESSAGE_LEVEL} "cannot find internal lucene") + endif() + set (MISSING_INTERNAL_LUCENE 1) +endif () + +if (NOT USE_INTERNAL_LUCENE_LIBRARY) + find_library (LUCENE_LIBRARY lucene++) + find_path (LUCENE_INCLUDE_DIR NAMES lucene++/LuceneHeaders.h PATHS ${LUCENE_INCLUDE_PATHS}) + if (NOT LUCENE_LIBRARY OR NOT LUCENE_INCLUDE_DIR) + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system lucene library") + endif() + + if (NOT ZLIB_LIBRARY) + include(cmake/find/zlib.cmake) + endif() + + if(ZLIB_LIBRARY) + list (APPEND LUCENE_LIBRARY ${ZLIB_LIBRARY}) + else() + message (${RECONFIGURE_MESSAGE_LEVEL} + "Can't find system lucene: zlib=${ZLIB_LIBRARY} ;") + endif() +endif () + +if(LUCENE_LIBRARY AND LUCENE_INCLUDE_DIR) + set(USE_LUCENE 1) +elseif (NOT MISSING_INTERNAL_LUCENE) + set (USE_INTERNAL_LUCENE_LIBRARY 1) + + set (LUCENE_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/LucenePlusPlus/include") + set (LUCENE_LIBRARY "lucene++") + set (USE_LUCENE 1) +endif () + +message (STATUS "Using LUCENE=${USE_LUCENE}: ${LUCENE_INCLUDE_DIR} : ${LUCENE_LIBRARY}") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index cfe5a6aed57b..cab963c4d32c 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -309,7 +309,7 @@ if (USE_INTERNAL_ROCKSDB_LIBRARY) add_subdirectory(rocksdb-cmake) endif() -add_subdirectory(tantivysearch-cmake) +add_subdirectory(LucenePlusPlus) if (USE_LIBPQXX) add_subdirectory (libpq-cmake) diff --git a/contrib/LucenePlusPlus b/contrib/LucenePlusPlus new file mode 160000 index 000000000000..460945ca3a32 --- /dev/null +++ b/contrib/LucenePlusPlus @@ -0,0 +1 @@ +Subproject commit 460945ca3a32b51a6fa9314834e976a573c4a44e diff --git a/contrib/boost b/contrib/boost index ee24fa55bc46..eede626248b3 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit ee24fa55bc46e4d2ce7d0d052cc5a0d9b1be8c36 +Subproject commit eede626248b3710fe4f5f9c03b3f479a2da0af41 diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index b9298f59f2b2..dc375e456d49 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -13,6 +13,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) regex context coroutine + date_time + thread ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND @@ -32,6 +34,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_system INTERFACE) add_library (_boost_context INTERFACE) add_library (_boost_coroutine INTERFACE) + add_library (_boost_date_time INTERFACE) + add_library (_boost_thread INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) @@ -40,6 +44,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY}) target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY}) + target_link_libraries (_boost_date_time INTERFACE ${Boost_DATE_TIME_LIBRARY}) + target_link_libraries (_boost_thread INTERFACE ${Boost_THREAD_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) @@ -48,6 +54,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (boost::system ALIAS _boost_system) add_library (boost::context ALIAS _boost_context) add_library (boost::coroutine ALIAS _boost_coroutine) + add_library (boost::date_time ALIAS _boost_date_time) + add_library (boost::thread ALIAS _boost_thread) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -220,4 +228,33 @@ if (NOT EXTERNAL_BOOST_FOUND) add_library (boost::coroutine ALIAS _boost_coroutine) target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR}) target_link_libraries(_boost_coroutine PRIVATE _boost_context) + + # date_time + + set (SRCS_DATE_TIME + ${LIBRARY_DIR}/libs/date_time/src/gregorian/date_generators.cpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/greg_month.cpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/greg_names.hpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/greg_weekday.cpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/gregorian_types.cpp + ${LIBRARY_DIR}/libs/date_time/src/posix_time/posix_time_types.cpp + ) + add_library (_boost_date_time ${SRCS_DATE_TIME}) + add_library (boost::date_time ALIAS _boost_date_time) + target_include_directories (_boost_date_time PRIVATE ${LIBRARY_DIR}) + target_link_libraries(_boost_date_time PRIVATE _boost_context) + + # thread + + set (SRCS_THREAD + ${LIBRARY_DIR}/libs/thread/src/pthread/once.cpp + ${LIBRARY_DIR}/libs/thread/src/pthread/once_atomic.cpp + ${LIBRARY_DIR}/libs/thread/src/pthread/thread.cpp + ${LIBRARY_DIR}/libs/thread/src/future.cpp + ${LIBRARY_DIR}/libs/thread/src/tss_null.cpp + ) + add_library (_boost_thread ${SRCS_THREAD}) + add_library (boost::thread ALIAS _boost_thread) + target_include_directories (_boost_thread PRIVATE ${LIBRARY_DIR}) + target_link_libraries(_boost_thread PRIVATE _boost_context _boost_date_time) endif () diff --git a/contrib/tantivysearch-cmake/CMakeLists.txt b/contrib/tantivysearch-cmake/CMakeLists.txt deleted file mode 100644 index fb6608448103..000000000000 --- a/contrib/tantivysearch-cmake/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -set(TANTIVYSEARCH_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/tantivysearch) - -find_library (LIBRARY_TANTIVYSEARCH NAMES libtantivysearch.a tantivisearch PATHS ${TANTIVYSEARCH_SOURCE_DIR}/target/release REQUIRED) -find_path (INCLUDE_TANTIVYSEARCH NAMES tantivysearch.h PATHS ${TANTIVYSEARCH_SOURCE_DIR}/include) - -if (LIBRARY_TANTIVYSEARCH AND INCLUDE_TANTIVYSEARCH) - set(CMAKE_REQUIRED_LIBRARIES ${LIBRARY_TANTIVYSEARCH}) - set(CMAKE_REQUIRED_INCLUDES ${INCLUDE_TANTIVYSEARCH}) - add_library (tantivysearch INTERFACE) - set_property (TARGET tantivysearch PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_TANTIVYSEARCH}) - set_property (TARGET tantivysearch PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_TANTIVYSEARCH}) - message (STATUS "Using tantivysearch: ${INCLUDE_TANTIVYSEARCH} : ${LIBRARY_TANTIVYSEARCH}") -else() - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find tantivysearch") -endif() diff --git a/contrib/tantivysearch/.gitignore b/contrib/tantivysearch/.gitignore deleted file mode 100644 index ea8c4bf7f35f..000000000000 --- a/contrib/tantivysearch/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target diff --git a/contrib/tantivysearch/Cargo.lock b/contrib/tantivysearch/Cargo.lock deleted file mode 100644 index 05c2359fcad6..000000000000 --- a/contrib/tantivysearch/Cargo.lock +++ /dev/null @@ -1,1369 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -[[package]] -name = "addr2line" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a55f82cfe485775d02112886f4169bde0c5894d75e79ead7eafe7e40a25e45f7" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" - -[[package]] -name = "ahash" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" - -[[package]] -name = "atomicwrites" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a2baf2feb820299c53c7ad1cc4f5914a220a1cb76d7ce321d2522a94b54651f" -dependencies = [ - "nix", - "tempdir", - "winapi 0.3.9", -] - -[[package]] -name = "autocfg" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" - -[[package]] -name = "backtrace" -version = "0.3.55" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef5140344c85b01f9bbb4d4b7288a8aa4b3287ccef913a14bcc78a1063623598" -dependencies = [ - "addr2line", - "cfg-if 1.0.0", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - -[[package]] -name = "bitflags" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" - -[[package]] -name = "bitpacking" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68" -dependencies = [ - "crunchy", -] - -[[package]] -name = "byteorder" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" - -[[package]] -name = "cc" -version = "1.0.66" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" - -[[package]] -name = "census" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5927edd8345aef08578bcbb4aea7314f340d80c7f4931f99fbeb40b99d8f5060" - -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "chrono" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" -dependencies = [ - "libc", - "num-integer", - "num-traits", - "time", - "winapi 0.3.9", -] - -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags", -] - -[[package]] -name = "combine" -version = "4.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc4369b5e4c0cddf64ad8981c0111e7df4f7078f4d6ba98fb31f2e17c4c57b7e" -dependencies = [ - "memchr", -] - -[[package]] -name = "const_fn" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6" - -[[package]] -name = "crc32fast" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "crossbeam" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69323bff1fb41c635347b8ead484a5ca6c3f11914d784170b158d8449ab07f8e" -dependencies = [ - "cfg-if 0.1.10", - "crossbeam-channel 0.4.4", - "crossbeam-deque 0.7.3", - "crossbeam-epoch 0.8.2", - "crossbeam-queue", - "crossbeam-utils 0.7.2", -] - -[[package]] -name = "crossbeam-channel" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b153fe7cbef478c567df0f972e02e6d736db11affe43dfc9c56a9374d1adfb87" -dependencies = [ - "crossbeam-utils 0.7.2", - "maybe-uninit", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.1", -] - -[[package]] -name = "crossbeam-deque" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" -dependencies = [ - "crossbeam-epoch 0.8.2", - "crossbeam-utils 0.7.2", - "maybe-uninit", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-epoch 0.9.1", - "crossbeam-utils 0.8.1", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", - "crossbeam-utils 0.7.2", - "lazy_static", - "maybe-uninit", - "memoffset 0.5.6", - "scopeguard", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d" -dependencies = [ - "cfg-if 1.0.0", - "const_fn", - "crossbeam-utils 0.8.1", - "lazy_static", - "memoffset 0.6.1", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570" -dependencies = [ - "cfg-if 0.1.10", - "crossbeam-utils 0.7.2", - "maybe-uninit", -] - -[[package]] -name = "crossbeam-utils" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", - "lazy_static", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d" -dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "lazy_static", -] - -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - -[[package]] -name = "downcast-rs" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea835d29036a4087793836fa931b08837ad5e957da9e23886b29586fb9b6650" - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "fail" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3c61c59fdc91f5dbc3ea31ee8623122ce80057058be560654c5d410d181a6" -dependencies = [ - "lazy_static", - "log", - "rand 0.7.3", -] - -[[package]] -name = "failure" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" -dependencies = [ - "backtrace", - "failure_derive", -] - -[[package]] -name = "failure_derive" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "filetime" -version = "0.2.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c122a393ea57648015bf06fbd3d372378992e86b9ff5a7a497b076a28c79efe" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "redox_syscall", - "winapi 0.3.9", -] - -[[package]] -name = "flurry" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c0a35f7b50e99185a2825541946252f669f3c3ca77801357cd682a1b356bb3e" -dependencies = [ - "ahash", - "crossbeam-epoch 0.8.2", - "num_cpus", - "parking_lot", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "fsevent" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab7d1bd1bd33cc98b0889831b72da23c0aa4df9cec7e0702f46ecea04b35db6" -dependencies = [ - "bitflags", - "fsevent-sys", -] - -[[package]] -name = "fsevent-sys" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" -dependencies = [ - "libc", -] - -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - -[[package]] -name = "fuchsia-zircon" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" -dependencies = [ - "bitflags", - "fuchsia-zircon-sys", -] - -[[package]] -name = "fuchsia-zircon-sys" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" - -[[package]] -name = "futures" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c70be434c505aee38639abccb918163b63158a4b4bb791b45b7023044bdc3c9c" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f01c61843314e95f96cc9245702248733a3a3d744e43e2e755e3c7af8348a0a9" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db8d3b0917ff63a2a96173133c02818fac4a746b0a57569d3baca9ec0e945e08" - -[[package]] -name = "futures-executor" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ee9ca2f7eb4475772cf39dd1cd06208dce2670ad38f4d9c7262b3e15f127068" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", - "num_cpus", -] - -[[package]] -name = "futures-io" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e37c1a51b037b80922864b8eed90692c5cd8abd4c71ce49b77146caa47f3253b" - -[[package]] -name = "futures-macro" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f8719ca0e1f3c5e34f3efe4570ef2c0610ca6da85ae7990d472e9cbfba13664" -dependencies = [ - "proc-macro-hack", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6adabac1290109cfa089f79192fb6244ad2c3f1cc2281f3e1dd987592b71feb" - -[[package]] -name = "futures-task" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92a0843a2ff66823a8f7c77bffe9a09be2b64e533562c412d63075643ec0038" -dependencies = [ - "once_cell", -] - -[[package]] -name = "futures-util" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "036a2107cdeb57f6d7322f1b6c363dad67cd63ca3b7d1b925bdf75bd5d96cda9" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "proc-macro-hack", - "proc-macro-nested", - "slab", -] - -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4060f4657be78b8e766215b02b18a2e862d83745545de804638e2b545e81aee6" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.10.1+wasi-snapshot-preview1", -] - -[[package]] -name = "gimli" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce" - -[[package]] -name = "hermit-abi" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8" -dependencies = [ - "libc", -] - -[[package]] -name = "htmlescape" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" - -[[package]] -name = "inotify" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" -dependencies = [ - "bitflags", - "inotify-sys", - "libc", -] - -[[package]] -name = "inotify-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4563555856585ab3180a5bf0b2f9f8d301a728462afffc8195b3f5394229c55" -dependencies = [ - "libc", -] - -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - -[[package]] -name = "itoa" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" - -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - -[[package]] -name = "levenshtein_automata" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44db4199cdb049b494a92d105acbfa43c25b3925e33803923ba9580b7bc9e1a" - -[[package]] -name = "libc" -version = "0.2.82" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" - -[[package]] -name = "lock_api" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf3805d4480bb5b86070dcfeb9e2cb2ebc148adb753c5cca5f884d1d65a42b2" -dependencies = [ - "cfg-if 0.1.10", -] - -[[package]] -name = "maybe-uninit" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" - -[[package]] -name = "memchr" -version = "2.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" - -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "memoffset" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" -dependencies = [ - "autocfg", -] - -[[package]] -name = "memoffset" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" -dependencies = [ - "autocfg", -] - -[[package]] -name = "miniz_oxide" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "mio" -version = "0.6.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" -dependencies = [ - "cfg-if 0.1.10", - "fuchsia-zircon", - "fuchsia-zircon-sys", - "iovec", - "kernel32-sys", - "libc", - "log", - "miow", - "net2", - "slab", - "winapi 0.2.8", -] - -[[package]] -name = "mio-extras" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52403fe290012ce777c4626790c8951324a2b9e3316b3143779c72b029742f19" -dependencies = [ - "lazycell", - "log", - "mio", - "slab", -] - -[[package]] -name = "miow" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" -dependencies = [ - "kernel32-sys", - "net2", - "winapi 0.2.8", - "ws2_32-sys", -] - -[[package]] -name = "murmurhash32" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" -dependencies = [ - "byteorder", -] - -[[package]] -name = "net2" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "nix" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c722bee1037d430d0f8e687bbdbf222f27cc6e4e68d5caf630857bb2b6dbdce" -dependencies = [ - "bitflags", - "cc", - "cfg-if 0.1.10", - "libc", - "void", -] - -[[package]] -name = "notify" -version = "4.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80ae4a7688d1fab81c5bf19c64fc8db920be8d519ce6336ed4e7efe024724dbd" -dependencies = [ - "bitflags", - "filetime", - "fsevent", - "fsevent-sys", - "inotify", - "libc", - "mio", - "mio-extras", - "walkdir", - "winapi 0.3.9", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "object" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d3b63360ec3cb337817c2dbd47ab4a0f170d285d8e5a2064600f3def1402397" - -[[package]] -name = "once_cell" -version = "1.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" - -[[package]] -name = "owned-read" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66d1e235abcebc845cf93550b89b74f468c051496fafb433ede4104b9f71ba1" -dependencies = [ - "stable_deref_trait", -] - -[[package]] -name = "owning_ref" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff55baddef9e4ad00f88b6c743a2a8062d4c6ade126c2a528644b8e444d52ce" -dependencies = [ - "stable_deref_trait", -] - -[[package]] -name = "parking_lot" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3" -dependencies = [ - "cfg-if 0.1.10", - "cloudabi", - "libc", - "redox_syscall", - "smallvec", - "winapi 0.3.9", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "ppv-lite86" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" - -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eba180dafb9038b050a4c280019bbedf9f2467b61e5d892dcad585bb57aadc5a" - -[[package]] -name = "proc-macro2" -version = "1.0.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" -dependencies = [ - "unicode-xid", -] - -[[package]] -name = "quote" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" -dependencies = [ - "fuchsia-cprng", - "libc", - "rand_core 0.3.1", - "rdrand", - "winapi 0.3.9", -] - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha", - "rand_core 0.5.1", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", -] - -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rayon" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" -dependencies = [ - "autocfg", - "crossbeam-deque 0.8.0", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" -dependencies = [ - "crossbeam-channel 0.5.0", - "crossbeam-deque 0.8.0", - "crossbeam-utils 0.8.1", - "lazy_static", - "num_cpus", -] - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "redox_syscall" -version = "0.1.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" - -[[package]] -name = "regex" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" -dependencies = [ - "regex-syntax 0.6.22", -] - -[[package]] -name = "regex-syntax" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e931c58b93d86f080c734bfd2bce7dd0079ae2331235818133c8be7f422e20e" - -[[package]] -name = "regex-syntax" -version = "0.6.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e3bad0ee36814ca07d7968269dd4b7ec89ec2da10c4bb613928d3077083c232" - -[[package]] -name = "ryu" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "serde" -version = "1.0.118" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.118" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "slab" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" - -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - -[[package]] -name = "snap" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98d3306e84bf86710d6cd8b4c9c3b721d5454cc91a603180f8f8cd06cfd317b4" - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "syn" -version = "1.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" -dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", -] - -[[package]] -name = "synstructure" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "unicode-xid", -] - -[[package]] -name = "tantivy" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37dfd693ae55afd87e798e967bb2d894b32163e3c9a172958efe5bc89ed7df08" -dependencies = [ - "atomicwrites", - "base64", - "bitpacking", - "byteorder", - "census", - "chrono", - "crc32fast", - "crossbeam", - "downcast-rs", - "fail", - "failure", - "fnv", - "fs2", - "futures", - "htmlescape", - "levenshtein_automata", - "log", - "memmap", - "murmurhash32", - "notify", - "num_cpus", - "once_cell", - "owned-read", - "owning_ref", - "rayon", - "regex", - "rust-stemmers", - "serde", - "serde_json", - "smallvec", - "snap", - "stable_deref_trait", - "tantivy-fst", - "tantivy-query-grammar", - "tempfile", - "uuid", - "winapi 0.3.9", -] - -[[package]] -name = "tantivy-fst" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb20cdc0d83e9184560bdde9cd60142dbb4af2e0f770e88fce45770495224205" -dependencies = [ - "byteorder", - "regex-syntax 0.4.2", - "utf8-ranges", -] - -[[package]] -name = "tantivy-query-grammar" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ea03b8224ca9ff4ccfc7dfab790527c8a9d8edbc53f4677bdf6ba0fd8000c75" -dependencies = [ - "combine", -] - -[[package]] -name = "tantivysearch" -version = "0.1.0" -dependencies = [ - "flurry", - "libc", - "once_cell", - "rayon", - "tantivy", -] - -[[package]] -name = "tempdir" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8" -dependencies = [ - "rand 0.4.6", - "remove_dir_all", -] - -[[package]] -name = "tempfile" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "rand 0.7.3", - "redox_syscall", - "remove_dir_all", - "winapi 0.3.9", -] - -[[package]] -name = "time" -version = "0.1.43" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "unicode-xid" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" - -[[package]] -name = "utf8-ranges" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ae116fef2b7fea257ed6440d3cfcff7f190865f170cdad00bb6465bf18ecba" - -[[package]] -name = "uuid" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" -dependencies = [ - "getrandom 0.2.1", - "serde", -] - -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" - -[[package]] -name = "walkdir" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" -dependencies = [ - "same-file", - "winapi 0.3.9", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "wasi" -version = "0.10.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93c6c3420963c5c64bca373b25e77acb562081b9bb4dd5bb864187742186cea9" - -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "ws2_32-sys" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] diff --git a/contrib/tantivysearch/Cargo.toml b/contrib/tantivysearch/Cargo.toml deleted file mode 100644 index 7c6667a75181..000000000000 --- a/contrib/tantivysearch/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "tantivysearch" -version = "0.1.0" -authors = ["André Guedes "] -edition = "2018" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -name = "tantivysearch" -crate-type = ["cdylib", "staticlib"] - -[dependencies] -libc = "0.2.82" -tantivy = "0.13.2" -rayon = "1.5" -once_cell = "1.7" -flurry = "0.3" diff --git a/contrib/tantivysearch/cbindgen.toml b/contrib/tantivysearch/cbindgen.toml deleted file mode 100644 index a61dc8b17a13..000000000000 --- a/contrib/tantivysearch/cbindgen.toml +++ /dev/null @@ -1,20 +0,0 @@ -header = "// SPDX-License-Identifier: Apache-2.0" -sys_includes = ["stddef.h", "stdint.h", "stdlib.h"] -no_includes = true -include_guard = "TANTIVYSEARCH_H" -tab_width = 4 -style = "Type" -# language = "C" -cpp_compat = true - -[parse] -parse_deps = true -include = ['tantivysearch'] - -[export] -prefix = "TantivySearch" -item_types = ["enums", "structs", "unions", "typedefs", "opaque", "functions"] - -[enum] -rename_variants = "ScreamingSnakeCase" -prefix_with_name = true diff --git a/contrib/tantivysearch/include/tantivysearch.h b/contrib/tantivysearch/include/tantivysearch.h deleted file mode 100644 index e58c3a002993..000000000000 --- a/contrib/tantivysearch/include/tantivysearch.h +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#ifndef TANTIVYSEARCH_H -#define TANTIVYSEARCH_H - -#include -#include -#include - -struct TantivySearchIndexRW; - -struct TantivySearchIterWrapper; - -extern "C" { - -TantivySearchIndexRW *tantivysearch_open_or_create_index(const char *dir_ptr); - -TantivySearchIterWrapper *tantivysearch_search(TantivySearchIndexRW *irw, - const char *query_ptr, - uint64_t limit); - -TantivySearchIterWrapper *tantivysearch_ranked_search(TantivySearchIndexRW *irw, - const char *query_ptr, - uint64_t limit); - -unsigned char tantivysearch_index(TantivySearchIndexRW *irw, - const uint64_t *primary_ids, - const uint64_t *secondary_ids, - const char *chars, - const uint64_t *offsets, - size_t size); - -unsigned char tantivysearch_writer_commit(TantivySearchIndexRW *irw); - -unsigned char tantivysearch_index_truncate(TantivySearchIndexRW *irw); - -unsigned char tantivysearch_iter_next(TantivySearchIterWrapper *iter_ptr, - uint64_t *primary_id_ptr, - uint64_t *secondary_id_ptr); - -size_t tantivysearch_iter_batch(TantivySearchIterWrapper *iter_ptr, - uint64_t count, - uint64_t *primary_ids_ptr, - uint64_t *secondary_ids_ptr); - -size_t tantivysearch_iter_count(TantivySearchIterWrapper *iter_ptr); - -void tantivysearch_iter_free(TantivySearchIterWrapper *iter_ptr); - -void tantivysearch_index_free(TantivySearchIndexRW *irw); - -void tantivysearch_index_delete(TantivySearchIndexRW *irw); - -} // extern "C" - -#endif // TANTIVYSEARCH_H diff --git a/contrib/tantivysearch/src/cache.rs b/contrib/tantivysearch/src/cache.rs deleted file mode 100644 index 13850164962c..000000000000 --- a/contrib/tantivysearch/src/cache.rs +++ /dev/null @@ -1,127 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug, Formatter}; -use std::hash::{BuildHasher, Hash}; -use std::time::Instant; - -use once_cell::sync::OnceCell; -use flurry::{HashMap, DefaultHashBuilder}; - -pub struct ConcurrentCache { - size: usize, - seconds: u64, - items: HashMap), S> -} - -impl ConcurrentCache -where - K: 'static + Hash + Ord + Clone + Send + Sync, - V: 'static + Clone + Send + Sync -{ - /// Constructs a new `ConcurrentCache` with the default hashing algorithm and an - /// initial capacity of 0. - #[must_use] - pub fn new(size: usize, seconds: u64) -> Self { - Self::with_capacity(size, seconds, 0) - } - - /// Constructs a new `ConcurrentCache` with the default hashing algorithm and the - /// specified initial capacity. - #[must_use] - pub fn with_capacity(size: usize, seconds: u64, capacity: usize) -> Self { - Self::with_capacity_and_hasher(size, seconds, capacity, DefaultHashBuilder::default()) - } -} - - -impl ConcurrentCache -where - K: 'static + Hash + Ord + Clone + Send + Sync, - V: 'static + Clone + Send + Sync, - S: BuildHasher + Clone -{ - /// Constructs a new `ConcurrentCache` with the specified hasher and an initial - /// capacity of 0. - #[must_use] - pub fn with_hasher(size: usize, seconds: u64, hasher: S) -> Self { - Self::with_capacity_and_hasher(size, seconds, 0, hasher) - } - - /// Constructs a new `ConcurrentCache` with the specified hasher and initial - /// capacity. - #[must_use] - pub fn with_capacity_and_hasher(size: usize, seconds: u64, capacity: usize, hasher: S) -> Self { - Self { size, seconds, items: HashMap::with_capacity_and_hasher(capacity, hasher) } - } - - /// Returns `true` if the cache currently contains no items and `false` - /// otherwise. - #[must_use] - pub fn is_empty(&self) -> bool { - self.items.pin().is_empty() - } - - /// Returns the number of items currently in the cache. - #[must_use] - pub fn len(&self) -> usize { - self.items.pin().len() - } - - /// Empties the cache of all items. - pub fn clear(&self) { - self.items.pin().clear() - } - - /// Retrieves the value with the specified key, or initializes it if it is - /// not present. - /// - /// If the key is present but the value is not fully resolved, the current - /// thread will block until resolution completes. If the key is not present, - /// `init` is executed to produce a value. In either case, an immutable - /// reference to the value is returned. - /// - /// # Notes - /// The resolution closure, `init`, does not provide access to the key being - /// resolved. You may need to provide a copy of this value to the closure. - /// This is done to allow for maximum concurrency, as it permits the key - /// to be accessed by other threads during the resolution process. - pub fn resolve V>(&self, key: K, init: F) -> V { - let pinned = self.items.pin(); - - if let Some(val_ref) = pinned.get(&key) { - if val_ref.0.elapsed().as_secs() <= self.seconds { - let result_ref = val_ref.1.get_or_init(init); - let result = result_ref.clone(); - return result; - } - } - - match pinned.try_insert(key.clone(), (Instant::now(), OnceCell::new())) { - Ok(val_ref) => { - let result = val_ref.1.get_or_init(init).clone(); - if pinned.len() > self.size { - let mut count = 0; - // Max size reached, try to evict expired items or random valid item - pinned.retain(|k, v| { - let valid = v.0.elapsed().as_secs() <= self.seconds; - if valid { - count += 1; - count <= self.size - } else { - false - } - }); - } - result - } - Err(e) => { - let val_ref = e.current; - if val_ref.0.elapsed().as_secs() <= self.seconds { - val_ref.1.get_or_init(init).clone() - } else { - pinned.insert(key.clone(), e.not_inserted); - pinned.get(&key).expect("this should not happen").1.get_or_init(init).clone() - } - } - } - } -} diff --git a/contrib/tantivysearch/src/lib.rs b/contrib/tantivysearch/src/lib.rs deleted file mode 100644 index 6936d0ea444c..000000000000 --- a/contrib/tantivysearch/src/lib.rs +++ /dev/null @@ -1,656 +0,0 @@ -use std::ffi::CString; -use std::ffi::CStr; -use std::mem; -use std::ptr; -use std::slice; -use std::iter::FusedIterator; -use std::cmp::Ordering; - -use libc::*; - -use tantivy::collector::{TopDocs, Count}; -use tantivy::query::QueryParser; -use tantivy::schema::*; -use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::{Index, IndexReader, IndexWriter, SegmentReader, SegmentLocalId, DocId, Score, DocAddress, TantivyError}; -use tantivy::ReloadPolicy; -use rayon::prelude::*; -use std::sync::Arc; - -mod cache; - -static CACHE: once_cell::sync::Lazy, Vec)>>> = once_cell::sync::Lazy::new(|| { - cache::ConcurrentCache::with_capacity(100, 3600, 110) -}); - -const TIMING: bool = true; - -macro_rules! start { - ($val:ident) => { - let $val = if TIMING { - Some(std::time::Instant::now()) - } else { - None - }; - }; -} - -macro_rules! end { - ($val:ident) => { - if TIMING { - let $val = $val.unwrap(); - dbg!($val.elapsed()); - } - }; - ($val:ident, $ex:expr) => { - if TIMING { - let $val = $val.unwrap(); - dbg!($val.elapsed(), $ex); - } - }; -} - -#[derive(Default)] -pub struct Docs { - limit: usize -} - -impl Docs { - pub fn with_limit(limit: usize) -> Docs { - Docs { limit } - } -} - -impl Collector for Docs { - type Fruit = Vec<(Score, DocAddress)>; - - type Child = SegmentDocsCollector; - - fn for_segment( - &self, - segment_local_id: SegmentLocalId, - _: &SegmentReader, - ) -> tantivy::Result { - Ok(SegmentDocsCollector { docs: vec!(), segment_local_id, limit: self.limit }) - } - - fn requires_scoring(&self) -> bool { - false - } - - fn merge_fruits(&self, segment_docs: Vec>) -> tantivy::Result> { - start!(merge); - let lens: Vec<_> = segment_docs.iter().map(|v| v.len()).collect(); - let full_len = lens.iter().sum(); - - let mut all = Vec::with_capacity(full_len); - unsafe { all.set_len(full_len) }; - - let mut mut_slice = &mut all[..]; - let mut mut_slices = vec!(); - for len in lens { - let (slice, rest) = mut_slice.split_at_mut(len); - mut_slices.push(slice); - mut_slice = rest; - } - - segment_docs.into_par_iter().zip(mut_slices.into_par_iter()).for_each(|(vec, slice)| { - slice.copy_from_slice(&vec[..]); - }); - end!(merge); - - start!(resize); - if all.len() > self.limit { - all.resize(self.limit, (0.0f32, DocAddress(0, 0))); - } - end!(resize); - - Ok(all) - } -} - -#[derive(Clone, Copy, Debug)] -pub struct OrdDoc(Score, DocAddress); - -impl Ord for OrdDoc { - fn cmp(&self, other: &Self) -> Ordering { - self.0.partial_cmp(&other.0).unwrap_or(self.1.cmp(&other.1)) - } -} - -impl PartialOrd for OrdDoc { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for OrdDoc { - fn eq(&self, other: &Self) -> bool { - self.1 == other.1 - } -} - -impl Eq for OrdDoc {} - -#[derive(Default)] -pub struct RankedDocs { - limit: usize -} - -impl RankedDocs { - pub fn with_limit(limit: usize) -> RankedDocs { - RankedDocs { limit } - } -} - -impl Collector for RankedDocs { - type Fruit = Vec; - - type Child = SegmentOrdDocsCollector; - - fn for_segment( - &self, - segment_local_id: SegmentLocalId, - _: &SegmentReader, - ) -> tantivy::Result { - Ok(SegmentOrdDocsCollector { docs: vec!(), segment_local_id, limit: self.limit }) - } - - fn requires_scoring(&self) -> bool { - true - } - - fn merge_fruits(&self, segment_docs: Vec>) -> tantivy::Result> { - start!(merge); - let lens: Vec<_> = segment_docs.iter().map(|v| v.len()).collect(); - let full_len = lens.iter().sum(); - - let mut all = Vec::with_capacity(full_len); - unsafe { all.set_len(full_len) }; - - let mut mut_slice = &mut all[..]; - let mut mut_slices = vec!(); - for len in lens { - let (slice, rest) = mut_slice.split_at_mut(len); - mut_slices.push(slice); - mut_slice = rest; - } - - segment_docs.into_par_iter().zip(mut_slices.into_par_iter()).for_each(|(vec, slice)| { - slice.copy_from_slice(&vec[..]); - }); - end!(merge); - - start!(sort); - all.par_sort(); - end!(sort); - - start!(resize); - if all.len() > self.limit { - all.resize(self.limit, OrdDoc(0.0f32, DocAddress(0, 0))); - } - end!(resize); - - Ok(all) - } -} - -#[derive(Default)] -pub struct SegmentOrdDocsCollector { - docs: Vec, - segment_local_id: SegmentLocalId, - limit: usize -} - -impl SegmentCollector for SegmentOrdDocsCollector { - type Fruit = Vec; - - #[inline] - fn collect(&mut self, doc_id: DocId, score: Score) { - if self.docs.len() < self.limit { - self.docs.push(OrdDoc(score, DocAddress(self.segment_local_id, doc_id))); - } - } - - fn harvest(self) -> Vec { - self.docs - } -} - -#[derive(Default)] -pub struct SegmentDocsCollector { - docs: Vec<(Score, DocAddress)>, - segment_local_id: SegmentLocalId, - limit: usize -} - -impl SegmentCollector for SegmentDocsCollector { - type Fruit = Vec<(Score, DocAddress)>; - - #[inline] - fn collect(&mut self, doc_id: DocId, score: Score) { - if self.docs.len() < self.limit { - self.docs.push((score, DocAddress(self.segment_local_id, doc_id))); - } - } - - fn harvest(self) -> Vec<(Score, DocAddress)> { - self.docs - } -} - -fn leak_buf(v: Vec, vallen: *mut size_t) -> *mut c_char { - unsafe { - *vallen = v.len(); - } - let mut bsv = v.into_boxed_slice(); - let val = bsv.as_mut_ptr() as *mut _; - mem::forget(bsv); - val -} - -// #[no_mangle] -// pub unsafe extern "C" fn tantivy_free_buf(buf: *mut c_char, sz: size_t) { -// drop(Vec::from_raw_parts(buf, sz, sz)); -// } -#[derive(Clone)] -pub struct IterWrapper { - inner: Arc<(Vec, Vec)>, - offset: usize -} - -impl From, Vec)>> for IterWrapper { - fn from(inner: Arc<(Vec, Vec)>) -> IterWrapper { - IterWrapper { inner, offset: 0 } - } -} - -impl Iterator for IterWrapper { - type Item = (u64, u64); - - #[inline] - fn next(&mut self) -> Option<(u64, u64)> { - if self.offset >= self.inner.0.len() { - None - } else { - let result = Some((self.inner.0[self.offset], self.inner.1[self.offset])); - self.offset += 1; - result - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let size = self.inner.0.len() - self.offset; - (size, Some(size)) - } - - #[inline] - fn count(self) -> usize { - self.inner.0.len() - self.offset - } -} - -impl FusedIterator for IterWrapper {} - - -#[derive(Clone)] -pub struct VecIterWrapper { - iter: std::vec::IntoIter<(u64, u64)> -} - -impl From> for VecIterWrapper { - fn from(iter: std::vec::IntoIter<(u64, u64)>) -> VecIterWrapper { - VecIterWrapper { iter } - } -} - -impl Iterator for VecIterWrapper { - type Item = (u64, u64); - - #[inline] - fn next(&mut self) -> Option<(u64, u64)> { - self.iter.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.iter.count() - } -} - -impl DoubleEndedIterator for VecIterWrapper { - #[inline] - fn next_back(&mut self) -> Option<(u64, u64)> { - self.iter.next_back() - } -} - -impl FusedIterator for VecIterWrapper {} - -pub struct IndexRW { - pub path: String, - pub index: Index, - pub reader: IndexReader, - pub writer: IndexWriter -} - -#[no_mangle] -pub extern "C" fn tantivysearch_open_or_create_index(dir_ptr: *const c_char) -> *mut IndexRW { - let dir_c_str = unsafe { - assert!(!dir_ptr.is_null()); - - CStr::from_ptr(dir_ptr) - }; - - let dir_str = dir_c_str.to_str().expect("failed to get &str from cstr"); - - println!("Opening index on {}", dir_str); - let mut index = match Index::open_in_dir(dir_str) { - Ok(index) => index, - Err(e) => { - match e { - TantivyError::PathDoesNotExist(_) => { - println!("Creating index on {}", dir_str); - std::fs::create_dir_all(dir_str).expect("failed to create index dir"); - let mut schema_builder = Schema::builder(); - schema_builder.add_u64_field("primary_id", FAST); - schema_builder.add_u64_field("secondary_id", FAST); - schema_builder.add_text_field("body", TEXT); - let schema = schema_builder.build(); - Index::create_in_dir(dir_str, schema).expect("failed to create index") - } - _ => { - panic!("this should not happen"); - } - } - } - }; - - index.set_default_multithread_executor().expect("failed to create thread pool"); - let reader = index - .reader_builder() - .reload_policy(ReloadPolicy::OnCommit) - .try_into().expect("failed to create reader"); - let writer = index - .writer(1024 * 1024 * 1024) - .expect("failed to create writer"); - - // let mut policy = tantivy::merge_policy::LogMergePolicy::default(); - // policy.set_max_merge_size(3_000_000); - - // writer.set_merge_policy(Box::new(policy)); - - Box::into_raw(Box::new(IndexRW { index, reader, writer, path: dir_str.to_string() })) -} - -pub fn tantivysearch_search_impl(irw: *mut IndexRW, query_str: &str, limit: u64) -> Arc<(Vec, Vec)> { - CACHE.resolve((irw as usize, query_str.to_string(), limit, false), move || { - println!("Searching index for {} with limit {}", query_str, limit); - let search = std::time::Instant::now(); - - let schema = unsafe { (*irw).index.schema() }; - - let body = schema.get_field("body").expect("missing field body"); - let primary_id = schema.get_field("primary_id").expect("missing field primary_id"); - let secondary_id = schema.get_field("secondary_id").expect("missing field secondary_id"); - - let searcher = unsafe { (*irw).reader.searcher() }; - let segment_readers = searcher.segment_readers(); - let ff_readers_primary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(primary_id).unwrap() - }).collect(); - let ff_readers_secondary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(secondary_id).unwrap() - }).collect(); - - - let query_parser = QueryParser::for_index(unsafe { &(*irw).index }, vec![body]); - - let query = query_parser.parse_query(query_str).expect("failed to parse query"); - let docs = searcher.search(&query, &Docs::with_limit(limit as usize)).expect("failed to search"); - let mut results: (Vec<_>, Vec<_>) = docs.into_par_iter().map(|(_score, doc_address)| { - let ff_reader_primary = &ff_readers_primary[doc_address.segment_ord() as usize]; - let ff_reader_secondary = &ff_readers_secondary[doc_address.segment_ord() as usize]; - let primary_id: u64 = ff_reader_primary.get(doc_address.doc()); - let secondary_id: u64 = ff_reader_secondary.get(doc_address.doc()); - (primary_id, secondary_id) - }).unzip(); - - dbg!(search.elapsed()); - Arc::new(results) - }) -} - -pub fn tantivysearch_ranked_search_impl(irw: *mut IndexRW, query_str: &str, limit: u64) -> Arc<(Vec, Vec)> { - CACHE.resolve((irw as usize, query_str.to_string(), limit, true), move || { - println!("Searching index for {} with limit {} and ranking", query_str, limit); - let search = std::time::Instant::now(); - - let schema = unsafe { (*irw).index.schema() }; - - let body = schema.get_field("body").expect("missing field body"); - let primary_id = schema.get_field("primary_id").expect("missing field primary_id"); - let secondary_id = schema.get_field("secondary_id").expect("missing field secondary_id"); - - let searcher = unsafe { (*irw).reader.searcher() }; - let segment_readers = searcher.segment_readers(); - let ff_readers_primary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(primary_id).unwrap() - }).collect(); - let ff_readers_secondary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(secondary_id).unwrap() - }).collect(); - - - let query_parser = QueryParser::for_index(unsafe { &(*irw).index }, vec![body]); - - let query = query_parser.parse_query(query_str).expect("failed to parse query"); - let docs = searcher.search(&query, &RankedDocs::with_limit(limit as usize)).expect("failed to search"); - let mut results: (Vec<_>, Vec<_>) = docs.into_par_iter().map(|OrdDoc(_score, doc_address)| { - let ff_reader_primary = &ff_readers_primary[doc_address.segment_ord() as usize]; - let ff_reader_secondary = &ff_readers_secondary[doc_address.segment_ord() as usize]; - let primary_id: u64 = ff_reader_primary.get(doc_address.doc()); - let secondary_id: u64 = ff_reader_secondary.get(doc_address.doc()); - (primary_id, secondary_id) - }).unzip(); - - dbg!(search.elapsed()); - Arc::new(results) - }) -} - -#[no_mangle] -pub extern "C" fn tantivysearch_search(irw: *mut IndexRW, query_ptr: *const c_char, limit: u64) -> *mut IterWrapper { - assert!(!irw.is_null()); - - let query_c_str = unsafe { - assert!(!query_ptr.is_null()); - - CStr::from_ptr(query_ptr) - }; - - let query_str = query_c_str.to_str().expect("failed to get &str from cstr"); - - let results = tantivysearch_search_impl(irw, query_str, limit); - - println!("Search results: {}", results.0.len()); - - Box::into_raw(Box::new(results.into())) -} - -#[no_mangle] -pub extern "C" fn tantivysearch_ranked_search(irw: *mut IndexRW, query_ptr: *const c_char, limit: u64) -> *mut IterWrapper { - assert!(!irw.is_null()); - - let query_c_str = unsafe { - assert!(!query_ptr.is_null()); - - CStr::from_ptr(query_ptr) - }; - - let query_str = query_c_str.to_str().expect("failed to get &str from cstr"); - - let results = tantivysearch_ranked_search_impl(irw, query_str, limit); - - println!("Search results: {}", results.0.len()); - - Box::into_raw(Box::new(results.into())) -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index(irw: *mut IndexRW, primary_ids: *const u64, secondary_ids: *const u64, chars: *const c_char, offsets: *const u64, size: size_t) -> c_uchar { - assert!(!irw.is_null()); - assert!(!primary_ids.is_null()); - assert!(!secondary_ids.is_null()); - assert!(!offsets.is_null()); - assert!(!chars.is_null()); - if size == 0 { - return 1; - } - let primary_slice = unsafe { slice::from_raw_parts(primary_ids, size) }; - let secondary_slice = unsafe { slice::from_raw_parts(secondary_ids, size) }; - let offsets_slice = unsafe { slice::from_raw_parts(offsets, size) }; - let chars_len: usize = (*offsets_slice.iter().last().unwrap()) as usize; - let chars_slice = unsafe { slice::from_raw_parts(chars as *const u8, chars_len) }; - let mut strs = Vec::with_capacity(size); - let mut current_start = 0; - for i in 0..size { - let end: usize = (offsets_slice[i] as usize - 1); - strs.push(unsafe { std::str::from_utf8_unchecked(&chars_slice[current_start..end]) }); - current_start = end + 1; - } - - let schema = unsafe { (*irw).index.schema() }; - - let body = schema.get_field("body").expect("missing field body"); - let primary_id = schema.get_field("primary_id").expect("missing field primary_id"); - let secondary_id = schema.get_field("secondary_id").expect("missing field secondary_id"); - - for i in 0..size { - let mut doc = Document::default(); - doc.add_u64(primary_id, primary_slice[i]); - doc.add_u64(secondary_id, secondary_slice[i]); - doc.add_text(body, strs[i]); - unsafe { (*irw).writer.add_document(doc) }; - } - - 1 -} - -#[no_mangle] -pub extern "C" fn tantivysearch_writer_commit(irw: *mut IndexRW) -> c_uchar { - assert!(!irw.is_null()); - match unsafe { (*irw).writer.commit() } { - Ok(_) => 1, - Err(e) => { - eprintln!("Failed to commit writer: {}", e); - 0 - } - } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index_truncate(irw: *mut IndexRW) -> c_uchar { - assert!(!irw.is_null()); - match unsafe { (*irw).writer.delete_all_documents() } { - Ok(_) => { - match unsafe { (*irw).writer.commit() } { - Ok(_) => 1, - Err(e) => { - eprintln!("Failed to commit writer: {}", e); - 0 - } - } - }, - Err(e) => { - eprintln!("Failed to delete all documents: {}", e); - 0 - } - } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_next(iter_ptr: *mut IterWrapper, primary_id_ptr: *mut u64, secondary_id_ptr: *mut u64) -> c_uchar { - assert!(!iter_ptr.is_null()); - match unsafe { (*iter_ptr).next() } { - Some((primary_id, secondary_id)) => { - unsafe { - *primary_id_ptr = primary_id; - *secondary_id_ptr = secondary_id; - } - 1 - } - None => 0 - } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_batch(iter_ptr: *mut IterWrapper, count: u64, primary_ids_ptr: *mut u64, secondary_ids_ptr: *mut u64) -> size_t { - assert!(!iter_ptr.is_null()); - if primary_ids_ptr.is_null() { - return 0; - } - - let iter_size = unsafe { (*iter_ptr).inner.0.len() - (*iter_ptr).offset }; - let n_to_write = std::cmp::min(count as usize, iter_size); - - unsafe { - let src_ptr = (*iter_ptr).inner.0.as_ptr().offset((*iter_ptr).offset as isize); - std::ptr::copy_nonoverlapping(src_ptr, primary_ids_ptr, n_to_write); - } - - if !secondary_ids_ptr.is_null() { - unsafe { - let src_ptr = (*iter_ptr).inner.1.as_ptr().offset((*iter_ptr).offset as isize); - std::ptr::copy_nonoverlapping(src_ptr, secondary_ids_ptr, n_to_write); - } - } - - unsafe { (*iter_ptr).offset += n_to_write }; - - n_to_write -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_count(iter_ptr: *mut IterWrapper) -> size_t { - assert!(!iter_ptr.is_null()); - unsafe { (*iter_ptr).inner.0.len() - (*iter_ptr).offset } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_free(iter_ptr: *mut IterWrapper) { - assert!(!iter_ptr.is_null()); - drop(unsafe { Box::from_raw(iter_ptr) }); -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index_free(irw: *mut IndexRW) { - assert!(!irw.is_null()); - drop(unsafe { Box::from_raw(irw) }); -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index_delete(irw: *mut IndexRW) { - assert!(!irw.is_null()); - let path = unsafe { (*irw).path.clone() }; - std::fs::remove_dir_all(path).expect("failed to delete index"); - println!("removed dir"); -} - -#[cfg(test)] -mod tests { - #[test] - fn it_works() { - assert_eq!(2 + 2, 4); - } -} diff --git a/docker/builder/Dockerfile b/docker/builder/Dockerfile index b6fa9ab6b4f1..199b5217d795 100644 --- a/docker/builder/Dockerfile +++ b/docker/builder/Dockerfile @@ -37,8 +37,6 @@ RUN apt-get update \ lldb-${LLVM_VERSION} \ --yes --no-install-recommends -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y - COPY build.sh / CMD ["/bin/bash", "/build.sh"] diff --git a/docker/builder/build.sh b/docker/builder/build.sh index d71411e65c9a..d4cf662e91b4 100755 --- a/docker/builder/build.sh +++ b/docker/builder/build.sh @@ -1,9 +1,6 @@ #!/usr/bin/env bash set -e -cd /server/contrib/tantivysearch -~/.cargo/bin/cargo build --release - #ccache -s # uncomment to display CCache statistics mkdir -p /server/build_docker cd /server/build_docker diff --git a/docker/server/.gitignore b/docker/server/.gitignore index 7f07d17405ab..692758d55aa1 100644 --- a/docker/server/.gitignore +++ b/docker/server/.gitignore @@ -1,3 +1,2 @@ alpine-root/* -built-root/* tgz-packages/* diff --git a/docker/server/built.Dockerfile b/docker/server/built.Dockerfile deleted file mode 100644 index 61be1ad1ae46..000000000000 --- a/docker/server/built.Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -FROM ubuntu:20.04 - -ARG gosu_ver=1.10 - -RUN apt-get update \ - && apt-get install --yes --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - dirmngr \ - locales \ - wget \ - && rm -rf \ - /var/lib/apt/lists/* \ - /var/cache/debconf \ - /tmp/* \ - && apt-get clean - -ADD https://github.com/tianon/gosu/releases/download/$gosu_ver/gosu-amd64 /bin/gosu - -RUN locale-gen en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LANGUAGE en_US:en -ENV LC_ALL en_US.UTF-8 -ENV TZ UTC - -RUN mkdir /docker-entrypoint-initdb.d -RUN mkdir -p /etc/clickhouse-server/config.d/ -RUN mkdir -p /etc/clickhouse-server/users.d/ - -COPY built-root/config.xml /etc/clickhouse-server/config.xml -COPY built-root/users.xml /etc/clickhouse-server/users.xml -COPY built-root/clickhouse /usr/bin/clickhouse -RUN ln -s /usr/bin/clickhouse /usr/bin/clickhouse-client -RUN ln -s /usr/bin/clickhouse /usr/bin/clickhouse-server -COPY docker_related_config.xml /etc/clickhouse-server/config.d/ -COPY entrypoint.sh /entrypoint.sh - -RUN useradd -M -U -u 999 clickhouse - -RUN chmod +x \ - /entrypoint.sh \ - /bin/gosu - -EXPOSE 9000 8123 9009 -VOLUME /var/lib/clickhouse - -ENV CLICKHOUSE_CONFIG /etc/clickhouse-server/config.xml - -ENTRYPOINT ["/entrypoint.sh"] diff --git a/docker/server/prepare-built b/docker/server/prepare-built deleted file mode 100644 index eec10ed032a8..000000000000 --- a/docker/server/prepare-built +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -e - -mkdir built-root -cd built-root - -SRC_DIR=../../.. -BUILD_DIR=${SRC_DIR}/build_docker - -cp ${BUILD_DIR}/programs/clickhouse . -cp ${SRC_DIR}/programs/server/{config,users}.xml . - -strip clickhouse \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c780ff6a294c..83e28663b40e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -51,6 +51,7 @@ add_subdirectory (Storages) add_subdirectory (Parsers) add_subdirectory (Parsers/New) add_subdirectory (IO) +add_subdirectory (LuceneAnalyzer) add_subdirectory (Functions) add_subdirectory (Interpreters) add_subdirectory (AggregateFunctions) @@ -89,6 +90,10 @@ if (USE_ROCKSDB) add_headers_and_sources(dbms Storages/RocksDB) endif() +#if (USE_LUCENE) +# add_headers_and_sources(dbms Storages/LUCENE) +#endif() + if (USE_AWS_S3) add_headers_and_sources(dbms Common/S3) add_headers_and_sources(dbms Disks/S3) @@ -166,6 +171,7 @@ add_object_library(clickhouse_access Access) add_object_library(clickhouse_core Core) add_object_library(clickhouse_core_mysql Core/MySQL) add_object_library(clickhouse_compression Compression) +add_object_library(clickhouse_lucene_analyzer LuceneAnalyzer) add_object_library(clickhouse_datastreams DataStreams) add_object_library(clickhouse_datatypes DataTypes) add_object_library(clickhouse_databases Databases) @@ -339,7 +345,6 @@ dbms_target_link_libraries ( Poco::JSON Poco::MongoDB string_utils - tantivysearch PUBLIC ${MYSQLXX_LIBRARY} boost::system @@ -452,6 +457,11 @@ if (USE_ROCKSDB) dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR}) endif() +if (USE_LUCENE) + dbms_target_link_libraries(PUBLIC ${LUCENE_LIBRARY}) + dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LUCENE_INCLUDE_DIR}) +endif() + if (USE_LIBPQXX) dbms_target_link_libraries(PUBLIC ${LIBPQXX_LIBRARY}) dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LIBPQXX_INCLUDE_DIR}) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index fa921ef7c1c6..9f357019382d 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -541,6 +541,7 @@ M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \ M(573, EPOLL_ERROR) \ M(574, DISTRIBUTED_TOO_MANY_PENDING_BYTES) \ + M(801, UNKNOWN_LUCENE_ANLYZER) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 666ef32efdf7..69277a1a21ce 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -12,5 +12,6 @@ #cmakedefine01 USE_OPENCL #cmakedefine01 USE_LDAP #cmakedefine01 USE_ROCKSDB +#cmakedefine01 USE_LUCENE #cmakedefine01 USE_LIBPQXX #cmakedefine01 USE_NURAFT diff --git a/src/Functions/Tantivy.cpp b/src/Functions/Lucene.cpp similarity index 93% rename from src/Functions/Tantivy.cpp rename to src/Functions/Lucene.cpp index 0c3800d8e8a6..879b041f3c46 100644 --- a/src/Functions/Tantivy.cpp +++ b/src/Functions/Lucene.cpp @@ -18,14 +18,14 @@ namespace ErrorCodes namespace { -class FunctionTantivy : public IFunction +class FunctionLucene : public IFunction { public: - static constexpr auto name = "tantivy"; + static constexpr auto name = "lucene"; static FunctionPtr create(const Context &) { - return std::make_shared(); + return std::make_shared(); } std::string getName() const override @@ -107,9 +107,9 @@ class FunctionTantivy : public IFunction } -void registerFunctionTantivy(FunctionFactory & factory) +void registerFunctionLucene(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index d889f52dc5c7..bd096a904881 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -35,7 +35,7 @@ void registerFunctionNormalizedQueryHash(FunctionFactory &); void registerFunctionCountMatches(FunctionFactory &); void registerFunctionEncodeXMLComponent(FunctionFactory &); void registerFunctionDecodeXMLComponent(FunctionFactory &); -void registerFunctionTantivy(FunctionFactory &); +void registerFunctionLucene(FunctionFactory &); void registerFunctionExtractTextFromHTML(FunctionFactory &); @@ -75,7 +75,7 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionCountMatches(factory); registerFunctionEncodeXMLComponent(factory); registerFunctionDecodeXMLComponent(factory); - registerFunctionTantivy(factory); + registerFunctionLucene(factory); registerFunctionExtractTextFromHTML(factory); #if USE_BASE64 registerFunctionBase64Encode(factory); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index fd4ead58c1fc..17a16952ff88 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -333,6 +334,21 @@ ASTPtr InterpreterCreateQuery::formatColumns(const ColumnsDescription & columns) if (column.ttl) column_declaration->ttl = column.ttl; + if (column.store_modifier) + column_declaration->store_modifier = column.store_modifier; + + if (column.index_modifier) + column_declaration->index_modifier = column.index_modifier; + + if (column.termvector_modifier) + column_declaration->termvector_modifier = column.termvector_modifier; + + if (column.analyzer) + column_declaration->analyzer=column.analyzer; + + if (column.search_analyzer) + column_declaration->search_analyzer=column.search_analyzer; + columns_list->children.push_back(column_declaration_ptr); } @@ -474,6 +490,30 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.ttl) column.ttl = col_decl.ttl; + if (col_decl.store_modifier) + column.store_modifier = col_decl.store_modifier; + + if (col_decl.index_modifier) + column.index_modifier = col_decl.index_modifier; + + if (col_decl.store_modifier) + column.termvector_modifier = col_decl.termvector_modifier; + + if (col_decl.analyzer) + { + auto& name = col_decl.analyzer->children[0]->children[0]->as()->name; + AnalyzerFactory::instance().validate(name); + + column.analyzer = col_decl.analyzer; + } + + if (col_decl.search_analyzer) + { + auto& name = col_decl.search_analyzer->children[0]->children[0]->as()->name; + AnalyzerFactory::instance().validate(name); + column.search_analyzer = col_decl.search_analyzer; + } + res.add(std::move(column)); } diff --git a/src/LuceneAnalyzer/AnalyzerFactory.cpp b/src/LuceneAnalyzer/AnalyzerFactory.cpp new file mode 100644 index 000000000000..561f73b30e5f --- /dev/null +++ b/src/LuceneAnalyzer/AnalyzerFactory.cpp @@ -0,0 +1,46 @@ +#include +#include +#include + +namespace DB{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_LUCENE_ANLYZER; +} + +LuceneAnalyzerPair genLuceneAnalyzers() +{ + static LuceneAnalyzerPair analyzers; + analyzers["STANDARDANALYZER"] = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); + // TODO: stop words + analyzers["STOPANALYZER"] = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); + analyzers["WHITESPACEANALYZER"] = Lucene::newLucene(); + analyzers["SIMPLEANALYZER"] = Lucene::newLucene(); + return analyzers; +} + +const LuceneAnalyzerPair AnalyzerFactory::analyzers = genLuceneAnalyzers(); + +void AnalyzerFactory::validate(const String& name) const +{ + + auto name_u = Poco::toUpper(name); + if (analyzers.find(name_u) == analyzers.end()) + { + throw Exception("Unknown LuceneAnalyzer family analyzer: " + name, ErrorCodes::UNKNOWN_LUCENE_ANLYZER); + } +} + +AnalyzerFactory::AnalyzerFactory() +{ +} + +AnalyzerFactory& AnalyzerFactory::instance() +{ + static AnalyzerFactory ret; + return ret; +} + +} + diff --git a/src/LuceneAnalyzer/AnalyzerFactory.h b/src/LuceneAnalyzer/AnalyzerFactory.h new file mode 100644 index 000000000000..f2aa11bb551e --- /dev/null +++ b/src/LuceneAnalyzer/AnalyzerFactory.h @@ -0,0 +1,30 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +using LuceneAnalyzerPair = std::unordered_map; + +class AnalyzerFactory +{ +public: + + static AnalyzerFactory & instance(); + + const char* getDefaultAnalyzer() const; + + /// Validate codecs AST specified by user + void validate(const String & analyzer_name) const; + +public: + static const LuceneAnalyzerPair analyzers; + +private: + AnalyzerFactory(); +}; + +} + diff --git a/src/LuceneAnalyzer/CMakeLists.txt b/src/LuceneAnalyzer/CMakeLists.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/LuceneAnalyzer/ya.make b/src/LuceneAnalyzer/ya.make new file mode 100644 index 000000000000..0ea23e229160 --- /dev/null +++ b/src/LuceneAnalyzer/ya.make @@ -0,0 +1,17 @@ +# This file is generated automatically, do not edit. See 'ya.make.in' and use 'utils/generate-ya-make' to regenerate it. +OWNER(g:clickhouse) + +LIBRARY() + + +PEERDIR( + clickhouse/src/Common + contrib/LucenePlusPlus +) + + +SRCS( + AnalyzerFactory.cpp +) + +END() diff --git a/src/LuceneAnalyzer/ya.make.in b/src/LuceneAnalyzer/ya.make.in new file mode 100644 index 000000000000..b4c074a6df5e --- /dev/null +++ b/src/LuceneAnalyzer/ya.make.in @@ -0,0 +1,16 @@ +OWNER(g:clickhouse) + +LIBRARY() + + +PEERDIR( + clickhouse/src/Common + contrib/LucenePlusPlus +) + + +SRCS( + +) + +END() diff --git a/src/Parsers/ASTColumnDeclaration.cpp b/src/Parsers/ASTColumnDeclaration.cpp index 4c14230e926b..ee582f4b5c96 100644 --- a/src/Parsers/ASTColumnDeclaration.cpp +++ b/src/Parsers/ASTColumnDeclaration.cpp @@ -43,6 +43,28 @@ ASTPtr ASTColumnDeclaration::clone() const res->children.push_back(res->ttl); } + if (store_modifier) { + res->store_modifier = store_modifier; + } + + if (index_modifier) { + res->index_modifier = index_modifier; + } + + if (termvector_modifier) { + res->termvector_modifier = termvector_modifier; + } + + if (analyzer) + { + res->analyzer = analyzer; + } + + if (search_analyzer) + { + res->search_analyzer = search_analyzer; + } + return res; } @@ -92,6 +114,36 @@ void ASTColumnDeclaration::formatImpl(const FormatSettings & settings, FormatSta settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") << "TTL" << (settings.hilite ? hilite_none : "") << ' '; ttl->formatImpl(settings, state, frame); } + + if (store_modifier) + { + settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") + << (*store_modifier ? "STORE" : "NOT_STORE") << (settings.hilite ? hilite_none : ""); + } + + if (index_modifier) + { + settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") + << (*index_modifier ? "INDEX" : "NOT_INDEX") << (settings.hilite ? hilite_none : ""); + } + + if (termvector_modifier) + { + settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") + << (*termvector_modifier ? "TERMVECTOR" : "NOT_TERMVECTOR") << (settings.hilite ? hilite_none : ""); + } + + if (analyzer) + { + settings.ostr << ' '; + analyzer->formatImpl(settings, state, frame); + } + + if (search_analyzer) + { + settings.ostr << ' '; + search_analyzer->formatImpl(settings, state, frame); + } } } diff --git a/src/Parsers/ASTColumnDeclaration.h b/src/Parsers/ASTColumnDeclaration.h index ea17a8b4dfa3..8c512bf8684a 100644 --- a/src/Parsers/ASTColumnDeclaration.h +++ b/src/Parsers/ASTColumnDeclaration.h @@ -19,6 +19,11 @@ class ASTColumnDeclaration : public IAST ASTPtr comment; ASTPtr codec; ASTPtr ttl; + std::optional store_modifier; + std::optional index_modifier; + std::optional termvector_modifier; + ASTPtr analyzer; + ASTPtr search_analyzer; String getID(char delim) const override { return "ColumnDeclaration" + (delim + name); } diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index 85b5217b617f..d8cf89ca3726 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -17,10 +17,9 @@ class ParserKeyword : public IParserBase public: ParserKeyword(const char * s_); - -protected: const char * getName() const override; +protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 7a426e7774d3..2dd691397d33 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -815,6 +815,13 @@ bool ParserCodecDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & std::make_unique(TokenType::Comma), false).parse(pos, node, expected); } +bool ParserAnalyzerDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + // TODO:: analyzer family + return ParserList(std::make_unique(), + std::make_unique(TokenType::Comma), false).parse(pos, node, expected); +} + bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserCodecDeclarationList codecs; @@ -840,6 +847,36 @@ bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; } +ParserAnalyzer::ParserAnalyzer(const char* s_):s(s_) +{ + +} + +bool ParserAnalyzer::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + ParserAnalyzerDeclarationList analyzer; + ASTPtr expr_list_args; + + if (pos->type != TokenType::OpeningRoundBracket) + return false; + + ++pos; + if (!analyzer.parse(pos, expr_list_args, expected)) + return false; + + if (pos->type != TokenType::ClosingRoundBracket) + return false; + ++pos; + + auto function_node = std::make_shared(); + function_node->name = s; + function_node->arguments = expr_list_args; + function_node->children.push_back(function_node->arguments); + + node = function_node; + return true; +} + bool ParserCastExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { /// Either CAST(expr AS type) or CAST(expr, 'type') diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index b6194f981fec..d816dc041ef6 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -199,6 +199,13 @@ class ParserCodecDeclarationList : public IParserBase bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserAnalyzerDeclarationList : public IParserBase +{ +protected: + const char * getName() const override { return "codec declaration list"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + /** Parse compression codec * CODEC(ZSTD(2)) */ @@ -209,6 +216,17 @@ class ParserCodec : public IParserBase bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserAnalyzer : public IParserBase +{ +private: + const char * s; +public: + ParserAnalyzer(const char * s_); +protected: + const char * getName() const override { return "analyzer"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + class ParserCastExpression : public IParserBase { protected: diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index fbdc308d5bcd..33cc50854a99 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -125,10 +125,22 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ParserKeyword s_comment{"COMMENT"}; ParserKeyword s_codec{"CODEC"}; ParserKeyword s_ttl{"TTL"}; + ParserKeyword s_store{"STORE"}; + ParserKeyword s_not_store{"NOT_STORE"}; + ParserKeyword s_index{"INDEX"}; + ParserKeyword s_not_index{"NOT_INDEX"}; + ParserKeyword s_termvector{"TERMVECTOR"}; + ParserKeyword s_not_termvector{"NOT_TERMVECTOR"}; + ParserKeyword s_analyzer{"ANALYZER"}; + ParserKeyword s_search_analyzer{"SEARCH_ANALYZER"}; ParserKeyword s_remove{"REMOVE"}; ParserTernaryOperatorExpression expr_parser; ParserStringLiteral string_literal_parser; ParserCodec codec_parser; + // ParserAnalyzer analyzer_parser{"ANALYZER"}; + // ParserAnalyzer search_analyzer_parser{"SEARCH_ANALYZER"}; + ParserAnalyzer analyzer_parser{s_analyzer.getName()}; + ParserAnalyzer search_analyzer_parser{s_search_analyzer.getName()}; ParserExpression expression_parser; /// mandatory column name @@ -164,6 +176,11 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ASTPtr comment_expression; ASTPtr codec_expression; ASTPtr ttl_expression; + std::optional store_modifier; + std::optional index_modifier; + std::optional termvector_modifier; + ASTPtr analyzer_expression; + ASTPtr search_analyzer_expression; if (!s_default.checkWithoutMoving(pos, expected) && !s_materialized.checkWithoutMoving(pos, expected) @@ -220,6 +237,57 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E return false; } + if (s_store.ignore(pos, expected)) + { + store_modifier.emplace(true); + } + if (s_not_store.ignore(pos, expected)) + { + if (store_modifier) + { + return false; + } + store_modifier.emplace(false); + } + + if (s_index.ignore(pos, expected)) + { + index_modifier.emplace(true); + } + if (s_not_index.ignore(pos, expected)) + { + if (index_modifier) + { + return false; + } + index_modifier.emplace(false); + } + + if (s_termvector.ignore(pos, expected)) + { + termvector_modifier.emplace(true); + } + if (s_not_termvector.ignore(pos, expected)) + { + if (termvector_modifier) + { + return false; + } + termvector_modifier.emplace(false); + } + + if (s_analyzer.ignore(pos, expected)) + { + if (!analyzer_parser.parse(pos, analyzer_expression, expected)) + return false; + } + + if (s_search_analyzer.ignore(pos, expected)) + { + if (!search_analyzer_parser.parse(pos, search_analyzer_expression, expected)) + return false; + } + node = column_declaration; if (type) @@ -255,6 +323,22 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E column_declaration->children.push_back(std::move(ttl_expression)); } + column_declaration->store_modifier = store_modifier; + column_declaration->index_modifier = index_modifier; + column_declaration->termvector_modifier = termvector_modifier; + + if (analyzer_expression) + { + column_declaration->analyzer = analyzer_expression; + column_declaration->children.push_back(std::move(analyzer_expression)); + } + + if (search_analyzer_expression) + { + column_declaration->search_analyzer = search_analyzer_expression; + column_declaration->children.push_back(std::move(search_analyzer_expression)); + } + return true; } diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 26e300045447..d307f3583a55 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -33,6 +33,11 @@ struct ColumnDescription String comment; ASTPtr codec; ASTPtr ttl; + std::optional store_modifier; + std::optional index_modifier; + std::optional termvector_modifier; + ASTPtr analyzer; + ASTPtr search_analyzer; ColumnDescription() = default; ColumnDescription(ColumnDescription &&) = default; diff --git a/src/Storages/StorageLucene.cpp b/src/Storages/StorageLucene.cpp new file mode 100644 index 000000000000..7824a9cda0ce --- /dev/null +++ b/src/Storages/StorageLucene.cpp @@ -0,0 +1,419 @@ +#include + +#include + +#include +#include + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int INCORRECT_FILE_NAME; +} + +using LuceneConfig = std::unordered_map>; + +class LuceneSource : public SourceWithProgress +{ +public: + LuceneSource( + Names column_names_, + const StorageLucene & storage_, + const StorageMetadataPtr & metadata_snapshot_, + const String & query_text_, + const Int32 limit_, + Lucene::FSDirectoryPtr index_dir_) + : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns(column_names_, storage_.getVirtuals(), storage_.getStorageID())), + column_names(std::move(column_names_)), + metadata_snapshot(metadata_snapshot_), + query_text(std::move(query_text_)), + limit(limit_) + { + this->reader = Lucene::IndexReader::open(index_dir_, true); + std::cout << "Opened lucene index path" << std::endl; + + std::wstring_convert> converter; + auto& columns = metadata_snapshot->getColumns(); + for(auto& column : columns) { + auto analyzer = (column.analyzer) ? column.analyzer->children[0]->children[0]->as()->name : "StandardAnalyzer"; + auto search_analyzer = (column.search_analyzer) ? column.search_analyzer->children[0]->children[0]->as()->name : analyzer; + search_analyzer = Poco::toUpper(search_analyzer); + configs[column.name] = std::make_tuple(false, false, false, "", search_analyzer); + } + auto fieldAnalyzers = Lucene::MapStringAnalyzer::newInstance(); + for(auto& config: configs) + { + auto& col_name = config.first; + auto& ana_name = std::get<4>(config.second); + Lucene::String col_name_ws = converter.from_bytes(col_name); + auto ana_name_up = Poco::toUpper(ana_name); + fieldAnalyzers.put(col_name_ws, AnalyzerFactory::analyzers.at(ana_name_up)); + } + Lucene::PerFieldAnalyzerWrapperPtr aWrapper = + Lucene::newLucene( + Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + fieldAnalyzers); + + + this->searcher = Lucene::newLucene(this->reader); + Lucene::QueryPtr query; + if (!this->query_text.empty()) + { + Lucene::Collection fields = Lucene::Collection::newInstance(column_names.size()); + for (size_t i = 0; i < column_names.size(); ++i) + { + fields[i] = (converter.from_bytes(column_names[i])); + } + + Lucene::QueryParserPtr parser + = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, fields, aWrapper); + query = parser->parse(converter.from_bytes(query_text)); + + std::cout << "Search query_text: " << query_text << std::endl; + } + else + { + query = Lucene::newLucene(); + std::cout << "Search all docs" << std::endl; + } + // Sort, use TopFieldCollector + Lucene::TopScoreDocCollectorPtr collector = Lucene::TopScoreDocCollector::create(limit, false); + searcher->search(query, collector); + this->hits = collector->topDocs()->scoreDocs; + + } + + ~LuceneSource() override { + this->searcher->close(); + this->reader->close(); + } + + String getName() const override { return "Lucene"; } + +protected: + Chunk generate() override + { + if (current_block_idx == 1) + return {}; + + const auto & sample_block = metadata_snapshot->getSampleBlock(); + auto columns = sample_block.cloneEmptyColumns(); + std::wstring_convert> converter; + + for (int i = 0; i < hits.size(); ++i) + { + Lucene::DocumentPtr doc = this->searcher->doc(hits[i]->doc); + + size_t idx = 0; + for (const auto & elem : sample_block) + { + Lucene::String doc_column_name = converter.from_bytes(elem.name); + Lucene::String doc_column_value = doc->get(doc_column_name); + String column_value = converter.to_bytes(doc_column_value); + ReadBufferFromString column_value_buffer(column_value); + std::cout << "Searched: row[" << i << "]column[" << idx << "]: " << elem.name << "=" << column_value << std::endl; + elem.type->deserializeAsWholeText(*columns[idx], column_value_buffer, FormatSettings()); + ++idx; + } + } + + current_block_idx = 1; + UInt64 num_rows = columns.at(0)->size(); + return Chunk(std::move(columns), num_rows); + } + +private: + const Names column_names; + const StorageMetadataPtr metadata_snapshot; + size_t current_block_idx = 0; + const String query_text; + Int32 limit; + Lucene::IndexReaderPtr reader; + Lucene::SearcherPtr searcher; + Lucene::Collection hits; + LuceneConfig configs; +}; + +class LuceneBlockOutputStream : public IBlockOutputStream +{ +public: + explicit LuceneBlockOutputStream( + StorageLucene & storage_, + const StorageMetadataPtr & metadata_snapshot_) + : storage(storage_) + , metadata_snapshot(metadata_snapshot_) + { + auto& columns = metadata_snapshot->getColumns(); + for(auto& column : columns) { + bool store = (column.store_modifier) ? *column.store_modifier: false; + bool index = (column.index_modifier) ? *column.index_modifier: true; + bool termvector = (column.termvector_modifier) ? *column.termvector_modifier: false; + auto analyzer = (column.analyzer) ? column.analyzer->children[0]->children[0]->as()->name : "StandardAnalyzer"; + analyzer = Poco::toUpper(analyzer); + configs[column.name] = std::make_tuple(store, index, termvector, analyzer, ""); + } + } + Block getHeader() const override { return metadata_snapshot->getSampleBlock(); } + void write(const Block & block) override + { + const auto size_bytes_diff = block.allocatedBytes(); + const auto size_rows_diff = block.rows(); + metadata_snapshot->check(block, true); + { + std::wstring_convert> converter; + Lucene::String index_path_ws = converter.from_bytes(storage.index_path); + // create a new index if there is not already an index at the provided path + // and otherwise open the existing index. + auto fieldAnalyzers = Lucene::MapStringAnalyzer::newInstance(); + for(auto& config: configs) + { + auto& col_name = config.first; + auto& ana_name_up = std::get<3>(config.second); + Lucene::String col_name_ws = converter.from_bytes(col_name); + fieldAnalyzers.put(col_name_ws, AnalyzerFactory::analyzers.at(ana_name_up)); + } + Lucene::PerFieldAnalyzerWrapperPtr aWrapper = + Lucene::newLucene( + Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + fieldAnalyzers); + Lucene::IndexWriterPtr writer = Lucene::newLucene( + Lucene::FSDirectory::open(index_path_ws), + aWrapper, + Lucene::IndexWriter::MaxFieldLengthLIMITED); + + auto rows = block.rows(); + + WriteBufferFromOwnString write_buffer; + + for (size_t i = 0; i < rows; i++) + { + std::cout << "Lucene inserting row[" << i << "]" << std::endl; + Lucene::DocumentPtr doc = Lucene::newLucene(); + size_t idx = 0; + for (const auto & elem : block) + { + write_buffer.restart(); + auto column_name = block.safeGetByPosition(idx).name; + // TODO: Optimize code structure + auto config = configs[column_name]; + auto store = std::get<0>(config); + auto index = std::get<1>(config); + auto termvector = std::get<2>(config); + elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); + doc->add(Lucene::newLucene( + converter.from_bytes(column_name), + converter.from_bytes(write_buffer.str()), + store ? Lucene::Field::STORE_YES : Lucene::Field::STORE_NO, + index ? Lucene::Field::INDEX_ANALYZED : Lucene::Field::INDEX_NOT_ANALYZED, + termvector ? Lucene::Field::TERM_VECTOR_YES : Lucene::Field::TERM_VECTOR_NO)); + + ++idx; + } + std::cout << "Lucene inserted row[" << i << "]" << std::endl; + writer->addDocument(doc); + } + if (rows > 0) + { + writer->optimize(); + } + writer->close(); + + storage.total_size_bytes.fetch_add(size_bytes_diff, std::memory_order_relaxed); + storage.total_size_rows.fetch_add(size_rows_diff, std::memory_order_relaxed); + } + } +private: + StorageLucene & storage; + StorageMetadataPtr metadata_snapshot; + LuceneConfig configs; +}; + + +StorageLucene::StorageLucene(const std::string & relative_table_dir_path, CommonArguments args) + : StorageLucene(args) +{ + if (relative_table_dir_path.empty()) + throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME); + + this->index_path = base_path + relative_table_dir_path + "/"; + std::cout << "StorageLucene index_path:" << this->index_path << std::endl; + Poco::File(this->index_path).createDirectories(); +} + +StorageLucene::StorageLucene(CommonArguments args) + : IStorage(args.table_id) + , base_path(args.context.getPath()) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(args.columns); + storage_metadata.setConstraints(args.constraints); + setInMemoryMetadata(storage_metadata); +} + +Pipe StorageLucene::read( + const Names & column_names, + const StorageMetadataPtr & metadata_snapshot, + SelectQueryInfo & query_info, + const Context & /*context*/, + QueryProcessingStage::Enum /*processed_stage*/, + size_t /*max_block_size*/, + unsigned /*num_streams*/) +{ + metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + + String query_text; + Int32 limit = 10000; + const ASTSelectQuery & select = query_info.query->as(); + const ASTPtr & where = select.where(); + if (where) + { + const auto * function = where->as(); + if (function->name != "lucene") + { + throw Exception("WHERE clause should contain only lucene function", ErrorCodes::NOT_IMPLEMENTED); + } + + if (function->arguments->children.size() >= 1) + { + query_text = function->arguments->children[0]->as().value.safeGet(); + } + + if (function->arguments->children.size() >= 2) + { + if (function->arguments->children[1]->as()) + { + limit = function->arguments->children[1]->as().value.safeGet(); + } + } + } + + + std::wstring_convert> converter; + Lucene::String index_path_ws = converter.from_bytes(index_path); + Lucene::FSDirectoryPtr index_dir = Lucene::FSDirectory::open(index_path_ws); + if (index_dir->listAll().empty()) + { + std::cout << "No files in lucene index path: " << this->index_path << std::endl; + return {}; + } + + + return Pipe( + std::make_shared( + column_names, + *this, + metadata_snapshot, + query_text, + limit, + index_dir + )); +} + +BlockOutputStreamPtr StorageLucene::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) +{ + return std::make_shared(*this, metadata_snapshot); +} + +bool StorageLucene::optimize( + const ASTPtr & /*query*/, + const StorageMetadataPtr & /*metadata_snapshot*/, + const ASTPtr & /*partition*/, + bool /*final*/, + bool /*deduplicate*/, + const Names & /*deduplicate_by_columns*/, + const Context & /*context*/) +{ + std::cerr << "Running optimize" << std::endl; + return false; +} + +void StorageLucene::truncate( + const ASTPtr & /*query*/, + const StorageMetadataPtr & /* metadata_snapshot */, + const Context & /* context */, + TableExclusiveLockHolder &) +{ + std::cout << "StorageLucene is truncate" << std::endl; + Poco::File(this->index_path).remove(true); + Poco::File(this->index_path).createDirectories(); + // TODO: init lucene index files +} + + +std::optional StorageLucene::totalRows(const Settings &) const +{ + /// All modifications of these counters are done under mutex which automatically guarantees synchronization/consistency + /// When run concurrently we are fine with any value: "before" or "after" + return total_size_rows.load(std::memory_order_relaxed); +} + +std::optional StorageLucene::totalBytes(const Settings &) const +{ + return total_size_bytes.load(std::memory_order_relaxed); +} + +//void StorageLucene::startup() +//{ +// return; +//} + +// when "DROP TABLE" is called, or clickhouse-server is shutdown +//void StorageLucene::shutdown() +//{ +// std::cout << "StorageLucene is shutdown" << std::endl; +// Poco::File(index_path).remove(true); +// return; +//} + +//void StorageLucene::drop() { +// std::cout << "StorageLucene is dropped" << std::endl; +// Poco::File(index_path).remove(true); +// return; +//} + +void registerStorageLucene(StorageFactory & factory) +{ + factory.registerStorage("Lucene", [](const StorageFactory::Arguments & factory_args) + { + StorageLucene::CommonArguments storage_args{ + .table_id = factory_args.table_id, + .columns = factory_args.columns, + .constraints = factory_args.constraints, + .context = factory_args.context + }; + + return StorageLucene::create(factory_args.relative_data_path, storage_args); + }); +} + +} diff --git a/src/Storages/StorageTantivy.h b/src/Storages/StorageLucene.h similarity index 72% rename from src/Storages/StorageTantivy.h rename to src/Storages/StorageLucene.h index c2e42cbcbd14..acc341ba3e8d 100644 --- a/src/Storages/StorageTantivy.h +++ b/src/Storages/StorageLucene.h @@ -9,9 +9,11 @@ #include #include #include -#include #include +#include +#include +#include namespace DB { @@ -21,14 +23,14 @@ namespace DB * It does not support keys. * Data is stored as a set of blocks and is not stored anywhere else. */ -class StorageTantivy final : public ext::shared_ptr_helper, public IStorage +class StorageLucene final : public ext::shared_ptr_helper, public IStorage { -friend struct ext::shared_ptr_helper; -friend class TantivyBlockOutputStream; +friend struct ext::shared_ptr_helper; +friend class LuceneBlockOutputStream; public: - String getName() const override { return "Tantivy"; } + String getName() const override { return "Lucene"; } size_t getSize() const { return data.size(); } @@ -45,8 +47,8 @@ friend class TantivyBlockOutputStream; size_t max_block_size, unsigned num_streams) override; - void startup() override; - void shutdown() override; +// void startup() override; +// void shutdown() override; bool supportsParallelInsert() const override { return false; } @@ -67,25 +69,38 @@ friend class TantivyBlockOutputStream; const Context & context, TableExclusiveLockHolder &) override; - void drop() override; +// void drop() override; bool supportsSampling() const override { return false; } std::optional totalRows(const Settings &) const override; std::optional totalBytes(const Settings &) const override; + struct CommonArguments + { + StorageID table_id; + const ColumnsDescription & columns; + const ConstraintsDescription & constraints; + const Context & context; + }; + private: + String base_path; /// The data itself. `list` - so that when inserted to the end, the existing iterators are not invalidated. BlocksList data; String index_path; mutable std::mutex mutex; - TantivySearchIndexRW *tantivy_index = nullptr; +// Lucene::IndexReaderPtr reader; +// Lucene::IndexWriterPtr writer; std::atomic total_size_bytes = 0; std::atomic total_size_rows = 0; Poco::Logger * log; protected: - StorageTantivy(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_); + StorageLucene(const std::string & relative_table_dir_path, CommonArguments args); + +private: + explicit StorageLucene(CommonArguments args); }; } diff --git a/src/Storages/StorageTantivy.cpp b/src/Storages/StorageTantivy.cpp deleted file mode 100644 index 36ade534c2fd..000000000000 --- a/src/Storages/StorageTantivy.cpp +++ /dev/null @@ -1,315 +0,0 @@ -#include - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; -} - - -class TantivySource : public SourceWithProgress -{ -public: - TantivySource( - Names column_names_, - const StorageTantivy & storage, - const StorageMetadataPtr & metadata_snapshot, - const String & tantivy_arg_, - const UInt64 limit_, - TantivySearchIterWrapper *tantivy_iter_) - : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names(std::move(column_names_)) - , tantivy_arg(std::move(tantivy_arg_)) - , limit(limit_) - , tantivy_iter(tantivy_iter_) - { - } - - String getName() const override { return "Tantivy"; } - -protected: - Chunk generate() override - { - if (current_block_idx == 1) - return {}; - - Columns columns; - columns.reserve(column_names.size()); - - auto column_primary = ColumnUInt64::create(); - auto & data_primary = column_primary->getData(); - - auto column_secondary = ColumnUInt64::create(); - auto & data_secondary = column_secondary->getData(); - - size_t tantivy_size = tantivysearch_iter_count(tantivy_iter); - if (tantivy_size < limit) - limit = tantivy_size; - data_primary.resize(limit); - data_secondary.resize(limit); - - UInt64 i = 0; - UInt64 primary_id = 0; - UInt64 secondary_id = 0; - int r = tantivysearch_iter_next(tantivy_iter, &primary_id, &secondary_id); - while (r) - { - data_primary[i] = primary_id; - data_secondary[i] = secondary_id; - if (i > limit) - break; - i++; - r = tantivysearch_iter_next(tantivy_iter, &primary_id, &secondary_id); - } - tantivysearch_iter_free(tantivy_iter); - - for (size_t c=0; csize(); - return Chunk(std::move(columns), num_rows); - } - -private: - const Names column_names; - size_t current_block_idx = 0; - const String tantivy_arg; - UInt64 limit; - TantivySearchIterWrapper *tantivy_iter; -}; - -class TantivyBlockOutputStream : public IBlockOutputStream -{ -public: - explicit TantivyBlockOutputStream( - StorageTantivy & storage_, - const StorageMetadataPtr & metadata_snapshot_) - : storage(storage_) - , metadata_snapshot(metadata_snapshot_) - {} - Block getHeader() const override { return metadata_snapshot->getSampleBlock(); } - void write(const Block & block) override - { - const auto size_bytes_diff = block.allocatedBytes(); - const auto size_rows_diff = block.rows(); - metadata_snapshot->check(block, true); - { - // std::lock_guard lock(storage.mutex); - // auto new_data = std::make_unique(*(storage.data.get())); - // new_data->push_back(block); - // storage.data.set(std::move(new_data)); - if (block.columns() != 3) { - throw Exception( - "Inserts need all columns", - ErrorCodes::NOT_IMPLEMENTED); - } - - auto & primary_id = block.getByName("primary_id"); - auto primary_id_col = checkAndGetColumn(primary_id.column.get()); - auto & secondary_id = block.getByName("secondary_id"); - auto secondary_id_col = checkAndGetColumn(secondary_id.column.get()); - auto & body = block.getByName("body"); - auto body_col = checkAndGetColumn(body.column.get()); - - if (primary_id_col && secondary_id_col && body_col) - { - auto & primary_data = primary_id_col->getData(); - auto & secondary_data = secondary_id_col->getData(); - auto & chars = body_col->getChars(); - auto & offsets = body_col->getOffsets(); - const char * char_ptr = reinterpret_cast(&chars[0]); - - int res = tantivysearch_index(storage.tantivy_index, &primary_data[0], &secondary_data[0], char_ptr, &offsets[0], primary_data.size()); - std::cerr << "index result: " << res << std::endl; - } else { - throw Exception( - "Inserts need all columns", - ErrorCodes::NOT_IMPLEMENTED); - } - - storage.total_size_bytes.fetch_add(size_bytes_diff, std::memory_order_relaxed); - storage.total_size_rows.fetch_add(size_rows_diff, std::memory_order_relaxed); - } - } -private: - StorageTantivy & storage; - StorageMetadataPtr metadata_snapshot; -}; - - -StorageTantivy::StorageTantivy(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_) - : IStorage(table_id_), index_path(index_path_), log(&Poco::Logger::get("StorageTantivy (" + table_id_.table_name + ")")) -{ - StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(std::move(columns_description_)); - storage_metadata.setConstraints(std::move(constraints_)); - setInMemoryMetadata(storage_metadata); -} - - -Pipe StorageTantivy::read( - const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, - SelectQueryInfo & query_info, - const Context & /*context*/, - QueryProcessingStage::Enum /*processed_stage*/, - size_t /*max_block_size*/, - unsigned /*num_streams*/) -{ - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); - - const ASTSelectQuery & select = query_info.query->as(); - const ASTPtr & where = select.where(); - if (!where) - { - throw Exception( - "Missing WHERE clause", - ErrorCodes::NOT_IMPLEMENTED); - } - const auto * function = where->as(); - if (function->name != "tantivy") - { - throw Exception( - "WHERE clause should contain only tantivy function", - ErrorCodes::NOT_IMPLEMENTED); - } - - UInt64 limit = 1000000UL; - - if (function->arguments->children.size() == 2) - { - if (function->arguments->children[1]->as()) - { - limit = function->arguments->children[1]->as().value.safeGet(); - } - } - - String tantivy_text_arg = function->arguments->children[0]->as().value.safeGet(); - - TantivySearchIterWrapper *tantivy_iter = tantivysearch_search(tantivy_index, tantivy_text_arg.c_str(), limit); - - return Pipe( - std::make_shared( - column_names, *this, metadata_snapshot, tantivy_text_arg, limit, tantivy_iter - )); -} - -BlockOutputStreamPtr StorageTantivy::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) -{ - return std::make_shared(*this, metadata_snapshot); -} - -bool StorageTantivy::optimize( - const ASTPtr & /*query*/, - const StorageMetadataPtr & /*metadata_snapshot*/, - const ASTPtr & /*partition*/, - bool /*final*/, - bool /*deduplicate*/, - const Names & /*deduplicate_by_columns*/, - const Context & /*context*/) -{ - std::cerr << "Running optimize" << std::endl; - if (tantivysearch_writer_commit(tantivy_index)) - { - return true; - } - return false; -} - -void StorageTantivy::truncate( - const ASTPtr & /*query*/, - const StorageMetadataPtr & /* metadata_snapshot */, - const Context & /* context */, - TableExclusiveLockHolder &) -{ - bool res = tantivysearch_index_truncate(tantivy_index); - LOG_DEBUG(log, "Truncated index with result: {}", res); -} - - -std::optional StorageTantivy::totalRows(const Settings &) const -{ - /// All modifications of these counters are done under mutex which automatically guarantees synchronization/consistency - /// When run concurrently we are fine with any value: "before" or "after" - return total_size_rows.load(std::memory_order_relaxed); -} - -std::optional StorageTantivy::totalBytes(const Settings &) const -{ - return total_size_bytes.load(std::memory_order_relaxed); -} - -void StorageTantivy::startup() -{ - this->tantivy_index = tantivysearch_open_or_create_index(index_path.c_str()); - return; -} - -void StorageTantivy::shutdown() -{ - if (tantivy_index != nullptr) - { - tantivysearch_index_free(tantivy_index); - } - return; -} - -void StorageTantivy::drop() { - if (tantivy_index != nullptr) - { - tantivysearch_index_delete(tantivy_index); - } - return; -} - -void registerStorageTantivy(StorageFactory & factory) -{ - factory.registerStorage("Tantivy", [](const StorageFactory::Arguments & args) - { - if (args.engine_args.size() != 1) - throw Exception( - "Engine " + args.engine_name + " needs the data path argument", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - String index_path = args.engine_args[0]->as().value.safeGet(); - - return StorageTantivy::create(args.table_id, args.columns, args.constraints, index_path); - }); -} - -} diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index b92ee2a0e276..874956caea42 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -26,7 +26,7 @@ void registerStorageJoin(StorageFactory & factory); void registerStorageView(StorageFactory & factory); void registerStorageMaterializedView(StorageFactory & factory); void registerStorageLiveView(StorageFactory & factory); -void registerStorageTantivy(StorageFactory & factory); +void registerStorageLucene(StorageFactory & factory); void registerStorageGenerateRandom(StorageFactory & factory); #if USE_AWS_S3 @@ -84,7 +84,7 @@ void registerStorages() registerStorageView(factory); registerStorageMaterializedView(factory); registerStorageLiveView(factory); - registerStorageTantivy(factory); + registerStorageLucene(factory); registerStorageGenerateRandom(factory); #if USE_AWS_S3 diff --git a/src/ya.make b/src/ya.make index 5361c8a56953..92ec51d83ec0 100644 --- a/src/ya.make +++ b/src/ya.make @@ -20,6 +20,7 @@ PEERDIR( clickhouse/src/Functions clickhouse/src/Interpreters clickhouse/src/IO + clickhouse/src/LuceneAnalyzer clickhouse/src/Parsers clickhouse/src/Processors clickhouse/src/Server