From ec94b2f73509966170d3655e94b6a65bdc25ff86 Mon Sep 17 00:00:00 2001 From: madianjun Date: Sat, 17 Jul 2021 17:42:50 +0800 Subject: [PATCH 01/12] Add lucene storage --- .gitmodules | 3 + CMakeLists.txt | 1 + cmake/find/luceneplusplus.cmake | 49 +++++++ contrib/CMakeLists.txt | 2 +- contrib/LucenePlusPlus | 1 + contrib/boost-cmake/CMakeLists.txt | 37 +++++ src/CMakeLists.txt | 10 +- src/Core/config_core.h.in | 1 + src/Storages/StorageTantivy.cpp | 220 ++++++++++++++++++----------- src/Storages/StorageTantivy.h | 13 +- 10 files changed, 252 insertions(+), 85 deletions(-) create mode 100644 cmake/find/luceneplusplus.cmake create mode 160000 contrib/LucenePlusPlus diff --git a/.gitmodules b/.gitmodules index 7a2c5600e65b..afccdbdeab09 100644 --- a/.gitmodules +++ b/.gitmodules @@ -221,3 +221,6 @@ [submodule "contrib/NuRaft"] path = contrib/NuRaft url = https://github.com/ClickHouse-Extras/NuRaft.git +[submodule "contrib/LucenePlusPlus"] + path = contrib/LucenePlusPlus + url = https://github.com/luceneplusplus/LucenePlusPlus.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9002f1df140c..03975dc7ee27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -490,6 +490,7 @@ include (cmake/find/rapidjson.cmake) include (cmake/find/fastops.cmake) include (cmake/find/odbc.cmake) include (cmake/find/rocksdb.cmake) +include (cmake/find/luceneplusplus.cmake) include (cmake/find/libpqxx.cmake) include (cmake/find/nuraft.cmake) diff --git a/cmake/find/luceneplusplus.cmake b/cmake/find/luceneplusplus.cmake new file mode 100644 index 000000000000..c4e3da27f03a --- /dev/null +++ b/cmake/find/luceneplusplus.cmake @@ -0,0 +1,49 @@ +option(ENABLE_LUCENE "Enable LUCENE" ${ENABLE_LIBRARIES}) + +if (NOT ENABLE_LUCENE) + if (USE_INTERNAL_LUCENE_LIBRARY) + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't use internal lucene library with ENABLE_LUCENE=OFF") + endif() + return() +endif() + +option(USE_INTERNAL_LUCENE_LIBRARY "Set to FALSE to use system LUCENE library instead of bundled" 
${NOT_UNBUNDLED}) + +if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/LucenePlusPlus/CMakeLists.txt") + if (USE_INTERNAL_LUCENE_LIBRARY) + message (WARNING "submodule contrib is missing. to fix try run: \n git submodule update --init --recursive") + message(${RECONFIGURE_MESSAGE_LEVEL} "cannot find internal lucene") + endif() + set (MISSING_INTERNAL_LUCENE 1) +endif () + +if (NOT USE_INTERNAL_LUCENE_LIBRARY) + find_library (LUCENE_LIBRARY lucene++) + find_path (LUCENE_INCLUDE_DIR NAMES lucene++/LuceneHeaders.h PATHS ${LUCENE_INCLUDE_PATHS}) + if (NOT LUCENE_LIBRARY OR NOT LUCENE_INCLUDE_DIR) + message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system lucene library") + endif() + + if (NOT ZLIB_LIBRARY) + include(cmake/find/zlib.cmake) + endif() + + if(ZLIB_LIBRARY) + list (APPEND LUCENE_LIBRARY ${ZLIB_LIBRARY}) + else() + message (${RECONFIGURE_MESSAGE_LEVEL} + "Can't find system lucene: zlib=${ZLIB_LIBRARY} ;") + endif() +endif () + +if(LUCENE_LIBRARY AND LUCENE_INCLUDE_DIR) + set(USE_LUCENE 1) +elseif (NOT MISSING_INTERNAL_LUCENE) + set (USE_INTERNAL_LUCENE_LIBRARY 1) + + set (LUCENE_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/LucenePlusPlus/include") + set (LUCENE_LIBRARY "lucene++") + set (USE_LUCENE 1) +endif () + +message (STATUS "Using LUCENE=${USE_LUCENE}: ${LUCENE_INCLUDE_DIR} : ${LUCENE_LIBRARY}") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index cfe5a6aed57b..cab963c4d32c 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -309,7 +309,7 @@ if (USE_INTERNAL_ROCKSDB_LIBRARY) add_subdirectory(rocksdb-cmake) endif() -add_subdirectory(tantivysearch-cmake) +add_subdirectory(LucenePlusPlus) if (USE_LIBPQXX) add_subdirectory (libpq-cmake) diff --git a/contrib/LucenePlusPlus b/contrib/LucenePlusPlus new file mode 160000 index 000000000000..b05729eedf53 --- /dev/null +++ b/contrib/LucenePlusPlus @@ -0,0 +1 @@ +Subproject commit b05729eedf53115c0abc3763410e366a74fc8be7 diff --git a/contrib/boost-cmake/CMakeLists.txt 
b/contrib/boost-cmake/CMakeLists.txt index b9298f59f2b2..dc375e456d49 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -13,6 +13,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) regex context coroutine + date_time + thread ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND @@ -32,6 +34,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_system INTERFACE) add_library (_boost_context INTERFACE) add_library (_boost_coroutine INTERFACE) + add_library (_boost_date_time INTERFACE) + add_library (_boost_thread INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) @@ -40,6 +44,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY}) target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY}) + target_link_libraries (_boost_date_time INTERFACE ${Boost_DATE_TIME_LIBRARY}) + target_link_libraries (_boost_thread INTERFACE ${Boost_THREAD_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) @@ -48,6 +54,8 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (boost::system ALIAS _boost_system) add_library (boost::context ALIAS _boost_context) add_library (boost::coroutine ALIAS _boost_coroutine) + add_library (boost::date_time ALIAS _boost_date_time) + add_library (boost::thread ALIAS _boost_thread) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -220,4 +228,33 @@ if (NOT EXTERNAL_BOOST_FOUND) add_library (boost::coroutine ALIAS _boost_coroutine) target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR}) target_link_libraries(_boost_coroutine PRIVATE _boost_context) + + # date_time + + set (SRCS_DATE_TIME + 
${LIBRARY_DIR}/libs/date_time/src/gregorian/date_generators.cpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/greg_month.cpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/greg_names.hpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/greg_weekday.cpp + ${LIBRARY_DIR}/libs/date_time/src/gregorian/gregorian_types.cpp + ${LIBRARY_DIR}/libs/date_time/src/posix_time/posix_time_types.cpp + ) + add_library (_boost_date_time ${SRCS_DATE_TIME}) + add_library (boost::date_time ALIAS _boost_date_time) + target_include_directories (_boost_date_time PRIVATE ${LIBRARY_DIR}) + target_link_libraries(_boost_date_time PRIVATE _boost_context) + + # thread + + set (SRCS_THREAD + ${LIBRARY_DIR}/libs/thread/src/pthread/once.cpp + ${LIBRARY_DIR}/libs/thread/src/pthread/once_atomic.cpp + ${LIBRARY_DIR}/libs/thread/src/pthread/thread.cpp + ${LIBRARY_DIR}/libs/thread/src/future.cpp + ${LIBRARY_DIR}/libs/thread/src/tss_null.cpp + ) + add_library (_boost_thread ${SRCS_THREAD}) + add_library (boost::thread ALIAS _boost_thread) + target_include_directories (_boost_thread PRIVATE ${LIBRARY_DIR}) + target_link_libraries(_boost_thread PRIVATE _boost_context _boost_date_time) endif () diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c780ff6a294c..4e2c5bd579db 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,10 @@ if (USE_ROCKSDB) add_headers_and_sources(dbms Storages/RocksDB) endif() +#if (USE_LUCENE) +# add_headers_and_sources(dbms Storages/LUCENE) +#endif() + if (USE_AWS_S3) add_headers_and_sources(dbms Common/S3) add_headers_and_sources(dbms Disks/S3) @@ -339,7 +343,6 @@ dbms_target_link_libraries ( Poco::JSON Poco::MongoDB string_utils - tantivysearch PUBLIC ${MYSQLXX_LIBRARY} boost::system @@ -452,6 +455,11 @@ if (USE_ROCKSDB) dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ROCKSDB_INCLUDE_DIR}) endif() +if (USE_LUCENE) + dbms_target_link_libraries(PUBLIC ${LUCENE_LIBRARY}) + dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LUCENE_INCLUDE_DIR}) 
+endif() + if (USE_LIBPQXX) dbms_target_link_libraries(PUBLIC ${LIBPQXX_LIBRARY}) dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${LIBPQXX_INCLUDE_DIR}) diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 666ef32efdf7..69277a1a21ce 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -12,5 +12,6 @@ #cmakedefine01 USE_OPENCL #cmakedefine01 USE_LDAP #cmakedefine01 USE_ROCKSDB +#cmakedefine01 USE_LUCENE #cmakedefine01 USE_LIBPQXX #cmakedefine01 USE_NURAFT diff --git a/src/Storages/StorageTantivy.cpp b/src/Storages/StorageTantivy.cpp index 36ade534c2fd..e8a0dbf6538d 100644 --- a/src/Storages/StorageTantivy.cpp +++ b/src/Storages/StorageTantivy.cpp @@ -23,6 +23,10 @@ #include #include #include +#include +#include + +#include namespace DB { @@ -38,19 +42,35 @@ class TantivySource : public SourceWithProgress public: TantivySource( Names column_names_, - const StorageTantivy & storage, - const StorageMetadataPtr & metadata_snapshot, - const String & tantivy_arg_, - const UInt64 limit_, - TantivySearchIterWrapper *tantivy_iter_) - : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names(std::move(column_names_)) - , tantivy_arg(std::move(tantivy_arg_)) - , limit(limit_) - , tantivy_iter(tantivy_iter_) + const StorageTantivy & storage_, + const StorageMetadataPtr & metadata_snapshot_, + const String & query_text_, + //const UInt64 limit_, + Lucene::FSDirectoryPtr dir_) + : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns(column_names_, storage_.getVirtuals(), storage_.getStorageID())), + column_names(std::move(column_names_)), + metadata_snapshot(metadata_snapshot_), + query_text(std::move(query_text_)) + //limit(limit_), { + + this->reader = Lucene::IndexReader::open(dir_, true); + std::cout << "Opened lucene index path: " << std::endl; + + + this->searcher = Lucene::newLucene(this->reader); + Lucene::AnalyzerPtr analyzer = 
Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); + Lucene::QueryParserPtr parser = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, L"body", analyzer); + Lucene::QueryPtr query = parser->parse(Lucene::String(query_text.begin(), query_text.end())); + std::cout << "Search for: " << query_text << std::endl; + Lucene::TopScoreDocCollectorPtr collector = Lucene::TopScoreDocCollector::create(500, false); + searcher->search(query, collector); + this->hits = collector->topDocs()->scoreDocs; } +// ~TantivySource() override { +// this->reader->close(); +// } String getName() const override { return "Tantivy"; } protected: @@ -59,47 +79,34 @@ class TantivySource : public SourceWithProgress if (current_block_idx == 1) return {}; - Columns columns; - columns.reserve(column_names.size()); + const auto & sample_block = metadata_snapshot->getSampleBlock(); + //const auto & key_column = sample_block.getByName("primary_id"); + auto columns = sample_block.cloneEmptyColumns(); + size_t primary_id_pos = sample_block.getPositionByName("primary_id"); + std::wstring_convert> converter; - auto column_primary = ColumnUInt64::create(); - auto & data_primary = column_primary->getData(); - - auto column_secondary = ColumnUInt64::create(); - auto & data_secondary = column_secondary->getData(); - - size_t tantivy_size = tantivysearch_iter_count(tantivy_iter); - if (tantivy_size < limit) - limit = tantivy_size; - data_primary.resize(limit); - data_secondary.resize(limit); - - UInt64 i = 0; - UInt64 primary_id = 0; - UInt64 secondary_id = 0; - int r = tantivysearch_iter_next(tantivy_iter, &primary_id, &secondary_id); - while (r) + for (int i = 0; i < hits.size(); ++i) { - data_primary[i] = primary_id; - data_secondary[i] = secondary_id; - if (i > limit) - break; - i++; - r = tantivysearch_iter_next(tantivy_iter, &primary_id, &secondary_id); + Lucene::DocumentPtr doc = this->searcher->doc(hits[i]->doc); + Lucene::String primary_id = doc->get(L"primary_id"); + Lucene::String 
secondary_id = doc->get(L"secondary_id"); + std::wcout << "Lucene searched doc: " << primary_id << ", " << secondary_id <size(); return Chunk(std::move(columns), num_rows); @@ -107,10 +114,13 @@ class TantivySource : public SourceWithProgress private: const Names column_names; + const StorageMetadataPtr metadata_snapshot; size_t current_block_idx = 0; - const String tantivy_arg; - UInt64 limit; - TantivySearchIterWrapper *tantivy_iter; + const String query_text; + //UInt64 limit; + Lucene::IndexReaderPtr reader; + Lucene::SearcherPtr searcher; + Lucene::Collection hits; }; class TantivyBlockOutputStream : public IBlockOutputStream @@ -121,7 +131,15 @@ class TantivyBlockOutputStream : public IBlockOutputStream const StorageMetadataPtr & metadata_snapshot_) : storage(storage_) , metadata_snapshot(metadata_snapshot_) - {} + { + Block sample_block = metadata_snapshot->getSampleBlock(); + for (const auto & elem : sample_block) + { + if (elem.name == "primary_id") + break; + ++primary_id_pos; + } + } Block getHeader() const override { return metadata_snapshot->getSampleBlock(); } void write(const Block & block) override { @@ -129,10 +147,6 @@ class TantivyBlockOutputStream : public IBlockOutputStream const auto size_rows_diff = block.rows(); metadata_snapshot->check(block, true); { - // std::lock_guard lock(storage.mutex); - // auto new_data = std::make_unique(*(storage.data.get())); - // new_data->push_back(block); - // storage.data.set(std::move(new_data)); if (block.columns() != 3) { throw Exception( "Inserts need all columns", @@ -148,14 +162,45 @@ class TantivyBlockOutputStream : public IBlockOutputStream if (primary_id_col && secondary_id_col && body_col) { - auto & primary_data = primary_id_col->getData(); - auto & secondary_data = secondary_id_col->getData(); - auto & chars = body_col->getChars(); - auto & offsets = body_col->getOffsets(); - const char * char_ptr = reinterpret_cast(&chars[0]); - - int res = tantivysearch_index(storage.tantivy_index, 
&primary_data[0], &secondary_data[0], char_ptr, &offsets[0], primary_data.size()); - std::cerr << "index result: " << res << std::endl; + std::wstring_convert> converter; + Lucene::String index_path_ws = converter.from_bytes(storage.index_path); + Lucene::IndexWriterPtr writer = Lucene::newLucene( + Lucene::FSDirectory::open(index_path_ws), + Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + true, + Lucene::IndexWriter::MaxFieldLengthLIMITED); + + auto rows = block.rows(); + + WriteBufferFromOwnString write_buffer; + + for (size_t i = 0; i < rows; i++) + { + std::cout << "Lucene inserting row[" << i << "]" <(); + size_t idx = 0; + for (const auto & elem : block) + { + write_buffer.restart(); + auto column_name = block.safeGetByPosition(idx).name; + + if (idx < block.columns() - 1) + { + elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); + doc->add(Lucene::newLucene(converter.from_bytes(column_name), converter.from_bytes(write_buffer.str()), Lucene::Field::STORE_YES, Lucene::Field::INDEX_NOT_ANALYZED)); + } else { + elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); + doc->add(Lucene::newLucene(converter.from_bytes(column_name), converter.from_bytes(write_buffer.str()), Lucene::Field::STORE_NO, Lucene::Field::INDEX_ANALYZED)); + } + ++idx; + } + std::cout << "Lucene inserted row[" << i << "]" <addDocument(doc); + } + if (rows > 0) { + writer->optimize(); + } + writer->close(); } else { throw Exception( "Inserts need all columns", @@ -169,6 +214,7 @@ class TantivyBlockOutputStream : public IBlockOutputStream private: StorageTantivy & storage; StorageMetadataPtr metadata_snapshot; + size_t primary_id_pos = 0; }; @@ -179,6 +225,8 @@ StorageTantivy::StorageTantivy(const StorageID & table_id_, ColumnsDescription c storage_metadata.setColumns(std::move(columns_description_)); storage_metadata.setConstraints(std::move(constraints_)); setInMemoryMetadata(storage_metadata); + + Poco::File(index_path + 
"/").createDirectories(); } @@ -219,13 +267,28 @@ Pipe StorageTantivy::read( } } - String tantivy_text_arg = function->arguments->children[0]->as().value.safeGet(); + String query_text = function->arguments->children[0]->as().value.safeGet(); + + + + //Poco::File(this->index_path) + std::wstring_convert> converter; + Lucene::String index_path_ws = converter.from_bytes(index_path); + Lucene::FSDirectoryPtr dir = Lucene::FSDirectory::open(index_path_ws); + if(dir->listAll().empty()) { + std::cout << "No files in lucene index path: " << this->index_path << std::endl; + return {}; + } - TantivySearchIterWrapper *tantivy_iter = tantivysearch_search(tantivy_index, tantivy_text_arg.c_str(), limit); return Pipe( std::make_shared( - column_names, *this, metadata_snapshot, tantivy_text_arg, limit, tantivy_iter + column_names, + *this, + metadata_snapshot, + query_text, + //limit, + dir )); } @@ -244,10 +307,9 @@ bool StorageTantivy::optimize( const Context & /*context*/) { std::cerr << "Running optimize" << std::endl; - if (tantivysearch_writer_commit(tantivy_index)) - { - return true; - } +// if (this->writer) { +// this->writer->optimize(); +// } return false; } @@ -257,8 +319,6 @@ void StorageTantivy::truncate( const Context & /* context */, TableExclusiveLockHolder &) { - bool res = tantivysearch_index_truncate(tantivy_index); - LOG_DEBUG(log, "Truncated index with result: {}", res); } @@ -276,24 +336,22 @@ std::optional StorageTantivy::totalBytes(const Settings &) const void StorageTantivy::startup() { - this->tantivy_index = tantivysearch_open_or_create_index(index_path.c_str()); return; } void StorageTantivy::shutdown() { - if (tantivy_index != nullptr) - { - tantivysearch_index_free(tantivy_index); - } +// if (this->reader) { +// this->reader->close(); +// } +// if (this->writer) { +// this->writer->close(); +// } return; } void StorageTantivy::drop() { - if (tantivy_index != nullptr) - { - tantivysearch_index_delete(tantivy_index); - } + 
Poco::File(index_path).remove(true); return; } diff --git a/src/Storages/StorageTantivy.h b/src/Storages/StorageTantivy.h index c2e42cbcbd14..8d6646199d92 100644 --- a/src/Storages/StorageTantivy.h +++ b/src/Storages/StorageTantivy.h @@ -9,9 +9,17 @@ #include #include #include -#include #include +#include +#include +#include +//#include +//namespace Lucene { +// class IndexReader; +// class IndexWriter; +// +//} namespace DB { @@ -79,7 +87,8 @@ friend class TantivyBlockOutputStream; BlocksList data; String index_path; mutable std::mutex mutex; - TantivySearchIndexRW *tantivy_index = nullptr; +// Lucene::IndexReaderPtr reader; +// Lucene::IndexWriterPtr writer; std::atomic total_size_bytes = 0; std::atomic total_size_rows = 0; Poco::Logger * log; From 099e2af0dc4ede005e32523360e76a8942f2ec3e Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 06:27:14 +0800 Subject: [PATCH 02/12] Add boost date_time --- contrib/boost | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/boost b/contrib/boost index ee24fa55bc46..eede626248b3 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit ee24fa55bc46e4d2ce7d0d052cc5a0d9b1be8c36 +Subproject commit eede626248b3710fe4f5f9c03b3f479a2da0af41 From 5b8a2919cba20f4c24c0cea3a13c949dff9c5754 Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 06:42:12 +0800 Subject: [PATCH 03/12] Update LucenePlusPlus to build successfully --- contrib/LucenePlusPlus | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/LucenePlusPlus b/contrib/LucenePlusPlus index b05729eedf53..c41a52626ecf 160000 --- a/contrib/LucenePlusPlus +++ b/contrib/LucenePlusPlus @@ -1 +1 @@ -Subproject commit b05729eedf53115c0abc3763410e366a74fc8be7 +Subproject commit c41a52626ecfbce83b3d921085bab35c2f9ebe52 From ffc5ea4aa249232b1fe7b62492381f4cccaeae72 Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 06:54:33 +0800 Subject: [PATCH 04/12] Update url for LucenePlusPlus --- 
.gitmodules | 2 +- contrib/LucenePlusPlus | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index afccdbdeab09..ea108b6dbe9e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -223,4 +223,4 @@ url = https://github.com/ClickHouse-Extras/NuRaft.git [submodule "contrib/LucenePlusPlus"] path = contrib/LucenePlusPlus - url = https://github.com/luceneplusplus/LucenePlusPlus.git + url = https://github.com/cloudnativecube/LucenePlusPlus.git diff --git a/contrib/LucenePlusPlus b/contrib/LucenePlusPlus index c41a52626ecf..460945ca3a32 160000 --- a/contrib/LucenePlusPlus +++ b/contrib/LucenePlusPlus @@ -1 +1 @@ -Subproject commit c41a52626ecfbce83b3d921085bab35c2f9ebe52 +Subproject commit 460945ca3a32b51a6fa9314834e976a573c4a44e From 8ff85cc5041e6a2fe22aa78ceab3f63120fd0af2 Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 07:06:40 +0800 Subject: [PATCH 05/12] Delete tantivy --- contrib/tantivysearch-cmake/CMakeLists.txt | 15 - contrib/tantivysearch/.gitignore | 1 - contrib/tantivysearch/Cargo.lock | 1369 ----------------- contrib/tantivysearch/Cargo.toml | 18 - contrib/tantivysearch/cbindgen.toml | 20 - contrib/tantivysearch/include/tantivysearch.h | 56 - contrib/tantivysearch/src/cache.rs | 127 -- contrib/tantivysearch/src/lib.rs | 656 -------- 8 files changed, 2262 deletions(-) delete mode 100644 contrib/tantivysearch-cmake/CMakeLists.txt delete mode 100644 contrib/tantivysearch/.gitignore delete mode 100644 contrib/tantivysearch/Cargo.lock delete mode 100644 contrib/tantivysearch/Cargo.toml delete mode 100644 contrib/tantivysearch/cbindgen.toml delete mode 100644 contrib/tantivysearch/include/tantivysearch.h delete mode 100644 contrib/tantivysearch/src/cache.rs delete mode 100644 contrib/tantivysearch/src/lib.rs diff --git a/contrib/tantivysearch-cmake/CMakeLists.txt b/contrib/tantivysearch-cmake/CMakeLists.txt deleted file mode 100644 index fb6608448103..000000000000 --- a/contrib/tantivysearch-cmake/CMakeLists.txt 
+++ /dev/null @@ -1,15 +0,0 @@ -set(TANTIVYSEARCH_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/tantivysearch) - -find_library (LIBRARY_TANTIVYSEARCH NAMES libtantivysearch.a tantivisearch PATHS ${TANTIVYSEARCH_SOURCE_DIR}/target/release REQUIRED) -find_path (INCLUDE_TANTIVYSEARCH NAMES tantivysearch.h PATHS ${TANTIVYSEARCH_SOURCE_DIR}/include) - -if (LIBRARY_TANTIVYSEARCH AND INCLUDE_TANTIVYSEARCH) - set(CMAKE_REQUIRED_LIBRARIES ${LIBRARY_TANTIVYSEARCH}) - set(CMAKE_REQUIRED_INCLUDES ${INCLUDE_TANTIVYSEARCH}) - add_library (tantivysearch INTERFACE) - set_property (TARGET tantivysearch PROPERTY INTERFACE_LINK_LIBRARIES ${LIBRARY_TANTIVYSEARCH}) - set_property (TARGET tantivysearch PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_TANTIVYSEARCH}) - message (STATUS "Using tantivysearch: ${INCLUDE_TANTIVYSEARCH} : ${LIBRARY_TANTIVYSEARCH}") -else() - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find tantivysearch") -endif() diff --git a/contrib/tantivysearch/.gitignore b/contrib/tantivysearch/.gitignore deleted file mode 100644 index ea8c4bf7f35f..000000000000 --- a/contrib/tantivysearch/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/target diff --git a/contrib/tantivysearch/Cargo.lock b/contrib/tantivysearch/Cargo.lock deleted file mode 100644 index 05c2359fcad6..000000000000 --- a/contrib/tantivysearch/Cargo.lock +++ /dev/null @@ -1,1369 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-[[package]] -name = "addr2line" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a55f82cfe485775d02112886f4169bde0c5894d75e79ead7eafe7e40a25e45f7" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" - -[[package]] -name = "ahash" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" - -[[package]] -name = "atomicwrites" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a2baf2feb820299c53c7ad1cc4f5914a220a1cb76d7ce321d2522a94b54651f" -dependencies = [ - "nix", - "tempdir", - "winapi 0.3.9", -] - -[[package]] -name = "autocfg" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" - -[[package]] -name = "backtrace" -version = "0.3.55" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef5140344c85b01f9bbb4d4b7288a8aa4b3287ccef913a14bcc78a1063623598" -dependencies = [ - "addr2line", - "cfg-if 1.0.0", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - -[[package]] -name = "bitflags" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" - -[[package]] -name = "bitpacking" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68" 
-dependencies = [ - "crunchy", -] - -[[package]] -name = "byteorder" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" - -[[package]] -name = "cc" -version = "1.0.66" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" - -[[package]] -name = "census" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5927edd8345aef08578bcbb4aea7314f340d80c7f4931f99fbeb40b99d8f5060" - -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "chrono" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" -dependencies = [ - "libc", - "num-integer", - "num-traits", - "time", - "winapi 0.3.9", -] - -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags", -] - -[[package]] -name = "combine" -version = "4.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc4369b5e4c0cddf64ad8981c0111e7df4f7078f4d6ba98fb31f2e17c4c57b7e" -dependencies = [ - "memchr", -] - -[[package]] -name = "const_fn" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6" - -[[package]] -name = "crc32fast" 
-version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "crossbeam" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69323bff1fb41c635347b8ead484a5ca6c3f11914d784170b158d8449ab07f8e" -dependencies = [ - "cfg-if 0.1.10", - "crossbeam-channel 0.4.4", - "crossbeam-deque 0.7.3", - "crossbeam-epoch 0.8.2", - "crossbeam-queue", - "crossbeam-utils 0.7.2", -] - -[[package]] -name = "crossbeam-channel" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b153fe7cbef478c567df0f972e02e6d736db11affe43dfc9c56a9374d1adfb87" -dependencies = [ - "crossbeam-utils 0.7.2", - "maybe-uninit", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.1", -] - -[[package]] -name = "crossbeam-deque" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" -dependencies = [ - "crossbeam-epoch 0.8.2", - "crossbeam-utils 0.7.2", - "maybe-uninit", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-epoch 0.9.1", - "crossbeam-utils 0.8.1", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", - "crossbeam-utils 0.7.2", - 
"lazy_static", - "maybe-uninit", - "memoffset 0.5.6", - "scopeguard", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d" -dependencies = [ - "cfg-if 1.0.0", - "const_fn", - "crossbeam-utils 0.8.1", - "lazy_static", - "memoffset 0.6.1", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570" -dependencies = [ - "cfg-if 0.1.10", - "crossbeam-utils 0.7.2", - "maybe-uninit", -] - -[[package]] -name = "crossbeam-utils" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" -dependencies = [ - "autocfg", - "cfg-if 0.1.10", - "lazy_static", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d" -dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "lazy_static", -] - -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - -[[package]] -name = "downcast-rs" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea835d29036a4087793836fa931b08837ad5e957da9e23886b29586fb9b6650" - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "fail" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3be3c61c59fdc91f5dbc3ea31ee8623122ce80057058be560654c5d410d181a6" -dependencies = [ - "lazy_static", - "log", - "rand 0.7.3", -] - -[[package]] -name = "failure" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" -dependencies = [ - "backtrace", - "failure_derive", -] - -[[package]] -name = "failure_derive" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "filetime" -version = "0.2.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c122a393ea57648015bf06fbd3d372378992e86b9ff5a7a497b076a28c79efe" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "redox_syscall", - "winapi 0.3.9", -] - -[[package]] -name = "flurry" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c0a35f7b50e99185a2825541946252f669f3c3ca77801357cd682a1b356bb3e" -dependencies = [ - "ahash", - "crossbeam-epoch 0.8.2", - "num_cpus", - "parking_lot", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "fsevent" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab7d1bd1bd33cc98b0889831b72da23c0aa4df9cec7e0702f46ecea04b35db6" -dependencies = [ - "bitflags", - "fsevent-sys", -] - -[[package]] -name = "fsevent-sys" -version = "2.0.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" -dependencies = [ - "libc", -] - -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - -[[package]] -name = "fuchsia-zircon" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" -dependencies = [ - "bitflags", - "fuchsia-zircon-sys", -] - -[[package]] -name = "fuchsia-zircon-sys" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" - -[[package]] -name = "futures" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c70be434c505aee38639abccb918163b63158a4b4bb791b45b7023044bdc3c9c" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f01c61843314e95f96cc9245702248733a3a3d744e43e2e755e3c7af8348a0a9" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db8d3b0917ff63a2a96173133c02818fac4a746b0a57569d3baca9ec0e945e08" - -[[package]] -name = "futures-executor" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ee9ca2f7eb4475772cf39dd1cd06208dce2670ad38f4d9c7262b3e15f127068" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", - "num_cpus", -] - -[[package]] -name = "futures-io" -version = 
"0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e37c1a51b037b80922864b8eed90692c5cd8abd4c71ce49b77146caa47f3253b" - -[[package]] -name = "futures-macro" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f8719ca0e1f3c5e34f3efe4570ef2c0610ca6da85ae7990d472e9cbfba13664" -dependencies = [ - "proc-macro-hack", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6adabac1290109cfa089f79192fb6244ad2c3f1cc2281f3e1dd987592b71feb" - -[[package]] -name = "futures-task" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92a0843a2ff66823a8f7c77bffe9a09be2b64e533562c412d63075643ec0038" -dependencies = [ - "once_cell", -] - -[[package]] -name = "futures-util" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "036a2107cdeb57f6d7322f1b6c363dad67cd63ca3b7d1b925bdf75bd5d96cda9" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "proc-macro-hack", - "proc-macro-nested", - "slab", -] - -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4060f4657be78b8e766215b02b18a2e862d83745545de804638e2b545e81aee6" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.10.1+wasi-snapshot-preview1", -] - -[[package]] -name = "gimli" -version = "0.23.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6503fe142514ca4799d4c26297c4248239fe8838d827db6bd6065c6ed29a6ce" - -[[package]] -name = "hermit-abi" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8" -dependencies = [ - "libc", -] - -[[package]] -name = "htmlescape" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" - -[[package]] -name = "inotify" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" -dependencies = [ - "bitflags", - "inotify-sys", - "libc", -] - -[[package]] -name = "inotify-sys" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4563555856585ab3180a5bf0b2f9f8d301a728462afffc8195b3f5394229c55" -dependencies = [ - "libc", -] - -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - -[[package]] -name = "itoa" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" - -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "lazycell" -version = "1.3.0" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - -[[package]] -name = "levenshtein_automata" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44db4199cdb049b494a92d105acbfa43c25b3925e33803923ba9580b7bc9e1a" - -[[package]] -name = "libc" -version = "0.2.82" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" - -[[package]] -name = "lock_api" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf3805d4480bb5b86070dcfeb9e2cb2ebc148adb753c5cca5f884d1d65a42b2" -dependencies = [ - "cfg-if 0.1.10", -] - -[[package]] -name = "maybe-uninit" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" - -[[package]] -name = "memchr" -version = "2.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" - -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "memoffset" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" -dependencies = [ - "autocfg", -] - -[[package]] -name = "memoffset" -version = "0.6.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" -dependencies = [ - "autocfg", -] - -[[package]] -name = "miniz_oxide" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "mio" -version = "0.6.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" -dependencies = [ - "cfg-if 0.1.10", - "fuchsia-zircon", - "fuchsia-zircon-sys", - "iovec", - "kernel32-sys", - "libc", - "log", - "miow", - "net2", - "slab", - "winapi 0.2.8", -] - -[[package]] -name = "mio-extras" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52403fe290012ce777c4626790c8951324a2b9e3316b3143779c72b029742f19" -dependencies = [ - "lazycell", - "log", - "mio", - "slab", -] - -[[package]] -name = "miow" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" -dependencies = [ - "kernel32-sys", - "net2", - "winapi 0.2.8", - "ws2_32-sys", -] - -[[package]] -name = "murmurhash32" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" -dependencies = [ - "byteorder", -] - -[[package]] -name = "net2" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "nix" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c722bee1037d430d0f8e687bbdbf222f27cc6e4e68d5caf630857bb2b6dbdce" -dependencies = [ - "bitflags", - "cc", - "cfg-if 0.1.10", - "libc", - "void", -] - -[[package]] -name = "notify" -version = "4.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80ae4a7688d1fab81c5bf19c64fc8db920be8d519ce6336ed4e7efe024724dbd" -dependencies = [ - "bitflags", - "filetime", - "fsevent", - "fsevent-sys", - "inotify", - "libc", - "mio", - "mio-extras", - "walkdir", - "winapi 0.3.9", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "object" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d3b63360ec3cb337817c2dbd47ab4a0f170d285d8e5a2064600f3def1402397" - -[[package]] -name = "once_cell" -version = "1.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" - -[[package]] -name = "owned-read" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66d1e235abcebc845cf93550b89b74f468c051496fafb433ede4104b9f71ba1" -dependencies = [ - "stable_deref_trait", -] - -[[package]] -name = "owning_ref" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "6ff55baddef9e4ad00f88b6c743a2a8062d4c6ade126c2a528644b8e444d52ce" -dependencies = [ - "stable_deref_trait", -] - -[[package]] -name = "parking_lot" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3" -dependencies = [ - "cfg-if 0.1.10", - "cloudabi", - "libc", - "redox_syscall", - "smallvec", - "winapi 0.3.9", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "ppv-lite86" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" - -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eba180dafb9038b050a4c280019bbedf9f2467b61e5d892dcad585bb57aadc5a" - -[[package]] -name = "proc-macro2" -version = "1.0.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" -dependencies = [ - "unicode-xid", -] - -[[package]] -name = "quote" 
-version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" -dependencies = [ - "fuchsia-cprng", - "libc", - "rand_core 0.3.1", - "rdrand", - "winapi 0.3.9", -] - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha", - "rand_core 0.5.1", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", -] - -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = 
"rayon" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" -dependencies = [ - "autocfg", - "crossbeam-deque 0.8.0", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" -dependencies = [ - "crossbeam-channel 0.5.0", - "crossbeam-deque 0.8.0", - "crossbeam-utils 0.8.1", - "lazy_static", - "num_cpus", -] - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "redox_syscall" -version = "0.1.57" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" - -[[package]] -name = "regex" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" -dependencies = [ - "regex-syntax 0.6.22", -] - -[[package]] -name = "regex-syntax" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e931c58b93d86f080c734bfd2bce7dd0079ae2331235818133c8be7f422e20e" - -[[package]] -name = "regex-syntax" -version = "0.6.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e3bad0ee36814ca07d7968269dd4b7ec89ec2da10c4bb613928d3077083c232" - -[[package]] -name = "ryu" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "serde" -version = "1.0.118" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.118" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "slab" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" - -[[package]] -name = "smallvec" 
-version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - -[[package]] -name = "snap" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98d3306e84bf86710d6cd8b4c9c3b721d5454cc91a603180f8f8cd06cfd317b4" - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "syn" -version = "1.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" -dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", -] - -[[package]] -name = "synstructure" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "unicode-xid", -] - -[[package]] -name = "tantivy" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37dfd693ae55afd87e798e967bb2d894b32163e3c9a172958efe5bc89ed7df08" -dependencies = [ - "atomicwrites", - "base64", - "bitpacking", - "byteorder", - "census", - "chrono", - "crc32fast", - "crossbeam", - "downcast-rs", - "fail", - "failure", - "fnv", - "fs2", - "futures", - "htmlescape", - "levenshtein_automata", - "log", - "memmap", - "murmurhash32", - "notify", - "num_cpus", - "once_cell", - "owned-read", - "owning_ref", - "rayon", - "regex", - "rust-stemmers", - "serde", - "serde_json", - "smallvec", - "snap", - "stable_deref_trait", - "tantivy-fst", - "tantivy-query-grammar", - "tempfile", - "uuid", - "winapi 0.3.9", -] - -[[package]] -name = "tantivy-fst" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cb20cdc0d83e9184560bdde9cd60142dbb4af2e0f770e88fce45770495224205" -dependencies = [ - "byteorder", - "regex-syntax 0.4.2", - "utf8-ranges", -] - -[[package]] -name = "tantivy-query-grammar" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ea03b8224ca9ff4ccfc7dfab790527c8a9d8edbc53f4677bdf6ba0fd8000c75" -dependencies = [ - "combine", -] - -[[package]] -name = "tantivysearch" -version = "0.1.0" -dependencies = [ - "flurry", - "libc", - "once_cell", - "rayon", - "tantivy", -] - -[[package]] -name = "tempdir" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8" -dependencies = [ - "rand 0.4.6", - "remove_dir_all", -] - -[[package]] -name = "tempfile" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "rand 0.7.3", - "redox_syscall", - "remove_dir_all", - "winapi 0.3.9", -] - -[[package]] -name = "time" -version = "0.1.43" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "unicode-xid" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" - -[[package]] -name = "utf8-ranges" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ae116fef2b7fea257ed6440d3cfcff7f190865f170cdad00bb6465bf18ecba" - -[[package]] -name = "uuid" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" -dependencies = [ - "getrandom 0.2.1", - "serde", -] - 
-[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" - -[[package]] -name = "walkdir" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" -dependencies = [ - "same-file", - "winapi 0.3.9", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "wasi" -version = "0.10.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93c6c3420963c5c64bca373b25e77acb562081b9bb4dd5bb864187742186cea9" - -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 
-dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "ws2_32-sys" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] diff --git a/contrib/tantivysearch/Cargo.toml b/contrib/tantivysearch/Cargo.toml deleted file mode 100644 index 7c6667a75181..000000000000 --- a/contrib/tantivysearch/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "tantivysearch" -version = "0.1.0" -authors = ["André Guedes "] -edition = "2018" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[lib] -name = "tantivysearch" -crate-type = ["cdylib", "staticlib"] - -[dependencies] -libc = "0.2.82" -tantivy = "0.13.2" -rayon = "1.5" -once_cell = "1.7" -flurry = "0.3" diff --git a/contrib/tantivysearch/cbindgen.toml b/contrib/tantivysearch/cbindgen.toml deleted file mode 100644 index a61dc8b17a13..000000000000 --- a/contrib/tantivysearch/cbindgen.toml +++ /dev/null @@ -1,20 +0,0 @@ -header = "// SPDX-License-Identifier: Apache-2.0" -sys_includes = ["stddef.h", "stdint.h", "stdlib.h"] -no_includes = true -include_guard = "TANTIVYSEARCH_H" -tab_width = 4 -style = "Type" -# language = "C" -cpp_compat = true - -[parse] -parse_deps = true -include = ['tantivysearch'] - -[export] -prefix = "TantivySearch" -item_types = ["enums", "structs", "unions", "typedefs", "opaque", "functions"] - -[enum] -rename_variants = "ScreamingSnakeCase" -prefix_with_name = true diff --git a/contrib/tantivysearch/include/tantivysearch.h b/contrib/tantivysearch/include/tantivysearch.h deleted file mode 100644 index e58c3a002993..000000000000 --- 
a/contrib/tantivysearch/include/tantivysearch.h +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#ifndef TANTIVYSEARCH_H -#define TANTIVYSEARCH_H - -#include -#include -#include - -struct TantivySearchIndexRW; - -struct TantivySearchIterWrapper; - -extern "C" { - -TantivySearchIndexRW *tantivysearch_open_or_create_index(const char *dir_ptr); - -TantivySearchIterWrapper *tantivysearch_search(TantivySearchIndexRW *irw, - const char *query_ptr, - uint64_t limit); - -TantivySearchIterWrapper *tantivysearch_ranked_search(TantivySearchIndexRW *irw, - const char *query_ptr, - uint64_t limit); - -unsigned char tantivysearch_index(TantivySearchIndexRW *irw, - const uint64_t *primary_ids, - const uint64_t *secondary_ids, - const char *chars, - const uint64_t *offsets, - size_t size); - -unsigned char tantivysearch_writer_commit(TantivySearchIndexRW *irw); - -unsigned char tantivysearch_index_truncate(TantivySearchIndexRW *irw); - -unsigned char tantivysearch_iter_next(TantivySearchIterWrapper *iter_ptr, - uint64_t *primary_id_ptr, - uint64_t *secondary_id_ptr); - -size_t tantivysearch_iter_batch(TantivySearchIterWrapper *iter_ptr, - uint64_t count, - uint64_t *primary_ids_ptr, - uint64_t *secondary_ids_ptr); - -size_t tantivysearch_iter_count(TantivySearchIterWrapper *iter_ptr); - -void tantivysearch_iter_free(TantivySearchIterWrapper *iter_ptr); - -void tantivysearch_index_free(TantivySearchIndexRW *irw); - -void tantivysearch_index_delete(TantivySearchIndexRW *irw); - -} // extern "C" - -#endif // TANTIVYSEARCH_H diff --git a/contrib/tantivysearch/src/cache.rs b/contrib/tantivysearch/src/cache.rs deleted file mode 100644 index 13850164962c..000000000000 --- a/contrib/tantivysearch/src/cache.rs +++ /dev/null @@ -1,127 +0,0 @@ -use std::borrow::Borrow; -use std::fmt::{self, Debug, Formatter}; -use std::hash::{BuildHasher, Hash}; -use std::time::Instant; - -use once_cell::sync::OnceCell; -use flurry::{HashMap, DefaultHashBuilder}; - -pub struct 
ConcurrentCache { - size: usize, - seconds: u64, - items: HashMap), S> -} - -impl ConcurrentCache -where - K: 'static + Hash + Ord + Clone + Send + Sync, - V: 'static + Clone + Send + Sync -{ - /// Constructs a new `ConcurrentCache` with the default hashing algorithm and an - /// initial capacity of 0. - #[must_use] - pub fn new(size: usize, seconds: u64) -> Self { - Self::with_capacity(size, seconds, 0) - } - - /// Constructs a new `ConcurrentCache` with the default hashing algorithm and the - /// specified initial capacity. - #[must_use] - pub fn with_capacity(size: usize, seconds: u64, capacity: usize) -> Self { - Self::with_capacity_and_hasher(size, seconds, capacity, DefaultHashBuilder::default()) - } -} - - -impl ConcurrentCache -where - K: 'static + Hash + Ord + Clone + Send + Sync, - V: 'static + Clone + Send + Sync, - S: BuildHasher + Clone -{ - /// Constructs a new `ConcurrentCache` with the specified hasher and an initial - /// capacity of 0. - #[must_use] - pub fn with_hasher(size: usize, seconds: u64, hasher: S) -> Self { - Self::with_capacity_and_hasher(size, seconds, 0, hasher) - } - - /// Constructs a new `ConcurrentCache` with the specified hasher and initial - /// capacity. - #[must_use] - pub fn with_capacity_and_hasher(size: usize, seconds: u64, capacity: usize, hasher: S) -> Self { - Self { size, seconds, items: HashMap::with_capacity_and_hasher(capacity, hasher) } - } - - /// Returns `true` if the cache currently contains no items and `false` - /// otherwise. - #[must_use] - pub fn is_empty(&self) -> bool { - self.items.pin().is_empty() - } - - /// Returns the number of items currently in the cache. - #[must_use] - pub fn len(&self) -> usize { - self.items.pin().len() - } - - /// Empties the cache of all items. - pub fn clear(&self) { - self.items.pin().clear() - } - - /// Retrieves the value with the specified key, or initializes it if it is - /// not present. 
- /// - /// If the key is present but the value is not fully resolved, the current - /// thread will block until resolution completes. If the key is not present, - /// `init` is executed to produce a value. In either case, an immutable - /// reference to the value is returned. - /// - /// # Notes - /// The resolution closure, `init`, does not provide access to the key being - /// resolved. You may need to provide a copy of this value to the closure. - /// This is done to allow for maximum concurrency, as it permits the key - /// to be accessed by other threads during the resolution process. - pub fn resolve V>(&self, key: K, init: F) -> V { - let pinned = self.items.pin(); - - if let Some(val_ref) = pinned.get(&key) { - if val_ref.0.elapsed().as_secs() <= self.seconds { - let result_ref = val_ref.1.get_or_init(init); - let result = result_ref.clone(); - return result; - } - } - - match pinned.try_insert(key.clone(), (Instant::now(), OnceCell::new())) { - Ok(val_ref) => { - let result = val_ref.1.get_or_init(init).clone(); - if pinned.len() > self.size { - let mut count = 0; - // Max size reached, try to evict expired items or random valid item - pinned.retain(|k, v| { - let valid = v.0.elapsed().as_secs() <= self.seconds; - if valid { - count += 1; - count <= self.size - } else { - false - } - }); - } - result - } - Err(e) => { - let val_ref = e.current; - if val_ref.0.elapsed().as_secs() <= self.seconds { - val_ref.1.get_or_init(init).clone() - } else { - pinned.insert(key.clone(), e.not_inserted); - pinned.get(&key).expect("this should not happen").1.get_or_init(init).clone() - } - } - } - } -} diff --git a/contrib/tantivysearch/src/lib.rs b/contrib/tantivysearch/src/lib.rs deleted file mode 100644 index 6936d0ea444c..000000000000 --- a/contrib/tantivysearch/src/lib.rs +++ /dev/null @@ -1,656 +0,0 @@ -use std::ffi::CString; -use std::ffi::CStr; -use std::mem; -use std::ptr; -use std::slice; -use std::iter::FusedIterator; -use std::cmp::Ordering; - -use libc::*; - 
-use tantivy::collector::{TopDocs, Count}; -use tantivy::query::QueryParser; -use tantivy::schema::*; -use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::{Index, IndexReader, IndexWriter, SegmentReader, SegmentLocalId, DocId, Score, DocAddress, TantivyError}; -use tantivy::ReloadPolicy; -use rayon::prelude::*; -use std::sync::Arc; - -mod cache; - -static CACHE: once_cell::sync::Lazy, Vec)>>> = once_cell::sync::Lazy::new(|| { - cache::ConcurrentCache::with_capacity(100, 3600, 110) -}); - -const TIMING: bool = true; - -macro_rules! start { - ($val:ident) => { - let $val = if TIMING { - Some(std::time::Instant::now()) - } else { - None - }; - }; -} - -macro_rules! end { - ($val:ident) => { - if TIMING { - let $val = $val.unwrap(); - dbg!($val.elapsed()); - } - }; - ($val:ident, $ex:expr) => { - if TIMING { - let $val = $val.unwrap(); - dbg!($val.elapsed(), $ex); - } - }; -} - -#[derive(Default)] -pub struct Docs { - limit: usize -} - -impl Docs { - pub fn with_limit(limit: usize) -> Docs { - Docs { limit } - } -} - -impl Collector for Docs { - type Fruit = Vec<(Score, DocAddress)>; - - type Child = SegmentDocsCollector; - - fn for_segment( - &self, - segment_local_id: SegmentLocalId, - _: &SegmentReader, - ) -> tantivy::Result { - Ok(SegmentDocsCollector { docs: vec!(), segment_local_id, limit: self.limit }) - } - - fn requires_scoring(&self) -> bool { - false - } - - fn merge_fruits(&self, segment_docs: Vec>) -> tantivy::Result> { - start!(merge); - let lens: Vec<_> = segment_docs.iter().map(|v| v.len()).collect(); - let full_len = lens.iter().sum(); - - let mut all = Vec::with_capacity(full_len); - unsafe { all.set_len(full_len) }; - - let mut mut_slice = &mut all[..]; - let mut mut_slices = vec!(); - for len in lens { - let (slice, rest) = mut_slice.split_at_mut(len); - mut_slices.push(slice); - mut_slice = rest; - } - - segment_docs.into_par_iter().zip(mut_slices.into_par_iter()).for_each(|(vec, slice)| { - slice.copy_from_slice(&vec[..]); - }); 
- end!(merge); - - start!(resize); - if all.len() > self.limit { - all.resize(self.limit, (0.0f32, DocAddress(0, 0))); - } - end!(resize); - - Ok(all) - } -} - -#[derive(Clone, Copy, Debug)] -pub struct OrdDoc(Score, DocAddress); - -impl Ord for OrdDoc { - fn cmp(&self, other: &Self) -> Ordering { - self.0.partial_cmp(&other.0).unwrap_or(self.1.cmp(&other.1)) - } -} - -impl PartialOrd for OrdDoc { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for OrdDoc { - fn eq(&self, other: &Self) -> bool { - self.1 == other.1 - } -} - -impl Eq for OrdDoc {} - -#[derive(Default)] -pub struct RankedDocs { - limit: usize -} - -impl RankedDocs { - pub fn with_limit(limit: usize) -> RankedDocs { - RankedDocs { limit } - } -} - -impl Collector for RankedDocs { - type Fruit = Vec; - - type Child = SegmentOrdDocsCollector; - - fn for_segment( - &self, - segment_local_id: SegmentLocalId, - _: &SegmentReader, - ) -> tantivy::Result { - Ok(SegmentOrdDocsCollector { docs: vec!(), segment_local_id, limit: self.limit }) - } - - fn requires_scoring(&self) -> bool { - true - } - - fn merge_fruits(&self, segment_docs: Vec>) -> tantivy::Result> { - start!(merge); - let lens: Vec<_> = segment_docs.iter().map(|v| v.len()).collect(); - let full_len = lens.iter().sum(); - - let mut all = Vec::with_capacity(full_len); - unsafe { all.set_len(full_len) }; - - let mut mut_slice = &mut all[..]; - let mut mut_slices = vec!(); - for len in lens { - let (slice, rest) = mut_slice.split_at_mut(len); - mut_slices.push(slice); - mut_slice = rest; - } - - segment_docs.into_par_iter().zip(mut_slices.into_par_iter()).for_each(|(vec, slice)| { - slice.copy_from_slice(&vec[..]); - }); - end!(merge); - - start!(sort); - all.par_sort(); - end!(sort); - - start!(resize); - if all.len() > self.limit { - all.resize(self.limit, OrdDoc(0.0f32, DocAddress(0, 0))); - } - end!(resize); - - Ok(all) - } -} - -#[derive(Default)] -pub struct SegmentOrdDocsCollector { - docs: 
Vec, - segment_local_id: SegmentLocalId, - limit: usize -} - -impl SegmentCollector for SegmentOrdDocsCollector { - type Fruit = Vec; - - #[inline] - fn collect(&mut self, doc_id: DocId, score: Score) { - if self.docs.len() < self.limit { - self.docs.push(OrdDoc(score, DocAddress(self.segment_local_id, doc_id))); - } - } - - fn harvest(self) -> Vec { - self.docs - } -} - -#[derive(Default)] -pub struct SegmentDocsCollector { - docs: Vec<(Score, DocAddress)>, - segment_local_id: SegmentLocalId, - limit: usize -} - -impl SegmentCollector for SegmentDocsCollector { - type Fruit = Vec<(Score, DocAddress)>; - - #[inline] - fn collect(&mut self, doc_id: DocId, score: Score) { - if self.docs.len() < self.limit { - self.docs.push((score, DocAddress(self.segment_local_id, doc_id))); - } - } - - fn harvest(self) -> Vec<(Score, DocAddress)> { - self.docs - } -} - -fn leak_buf(v: Vec, vallen: *mut size_t) -> *mut c_char { - unsafe { - *vallen = v.len(); - } - let mut bsv = v.into_boxed_slice(); - let val = bsv.as_mut_ptr() as *mut _; - mem::forget(bsv); - val -} - -// #[no_mangle] -// pub unsafe extern "C" fn tantivy_free_buf(buf: *mut c_char, sz: size_t) { -// drop(Vec::from_raw_parts(buf, sz, sz)); -// } -#[derive(Clone)] -pub struct IterWrapper { - inner: Arc<(Vec, Vec)>, - offset: usize -} - -impl From, Vec)>> for IterWrapper { - fn from(inner: Arc<(Vec, Vec)>) -> IterWrapper { - IterWrapper { inner, offset: 0 } - } -} - -impl Iterator for IterWrapper { - type Item = (u64, u64); - - #[inline] - fn next(&mut self) -> Option<(u64, u64)> { - if self.offset >= self.inner.0.len() { - None - } else { - let result = Some((self.inner.0[self.offset], self.inner.1[self.offset])); - self.offset += 1; - result - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let size = self.inner.0.len() - self.offset; - (size, Some(size)) - } - - #[inline] - fn count(self) -> usize { - self.inner.0.len() - self.offset - } -} - -impl FusedIterator for IterWrapper {} - - 
-#[derive(Clone)] -pub struct VecIterWrapper { - iter: std::vec::IntoIter<(u64, u64)> -} - -impl From> for VecIterWrapper { - fn from(iter: std::vec::IntoIter<(u64, u64)>) -> VecIterWrapper { - VecIterWrapper { iter } - } -} - -impl Iterator for VecIterWrapper { - type Item = (u64, u64); - - #[inline] - fn next(&mut self) -> Option<(u64, u64)> { - self.iter.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.iter.count() - } -} - -impl DoubleEndedIterator for VecIterWrapper { - #[inline] - fn next_back(&mut self) -> Option<(u64, u64)> { - self.iter.next_back() - } -} - -impl FusedIterator for VecIterWrapper {} - -pub struct IndexRW { - pub path: String, - pub index: Index, - pub reader: IndexReader, - pub writer: IndexWriter -} - -#[no_mangle] -pub extern "C" fn tantivysearch_open_or_create_index(dir_ptr: *const c_char) -> *mut IndexRW { - let dir_c_str = unsafe { - assert!(!dir_ptr.is_null()); - - CStr::from_ptr(dir_ptr) - }; - - let dir_str = dir_c_str.to_str().expect("failed to get &str from cstr"); - - println!("Opening index on {}", dir_str); - let mut index = match Index::open_in_dir(dir_str) { - Ok(index) => index, - Err(e) => { - match e { - TantivyError::PathDoesNotExist(_) => { - println!("Creating index on {}", dir_str); - std::fs::create_dir_all(dir_str).expect("failed to create index dir"); - let mut schema_builder = Schema::builder(); - schema_builder.add_u64_field("primary_id", FAST); - schema_builder.add_u64_field("secondary_id", FAST); - schema_builder.add_text_field("body", TEXT); - let schema = schema_builder.build(); - Index::create_in_dir(dir_str, schema).expect("failed to create index") - } - _ => { - panic!("this should not happen"); - } - } - } - }; - - index.set_default_multithread_executor().expect("failed to create thread pool"); - let reader = index - .reader_builder() - .reload_policy(ReloadPolicy::OnCommit) - .try_into().expect("failed to 
create reader"); - let writer = index - .writer(1024 * 1024 * 1024) - .expect("failed to create writer"); - - // let mut policy = tantivy::merge_policy::LogMergePolicy::default(); - // policy.set_max_merge_size(3_000_000); - - // writer.set_merge_policy(Box::new(policy)); - - Box::into_raw(Box::new(IndexRW { index, reader, writer, path: dir_str.to_string() })) -} - -pub fn tantivysearch_search_impl(irw: *mut IndexRW, query_str: &str, limit: u64) -> Arc<(Vec, Vec)> { - CACHE.resolve((irw as usize, query_str.to_string(), limit, false), move || { - println!("Searching index for {} with limit {}", query_str, limit); - let search = std::time::Instant::now(); - - let schema = unsafe { (*irw).index.schema() }; - - let body = schema.get_field("body").expect("missing field body"); - let primary_id = schema.get_field("primary_id").expect("missing field primary_id"); - let secondary_id = schema.get_field("secondary_id").expect("missing field secondary_id"); - - let searcher = unsafe { (*irw).reader.searcher() }; - let segment_readers = searcher.segment_readers(); - let ff_readers_primary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(primary_id).unwrap() - }).collect(); - let ff_readers_secondary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(secondary_id).unwrap() - }).collect(); - - - let query_parser = QueryParser::for_index(unsafe { &(*irw).index }, vec![body]); - - let query = query_parser.parse_query(query_str).expect("failed to parse query"); - let docs = searcher.search(&query, &Docs::with_limit(limit as usize)).expect("failed to search"); - let mut results: (Vec<_>, Vec<_>) = docs.into_par_iter().map(|(_score, doc_address)| { - let ff_reader_primary = &ff_readers_primary[doc_address.segment_ord() as usize]; - let ff_reader_secondary = &ff_readers_secondary[doc_address.segment_ord() as usize]; - let primary_id: u64 = ff_reader_primary.get(doc_address.doc()); - let secondary_id: u64 = 
ff_reader_secondary.get(doc_address.doc()); - (primary_id, secondary_id) - }).unzip(); - - dbg!(search.elapsed()); - Arc::new(results) - }) -} - -pub fn tantivysearch_ranked_search_impl(irw: *mut IndexRW, query_str: &str, limit: u64) -> Arc<(Vec, Vec)> { - CACHE.resolve((irw as usize, query_str.to_string(), limit, true), move || { - println!("Searching index for {} with limit {} and ranking", query_str, limit); - let search = std::time::Instant::now(); - - let schema = unsafe { (*irw).index.schema() }; - - let body = schema.get_field("body").expect("missing field body"); - let primary_id = schema.get_field("primary_id").expect("missing field primary_id"); - let secondary_id = schema.get_field("secondary_id").expect("missing field secondary_id"); - - let searcher = unsafe { (*irw).reader.searcher() }; - let segment_readers = searcher.segment_readers(); - let ff_readers_primary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(primary_id).unwrap() - }).collect(); - let ff_readers_secondary: Vec<_> = segment_readers.iter().map(|seg_r| { - let ffs = seg_r.fast_fields(); - ffs.u64(secondary_id).unwrap() - }).collect(); - - - let query_parser = QueryParser::for_index(unsafe { &(*irw).index }, vec![body]); - - let query = query_parser.parse_query(query_str).expect("failed to parse query"); - let docs = searcher.search(&query, &RankedDocs::with_limit(limit as usize)).expect("failed to search"); - let mut results: (Vec<_>, Vec<_>) = docs.into_par_iter().map(|OrdDoc(_score, doc_address)| { - let ff_reader_primary = &ff_readers_primary[doc_address.segment_ord() as usize]; - let ff_reader_secondary = &ff_readers_secondary[doc_address.segment_ord() as usize]; - let primary_id: u64 = ff_reader_primary.get(doc_address.doc()); - let secondary_id: u64 = ff_reader_secondary.get(doc_address.doc()); - (primary_id, secondary_id) - }).unzip(); - - dbg!(search.elapsed()); - Arc::new(results) - }) -} - -#[no_mangle] -pub extern "C" fn 
tantivysearch_search(irw: *mut IndexRW, query_ptr: *const c_char, limit: u64) -> *mut IterWrapper { - assert!(!irw.is_null()); - - let query_c_str = unsafe { - assert!(!query_ptr.is_null()); - - CStr::from_ptr(query_ptr) - }; - - let query_str = query_c_str.to_str().expect("failed to get &str from cstr"); - - let results = tantivysearch_search_impl(irw, query_str, limit); - - println!("Search results: {}", results.0.len()); - - Box::into_raw(Box::new(results.into())) -} - -#[no_mangle] -pub extern "C" fn tantivysearch_ranked_search(irw: *mut IndexRW, query_ptr: *const c_char, limit: u64) -> *mut IterWrapper { - assert!(!irw.is_null()); - - let query_c_str = unsafe { - assert!(!query_ptr.is_null()); - - CStr::from_ptr(query_ptr) - }; - - let query_str = query_c_str.to_str().expect("failed to get &str from cstr"); - - let results = tantivysearch_ranked_search_impl(irw, query_str, limit); - - println!("Search results: {}", results.0.len()); - - Box::into_raw(Box::new(results.into())) -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index(irw: *mut IndexRW, primary_ids: *const u64, secondary_ids: *const u64, chars: *const c_char, offsets: *const u64, size: size_t) -> c_uchar { - assert!(!irw.is_null()); - assert!(!primary_ids.is_null()); - assert!(!secondary_ids.is_null()); - assert!(!offsets.is_null()); - assert!(!chars.is_null()); - if size == 0 { - return 1; - } - let primary_slice = unsafe { slice::from_raw_parts(primary_ids, size) }; - let secondary_slice = unsafe { slice::from_raw_parts(secondary_ids, size) }; - let offsets_slice = unsafe { slice::from_raw_parts(offsets, size) }; - let chars_len: usize = (*offsets_slice.iter().last().unwrap()) as usize; - let chars_slice = unsafe { slice::from_raw_parts(chars as *const u8, chars_len) }; - let mut strs = Vec::with_capacity(size); - let mut current_start = 0; - for i in 0..size { - let end: usize = (offsets_slice[i] as usize - 1); - strs.push(unsafe { 
std::str::from_utf8_unchecked(&chars_slice[current_start..end]) }); - current_start = end + 1; - } - - let schema = unsafe { (*irw).index.schema() }; - - let body = schema.get_field("body").expect("missing field body"); - let primary_id = schema.get_field("primary_id").expect("missing field primary_id"); - let secondary_id = schema.get_field("secondary_id").expect("missing field secondary_id"); - - for i in 0..size { - let mut doc = Document::default(); - doc.add_u64(primary_id, primary_slice[i]); - doc.add_u64(secondary_id, secondary_slice[i]); - doc.add_text(body, strs[i]); - unsafe { (*irw).writer.add_document(doc) }; - } - - 1 -} - -#[no_mangle] -pub extern "C" fn tantivysearch_writer_commit(irw: *mut IndexRW) -> c_uchar { - assert!(!irw.is_null()); - match unsafe { (*irw).writer.commit() } { - Ok(_) => 1, - Err(e) => { - eprintln!("Failed to commit writer: {}", e); - 0 - } - } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index_truncate(irw: *mut IndexRW) -> c_uchar { - assert!(!irw.is_null()); - match unsafe { (*irw).writer.delete_all_documents() } { - Ok(_) => { - match unsafe { (*irw).writer.commit() } { - Ok(_) => 1, - Err(e) => { - eprintln!("Failed to commit writer: {}", e); - 0 - } - } - }, - Err(e) => { - eprintln!("Failed to delete all documents: {}", e); - 0 - } - } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_next(iter_ptr: *mut IterWrapper, primary_id_ptr: *mut u64, secondary_id_ptr: *mut u64) -> c_uchar { - assert!(!iter_ptr.is_null()); - match unsafe { (*iter_ptr).next() } { - Some((primary_id, secondary_id)) => { - unsafe { - *primary_id_ptr = primary_id; - *secondary_id_ptr = secondary_id; - } - 1 - } - None => 0 - } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_batch(iter_ptr: *mut IterWrapper, count: u64, primary_ids_ptr: *mut u64, secondary_ids_ptr: *mut u64) -> size_t { - assert!(!iter_ptr.is_null()); - if primary_ids_ptr.is_null() { - return 0; - } - - let iter_size = unsafe { (*iter_ptr).inner.0.len() - 
(*iter_ptr).offset }; - let n_to_write = std::cmp::min(count as usize, iter_size); - - unsafe { - let src_ptr = (*iter_ptr).inner.0.as_ptr().offset((*iter_ptr).offset as isize); - std::ptr::copy_nonoverlapping(src_ptr, primary_ids_ptr, n_to_write); - } - - if !secondary_ids_ptr.is_null() { - unsafe { - let src_ptr = (*iter_ptr).inner.1.as_ptr().offset((*iter_ptr).offset as isize); - std::ptr::copy_nonoverlapping(src_ptr, secondary_ids_ptr, n_to_write); - } - } - - unsafe { (*iter_ptr).offset += n_to_write }; - - n_to_write -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_count(iter_ptr: *mut IterWrapper) -> size_t { - assert!(!iter_ptr.is_null()); - unsafe { (*iter_ptr).inner.0.len() - (*iter_ptr).offset } -} - -#[no_mangle] -pub extern "C" fn tantivysearch_iter_free(iter_ptr: *mut IterWrapper) { - assert!(!iter_ptr.is_null()); - drop(unsafe { Box::from_raw(iter_ptr) }); -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index_free(irw: *mut IndexRW) { - assert!(!irw.is_null()); - drop(unsafe { Box::from_raw(irw) }); -} - -#[no_mangle] -pub extern "C" fn tantivysearch_index_delete(irw: *mut IndexRW) { - assert!(!irw.is_null()); - let path = unsafe { (*irw).path.clone() }; - std::fs::remove_dir_all(path).expect("failed to delete index"); - println!("removed dir"); -} - -#[cfg(test)] -mod tests { - #[test] - fn it_works() { - assert_eq!(2 + 2, 4); - } -} From bab0ac40d4d3b7cf59eb2febbe627bf9a4a6d0ed Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 07:37:57 +0800 Subject: [PATCH 06/12] Rename tantivy to lucene --- src/Functions/{Tantivy.cpp => Lucene.cpp} | 10 +-- src/Functions/registerFunctionsString.cpp | 4 +- .../{StorageTantivy.cpp => StorageLucene.cpp} | 74 +++++++++---------- .../{StorageTantivy.h => StorageLucene.h} | 16 ++-- src/Storages/registerStorages.cpp | 4 +- 5 files changed, 51 insertions(+), 57 deletions(-) rename src/Functions/{Tantivy.cpp => Lucene.cpp} (93%) rename src/Storages/{StorageTantivy.cpp => StorageLucene.cpp} (85%) 
rename src/Storages/{StorageTantivy.h => StorageLucene.h} (83%) diff --git a/src/Functions/Tantivy.cpp b/src/Functions/Lucene.cpp similarity index 93% rename from src/Functions/Tantivy.cpp rename to src/Functions/Lucene.cpp index 0c3800d8e8a6..879b041f3c46 100644 --- a/src/Functions/Tantivy.cpp +++ b/src/Functions/Lucene.cpp @@ -18,14 +18,14 @@ namespace ErrorCodes namespace { -class FunctionTantivy : public IFunction +class FunctionLucene : public IFunction { public: - static constexpr auto name = "tantivy"; + static constexpr auto name = "lucene"; static FunctionPtr create(const Context &) { - return std::make_shared(); + return std::make_shared(); } std::string getName() const override @@ -107,9 +107,9 @@ class FunctionTantivy : public IFunction } -void registerFunctionTantivy(FunctionFactory & factory) +void registerFunctionLucene(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index d889f52dc5c7..bd096a904881 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -35,7 +35,7 @@ void registerFunctionNormalizedQueryHash(FunctionFactory &); void registerFunctionCountMatches(FunctionFactory &); void registerFunctionEncodeXMLComponent(FunctionFactory &); void registerFunctionDecodeXMLComponent(FunctionFactory &); -void registerFunctionTantivy(FunctionFactory &); +void registerFunctionLucene(FunctionFactory &); void registerFunctionExtractTextFromHTML(FunctionFactory &); @@ -75,7 +75,7 @@ void registerFunctionsString(FunctionFactory & factory) registerFunctionCountMatches(factory); registerFunctionEncodeXMLComponent(factory); registerFunctionDecodeXMLComponent(factory); - registerFunctionTantivy(factory); + registerFunctionLucene(factory); registerFunctionExtractTextFromHTML(factory); #if USE_BASE64 registerFunctionBase64Encode(factory); diff --git 
a/src/Storages/StorageTantivy.cpp b/src/Storages/StorageLucene.cpp similarity index 85% rename from src/Storages/StorageTantivy.cpp rename to src/Storages/StorageLucene.cpp index e8a0dbf6538d..9ffab9e0d1ff 100644 --- a/src/Storages/StorageTantivy.cpp +++ b/src/Storages/StorageLucene.cpp @@ -2,7 +2,7 @@ #include -#include +#include #include #include @@ -37,12 +37,12 @@ namespace ErrorCodes } -class TantivySource : public SourceWithProgress +class LuceneSource : public SourceWithProgress { public: - TantivySource( + LuceneSource( Names column_names_, - const StorageTantivy & storage_, + const StorageLucene & storage_, const StorageMetadataPtr & metadata_snapshot_, const String & query_text_, //const UInt64 limit_, @@ -68,10 +68,10 @@ class TantivySource : public SourceWithProgress this->hits = collector->topDocs()->scoreDocs; } -// ~TantivySource() override { +// ~LuceneSource() override { // this->reader->close(); // } - String getName() const override { return "Tantivy"; } + String getName() const override { return "Lucene"; } protected: Chunk generate() override @@ -123,11 +123,11 @@ class TantivySource : public SourceWithProgress Lucene::Collection hits; }; -class TantivyBlockOutputStream : public IBlockOutputStream +class LuceneBlockOutputStream : public IBlockOutputStream { public: - explicit TantivyBlockOutputStream( - StorageTantivy & storage_, + explicit LuceneBlockOutputStream( + StorageLucene & storage_, const StorageMetadataPtr & metadata_snapshot_) : storage(storage_) , metadata_snapshot(metadata_snapshot_) @@ -212,14 +212,14 @@ class TantivyBlockOutputStream : public IBlockOutputStream } } private: - StorageTantivy & storage; + StorageLucene & storage; StorageMetadataPtr metadata_snapshot; size_t primary_id_pos = 0; }; -StorageTantivy::StorageTantivy(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_) - : IStorage(table_id_), index_path(index_path_), 
log(&Poco::Logger::get("StorageTantivy (" + table_id_.table_name + ")")) +StorageLucene::StorageLucene(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_) + : IStorage(table_id_), index_path(index_path_), log(&Poco::Logger::get("StorageLucene (" + table_id_.table_name + ")")) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(std::move(columns_description_)); @@ -230,7 +230,7 @@ StorageTantivy::StorageTantivy(const StorageID & table_id_, ColumnsDescription c } -Pipe StorageTantivy::read( +Pipe StorageLucene::read( const Names & column_names, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info, @@ -250,10 +250,10 @@ Pipe StorageTantivy::read( ErrorCodes::NOT_IMPLEMENTED); } const auto * function = where->as(); - if (function->name != "tantivy") + if (function->name != "lucene") { throw Exception( - "WHERE clause should contain only tantivy function", + "WHERE clause should contain only lucene function", ErrorCodes::NOT_IMPLEMENTED); } @@ -270,19 +270,19 @@ Pipe StorageTantivy::read( String query_text = function->arguments->children[0]->as().value.safeGet(); - - //Poco::File(this->index_path) - std::wstring_convert> converter; - Lucene::String index_path_ws = converter.from_bytes(index_path); - Lucene::FSDirectoryPtr dir = Lucene::FSDirectory::open(index_path_ws); - if(dir->listAll().empty()) { - std::cout << "No files in lucene index path: " << this->index_path << std::endl; - return {}; - } + //Poco::File(this->index_path) + std::wstring_convert> converter; + Lucene::String index_path_ws = converter.from_bytes(index_path); + Lucene::FSDirectoryPtr dir = Lucene::FSDirectory::open(index_path_ws); + if (dir->listAll().empty()) + { + std::cout << "No files in lucene index path: " << this->index_path << std::endl; + return {}; + } return Pipe( - std::make_shared( + std::make_shared( column_names, *this, metadata_snapshot, @@ -292,12 +292,12 @@ 
Pipe StorageTantivy::read( )); } -BlockOutputStreamPtr StorageTantivy::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) +BlockOutputStreamPtr StorageLucene::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) { - return std::make_shared(*this, metadata_snapshot); + return std::make_shared(*this, metadata_snapshot); } -bool StorageTantivy::optimize( +bool StorageLucene::optimize( const ASTPtr & /*query*/, const StorageMetadataPtr & /*metadata_snapshot*/, const ASTPtr & /*partition*/, @@ -313,7 +313,7 @@ bool StorageTantivy::optimize( return false; } -void StorageTantivy::truncate( +void StorageLucene::truncate( const ASTPtr & /*query*/, const StorageMetadataPtr & /* metadata_snapshot */, const Context & /* context */, @@ -322,24 +322,24 @@ void StorageTantivy::truncate( } -std::optional StorageTantivy::totalRows(const Settings &) const +std::optional StorageLucene::totalRows(const Settings &) const { /// All modifications of these counters are done under mutex which automatically guarantees synchronization/consistency /// When run concurrently we are fine with any value: "before" or "after" return total_size_rows.load(std::memory_order_relaxed); } -std::optional StorageTantivy::totalBytes(const Settings &) const +std::optional StorageLucene::totalBytes(const Settings &) const { return total_size_bytes.load(std::memory_order_relaxed); } -void StorageTantivy::startup() +void StorageLucene::startup() { return; } -void StorageTantivy::shutdown() +void StorageLucene::shutdown() { // if (this->reader) { // this->reader->close(); @@ -350,14 +350,14 @@ void StorageTantivy::shutdown() return; } -void StorageTantivy::drop() { +void StorageLucene::drop() { Poco::File(index_path).remove(true); return; } -void registerStorageTantivy(StorageFactory & factory) +void registerStorageLucene(StorageFactory & factory) { - factory.registerStorage("Tantivy", [](const 
StorageFactory::Arguments & args) + factory.registerStorage("Lucene", [](const StorageFactory::Arguments & args) { if (args.engine_args.size() != 1) throw Exception( @@ -366,7 +366,7 @@ void registerStorageTantivy(StorageFactory & factory) String index_path = args.engine_args[0]->as().value.safeGet(); - return StorageTantivy::create(args.table_id, args.columns, args.constraints, index_path); + return StorageLucene::create(args.table_id, args.columns, args.constraints, index_path); }); } diff --git a/src/Storages/StorageTantivy.h b/src/Storages/StorageLucene.h similarity index 83% rename from src/Storages/StorageTantivy.h rename to src/Storages/StorageLucene.h index 8d6646199d92..255347811429 100644 --- a/src/Storages/StorageTantivy.h +++ b/src/Storages/StorageLucene.h @@ -14,12 +14,6 @@ #include #include #include -//#include -//namespace Lucene { -// class IndexReader; -// class IndexWriter; -// -//} namespace DB { @@ -29,14 +23,14 @@ namespace DB * It does not support keys. * Data is stored as a set of blocks and is not stored anywhere else. 
*/ -class StorageTantivy final : public ext::shared_ptr_helper, public IStorage +class StorageLucene final : public ext::shared_ptr_helper, public IStorage { -friend struct ext::shared_ptr_helper; -friend class TantivyBlockOutputStream; +friend struct ext::shared_ptr_helper; +friend class LuceneBlockOutputStream; public: - String getName() const override { return "Tantivy"; } + String getName() const override { return "Lucene"; } size_t getSize() const { return data.size(); } @@ -94,7 +88,7 @@ friend class TantivyBlockOutputStream; Poco::Logger * log; protected: - StorageTantivy(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_); + StorageLucene(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_); }; } diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index b92ee2a0e276..874956caea42 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -26,7 +26,7 @@ void registerStorageJoin(StorageFactory & factory); void registerStorageView(StorageFactory & factory); void registerStorageMaterializedView(StorageFactory & factory); void registerStorageLiveView(StorageFactory & factory); -void registerStorageTantivy(StorageFactory & factory); +void registerStorageLucene(StorageFactory & factory); void registerStorageGenerateRandom(StorageFactory & factory); #if USE_AWS_S3 @@ -84,7 +84,7 @@ void registerStorages() registerStorageView(factory); registerStorageMaterializedView(factory); registerStorageLiveView(factory); - registerStorageTantivy(factory); + registerStorageLucene(factory); registerStorageGenerateRandom(factory); #if USE_AWS_S3 From c5dfc895db90094a7520f557b12801abf25620ee Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 08:06:13 +0800 Subject: [PATCH 07/12] Restore files from branch lucene-base --- README.md | 65 
---------------------------------- docker/builder/Dockerfile | 2 -- docker/builder/build.sh | 3 -- docker/server/.gitignore | 1 - docker/server/built.Dockerfile | 49 ------------------------- docker/server/prepare-built | 14 -------- 6 files changed, 134 deletions(-) delete mode 100644 docker/server/built.Dockerfile delete mode 100644 docker/server/prepare-built diff --git a/README.md b/README.md index ce2615f7c1c9..3329a98877ff 100644 --- a/README.md +++ b/README.md @@ -13,68 +13,3 @@ ClickHouse® is an open-source column-oriented database management system that a * [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation. * [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any. * You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person. - - -## Neoway Research - -This branch is part of a research where we implemented a proof of concept for full text search using [ClickHouse](https://github.com/ClickHouse/ClickHouse) and [Tantivy](https://github.com/tantivy-search/tantivy). - -Tantivy is a full text search engine library written in Rust. - -The implementation consists in creating the tantivy storage engine and tantivy SQL function. -Because this is just a test, we decided to hard code this three column names in the code so that we don't have to create all the logic behind dynamic column names with different types. It is hard-coded for columns `primary_id`, `secondary_id` and `body`. Then we can create the table using the query - -```sql -CREATE TABLE fulltext_table -( - primary_id UInt64, - secondary_id UInt64, - body String -) -ENGINE = Tantivy('/var/lib/clickhouse/tantivy/fulltext_table') --- Tantivy engine takes as parameter a path to save the data. 
-``` - -For the [Storage Engine](https://github.com/NeowayLabs/ClickHouse/blob/fulltext-21.3/src/Storages/StorageTantivy.cpp) it has to be able to receive data from the INSERT query and index into tantivy. For the SELECT queries we need to push the full text WHERE clause to tantivy and create a Clickhouse column with the result. - -Because the full text search query needs to be sent to tantivy we created an SQL function named tantivy, so the syntax for making queries is the following -```sql -SELECT primary_id -FROM fulltext_table -WHERE tantivy('full text query here') -``` -The `tantivy` SQL function doesn't return anything and has no logic inside. Its only purpose is to validade the input and generate the `ASTSelectQuery`. -Inside the storage engine we take the AST parameters and push the query to the Rust implementation inside the folder [contrib/tantivysearch](https://github.com/NeowayLabs/ClickHouse/tree/fulltext-21.3/contrib/tantivysearch). - -When data is indexed in tantivy it needs to be commited. That's an expensive job to do every insert so we decided to call it when optimize table is called -```sql -OPTIMIZE TABLE fulltext_table FINAL -``` -After the optimization the data is available for queries. - -## Results -We inserted 39 million texts with an average of 4895 characters, also all the texts were unique. Our testing machine is a n2d-standard-16, 16 CPU, 62.8G Mem, 2 Local SSD 375 GB in RAID 0, on Google Cloud. - -In our implementation we were not interested in retrieving the actual text from the search result. That means we chose to return only the ID columns and don't return the text. It would be easy to return the text, but for our use case we just want to have statistics on the data. An example would be to answer how many rows match with the phrase 'covid 19' ? The result for that is a query that runs at the same speed tantivy would run with a little increment of time to copy the result to a Clickhouse column. 
For the majority of searches we could get the result in milliseconds. Queries using OR operator and matching almost all the texts were slower and could time more than 1 second. - -Another use case is that we have a table with dozens of columns that is related to our fulltext_table by an ID. So we would have a query like this -```sql -SELECT * -FROM a_very_big_table -WHERE - -- many_filters_here - AND primary_id IN ( - SELECT primary_id - FROM fulltext_table - WHERE tantivy('full text query here') - ) -``` -Also we wanted to do many different queries, all with the same text filter, and running in parallel. Instead of doing the same query on tantivy at the same time, with the same result, we implemented a concurrent bounded cache mechanism that we can set a TTL and perform a single computation for multiple parallel queries on the same input resolving the same result to all once done. We noticed that the speed of those queries were fast making this solution very promising. - - -## Alternatives -Other alternatives to this is to use [data skipping indexes](https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/#table_engine-mergetree-data_skipping-indexes) or implement something akin to an [inverted index](https://hannes.muehleisen.org/SIGIR2014-column-stores-ir-prototyping.pdf) on SQL directly. - -Data skipping indexes requires a lot of parameter tuning and it is very tricky to make they work with the SQL functions. Even with all that tuning we got very poor performance. - -Inverted index is an interesting solution, but it is very complex to implement and requires an external tokenizer and big complicated queries to search the data. The performance is better than data skipping indexes but still too slow for a real scenario. 
diff --git a/docker/builder/Dockerfile b/docker/builder/Dockerfile index b6fa9ab6b4f1..199b5217d795 100644 --- a/docker/builder/Dockerfile +++ b/docker/builder/Dockerfile @@ -37,8 +37,6 @@ RUN apt-get update \ lldb-${LLVM_VERSION} \ --yes --no-install-recommends -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y - COPY build.sh / CMD ["/bin/bash", "/build.sh"] diff --git a/docker/builder/build.sh b/docker/builder/build.sh index d71411e65c9a..d4cf662e91b4 100755 --- a/docker/builder/build.sh +++ b/docker/builder/build.sh @@ -1,9 +1,6 @@ #!/usr/bin/env bash set -e -cd /server/contrib/tantivysearch -~/.cargo/bin/cargo build --release - #ccache -s # uncomment to display CCache statistics mkdir -p /server/build_docker cd /server/build_docker diff --git a/docker/server/.gitignore b/docker/server/.gitignore index 7f07d17405ab..692758d55aa1 100644 --- a/docker/server/.gitignore +++ b/docker/server/.gitignore @@ -1,3 +1,2 @@ alpine-root/* -built-root/* tgz-packages/* diff --git a/docker/server/built.Dockerfile b/docker/server/built.Dockerfile deleted file mode 100644 index 61be1ad1ae46..000000000000 --- a/docker/server/built.Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -FROM ubuntu:20.04 - -ARG gosu_ver=1.10 - -RUN apt-get update \ - && apt-get install --yes --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - dirmngr \ - locales \ - wget \ - && rm -rf \ - /var/lib/apt/lists/* \ - /var/cache/debconf \ - /tmp/* \ - && apt-get clean - -ADD https://github.com/tianon/gosu/releases/download/$gosu_ver/gosu-amd64 /bin/gosu - -RUN locale-gen en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LANGUAGE en_US:en -ENV LC_ALL en_US.UTF-8 -ENV TZ UTC - -RUN mkdir /docker-entrypoint-initdb.d -RUN mkdir -p /etc/clickhouse-server/config.d/ -RUN mkdir -p /etc/clickhouse-server/users.d/ - -COPY built-root/config.xml /etc/clickhouse-server/config.xml -COPY built-root/users.xml /etc/clickhouse-server/users.xml -COPY built-root/clickhouse /usr/bin/clickhouse -RUN ln -s /usr/bin/clickhouse 
/usr/bin/clickhouse-client -RUN ln -s /usr/bin/clickhouse /usr/bin/clickhouse-server -COPY docker_related_config.xml /etc/clickhouse-server/config.d/ -COPY entrypoint.sh /entrypoint.sh - -RUN useradd -M -U -u 999 clickhouse - -RUN chmod +x \ - /entrypoint.sh \ - /bin/gosu - -EXPOSE 9000 8123 9009 -VOLUME /var/lib/clickhouse - -ENV CLICKHOUSE_CONFIG /etc/clickhouse-server/config.xml - -ENTRYPOINT ["/entrypoint.sh"] diff --git a/docker/server/prepare-built b/docker/server/prepare-built deleted file mode 100644 index eec10ed032a8..000000000000 --- a/docker/server/prepare-built +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -e - -mkdir built-root -cd built-root - -SRC_DIR=../../.. -BUILD_DIR=${SRC_DIR}/build_docker - -cp ${BUILD_DIR}/programs/clickhouse . -cp ${SRC_DIR}/programs/server/{config,users}.xml . - -strip clickhouse \ No newline at end of file From 2895764a5517428a18b10fac6ff7ec61fa02c759 Mon Sep 17 00:00:00 2001 From: madianjun Date: Sun, 18 Jul 2021 20:21:40 +0800 Subject: [PATCH 08/12] Remove fixed column name --- src/Storages/StorageLucene.cpp | 133 ++++++++++++++------------------- 1 file changed, 57 insertions(+), 76 deletions(-) diff --git a/src/Storages/StorageLucene.cpp b/src/Storages/StorageLucene.cpp index 9ffab9e0d1ff..a0e1d709e655 100644 --- a/src/Storages/StorageLucene.cpp +++ b/src/Storages/StorageLucene.cpp @@ -80,29 +80,22 @@ class LuceneSource : public SourceWithProgress return {}; const auto & sample_block = metadata_snapshot->getSampleBlock(); - //const auto & key_column = sample_block.getByName("primary_id"); auto columns = sample_block.cloneEmptyColumns(); - size_t primary_id_pos = sample_block.getPositionByName("primary_id"); std::wstring_convert> converter; for (int i = 0; i < hits.size(); ++i) { Lucene::DocumentPtr doc = this->searcher->doc(hits[i]->doc); - Lucene::String primary_id = doc->get(L"primary_id"); - Lucene::String secondary_id = doc->get(L"secondary_id"); - std::wcout << "Lucene searched doc: " << primary_id << ", " << 
secondary_id <getSampleBlock(); } void write(const Block & block) override @@ -153,59 +139,57 @@ class LuceneBlockOutputStream : public IBlockOutputStream ErrorCodes::NOT_IMPLEMENTED); } - auto & primary_id = block.getByName("primary_id"); - auto primary_id_col = checkAndGetColumn(primary_id.column.get()); - auto & secondary_id = block.getByName("secondary_id"); - auto secondary_id_col = checkAndGetColumn(secondary_id.column.get()); - auto & body = block.getByName("body"); - auto body_col = checkAndGetColumn(body.column.get()); - if (primary_id_col && secondary_id_col && body_col) - { - std::wstring_convert> converter; - Lucene::String index_path_ws = converter.from_bytes(storage.index_path); - Lucene::IndexWriterPtr writer = Lucene::newLucene( - Lucene::FSDirectory::open(index_path_ws), - Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), - true, - Lucene::IndexWriter::MaxFieldLengthLIMITED); + std::wstring_convert> converter; + Lucene::String index_path_ws = converter.from_bytes(storage.index_path); + Lucene::IndexWriterPtr writer = Lucene::newLucene( + Lucene::FSDirectory::open(index_path_ws), + Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + true, + Lucene::IndexWriter::MaxFieldLengthLIMITED); - auto rows = block.rows(); + auto rows = block.rows(); - WriteBufferFromOwnString write_buffer; + WriteBufferFromOwnString write_buffer; - for (size_t i = 0; i < rows; i++) + for (size_t i = 0; i < rows; i++) + { + std::cout << "Lucene inserting row[" << i << "]" << std::endl; + Lucene::DocumentPtr doc = Lucene::newLucene(); + size_t idx = 0; + for (const auto & elem : block) { - std::cout << "Lucene inserting row[" << i << "]" <(); - size_t idx = 0; - for (const auto & elem : block) + write_buffer.restart(); + auto column_name = block.safeGetByPosition(idx).name; + + if (idx < block.columns() - 1) { - write_buffer.restart(); - auto column_name = block.safeGetByPosition(idx).name; - - if (idx < block.columns() - 1) - { - 
elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); - doc->add(Lucene::newLucene(converter.from_bytes(column_name), converter.from_bytes(write_buffer.str()), Lucene::Field::STORE_YES, Lucene::Field::INDEX_NOT_ANALYZED)); - } else { - elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); - doc->add(Lucene::newLucene(converter.from_bytes(column_name), converter.from_bytes(write_buffer.str()), Lucene::Field::STORE_NO, Lucene::Field::INDEX_ANALYZED)); - } - ++idx; + elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); + doc->add(Lucene::newLucene( + converter.from_bytes(column_name), + converter.from_bytes(write_buffer.str()), + Lucene::Field::STORE_YES, + Lucene::Field::INDEX_NOT_ANALYZED)); } - std::cout << "Lucene inserted row[" << i << "]" <addDocument(doc); - } - if (rows > 0) { - writer->optimize(); + else + { + elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); + doc->add(Lucene::newLucene( + converter.from_bytes(column_name), + converter.from_bytes(write_buffer.str()), + Lucene::Field::STORE_YES, + Lucene::Field::INDEX_ANALYZED)); + } + ++idx; } - writer->close(); - } else { - throw Exception( - "Inserts need all columns", - ErrorCodes::NOT_IMPLEMENTED); + std::cout << "Lucene inserted row[" << i << "]" << std::endl; + writer->addDocument(doc); } + if (rows > 0) + { + writer->optimize(); + } + writer->close(); storage.total_size_bytes.fetch_add(size_bytes_diff, std::memory_order_relaxed); storage.total_size_rows.fetch_add(size_rows_diff, std::memory_order_relaxed); @@ -214,7 +198,7 @@ class LuceneBlockOutputStream : public IBlockOutputStream private: StorageLucene & storage; StorageMetadataPtr metadata_snapshot; - size_t primary_id_pos = 0; +// size_t primary_id_pos = 0; }; @@ -269,8 +253,6 @@ Pipe StorageLucene::read( String query_text = function->arguments->children[0]->as().value.safeGet(); - - //Poco::File(this->index_path) std::wstring_convert> converter; 
Lucene::String index_path_ws = converter.from_bytes(index_path); Lucene::FSDirectoryPtr dir = Lucene::FSDirectory::open(index_path_ws); @@ -307,9 +289,6 @@ bool StorageLucene::optimize( const Context & /*context*/) { std::cerr << "Running optimize" << std::endl; -// if (this->writer) { -// this->writer->optimize(); -// } return false; } @@ -319,6 +298,10 @@ void StorageLucene::truncate( const Context & /* context */, TableExclusiveLockHolder &) { + std::cout << "StorageLucene is truncate" << std::endl; + Poco::File(this->index_path).remove(true); + Poco::File(this->index_path).createDirectories(); + // TODO: init lucene index files } @@ -339,18 +322,16 @@ void StorageLucene::startup() return; } +// when drop table is called void StorageLucene::shutdown() { -// if (this->reader) { -// this->reader->close(); -// } -// if (this->writer) { -// this->writer->close(); -// } + std::cout << "StorageLucene is shutdown" << std::endl; + Poco::File(index_path).remove(true); return; } void StorageLucene::drop() { + std::cout << "StorageLucene is dropped" << std::endl; Poco::File(index_path).remove(true); return; } From f523ccb63af69ddc1dad4bcc7e1c9fbc360952de Mon Sep 17 00:00:00 2001 From: madianjun Date: Mon, 19 Jul 2021 17:14:07 +0800 Subject: [PATCH 09/12] Use relative path to parameter --- src/Storages/StorageLucene.cpp | 94 +++++++++++++++++++--------------- src/Storages/StorageLucene.h | 20 ++++++-- 2 files changed, 70 insertions(+), 44 deletions(-) diff --git a/src/Storages/StorageLucene.cpp b/src/Storages/StorageLucene.cpp index a0e1d709e655..41f500675df2 100644 --- a/src/Storages/StorageLucene.cpp +++ b/src/Storages/StorageLucene.cpp @@ -4,6 +4,8 @@ #include #include + +#include #include #include @@ -34,6 +36,7 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int INCORRECT_FILE_NAME; } @@ -57,7 +60,6 @@ class LuceneSource : public SourceWithProgress this->reader = Lucene::IndexReader::open(dir_, true); std::cout 
<< "Opened lucene index path: " << std::endl; - this->searcher = Lucene::newLucene(this->reader); Lucene::AnalyzerPtr analyzer = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); Lucene::QueryParserPtr parser = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, L"body", analyzer); @@ -68,9 +70,11 @@ class LuceneSource : public SourceWithProgress this->hits = collector->topDocs()->scoreDocs; } -// ~LuceneSource() override { -// this->reader->close(); -// } + ~LuceneSource() override { + this->searcher->close(); + this->reader->close(); + } + String getName() const override { return "Lucene"; } protected: @@ -142,10 +146,11 @@ class LuceneBlockOutputStream : public IBlockOutputStream std::wstring_convert> converter; Lucene::String index_path_ws = converter.from_bytes(storage.index_path); + // create a new index if there is not already an index at the provided path + // and otherwise open the existing index. Lucene::IndexWriterPtr writer = Lucene::newLucene( Lucene::FSDirectory::open(index_path_ws), Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), - true, Lucene::IndexWriter::MaxFieldLengthLIMITED); auto rows = block.rows(); @@ -202,17 +207,26 @@ class LuceneBlockOutputStream : public IBlockOutputStream }; -StorageLucene::StorageLucene(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_) - : IStorage(table_id_), index_path(index_path_), log(&Poco::Logger::get("StorageLucene (" + table_id_.table_name + ")")) +StorageLucene::StorageLucene(const std::string & relative_table_dir_path, CommonArguments args) + : StorageLucene(args) { - StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(std::move(columns_description_)); - storage_metadata.setConstraints(std::move(constraints_)); - setInMemoryMetadata(storage_metadata); + if (relative_table_dir_path.empty()) + throw Exception("Storage " + getName() + " requires data path", 
ErrorCodes::INCORRECT_FILE_NAME); - Poco::File(index_path + "/").createDirectories(); + std::cout << "StorageLucene base_path:" << base_path << ", relative_table_dir_path:" << relative_table_dir_path << std::endl; + this->index_path = base_path + relative_table_dir_path + "/"; + Poco::File(this->index_path).createDirectories(); } +StorageLucene::StorageLucene(CommonArguments args) + : IStorage(args.table_id) + , base_path(args.context.getPath()) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(args.columns); + storage_metadata.setConstraints(args.constraints); + setInMemoryMetadata(storage_metadata); +} Pipe StorageLucene::read( const Names & column_names, @@ -317,38 +331,38 @@ std::optional StorageLucene::totalBytes(const Settings &) const return total_size_bytes.load(std::memory_order_relaxed); } -void StorageLucene::startup() -{ - return; -} - -// when drop table is called -void StorageLucene::shutdown() -{ - std::cout << "StorageLucene is shutdown" << std::endl; - Poco::File(index_path).remove(true); - return; -} - -void StorageLucene::drop() { - std::cout << "StorageLucene is dropped" << std::endl; - Poco::File(index_path).remove(true); - return; -} +//void StorageLucene::startup() +//{ +// return; +//} + +// when "DROP TABLE" is called, or clickhouse-server is shutdown +//void StorageLucene::shutdown() +//{ +// std::cout << "StorageLucene is shutdown" << std::endl; +// Poco::File(index_path).remove(true); +// return; +//} + +//void StorageLucene::drop() { +// std::cout << "StorageLucene is dropped" << std::endl; +// Poco::File(index_path).remove(true); +// return; +//} void registerStorageLucene(StorageFactory & factory) { - factory.registerStorage("Lucene", [](const StorageFactory::Arguments & args) + factory.registerStorage("Lucene", [](const StorageFactory::Arguments & factory_args) { - if (args.engine_args.size() != 1) - throw Exception( - "Engine " + args.engine_name + " needs the data path argument", - 
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - String index_path = args.engine_args[0]->as().value.safeGet(); - - return StorageLucene::create(args.table_id, args.columns, args.constraints, index_path); - }); + StorageLucene::CommonArguments storage_args{ + .table_id = factory_args.table_id, + .columns = factory_args.columns, + .constraints = factory_args.constraints, + .context = factory_args.context + }; + + return StorageLucene::create(factory_args.relative_data_path, storage_args); + }); } } diff --git a/src/Storages/StorageLucene.h b/src/Storages/StorageLucene.h index 255347811429..acc341ba3e8d 100644 --- a/src/Storages/StorageLucene.h +++ b/src/Storages/StorageLucene.h @@ -47,8 +47,8 @@ friend class LuceneBlockOutputStream; size_t max_block_size, unsigned num_streams) override; - void startup() override; - void shutdown() override; +// void startup() override; +// void shutdown() override; bool supportsParallelInsert() const override { return false; } @@ -69,14 +69,23 @@ friend class LuceneBlockOutputStream; const Context & context, TableExclusiveLockHolder &) override; - void drop() override; +// void drop() override; bool supportsSampling() const override { return false; } std::optional totalRows(const Settings &) const override; std::optional totalBytes(const Settings &) const override; + struct CommonArguments + { + StorageID table_id; + const ColumnsDescription & columns; + const ConstraintsDescription & constraints; + const Context & context; + }; + private: + String base_path; /// The data itself. `list` - so that when inserted to the end, the existing iterators are not invalidated. 
BlocksList data; String index_path; @@ -88,7 +97,10 @@ friend class LuceneBlockOutputStream; Poco::Logger * log; protected: - StorageLucene(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_, const String & index_path_); + StorageLucene(const std::string & relative_table_dir_path, CommonArguments args); + +private: + explicit StorageLucene(CommonArguments args); }; } From 39deeb4c81854455b66bbc9f7f5340b3589f2502 Mon Sep 17 00:00:00 2001 From: madianjun Date: Mon, 19 Jul 2021 19:01:43 +0800 Subject: [PATCH 10/12] Add matchAllQuery and limit --- src/Storages/StorageLucene.cpp | 82 +++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/Storages/StorageLucene.cpp b/src/Storages/StorageLucene.cpp index 41f500675df2..4296811acf34 100644 --- a/src/Storages/StorageLucene.cpp +++ b/src/Storages/StorageLucene.cpp @@ -48,26 +48,39 @@ class LuceneSource : public SourceWithProgress const StorageLucene & storage_, const StorageMetadataPtr & metadata_snapshot_, const String & query_text_, - //const UInt64 limit_, - Lucene::FSDirectoryPtr dir_) + const Int32 limit_, + Lucene::FSDirectoryPtr index_dir_) : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns(column_names_, storage_.getVirtuals(), storage_.getStorageID())), column_names(std::move(column_names_)), metadata_snapshot(metadata_snapshot_), - query_text(std::move(query_text_)) - //limit(limit_), + query_text(std::move(query_text_)), + limit(limit_) { - - this->reader = Lucene::IndexReader::open(dir_, true); - std::cout << "Opened lucene index path: " << std::endl; + this->reader = Lucene::IndexReader::open(index_dir_, true); + std::cout << "Opened lucene index path" << std::endl; this->searcher = Lucene::newLucene(this->reader); - Lucene::AnalyzerPtr analyzer = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); - Lucene::QueryParserPtr parser = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, 
L"body", analyzer); - Lucene::QueryPtr query = parser->parse(Lucene::String(query_text.begin(), query_text.end())); - std::cout << "Search for: " << query_text << std::endl; - Lucene::TopScoreDocCollectorPtr collector = Lucene::TopScoreDocCollector::create(500, false); + Lucene::QueryPtr query; + if (!this->query_text.empty()) + { + Lucene::AnalyzerPtr analyzer = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); + Lucene::QueryParserPtr parser + = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, L"body", analyzer); + std::wstring_convert> converter; + query = parser->parse(converter.from_bytes(query_text)); + + std::cout << "Search query_text: " << query_text << std::endl; + } + else + { + query = Lucene::newLucene(); + std::cout << "Search all docs" << std::endl; + } + // Sort, use TopFieldCollector + Lucene::TopScoreDocCollectorPtr collector = Lucene::TopScoreDocCollector::create(limit, false); searcher->search(query, collector); this->hits = collector->topDocs()->scoreDocs; + } ~LuceneSource() override { @@ -114,7 +127,7 @@ class LuceneSource : public SourceWithProgress const StorageMetadataPtr metadata_snapshot; size_t current_block_idx = 0; const String query_text; - //UInt64 limit; + Int32 limit; Lucene::IndexReaderPtr reader; Lucene::SearcherPtr searcher; Lucene::Collection hits; @@ -213,8 +226,8 @@ StorageLucene::StorageLucene(const std::string & relative_table_dir_path, Common if (relative_table_dir_path.empty()) throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME); - std::cout << "StorageLucene base_path:" << base_path << ", relative_table_dir_path:" << relative_table_dir_path << std::endl; this->index_path = base_path + relative_table_dir_path + "/"; + std::cout << "StorageLucene index_path:" << this->index_path << std::endl; Poco::File(this->index_path).createDirectories(); } @@ -239,38 +252,37 @@ Pipe StorageLucene::read( { metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); 
+ String query_text; + Int32 limit = 10000; const ASTSelectQuery & select = query_info.query->as(); const ASTPtr & where = select.where(); - if (!where) - { - throw Exception( - "Missing WHERE clause", - ErrorCodes::NOT_IMPLEMENTED); - } - const auto * function = where->as(); - if (function->name != "lucene") + if (where) { - throw Exception( - "WHERE clause should contain only lucene function", - ErrorCodes::NOT_IMPLEMENTED); - } + const auto * function = where->as(); + if (function->name != "lucene") + { + throw Exception("WHERE clause should contain only lucene function", ErrorCodes::NOT_IMPLEMENTED); + } - UInt64 limit = 1000000UL; + if (function->arguments->children.size() >= 1) + { + query_text = function->arguments->children[0]->as().value.safeGet(); + } - if (function->arguments->children.size() == 2) - { - if (function->arguments->children[1]->as()) + if (function->arguments->children.size() >= 2) { + if (function->arguments->children[1]->as()) + { limit = function->arguments->children[1]->as().value.safeGet(); + } } } - String query_text = function->arguments->children[0]->as().value.safeGet(); std::wstring_convert> converter; Lucene::String index_path_ws = converter.from_bytes(index_path); - Lucene::FSDirectoryPtr dir = Lucene::FSDirectory::open(index_path_ws); - if (dir->listAll().empty()) + Lucene::FSDirectoryPtr index_dir = Lucene::FSDirectory::open(index_path_ws); + if (index_dir->listAll().empty()) { std::cout << "No files in lucene index path: " << this->index_path << std::endl; return {}; @@ -283,8 +295,8 @@ Pipe StorageLucene::read( *this, metadata_snapshot, query_text, - //limit, - dir + limit, + index_dir )); } From ab298dda0db36e717e4d8af8e10167c020abe9d2 Mon Sep 17 00:00:00 2001 From: madianjun Date: Tue, 20 Jul 2021 13:31:54 +0800 Subject: [PATCH 11/12] Add mutiple fields query --- src/Storages/StorageLucene.cpp | 42 ++++++++++++---------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git 
a/src/Storages/StorageLucene.cpp b/src/Storages/StorageLucene.cpp index 4296811acf34..6daea2516b4d 100644 --- a/src/Storages/StorageLucene.cpp +++ b/src/Storages/StorageLucene.cpp @@ -64,9 +64,15 @@ class LuceneSource : public SourceWithProgress if (!this->query_text.empty()) { Lucene::AnalyzerPtr analyzer = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); - Lucene::QueryParserPtr parser - = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, L"body", analyzer); std::wstring_convert> converter; + Lucene::Collection fields = Lucene::Collection::newInstance(column_names.size()); + for (size_t i = 0; i < column_names.size(); ++i) + { + fields[i] = (converter.from_bytes(column_names[i])); + } + + Lucene::QueryParserPtr parser + = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, fields, analyzer); query = parser->parse(converter.from_bytes(query_text)); std::cout << "Search query_text: " << query_text << std::endl; @@ -150,13 +156,6 @@ class LuceneBlockOutputStream : public IBlockOutputStream const auto size_rows_diff = block.rows(); metadata_snapshot->check(block, true); { - if (block.columns() != 3) { - throw Exception( - "Inserts need all columns", - ErrorCodes::NOT_IMPLEMENTED); - } - - std::wstring_convert> converter; Lucene::String index_path_ws = converter.from_bytes(storage.index_path); // create a new index if there is not already an index at the provided path @@ -179,25 +178,13 @@ class LuceneBlockOutputStream : public IBlockOutputStream { write_buffer.restart(); auto column_name = block.safeGetByPosition(idx).name; + elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); + doc->add(Lucene::newLucene( + converter.from_bytes(column_name), + converter.from_bytes(write_buffer.str()), + Lucene::Field::STORE_YES, + Lucene::Field::INDEX_ANALYZED)); - if (idx < block.columns() - 1) - { - elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); - doc->add(Lucene::newLucene( - 
converter.from_bytes(column_name), - converter.from_bytes(write_buffer.str()), - Lucene::Field::STORE_YES, - Lucene::Field::INDEX_NOT_ANALYZED)); - } - else - { - elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); - doc->add(Lucene::newLucene( - converter.from_bytes(column_name), - converter.from_bytes(write_buffer.str()), - Lucene::Field::STORE_YES, - Lucene::Field::INDEX_ANALYZED)); - } ++idx; } std::cout << "Lucene inserted row[" << i << "]" << std::endl; @@ -216,7 +203,6 @@ class LuceneBlockOutputStream : public IBlockOutputStream private: StorageLucene & storage; StorageMetadataPtr metadata_snapshot; -// size_t primary_id_pos = 0; }; From f03f226407a8aaf50d9ea1fb6a70e8810d4e7d23 Mon Sep 17 00:00:00 2001 From: Allen Zhang <1099905725@qq.com> Date: Wed, 22 Sep 2021 13:59:39 +0800 Subject: [PATCH 12/12] 1.Implement lucene table DDL 2.Support some lucene parameters 3.Support 4 lucene recognition analyzers --- src/CMakeLists.txt | 2 + src/Common/ErrorCodes.cpp | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 40 ++++++++++ src/LuceneAnalyzer/AnalyzerFactory.cpp | 46 +++++++++++ src/LuceneAnalyzer/AnalyzerFactory.h | 30 ++++++++ src/LuceneAnalyzer/CMakeLists.txt | 0 src/LuceneAnalyzer/ya.make | 17 +++++ src/LuceneAnalyzer/ya.make.in | 16 ++++ src/Parsers/ASTColumnDeclaration.cpp | 52 +++++++++++++ src/Parsers/ASTColumnDeclaration.h | 5 ++ src/Parsers/CommonParsers.h | 3 +- src/Parsers/ExpressionElementParsers.cpp | 37 +++++++++ src/Parsers/ExpressionElementParsers.h | 18 +++++ src/Parsers/ParserCreateQuery.h | 84 +++++++++++++++++++++ src/Storages/ColumnsDescription.h | 5 ++ src/Storages/StorageLucene.cpp | 65 ++++++++++++++-- src/ya.make | 1 + 17 files changed, 414 insertions(+), 8 deletions(-) create mode 100644 src/LuceneAnalyzer/AnalyzerFactory.cpp create mode 100644 src/LuceneAnalyzer/AnalyzerFactory.h create mode 100644 src/LuceneAnalyzer/CMakeLists.txt create mode 100644 src/LuceneAnalyzer/ya.make create mode 100644 
src/LuceneAnalyzer/ya.make.in diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e2c5bd579db..83e28663b40e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -51,6 +51,7 @@ add_subdirectory (Storages) add_subdirectory (Parsers) add_subdirectory (Parsers/New) add_subdirectory (IO) +add_subdirectory (LuceneAnalyzer) add_subdirectory (Functions) add_subdirectory (Interpreters) add_subdirectory (AggregateFunctions) @@ -170,6 +171,7 @@ add_object_library(clickhouse_access Access) add_object_library(clickhouse_core Core) add_object_library(clickhouse_core_mysql Core/MySQL) add_object_library(clickhouse_compression Compression) +add_object_library(clickhouse_lucene_analyzer LuceneAnalyzer) add_object_library(clickhouse_datastreams DataStreams) add_object_library(clickhouse_datatypes DataTypes) add_object_library(clickhouse_databases Databases) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index fa921ef7c1c6..9f357019382d 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -541,6 +541,7 @@ M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \ M(573, EPOLL_ERROR) \ M(574, DISTRIBUTED_TOO_MANY_PENDING_BYTES) \ + M(801, UNKNOWN_LUCENE_ANLYZER) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index fd4ead58c1fc..17a16952ff88 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -333,6 +334,21 @@ ASTPtr InterpreterCreateQuery::formatColumns(const ColumnsDescription & columns) if (column.ttl) column_declaration->ttl = column.ttl; + if (column.store_modifier) + column_declaration->store_modifier = column.store_modifier; + + if (column.index_modifier) + column_declaration->index_modifier = column.index_modifier; + + if (column.termvector_modifier) + column_declaration->termvector_modifier = 
column.termvector_modifier; + + if (column.analyzer) + column_declaration->analyzer=column.analyzer; + + if (column.search_analyzer) + column_declaration->search_analyzer=column.search_analyzer; + columns_list->children.push_back(column_declaration_ptr); } @@ -474,6 +490,30 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.ttl) column.ttl = col_decl.ttl; + if (col_decl.store_modifier) + column.store_modifier = col_decl.store_modifier; + + if (col_decl.index_modifier) + column.index_modifier = col_decl.index_modifier; + + if (col_decl.termvector_modifier) + column.termvector_modifier = col_decl.termvector_modifier; + + if (col_decl.analyzer) + { + auto& name = col_decl.analyzer->children[0]->children[0]->as()->name; + AnalyzerFactory::instance().validate(name); + + column.analyzer = col_decl.analyzer; + } + + if (col_decl.search_analyzer) + { + auto& name = col_decl.search_analyzer->children[0]->children[0]->as()->name; + AnalyzerFactory::instance().validate(name); + column.search_analyzer = col_decl.search_analyzer; + } + res.add(std::move(column)); } diff --git a/src/LuceneAnalyzer/AnalyzerFactory.cpp b/src/LuceneAnalyzer/AnalyzerFactory.cpp new file mode 100644 index 000000000000..561f73b30e5f --- /dev/null +++ b/src/LuceneAnalyzer/AnalyzerFactory.cpp @@ -0,0 +1,46 @@ +#include +#include +#include + +namespace DB{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_LUCENE_ANLYZER; +} + +LuceneAnalyzerPair genLuceneAnalyzers() +{ + static LuceneAnalyzerPair analyzers; + analyzers["STANDARDANALYZER"] = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); + // TODO: stop words + analyzers["STOPANALYZER"] = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); + analyzers["WHITESPACEANALYZER"] = Lucene::newLucene(); + analyzers["SIMPLEANALYZER"] = Lucene::newLucene(); + return analyzers; +} + +const LuceneAnalyzerPair AnalyzerFactory::analyzers = genLuceneAnalyzers(); + +void AnalyzerFactory::validate(const String& name) const +{ 
+ + auto name_u = Poco::toUpper(name); + if (analyzers.find(name_u) == analyzers.end()) + { + throw Exception("Unknown LuceneAnalyzer family analyzer: " + name, ErrorCodes::UNKNOWN_LUCENE_ANLYZER); + } +} + +AnalyzerFactory::AnalyzerFactory() +{ +} + +AnalyzerFactory& AnalyzerFactory::instance() +{ + static AnalyzerFactory ret; + return ret; +} + +} + diff --git a/src/LuceneAnalyzer/AnalyzerFactory.h b/src/LuceneAnalyzer/AnalyzerFactory.h new file mode 100644 index 000000000000..f2aa11bb551e --- /dev/null +++ b/src/LuceneAnalyzer/AnalyzerFactory.h @@ -0,0 +1,30 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +using LuceneAnalyzerPair = std::unordered_map; + +class AnalyzerFactory +{ +public: + + static AnalyzerFactory & instance(); + + const char* getDefaultAnalyzer() const; + + /// Validate codecs AST specified by user + void validate(const String & analyzer_name) const; + +public: + static const LuceneAnalyzerPair analyzers; + +private: + AnalyzerFactory(); +}; + +} + diff --git a/src/LuceneAnalyzer/CMakeLists.txt b/src/LuceneAnalyzer/CMakeLists.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/src/LuceneAnalyzer/ya.make b/src/LuceneAnalyzer/ya.make new file mode 100644 index 000000000000..0ea23e229160 --- /dev/null +++ b/src/LuceneAnalyzer/ya.make @@ -0,0 +1,17 @@ +# This file is generated automatically, do not edit. See 'ya.make.in' and use 'utils/generate-ya-make' to regenerate it. 
+OWNER(g:clickhouse) + +LIBRARY() + + +PEERDIR( + clickhouse/src/Common + contrib/LucenePlusPlus +) + + +SRCS( + AnalyzerFactory.cpp +) + +END() diff --git a/src/LuceneAnalyzer/ya.make.in b/src/LuceneAnalyzer/ya.make.in new file mode 100644 index 000000000000..b4c074a6df5e --- /dev/null +++ b/src/LuceneAnalyzer/ya.make.in @@ -0,0 +1,16 @@ +OWNER(g:clickhouse) + +LIBRARY() + + +PEERDIR( + clickhouse/src/Common + contrib/LucenePlusPlus +) + + +SRCS( + +) + +END() diff --git a/src/Parsers/ASTColumnDeclaration.cpp b/src/Parsers/ASTColumnDeclaration.cpp index 4c14230e926b..ee582f4b5c96 100644 --- a/src/Parsers/ASTColumnDeclaration.cpp +++ b/src/Parsers/ASTColumnDeclaration.cpp @@ -43,6 +43,28 @@ ASTPtr ASTColumnDeclaration::clone() const res->children.push_back(res->ttl); } + if (store_modifier) { + res->store_modifier = store_modifier; + } + + if (index_modifier) { + res->index_modifier = index_modifier; + } + + if (termvector_modifier) { + res->termvector_modifier = termvector_modifier; + } + + if (analyzer) + { + res->analyzer = analyzer; + } + + if (search_analyzer) + { + res->search_analyzer = search_analyzer; + } + return res; } @@ -92,6 +114,36 @@ void ASTColumnDeclaration::formatImpl(const FormatSettings & settings, FormatSta settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") << "TTL" << (settings.hilite ? hilite_none : "") << ' '; ttl->formatImpl(settings, state, frame); } + + if (store_modifier) + { + settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") + << (*store_modifier ? "STORE" : "NOT_STORE") << (settings.hilite ? hilite_none : ""); + } + + if (index_modifier) + { + settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") + << (*index_modifier ? "INDEX" : "NOT_INDEX") << (settings.hilite ? hilite_none : ""); + } + + if (termvector_modifier) + { + settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") + << (*termvector_modifier ? "TERMVECTOR" : "NOT_TERMVECTOR") << (settings.hilite ? 
hilite_none : ""); + } + + if (analyzer) + { + settings.ostr << ' '; + analyzer->formatImpl(settings, state, frame); + } + + if (search_analyzer) + { + settings.ostr << ' '; + search_analyzer->formatImpl(settings, state, frame); + } } } diff --git a/src/Parsers/ASTColumnDeclaration.h b/src/Parsers/ASTColumnDeclaration.h index ea17a8b4dfa3..8c512bf8684a 100644 --- a/src/Parsers/ASTColumnDeclaration.h +++ b/src/Parsers/ASTColumnDeclaration.h @@ -19,6 +19,11 @@ class ASTColumnDeclaration : public IAST ASTPtr comment; ASTPtr codec; ASTPtr ttl; + std::optional store_modifier; + std::optional index_modifier; + std::optional termvector_modifier; + ASTPtr analyzer; + ASTPtr search_analyzer; String getID(char delim) const override { return "ColumnDeclaration" + (delim + name); } diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index 85b5217b617f..d8cf89ca3726 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -17,10 +17,9 @@ class ParserKeyword : public IParserBase public: ParserKeyword(const char * s_); - -protected: const char * getName() const override; +protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 7a426e7774d3..2dd691397d33 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -815,6 +815,13 @@ bool ParserCodecDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & std::make_unique(TokenType::Comma), false).parse(pos, node, expected); } +bool ParserAnalyzerDeclarationList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + // TODO:: analyzer family + return ParserList(std::make_unique(), + std::make_unique(TokenType::Comma), false).parse(pos, node, expected); +} + bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserCodecDeclarationList codecs; @@ -840,6 +847,36 @@ bool 
ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; } +ParserAnalyzer::ParserAnalyzer(const char* s_):s(s_) +{ + +} + +bool ParserAnalyzer::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + ParserAnalyzerDeclarationList analyzer; + ASTPtr expr_list_args; + + if (pos->type != TokenType::OpeningRoundBracket) + return false; + + ++pos; + if (!analyzer.parse(pos, expr_list_args, expected)) + return false; + + if (pos->type != TokenType::ClosingRoundBracket) + return false; + ++pos; + + auto function_node = std::make_shared(); + function_node->name = s; + function_node->arguments = expr_list_args; + function_node->children.push_back(function_node->arguments); + + node = function_node; + return true; +} + bool ParserCastExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { /// Either CAST(expr AS type) or CAST(expr, 'type') diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index b6194f981fec..d816dc041ef6 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -199,6 +199,13 @@ class ParserCodecDeclarationList : public IParserBase bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserAnalyzerDeclarationList : public IParserBase +{ +protected: + const char * getName() const override { return "codec declaration list"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + /** Parse compression codec * CODEC(ZSTD(2)) */ @@ -209,6 +216,17 @@ class ParserCodec : public IParserBase bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserAnalyzer : public IParserBase +{ +private: + const char * s; +public: + ParserAnalyzer(const char * s_); +protected: + const char * getName() const override { return "analyzer"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + class ParserCastExpression : public IParserBase { 
protected: diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index fbdc308d5bcd..33cc50854a99 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -125,10 +125,22 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ParserKeyword s_comment{"COMMENT"}; ParserKeyword s_codec{"CODEC"}; ParserKeyword s_ttl{"TTL"}; + ParserKeyword s_store{"STORE"}; + ParserKeyword s_not_store{"NOT_STORE"}; + ParserKeyword s_index{"INDEX"}; + ParserKeyword s_not_index{"NOT_INDEX"}; + ParserKeyword s_termvector{"TERMVECTOR"}; + ParserKeyword s_not_termvector{"NOT_TERMVECTOR"}; + ParserKeyword s_analyzer{"ANALYZER"}; + ParserKeyword s_search_analyzer{"SEARCH_ANALYZER"}; ParserKeyword s_remove{"REMOVE"}; ParserTernaryOperatorExpression expr_parser; ParserStringLiteral string_literal_parser; ParserCodec codec_parser; + // ParserAnalyzer analyzer_parser{"ANALYZER"}; + // ParserAnalyzer search_analyzer_parser{"SEARCH_ANALYZER"}; + ParserAnalyzer analyzer_parser{s_analyzer.getName()}; + ParserAnalyzer search_analyzer_parser{s_search_analyzer.getName()}; ParserExpression expression_parser; /// mandatory column name @@ -164,6 +176,11 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ASTPtr comment_expression; ASTPtr codec_expression; ASTPtr ttl_expression; + std::optional store_modifier; + std::optional index_modifier; + std::optional termvector_modifier; + ASTPtr analyzer_expression; + ASTPtr search_analyzer_expression; if (!s_default.checkWithoutMoving(pos, expected) && !s_materialized.checkWithoutMoving(pos, expected) @@ -220,6 +237,57 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E return false; } + if (s_store.ignore(pos, expected)) + { + store_modifier.emplace(true); + } + if (s_not_store.ignore(pos, expected)) + { + if (store_modifier) + { + return false; + } + store_modifier.emplace(false); + } + + if (s_index.ignore(pos, expected)) + { + 
index_modifier.emplace(true); + } + if (s_not_index.ignore(pos, expected)) + { + if (index_modifier) + { + return false; + } + index_modifier.emplace(false); + } + + if (s_termvector.ignore(pos, expected)) + { + termvector_modifier.emplace(true); + } + if (s_not_termvector.ignore(pos, expected)) + { + if (termvector_modifier) + { + return false; + } + termvector_modifier.emplace(false); + } + + if (s_analyzer.ignore(pos, expected)) + { + if (!analyzer_parser.parse(pos, analyzer_expression, expected)) + return false; + } + + if (s_search_analyzer.ignore(pos, expected)) + { + if (!search_analyzer_parser.parse(pos, search_analyzer_expression, expected)) + return false; + } + node = column_declaration; if (type) @@ -255,6 +323,22 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E column_declaration->children.push_back(std::move(ttl_expression)); } + column_declaration->store_modifier = store_modifier; + column_declaration->index_modifier = index_modifier; + column_declaration->termvector_modifier = termvector_modifier; + + if (analyzer_expression) + { + column_declaration->analyzer = analyzer_expression; + column_declaration->children.push_back(std::move(analyzer_expression)); + } + + if (search_analyzer_expression) + { + column_declaration->search_analyzer = search_analyzer_expression; + column_declaration->children.push_back(std::move(search_analyzer_expression)); + } + return true; } diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 26e300045447..d307f3583a55 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -33,6 +33,11 @@ struct ColumnDescription String comment; ASTPtr codec; ASTPtr ttl; + std::optional store_modifier; + std::optional index_modifier; + std::optional termvector_modifier; + ASTPtr analyzer; + ASTPtr search_analyzer; ColumnDescription() = default; ColumnDescription(ColumnDescription &&) = default; diff --git a/src/Storages/StorageLucene.cpp 
b/src/Storages/StorageLucene.cpp index 6daea2516b4d..7824a9cda0ce 100644 --- a/src/Storages/StorageLucene.cpp +++ b/src/Storages/StorageLucene.cpp @@ -8,6 +8,8 @@ #include #include +#include + #include #include @@ -39,6 +41,7 @@ namespace ErrorCodes extern const int INCORRECT_FILE_NAME; } +using LuceneConfig = std::unordered_map>; class LuceneSource : public SourceWithProgress { @@ -59,12 +62,33 @@ class LuceneSource : public SourceWithProgress this->reader = Lucene::IndexReader::open(index_dir_, true); std::cout << "Opened lucene index path" << std::endl; + std::wstring_convert> converter; + auto& columns = metadata_snapshot->getColumns(); + for(auto& column : columns) { + auto analyzer = (column.analyzer) ? column.analyzer->children[0]->children[0]->as()->name : "StandardAnalyzer"; + auto search_analyzer = (column.search_analyzer) ? column.search_analyzer->children[0]->children[0]->as()->name : analyzer; + search_analyzer = Poco::toUpper(search_analyzer); + configs[column.name] = std::make_tuple(false, false, false, "", search_analyzer); + } + auto fieldAnalyzers = Lucene::MapStringAnalyzer::newInstance(); + for(auto& config: configs) + { + auto& col_name = config.first; + auto& ana_name = std::get<4>(config.second); + Lucene::String col_name_ws = converter.from_bytes(col_name); + auto ana_name_up = Poco::toUpper(ana_name); + fieldAnalyzers.put(col_name_ws, AnalyzerFactory::analyzers.at(ana_name_up)); + } + Lucene::PerFieldAnalyzerWrapperPtr aWrapper = + Lucene::newLucene( + Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + fieldAnalyzers); + + this->searcher = Lucene::newLucene(this->reader); Lucene::QueryPtr query; if (!this->query_text.empty()) { - Lucene::AnalyzerPtr analyzer = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT); - std::wstring_convert> converter; Lucene::Collection fields = Lucene::Collection::newInstance(column_names.size()); for (size_t i = 0; i < column_names.size(); ++i) { @@ -72,7 +96,7 @@ class LuceneSource : public 
SourceWithProgress } Lucene::QueryParserPtr parser - = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, fields, analyzer); + = Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT, fields, aWrapper); query = parser->parse(converter.from_bytes(query_text)); std::cout << "Search query_text: " << query_text << std::endl; @@ -137,6 +161,7 @@ class LuceneSource : public SourceWithProgress Lucene::IndexReaderPtr reader; Lucene::SearcherPtr searcher; Lucene::Collection hits; + LuceneConfig configs; }; class LuceneBlockOutputStream : public IBlockOutputStream @@ -148,6 +173,15 @@ class LuceneBlockOutputStream : public IBlockOutputStream : storage(storage_) , metadata_snapshot(metadata_snapshot_) { + auto& columns = metadata_snapshot->getColumns(); + for(auto& column : columns) { + bool store = (column.store_modifier) ? *column.store_modifier: false; + bool index = (column.index_modifier) ? *column.index_modifier: true; + bool termvector = (column.termvector_modifier) ? *column.termvector_modifier: false; + auto analyzer = (column.analyzer) ? column.analyzer->children[0]->children[0]->as()->name : "StandardAnalyzer"; + analyzer = Poco::toUpper(analyzer); + configs[column.name] = std::make_tuple(store, index, termvector, analyzer, ""); + } } Block getHeader() const override { return metadata_snapshot->getSampleBlock(); } void write(const Block & block) override @@ -160,9 +194,21 @@ class LuceneBlockOutputStream : public IBlockOutputStream Lucene::String index_path_ws = converter.from_bytes(storage.index_path); // create a new index if there is not already an index at the provided path // and otherwise open the existing index. 
+ auto fieldAnalyzers = Lucene::MapStringAnalyzer::newInstance(); + for(auto& config: configs) + { + auto& col_name = config.first; + auto& ana_name_up = std::get<3>(config.second); + Lucene::String col_name_ws = converter.from_bytes(col_name); + fieldAnalyzers.put(col_name_ws, AnalyzerFactory::analyzers.at(ana_name_up)); + } + Lucene::PerFieldAnalyzerWrapperPtr aWrapper = + Lucene::newLucene( + Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + fieldAnalyzers); Lucene::IndexWriterPtr writer = Lucene::newLucene( Lucene::FSDirectory::open(index_path_ws), - Lucene::newLucene(Lucene::LuceneVersion::LUCENE_CURRENT), + aWrapper, Lucene::IndexWriter::MaxFieldLengthLIMITED); auto rows = block.rows(); @@ -178,12 +224,18 @@ class LuceneBlockOutputStream : public IBlockOutputStream { write_buffer.restart(); auto column_name = block.safeGetByPosition(idx).name; + // TODO: Optimize code structure + auto config = configs[column_name]; + auto store = std::get<0>(config); + auto index = std::get<1>(config); + auto termvector = std::get<2>(config); elem.type->serializeAsText(*elem.column, i, write_buffer, FormatSettings()); doc->add(Lucene::newLucene( converter.from_bytes(column_name), converter.from_bytes(write_buffer.str()), - Lucene::Field::STORE_YES, - Lucene::Field::INDEX_ANALYZED)); + store ? Lucene::Field::STORE_YES : Lucene::Field::STORE_NO, + index ? Lucene::Field::INDEX_ANALYZED : Lucene::Field::INDEX_NOT_ANALYZED, + termvector ? 
Lucene::Field::TERM_VECTOR_YES : Lucene::Field::TERM_VECTOR_NO)); ++idx; } @@ -203,6 +255,7 @@ class LuceneBlockOutputStream : public IBlockOutputStream private: StorageLucene & storage; StorageMetadataPtr metadata_snapshot; + LuceneConfig configs; }; diff --git a/src/ya.make b/src/ya.make index 5361c8a56953..92ec51d83ec0 100644 --- a/src/ya.make +++ b/src/ya.make @@ -20,6 +20,7 @@ PEERDIR( clickhouse/src/Functions clickhouse/src/Interpreters clickhouse/src/IO + clickhouse/src/LuceneAnalyzer clickhouse/src/Parsers clickhouse/src/Processors clickhouse/src/Server