From 548c6f702b4be0f10ae43b135418a53dfb1c9212 Mon Sep 17 00:00:00 2001 From: nicolaprezza Date: Mon, 26 Apr 2021 12:48:41 +0200 Subject: [PATCH 1/2] added option -i to h0_lz77 --- h0_lz77.cpp | 102 ++++++++++++----- include/dynamic/algorithms/h0_lz77.hpp | 151 ++++++++++++++++++++++--- 2 files changed, 209 insertions(+), 44 deletions(-) diff --git a/h0_lz77.cpp b/h0_lz77.cpp index 2c44eed..03825d9 100644 --- a/h0_lz77.cpp +++ b/h0_lz77.cpp @@ -23,65 +23,105 @@ using namespace std; using namespace dyn; +ulint sa_rate = 0; +bool int_file = false; + +void help(){ + +cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl; + cout << "Usage: h0_lz77 [options] " << endl; + cout << "Options: " << endl; + cout << "-s store one SA sample every sample_rate positions. default: 256." << endl; + cout << "-i Interpret the file as a stream of 32-bits integers." << endl; + cout << "input_file: file to be parsed" << endl; + cout << "output_file: LZ77 triples will be saved in binary format in this file" << endl << endl; + cout << "Note: the file should terminate with a character (or int if -i) not appearing elsewhere." << endl; + + exit(0); + +} + + +void parse_args(char** argv, int argc, int &ptr){ + + assert(ptr " << endl; - cout << " sample_rate: store one SA sample every sample_rate positions. default: 256." << endl; - cout << " input_file: file to be parsed" << endl; - cout << " output_file: LZ77 triples will be saved in text format in this file" << endl; + //parse options - exit(0); + int ptr = 1; - } + if(argc<3) help(); - using lz77_t = h0_lz77; + while(ptr; + string in = string(argv[ptr++]); + string out = string(argv[ptr]); - auto t1 = high_resolution_clock::now(); + using lz77_t = h0_lz77; lz77_t lz77; ulint DEFAULT_SA_RATE = lz77_t::DEFAULT_SA_RATE; - ulint sa_rate = argc == 3 ? DEFAULT_SA_RATE : atoi(argv[1]); + sa_rate = not sa_rate ? DEFAULT_SA_RATE : sa_rate; - sa_rate = sa_rate == 0 ? 1 : sa_rate; + auto t1 = high_resolution_clock::now(); - string in(argv[1+(argc==4)]); - string out(argv[2+(argc==4)]); cout << "Sample rate is " << sa_rate << endl; - { + if(not int_file){ - cout << "Detecting alphabet ... " << flush; - std::ifstream ifs(in); + { + cout << "Detecting alphabet ... " << flush; + std::ifstream ifs(in); - lz77 = lz77_t(ifs, sa_rate); - ifs.close(); + lz77 = lz77_t(ifs, sa_rate); - cout << "done." << endl; + cout << "done." << endl; + } - } + std::ifstream ifs(in); + std::ofstream os(out, ios::binary); + + lz77.parse(ifs,os,1,true); - std::ifstream ifs(in); - std::ofstream os(out); + }else{ - lz77.parse(ifs,os,15,true); + lz77 = lz77_t(~uint(0), sa_rate); + std::ifstream ifs(in, ios::binary); + std::ofstream os(out, ios::binary); - ifs.close(); - os.close(); + lz77.parse_int(ifs,os,1,true); + + } auto t2 = high_resolution_clock::now(); diff --git a/include/dynamic/algorithms/h0_lz77.hpp b/include/dynamic/algorithms/h0_lz77.hpp index 4e9da83..68fc0c6 100644 --- a/include/dynamic/algorithms/h0_lz77.hpp +++ b/include/dynamic/algorithms/h0_lz77.hpp @@ -104,9 +104,7 @@ class h0_lz77 { * input: an input stream and an output stream * the algorithms scans the input (just 1 scan) and * saves to the output stream (could be a file) a series - * of triples of type . Types - * are converted to char* before streaming them to out - * (i.e. ulint to 8 bytes and uchar to 1 byte). len is the length + * of triples of type . len is the length * of the copied string (i.e. excluded skipped characters in the end) * * after the end of a phrase, skip 'skip'>0 characters, included trailing character (LZ77 @@ -184,12 +182,9 @@ class h0_lz77 { exit(0); } - auto start = (char*)(new ulint(p)); - auto l = (char*)(new ulint(len)); - - out.write(start,sizeof(ulint)); - out.write(l,sizeof(ulint)); - out.write(&cc,1); + out.write((char*)&p,sizeof(ulint)); + out.write((char*)&len,sizeof(ulint)); + out.write((char*)&cc,sizeof(cc)); gamma_bits += gamma(uint64_t(backward_pos+1)); gamma_bits += gamma(uint64_t(len+1)); @@ -199,10 +194,6 @@ class h0_lz77 { delta_bits += delta(uint64_t(len+1)); delta_bits += delta(uint64_t(uint8_t(cc))); - - delete start; - delete l; - z++; len = 0; p = 0; @@ -248,6 +239,140 @@ class h0_lz77 { } + /* + * input: an input integer stream (32 bits) and an output stream + * the algorithms scans the input (just 1 scan) and + * saves to the output stream (could be a file) a series + * of triples of type . len is the length + * of the copied string (i.e. excluded skipped characters in the end) + * + * after the end of a phrase, skip 'skip'>0 characters, included trailing character (LZ77 + * sparsification, experimental) + * + * to get also the last factor, input stream should + * terminate with a character that does not appear elsewhere + * in the stream + * + */ + void parse_int(istream& in, ostream& out, ulint skip = 1, bool verbose = false){ + + //size of the output if this is compressed using gamma/delta encoding + uint64_t gamma_bits = 0; + uint64_t delta_bits = 0; + + assert(skip>0); + + long int step = 100000; //print status every step characters + long int last_step = 0; + + assert(fmi.size()==1); //only terminator + + pair range = fmi.get_full_interval(); //BWT range of current phrase + + ulint len = 0; //length of current LZ phrase + ulint i = 0; //position of terminator character in bwt + ulint p = 0; //phrase occurrence + + ulint z = 0; //number of LZ77 phrases + + if(verbose) cout << "Parsing ..." << endl; + + int cc; + ulint n = 0; + while(in.read((char*)&cc,sizeof(int))){ + + n++; + //cout << cc; + + if(verbose){ + + if(n>last_step+(step-1)){ + + last_step = n; + cout << " " << n << " integers processed ..." << endl; + + } + + } + + uint c(cc); + + auto new_range = fmi.LF(range,c); + + if(new_range.first >= new_range.second){ + + //cout << ":"; + + //empty range: new factor + + ulint occ; + + if(len>0){ + + occ = i == range.first ? range.second-1 : range.first; + p = fmi.locate(occ) - len; + + } + + fmi.extend(c); + + uint64_t backward_pos = len == 0 ? 0 : (fmi.text_length() - len - 1) - p; + + if(backward_pos > fmi.text_length()){ + cout << "err" << endl; + exit(0); + } + + out.write((char*)&p,sizeof(ulint)); + out.write((char*)&len,sizeof(ulint)); + out.write((char*)&cc,sizeof(cc)); + + z++; + len = 0; + p = 0; + + //skip characters + + ulint k = 0; + + while(k < skip-1 && in.read((char*)&cc,sizeof(int))){ + + //cout << cc; + + fmi.extend(uint(cc)); + k++; + n++; + + } + + //cout << "|"; + + range = fmi.get_full_interval(); + + }else{ + + len++; //increase current phrase length + fmi.extend(c); //insert character c in the BWT + i = fmi.get_terminator_position(); //get new terminator position + range = {new_range.first, new_range.second+1}; //new suffix falls inside current range: extend + + } + + + } + + if(verbose){ + + cout << "\nNumber of integers: " << n << endl; + cout << "Number of LZ77 phrases: " << z << endl; + + + } + + + } + + /* * Total number of bits allocated in RAM for this structure * From 36aba9517ce80d0a50abb376235fca1185f4b42d Mon Sep 17 00:00:00 2001 From: deliciouslytyped <47436522+deliciouslytyped@users.noreply.github.com> Date: Wed, 7 Jul 2021 23:04:01 +0200 Subject: [PATCH 2/2] Move hopscotch_map to a submodule and update CMakeLists.txt --- .gitmodules | 3 +++ CMakeLists.txt | 24 +----------------------- deps/hopscotch_map | 1 + 3 files changed, 5 insertions(+), 23 deletions(-) create mode 100644 .gitmodules create mode 160000 deps/hopscotch_map diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..9022e3f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "deps/hopscotch_map"] + path = deps/hopscotch_map + url = https://github.com/Tessil/hopscotch-map.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f2a41c2..f5b335b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,24 +14,12 @@ project (dynamic) include(${CMAKE_ROOT}/Modules/ExternalProject.cmake) -# hopscotch_map -ExternalProject_Add(hopscotch_map - GIT_REPOSITORY "https://github.com/Tessil/hopscotch-map.git" - BUILD_IN_SOURCE TRUE - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_DIR} # TODO ADD static build flag - UPDATE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND "" - CONFIGURE_COMMAND "") -ExternalProject_Get_property(hopscotch_map INSTALL_DIR) -set(hopscotch_map_INCLUDE "${INSTALL_DIR}/src/hopscotch_map/include/") - include_directories(${PROJECT_SOURCE_DIR}/include) #include_directories(${PROJECT_SOURCE_DIR}/include/dynamic) #include_directories(${PROJECT_SOURCE_DIR}/include/dynamic/internal) #include_directories(${PROJECT_SOURCE_DIR}/include/dynamic/algorithms) #include_directories(${PROJECT_SOURCE_DIR}/include/dynamic/algorithms/cw-bwt) -include_directories(${INSTALL_DIR}/src/hopscotch_map/include) +include_directories(${CMAKE_SOURCE_DIR}/deps/hopscotch_map/include) message("Building in ${CMAKE_BUILD_TYPE} mode") @@ -55,13 +43,3 @@ add_executable(rle_bwt rle_bwt.cpp) add_executable(cw-bwt cw-bwt.cpp) add_executable(benchmark benchmark.cpp) add_executable(wm_string wm_string.cpp) - -add_dependencies(debug hopscotch_map) -add_dependencies(rle_lz77_v1 hopscotch_map) -add_dependencies(rle_lz77_v2 hopscotch_map) -add_dependencies(h0_lz77 hopscotch_map) -add_dependencies(rle_bwt hopscotch_map) -add_dependencies(cw-bwt hopscotch_map) -add_dependencies(benchmark hopscotch_map) -add_dependencies(wm_string hopscotch_map) - diff --git a/deps/hopscotch_map b/deps/hopscotch_map new file mode 160000 index 0000000..8483747 --- /dev/null +++ b/deps/hopscotch_map @@ -0,0 +1 @@ +Subproject commit 848374746a50b3ebebe656611d554cb134e9aeef