Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "deps/hopscotch_map"]
path = deps/hopscotch_map
url = https://github.com/Tessil/hopscotch-map.git
24 changes: 1 addition & 23 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,12 @@ project (dynamic)

include(${CMAKE_ROOT}/Modules/ExternalProject.cmake)

# hopscotch_map
ExternalProject_Add(hopscotch_map
GIT_REPOSITORY "https://github.com/Tessil/hopscotch-map.git"
BUILD_IN_SOURCE TRUE
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_DIR} # TODO ADD static build flag
UPDATE_COMMAND ""
INSTALL_COMMAND ""
BUILD_COMMAND ""
CONFIGURE_COMMAND "")
ExternalProject_Get_property(hopscotch_map INSTALL_DIR)
set(hopscotch_map_INCLUDE "${INSTALL_DIR}/src/hopscotch_map/include/")

include_directories(${PROJECT_SOURCE_DIR}/include)
#include_directories(${PROJECT_SOURCE_DIR}/include/dynamic)
#include_directories(${PROJECT_SOURCE_DIR}/include/dynamic/internal)
#include_directories(${PROJECT_SOURCE_DIR}/include/dynamic/algorithms)
#include_directories(${PROJECT_SOURCE_DIR}/include/dynamic/algorithms/cw-bwt)
include_directories(${INSTALL_DIR}/src/hopscotch_map/include)
include_directories(${CMAKE_SOURCE_DIR}/deps/hopscotch_map/include)

message("Building in ${CMAKE_BUILD_TYPE} mode")

Expand All @@ -55,13 +43,3 @@ add_executable(rle_bwt rle_bwt.cpp)
add_executable(cw-bwt cw-bwt.cpp)
add_executable(benchmark benchmark.cpp)
add_executable(wm_string wm_string.cpp)

add_dependencies(debug hopscotch_map)
add_dependencies(rle_lz77_v1 hopscotch_map)
add_dependencies(rle_lz77_v2 hopscotch_map)
add_dependencies(h0_lz77 hopscotch_map)
add_dependencies(rle_bwt hopscotch_map)
add_dependencies(cw-bwt hopscotch_map)
add_dependencies(benchmark hopscotch_map)
add_dependencies(wm_string hopscotch_map)

1 change: 1 addition & 0 deletions deps/hopscotch_map
Submodule hopscotch_map added at 848374
102 changes: 71 additions & 31 deletions h0_lz77.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,65 +23,105 @@
using namespace std;
using namespace dyn;

// Suffix-array sampling rate requested on the command line with -s
// (written by parse_args). 0 means "not specified"; main() then falls
// back to the index's DEFAULT_SA_RATE.
ulint sa_rate = 0;
// Set to true by the -i option (see parse_args): the input file is read
// as a stream of integers rather than a stream of characters.
bool int_file = false;

/*
 * Print the command-line usage of h0_lz77 on standard output and
 * terminate the process with exit status 0. This function never returns.
 */
void help(){

	std::cout
		<< "Build LZ77 using a zero-order compressed FM index." << std::endl << std::endl
		<< "Usage: h0_lz77 [options] <input_file> <output_file> " << std::endl
		<< "Options: " << std::endl
		<< "-s <sample_rate> store one SA sample every sample_rate positions. default: 256." << std::endl
		<< "-i Interpret the file as a stream of 32-bits integers." << std::endl
		<< "input_file: file to be parsed" << std::endl
		<< "output_file: LZ77 triples <start,length,trailing_character> will be saved in binary format in this file" << std::endl << std::endl
		<< "Note: the file should terminate with a character (or int if -i) not appearing elsewhere." << std::endl;

	std::exit(0);

}


/*
 * Consume one command-line option starting at argv[ptr].
 *
 * argv, argc: the program arguments as received by main().
 * ptr:        index of the option to parse; must be < argc. On return it
 *             has been advanced past the option and, for -s, past the
 *             option's value as well.
 *
 * Recognized options:
 *   -s <sample_rate>   stores the value in the global sa_rate
 *   -i                 sets the global int_file to true
 *
 * An unrecognized option, or a -s that is not followed by a non-negative
 * decimal integer, prints an error message and calls help(), which
 * terminates the program.
 */
void parse_args(char** argv, int argc, int &ptr){

	assert(ptr<argc);

	const std::string s(argv[ptr]);
	ptr++;

	if(s == "-s"){

		// The value must exist (argv[argc] is a null pointer) and must be
		// made of decimal digits only: a missing or non-numeric value would
		// otherwise be silently turned into 0, and a negative one would wrap
		// around to a huge unsigned sample rate.
		const char* value = ptr < argc ? argv[ptr] : nullptr;

		bool valid = value != nullptr and *value != '\0';
		for(const char* c = value; valid and *c != '\0'; ++c)
			valid = *c >= '0' and *c <= '9';

		if(not valid){
			std::cout << "Error: option '-s' requires a non-negative integer argument." << std::endl;
			help(); // terminates the program
			return;
		}

		sa_rate = std::strtoul(value, nullptr, 10);
		ptr++;

	}else if(s == "-i"){

		int_file = true;

	}else{
		std::cout << "Error: unrecognized '" << s << "' option." << std::endl;
		help();
	}

}


int main(int argc,char** argv) {

using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::duration;

if(argc!=3 and argc !=4){
if(argc < 3) help();

cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl;
cout << "Usage: h0_lz77 [sample_rate] <input_file> <output_file> " << endl;
cout << " sample_rate: store one SA sample every sample_rate positions. default: 256." << endl;
cout << " input_file: file to be parsed" << endl;
cout << " output_file: LZ77 triples <start,length,char> will be saved in text format in this file" << endl;
//parse options

exit(0);
int ptr = 1;

}
if(argc<3) help();

using lz77_t = h0_lz77<wt_fmi>;
while(ptr<argc-2)
parse_args(argv, argc, ptr);

/*
* uncomment this (and comment the above line) to use instead a
* run-length encoded FM index.
*/
//using lz77_t = h0_lz77<rle_fmi>;
string in = string(argv[ptr++]);
string out = string(argv[ptr]);

auto t1 = high_resolution_clock::now();
using lz77_t = h0_lz77<wt_fmi>;

lz77_t lz77;
ulint DEFAULT_SA_RATE = lz77_t::DEFAULT_SA_RATE;

ulint sa_rate = argc == 3 ? DEFAULT_SA_RATE : atoi(argv[1]);
sa_rate = not sa_rate ? DEFAULT_SA_RATE : sa_rate;

sa_rate = sa_rate == 0 ? 1 : sa_rate;
auto t1 = high_resolution_clock::now();

string in(argv[1+(argc==4)]);
string out(argv[2+(argc==4)]);

cout << "Sample rate is " << sa_rate << endl;

{
if(not int_file){

cout << "Detecting alphabet ... " << flush;
std::ifstream ifs(in);
{
cout << "Detecting alphabet ... " << flush;
std::ifstream ifs(in);

lz77 = lz77_t(ifs, sa_rate);
ifs.close();
lz77 = lz77_t(ifs, sa_rate);

cout << "done." << endl;
cout << "done." << endl;
}

}
std::ifstream ifs(in);
std::ofstream os(out, ios::binary);

lz77.parse(ifs,os,1,true);

std::ifstream ifs(in);
std::ofstream os(out);
}else{

lz77.parse(ifs,os,15,true);
lz77 = lz77_t(~uint(0), sa_rate);
std::ifstream ifs(in, ios::binary);
std::ofstream os(out, ios::binary);

ifs.close();
os.close();
lz77.parse_int(ifs,os,1,true);

}

auto t2 = high_resolution_clock::now();

Expand Down
151 changes: 138 additions & 13 deletions include/dynamic/algorithms/h0_lz77.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,7 @@ class h0_lz77 {
* input: an input stream and an output stream
* the algorithms scans the input (just 1 scan) and
* saves to the output stream (could be a file) a series
* of triples <pos,len,c> of type <ulint,ulint,uchar>. Types
* are converted to char* before streaming them to out
* (i.e. ulint to 8 bytes and uchar to 1 byte). len is the length
* of triples <pos,len,c> of type <ulint,ulint,uchar>. len is the length
* of the copied string (i.e. excluded skipped characters in the end)
*
* after the end of a phrase, skip 'skip'>0 characters, included trailing character (LZ77
Expand Down Expand Up @@ -184,12 +182,9 @@ class h0_lz77 {
exit(0);
}

auto start = (char*)(new ulint(p));
auto l = (char*)(new ulint(len));

out.write(start,sizeof(ulint));
out.write(l,sizeof(ulint));
out.write(&cc,1);
out.write((char*)&p,sizeof(ulint));
out.write((char*)&len,sizeof(ulint));
out.write((char*)&cc,sizeof(cc));

gamma_bits += gamma(uint64_t(backward_pos+1));
gamma_bits += gamma(uint64_t(len+1));
Expand All @@ -199,10 +194,6 @@ class h0_lz77 {
delta_bits += delta(uint64_t(len+1));
delta_bits += delta(uint64_t(uint8_t(cc)));


delete start;
delete l;

z++;
len = 0;
p = 0;
Expand Down Expand Up @@ -248,6 +239,140 @@ class h0_lz77 {

}

/*
 * LZ77 factorization of a stream of integers.
 *
 * input: an input stream of raw integers (sizeof(int) bytes each, copied
 * byte-for-byte into an int) and an output stream.
 * The algorithm scans the input (just 1 scan) and saves to the output
 * stream (could be a file) a series of triples <pos,len,c> of type
 * <ulint,ulint,int>, each field written as its raw bytes. len is the
 * length of the copied string (i.e. the trailing integer c and the
 * skipped integers are excluded).
 *
 * skip: after the end of a phrase, skip 'skip'>0 integers, trailing
 * integer included (LZ77 sparsification, experimental). Skipped integers
 * are inserted in the index but do not appear in any triple.
 *
 * verbose: if true, progress and final statistics are printed on
 * standard output.
 *
 * To get also the last factor, the input stream should terminate with an
 * integer that does not appear elsewhere in the stream. Trailing bytes
 * that do not form a whole int are ignored, because the failed read ends
 * the scan.
 *
 * The index must contain only the terminator when this is called
 * (checked with an assert).
 */
void parse_int(istream& in, ostream& out, ulint skip = 1, bool verbose = false){

	assert(skip>0);

	const ulint step = 100000;	//print status every step integers
	ulint last_step = 0;		//value of n at the last status message

	assert(fmi.size()==1); //only terminator

	pair<ulint, ulint> range = fmi.get_full_interval();	//BWT range of current phrase

	ulint len = 0;	//length of current LZ phrase
	ulint i = 0;	//position of terminator character in bwt
	ulint p = 0;	//phrase occurrence

	ulint z = 0;	//number of LZ77 phrases

	if(verbose) cout << "Parsing ..." << endl;

	int cc;
	ulint n = 0;	//number of integers read so far
	while(in.read((char*)&cc,sizeof(int))){

		n++;

		if(verbose and n>last_step+(step-1)){

			last_step = n;
			cout << " " << n << " integers processed ..." << endl;

		}

		uint c(cc);

		auto new_range = fmi.LF(range,c);

		if(new_range.first >= new_range.second){

			//empty range: the current phrase cannot be extended with c,
			//so a new factor ends here

			if(len>0){

				//pick a position of the range other than i, the position
				//of the terminator, and locate it in the text
				const ulint occ = i == range.first ? range.second-1 : range.first;
				p = fmi.locate(occ) - len;

			}

			//c must be inserted before backward_pos is computed:
			//text_length() below already accounts for it
			fmi.extend(c);

			uint64_t backward_pos = len == 0 ? 0 : (fmi.text_length() - len - 1) - p;

			//sanity check; presumably catches an unsigned wrap-around
			//caused by an inconsistent phrase occurrence p
			if(backward_pos > fmi.text_length()){
				cout << "err" << endl;
				exit(0);
			}

			//emit the triple <pos,len,c> as raw bytes
			out.write((char*)&p,sizeof(ulint));
			out.write((char*)&len,sizeof(ulint));
			out.write((char*)&cc,sizeof(cc));

			z++;
			len = 0;
			p = 0;

			//skip integers: they only extend the index

			ulint k = 0;

			while(k < skip-1 && in.read((char*)&cc,sizeof(int))){

				fmi.extend(uint(cc));
				k++;
				n++;

			}

			//the next phrase starts from the full BWT interval
			range = fmi.get_full_interval();

		}else{

			len++;						//increase current phrase length
			fmi.extend(c);				//insert integer c in the BWT
			i = fmi.get_terminator_position();	//get new terminator position
			range = {new_range.first, new_range.second+1};	//new suffix falls inside current range: extend

		}

	}

	if(verbose){

		cout << "\nNumber of integers: " << n << endl;
		cout << "Number of LZ77 phrases: " << z << endl;

	}

}


/*
* Total number of bits allocated in RAM for this structure
*
Expand Down