diff --git a/libtiledbvcf/external/md5/md5.cc b/libtiledbvcf/external/md5/md5.cc new file mode 100644 index 000000000..ac6b6191b --- /dev/null +++ b/libtiledbvcf/external/md5/md5.cc @@ -0,0 +1,215 @@ +#include "md5.h" +#include + +namespace md5 { + +/* + * Constants defined by the MD5 algorithm + */ +#define A 0x67452301 +#define B 0xefcdab89 +#define C 0x98badcfe +#define D 0x10325476 + +static uint32_t S[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, + 12, 17, 22, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, + 14, 20, 5, 9, 14, 20, 4, 11, 16, 23, 4, 11, 16, + 23, 4, 11, 16, 23, 4, 11, 16, 23, 6, 10, 15, 21, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + +static uint32_t K[] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, + 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, + 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, + 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, + 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, + 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + +/* + * Padding used to make the size (in bits) of the input congruent to 448 mod 512 + */ +static uint8_t PADDING[] = { + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +/* + * Bit-manipulation functions defined by the MD5 algorithm + */ +#define F(X, Y, Z) ((X & Y) | (~X & Z)) +#define G(X, Y, Z) ((X & Z) | (Y & ~Z)) +#define H(X, Y, Z) (X ^ Y ^ Z) +#define I(X, Y, Z) (Y ^ (X | ~Z)) + +/* + * Rotates a 32-bit word left by n bits + */ +uint32_t rotateLeft(uint32_t x, uint32_t n) { + return (x << n) | (x >> (32 - n)); +} + +void md5_init(MD5Context* ctx) { + ctx->size = (uint64_t)0; + + ctx->buffer[0] = (uint32_t)A; + ctx->buffer[1] = (uint32_t)B; + ctx->buffer[2] = (uint32_t)C; + ctx->buffer[3] = (uint32_t)D; +} + +void md5_update( + MD5Context* ctx, const uint8_t* input_buffer, size_t input_len) { + uint32_t input[16]; + unsigned int offset = ctx->size % 64; + ctx->size += (uint64_t)input_len; + + // Copy each byte in input_buffer into the next space in our context input + for (unsigned int i = 0; i < input_len; ++i) { + ctx->input[offset++] = (uint8_t)*(input_buffer + i); + + // If we've filled our context input, copy it into our local array input + // then reset the offset to 0 and fill in a new buffer. + // Every time we fill out a chunk, we run it through the algorithm + // to enable some back and forth between cpu and i/o + if (offset % 64 == 0) { + for (unsigned int j = 0; j < 16; ++j) { + // Convert to little-endian + // The local variable `input` our 512-bit chunk separated into 32-bit + // words we can use in calculations + input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 | + (uint32_t)(ctx->input[(j * 4) + 2]) << 16 | + (uint32_t)(ctx->input[(j * 4) + 1]) << 8 | + (uint32_t)(ctx->input[(j * 4)]); + } + md5_step(ctx->buffer, input); + offset = 0; + } + } +} + +void md5_finalize(MD5Context* ctx) { + uint32_t input[16]; + unsigned int offset = ctx->size % 64; + unsigned int padding_length = offset < 56 ? 56 - offset : (56 + 64) - offset; + + // Fill in the padding and undo the changes to size that resulted from the + // update + md5_update(ctx, PADDING, padding_length); + ctx->size -= (uint64_t)padding_length; + + // Do a final update (internal to this function) + // Last two 32-bit words are the two halves of the size (converted from bytes + // to bits) + for (unsigned int j = 0; j < 14; ++j) { + input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 | + (uint32_t)(ctx->input[(j * 4) + 2]) << 16 | + (uint32_t)(ctx->input[(j * 4) + 1]) << 8 | + (uint32_t)(ctx->input[(j * 4)]); + } + input[14] = (uint32_t)(ctx->size * 8); + input[15] = (uint32_t)((ctx->size * 8) >> 32); + + md5_step(ctx->buffer, input); + + // Move the result into digest (convert from little-endian) + for (unsigned int i = 0; i < 4; ++i) { + ctx->digest[(i * 4) + 0] = (uint8_t)((ctx->buffer[i] & 0x000000FF)); + ctx->digest[(i * 4) + 1] = (uint8_t)((ctx->buffer[i] & 0x0000FF00) >> 8); + ctx->digest[(i * 4) + 2] = (uint8_t)((ctx->buffer[i] & 0x00FF0000) >> 16); + ctx->digest[(i * 4) + 3] = (uint8_t)((ctx->buffer[i] & 0xFF000000) >> 24); + } +} + +void md5_step(uint32_t* buffer, uint32_t* input) { + uint32_t AA = buffer[0]; + uint32_t BB = buffer[1]; + uint32_t CC = buffer[2]; + uint32_t DD = buffer[3]; + + uint32_t E; + + unsigned int j; + + for (unsigned int i = 0; i < 64; ++i) { + switch (i / 16) { + case 0: + E = F(BB, CC, DD); + j = i; + break; + case 1: + E = G(BB, CC, DD); + j = ((i * 5) + 1) % 16; + break; + case 2: + E = H(BB, CC, DD); + j = ((i * 3) + 5) % 16; + break; + default: + E = I(BB, CC, DD); + j = (i * 7) % 16; + break; + } + + uint32_t temp = DD; + DD = CC; + CC = BB; + BB = BB + rotateLeft(AA + E + K[i] + input[j], S[i]); + AA = temp; + } + + buffer[0] += AA; + buffer[1] += BB; + buffer[2] += CC; + buffer[3] += DD; +} + +void md5_string(const char* input, uint8_t* result) { + MD5Context ctx; + md5_init(&ctx); + md5_update(&ctx, (uint8_t*)input, strlen(input)); + md5_finalize(&ctx); + + memcpy(result, ctx.digest, 16); +} + +void md5_file(FILE* file, uint8_t* result) { + char* input_buffer = (char*)malloc(1024); + size_t input_size = 0; + + MD5Context ctx; + md5_init(&ctx); + + while ((input_size = fread(input_buffer, 1, 1024, file)) > 0) { + md5_update(&ctx, (uint8_t*)input_buffer, input_size); + } + + md5_finalize(&ctx); + + free(input_buffer); + + memcpy(result, ctx.digest, 16); +} + +void md5_to_hex(const uint8_t* input, char* output) { + for (size_t i = 0; i < 16; i++) { + sprintf(output, "%02x", input[i]); + output += 2; + } +} + +std::string md5_to_hex(const uint8_t* input) { + char output[33]; + md5_to_hex(input, output); + output[32] = '\0'; + return std::string(output); +} + +} // namespace md5 diff --git a/libtiledbvcf/external/md5/md5.h b/libtiledbvcf/external/md5/md5.h new file mode 100644 index 000000000..c581dc902 --- /dev/null +++ b/libtiledbvcf/external/md5/md5.h @@ -0,0 +1,77 @@ +/* + * This file (md5.h) and the corresponding md5.cc file are licensed under the + * Unlicense license and are therefore released into the public domain. + * + * This is an a adaptation of the MD5 reference implementation from + * https://github.com/Zunawe/md5-c. It is derived from the RSA Data Security, + * Inc. MD5 Message-Digest Algorithm and modified slightly to be functionally + * identical but condensed into control structures. + */ + +#ifndef TILEDB_EXTERNAL_MD5_H +#define TILEDB_EXTERNAL_MD5_H + +#include +#include +#include +#include +#include + +namespace md5 { + +typedef struct { + uint64_t size; // Size of input in bytes + uint32_t buffer[4]; // Current accumulation of hash + uint8_t input[64]; // Input to be used in the next step + uint8_t digest[16]; // Result of algorithm +} MD5Context; + +/* + * Initialize an md5 context + */ +void md5_init(MD5Context* ctx); + +/* + * Add some amount of input to the context + * + * If the input fills out a block of 512 bits, apply the algorithm (md5_step) + * and save the result in the buffer. Also updates the overall size. + */ +void md5_update(MD5Context* ctx, const uint8_t* input, size_t input_len); + +/* + * Pad the current input to get to 448 bytes, append the size in bits to the + * very end, and save the result of the final iteration into digest. + */ +void md5_finalize(MD5Context* ctx); + +/* + * Step on 512 bits of input with the main MD5 algorithm. + */ +void md5_step(uint32_t* buffer, uint32_t* input); + +/* + * Functions that run the algorithm on the provided input and put the digest + * into result. result should be able to store 16 bytes. + */ +void md5_string(const char* input, uint8_t* result); + +/* + * Functions that run the algorithm on the contents of the provided file and + * put the digest into result. result should be able to store 16 bytes. + */ +void md5_file(FILE* file, uint8_t* result); + +/* + * Converts an md5 array into a C string. + */ +void md5_to_hex(const uint8_t* input, char* output); + +/* + * Converts an md5 array into a string. + */ +std::string md5_to_hex(const uint8_t* input); + +} // namespace md5 + +#endif diff --git a/libtiledbvcf/src/CMakeLists.txt b/libtiledbvcf/src/CMakeLists.txt index 53753f9c8..1b7eff1c6 100644 --- a/libtiledbvcf/src/CMakeLists.txt +++ b/libtiledbvcf/src/CMakeLists.txt @@ -90,6 +90,7 @@ set(TILEDB_VCF_SOURCES set(TILEDB_VCF_EXTERNAL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/../external/base64/base64.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../external/md5/md5.cc ) add_library(TILEDB_VCF_OBJECTS OBJECT diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc index 1727df8e7..26d2eabe9 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc @@ -25,13 +25,17 @@ */ #include +#include +#include #include #include #include #include +#include #include "base64/base64.h" #include "dataset/tiledbvcfdataset.h" +#include "md5/md5.h" #include "read/export_format.h" #include "read/reader.h" #include "stats/allele_count.h" @@ -723,16 +727,15 @@ void TileDBVCFDataset::load_field_type_maps() const { uint32_t first_sample_id = metadata_.sample_ids.at(first_sample_name); SampleAndId first_sample = {first_sample_name, first_sample_id}; - std::unordered_map hdrs; - hdrs = fetch_vcf_headers({first_sample}); + SampleHeaders hdrs = fetch_vcf_headers({first_sample}); if (hdrs.size() != 1) throw std::runtime_error( "Error loading dataset field types; no headers fetched."); - const auto& hdr_ptr = hdrs.at(first_sample_id); - bcf_hdr_t* hdr = hdr_ptr.get(); - for (int i = 0; i < hdr->n[BCF_DT_ID]; i++) { - bcf_idpair_t* idpair = hdr->id[BCF_DT_ID] + i; + SafeBCFHdr hdr = hdrs.get_sample_header(first_sample_name); + bcf_hdr_t* hdr_ptr = hdr.get(); + for (int i = 0; i < hdr_ptr->n[BCF_DT_ID]; i++) { + bcf_idpair_t* idpair = hdr_ptr->id[BCF_DT_ID] + i; if (idpair == nullptr) throw std::runtime_error( "Error loading dataset field types; null idpair."); @@ -745,12 +748,12 @@ void TileDBVCFDataset::load_field_type_maps() const { bool is_fmt = idpair->val->hrec[2] != nullptr; if (is_info) { - int type = bcf_hdr_id2type(hdr, idpair->val->hrec[1]->type, i); + int type = bcf_hdr_id2type(hdr_ptr, idpair->val->hrec[1]->type, i); info_field_types_[name] = type; } if (is_fmt) { - int type = bcf_hdr_id2type(hdr, idpair->val->hrec[2]->type, i); + int type = bcf_hdr_id2type(hdr_ptr, idpair->val->hrec[2]->type, i); fmt_field_types_[name] = type; } } @@ -758,31 +761,34 @@ void TileDBVCFDataset::load_field_type_maps() const { info_fmt_field_types_loaded_ = true; } +void TileDBVCFDataset::load_field_type_maps_v4(bool add_iaf) const { + SampleHeaders hdrs = fetch_vcf_headers_v4({}, false, true); + if (hdrs.empty()) + return; + else if (hdrs.size() != 1) + throw std::runtime_error( + "Error loading dataset field types; no headers fetched."); + SafeBCFHdr hdr = hdrs.first(); + TileDBVCFDataset::load_field_type_maps_v4(hdr.get(), add_iaf); +} + void TileDBVCFDataset::load_field_type_maps_v4( const bcf_hdr_t* hdr, bool add_iaf) const { + if (!hdr) { + load_field_type_maps_v4(add_iaf); + return; + } + utils::UniqueWriteLock lck_(const_cast(&type_field_rw_lock_)); // After we acquire the write lock we need to check if another thread has // loaded the field types if (info_fmt_field_types_loaded_ && (!add_iaf || info_iaf_field_type_added_)) return; - std::unordered_map hdrs; - if (hdr == nullptr) { - hdrs = fetch_vcf_headers_v4({}, nullptr, false, true); - - if (hdrs.empty()) - return; - else if (hdrs.size() != 1) - throw std::runtime_error( - "Error loading dataset field types; no headers fetched."); - - hdr = hdrs.begin()->second.get(); - } - + // TODO: duplicate header and modify duplicate rather than modifying passed + // header if (add_iaf) { int header_appended = bcf_hdr_append( - // TODO: do something better than promoting this pointer type; - // perhaps the header should be duplicated and later modified const_cast(hdr), "##INFO="); @@ -791,8 +797,6 @@ void TileDBVCFDataset::load_field_type_maps_v4( "Error appending to header for internal allele frequency."); } if (bcf_hdr_append( - // TODO: do something better than promoting this pointer type; - // perhaps the header should be duplicated and later modified const_cast(hdr), "##INFO=") < @@ -801,8 +805,6 @@ void TileDBVCFDataset::load_field_type_maps_v4( "Error appending to header for internal allele count."); } if (bcf_hdr_append( - // TODO: do something better than promoting this pointer type; - // perhaps the header should be duplicated and later modified const_cast(hdr), "##INFO=") < @@ -1208,9 +1210,24 @@ std::shared_ptr TileDBVCFDataset::open_data_array( data_array_uri(root_uri_, false, true)); } -std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( +size_t TileDBVCFDataset::SampleHeaders::empty() const { + return sample_index_lookup.empty(); +} + +size_t TileDBVCFDataset::SampleHeaders::size() const { + return sample_index_lookup.size(); +} + +SafeBCFHdr TileDBVCFDataset::SampleHeaders::first() const { + if (empty()) { + return SafeBCFHdr(nullptr, bcf_hdr_destroy); + } + const std::string& first_header_sample = sample_index_lookup.begin()->first; + return get_sample_header(first_header_sample); +} + +TileDBVCFDataset::SampleHeaders TileDBVCFDataset::fetch_vcf_headers_v4( const std::vector& samples, - std::unordered_map* lookup_map, bool all_samples, bool first_sample) const { LOG_DEBUG( @@ -1237,7 +1254,7 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( "Cannot set first_sample and samples list in same fetch vcf headers " "request"); - std::unordered_map result; + SampleHeaders headers; if (vcf_header_array_ == nullptr) throw std::runtime_error( @@ -1273,7 +1290,7 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( } } - uint32_t sample_idx = 0; + size_t sample_idx = 0; while (!mq->is_complete()) { mq->submit(); auto num_rows = mq->results()->num_rows(); @@ -1294,41 +1311,9 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( // Create a string from the string_view to guarantee null termination // as required by htslib APIs. auto hdr_str = std::string(mq->string_view("header", i)); - auto sample = std::string(mq->string_view("sample", i)); - - bcf_hdr_t* hdr = bcf_hdr_init("r"); - if (!hdr) { - throw std::runtime_error( - "Error fetching VCF header data; error allocating VCF header."); - } + auto sample_name = std::string(mq->string_view("sample", i)); - if (0 != bcf_hdr_parse(hdr, const_cast(hdr_str.c_str()))) { - throw std::runtime_error( - "TileDBVCFDataset::fetch_vcf_headers_v4: Error parsing the BCF " - "header for sample " + - sample + "."); - } - - if (!sample.empty()) { - if (0 != bcf_hdr_add_sample(hdr, sample.c_str())) { - throw std::runtime_error( - "TileDBVCFDataset::fetch_vcf_headers_v4: Error adding sample " - "to " - "BCF header for sample " + - sample + "."); - } - } - - if (bcf_hdr_sync(hdr) < 0) { - throw std::runtime_error( - "Error in bcftools: failed to update VCF header."); - } - - result.emplace( - std::make_pair(sample_idx, SafeBCFHdr(hdr, bcf_hdr_destroy))); - if (lookup_map != nullptr) { - (*lookup_map)[sample] = sample_idx; - } + headers.set_sample_header(hdr_str.c_str(), sample_name, sample_idx); ++sample_idx; @@ -1344,10 +1329,106 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( tiledb::Stats::enable(); LOG_DEBUG( "[fetch_vcf_headers_v4] done (VmRSS = {})", utils::memory_usage_str()); - return result; + return headers; +} + +void TileDBVCFDataset::SampleHeaders::set_sample_header( + const char* hdr_str, const std::string& sample_name, size_t sample_idx) { + // Compute the MD5 checksum for the header + // TODO: Can we use the MD5 digest directly as the map's hash rather + // than hashing the hex string? + md5::md5_string(hdr_str, md5_buffer); + std::string hdr_md5 = md5::md5_to_hex(md5_buffer); + + // Get the header's unique index and save the header, if necessary + size_t hdr_idx; + if (!md5_lookup.contains(hdr_md5)) { + hdr_idx = unique_headers.size(); + md5_lookup[hdr_md5] = hdr_idx; + unique_headers.emplace_back(hdr_str); + } else { + hdr_idx = md5_lookup[hdr_md5]; + // Check for MD5 collisions + const std::string& unique_hdr = unique_headers[hdr_idx]; + if (unique_hdr.compare(hdr_str)) { + auto unique_hdr_filter = [this, hdr_idx](const std::string& sample_name) { + Indexes& indexes = sample_index_lookup[sample_name]; + return indexes.header_idx == hdr_idx; + }; + auto samples = samples_view(); + std::vector unique_hdr_samples; + std::copy_if( + samples.begin(), + samples.end(), + std::back_inserter(unique_hdr_samples), + unique_hdr_filter); + std::string unqiue_hdr_samples_str = utils::join(unique_hdr_samples, ','); + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::set_sample_header: MD5 for " + "sample " + + sample_name + + " header exists but header is different. Collision with header for " + "samples: " + + unqiue_hdr_samples_str); + } + } + + // Save the sample's unique header index and insertion order + sample_index_lookup.emplace(sample_name, Indexes{hdr_idx, sample_idx}); } -std::unordered_map TileDBVCFDataset::fetch_vcf_headers( +SafeBCFHdr TileDBVCFDataset::SampleHeaders::get_sample_header( + const std::string& sample) const { + bcf_hdr_t* hdr = bcf_hdr_init("r"); + if (!hdr) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error fetching " + "VCF header data; error allocating VCF header."); + } + + const size_t hdr_idx = sample_index_lookup.at(sample).header_idx; + const std::string& hdr_str = unique_headers[hdr_idx]; + + if (0 != bcf_hdr_parse(hdr, const_cast(hdr_str.c_str()))) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error parsing the " + "BCF header for sample " + + sample + "."); + } + + if (!sample.empty()) { + if (0 != bcf_hdr_add_sample(hdr, sample.c_str())) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error adding " + "sample to BCF header for sample " + + sample + "."); + } + } + + if (bcf_hdr_sync(hdr) < 0) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error in " + "bcftools: failed to update VCF header."); + } + + return SafeBCFHdr(hdr, bcf_hdr_destroy); +} + +std::vector +TileDBVCFDataset::SampleHeaders::header_ordered_samples() const { + auto samples = samples_view(); + std::vector sorted_samples{samples.begin(), samples.end()}; + auto comparison = [this](const auto& sample_a, const auto& sample_b) { + const SampleHeaders::Indexes& indexes_a = sample_index_lookup.at(sample_a); + const SampleHeaders::Indexes& indexes_b = sample_index_lookup.at(sample_b); + return indexes_a.header_idx < indexes_b.header_idx; + }; + // sort samples by header array index + std::sort(sorted_samples.begin(), sorted_samples.end(), comparison); + return sorted_samples; +} + +TileDBVCFDataset::SampleHeaders TileDBVCFDataset::fetch_vcf_headers( const std::vector& samples) const { // Grab a read lock of concurrency so we don't destroy the vcf_header_array // during fetching @@ -1357,11 +1438,12 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers( if (!tiledb_stats_enabled_vcf_header_) tiledb::Stats::disable(); - std::unordered_map result; + SampleHeaders headers; if (vcf_header_array_ == nullptr) throw std::runtime_error( - "Cannot fetch TileDB-VCF vcf headers; Array object unexpectedly null"); + "Cannot fetch TileDB-VCF vcf headers; Array object unexpectedly " + "null"); Query query(*ctx_, *vcf_header_array_); Subarray subarray = @@ -1434,7 +1516,8 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers( for (size_t offset_idx = 0; offset_idx < num_offsets; ++offset_idx) { // Get sample - uint32_t sample = sample_data[offset_idx]; + size_t sample_idx = sample_data[offset_idx]; + const std::string& sample_name = metadata_.sample_names_[sample_idx]; char* beg_hdr = data.data() + offsets[offset_idx]; uint64_t hdr_size = @@ -1445,38 +1528,16 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers( memcpy(hdr_str, beg_hdr, hdr_size); hdr_str[hdr_size] = '\0'; - bcf_hdr_t* hdr = bcf_hdr_init("r"); - if (!hdr) - throw std::runtime_error( - "Error fetching VCF header data; error allocating VCF header."); - - if (0 != bcf_hdr_parse(hdr, hdr_str)) { - throw std::runtime_error( - "TileDBVCFDataset::fetch_vcf_headers: Error parsing the BCF " - "header for sample " + - std::to_string(sample) + "."); - } + headers.set_sample_header(hdr_str, sample_name, sample_idx); std::free(hdr_str); - - if (0 != - bcf_hdr_add_sample(hdr, metadata_.sample_names_[sample].c_str())) { - throw std::runtime_error("Error adding the sample."); - } - - if (bcf_hdr_sync(hdr) < 0) - throw std::runtime_error( - "Error in bcftools: failed to update VCF header."); - - result.emplace( - std::make_pair(sample, SafeBCFHdr(hdr, bcf_hdr_destroy))); } } } while (status == Query::Status::INCOMPLETE); if (tiledb_stats_enabled_) tiledb::Stats::enable(); - return result; -} // namespace vcf + return headers; +} std::vector TileDBVCFDataset::all_contigs() const { std::vector result; @@ -1539,17 +1600,13 @@ std::list TileDBVCFDataset::all_contigs_list() const { } std::vector TileDBVCFDataset::all_contigs_v4() const { - std::unordered_map hdrs = - fetch_vcf_headers_v4({}, nullptr, false, true); - + SampleHeaders hdrs = fetch_vcf_headers_v4({}, false, true); if (hdrs.empty()) return {}; else if (hdrs.size() != 1) throw std::runtime_error( "Error loading dataset field types; no headers fetched."); - - const auto& hdr = hdrs.begin()->second; - + SafeBCFHdr hdr = hdrs.first(); return VCFUtils::hdr_get_contigs_regions(hdr.get()); } @@ -2235,7 +2292,7 @@ std::map TileDBVCFDataset::info_field_types() const { load_field_type_maps(); else { assert(metadata_.version == Version::V4); - load_field_type_maps_v4(nullptr); + load_field_type_maps_v4(); } lck_.lock(); } @@ -2250,7 +2307,7 @@ std::map TileDBVCFDataset::fmt_field_types() const { load_field_type_maps(); else { assert(metadata_.version == Version::V4); - load_field_type_maps_v4(nullptr); + load_field_type_maps_v4(); } lck_.lock(); } diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.h b/libtiledbvcf/src/dataset/tiledbvcfdataset.h index a142e7a63..8765a3b94 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.h +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -209,6 +210,97 @@ class TileDBVCFDataset { /* PUBLIC DATATYPES */ /* ********************************* */ + /** + * This class stores headers and associates samples with them in a memory + * efficient manner. + */ + class SampleHeaders { + private: + // Saves indexes for each sample + struct Indexes { + size_t header_idx; // index in unique_headers + size_t tiledb_idx; // index in TileDB header array + }; + // Save unique headers as strings for on-demand bcf_hdr_t parsing + std::vector unique_headers; + // Use the same buffer for all MD5s to avoid reallocations + uint8_t md5_buffer[16]; + // Maps MD5s to indexes in unqiue_headers + std::unordered_map md5_lookup; + // Maps sample names to their indexes + std::unordered_map sample_index_lookup; + + /** + * Sets the header for a particular sample. + * + * @param hdr_str The header to be set + * @param sample_name The name of the sample the header is being set for + * @param sample_idx The TileDB index of the sample being set + */ + void set_sample_header( + const char* hdr_str, const std::string& sample_name, size_t sample_idx); + + public: + friend class TileDBVCFDataset; + + /** + * Clears the instantiated data structures. + */ + void clear() { + unique_headers.clear(); + md5_lookup.clear(); + sample_index_lookup.clear(); + } + + /** + * Checks if no headers are loaded. + * + * @return True if no headers are loaded + */ + size_t empty() const; + + /** + * Gets the number of headers loaded. + * + * @return The number of headers loaded + */ + size_t size() const; + + /** + * Gets the "first" header via the internal lookup map's begin() + * iterator. + * + * @return The first header + */ + SafeBCFHdr first() const; + + /** + * Gets the header for the given sample name. + * + * @param sample The name of the sample to get the header for + * @return The first header + */ + SafeBCFHdr get_sample_header(const std::string& sample) const; + + /** + * Returns the sample names stored as a view of the internal lookup map. + * + * @param sample The name of the sample to get the header for + * @return The first header + */ + auto samples_view() const { + return std::views::keys(sample_index_lookup); + }; + + /** + * Gets sample names in the order their headers are returned by the + * TileDB header array query. + * + * @return A vector containing the ordered sample names + */ + std::vector header_ordered_samples() const; + }; + /** * The format version. */ @@ -367,7 +459,7 @@ class TileDBVCFDataset { std::string root_uri() const; - std::unordered_map fetch_vcf_headers( + SampleHeaders fetch_vcf_headers( const std::vector& samples) const; /** @@ -376,9 +468,8 @@ class TileDBVCFDataset { * @param lookup_map * @return */ - std::unordered_map fetch_vcf_headers_v4( + SampleHeaders fetch_vcf_headers_v4( const std::vector& samples, - std::unordered_map* lookup_map, bool all_samples, bool first_sample) const; @@ -1014,6 +1105,7 @@ class TileDBVCFDataset { /** * Populate the metadata maps of info/fmt field name -> htslib types. */ + void load_field_type_maps_v4(bool add_iaf = false) const; void load_field_type_maps_v4( const bcf_hdr_t* hdr, bool add_iaf = false) const; diff --git a/libtiledbvcf/src/read/pvcf_exporter.cc b/libtiledbvcf/src/read/pvcf_exporter.cc index c6006b8ee..130df6cef 100644 --- a/libtiledbvcf/src/read/pvcf_exporter.cc +++ b/libtiledbvcf/src/read/pvcf_exporter.cc @@ -59,15 +59,8 @@ PVCFExporter::PVCFExporter(const std::string& output_uri, ExportFormat fmt) PVCFExporter::~PVCFExporter() { } -void PVCFExporter::init( - const std::unordered_map& hdrs_lookup, - const std::unordered_map& hdrs) { - // sort sample names to match the order returned by tiledb - std::vector> sorted_hdrs( - hdrs_lookup.begin(), hdrs_lookup.end()); - std::sort(sorted_hdrs.begin(), sorted_hdrs.end()); - - merger_.init(sorted_hdrs, hdrs); +void PVCFExporter::init(const TileDBVCFDataset::SampleHeaders& headers) { + merger_.init(headers); std::string mode = "w" + fmt_code_; fp_.reset(bcf_open(uri_.c_str(), mode.c_str())); diff --git a/libtiledbvcf/src/read/pvcf_exporter.h b/libtiledbvcf/src/read/pvcf_exporter.h index ed50e6dfc..5c5528cca 100644 --- a/libtiledbvcf/src/read/pvcf_exporter.h +++ b/libtiledbvcf/src/read/pvcf_exporter.h @@ -40,9 +40,7 @@ class PVCFExporter : public Exporter { ~PVCFExporter(); - void init( - const std::unordered_map& hdrs_lookup, - const std::unordered_map& hdrs); + void init(const TileDBVCFDataset::SampleHeaders& headers); void reset() override; diff --git a/libtiledbvcf/src/read/reader.cc b/libtiledbvcf/src/read/reader.cc index 67298988c..a2fcfccef 100644 --- a/libtiledbvcf/src/read/reader.cc +++ b/libtiledbvcf/src/read/reader.cc @@ -650,6 +650,7 @@ bool Reader::next_read_batch_v2_v3() { read_state_.region_idx = 0; // Headers + // TODO: Is the clear necessary? read_state_.current_hdrs.clear(); // Sample handles @@ -798,15 +799,13 @@ bool Reader::next_read_batch_v4() { // Fetch new headers for new sample batch if (read_state_.need_headers) { + // TODO: Is the clear necessary? read_state_.current_hdrs.clear(); read_state_.current_hdrs = dataset_->fetch_vcf_headers_v4( - read_state_.current_sample_batches, - &read_state_.current_hdrs_lookup, - read_state_.all_samples, - false); + read_state_.current_sample_batches, read_state_.all_samples, false); if (params_.export_combined_vcf) { static_cast(exporter_.get()) - ->init(read_state_.current_hdrs_lookup, read_state_.current_hdrs); + ->init(read_state_.current_hdrs); } } } @@ -1194,8 +1193,9 @@ bool Reader::read_current_batch() { if (dataset_->metadata().version == TileDBVCFDataset::Version::V3 || dataset_->metadata().version == TileDBVCFDataset::Version::V2) { for (const auto& s : read_state_.sample_batches[read_state_.batch_idx]) { - bcf_hdr_t* hdr = read_state_.current_hdrs.at(s.sample_id).get(); - exporter_->finalize_export(s, hdr); + SafeBCFHdr hdr = + read_state_.current_hdrs.get_sample_header(s.sample_name); + exporter_->finalize_export(s, hdr.get()); } } else { assert(dataset_->metadata().version == TileDBVCFDataset::Version::V4); @@ -1204,10 +1204,9 @@ bool Reader::read_current_batch() { // finalize the export if (read_state_.query_contig_batch_idx == read_state_.query_regions_v4.size() - 1) { - for (const auto& s : read_state_.current_hdrs_lookup) { - std::string sample_name = s.first; - bcf_hdr_t* hdr = read_state_.current_hdrs.at(s.second).get(); - exporter_->finalize_export(SampleAndId{sample_name, 0}, hdr); + for (const auto& s : read_state_.current_hdrs.samples_view()) { + SafeBCFHdr hdr = read_state_.current_hdrs.get_sample_header(s); + exporter_->finalize_export(SampleAndId{s, 0}, hdr.get()); } } } @@ -1720,40 +1719,29 @@ bool Reader::report_cell( } SampleAndId sample; - uint64_t hdr_index = 0; const auto& results = read_state_.query_results; if (dataset_->metadata().version == TileDBVCFDataset::Version::V2 || dataset_->metadata().version == TileDBVCFDataset::Version::V3) { uint32_t samp_idx = results.buffers()->sample().value(cell_idx); - // Skip this cell if we are not reporting its sample. if (read_state_.current_samples.count(samp_idx) == 0) { return true; } sample = read_state_.current_samples[samp_idx]; - hdr_index = samp_idx; } else { assert(dataset_->metadata().version == TileDBVCFDataset::Version::V4); uint64_t size = 0; const char* sample_name = results.buffers()->sample_name().value(cell_idx, &size); sample = SampleAndId{std::string(sample_name, size)}; - hdr_index = read_state_.current_hdrs_lookup[sample.sample_name]; } - bcf_hdr_t* hdr_ptr = nullptr; + SafeBCFHdr hdr(nullptr, bcf_hdr_destroy); if (read_state_.need_headers) { - auto hdr_iter = read_state_.current_hdrs.find(hdr_index); - if (hdr_iter == read_state_.current_hdrs.end()) - throw std::runtime_error( - "Could not find VCF header for " + sample.sample_name + - " in report_cell"); - - const auto& hdr = read_state_.current_hdrs.at(hdr_index); - hdr_ptr = hdr.get(); + hdr = read_state_.current_hdrs.get_sample_header(sample.sample_name); } if (!exporter_->export_record( - sample, hdr_ptr, region, contig_offset, results, cell_idx)) + sample, hdr.get(), region, contig_offset, results, cell_idx)) return false; // If no overflow, increment num records count. diff --git a/libtiledbvcf/src/read/reader.h b/libtiledbvcf/src/read/reader.h index 6e9be6b7b..83f093f80 100644 --- a/libtiledbvcf/src/read/reader.h +++ b/libtiledbvcf/src/read/reader.h @@ -36,7 +36,6 @@ #include #include -#include #include "dataset/attribute_buffer_set.h" #include "dataset/tiledbvcfdataset.h" @@ -603,9 +602,7 @@ class Reader { std::unordered_map current_samples; /** Map of current relative sample ID -> VCF header instance. */ - std::unordered_map current_hdrs; - - std::unordered_map current_hdrs_lookup; + TileDBVCFDataset::SampleHeaders current_hdrs; /** * Stores the index to a region that was unsuccessfully reported diff --git a/libtiledbvcf/src/utils/utils.cc b/libtiledbvcf/src/utils/utils.cc index 23823a30e..d0a5eddf2 100644 --- a/libtiledbvcf/src/utils/utils.cc +++ b/libtiledbvcf/src/utils/utils.cc @@ -33,7 +33,9 @@ #include #include #include +#include #include +#include #include "htslib_plugin/hfile_tiledb_vfs.h" #include "utils/logger_public.h" @@ -90,6 +92,25 @@ std::set split_set(const std::string& s, char delim) { return results; } +std::string join( + const std::vector& v, char delim, bool skip_empty) { + auto empty_filter = [skip_empty](const std::string& s) { + return (skip_empty && !s.empty()) || !skip_empty; + }; + auto filtered_v = std::views::filter(v, empty_filter); + if (filtered_v.empty()) + return ""; + auto operation = [delim](std::string a, std::string b) { + return a + delim + b; + }; + std::string s = std::accumulate( + std::next(filtered_v.begin()), + filtered_v.end(), + filtered_v.front(), + operation); + return s; +} + bool starts_with(const std::string& value, const std::string& prefix) { if (prefix.size() > value.size()) return false; diff --git a/libtiledbvcf/src/utils/utils.h b/libtiledbvcf/src/utils/utils.h index c288d8baf..d8e6499ea 100644 --- a/libtiledbvcf/src/utils/utils.h +++ b/libtiledbvcf/src/utils/utils.h @@ -229,6 +229,17 @@ std::set split_set(const std::string& s, char delim); */ std::vector split(const std::string& s, char delim); +/** + * @brief + * Joins a vector of strings into a string given some character delimiter. + * @param v Vector to join + * @param delim The character to join the vector string with + * @param skip_empty Skip empty elements + * @return vector to store results in + */ +std::string join( + const std::vector& v, char delim, bool skip_empty = true); + /** * @tparam T * @param start_time diff --git a/libtiledbvcf/src/vcf/vcf_merger.cc b/libtiledbvcf/src/vcf/vcf_merger.cc index f60348048..c1cca0930 100644 --- a/libtiledbvcf/src/vcf/vcf_merger.cc +++ b/libtiledbvcf/src/vcf/vcf_merger.cc @@ -15,23 +15,23 @@ VCFMerger::VCFMerger() VCFMerger::~VCFMerger() { } -void VCFMerger::init( - const std::vector>& sorted_hdrs, - const std::unordered_map& hdr_map) { +void VCFMerger::init(const TileDBVCFDataset::SampleHeaders& headers) { hdr_.reset(bcf_hdr_init("w")); hdrs_.clear(); - for (const auto& [name, hdr_key] : sorted_hdrs) { - LOG_DEBUG("Adding sample_num {}: {}", hdrs_.size(), name); - sample_map_[name] = hdrs_.size(); - auto hdr = hdr_map.at(hdr_key).get(); - hdrs_.push_back(hdr); - if (bcf_hdr_merge(hdr_.get(), hdr) == NULL) { - LOG_FATAL("Error merging header from sample: {}", name); + const std::vector sorted_samples = + headers.header_ordered_samples(); + for (const auto& sample : sorted_samples) { + LOG_DEBUG("Adding sample_num {}: {}", hdrs_.size(), sample); + sample_map_[sample] = hdrs_.size(); + SafeBCFHdr hdr = headers.get_sample_header(sample); + if (bcf_hdr_merge(hdr_.get(), hdr.get()) == NULL) { + LOG_FATAL("Error merging header from sample: {}", sample); } - if (bcf_hdr_add_sample(hdr_.get(), name.c_str()) < 0) { - LOG_FATAL("Error adding sample to merged header: {}", name); + if (bcf_hdr_add_sample(hdr_.get(), sample.c_str()) < 0) { + LOG_FATAL("Error adding sample to merged header: {}", sample); } + hdrs_.push_back(std::move(hdr)); } if (bcf_hdr_sync(hdr_.get()) < 0) { @@ -198,7 +198,7 @@ void VCFMerger::merge_alleles(int sample_num, bcf1_t* input) { std::tuple VCFMerger::get_number_type_values( int id, int hdr_type, int sample_num) { - auto hdr = sample_num > 0 ? hdrs_[sample_num] : hdr_.get(); + bcf_hdr_t* hdr = sample_num > 0 ? hdrs_[sample_num].get() : hdr_.get(); int number = bcf_hdr_id2length(hdr, hdr_type, id); int type = bcf_hdr_id2type(hdr, hdr_type, id); @@ -294,7 +294,7 @@ void VCFMerger::finish_info(SafeBCFRec& rec) { // merge FORMAT:GT and INFO:AC,AN,DP for (const auto& [sample_num, rec] : md_.samples) { int values_read = - bcf_get_genotypes(hdrs_[sample_num], rec.get(), &dst_, &ndst_); + bcf_get_genotypes(hdrs_[sample_num].get(), rec.get(), &dst_, &ndst_); if (values_read == 1) { md_.gts[2 * sample_num + 1] = bcf_int32_vector_end; @@ -329,8 +329,8 @@ void VCFMerger::finish_info(SafeBCFRec& rec) { } // add sample read depth to total depth - if (bcf_get_info_int32(hdrs_[sample_num], rec.get(), "DP", &dst_, &ndst_) > - 0) { + if (bcf_get_info_int32( + hdrs_[sample_num].get(), rec.get(), "DP", &dst_, &ndst_) > 0) { md_.depth_total += *dst_; } } @@ -346,7 +346,12 @@ void VCFMerger::finish_info(SafeBCFRec& rec) { (void)vector_end; // unused uint32_t values_read = bcf_get_info_values( - hdrs_[sample_num], rec.get(), key_str, (void**)(&dst_), &ndst_, type); + hdrs_[sample_num].get(), + rec.get(), + key_str, + (void**)(&dst_), + &ndst_, + type); int* data = (int*)(dst_); // set expected values to number of values read @@ -492,7 +497,7 @@ void VCFMerger::finish_format(SafeBCFRec& rec) { int* hts_dst = nullptr; int hts_ndst = 0; int values_read = bcf_get_format_values( - hdrs_[sample_num], + hdrs_[sample_num].get(), rec_in.get(), key_str, (void**)(&hts_dst), diff --git a/libtiledbvcf/src/vcf/vcf_merger.h b/libtiledbvcf/src/vcf/vcf_merger.h index 082c10ed9..7079781b9 100644 --- a/libtiledbvcf/src/vcf/vcf_merger.h +++ b/libtiledbvcf/src/vcf/vcf_merger.h @@ -9,6 +9,7 @@ #include #include +#include "dataset/tiledbvcfdataset.h" #include "utils/logger_public.h" #include "utils/utils.h" @@ -100,9 +101,7 @@ class VCFMerger { ~VCFMerger(); - void init( - const std::vector>& sorted_hdrs, - const std::unordered_map& hdr_map); + void init(const TileDBVCFDataset::SampleHeaders& headers); void reset(); @@ -250,8 +249,7 @@ class VCFMerger { SafeBCFHdr hdr_; // vector of sample VCF headers, indexed by sample number - // TODO: update ReadState to use shared_ptrs - std::vector hdrs_; + std::vector hdrs_; // combined VCF record SafeBCFRec rec_; diff --git a/libtiledbvcf/src/vcf/vcf_utils.cc b/libtiledbvcf/src/vcf/vcf_utils.cc index df12b4f6c..64ad5521a 100644 --- a/libtiledbvcf/src/vcf/vcf_utils.cc +++ b/libtiledbvcf/src/vcf/vcf_utils.cc @@ -147,7 +147,8 @@ std::map VCFUtils::hdr_get_contig_offsets( bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seqname.c_str(), 0); if (!hrec) throw std::invalid_argument( - "Cannot get contig offsets from header; error reading contig header " + "Cannot get contig offsets from header; error reading contig " + "header " "line " + std::to_string(i)); int j = bcf_hrec_find_key(hrec, "length"); @@ -164,7 +165,7 @@ std::map VCFUtils::hdr_get_contig_offsets( return offsets; } -std::vector VCFUtils::hdr_get_contigs_regions(bcf_hdr_t* hdr) { +std::vector VCFUtils::hdr_get_contigs_regions(const bcf_hdr_t* hdr) { std::vector contigs; if (!hdr) throw std::invalid_argument( @@ -178,7 +179,8 @@ std::vector VCFUtils::hdr_get_contigs_regions(bcf_hdr_t* hdr) { bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seqname.c_str(), 0); if (!hrec) throw std::invalid_argument( - "Cannot get contig offsets from header; error reading contig header " + "Cannot get contig offsets from header; error reading contig " + "header " "line " + std::to_string(i)); int j = bcf_hrec_find_key(hrec, "length"); diff --git a/libtiledbvcf/src/vcf/vcf_utils.h b/libtiledbvcf/src/vcf/vcf_utils.h index cb050ea4b..ba438928f 100644 --- a/libtiledbvcf/src/vcf/vcf_utils.h +++ b/libtiledbvcf/src/vcf/vcf_utils.h @@ -134,7 +134,7 @@ class VCFUtils { * @return Vector of contig regions */ static std::vector hdr_get_contigs_regions( - bcf_hdr_t* hdr); + const bcf_hdr_t* hdr); /** * Helper function that normalizes a sample name: diff --git a/libtiledbvcf/test/CMakeLists.txt b/libtiledbvcf/test/CMakeLists.txt index db5893039..a4d045efb 100644 --- a/libtiledbvcf/test/CMakeLists.txt +++ b/libtiledbvcf/test/CMakeLists.txt @@ -37,6 +37,7 @@ add_executable(tiledb_vcf_unit EXCLUDE_FROM_ALL ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-bitmap.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-c-api-reader.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-c-api-writer.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-vcf-export.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-vcf-delete.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-vcf-iter.cc diff --git a/libtiledbvcf/test/src/unit-utils.cc b/libtiledbvcf/test/src/unit-utils.cc new file mode 100644 index 000000000..aed193ae6 --- /dev/null +++ b/libtiledbvcf/test/src/unit-utils.cc @@ -0,0 +1,65 @@ +/** + * @file unit-utils.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2026 TileDB Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests for utils. + */ + +#include "catch.hpp" + +#include "utils/utils.h" + +#include + +using namespace tiledb::vcf; + +TEST_CASE("TileDB-VCF: Test join util", "[tiledbvcf][utils]") { + std::vector v; + std::string expected_result = ""; + REQUIRE(utils::join(v, ',').compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', true).compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', false).compare(expected_result) == 0); + + v.emplace_back("a"); + v.emplace_back("b"); + v.emplace_back("c"); + expected_result = "a,b,c"; + REQUIRE(utils::join(v, ',').compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', true).compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', false).compare(expected_result) == 0); + + v.emplace_back(""); + v.emplace_back(""); + v.emplace_back("d"); + v.emplace_back(""); + expected_result = "a,b,c,d"; + REQUIRE(utils::join(v, ',').compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', true).compare(expected_result) == 0); + expected_result = "a,b,c,,,d,"; + REQUIRE(utils::join(v, ',', false).compare(expected_result) == 0); +} diff --git a/libtiledbvcf/test/src/unit-vcf-store.cc b/libtiledbvcf/test/src/unit-vcf-store.cc index 802bcacca..9389bb782 100644 --- a/libtiledbvcf/test/src/unit-vcf-store.cc +++ b/libtiledbvcf/test/src/unit-vcf-store.cc @@ -112,15 +112,14 @@ TEST_CASE("TileDB-VCF: Test register", "[tiledbvcf][ingest]") { ds.metadata().sample_names_, Catch::Matchers::VectorContains(std::string("HG01762"))); - auto hdrs = - ds.fetch_vcf_headers_v4({{"HG01762", 0}}, nullptr, false, false); + auto hdrs = ds.fetch_vcf_headers_v4({{"HG01762", 0}}, false, false); REQUIRE(hdrs.size() == 1); - REQUIRE(bcf_hdr_nsamples(hdrs.at(0)) == 1); - REQUIRE(hdrs.at(0)->samples[0] == std::string("HG01762")); + auto safe_hdr = hdrs.get_sample_header("HG01762"); + REQUIRE(bcf_hdr_nsamples(safe_hdr.get()) == 1); + REQUIRE(safe_hdr.get()->samples[0] == std::string("HG01762")); - REQUIRE(ds.fmt_field_type("GQ", hdrs.at(0).get()) == BCF_HT_INT); - REQUIRE( - ds.info_field_type("BaseQRankSum", hdrs.at(0).get()) == BCF_HT_REAL); + REQUIRE(ds.fmt_field_type("GQ", safe_hdr.get()) == BCF_HT_INT); + REQUIRE(ds.info_field_type("BaseQRankSum", safe_hdr.get()) == BCF_HT_REAL); } // Ingest the samples @@ -149,12 +148,14 @@ TEST_CASE("TileDB-VCF: Test register", "[tiledbvcf][ingest]") { REQUIRE_THAT( ds.metadata().sample_names_, Catch::Matchers::Contains(samples)); - auto hdrs = ds.fetch_vcf_headers_v4( - {{"HG01762", 0}, {"HG00280", 1}}, nullptr, false, false); + auto hdrs = + ds.fetch_vcf_headers_v4({{"HG01762", 0}, {"HG00280", 1}}, false, false); REQUIRE(hdrs.size() == 2); std::vector expected_samples = {"HG01762", "HG00280"}; + auto safe_hdr1 = hdrs.get_sample_header("HG01762"); + auto safe_hdr2 = hdrs.get_sample_header("HG00280"); std::vector result_samples = { - hdrs.at(1)->samples[0], hdrs.at(0)->samples[0]}; + safe_hdr1.get()->samples[0], safe_hdr2.get()->samples[0]}; REQUIRE_THAT( expected_samples, Catch::Matchers::UnorderedEquals(result_samples)); }