From c7f525813c06eb9de0932afc02be73500a7b97df Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 10:16:41 -0700 Subject: [PATCH 01/13] Added external MD5 library --- libtiledbvcf/external/md5/md5.cc | 214 +++++++++++++++++++++++++++++++ libtiledbvcf/external/md5/md5.h | 77 +++++++++++ libtiledbvcf/src/CMakeLists.txt | 1 + 3 files changed, 292 insertions(+) create mode 100644 libtiledbvcf/external/md5/md5.cc create mode 100644 libtiledbvcf/external/md5/md5.h diff --git a/libtiledbvcf/external/md5/md5.cc b/libtiledbvcf/external/md5/md5.cc new file mode 100644 index 000000000..1509976e6 --- /dev/null +++ b/libtiledbvcf/external/md5/md5.cc @@ -0,0 +1,214 @@ +#include "md5.h" +#include + +namespace md5 { + +/* + * Constants defined by the MD5 algorithm + */ +#define A 0x67452301 +#define B 0xefcdab89 +#define C 0x98badcfe +#define D 0x10325476 + +static uint32_t S[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, + 12, 17, 22, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, + 14, 20, 5, 9, 14, 20, 4, 11, 16, 23, 4, 11, 16, + 23, 4, 11, 16, 23, 4, 11, 16, 23, 6, 10, 15, 21, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + +static uint32_t K[] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, + 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, + 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, + 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, + 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, + 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + +/* + * Padding used to make the size (in bits) of the input congruent to 448 mod 512 + */ +static uint8_t PADDING[] = { + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +/* + * Bit-manipulation functions defined by the MD5 algorithm + */ +#define F(X, Y, Z) ((X & Y) | (~X & Z)) +#define G(X, Y, Z) ((X & Z) | (Y & ~Z)) +#define H(X, Y, Z) (X ^ Y ^ Z) +#define I(X, Y, Z) (Y ^ (X | ~Z)) + +/* + * Rotates a 32-bit word left by n bits + */ +uint32_t rotateLeft(uint32_t x, uint32_t n) { + return (x << n) | (x >> (32 - n)); +} + +void md5_init(MD5Context* ctx) { + ctx->size = (uint64_t)0; + + ctx->buffer[0] = (uint32_t)A; + ctx->buffer[1] = (uint32_t)B; + ctx->buffer[2] = (uint32_t)C; + ctx->buffer[3] = (uint32_t)D; +} + +void md5_update( + MD5Context* ctx, const uint8_t* input_buffer, size_t input_len) { + uint32_t input[16]; + unsigned int offset = ctx->size % 64; + ctx->size += (uint64_t)input_len; + + // Copy each byte in input_buffer into the next space in our context input + for (unsigned int i = 0; i < input_len; ++i) { + ctx->input[offset++] = (uint8_t)*(input_buffer + i); + + // If we've filled our context input, copy it into our local array input + // then reset the offset to 0 and fill in a new buffer. + // Every time we fill out a chunk, we run it through the algorithm + // to enable some back and forth between cpu and i/o + if (offset % 64 == 0) { + for (unsigned int j = 0; j < 16; ++j) { + // Convert to little-endian + // The local variable `input` our 512-bit chunk separated into 32-bit + // words we can use in calculations + input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 | + (uint32_t)(ctx->input[(j * 4) + 2]) << 16 | + (uint32_t)(ctx->input[(j * 4) + 1]) << 8 | + (uint32_t)(ctx->input[(j * 4)]); + } + md5_step(ctx->buffer, input); + offset = 0; + } + } +} + +void md5_finalize(MD5Context* ctx) { + uint32_t input[16]; + unsigned int offset = ctx->size % 64; + unsigned int padding_length = offset < 56 ? 56 - offset : (56 + 64) - offset; + + // Fill in the padding and undo the changes to size that resulted from the + // update + md5_update(ctx, PADDING, padding_length); + ctx->size -= (uint64_t)padding_length; + + // Do a final update (internal to this function) + // Last two 32-bit words are the two halves of the size (converted from bytes + // to bits) + for (unsigned int j = 0; j < 14; ++j) { + input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 | + (uint32_t)(ctx->input[(j * 4) + 2]) << 16 | + (uint32_t)(ctx->input[(j * 4) + 1]) << 8 | + (uint32_t)(ctx->input[(j * 4)]); + } + input[14] = (uint32_t)(ctx->size * 8); + input[15] = (uint32_t)((ctx->size * 8) >> 32); + + md5_step(ctx->buffer, input); + + // Move the result into digest (convert from little-endian) + for (unsigned int i = 0; i < 4; ++i) { + ctx->digest[(i * 4) + 0] = (uint8_t)((ctx->buffer[i] & 0x000000FF)); + ctx->digest[(i * 4) + 1] = (uint8_t)((ctx->buffer[i] & 0x0000FF00) >> 8); + ctx->digest[(i * 4) + 2] = (uint8_t)((ctx->buffer[i] & 0x00FF0000) >> 16); + ctx->digest[(i * 4) + 3] = (uint8_t)((ctx->buffer[i] & 0xFF000000) >> 24); + } +} + +void md5_step(uint32_t* buffer, uint32_t* input) { + uint32_t AA = buffer[0]; + uint32_t BB = buffer[1]; + uint32_t CC = buffer[2]; + uint32_t DD = buffer[3]; + + uint32_t E; + + unsigned int j; + + for (unsigned int i = 0; i < 64; ++i) { + switch (i / 16) { + case 0: + E = F(BB, CC, DD); + j = i; + break; + case 1: + E = G(BB, CC, DD); + j = ((i * 5) + 1) % 16; + break; + case 2: + E = H(BB, CC, DD); + j = ((i * 3) + 5) % 16; + break; + default: + E = I(BB, CC, DD); + j = (i * 7) % 16; + break; + } + + uint32_t temp = DD; + DD = CC; + CC = BB; + BB = BB + rotateLeft(AA + E + K[i] + input[j], S[i]); + AA = temp; + } + + buffer[0] += AA; + buffer[1] += BB; + buffer[2] += CC; + buffer[3] += DD; +} + +void md5_string(const char* input, uint8_t* result) { + MD5Context ctx; + md5_init(&ctx); + md5_update(&ctx, (uint8_t*)input, strlen(input)); + md5_finalize(&ctx); + + memcpy(result, ctx.digest, 16); +} + +void md5_file(FILE* file, uint8_t* result) { + char* input_buffer = (char*)malloc(1024); + size_t input_size = 0; + + MD5Context ctx; + md5_init(&ctx); + + while ((input_size = fread(input_buffer, 1, 1024, file)) > 0) { + md5_update(&ctx, (uint8_t*)input_buffer, input_size); + } + + md5_finalize(&ctx); + + free(input_buffer); + + memcpy(result, ctx.digest, 16); +} + +void md5_to_hex(const uint8_t* input, char* output) { + for (size_t i = 0; i < 16; i++) { + sprintf(output, "%02x", input[i]); + output += 2; + } +} + +std::string md5_to_hex(const uint8_t* input) { + char output[32]; + md5_to_hex(input, output); + return std::string(output, 32); +} + +} // namespace md5 diff --git a/libtiledbvcf/external/md5/md5.h b/libtiledbvcf/external/md5/md5.h new file mode 100644 index 000000000..c581dc902 --- /dev/null +++ b/libtiledbvcf/external/md5/md5.h @@ -0,0 +1,77 @@ +/* + * This file (md5.h) and the corresponding md5.cc file are licensed under the + * Unlicense license and are therefore released into the public domain. + * + * This is an a adaptation of the MD5 reference implementation from + * https://github.com/Zunawe/md5-c. It is derived from the RSA Data Security, + * Inc. MD5 Message-Digest Algorithm and modified slightly to be functionally + * identical but condensed into control structures. + */ + +#ifndef TILEDB_EXTERNAL_MD5_H +#define TILEDB_EXTERNAL_MD5_H + +#include +#include +#include +#include +#include + +namespace md5 { + +typedef struct { + uint64_t size; // Size of input in bytes + uint32_t buffer[4]; // Current accumulation of hash + uint8_t input[64]; // Input to be used in the next step + uint8_t digest[16]; // Result of algorithm +} MD5Context; + +/* + * Initialize an md5 context + */ +void md5_init(MD5Context* ctx); + +/* + * Add some amount of input to the context + * + * If the input fills out a block of 512 bits, apply the algorithm (md5_step) + * and save the result in the buffer. Also updates the overall size. + */ +void md5_update(MD5Context* ctx, const uint8_t* input, size_t input_len); + +/* + * Pad the current input to get to 448 bytes, append the size in bits to the + * very end, and save the result of the final iteration into digest. + */ +void md5_finalize(MD5Context* ctx); + +/* + * Step on 512 bits of input with the main MD5 algorithm. + */ +void md5_step(uint32_t* buffer, uint32_t* input); + +/* + * Functions that run the algorithm on the provided input and put the digest + * into result. result should be able to store 16 bytes. + */ +void md5_string(const char* input, uint8_t* result); + +/* + * Functions that run the algorithm on the contents of the provided file and + * put the digest into result. result should be able to store 16 bytes. + */ +void md5_file(FILE* file, uint8_t* result); + +/* + * Converts an md5 array into a C string. + */ +void md5_to_hex(const uint8_t* input, char* output); + +/* + * Converts an md5 array into a string. + */ +std::string md5_to_hex(const uint8_t* input); + +} // namespace md5 + +#endif diff --git a/libtiledbvcf/src/CMakeLists.txt b/libtiledbvcf/src/CMakeLists.txt index 53753f9c8..1b7eff1c6 100644 --- a/libtiledbvcf/src/CMakeLists.txt +++ b/libtiledbvcf/src/CMakeLists.txt @@ -90,6 +90,7 @@ set(TILEDB_VCF_SOURCES set(TILEDB_VCF_EXTERNAL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/../external/base64/base64.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../external/md5/md5.cc ) add_library(TILEDB_VCF_OBJECTS OBJECT From 7d3a5f923246caf42999cca3be61b22787a541c0 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 11:31:53 -0700 Subject: [PATCH 02/13] Added a join utility function This function joins a vector of strings using a given delimiter character while optionally omitting empty strings. --- libtiledbvcf/src/utils/utils.cc | 16 ++++++++++++++++ libtiledbvcf/src/utils/utils.h | 11 +++++++++++ 2 files changed, 27 insertions(+) diff --git a/libtiledbvcf/src/utils/utils.cc b/libtiledbvcf/src/utils/utils.cc index 23823a30e..3d86d67ff 100644 --- a/libtiledbvcf/src/utils/utils.cc +++ b/libtiledbvcf/src/utils/utils.cc @@ -33,7 +33,9 @@ #include #include #include +#include #include +#include #include "htslib_plugin/hfile_tiledb_vfs.h" #include "utils/logger_public.h" @@ -90,6 +92,20 @@ std::set split_set(const std::string& s, char delim) { return results; } +std::string join( + const std::vector& v, char delim, bool skip_empty) { + auto empty_filter = [skip_empty](const std::string& s) { + return skip_empty && s.empty(); + }; + auto filtered_v = std::views::filter(v, empty_filter); + std::string s = std::accumulate( + std::next(filtered_v.begin()), + filtered_v.end(), + v.at(0), + [delim](std::string a, std::string b) { return a + delim + b; }); + return s; +} + bool starts_with(const std::string& value, const std::string& prefix) { if (prefix.size() > value.size()) return false; diff --git a/libtiledbvcf/src/utils/utils.h b/libtiledbvcf/src/utils/utils.h index c288d8baf..d8e6499ea 100644 --- a/libtiledbvcf/src/utils/utils.h +++ b/libtiledbvcf/src/utils/utils.h @@ -229,6 +229,17 @@ std::set split_set(const std::string& s, char delim); */ std::vector split(const std::string& s, char delim); +/** + * @brief + * Joins a vector of strings into a string given some character delimiter. + * @param v Vector to join + * @param delim The character to join the vector string with + * @param skip_empty Skip empty elements + * @return vector to store results in + */ +std::string join( + const std::vector& v, char delim, bool skip_empty = true); + /** * @tparam T * @param start_time From 1a138d8552e44baabb6c5dab50a59eabb1bd5aad Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 11:48:19 -0700 Subject: [PATCH 03/13] Added a SampleHeaders class to abstract the representation of loaded sample headers SampleHeaders is a subclass of the TileDBVCFDataset class since it is intended to be instantiated and loaded exclusively by TileDBVCFDataset. Internally SampleHeaders uses MD5 checksums and maps to only load unique headers and to associate samples with these unique headers, respectively. --- libtiledbvcf/src/dataset/tiledbvcfdataset.cc | 266 ++++++++++++------- libtiledbvcf/src/dataset/tiledbvcfdataset.h | 87 +++++- 2 files changed, 247 insertions(+), 106 deletions(-) diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc index 1727df8e7..f5c45774f 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc @@ -25,13 +25,17 @@ */ #include +#include #include #include +#include #include #include +#include #include "base64/base64.h" #include "dataset/tiledbvcfdataset.h" +#include "md5/md5.h" #include "read/export_format.h" #include "read/reader.h" #include "stats/allele_count.h" @@ -723,16 +727,15 @@ void TileDBVCFDataset::load_field_type_maps() const { uint32_t first_sample_id = metadata_.sample_ids.at(first_sample_name); SampleAndId first_sample = {first_sample_name, first_sample_id}; - std::unordered_map hdrs; - hdrs = fetch_vcf_headers({first_sample}); + SampleHeaders hdrs = fetch_vcf_headers({first_sample}); if (hdrs.size() != 1) throw std::runtime_error( "Error loading dataset field types; no headers fetched."); - const auto& hdr_ptr = hdrs.at(first_sample_id); - bcf_hdr_t* hdr = hdr_ptr.get(); - for (int i = 0; i < hdr->n[BCF_DT_ID]; i++) { - bcf_idpair_t* idpair = hdr->id[BCF_DT_ID] + i; + SafeBCFHdr hdr = hdrs.get_sample_header(first_sample_name); + bcf_hdr_t* hdr_ptr = hdr.release(); + for (int i = 0; i < hdr_ptr->n[BCF_DT_ID]; i++) { + bcf_idpair_t* idpair = hdr_ptr->id[BCF_DT_ID] + i; if (idpair == nullptr) throw std::runtime_error( "Error loading dataset field types; null idpair."); @@ -745,12 +748,12 @@ void TileDBVCFDataset::load_field_type_maps() const { bool is_fmt = idpair->val->hrec[2] != nullptr; if (is_info) { - int type = bcf_hdr_id2type(hdr, idpair->val->hrec[1]->type, i); + int type = bcf_hdr_id2type(hdr_ptr, idpair->val->hrec[1]->type, i); info_field_types_[name] = type; } if (is_fmt) { - int type = bcf_hdr_id2type(hdr, idpair->val->hrec[2]->type, i); + int type = bcf_hdr_id2type(hdr_ptr, idpair->val->hrec[2]->type, i); fmt_field_types_[name] = type; } } @@ -758,31 +761,35 @@ void TileDBVCFDataset::load_field_type_maps() const { info_fmt_field_types_loaded_ = true; } +void TileDBVCFDataset::load_field_type_maps_v4(bool add_iaf) const { + SampleHeaders hdrs = fetch_vcf_headers_v4({}, false, true); + if (hdrs.empty()) + return; + else if (hdrs.size() != 1) + throw std::runtime_error( + "Error loading dataset field types; no headers fetched."); + SafeBCFHdr hdr = hdrs.first(); + TileDBVCFDataset::load_field_type_maps_v4(hdr.release(), add_iaf); +} + void TileDBVCFDataset::load_field_type_maps_v4( const bcf_hdr_t* hdr, bool add_iaf) const { - utils::UniqueWriteLock lck_(const_cast(&type_field_rw_lock_)); // After we acquire the write lock we need to check if another thread has // loaded the field types if (info_fmt_field_types_loaded_ && (!add_iaf || info_iaf_field_type_added_)) return; - std::unordered_map hdrs; - if (hdr == nullptr) { - hdrs = fetch_vcf_headers_v4({}, nullptr, false, true); - - if (hdrs.empty()) - return; - else if (hdrs.size() != 1) - throw std::runtime_error( - "Error loading dataset field types; no headers fetched."); - - hdr = hdrs.begin()->second.get(); + if (!hdr) { + load_field_type_maps_v4(add_iaf); + return; } + utils::UniqueWriteLock lck_(const_cast(&type_field_rw_lock_)); + + // TODO: duplicate header and modify duplicate rather than modifying passed + // header if (add_iaf) { int header_appended = bcf_hdr_append( - // TODO: do something better than promoting this pointer type; - // perhaps the header should be duplicated and later modified const_cast(hdr), "##INFO="); @@ -791,8 +798,6 @@ void TileDBVCFDataset::load_field_type_maps_v4( "Error appending to header for internal allele frequency."); } if (bcf_hdr_append( - // TODO: do something better than promoting this pointer type; - // perhaps the header should be duplicated and later modified const_cast(hdr), "##INFO=") < @@ -801,8 +806,6 @@ void TileDBVCFDataset::load_field_type_maps_v4( "Error appending to header for internal allele count."); } if (bcf_hdr_append( - // TODO: do something better than promoting this pointer type; - // perhaps the header should be duplicated and later modified const_cast(hdr), "##INFO=") < @@ -1208,9 +1211,24 @@ std::shared_ptr TileDBVCFDataset::open_data_array( data_array_uri(root_uri_, false, true)); } -std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( +size_t TileDBVCFDataset::SampleHeaders::empty() const { + return sample_index_lookup.empty(); +} + +size_t TileDBVCFDataset::SampleHeaders::size() const { + return sample_index_lookup.size(); +} + +SafeBCFHdr TileDBVCFDataset::SampleHeaders::first() const { + if (empty()) { + return SafeBCFHdr(nullptr, bcf_hdr_destroy); + } + const std::string& first_header_sample = sample_index_lookup.begin()->first; + return get_sample_header(first_header_sample); +} + +TileDBVCFDataset::SampleHeaders TileDBVCFDataset::fetch_vcf_headers_v4( const std::vector& samples, - std::unordered_map* lookup_map, bool all_samples, bool first_sample) const { LOG_DEBUG( @@ -1237,7 +1255,7 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( "Cannot set first_sample and samples list in same fetch vcf headers " "request"); - std::unordered_map result; + SampleHeaders headers; if (vcf_header_array_ == nullptr) throw std::runtime_error( @@ -1273,7 +1291,7 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( } } - uint32_t sample_idx = 0; + size_t sample_idx = 0; while (!mq->is_complete()) { mq->submit(); auto num_rows = mq->results()->num_rows(); @@ -1294,41 +1312,9 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( // Create a string from the string_view to guarantee null termination // as required by htslib APIs. auto hdr_str = std::string(mq->string_view("header", i)); - auto sample = std::string(mq->string_view("sample", i)); + auto sample_name = std::string(mq->string_view("sample", i)); - bcf_hdr_t* hdr = bcf_hdr_init("r"); - if (!hdr) { - throw std::runtime_error( - "Error fetching VCF header data; error allocating VCF header."); - } - - if (0 != bcf_hdr_parse(hdr, const_cast(hdr_str.c_str()))) { - throw std::runtime_error( - "TileDBVCFDataset::fetch_vcf_headers_v4: Error parsing the BCF " - "header for sample " + - sample + "."); - } - - if (!sample.empty()) { - if (0 != bcf_hdr_add_sample(hdr, sample.c_str())) { - throw std::runtime_error( - "TileDBVCFDataset::fetch_vcf_headers_v4: Error adding sample " - "to " - "BCF header for sample " + - sample + "."); - } - } - - if (bcf_hdr_sync(hdr) < 0) { - throw std::runtime_error( - "Error in bcftools: failed to update VCF header."); - } - - result.emplace( - std::make_pair(sample_idx, SafeBCFHdr(hdr, bcf_hdr_destroy))); - if (lookup_map != nullptr) { - (*lookup_map)[sample] = sample_idx; - } + headers.set_sample_header(hdr_str.c_str(), sample_name, sample_idx); ++sample_idx; @@ -1344,10 +1330,108 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers_v4( tiledb::Stats::enable(); LOG_DEBUG( "[fetch_vcf_headers_v4] done (VmRSS = {})", utils::memory_usage_str()); - return result; + return headers; +} + +void TileDBVCFDataset::SampleHeaders::set_sample_header( + const char* hdr_str, const std::string& sample_name, size_t sample_idx) { + // Compute the MD5 checksum for the header + // TODO: Can we use the MD5 digest directly as the map's hash rather + // than hashing the hex string? + md5::md5_string(hdr_str, md5_buffer); + std::string hdr_md5 = md5::md5_to_hex(md5_buffer); + + // Get the header's unique index and save the header, if necessary + size_t hdr_idx; + if (!md5_lookup.contains(hdr_md5)) { + hdr_idx = unique_headers.size(); + md5_lookup[hdr_md5] = hdr_idx; + unique_headers.emplace_back(hdr_str); + } else { + hdr_idx = md5_lookup[hdr_md5]; + // Check for MD5 collisions + const std::string& unique_hdr = unique_headers[hdr_idx]; + if (unique_hdr.compare(hdr_str)) { + auto unique_hdr_filter = [this, hdr_idx](const std::string& sample_name) { + Indexes& indexes = sample_index_lookup[sample_name]; + return indexes.header_idx == hdr_idx; + }; + auto samples_view = std::views::keys(sample_index_lookup); + std::vector unique_hdr_samples; + std::copy_if( + samples_view.begin(), + samples_view.end(), + std::back_inserter(unique_hdr_samples), + unique_hdr_filter); + std::string unqiue_hdr_samples_str = utils::join(unique_hdr_samples, ','); + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::set_sample_header: MD5 for " + "sample " + + sample_name + + " header exists but header is different. Collision with header for " + "samples: " + + unqiue_hdr_samples_str); + } + } + + // Save the sample's unique header index and insertion order + sample_index_lookup.emplace(sample_name, Indexes{hdr_idx, sample_idx}); +} + +SafeBCFHdr TileDBVCFDataset::SampleHeaders::get_sample_header( + const std::string& sample) const { + bcf_hdr_t* hdr = bcf_hdr_init("r"); + if (!hdr) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error fetching " + "VCF header data; error allocating VCF header."); + } + + const size_t hdr_idx = sample_index_lookup.at(sample).header_idx; + const std::string& hdr_str = unique_headers[hdr_idx]; + + if (0 != bcf_hdr_parse(hdr, const_cast(hdr_str.c_str()))) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error parsing the " + "BCF header for sample " + + sample + "."); + } + + if (!sample.empty()) { + if (0 != bcf_hdr_add_sample(hdr, sample.c_str())) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error adding " + "sample to BCF header for sample " + + sample + "."); + } + } + + if (bcf_hdr_sync(hdr) < 0) { + throw std::runtime_error( + "TileDBVCFDataset::SampleHeaders::get_sample_header: Error in " + "bcftools: failed to update VCF header."); + } + + return SafeBCFHdr(hdr, bcf_hdr_destroy); +} + +std::vector +TileDBVCFDataset::SampleHeaders::header_ordered_samples() const { + auto samples = std::views::keys(sample_index_lookup); + std::vector sorted_samples{samples.begin(), samples.end()}; + std::sort( + sorted_samples.begin(), + sorted_samples.end(), + // sort samples by header array index + [this](const auto& sample_a, const auto& sample_b) { + SampleHeaders::Indexes indexes_a = sample_index_lookup.at(sample_a); + SampleHeaders::Indexes indexes_b = sample_index_lookup.at(sample_b); + return indexes_a.header_idx < indexes_b.header_idx; + }); + return sorted_samples; } -std::unordered_map TileDBVCFDataset::fetch_vcf_headers( +TileDBVCFDataset::SampleHeaders TileDBVCFDataset::fetch_vcf_headers( const std::vector& samples) const { // Grab a read lock of concurrency so we don't destroy the vcf_header_array // during fetching @@ -1357,11 +1441,12 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers( if (!tiledb_stats_enabled_vcf_header_) tiledb::Stats::disable(); - std::unordered_map result; + SampleHeaders headers; if (vcf_header_array_ == nullptr) throw std::runtime_error( - "Cannot fetch TileDB-VCF vcf headers; Array object unexpectedly null"); + "Cannot fetch TileDB-VCF vcf headers; Array object unexpectedly " + "null"); Query query(*ctx_, *vcf_header_array_); Subarray subarray = @@ -1434,7 +1519,8 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers( for (size_t offset_idx = 0; offset_idx < num_offsets; ++offset_idx) { // Get sample - uint32_t sample = sample_data[offset_idx]; + size_t sample_idx = sample_data[offset_idx]; + const std::string& sample_name = metadata_.sample_names_[sample_idx]; char* beg_hdr = data.data() + offsets[offset_idx]; uint64_t hdr_size = @@ -1445,38 +1531,16 @@ std::unordered_map TileDBVCFDataset::fetch_vcf_headers( memcpy(hdr_str, beg_hdr, hdr_size); hdr_str[hdr_size] = '\0'; - bcf_hdr_t* hdr = bcf_hdr_init("r"); - if (!hdr) - throw std::runtime_error( - "Error fetching VCF header data; error allocating VCF header."); - - if (0 != bcf_hdr_parse(hdr, hdr_str)) { - throw std::runtime_error( - "TileDBVCFDataset::fetch_vcf_headers: Error parsing the BCF " - "header for sample " + - std::to_string(sample) + "."); - } + headers.set_sample_header(hdr_str, sample_name, sample_idx); std::free(hdr_str); - - if (0 != - bcf_hdr_add_sample(hdr, metadata_.sample_names_[sample].c_str())) { - throw std::runtime_error("Error adding the sample."); - } - - if (bcf_hdr_sync(hdr) < 0) - throw std::runtime_error( - "Error in bcftools: failed to update VCF header."); - - result.emplace( - std::make_pair(sample, SafeBCFHdr(hdr, bcf_hdr_destroy))); } } } while (status == Query::Status::INCOMPLETE); if (tiledb_stats_enabled_) tiledb::Stats::enable(); - return result; -} // namespace vcf + return headers; +} std::vector TileDBVCFDataset::all_contigs() const { std::vector result; @@ -1539,18 +1603,14 @@ std::list TileDBVCFDataset::all_contigs_list() const { } std::vector TileDBVCFDataset::all_contigs_v4() const { - std::unordered_map hdrs = - fetch_vcf_headers_v4({}, nullptr, false, true); - + SampleHeaders hdrs = fetch_vcf_headers_v4({}, false, true); if (hdrs.empty()) return {}; else if (hdrs.size() != 1) throw std::runtime_error( "Error loading dataset field types; no headers fetched."); - - const auto& hdr = hdrs.begin()->second; - - return VCFUtils::hdr_get_contigs_regions(hdr.get()); + SafeBCFHdr hdr = hdrs.first(); + return VCFUtils::hdr_get_contigs_regions(hdr.release()); } std::list TileDBVCFDataset::all_contigs_list_v4() const { @@ -2235,7 +2295,7 @@ std::map TileDBVCFDataset::info_field_types() const { load_field_type_maps(); else { assert(metadata_.version == Version::V4); - load_field_type_maps_v4(nullptr); + load_field_type_maps_v4(); } lck_.lock(); } @@ -2250,7 +2310,7 @@ std::map TileDBVCFDataset::fmt_field_types() const { load_field_type_maps(); else { assert(metadata_.version == Version::V4); - load_field_type_maps_v4(nullptr); + load_field_type_maps_v4(); } lck_.lock(); } diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.h b/libtiledbvcf/src/dataset/tiledbvcfdataset.h index a142e7a63..577a16dbd 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.h +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.h @@ -209,6 +209,87 @@ class TileDBVCFDataset { /* PUBLIC DATATYPES */ /* ********************************* */ + /** + * This class stores headers and associates samples with them in a memory + * efficient manner. + */ + class SampleHeaders { + private: + // Saves indexes for each sample + struct Indexes { + size_t header_idx; // index in unique_headers + size_t tiledb_idx; // index in TileDB header array + }; + // Save unique headers as strings for on-demand bcf_hdr_t parsing + std::vector unique_headers; + // Use the same buffer for all MD5s to avoid reallocations + uint8_t md5_buffer[16]; + // Maps MD5s to indexes in unqiue_headers + std::unordered_map md5_lookup; + // Maps sample names to their indexes + std::unordered_map sample_index_lookup; + + /** + * Sets the header for a particular sample. + * + * @param hdr_str The header to be set + * @param sample_name The name of the sample the header is being set for + * @param sample_idx The TileDB index of the sample being set + */ + void set_sample_header( + const char* hdr_str, const std::string& sample_name, size_t sample_idx); + + public: + friend class TileDBVCFDataset; + + /** + * Clears the instantiated data structures. + */ + void clear() { + unique_headers.clear(); + md5_lookup.clear(); + sample_index_lookup.clear(); + } + + /** + * Checks if no headers are loaded. + * + * @return True if no headers are loaded + */ + size_t empty() const; + + /** + * Gets the number of headers loaded. + * + * @return The number of headers loaded + */ + size_t size() const; + + /** + * Gets the "first" header via the internal lookup map's begin() + * iterator. + * + * @return The first header + */ + SafeBCFHdr first() const; + + /** + * Gets the header for the given sample name. + * + * @param sample The name of the sample to get the header for + * @return The first header + */ + SafeBCFHdr get_sample_header(const std::string& sample) const; + + /** + * Gets sample names in the order their heaeders are returned by the + * TileDB header array query. + * + * @return A vector containing the ordered sample names + */ + std::vector header_ordered_samples() const; + }; + /** * The format version. */ @@ -367,7 +448,7 @@ class TileDBVCFDataset { std::string root_uri() const; - std::unordered_map fetch_vcf_headers( + SampleHeaders fetch_vcf_headers( const std::vector& samples) const; /** @@ -376,9 +457,8 @@ class TileDBVCFDataset { * @param lookup_map * @return */ - std::unordered_map fetch_vcf_headers_v4( + SampleHeaders fetch_vcf_headers_v4( const std::vector& samples, - std::unordered_map* lookup_map, bool all_samples, bool first_sample) const; @@ -1014,6 +1094,7 @@ class TileDBVCFDataset { /** * Populate the metadata maps of info/fmt field name -> htslib types. */ + void load_field_type_maps_v4(bool add_iaf = false) const; void load_field_type_maps_v4( const bcf_hdr_t* hdr, bool add_iaf = false) const; From 7cdc920f0a497fb5d50adbed4b4a544f671768a7 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 11:53:54 -0700 Subject: [PATCH 04/13] Updated sample header code to use new SampleHeaders class --- libtiledbvcf/src/read/pvcf_exporter.cc | 11 ++----- libtiledbvcf/src/read/pvcf_exporter.h | 4 +-- libtiledbvcf/src/read/reader.cc | 41 ++++++++++---------------- libtiledbvcf/src/read/reader.h | 5 +--- libtiledbvcf/src/vcf/vcf_merger.cc | 21 ++++++------- libtiledbvcf/src/vcf/vcf_merger.h | 5 ++-- libtiledbvcf/src/vcf/vcf_utils.cc | 8 +++-- libtiledbvcf/src/vcf/vcf_utils.h | 2 +- 8 files changed, 39 insertions(+), 58 deletions(-) diff --git a/libtiledbvcf/src/read/pvcf_exporter.cc b/libtiledbvcf/src/read/pvcf_exporter.cc index c6006b8ee..130df6cef 100644 --- a/libtiledbvcf/src/read/pvcf_exporter.cc +++ b/libtiledbvcf/src/read/pvcf_exporter.cc @@ -59,15 +59,8 @@ PVCFExporter::PVCFExporter(const std::string& output_uri, ExportFormat fmt) PVCFExporter::~PVCFExporter() { } -void PVCFExporter::init( - const std::unordered_map& hdrs_lookup, - const std::unordered_map& hdrs) { - // sort sample names to match the order returned by tiledb - std::vector> sorted_hdrs( - hdrs_lookup.begin(), hdrs_lookup.end()); - std::sort(sorted_hdrs.begin(), sorted_hdrs.end()); - - merger_.init(sorted_hdrs, hdrs); +void PVCFExporter::init(const TileDBVCFDataset::SampleHeaders& headers) { + merger_.init(headers); std::string mode = "w" + fmt_code_; fp_.reset(bcf_open(uri_.c_str(), mode.c_str())); diff --git a/libtiledbvcf/src/read/pvcf_exporter.h b/libtiledbvcf/src/read/pvcf_exporter.h index ed50e6dfc..b2cb45c32 100644 --- a/libtiledbvcf/src/read/pvcf_exporter.h +++ b/libtiledbvcf/src/read/pvcf_exporter.h @@ -40,9 +40,7 @@ class PVCFExporter : public Exporter { ~PVCFExporter(); - void init( - const std::unordered_map& hdrs_lookup, - const std::unordered_map& hdrs); + void init(const TileDBVCFDataset::SampleHeaders& heaeders); void reset() override; diff --git a/libtiledbvcf/src/read/reader.cc b/libtiledbvcf/src/read/reader.cc index 67298988c..37d6901d9 100644 --- a/libtiledbvcf/src/read/reader.cc +++ b/libtiledbvcf/src/read/reader.cc @@ -650,6 +650,7 @@ bool Reader::next_read_batch_v2_v3() { read_state_.region_idx = 0; // Headers + // TODO: Is the clear necessary? read_state_.current_hdrs.clear(); // Sample handles @@ -798,15 +799,13 @@ bool Reader::next_read_batch_v4() { // Fetch new headers for new sample batch if (read_state_.need_headers) { + // TODO: Is the clear necessary? read_state_.current_hdrs.clear(); read_state_.current_hdrs = dataset_->fetch_vcf_headers_v4( - read_state_.current_sample_batches, - &read_state_.current_hdrs_lookup, - read_state_.all_samples, - false); + read_state_.current_sample_batches, read_state_.all_samples, false); if (params_.export_combined_vcf) { static_cast(exporter_.get()) - ->init(read_state_.current_hdrs_lookup, read_state_.current_hdrs); + ->init(read_state_.current_hdrs); } } } @@ -1194,8 +1193,9 @@ bool Reader::read_current_batch() { if (dataset_->metadata().version == TileDBVCFDataset::Version::V3 || dataset_->metadata().version == TileDBVCFDataset::Version::V2) { for (const auto& s : read_state_.sample_batches[read_state_.batch_idx]) { - bcf_hdr_t* hdr = read_state_.current_hdrs.at(s.sample_id).get(); - exporter_->finalize_export(s, hdr); + SafeBCFHdr hdr = + read_state_.current_hdrs.get_sample_header(s.sample_name); + exporter_->finalize_export(s, hdr.release()); } } else { assert(dataset_->metadata().version == TileDBVCFDataset::Version::V4); @@ -1204,10 +1204,11 @@ bool Reader::read_current_batch() { // finalize the export if (read_state_.query_contig_batch_idx == read_state_.query_regions_v4.size() - 1) { - for (const auto& s : read_state_.current_hdrs_lookup) { - std::string sample_name = s.first; - bcf_hdr_t* hdr = read_state_.current_hdrs.at(s.second).get(); - exporter_->finalize_export(SampleAndId{sample_name, 0}, hdr); + // TODO: Replace with iterator of underlying map to avoid copy + for (const auto& s : + read_state_.current_hdrs.header_ordered_samples()) { + SafeBCFHdr hdr = read_state_.current_hdrs.get_sample_header(s); + exporter_->finalize_export(SampleAndId{s, 0}, hdr.release()); } } } @@ -1720,40 +1721,30 @@ bool Reader::report_cell( } SampleAndId sample; - uint64_t hdr_index = 0; const auto& results = read_state_.query_results; if (dataset_->metadata().version == TileDBVCFDataset::Version::V2 || dataset_->metadata().version == TileDBVCFDataset::Version::V3) { uint32_t samp_idx = results.buffers()->sample().value(cell_idx); - // Skip this cell if we are not reporting its sample. if (read_state_.current_samples.count(samp_idx) == 0) { return true; } sample = read_state_.current_samples[samp_idx]; - hdr_index = samp_idx; } else { assert(dataset_->metadata().version == TileDBVCFDataset::Version::V4); uint64_t size = 0; const char* sample_name = results.buffers()->sample_name().value(cell_idx, &size); sample = SampleAndId{std::string(sample_name, size)}; - hdr_index = read_state_.current_hdrs_lookup[sample.sample_name]; } - bcf_hdr_t* hdr_ptr = nullptr; + SafeBCFHdr hdr(nullptr, bcf_hdr_destroy); if (read_state_.need_headers) { - auto hdr_iter = read_state_.current_hdrs.find(hdr_index); - if (hdr_iter == read_state_.current_hdrs.end()) - throw std::runtime_error( - "Could not find VCF header for " + sample.sample_name + - " in report_cell"); - - const auto& hdr = read_state_.current_hdrs.at(hdr_index); - hdr_ptr = hdr.get(); + hdr = read_state_.current_hdrs.get_sample_header(sample.sample_name); } + // TODO: Is releasing the pointer the best thing to do heree? if (!exporter_->export_record( - sample, hdr_ptr, region, contig_offset, results, cell_idx)) + sample, hdr.release(), region, contig_offset, results, cell_idx)) return false; // If no overflow, increment num records count. diff --git a/libtiledbvcf/src/read/reader.h b/libtiledbvcf/src/read/reader.h index 6e9be6b7b..83f093f80 100644 --- a/libtiledbvcf/src/read/reader.h +++ b/libtiledbvcf/src/read/reader.h @@ -36,7 +36,6 @@ #include #include -#include #include "dataset/attribute_buffer_set.h" #include "dataset/tiledbvcfdataset.h" @@ -603,9 +602,7 @@ class Reader { std::unordered_map current_samples; /** Map of current relative sample ID -> VCF header instance. */ - std::unordered_map current_hdrs; - - std::unordered_map current_hdrs_lookup; + TileDBVCFDataset::SampleHeaders current_hdrs; /** * Stores the index to a region that was unsuccessfully reported diff --git a/libtiledbvcf/src/vcf/vcf_merger.cc b/libtiledbvcf/src/vcf/vcf_merger.cc index f60348048..0a9259ca1 100644 --- a/libtiledbvcf/src/vcf/vcf_merger.cc +++ b/libtiledbvcf/src/vcf/vcf_merger.cc @@ -15,22 +15,23 @@ VCFMerger::VCFMerger() VCFMerger::~VCFMerger() { } -void VCFMerger::init( - const std::vector>& sorted_hdrs, - const std::unordered_map& hdr_map) { +void VCFMerger::init(const TileDBVCFDataset::SampleHeaders& headers) { hdr_.reset(bcf_hdr_init("w")); hdrs_.clear(); - for (const auto& [name, hdr_key] : sorted_hdrs) { - LOG_DEBUG("Adding sample_num {}: {}", hdrs_.size(), name); - sample_map_[name] = hdrs_.size(); - auto hdr = hdr_map.at(hdr_key).get(); + const std::vector sorted_samples = + headers.header_ordered_samples(); + for (const auto& sample : sorted_samples) { + LOG_DEBUG("Adding sample_num {}: {}", hdrs_.size(), sample); + sample_map_[sample] = hdrs_.size(); + // TODO: Is release the right thing to do here? + bcf_hdr_t* hdr = headers.get_sample_header(sample).release(); hdrs_.push_back(hdr); if (bcf_hdr_merge(hdr_.get(), hdr) == NULL) { - LOG_FATAL("Error merging header from sample: {}", name); + LOG_FATAL("Error merging header from sample: {}", sample); } - if (bcf_hdr_add_sample(hdr_.get(), name.c_str()) < 0) { - LOG_FATAL("Error adding sample to merged header: {}", name); + if (bcf_hdr_add_sample(hdr_.get(), sample.c_str()) < 0) { + LOG_FATAL("Error adding sample to merged header: {}", sample); } } diff --git a/libtiledbvcf/src/vcf/vcf_merger.h b/libtiledbvcf/src/vcf/vcf_merger.h index 082c10ed9..4d129bbee 100644 --- a/libtiledbvcf/src/vcf/vcf_merger.h +++ b/libtiledbvcf/src/vcf/vcf_merger.h @@ -9,6 +9,7 @@ #include #include +#include "dataset/tiledbvcfdataset.h" #include "utils/logger_public.h" #include "utils/utils.h" @@ -100,9 +101,7 @@ class VCFMerger { ~VCFMerger(); - void init( - const std::vector>& sorted_hdrs, - const std::unordered_map& hdr_map); + void init(const TileDBVCFDataset::SampleHeaders& headers); void reset(); diff --git a/libtiledbvcf/src/vcf/vcf_utils.cc b/libtiledbvcf/src/vcf/vcf_utils.cc index df12b4f6c..64ad5521a 100644 --- a/libtiledbvcf/src/vcf/vcf_utils.cc +++ b/libtiledbvcf/src/vcf/vcf_utils.cc @@ -147,7 +147,8 @@ std::map VCFUtils::hdr_get_contig_offsets( bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seqname.c_str(), 0); if (!hrec) throw std::invalid_argument( - "Cannot get contig offsets from header; error reading contig header " + "Cannot get contig offsets from header; error reading contig " + "header " "line " + std::to_string(i)); int j = bcf_hrec_find_key(hrec, "length"); @@ -164,7 +165,7 @@ std::map VCFUtils::hdr_get_contig_offsets( return offsets; } -std::vector VCFUtils::hdr_get_contigs_regions(bcf_hdr_t* hdr) { +std::vector VCFUtils::hdr_get_contigs_regions(const bcf_hdr_t* hdr) { std::vector contigs; if (!hdr) throw std::invalid_argument( @@ -178,7 +179,8 @@ std::vector VCFUtils::hdr_get_contigs_regions(bcf_hdr_t* hdr) { bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seqname.c_str(), 0); if (!hrec) throw std::invalid_argument( - "Cannot get contig offsets from header; error reading contig header " + "Cannot get contig offsets from header; error reading contig " + "header " "line " + std::to_string(i)); int j = bcf_hrec_find_key(hrec, "length"); diff --git a/libtiledbvcf/src/vcf/vcf_utils.h b/libtiledbvcf/src/vcf/vcf_utils.h index cb050ea4b..ba438928f 100644 --- a/libtiledbvcf/src/vcf/vcf_utils.h +++ b/libtiledbvcf/src/vcf/vcf_utils.h @@ -134,7 +134,7 @@ class VCFUtils { * @return Vector of contig regions */ static std::vector hdr_get_contigs_regions( - bcf_hdr_t* hdr); + const bcf_hdr_t* hdr); /** * Helper function that normalizes a sample name: From 3f20ecb417572cb3ce54966cc1fdc2348ae5aa3d Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 11:55:36 -0700 Subject: [PATCH 05/13] Updated VCF store unit tests to use new SampleHeaders class --- libtiledbvcf/test/src/unit-vcf-store.cc | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/libtiledbvcf/test/src/unit-vcf-store.cc b/libtiledbvcf/test/src/unit-vcf-store.cc index 802bcacca..9389bb782 100644 --- a/libtiledbvcf/test/src/unit-vcf-store.cc +++ b/libtiledbvcf/test/src/unit-vcf-store.cc @@ -112,15 +112,14 @@ TEST_CASE("TileDB-VCF: Test register", "[tiledbvcf][ingest]") { ds.metadata().sample_names_, Catch::Matchers::VectorContains(std::string("HG01762"))); - auto hdrs = - ds.fetch_vcf_headers_v4({{"HG01762", 0}}, nullptr, false, false); + auto hdrs = ds.fetch_vcf_headers_v4({{"HG01762", 0}}, false, false); REQUIRE(hdrs.size() == 1); - REQUIRE(bcf_hdr_nsamples(hdrs.at(0)) == 1); - REQUIRE(hdrs.at(0)->samples[0] == std::string("HG01762")); + auto safe_hdr = hdrs.get_sample_header("HG01762"); + REQUIRE(bcf_hdr_nsamples(safe_hdr.get()) == 1); + REQUIRE(safe_hdr.get()->samples[0] == std::string("HG01762")); - REQUIRE(ds.fmt_field_type("GQ", hdrs.at(0).get()) == BCF_HT_INT); - REQUIRE( - ds.info_field_type("BaseQRankSum", hdrs.at(0).get()) == BCF_HT_REAL); + REQUIRE(ds.fmt_field_type("GQ", safe_hdr.get()) == BCF_HT_INT); + REQUIRE(ds.info_field_type("BaseQRankSum", safe_hdr.get()) == BCF_HT_REAL); } // Ingest the samples @@ -149,12 +148,14 @@ TEST_CASE("TileDB-VCF: Test register", "[tiledbvcf][ingest]") { REQUIRE_THAT( ds.metadata().sample_names_, Catch::Matchers::Contains(samples)); - auto hdrs = ds.fetch_vcf_headers_v4( - {{"HG01762", 0}, {"HG00280", 1}}, nullptr, false, false); + auto hdrs = + ds.fetch_vcf_headers_v4({{"HG01762", 0}, {"HG00280", 1}}, false, false); REQUIRE(hdrs.size() == 2); std::vector expected_samples = {"HG01762", "HG00280"}; + auto safe_hdr1 = hdrs.get_sample_header("HG01762"); + auto safe_hdr2 = hdrs.get_sample_header("HG00280"); std::vector result_samples = { - hdrs.at(1)->samples[0], hdrs.at(0)->samples[0]}; + safe_hdr1.get()->samples[0], safe_hdr2.get()->samples[0]}; REQUIRE_THAT( expected_samples, Catch::Matchers::UnorderedEquals(result_samples)); } From 90e38f7674d000f5e2bad102ca33268cd84c49f7 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 14:14:11 -0700 Subject: [PATCH 06/13] unique_ptr instances returned by the SampleHeaders class are no longer released This is to prevent memory leaks while making the lifetime of these managed pointers more obvious. --- libtiledbvcf/src/dataset/tiledbvcfdataset.cc | 15 ++++++------ libtiledbvcf/src/read/reader.cc | 7 +++--- libtiledbvcf/src/vcf/vcf_merger.cc | 24 ++++++++++++-------- libtiledbvcf/src/vcf/vcf_merger.h | 3 +-- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc index f5c45774f..7d36edf6d 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc @@ -733,7 +733,7 @@ void TileDBVCFDataset::load_field_type_maps() const { "Error loading dataset field types; no headers fetched."); SafeBCFHdr hdr = hdrs.get_sample_header(first_sample_name); - bcf_hdr_t* hdr_ptr = hdr.release(); + bcf_hdr_t* hdr_ptr = hdr.get(); for (int i = 0; i < hdr_ptr->n[BCF_DT_ID]; i++) { bcf_idpair_t* idpair = hdr_ptr->id[BCF_DT_ID] + i; if (idpair == nullptr) @@ -769,22 +769,21 @@ void TileDBVCFDataset::load_field_type_maps_v4(bool add_iaf) const { throw std::runtime_error( "Error loading dataset field types; no headers fetched."); SafeBCFHdr hdr = hdrs.first(); - TileDBVCFDataset::load_field_type_maps_v4(hdr.release(), add_iaf); + TileDBVCFDataset::load_field_type_maps_v4(hdr.get(), add_iaf); } void TileDBVCFDataset::load_field_type_maps_v4( const bcf_hdr_t* hdr, bool add_iaf) const { - // After we acquire the write lock we need to check if another thread has - // loaded the field types - if (info_fmt_field_types_loaded_ && (!add_iaf || info_iaf_field_type_added_)) - return; - if (!hdr) { load_field_type_maps_v4(add_iaf); return; } utils::UniqueWriteLock lck_(const_cast(&type_field_rw_lock_)); + // After we acquire the write lock we need to check if another thread has + // loaded the field types + if (info_fmt_field_types_loaded_ && (!add_iaf || info_iaf_field_type_added_)) + return; // TODO: duplicate header and modify duplicate rather than modifying passed // header @@ -1610,7 +1609,7 @@ std::vector TileDBVCFDataset::all_contigs_v4() const { throw std::runtime_error( "Error loading dataset field types; no headers fetched."); SafeBCFHdr hdr = hdrs.first(); - return VCFUtils::hdr_get_contigs_regions(hdr.release()); + return VCFUtils::hdr_get_contigs_regions(hdr.get()); } std::list TileDBVCFDataset::all_contigs_list_v4() const { diff --git a/libtiledbvcf/src/read/reader.cc b/libtiledbvcf/src/read/reader.cc index 37d6901d9..24b9ee0a0 100644 --- a/libtiledbvcf/src/read/reader.cc +++ b/libtiledbvcf/src/read/reader.cc @@ -1195,7 +1195,7 @@ bool Reader::read_current_batch() { for (const auto& s : read_state_.sample_batches[read_state_.batch_idx]) { SafeBCFHdr hdr = read_state_.current_hdrs.get_sample_header(s.sample_name); - exporter_->finalize_export(s, hdr.release()); + exporter_->finalize_export(s, hdr.get()); } } else { assert(dataset_->metadata().version == TileDBVCFDataset::Version::V4); @@ -1208,7 +1208,7 @@ bool Reader::read_current_batch() { for (const auto& s : read_state_.current_hdrs.header_ordered_samples()) { SafeBCFHdr hdr = read_state_.current_hdrs.get_sample_header(s); - exporter_->finalize_export(SampleAndId{s, 0}, hdr.release()); + exporter_->finalize_export(SampleAndId{s, 0}, hdr.get()); } } } @@ -1742,9 +1742,8 @@ bool Reader::report_cell( if (read_state_.need_headers) { hdr = read_state_.current_hdrs.get_sample_header(sample.sample_name); } - // TODO: Is releasing the pointer the best thing to do heree? if (!exporter_->export_record( - sample, hdr.release(), region, contig_offset, results, cell_idx)) + sample, hdr.get(), region, contig_offset, results, cell_idx)) return false; // If no overflow, increment num records count. diff --git a/libtiledbvcf/src/vcf/vcf_merger.cc b/libtiledbvcf/src/vcf/vcf_merger.cc index 0a9259ca1..c1cca0930 100644 --- a/libtiledbvcf/src/vcf/vcf_merger.cc +++ b/libtiledbvcf/src/vcf/vcf_merger.cc @@ -24,15 +24,14 @@ void VCFMerger::init(const TileDBVCFDataset::SampleHeaders& headers) { for (const auto& sample : sorted_samples) { LOG_DEBUG("Adding sample_num {}: {}", hdrs_.size(), sample); sample_map_[sample] = hdrs_.size(); - // TODO: Is release the right thing to do here? - bcf_hdr_t* hdr = headers.get_sample_header(sample).release(); - hdrs_.push_back(hdr); - if (bcf_hdr_merge(hdr_.get(), hdr) == NULL) { + SafeBCFHdr hdr = headers.get_sample_header(sample); + if (bcf_hdr_merge(hdr_.get(), hdr.get()) == NULL) { LOG_FATAL("Error merging header from sample: {}", sample); } if (bcf_hdr_add_sample(hdr_.get(), sample.c_str()) < 0) { LOG_FATAL("Error adding sample to merged header: {}", sample); } + hdrs_.push_back(std::move(hdr)); } if (bcf_hdr_sync(hdr_.get()) < 0) { @@ -199,7 +198,7 @@ void VCFMerger::merge_alleles(int sample_num, bcf1_t* input) { std::tuple VCFMerger::get_number_type_values( int id, int hdr_type, int sample_num) { - auto hdr = sample_num > 0 ? hdrs_[sample_num] : hdr_.get(); + bcf_hdr_t* hdr = sample_num > 0 ? hdrs_[sample_num].get() : hdr_.get(); int number = bcf_hdr_id2length(hdr, hdr_type, id); int type = bcf_hdr_id2type(hdr, hdr_type, id); @@ -295,7 +294,7 @@ void VCFMerger::finish_info(SafeBCFRec& rec) { // merge FORMAT:GT and INFO:AC,AN,DP for (const auto& [sample_num, rec] : md_.samples) { int values_read = - bcf_get_genotypes(hdrs_[sample_num], rec.get(), &dst_, &ndst_); + bcf_get_genotypes(hdrs_[sample_num].get(), rec.get(), &dst_, &ndst_); if (values_read == 1) { md_.gts[2 * sample_num + 1] = bcf_int32_vector_end; @@ -330,8 +329,8 @@ void VCFMerger::finish_info(SafeBCFRec& rec) { } // add sample read depth to total depth - if (bcf_get_info_int32(hdrs_[sample_num], rec.get(), "DP", &dst_, &ndst_) > - 0) { + if (bcf_get_info_int32( + hdrs_[sample_num].get(), rec.get(), "DP", &dst_, &ndst_) > 0) { md_.depth_total += *dst_; } } @@ -347,7 +346,12 @@ void VCFMerger::finish_info(SafeBCFRec& rec) { (void)vector_end; // unused uint32_t values_read = bcf_get_info_values( - hdrs_[sample_num], rec.get(), key_str, (void**)(&dst_), &ndst_, type); + hdrs_[sample_num].get(), + rec.get(), + key_str, + (void**)(&dst_), + &ndst_, + type); int* data = (int*)(dst_); // set expected values to number of values read @@ -493,7 +497,7 @@ void VCFMerger::finish_format(SafeBCFRec& rec) { int* hts_dst = nullptr; int hts_ndst = 0; int values_read = bcf_get_format_values( - hdrs_[sample_num], + hdrs_[sample_num].get(), rec_in.get(), key_str, (void**)(&hts_dst), diff --git a/libtiledbvcf/src/vcf/vcf_merger.h b/libtiledbvcf/src/vcf/vcf_merger.h index 4d129bbee..7079781b9 100644 --- a/libtiledbvcf/src/vcf/vcf_merger.h +++ b/libtiledbvcf/src/vcf/vcf_merger.h @@ -249,8 +249,7 @@ class VCFMerger { SafeBCFHdr hdr_; // vector of sample VCF headers, indexed by sample number - // TODO: update ReadState to use shared_ptrs - std::vector hdrs_; + std::vector hdrs_; // combined VCF record SafeBCFRec rec_; From a29e6b1ed1c00af788f5dd2932f391c63072adc9 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 14:43:55 -0700 Subject: [PATCH 07/13] Added a samples_view() method to the SampleHeaders class This allows the names of samples stored in a SampleHeaders instance to be efficiently iterated via a view of the keys of an internal map. --- libtiledbvcf/src/dataset/tiledbvcfdataset.cc | 7 +++---- libtiledbvcf/src/dataset/tiledbvcfdataset.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc index 7d36edf6d..f15411326 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -1355,11 +1354,11 @@ void TileDBVCFDataset::SampleHeaders::set_sample_header( Indexes& indexes = sample_index_lookup[sample_name]; return indexes.header_idx == hdr_idx; }; - auto samples_view = std::views::keys(sample_index_lookup); + auto samples = samples_view(); std::vector unique_hdr_samples; std::copy_if( - samples_view.begin(), - samples_view.end(), + samples.begin(), + samples.end(), std::back_inserter(unique_hdr_samples), unique_hdr_filter); std::string unqiue_hdr_samples_str = utils::join(unique_hdr_samples, ','); diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.h b/libtiledbvcf/src/dataset/tiledbvcfdataset.h index 577a16dbd..5613f2dfd 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.h +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -281,6 +282,16 @@ class TileDBVCFDataset { */ SafeBCFHdr get_sample_header(const std::string& sample) const; + /** + * Returns the sample names stored as a view of the internal lookup map. + * + * @param sample The name of the sample to get the header for + * @return The first header + */ + auto samples_view() { + return std::views::keys(sample_index_lookup); + }; + /** * Gets sample names in the order their heaeders are returned by the * TileDB header array query. From 084c9cf1e66bfe04cf3a78ce7ba8d25775f2e4f8 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Tue, 13 Jan 2026 14:50:05 -0700 Subject: [PATCH 08/13] Updated Reader to use samples_view() method of SampleHeaders instance This allows the names of only samples that have headers loaded to be iterated efficiently, both in terms of run-time and memory consumption. --- libtiledbvcf/src/read/reader.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libtiledbvcf/src/read/reader.cc b/libtiledbvcf/src/read/reader.cc index 24b9ee0a0..a2fcfccef 100644 --- a/libtiledbvcf/src/read/reader.cc +++ b/libtiledbvcf/src/read/reader.cc @@ -1204,9 +1204,7 @@ bool Reader::read_current_batch() { // finalize the export if (read_state_.query_contig_batch_idx == read_state_.query_regions_v4.size() - 1) { - // TODO: Replace with iterator of underlying map to avoid copy - for (const auto& s : - read_state_.current_hdrs.header_ordered_samples()) { + for (const auto& s : read_state_.current_hdrs.samples_view()) { SafeBCFHdr hdr = read_state_.current_hdrs.get_sample_header(s); exporter_->finalize_export(SampleAndId{s, 0}, hdr.get()); } From 529c817b781af2ac06ebc5fc15baede22ff57a31 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 14 Jan 2026 10:49:54 -0700 Subject: [PATCH 09/13] Fixed MD5 stack overflow bug on macOS Specifically, the std::string(const char* s, size_t n) constructor on macOS seems to ignore the n parameter and overflows. This was mitigated by adding a terminating character to s and using the std::string(const char* s) constructor instead. --- libtiledbvcf/external/md5/md5.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libtiledbvcf/external/md5/md5.cc b/libtiledbvcf/external/md5/md5.cc index 1509976e6..ac6b6191b 100644 --- a/libtiledbvcf/external/md5/md5.cc +++ b/libtiledbvcf/external/md5/md5.cc @@ -206,9 +206,10 @@ void md5_to_hex(const uint8_t* input, char* output) { } std::string md5_to_hex(const uint8_t* input) { - char output[32]; + char output[33]; md5_to_hex(input, output); - return std::string(output, 32); + output[32] = '\0'; + return std::string(output); } } // namespace md5 From f6832c35c22d80fe115bdd92edecf4f878ee5682 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 14 Jan 2026 11:54:57 -0700 Subject: [PATCH 10/13] Fixed join() utils and added utils unit tests --- libtiledbvcf/src/utils/utils.cc | 11 +++-- libtiledbvcf/test/CMakeLists.txt | 1 + libtiledbvcf/test/src/unit-utils.cc | 65 +++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 libtiledbvcf/test/src/unit-utils.cc diff --git a/libtiledbvcf/src/utils/utils.cc b/libtiledbvcf/src/utils/utils.cc index 3d86d67ff..d0a5eddf2 100644 --- a/libtiledbvcf/src/utils/utils.cc +++ b/libtiledbvcf/src/utils/utils.cc @@ -95,14 +95,19 @@ std::set split_set(const std::string& s, char delim) { std::string join( const std::vector& v, char delim, bool skip_empty) { auto empty_filter = [skip_empty](const std::string& s) { - return skip_empty && s.empty(); + return (skip_empty && !s.empty()) || !skip_empty; }; auto filtered_v = std::views::filter(v, empty_filter); + if (filtered_v.empty()) + return ""; + auto operation = [delim](std::string a, std::string b) { + return a + delim + b; + }; std::string s = std::accumulate( std::next(filtered_v.begin()), filtered_v.end(), - v.at(0), - [delim](std::string a, std::string b) { return a + delim + b; }); + filtered_v.front(), + operation); return s; } diff --git a/libtiledbvcf/test/CMakeLists.txt b/libtiledbvcf/test/CMakeLists.txt index db5893039..a4d045efb 100644 --- a/libtiledbvcf/test/CMakeLists.txt +++ b/libtiledbvcf/test/CMakeLists.txt @@ -37,6 +37,7 @@ add_executable(tiledb_vcf_unit EXCLUDE_FROM_ALL ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-bitmap.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-c-api-reader.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-c-api-writer.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-vcf-export.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-vcf-delete.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/unit-vcf-iter.cc diff --git a/libtiledbvcf/test/src/unit-utils.cc b/libtiledbvcf/test/src/unit-utils.cc new file mode 100644 index 000000000..aed193ae6 --- /dev/null +++ b/libtiledbvcf/test/src/unit-utils.cc @@ -0,0 +1,65 @@ +/** + * @file unit-utils.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2026 TileDB Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests for utils. + */ + +#include "catch.hpp" + +#include "utils/utils.h" + +#include + +using namespace tiledb::vcf; + +TEST_CASE("TileDB-VCF: Test join util", "[tiledbvcf][utils]") { + std::vector v; + std::string expected_result = ""; + REQUIRE(utils::join(v, ',').compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', true).compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', false).compare(expected_result) == 0); + + v.emplace_back("a"); + v.emplace_back("b"); + v.emplace_back("c"); + expected_result = "a,b,c"; + REQUIRE(utils::join(v, ',').compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', true).compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', false).compare(expected_result) == 0); + + v.emplace_back(""); + v.emplace_back(""); + v.emplace_back("d"); + v.emplace_back(""); + expected_result = "a,b,c,d"; + REQUIRE(utils::join(v, ',').compare(expected_result) == 0); + REQUIRE(utils::join(v, ',', true).compare(expected_result) == 0); + expected_result = "a,b,c,,,d,"; + REQUIRE(utils::join(v, ',', false).compare(expected_result) == 0); +} From b4c8f51a6d287944e64084419dd6ca103558c7e2 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 14 Jan 2026 12:03:29 -0700 Subject: [PATCH 11/13] Made SampleHeaders::samples_view() a const function --- libtiledbvcf/src/dataset/tiledbvcfdataset.cc | 2 +- libtiledbvcf/src/dataset/tiledbvcfdataset.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc index f15411326..0275745b5 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc @@ -1415,7 +1415,7 @@ SafeBCFHdr TileDBVCFDataset::SampleHeaders::get_sample_header( std::vector TileDBVCFDataset::SampleHeaders::header_ordered_samples() const { - auto samples = std::views::keys(sample_index_lookup); + auto samples = samples_view(); std::vector sorted_samples{samples.begin(), samples.end()}; std::sort( sorted_samples.begin(), diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.h b/libtiledbvcf/src/dataset/tiledbvcfdataset.h index 5613f2dfd..4454b1c77 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.h +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.h @@ -288,7 +288,7 @@ class TileDBVCFDataset { * @param sample The name of the sample to get the header for * @return The first header */ - auto samples_view() { + auto samples_view() const { return std::views::keys(sample_index_lookup); }; From 4db485d9c1356290c50780c0dd63adbef8561a99 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 14 Jan 2026 12:11:41 -0700 Subject: [PATCH 12/13] The SampleHeaders::header_ordered_samples() comparison function now uses Indexes references, rather than copies --- libtiledbvcf/src/dataset/tiledbvcfdataset.cc | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc index 0275745b5..26d2eabe9 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.cc +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.cc @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -1417,15 +1418,13 @@ std::vector TileDBVCFDataset::SampleHeaders::header_ordered_samples() const { auto samples = samples_view(); std::vector sorted_samples{samples.begin(), samples.end()}; - std::sort( - sorted_samples.begin(), - sorted_samples.end(), - // sort samples by header array index - [this](const auto& sample_a, const auto& sample_b) { - SampleHeaders::Indexes indexes_a = sample_index_lookup.at(sample_a); - SampleHeaders::Indexes indexes_b = sample_index_lookup.at(sample_b); - return indexes_a.header_idx < indexes_b.header_idx; - }); + auto comparison = [this](const auto& sample_a, const auto& sample_b) { + const SampleHeaders::Indexes& indexes_a = sample_index_lookup.at(sample_a); + const SampleHeaders::Indexes& indexes_b = sample_index_lookup.at(sample_b); + return indexes_a.header_idx < indexes_b.header_idx; + }; + // sort samples by header array index + std::sort(sorted_samples.begin(), sorted_samples.end(), comparison); return sorted_samples; } From 0e02c903c0e7e780a2a9150cdfc6ecd9209d0297 Mon Sep 17 00:00:00 2001 From: Alan Cleary Date: Wed, 14 Jan 2026 12:14:27 -0700 Subject: [PATCH 13/13] Fixed misspellings of "headers" --- libtiledbvcf/src/dataset/tiledbvcfdataset.h | 2 +- libtiledbvcf/src/read/pvcf_exporter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libtiledbvcf/src/dataset/tiledbvcfdataset.h b/libtiledbvcf/src/dataset/tiledbvcfdataset.h index 4454b1c77..8765a3b94 100644 --- a/libtiledbvcf/src/dataset/tiledbvcfdataset.h +++ b/libtiledbvcf/src/dataset/tiledbvcfdataset.h @@ -293,7 +293,7 @@ class TileDBVCFDataset { }; /** - * Gets sample names in the order their heaeders are returned by the + * Gets sample names in the order their headers are returned by the * TileDB header array query. * * @return A vector containing the ordered sample names diff --git a/libtiledbvcf/src/read/pvcf_exporter.h b/libtiledbvcf/src/read/pvcf_exporter.h index b2cb45c32..5c5528cca 100644 --- a/libtiledbvcf/src/read/pvcf_exporter.h +++ b/libtiledbvcf/src/read/pvcf_exporter.h @@ -40,7 +40,7 @@ class PVCFExporter : public Exporter { ~PVCFExporter(); - void init(const TileDBVCFDataset::SampleHeaders& heaeders); + void init(const TileDBVCFDataset::SampleHeaders& headers); void reset() override;