From b4795a105cab04adfe377137fe58ff1915c5154f Mon Sep 17 00:00:00 2001 From: peter Date: Wed, 24 Sep 2025 13:31:37 -0700 Subject: [PATCH 1/2] update to tskit 1.0.0b2 omit integrity check in compute_mutation_parents --- core/species.cpp | 4 +- treerec/_README | 1 + treerec/tskit/core.c | 15 +- treerec/tskit/core.h | 20 +- treerec/tskit/tables.c | 538 ++++++++++++----------------------- treerec/tskit/tables.h | 34 +-- treerec/tskit/trees.c | 632 +++++++++++++++++++++++++++++++++-------- treerec/tskit/trees.h | 115 +++++--- 8 files changed, 829 insertions(+), 530 deletions(-) diff --git a/core/species.cpp b/core/species.cpp index cc3ee742..9ca1b541 100644 --- a/core/species.cpp +++ b/core/species.cpp @@ -7414,7 +7414,7 @@ void Species::WriteTreeSequence(std::string &p_recording_tree_path, bool p_simpl // Add in the mutation.parent information; valid tree sequences need parents, but we don't keep them while running ret = tsk_table_collection_build_index(&output_tables, 0); if (ret < 0) handle_error("tsk_table_collection_build_index", ret); - ret = tsk_table_collection_compute_mutation_parents(&output_tables, 0); + ret = tsk_table_collection_compute_mutation_parents(&output_tables, TSK_NO_CHECK_INTEGRITY); if (ret < 0) handle_error("tsk_table_collection_compute_mutation_parents", ret); { @@ -7801,7 +7801,7 @@ void Species::CrosscheckTreeSeqIntegrity(void) ret = tsk_table_collection_build_index(tables_copy, 0); if (ret < 0) handle_error("tsk_table_collection_build_index", ret); - ret = tsk_table_collection_compute_mutation_parents(tables_copy, 0); + ret = tsk_table_collection_compute_mutation_parents(tables_copy, TSK_NO_CHECK_INTEGRITY); if (ret < 0) handle_error("tsk_table_collection_compute_mutation_parents", ret); } diff --git a/treerec/_README b/treerec/_README index addbba27..45526099 100644 --- a/treerec/_README +++ b/treerec/_README @@ -8,6 +8,7 @@ The code in `tskit/kastore` is from [kastore/c](https://github.com/tskit-dev/kas **Changelog:** +- *24 September 
2025*: updated to tskit C_1.2.0 (=1.0.0b2) (still kastore C_2.1.1) - *15 April 2025*: updated to tskit C_1.1.4 (=0.6.2) (still kastore C_2.1.1) - *7 November 2023*: updated to tskit C_1.1.2 (=0.5.5) (still kastore C_2.1.1) - *2 June 2022*: updated to tskit C_1.0.0 and kastore C_2.1.1 diff --git a/treerec/tskit/core.c b/treerec/tskit/core.c index 886b52b2..53cc0ce6 100644 --- a/treerec/tskit/core.c +++ b/treerec/tskit/core.c @@ -168,7 +168,8 @@ tsk_strerror_internal(int err) break; case TSK_ERR_FILE_VERSION_TOO_OLD: ret = "tskit file version too old. Please upgrade using the " - "'tskit upgrade' command. (TSK_ERR_FILE_VERSION_TOO_OLD)"; + "'tskit upgrade' command from tskit version<0.6.2. " + "(TSK_ERR_FILE_VERSION_TOO_OLD)"; break; case TSK_ERR_FILE_VERSION_TOO_NEW: ret = "tskit file version is too new for this instance. " @@ -346,6 +347,12 @@ tsk_strerror_internal(int err) "(TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME)"; break; + case TSK_ERR_BAD_MUTATION_PARENT: + ret = "A mutation's parent is not consistent with the topology of the tree. " + "Use compute_mutation_parents to set the parents correctly." + "(TSK_ERR_BAD_MUTATION_PARENT)"; + break; + /* Migration errors */ case TSK_ERR_UNSORTED_MIGRATIONS: ret = "Migrations must be sorted by time. (TSK_ERR_UNSORTED_MIGRATIONS)"; @@ -519,9 +526,13 @@ tsk_strerror_internal(int err) "(TSK_ERR_BAD_SAMPLE_PAIR_TIMES)"; break; case TSK_ERR_BAD_TIME_WINDOWS: - ret = "Time windows must be strictly increasing and end at infinity. " + ret = "Time windows must start at zero and be strictly increasing. " "(TSK_ERR_BAD_TIME_WINDOWS)"; break; + case TSK_ERR_BAD_TIME_WINDOWS_END: + ret = "Time windows must end at infinity for this method. " + "(TSK_ERR_BAD_TIME_WINDOWS_END)"; + break; case TSK_ERR_BAD_NODE_TIME_WINDOW: ret = "Node time does not fall within assigned time window. 
" "(TSK_ERR_BAD_NODE_TIME_WINDOW)"; diff --git a/treerec/tskit/core.h b/treerec/tskit/core.h index 14664678..7dd24eba 100644 --- a/treerec/tskit/core.h +++ b/treerec/tskit/core.h @@ -147,12 +147,12 @@ sizes and types of externally visible structs. The library minor version. Incremented when non-breaking backward-compatible changes to the API or ABI are introduced, i.e., the addition of a new function. */ -#define TSK_VERSION_MINOR 1 +#define TSK_VERSION_MINOR 2 /** The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. */ -#define TSK_VERSION_PATCH 4 +#define TSK_VERSION_PATCH 0 /** @} */ /* @@ -283,7 +283,7 @@ A file could not be read because it is in the wrong format /** The file is in tskit format, but the version is too old for the library to read. The file should be upgraded to the latest version -using the ``tskit upgrade`` command line utility. +using the ``tskit upgrade`` command line utility from tskit version<0.6.2. */ #define TSK_ERR_FILE_VERSION_TOO_OLD -101 /** @@ -510,6 +510,12 @@ Some mutations have TSK_UNKNOWN_TIME in an algorithm where that's disallowed (use compute_mutation_times?). */ #define TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME -510 + +/** +A mutation's parent was not consistent with the topology of the tree. 
+ */ +#define TSK_ERR_BAD_MUTATION_PARENT -511 + /** @} */ /** @@ -740,13 +746,17 @@ Sample times do not all equal the start of first time window */ #define TSK_ERR_BAD_SAMPLE_PAIR_TIMES -923 /** -Time windows are not strictly increasing ending at infinity +Time windows are not strictly increasing */ #define TSK_ERR_BAD_TIME_WINDOWS -924 /** +Time windows do not end at infinity +*/ +#define TSK_ERR_BAD_TIME_WINDOWS_END -925 +/** Node time does not fall within assigned time window */ -#define TSK_ERR_BAD_NODE_TIME_WINDOW -925 +#define TSK_ERR_BAD_NODE_TIME_WINDOW -926 /** @} */ /** diff --git a/treerec/tskit/tables.c b/treerec/tskit/tables.c index 4b35ff5c..9805d669 100644 --- a/treerec/tskit/tables.c +++ b/treerec/tskit/tables.c @@ -6756,7 +6756,8 @@ typedef struct { typedef struct { tsk_mutation_t mut; int num_descendants; -} mutation_canonical_sort_t; + double node_time; +} mutation_sort_t; typedef struct { tsk_individual_t ind; @@ -6797,39 +6798,30 @@ cmp_site(const void *a, const void *b) static int cmp_mutation(const void *a, const void *b) { - const tsk_mutation_t *ia = (const tsk_mutation_t *) a; - const tsk_mutation_t *ib = (const tsk_mutation_t *) b; - /* Compare mutations by site */ - int ret = (ia->site > ib->site) - (ia->site < ib->site); - /* Within a particular site sort by time if known, then ID. 
This ensures that - * relative ordering within a site is maintained */ - if (ret == 0 && !tsk_is_unknown_time(ia->time) && !tsk_is_unknown_time(ib->time)) { - ret = (ia->time < ib->time) - (ia->time > ib->time); - } - if (ret == 0) { - ret = (ia->id > ib->id) - (ia->id < ib->id); - } - return ret; -} - -static int -cmp_mutation_canonical(const void *a, const void *b) -{ - const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a; - const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b; + const mutation_sort_t *ia = (const mutation_sort_t *) a; + const mutation_sort_t *ib = (const mutation_sort_t *) b; /* Compare mutations by site */ int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); + + /* Within a particular site sort by time if known */ if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) && !tsk_is_unknown_time(ib->mut.time)) { ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); } + /* Or node times when mutation times are unknown or equal */ + if (ret == 0) { + ret = (ia->node_time < ib->node_time) - (ia->node_time > ib->node_time); + } + /* If node times are equal, sort by number of descendants */ if (ret == 0) { ret = (ia->num_descendants < ib->num_descendants) - (ia->num_descendants > ib->num_descendants); } + /* If number of descendants are equal, sort by node */ if (ret == 0) { ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); } + /* Final tiebreaker: ID */ if (ret == 0) { ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); } @@ -7056,77 +7048,15 @@ tsk_table_sorter_sort_sites(tsk_table_sorter_t *self) static int tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t ret_id, parent, mapped_parent; - tsk_mutation_table_t *mutations = &self->tables->mutations; - tsk_size_t num_mutations = mutations->num_rows; - tsk_mutation_table_t copy; - tsk_mutation_t *sorted_mutations - = tsk_malloc(num_mutations * 
sizeof(*sorted_mutations)); - tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); - - ret = tsk_mutation_table_copy(mutations, ©, 0); - if (ret != 0) { - goto out; - } - if (mutation_id_map == NULL || sorted_mutations == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - - for (j = 0; j < num_mutations; j++) { - tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, sorted_mutations + j); - sorted_mutations[j].site = self->site_id_map[sorted_mutations[j].site]; - } - ret = tsk_mutation_table_clear(mutations); - if (ret != 0) { - goto out; - } - - qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), - cmp_mutation); - - /* Make a first pass through the sorted mutations to build the ID map. */ - for (j = 0; j < num_mutations; j++) { - mutation_id_map[sorted_mutations[j].id] = (tsk_id_t) j; - } - - for (j = 0; j < num_mutations; j++) { - mapped_parent = TSK_NULL; - parent = sorted_mutations[j].parent; - if (parent != TSK_NULL) { - mapped_parent = mutation_id_map[parent]; - } - ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].site, - sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time, - sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length, - sorted_mutations[j].metadata, sorted_mutations[j].metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; - -out: - tsk_safe_free(mutation_id_map); - tsk_safe_free(sorted_mutations); - tsk_mutation_table_free(©); - return ret; -} - -static int -tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) { int ret = 0; tsk_size_t j; tsk_id_t ret_id, parent, mapped_parent, p; tsk_mutation_table_t *mutations = &self->tables->mutations; + tsk_node_table_t *nodes = &self->tables->nodes; tsk_size_t num_mutations = mutations->num_rows; tsk_mutation_table_t copy; - mutation_canonical_sort_t *sorted_mutations + mutation_sort_t *sorted_mutations = tsk_malloc(num_mutations * 
sizeof(*sorted_mutations)); tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); @@ -7158,6 +7088,7 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) for (j = 0; j < num_mutations; j++) { tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; + sorted_mutations[j].node_time = nodes->time[sorted_mutations[j].mut.node]; } ret = tsk_mutation_table_clear(mutations); if (ret != 0) { @@ -7165,7 +7096,7 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) } qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), - cmp_mutation_canonical); + cmp_mutation); /* Make a first pass through the sorted mutations to build the ID map. */ for (j = 0; j < num_mutations; j++) { @@ -10986,11 +10917,159 @@ tsk_table_collection_check_index_integrity(const tsk_table_collection_t *self) return ret; } +static int TSK_WARN_UNUSED +tsk_table_collection_compute_mutation_parents_to_array( + const tsk_table_collection_t *self, tsk_id_t *mutation_parent) +{ + int ret = 0; + const tsk_id_t *I, *O; + const tsk_edge_table_t edges = self->edges; + const tsk_node_table_t nodes = self->nodes; + const tsk_site_table_t sites = self->sites; + const tsk_mutation_table_t mutations = self->mutations; + const tsk_id_t M = (tsk_id_t) edges.num_rows; + tsk_id_t tj, tk; + tsk_id_t *parent = NULL; + tsk_id_t *bottom_mutation = NULL; + tsk_id_t u; + double left, right; + tsk_id_t site; + /* Using unsigned values here avoids potentially undefined behaviour */ + tsk_size_t j, mutation, first_mutation; + + parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); + bottom_mutation = tsk_malloc(nodes.num_rows * sizeof(*bottom_mutation)); + if (parent == NULL || bottom_mutation == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); + 
tsk_memset(bottom_mutation, 0xff, nodes.num_rows * sizeof(*bottom_mutation)); + tsk_memset(mutation_parent, 0xff, self->mutations.num_rows * sizeof(tsk_id_t)); + + I = self->indexes.edge_insertion_order; + O = self->indexes.edge_removal_order; + tj = 0; + tk = 0; + site = 0; + mutation = 0; + left = 0; + while (tj < M || left < self->sequence_length) { + while (tk < M && edges.right[O[tk]] == left) { + parent[edges.child[O[tk]]] = TSK_NULL; + tk++; + } + while (tj < M && edges.left[I[tj]] == left) { + parent[edges.child[I[tj]]] = edges.parent[I[tj]]; + tj++; + } + right = self->sequence_length; + if (tj < M) { + right = TSK_MIN(right, edges.left[I[tj]]); + } + if (tk < M) { + right = TSK_MIN(right, edges.right[O[tk]]); + } + + /* Tree is now ready. We look at each site on this tree in turn */ + while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { + /* Create a mapping from mutations to nodes. If we see more than one + * mutation at a node, the previously seen one must be the parent + * of the current since we assume they are in order. */ + first_mutation = mutation; + while (mutation < mutations.num_rows && mutations.site[mutation] == site) { + u = mutations.node[mutation]; + if (bottom_mutation[u] != TSK_NULL) { + mutation_parent[mutation] = bottom_mutation[u]; + } + bottom_mutation[u] = (tsk_id_t) mutation; + mutation++; + } + /* Make the common case of 1 mutation fast */ + if (mutation > first_mutation + 1) { + /* If we have more than one mutation, compute the parent for each + * one by traversing up the tree until we find a node that has a + * mutation. 
*/ + for (j = first_mutation; j < mutation; j++) { + if (mutation_parent[j] == TSK_NULL) { + u = parent[mutations.node[j]]; + while (u != TSK_NULL && bottom_mutation[u] == TSK_NULL) { + u = parent[u]; + } + if (u != TSK_NULL) { + mutation_parent[j] = bottom_mutation[u]; + } + } + } + } + /* Reset the mapping for the next site */ + for (j = first_mutation; j < mutation; j++) { + u = mutations.node[j]; + bottom_mutation[u] = TSK_NULL; + /* Check that we haven't violated the sortedness property */ + if (mutation_parent[j] > (tsk_id_t) j) { + ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_AFTER_CHILD); + goto out; + } + } + site++; + } + /* Move on to the next tree */ + left = right; + } + +out: + tsk_safe_free(parent); + tsk_safe_free(bottom_mutation); + return ret; +} + +static int TSK_WARN_UNUSED +tsk_table_collection_check_mutation_parents(const tsk_table_collection_t *self) +{ + int ret = 0; + tsk_mutation_table_t mutations = self->mutations; + tsk_id_t *new_parents = NULL; + tsk_size_t j; + + if (mutations.num_rows == 0) { + return ret; + } + + new_parents = tsk_malloc(mutations.num_rows * sizeof(*new_parents)); + if (new_parents == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + + ret = tsk_table_collection_compute_mutation_parents_to_array(self, new_parents); + if (ret != 0) { + goto out; + } + + for (j = 0; j < mutations.num_rows; j++) { + if (mutations.parent[j] != new_parents[j]) { + ret = tsk_trace_error(TSK_ERR_BAD_MUTATION_PARENT); + goto out; + } + } + +out: + tsk_safe_free(new_parents); + return ret; +} + tsk_id_t TSK_WARN_UNUSED tsk_table_collection_check_integrity( const tsk_table_collection_t *self, tsk_flags_t options) { tsk_id_t ret = 0; + int mut_ret = 0; + + if (options & TSK_CHECK_MUTATION_PARENTS) { + /* If we're checking mutation parents, we need to check the trees first */ + options |= TSK_CHECK_TREES; + } if (options & TSK_CHECK_TREES) { /* Checking the trees implies these checks */ @@ -11043,6 +11122,14 @@ 
tsk_table_collection_check_integrity( if (ret < 0) { goto out; } + /* This check requires tree integrity so do it last */ + if (options & TSK_CHECK_MUTATION_PARENTS) { + mut_ret = tsk_table_collection_check_mutation_parents(self); + if (mut_ret != 0) { + ret = mut_ret; + goto out; + } + } } out: return ret; @@ -12245,7 +12332,7 @@ tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t opti if (ret != 0) { goto out; } - sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical; + sorter.sort_mutations = tsk_table_sorter_sort_mutations; sorter.sort_individuals = tsk_table_sorter_sort_individuals_canonical; nodes = tsk_malloc(self->nodes.num_rows * sizeof(*nodes)); @@ -12346,117 +12433,25 @@ tsk_table_collection_deduplicate_sites( int TSK_WARN_UNUSED tsk_table_collection_compute_mutation_parents( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) + tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; - tsk_id_t num_trees; - const tsk_id_t *I, *O; - const tsk_edge_table_t edges = self->edges; - const tsk_node_table_t nodes = self->nodes; - const tsk_site_table_t sites = self->sites; - const tsk_mutation_table_t mutations = self->mutations; - const tsk_id_t M = (tsk_id_t) edges.num_rows; - tsk_id_t tj, tk; - tsk_id_t *parent = NULL; - tsk_id_t *bottom_mutation = NULL; - tsk_id_t u; - double left, right; - tsk_id_t site; - /* Using unsigned values here avoids potentially undefined behaviour */ - tsk_size_t j, mutation, first_mutation; - - /* Set the mutation parent to TSK_NULL so that we don't check the - * parent values we are about to write over. 
*/ - tsk_memset(mutations.parent, 0xff, mutations.num_rows * sizeof(*mutations.parent)); - num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); - bottom_mutation = tsk_malloc(nodes.num_rows * sizeof(*bottom_mutation)); - if (parent == NULL || bottom_mutation == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); - tsk_memset(bottom_mutation, 0xff, nodes.num_rows * sizeof(*bottom_mutation)); - tsk_memset(mutations.parent, 0xff, self->mutations.num_rows * sizeof(tsk_id_t)); - I = self->indexes.edge_insertion_order; - O = self->indexes.edge_removal_order; - tj = 0; - tk = 0; - site = 0; - mutation = 0; - left = 0; - while (tj < M || left < self->sequence_length) { - while (tk < M && edges.right[O[tk]] == left) { - parent[edges.child[O[tk]]] = TSK_NULL; - tk++; - } - while (tj < M && edges.left[I[tj]] == left) { - parent[edges.child[I[tj]]] = edges.parent[I[tj]]; - tj++; - } - right = self->sequence_length; - if (tj < M) { - right = TSK_MIN(right, edges.left[I[tj]]); - } - if (tk < M) { - right = TSK_MIN(right, edges.right[O[tk]]); + if (!(options & TSK_NO_CHECK_INTEGRITY)) { + /* Safe to cast here as we're not counting trees */ + ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); + if (ret < 0) { + goto out; } + } - /* Tree is now ready. We look at each site on this tree in turn */ - while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { - /* Create a mapping from mutations to nodes. If we see more than one - * mutation at a node, the previously seen one must be the parent - * of the current since we assume they are in order. 
*/ - first_mutation = mutation; - while (mutation < mutations.num_rows && mutations.site[mutation] == site) { - u = mutations.node[mutation]; - if (bottom_mutation[u] != TSK_NULL) { - mutations.parent[mutation] = bottom_mutation[u]; - } - bottom_mutation[u] = (tsk_id_t) mutation; - mutation++; - } - /* Make the common case of 1 mutation fast */ - if (mutation > first_mutation + 1) { - /* If we have more than one mutation, compute the parent for each - * one by traversing up the tree until we find a node that has a - * mutation. */ - for (j = first_mutation; j < mutation; j++) { - if (mutations.parent[j] == TSK_NULL) { - u = parent[mutations.node[j]]; - while (u != TSK_NULL && bottom_mutation[u] == TSK_NULL) { - u = parent[u]; - } - if (u != TSK_NULL) { - mutations.parent[j] = bottom_mutation[u]; - } - } - } - } - /* Reset the mapping for the next site */ - for (j = first_mutation; j < mutation; j++) { - u = mutations.node[j]; - bottom_mutation[u] = TSK_NULL; - /* Check that we haven't violated the sortedness property */ - if (mutations.parent[j] > (tsk_id_t) j) { - ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_AFTER_CHILD); - goto out; - } - } - site++; - } - /* Move on to the next tree */ - left = right; + ret = tsk_table_collection_compute_mutation_parents_to_array( + self, self->mutations.parent); + if (ret != 0) { + goto out; } out: - tsk_safe_free(parent); - tsk_safe_free(bottom_mutation); return ret; } @@ -12494,6 +12489,7 @@ tsk_table_collection_compute_mutation_times( for (j = 0; j < mutations.num_rows; j++) { mutations.time[j] = TSK_UNKNOWN_TIME; } + /* TSK_CHECK_MUTATION_PARENTS isn't needed here as we're not using the parents */ num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); if (num_trees < 0) { ret = (int) num_trees; @@ -13448,165 +13444,3 @@ tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output out: return ret; } - -/* ======================================================== * - * Tree diff iterator. 
- * ======================================================== */ - -int TSK_WARN_UNUSED -tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, - tsk_id_t num_trees, tsk_flags_t options) -{ - int ret = 0; - - tsk_bug_assert(tables != NULL); - tsk_memset(self, 0, sizeof(tsk_diff_iter_t)); - self->num_nodes = tables->nodes.num_rows; - self->num_edges = tables->edges.num_rows; - self->tables = tables; - self->insertion_index = 0; - self->removal_index = 0; - self->tree_left = 0; - self->tree_index = -1; - if (num_trees < 0) { - num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; - } - } - self->last_index = num_trees; - - if (options & TSK_INCLUDE_TERMINAL) { - self->last_index = self->last_index + 1; - } - self->edge_list_nodes = tsk_malloc(self->num_edges * sizeof(*self->edge_list_nodes)); - if (self->edge_list_nodes == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } -out: - return ret; -} - -int -tsk_diff_iter_free(tsk_diff_iter_t *self) -{ - tsk_safe_free(self->edge_list_nodes); - return 0; -} - -void -tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out) -{ - fprintf(out, "tree_diff_iterator state\n"); - fprintf(out, "num_edges = %lld\n", (long long) self->num_edges); - fprintf(out, "insertion_index = %lld\n", (long long) self->insertion_index); - fprintf(out, "removal_index = %lld\n", (long long) self->removal_index); - fprintf(out, "tree_left = %f\n", self->tree_left); - fprintf(out, "tree_index = %lld\n", (long long) self->tree_index); -} - -int TSK_WARN_UNUSED -tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, - tsk_edge_list_t *edges_out_ret, tsk_edge_list_t *edges_in_ret) -{ - int ret = 0; - tsk_id_t k; - const double sequence_length = self->tables->sequence_length; - double left = self->tree_left; - double right = sequence_length; - tsk_size_t next_edge_list_node = 0; - 
tsk_edge_list_node_t *out_head = NULL; - tsk_edge_list_node_t *out_tail = NULL; - tsk_edge_list_node_t *in_head = NULL; - tsk_edge_list_node_t *in_tail = NULL; - tsk_edge_list_node_t *w = NULL; - tsk_edge_list_t edges_out; - tsk_edge_list_t edges_in; - const tsk_edge_table_t *edges = &self->tables->edges; - const tsk_id_t *insertion_order = self->tables->indexes.edge_insertion_order; - const tsk_id_t *removal_order = self->tables->indexes.edge_removal_order; - - tsk_memset(&edges_out, 0, sizeof(edges_out)); - tsk_memset(&edges_in, 0, sizeof(edges_in)); - - if (self->tree_index + 1 < self->last_index) { - /* First we remove the stale records */ - while (self->removal_index < (tsk_id_t) self->num_edges - && left == edges->right[removal_order[self->removal_index]]) { - k = removal_order[self->removal_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (out_head == NULL) { - out_head = w; - out_tail = w; - } else { - out_tail->next = w; - w->prev = out_tail; - out_tail = w; - } - self->removal_index++; - } - edges_out.head = out_head; - edges_out.tail = out_tail; - - /* Now insert the new records */ - while (self->insertion_index < (tsk_id_t) self->num_edges - && left == edges->left[insertion_order[self->insertion_index]]) { - k = insertion_order[self->insertion_index]; - tsk_bug_assert(next_edge_list_node < self->num_edges); - w = &self->edge_list_nodes[next_edge_list_node]; - next_edge_list_node++; - w->edge.id = k; - w->edge.left = edges->left[k]; - w->edge.right = edges->right[k]; - w->edge.parent = edges->parent[k]; - 
w->edge.child = edges->child[k]; - w->edge.metadata = edges->metadata + edges->metadata_offset[k]; - w->edge.metadata_length - = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; - w->next = NULL; - w->prev = NULL; - if (in_head == NULL) { - in_head = w; - in_tail = w; - } else { - in_tail->next = w; - w->prev = in_tail; - in_tail = w; - } - self->insertion_index++; - } - edges_in.head = in_head; - edges_in.tail = in_tail; - - right = sequence_length; - if (self->insertion_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); - } - if (self->removal_index < (tsk_id_t) self->num_edges) { - right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); - } - self->tree_index++; - ret = TSK_TREE_OK; - } - *edges_out_ret = edges_out; - *edges_in_ret = edges_in; - *ret_left = left; - *ret_right = right; - /* Set the left coordinate for the next tree */ - self->tree_left = right; - return ret; -} diff --git a/treerec/tskit/tables.h b/treerec/tskit/tables.h index 204e39e2..85ed29d5 100644 --- a/treerec/tskit/tables.h +++ b/treerec/tskit/tables.h @@ -158,6 +158,10 @@ typedef struct { /** @brief The ID of the edge that this mutation lies on, or TSK_NULL if there is no corresponding edge.*/ tsk_id_t edge; + /** @brief Inherited state. */ + const char *inherited_state; + /** @brief Size of the inherited state in bytes. 
*/ + tsk_size_t inherited_state_length; } tsk_mutation_t; /** @@ -682,18 +686,6 @@ typedef struct { tsk_edge_list_node_t *tail; } tsk_edge_list_t; -typedef struct { - tsk_size_t num_nodes; - tsk_size_t num_edges; - double tree_left; - const tsk_table_collection_t *tables; - tsk_id_t insertion_index; - tsk_id_t removal_index; - tsk_id_t tree_index; - tsk_id_t last_index; - tsk_edge_list_node_t *edge_list_nodes; -} tsk_diff_iter_t; - /****************************************************************************/ /* Common function options */ /****************************************************************************/ @@ -797,6 +789,11 @@ All checks needed to define a valid tree sequence. Note that this implies all of the above checks. */ #define TSK_CHECK_TREES (1 << 7) +/** +Check mutation parents are consistent with topology. +Implies TSK_CHECK_TREES. +*/ +#define TSK_CHECK_MUTATION_PARENTS (1 << 8) /* Leave room for more positive check flags */ /** @@ -4771,19 +4768,6 @@ int tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t a, void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out); int tsk_identity_segments_free(tsk_identity_segments_t *self); -/* Edge differences */ - -/* Internal API - currently used in a few places, but a better API is envisaged - * at some point. - * IMPORTANT: tskit-rust uses this API, so don't break without discussing! 
- */ -int tsk_diff_iter_init(tsk_diff_iter_t *self, const tsk_table_collection_t *tables, - tsk_id_t num_trees, tsk_flags_t options); -int tsk_diff_iter_free(tsk_diff_iter_t *self); -int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, - tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); -void tsk_diff_iter_print_state(const tsk_diff_iter_t *self, FILE *out); - #ifdef __cplusplus } #endif diff --git a/treerec/tskit/trees.c b/treerec/tskit/trees.c index 4cf479c3..7a159a7f 100644 --- a/treerec/tskit/trees.c +++ b/treerec/tskit/trees.c @@ -253,6 +253,13 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) const tsk_size_t num_nodes = self->tables->nodes.num_rows; const double *restrict site_position = self->tables->sites.position; const tsk_id_t *restrict mutation_site = self->tables->mutations.site; + const tsk_id_t *restrict mutation_parent = self->tables->mutations.parent; + const char *restrict sites_ancestral_state = self->tables->sites.ancestral_state; + const tsk_size_t *restrict sites_ancestral_state_offset + = self->tables->sites.ancestral_state_offset; + const char *restrict mutations_derived_state = self->tables->mutations.derived_state; + const tsk_size_t *restrict mutations_derived_state_offset + = self->tables->mutations.derived_state_offset; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_right = self->tables->edges.right; @@ -262,6 +269,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) bool discrete_breakpoints = true; tsk_id_t *node_edge_map = tsk_malloc(num_nodes * sizeof(*node_edge_map)); tsk_mutation_t *mutation; + tsk_id_t parent_id; self->tree_sites_length = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length)); @@ -311,6 +319,26 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) mutation_id < num_mutations && mutation_site[mutation_id] == site_id) { mutation = self->site_mutations_mem + mutation_id; 
mutation->edge = node_edge_map[mutation->node]; + + /* Compute inherited state */ + if (mutation_parent[mutation_id] == TSK_NULL) { + /* No parent: inherited state is the site's ancestral state */ + mutation->inherited_state + = sites_ancestral_state + sites_ancestral_state_offset[site_id]; + mutation->inherited_state_length + = sites_ancestral_state_offset[site_id + 1] + - sites_ancestral_state_offset[site_id]; + } else { + /* Has parent: inherited state is parent's derived state */ + parent_id = mutation_parent[mutation_id]; + mutation->inherited_state + = mutations_derived_state + + mutations_derived_state_offset[parent_id]; + mutation->inherited_state_length + = mutations_derived_state_offset[parent_id + 1] + - mutations_derived_state_offset[parent_id]; + } + mutation_id++; } site_id++; @@ -458,10 +486,29 @@ tsk_treeseq_init( goto out; } } - num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); - if (num_trees < 0) { - ret = (int) num_trees; - goto out; + + if (options & TSK_TS_INIT_COMPUTE_MUTATION_PARENTS) { + /* As tsk_table_collection_compute_mutation_parents performs an + integrity check, and we don't wish to do that twice we perform + our own check here */ + num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = (int) num_trees; + goto out; + } + + ret = tsk_table_collection_compute_mutation_parents( + self->tables, TSK_NO_CHECK_INTEGRITY); + if (ret != 0) { + goto out; + } + } else { + num_trees = tsk_table_collection_check_integrity( + self->tables, TSK_CHECK_TREES | TSK_CHECK_MUTATION_PARENTS); + if (num_trees < 0) { + ret = (int) num_trees; + goto out; + } } self->num_trees = (tsk_size_t) num_trees; self->discrete_genome = true; @@ -1237,6 +1284,33 @@ tsk_treeseq_check_windows(const tsk_treeseq_t *self, tsk_size_t num_windows, return ret; } +static int +tsk_treeseq_check_time_windows(tsk_size_t num_windows, const double *windows) +{ + // This does not check the last 
window ends at infinity, + // which is required for some time window functions. + int ret = TSK_ERR_BAD_TIME_WINDOWS; + tsk_size_t j; + + if (num_windows < 1) { + ret = TSK_ERR_BAD_TIME_WINDOWS_DIM; + goto out; + } + + if (windows[0] != 0.0) { + goto out; + } + + for (j = 0; j < num_windows; j++) { + if (windows[j] >= windows[j + 1]) { + goto out; + } + } + ret = 0; +out: + return ret; +} + /* TODO make these functions more consistent in how the arguments are ordered */ static inline void @@ -2225,7 +2299,7 @@ get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, } static int -norm_hap_weighted(tsk_size_t state_dim, const double *hap_weights, +norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; @@ -2233,7 +2307,7 @@ norm_hap_weighted(tsk_size_t state_dim, const double *hap_weights, double n; tsk_size_t k; - for (k = 0; k < state_dim; k++) { + for (k = 0; k < result_dim; k++) { weight_row = GET_2D_ROW(hap_weights, 3, k); n = (double) args.sample_set_sizes[k]; // TODO: what to do when n = 0 @@ -2243,12 +2317,38 @@ norm_hap_weighted(tsk_size_t state_dim, const double *hap_weights, } static int -norm_total_weighted(tsk_size_t state_dim, const double *TSK_UNUSED(hap_weights), +norm_hap_weighted_ij(tsk_size_t result_dim, const double *hap_weights, + tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *weight_row; + double ni, nj, wAB_i, wAB_j; + tsk_id_t i, j; + tsk_size_t k; + + for (k = 0; k < result_dim; k++) { + i = args.set_indexes[2 * k]; + j = args.set_indexes[2 * k + 1]; + ni = (double) args.sample_set_sizes[i]; + nj = (double) args.sample_set_sizes[j]; + weight_row = GET_2D_ROW(hap_weights, 3, i); + wAB_i = weight_row[0]; + weight_row = 
GET_2D_ROW(hap_weights, 3, j); + wAB_j = weight_row[0]; + + result[k] = (wAB_i + wAB_j) / (ni + nj); + } + + return 0; +} + +static int +norm_total_weighted(tsk_size_t result_dim, const double *TSK_UNUSED(hap_weights), tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) { tsk_size_t k; - for (k = 0; k < state_dim; k++) { + for (k = 0; k < result_dim; k++) { result[k] = 1 / (double) (n_a * n_b); } return 0; @@ -2268,9 +2368,6 @@ get_all_samples_bits(tsk_bit_array_t *all_samples, tsk_size_t n) } } -typedef int norm_func_t(tsk_size_t state_dim, const double *hap_weights, tsk_size_t n_a, - tsk_size_t n_b, double *result, void *params); - static int compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, const tsk_bit_array_t *site_b_state, tsk_size_t num_a_alleles, @@ -2290,14 +2387,15 @@ compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] tsk_size_t k, mut_a, mut_b; - tsk_size_t row_len = num_b_alleles * state_dim; + tsk_size_t result_row_len = num_b_alleles * result_dim; tsk_size_t w_A = 0, w_B = 0, w_AB = 0; uint8_t polarised_val = polarised ? 
1 : 0; double *hap_weight_row; double *result_tmp_row; double *weights = tsk_malloc(3 * state_dim * sizeof(*weights)); - double *norm = tsk_malloc(state_dim * sizeof(*norm)); - double *result_tmp = tsk_malloc(row_len * num_a_alleles * sizeof(*result_tmp)); + double *norm = tsk_malloc(result_dim * sizeof(*norm)); + double *result_tmp + = tsk_malloc(result_row_len * num_a_alleles * sizeof(*result_tmp)); tsk_memset(&ss_A_samples, 0, sizeof(ss_A_samples)); tsk_memset(&ss_B_samples, 0, sizeof(ss_B_samples)); @@ -2327,7 +2425,7 @@ compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, } for (mut_a = polarised_val; mut_a < num_a_alleles; mut_a++) { - result_tmp_row = GET_2D_ROW(result_tmp, row_len, mut_a); + result_tmp_row = GET_2D_ROW(result_tmp, result_row_len, mut_a); for (mut_b = polarised_val; mut_b < num_b_alleles; mut_b++) { tsk_bit_array_get_row(site_a_state, mut_a, &A_samples); tsk_bit_array_get_row(site_b_state, mut_b, &B_samples); @@ -2352,15 +2450,15 @@ compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, if (ret != 0) { goto out; } - ret = norm_f(state_dim, weights, num_a_alleles - polarised_val, + ret = norm_f(result_dim, weights, num_a_alleles - polarised_val, num_b_alleles - polarised_val, norm, f_params); if (ret != 0) { goto out; } - for (k = 0; k < state_dim; k++) { + for (k = 0; k < result_dim; k++) { result[k] += result_tmp_row[k] * norm[k]; } - result_tmp_row += state_dim; // Advance to the next column + result_tmp_row += result_dim; // Advance to the next column } } @@ -2538,8 +2636,8 @@ tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, get_site_row_col_indices( n_rows, row_sites, n_cols, col_sites, sites, &n_sites, row_idx, col_idx); - // We rely on n_sites to allocate these arrays, they're initialized to NULL for safe - // deallocation if the previous allocation fails + // We rely on n_sites to allocate these arrays, which are initialized + // to NULL for safe deallocation if 
the previous allocation fails num_alleles = tsk_malloc(n_sites * sizeof(*num_alleles)); site_offsets = tsk_malloc(n_sites * sizeof(*site_offsets)); if (num_alleles == NULL || site_offsets == NULL) { @@ -3195,7 +3293,7 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di return ret; } -static int +int tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, @@ -3209,7 +3307,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl tsk_bit_array_t sample_sets_bits; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); - // double default_windows[] = { 0, self->tables->sequence_length }; tsk_size_t state_dim = num_sample_sets; sample_count_stat_params_t f_params = { .sample_sets = sample_sets, .num_sample_sets = num_sample_sets, @@ -3232,17 +3329,15 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl ret = tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES); goto out; } - // TODO: impossible until we implement branch/windows - // if (result_dim < 1) { - // ret = tsk_trace_error(TSK_ERR_BAD_RESULT_DIMS); - // goto out; - // } ret = tsk_treeseq_check_sample_sets( self, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } - tsk_bug_assert(state_dim > 0); + if (result_dim < 1) { + ret = tsk_trace_error(TSK_ERR_BAD_RESULT_DIMS); + goto out; + } ret = sample_sets_to_bit_array( self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); if (ret != 0) { @@ -3467,49 +3562,57 @@ tsk_treeseq_site_allele_frequency_spectrum(const tsk_treeseq_t *self, return ret; } -static int TSK_WARN_UNUSED +static void tsk_treeseq_update_branch_afs(const tsk_treeseq_t *self, tsk_id_t u, double right, - const double *restrict branch_length, double *restrict 
last_update, - const double *counts, tsk_size_t num_sample_sets, tsk_size_t window_index, + double *restrict last_update, const double *restrict time, tsk_id_t *restrict parent, + tsk_size_t *restrict coordinate, const double *counts, tsk_size_t num_sample_sets, + tsk_size_t num_time_windows, const double *time_windows, tsk_size_t window_index, const tsk_size_t *result_dims, tsk_flags_t options, double *result) { - int ret = 0; tsk_size_t afs_size; tsk_size_t k; + tsk_size_t time_window_index; double *afs; - tsk_size_t *coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); bool polarised = !!(options & TSK_STAT_POLARISED); const double *count_row = GET_2D_ROW(counts, num_sample_sets + 1, u); - double x = (right - last_update[u]) * branch_length[u]; + double x = 0; + double t_u, t_v; + double tw_branch_length = 0; const tsk_size_t all_samples = (tsk_size_t) count_row[num_sample_sets]; - - if (coordinate == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - - if (0 < all_samples && all_samples < self->num_samples) { - afs_size = result_dims[num_sample_sets]; - afs = result + afs_size * window_index; - for (k = 0; k < num_sample_sets; k++) { - coordinate[k] = (tsk_size_t) count_row[k]; - } - if (!polarised) { - fold(coordinate, result_dims, num_sample_sets); + if (parent[u] != TSK_NULL) { + t_u = time[u]; + t_v = time[parent[u]]; + if (0 < all_samples && all_samples < self->num_samples) { + time_window_index = 0; + afs_size = result_dims[num_sample_sets]; + while (time_window_index < num_time_windows + && time_windows[time_window_index] < t_v) { + afs = result + + afs_size * (window_index * num_time_windows + time_window_index); + for (k = 0; k < num_sample_sets; k++) { + coordinate[k] = (tsk_size_t) count_row[k]; + } + if (!polarised) { + fold(coordinate, result_dims, num_sample_sets); + } + tw_branch_length + = TSK_MAX(0.0, TSK_MIN(time_windows[time_window_index + 1], t_v) + - TSK_MAX(time_windows[time_window_index], t_u)); + x = (right 
- last_update[u]) * tw_branch_length; + increment_nd_array_value( + afs, num_sample_sets, result_dims, coordinate, x); + time_window_index++; + } } - increment_nd_array_value(afs, num_sample_sets, result_dims, coordinate, x); } last_update[u] = right; -out: - tsk_safe_free(coordinate); - return ret; } static int tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, double *counts, tsk_size_t num_windows, - const double *windows, const tsk_size_t *result_dims, tsk_flags_t options, - double *result) + const double *windows, tsk_size_t num_time_windows, const double *time_windows, + const tsk_size_t *result_dims, tsk_flags_t options, double *result) { int ret = 0; tsk_id_t u, v; @@ -3527,6 +3630,7 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); double *restrict last_update = tsk_calloc(num_nodes, sizeof(*last_update)); double *restrict branch_length = tsk_calloc(num_nodes, sizeof(*branch_length)); + tsk_size_t *restrict coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); tsk_id_t tj, tk, h; double t_left, t_right, w_right; const tsk_size_t K = num_sample_sets + 1; @@ -3536,7 +3640,7 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, goto out; } - if (parent == NULL || last_update == NULL) { + if (parent == NULL || last_update == NULL || coordinate == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } @@ -3554,19 +3658,13 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, tk++; u = edge_child[h]; v = edge_parent[h]; - ret = tsk_treeseq_update_branch_afs(self, u, t_left, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, options, - result); - if (ret != 0) { - goto out; - } + tsk_treeseq_update_branch_afs(self, u, t_left, last_update, node_time, + parent, coordinate, counts, num_sample_sets, num_time_windows, + time_windows, 
window_index, result_dims, options, result); while (v != TSK_NULL) { - ret = tsk_treeseq_update_branch_afs(self, v, t_left, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, - options, result); - if (ret != 0) { - goto out; - } + tsk_treeseq_update_branch_afs(self, v, t_left, last_update, node_time, + parent, coordinate, counts, num_sample_sets, num_time_windows, + time_windows, window_index, result_dims, options, result); update_state(counts, K, v, u, -1); v = parent[v]; } @@ -3582,12 +3680,9 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, parent[u] = v; branch_length[u] = node_time[v] - node_time[u]; while (v != TSK_NULL) { - ret = tsk_treeseq_update_branch_afs(self, v, t_left, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, - options, result); - if (ret != 0) { - goto out; - } + tsk_treeseq_update_branch_afs(self, v, t_left, last_update, node_time, + parent, coordinate, counts, num_sample_sets, num_time_windows, + time_windows, window_index, result_dims, options, result); update_state(counts, K, v, u, +1); v = parent[v]; } @@ -3606,12 +3701,9 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, /* Flush the contributions of all nodes to the current window */ for (u = 0; u < (tsk_id_t) num_nodes; u++) { tsk_bug_assert(last_update[u] < w_right); - ret = tsk_treeseq_update_branch_afs(self, u, w_right, branch_length, - last_update, counts, num_sample_sets, window_index, result_dims, - options, result); - if (ret != 0) { - goto out; - } + tsk_treeseq_update_branch_afs(self, u, w_right, last_update, node_time, + parent, coordinate, counts, num_sample_sets, num_time_windows, + time_windows, window_index, result_dims, options, result); } window_index++; } @@ -3629,6 +3721,9 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, if (branch_length != NULL) { free(branch_length); } + if (coordinate != NULL) { + free(coordinate); + } return 
ret; } @@ -3636,13 +3731,15 @@ int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result) + tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, + double *result) { int ret = 0; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); bool stat_node = !!(options & TSK_STAT_NODE); const double default_windows[] = { 0, self->tables->sequence_length }; + const double default_time_windows[] = { 0, INFINITY }; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_size_t K = num_sample_sets + 1; tsk_size_t j, k, l, afs_size; @@ -3652,7 +3749,6 @@ tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, * reuse code from the general_stats code paths. */ double *counts = NULL; double *count_row; - if (stat_node) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); goto out; @@ -3676,13 +3772,27 @@ tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, goto out; } } + if (time_windows == NULL) { + num_time_windows = 1; + time_windows = default_time_windows; + } else { + ret = tsk_treeseq_check_time_windows(num_time_windows, time_windows); + if (ret != 0) { + goto out; + } + // Site mode does not support time windows + if (stat_site && !(time_windows[0] == 0.0 && isinf((float) time_windows[1]))) { + ret = TSK_ERR_UNSUPPORTED_STAT_MODE; + goto out; + } + } ret = tsk_treeseq_check_sample_sets( self, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } - /* the last element of result_dims stores the total size of the dimenensions */ + /* the last element of result_dims stores the total size of the dimensions */ result_dims = tsk_malloc((num_sample_sets + 1) * sizeof(*result_dims)); counts = tsk_calloc(num_nodes * K, sizeof(*counts)); if (counts == NULL || result_dims == 
NULL) { @@ -3711,19 +3821,20 @@ tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, count_row[num_sample_sets] = 1; } result_dims[num_sample_sets] = (tsk_size_t) afs_size; + tsk_memset(result, 0, num_windows * num_time_windows * afs_size * sizeof(*result)); - tsk_memset(result, 0, num_windows * afs_size * sizeof(*result)); if (stat_site) { ret = tsk_treeseq_site_allele_frequency_spectrum(self, num_sample_sets, sample_set_sizes, counts, num_windows, windows, result_dims, options, result); } else { ret = tsk_treeseq_branch_allele_frequency_spectrum(self, num_sample_sets, counts, - num_windows, windows, result_dims, options, result); + num_windows, windows, num_time_windows, time_windows, result_dims, options, + result); } if (options & TSK_STAT_SPAN_NORMALISE) { - span_normalise(num_windows, windows, afs_size, result); + span_normalise(num_windows, windows, afs_size * num_time_windows, result); } out: tsk_safe_free(counts); @@ -4231,7 +4342,7 @@ tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, { options |= TSK_STAT_POLARISED; // TODO: allow user to pick? 
return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, - sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_hap_weighted, + sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } @@ -4475,7 +4586,7 @@ check_sample_stat_inputs(tsk_size_t num_sample_sets, tsk_size_t tuple_size, { int ret = 0; - if (num_sample_sets < tuple_size) { + if (num_sample_sets < 1) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_SAMPLE_SETS); goto out; } @@ -4781,6 +4892,200 @@ tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, return ret; } +static int +D2_ij_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, + tsk_size_t result_dim, double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *state_row; + double n; + tsk_size_t k; + tsk_id_t i, j; + double p_A, p_B, p_AB, p_Ab, p_aB, D_i, D_j; + + for (k = 0; k < result_dim; k++) { + i = args.set_indexes[2 * k]; + j = args.set_indexes[2 * k + 1]; + + n = (double) args.sample_set_sizes[i]; + state_row = GET_2D_ROW(state, 3, i); + p_AB = state_row[0] / n; + p_Ab = state_row[1] / n; + p_aB = state_row[2] / n; + p_A = p_AB + p_Ab; + p_B = p_AB + p_aB; + D_i = p_AB - (p_A * p_B); + + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + p_AB = state_row[0] / n; + p_Ab = state_row[1] / n; + p_aB = state_row[2] / n; + p_A = p_AB + p_Ab; + p_B = p_AB + p_aB; + D_j = p_AB - (p_A * p_B); + + result[k] = D_i * D_j; + } + + return 0; +} + +int +tsk_treeseq_D2_ij(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, + const tsk_id_t *col_sites, const double 
*col_positions, tsk_flags_t options, + double *result) +{ + int ret = 0; + ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_index_tuples, index_tuples, D2_ij_summary_func, + norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, + col_positions, options, result); +out: + return ret; +} + +static int +D2_ij_unbiased_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, + tsk_size_t result_dim, double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *state_row; + tsk_size_t k; + tsk_id_t i, j; + double n_i, n_j; + double w_AB_i, w_Ab_i, w_aB_i, w_ab_i; + double w_AB_j, w_Ab_j, w_aB_j, w_ab_j; + + for (k = 0; k < result_dim; k++) { + i = args.set_indexes[2 * k]; + j = args.set_indexes[2 * k + 1]; + if (i == j) { + // We require disjoint sample sets because we test equality here + n_i = (double) args.sample_set_sizes[i]; + state_row = GET_2D_ROW(state, 3, i); + w_AB_i = state_row[0]; + w_Ab_i = state_row[1]; + w_aB_i = state_row[2]; + w_ab_i = n_i - (w_AB_i + w_Ab_i + w_aB_i); + result[k] = (w_AB_i * (w_AB_i - 1) * w_ab_i * (w_ab_i - 1) + + w_Ab_i * (w_Ab_i - 1) * w_aB_i * (w_aB_i - 1) + - 2 * w_AB_i * w_Ab_i * w_aB_i * w_ab_i) + / n_i / (n_i - 1) / (n_i - 2) / (n_i - 3); + } + + else { + n_i = (double) args.sample_set_sizes[i]; + state_row = GET_2D_ROW(state, 3, i); + w_AB_i = state_row[0]; + w_Ab_i = state_row[1]; + w_aB_i = state_row[2]; + w_ab_i = n_i - (w_AB_i + w_Ab_i + w_aB_i); + + n_j = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + w_AB_j = state_row[0]; + w_Ab_j = state_row[1]; + w_aB_j = state_row[2]; + w_ab_j = n_j - (w_AB_j + w_Ab_j + w_aB_j); + + result[k] = (w_Ab_i * w_aB_i - w_AB_i * w_ab_i) + * (w_Ab_j * w_aB_j - w_AB_j * w_ab_j) / n_i / (n_i - 1) / n_j 
+ / (n_j - 1); + } + } + + return 0; +} + +int +tsk_treeseq_D2_ij_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, + const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, + double *result) +{ + int ret = 0; + ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_index_tuples, index_tuples, D2_ij_unbiased_summary_func, + norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, + col_positions, options, result); +out: + return ret; +} + +static int +r2_ij_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, + tsk_size_t result_dim, double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *state_row; + tsk_size_t k; + tsk_id_t i, j; + double n, pAB, pAb, paB, pA, pB, D_i, D_j, denom_i, denom_j; + + for (k = 0; k < result_dim; k++) { + i = args.set_indexes[2 * k]; + j = args.set_indexes[2 * k + 1]; + + n = (double) args.sample_set_sizes[i]; + state_row = GET_2D_ROW(state, 3, i); + pAB = state_row[0] / n; + pAb = state_row[1] / n; + paB = state_row[2] / n; + pA = pAB + pAb; + pB = pAB + paB; + D_i = pAB - (pA * pB); + denom_i = sqrt(pA * (1 - pA) * pB * (1 - pB)); + + n = (double) args.sample_set_sizes[j]; + state_row = GET_2D_ROW(state, 3, j); + pAB = state_row[0] / n; + pAb = state_row[1] / n; + paB = state_row[2] / n; + pA = pAB + pAb; + pB = pAB + paB; + D_j = pAB - (pA * pB); + denom_j = sqrt(pA * (1 - pA) * pB * (1 - pB)); + + result[k] = (D_i * D_j) / (denom_i * denom_j); + } + return 0; +} + +int +tsk_treeseq_r2_ij(const tsk_treeseq_t 
*self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, + const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, + double *result) +{ + int ret = 0; + ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_index_tuples, index_tuples, r2_ij_summary_func, + norm_hap_weighted_ij, num_rows, row_sites, row_positions, num_cols, col_sites, + col_positions, options, result); +out: + return ret; +} + /*********************************** * Three way stats ***********************************/ @@ -4951,6 +5256,9 @@ tsk_treeseq_get_mutation( goto out; } mutation->edge = self->site_mutations_mem[index].edge; + mutation->inherited_state = self->site_mutations_mem[index].inherited_state; + mutation->inherited_state_length + = self->site_mutations_mem[index].inherited_state_length; out: return ret; } @@ -6146,17 +6454,18 @@ tsk_tree_print_state(const tsk_tree_t *self, FILE *out) fprintf(out, "left = %f\n", self->interval.left); fprintf(out, "right = %f\n", self->interval.right); fprintf(out, "index = %lld\n", (long long) self->index); - fprintf(out, "node\tparent\tlchild\trchild\tlsib\trsib"); + fprintf(out, "num_edges = %d\n", (int) self->num_edges); + fprintf(out, "node\tedge\tparent\tlchild\trchild\tlsib\trsib"); if (self->options & TSK_SAMPLE_LISTS) { fprintf(out, "\thead\ttail"); } fprintf(out, "\n"); for (j = 0; j < self->num_nodes + 1; j++) { - fprintf(out, "%lld\t%lld\t%lld\t%lld\t%lld\t%lld", (long long) j, - (long long) self->parent[j], (long long) self->left_child[j], - (long long) self->right_child[j], (long long) self->left_sib[j], - (long long) self->right_sib[j]); + fprintf(out, 
"%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld", (long long) j, + (long long) self->edge[j], (long long) self->parent[j], + (long long) self->left_child[j], (long long) self->right_child[j], + (long long) self->left_sib[j], (long long) self->right_sib[j]); if (self->options & TSK_SAMPLE_LISTS) { fprintf(out, "\t%lld\t%lld\t", (long long) self->left_sample[j], (long long) self->right_sample[j]); @@ -6284,7 +6593,8 @@ tsk_tree_remove_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent) } static void -tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) +tsk_tree_remove_edge( + tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t TSK_UNUSED(edge_id)) { tsk_id_t *restrict parent = self->parent; tsk_size_t *restrict num_samples = self->num_samples; @@ -6425,7 +6735,7 @@ tsk_tree_next(tsk_tree_t *self) if (valid) { for (j = tree_pos.out.start; j != tree_pos.out.stop; j++) { e = tree_pos.out.order[j]; - tsk_tree_remove_edge(self, edge_parent[e], edge_child[e]); + tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); } for (j = tree_pos.in.start; j != tree_pos.in.stop; j++) { @@ -6457,7 +6767,7 @@ tsk_tree_prev(tsk_tree_t *self) if (valid) { for (j = tree_pos.out.start; j != tree_pos.out.stop; j--) { e = tree_pos.out.order[j]; - tsk_tree_remove_edge(self, edge_parent[e], edge_child[e]); + tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); } for (j = tree_pos.in.start; j != tree_pos.in.stop; j--) { @@ -6532,6 +6842,90 @@ tsk_tree_seek_from_null(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(optio return ret; } +static int TSK_WARN_UNUSED +tsk_tree_seek_forward(tsk_tree_t *self, tsk_id_t index) +{ + int ret = 0; + tsk_table_collection_t *tables = self->tree_sequence->tables; + const tsk_id_t *restrict edge_parent = tables->edges.parent; + const tsk_id_t *restrict edge_child = tables->edges.child; + const double *restrict edge_left = tables->edges.left; + const double *restrict edge_right = tables->edges.right; + double interval_left, 
e_left; + const double old_right = self->interval.right; + tsk_id_t j, e; + tsk_tree_position_t tree_pos; + + ret = tsk_tree_position_seek_forward(&self->tree_pos, index); + if (ret != 0) { + goto out; + } + tree_pos = self->tree_pos; + interval_left = tree_pos.interval.left; + + for (j = tree_pos.out.start; j != tree_pos.out.stop; j++) { + e = tree_pos.out.order[j]; + e_left = edge_left[e]; + if (e_left < old_right) { + tsk_bug_assert(edge_parent[e] != TSK_NULL); + tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); + } + tsk_bug_assert(e_left < interval_left); + } + + for (j = tree_pos.in.start; j != tree_pos.in.stop; j++) { + e = tree_pos.in.order[j]; + if (edge_left[e] <= interval_left && interval_left < edge_right[e]) { + tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); + } + } + tsk_tree_update_index_and_interval(self); +out: + return ret; +} + +static int TSK_WARN_UNUSED +tsk_tree_seek_backward(tsk_tree_t *self, tsk_id_t index) +{ + int ret = 0; + tsk_table_collection_t *tables = self->tree_sequence->tables; + const tsk_id_t *restrict edge_parent = tables->edges.parent; + const tsk_id_t *restrict edge_child = tables->edges.child; + const double *restrict edge_left = tables->edges.left; + const double *restrict edge_right = tables->edges.right; + double interval_right, e_right; + const double old_right = self->interval.right; + tsk_id_t j, e; + tsk_tree_position_t tree_pos; + + ret = tsk_tree_position_seek_backward(&self->tree_pos, index); + if (ret != 0) { + goto out; + } + tree_pos = self->tree_pos; + interval_right = tree_pos.interval.right; + + for (j = tree_pos.out.start; j != tree_pos.out.stop; j--) { + e = tree_pos.out.order[j]; + e_right = edge_right[e]; + if (e_right >= old_right) { + tsk_bug_assert(edge_parent[e] != TSK_NULL); + tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); + } + tsk_bug_assert(e_right > interval_right); + } + + for (j = tree_pos.in.start; j != tree_pos.in.stop; j--) { + e = 
tree_pos.in.order[j]; + if (edge_right[e] >= interval_right && interval_right > edge_left[e]) { + tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); + } + } + tsk_tree_update_index_and_interval(self); +out: + return ret; +} + int TSK_WARN_UNUSED tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options) { @@ -6549,7 +6943,7 @@ tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options) } static int TSK_WARN_UNUSED -tsk_tree_seek_linear(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) +tsk_tree_seek_linear(tsk_tree_t *self, double x) { const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); const double t_l = self->interval.left; @@ -6588,6 +6982,29 @@ tsk_tree_seek_linear(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options) return ret; } +static int TSK_WARN_UNUSED +tsk_tree_seek_skip(tsk_tree_t *self, double x) +{ + const double t_l = self->interval.left; + int ret = 0; + tsk_id_t index; + const tsk_size_t num_trees = self->tree_sequence->num_trees; + const double *restrict breakpoints = self->tree_sequence->breakpoints; + + index = (tsk_id_t) tsk_search_sorted(breakpoints, num_trees + 1, x); + if (breakpoints[index] > x) { + index--; + } + + if (x < t_l) { + ret = tsk_tree_seek_backward(self, index); + } else { + ret = tsk_tree_seek_forward(self, index); + } + tsk_bug_assert(tsk_tree_position_in_interval(self, x)); + return ret; +} + int TSK_WARN_UNUSED tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t options) { @@ -6602,7 +7019,11 @@ tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t options) if (self->index == -1) { ret = tsk_tree_seek_from_null(self, x, options); } else { - ret = tsk_tree_seek_linear(self, x, options); + if (options & TSK_SEEK_SKIP) { + ret = tsk_tree_seek_skip(self, x); + } else { + ret = tsk_tree_seek_linear(self, x); + } } out: @@ -7358,18 +7779,6 @@ tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, return ret; } -/* Compatibility shim for 
initialising the diff iterator from a tree sequence. We are - * using this function in a small number of places internally, so simplest to keep it - * until a more satisfactory "diff" API comes along. - */ -int TSK_WARN_UNUSED -tsk_diff_iter_init_from_ts( - tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options) -{ - return tsk_diff_iter_init( - self, tree_sequence->tables, (tsk_id_t) tree_sequence->num_trees, options); -} - /* ======================================================== * * KC Distance * ======================================================== */ @@ -9586,8 +9995,9 @@ tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_samp window_span = windows[w + 1] - windows[w] - missing_span; missing_span = 0.0; if (num_edges == 0) { + /* missing interval, so remove overcounted missing span */ remaining_span = right - windows[w + 1]; - window_span -= remaining_span; + window_span += remaining_span; missing_span += remaining_span; } for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { @@ -9842,7 +10252,7 @@ check_coalescence_rate_time_windows(const tsk_treeseq_t *self, timepoint = time_windows[i + 1]; } if (timepoint != INFINITY) { - ret = tsk_trace_error(TSK_ERR_BAD_TIME_WINDOWS); + ret = tsk_trace_error(TSK_ERR_BAD_TIME_WINDOWS_END); goto out; } /* all sample times align with start of first time window */ diff --git a/treerec/tskit/trees.h b/treerec/tskit/trees.h index bef944ff..21495edb 100644 --- a/treerec/tskit/trees.h +++ b/treerec/tskit/trees.h @@ -71,6 +71,11 @@ when the tree sequence is initialised. Indexes are required for a valid tree sequence, and are not built by default for performance reasons. */ #define TSK_TS_INIT_BUILD_INDEXES (1 << 0) +/** +If specified, mutation parents in the table collection will be overwritten +with those computed from the topology when the tree sequence is initialised. 
+*/ +#define TSK_TS_INIT_COMPUTE_MUTATION_PARENTS (1 << 1) /** @} */ // clang-format on @@ -981,15 +986,17 @@ typedef int general_stat_func_t(tsk_size_t state_dim, const double *state, int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const double *W, tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); -// TODO: expose this externally? -/* int tsk_treeseq_two_locus_general_stat(const tsk_treeseq_t *self, */ -/* tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, */ -/* const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, - */ -/* general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t num_left_windows, */ -/* const double *left_windows, tsk_size_t num_right_windows, */ -/* const double *right_windows, tsk_flags_t options, tsk_size_t num_result, */ -/* double *result); */ + +typedef int norm_func_t(tsk_size_t result_dim, const double *hap_weights, tsk_size_t n_a, + tsk_size_t n_b, double *result, void *params); + +int tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, + general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t out_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t out_cols, + const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, + double *result); /* One way weighted stats */ @@ -1056,31 +1063,14 @@ int tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); + tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, + double 
*result); typedef int general_sample_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_Y2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, - tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, - const double *windows, tsk_flags_t options, double *result); -int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, - tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, - const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, - const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, - tsk_flags_t options, double *result); - typedef int two_locus_count_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, @@ -1138,6 +1128,51 @@ int tsk_treeseq_pi2_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_se const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); +typedef int k_way_two_locus_count_stat_method(const tsk_treeseq_t *self, + tsk_size_t 
num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, + const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, + const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, + const double *col_positions, tsk_flags_t options, double *result); + +/* Two way sample set stats */ + +int tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, + const double *windows, tsk_flags_t options, double *result); +int tsk_treeseq_Y2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, + const double *windows, tsk_flags_t options, double *result); +int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, + const double *windows, tsk_flags_t options, double *result); +int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, + const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, + tsk_flags_t options, double *result); +int tsk_treeseq_D2_ij(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, + const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, + double *result); +int 
tsk_treeseq_D2_ij_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, + const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, + double *result); +int tsk_treeseq_r2_ij(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, + const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, + const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, + double *result); + /* Three way sample set stats */ int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, @@ -1270,6 +1305,13 @@ int tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) @{ */ +/** @brief Option to seek by skipping to the target tree, adding and removing as few + edges as possible. If not specified, a linear time algorithm is used instead. + + @ingroup TREE_API_SEEKING_GROUP +*/ +#define TSK_SEEK_SKIP (1 << 0) + /** @brief Seek to the first tree in the sequence. @@ -1375,12 +1417,22 @@ we will have ``position < tree.interval.right``. Seeking to a position currently covered by the tree is a constant time operation. + +Seeking to a position from a non-null tree uses a linear time +algorithm by default, unless the option :c:macro:`TSK_SEEK_SKIP` +is specified. In this case, a faster algorithm is employed which skips +to the target tree by removing and adding the minimal number of edges +possible. However, this approach does not guarantee that edges are +inserted and removed in time-sorted order. + +.. 
warning:: Using the :c:macro:`TSK_SEEK_SKIP` option + may lead to edges not being inserted or removed in time-sorted order. + @endrst @param self A pointer to an initialised tsk_tree_t object. @param position The position in genome coordinates -@param options Seek options. Currently unused. Set to 0 for compatibility - with future versions of tskit. +@param options Seek options. See the notes above for details. @return Return 0 on success or a negative value on failure. */ int tsk_tree_seek(tsk_tree_t *self, double position, tsk_flags_t options); @@ -1904,9 +1956,6 @@ bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u); */ bool tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other); -int tsk_diff_iter_init_from_ts( - tsk_diff_iter_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); - int tsk_tree_position_init( tsk_tree_position_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); int tsk_tree_position_free(tsk_tree_position_t *self); From 2f6ce2368a1d352eda3c2c5cc59d6a1d8ef0d394 Mon Sep 17 00:00:00 2001 From: peter Date: Sun, 30 Nov 2025 08:00:13 -0800 Subject: [PATCH 2/2] update to 1.0.0 --- treerec/_README | 1 + treerec/tskit/core.c | 125 +++++---- treerec/tskit/core.h | 56 ++-- treerec/tskit/genotypes.c | 239 +++++++++++++++- treerec/tskit/tables.c | 51 +++- treerec/tskit/tables.h | 16 +- treerec/tskit/trees.c | 578 ++++++++++++++++++++++---------------- treerec/tskit/trees.h | 59 +++- 8 files changed, 788 insertions(+), 337 deletions(-) diff --git a/treerec/_README b/treerec/_README index 45526099..121a4c7a 100644 --- a/treerec/_README +++ b/treerec/_README @@ -8,6 +8,7 @@ The code in `tskit/kastore` is from [kastore/c](https://github.com/tskit-dev/kas **Changelog:** +- *30 November 2025*: updated to tskit 1.0.0 - *24 September 2025*: updated to tskit C_1.2.0 (=1.0.0b2) (still kastore C_2.1.1) - *15 April 2025*: updated to tskit C_1.1.4 (=0.6.2) (still kastore C_2.1.1) - *7 November 2023*: updated to tskit C_1.1.2 
(=0.5.5) (still kastore C_2.1.1) diff --git a/treerec/tskit/core.c b/treerec/tskit/core.c index 53cc0ce6..0f31550a 100644 --- a/treerec/tskit/core.c +++ b/treerec/tskit/core.c @@ -584,6 +584,14 @@ tsk_strerror_internal(int err) ret = "Must have at least one allele when specifying an allele map. " "(TSK_ERR_ZERO_ALLELES)"; break; + case TSK_ERR_BAD_ALLELE_LENGTH: + ret = "Alleles used when decoding alignments must have length one. " + "(TSK_ERR_BAD_ALLELE_LENGTH)"; + break; + case TSK_ERR_MISSING_CHAR_COLLISION: + ret = "Alleles used when decoding alignments must not match the missing " + "data character. (TSK_ERR_MISSING_CHAR_COLLISION)"; + break; /* Distance metric errors */ case TSK_ERR_SAMPLE_SIZE_MISMATCH: @@ -1260,16 +1268,16 @@ tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_ } // Bit Array implementation. Allows us to store unsigned integers in a compact manner. -// Currently implemented as an array of 32-bit unsigned integers for ease of counting. +// Currently implemented as an array of 32-bit unsigned integers. int -tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length) +tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length) { int ret = 0; - self->size = (num_bits >> TSK_BIT_ARRAY_CHUNK) - + (num_bits % TSK_BIT_ARRAY_NUM_BITS ? 1 : 0); - self->data = tsk_calloc(self->size * length, sizeof(*self->data)); + self->row_len = (num_bits / TSK_BITSET_BITS) + (num_bits % TSK_BITSET_BITS ? 
1 : 0); + self->len = length; + self->data = tsk_calloc(self->row_len * length, sizeof(*self->data)); if (self->data == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; @@ -1278,96 +1286,111 @@ tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length return ret; } -void -tsk_bit_array_get_row(const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out) -{ - out->size = self->size; - out->data = self->data + (row * self->size); -} +#define BITSET_DATA_ROW(bs, row) ((bs)->data + (row) * (bs)->row_len) void -tsk_bit_array_intersect( - const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out) +tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row, + const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out) { - for (tsk_size_t i = 0; i < self->size; i++) { - out->data[i] = self->data[i] & other->data[i]; + const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); + const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); + tsk_bitset_val_t *restrict out_d = out->data; + for (tsk_size_t i = 0; i < self->row_len; i++) { + out_d[i] = self_d[i] & other_d[i]; } } void -tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other) +tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, + tsk_size_t other_row) { - for (tsk_size_t i = 0; i < self->size; i++) { - self->data[i] &= ~(other->data[i]); + tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); + const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); + for (tsk_size_t i = 0; i < self->row_len; i++) { + self_d[i] &= ~(other_d[i]); } } void -tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other) +tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, + tsk_size_t other_row) { - for (tsk_size_t i = 0; i < self->size; i++) { - self->data[i] |= other->data[i]; + 
tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); + const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); + for (tsk_size_t i = 0; i < self->row_len; i++) { + self_d[i] |= other_d[i]; } } void -tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit) { - tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; - self->data[i] |= (tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i)); + tsk_bitset_val_t i = (bit / TSK_BITSET_BITS); + *(BITSET_DATA_ROW(self, row) + i) |= (tsk_bitset_val_t) 1 + << (bit - (TSK_BITSET_BITS * i)); } bool -tsk_bit_array_contains(const tsk_bit_array_t *self, const tsk_bit_array_value_t bit) +tsk_bitset_contains(const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit) { - tsk_bit_array_value_t i = bit >> TSK_BIT_ARRAY_CHUNK; - return self->data[i] - & ((tsk_bit_array_value_t) 1 << (bit - (TSK_BIT_ARRAY_NUM_BITS * i))); + tsk_bitset_val_t i = (bit / TSK_BITSET_BITS); + return *(BITSET_DATA_ROW(self, row) + i) + & ((tsk_bitset_val_t) 1 << (bit - (TSK_BITSET_BITS * i))); } -tsk_size_t -tsk_bit_array_count(const tsk_bit_array_t *self) +static inline uint32_t +popcount(tsk_bitset_val_t v) { - // Utilizes 12 operations per bit array. NB this only works on 32 bit integers. + // Utilizes 12 operations per chunk. NB this only works on 32 bit integers. // Taken from: // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // There's a nice breakdown of this algorithm here: // https://stackoverflow.com/a/109025 - // Could probably do better with explicit SIMD (instead of SWAR), but not as - // portable: https://arxiv.org/pdf/1611.07612.pdf // - // There is one solution to explore further, which uses __builtin_popcountll. 
- This option is relatively simple, but requires a 64 bit bit array and also - involves some compiler flag plumbing (-mpopcnt) + // The gcc/clang compiler flag -mpopcnt will convert this code to a + popcnt instruction (most if not all modern CPUs will support this). The + popcnt instruction will yield some speed improvements, which depend on + the tree sequence. + // + // NB: 32bit counting is typically faster than 64bit counting for this task. + // (at least on x86-64) + + v = v - ((v >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; +} - tsk_bit_array_value_t tmp; - tsk_size_t i, count = 0; +tsk_size_t +tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row) +{ + tsk_size_t i = 0; + tsk_size_t count = 0; + const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row); - for (i = 0; i < self->size; i++) { - tmp = self->data[i] - ((self->data[i] >> 1) & 0x55555555); - tmp = (tmp & 0x33333333) + ((tmp >> 2) & 0x33333333); - count += (((tmp + (tmp >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + for (i = 0; i < self->row_len; i++) { + count += popcount(self_d[i]); } return count; } void -tsk_bit_array_get_items( - const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items) +tsk_bitset_get_items( + const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items) { // Get the items stored in the row of a bitset. - Uses a de Bruijn sequence lookup table to determine the lowest bit set. See the - wikipedia article for more info: https://w.wiki/BYiF + // Uses a de Bruijn sequence lookup table to determine the lowest bit set. 
+ // See the wikipedia article for more info: https://w.wiki/BYiF tsk_size_t i, n, off; - tsk_bit_array_value_t v, lsb; // least significant bit + tsk_bitset_val_t v, lsb; // least significant bit static const tsk_id_t lookup[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row); n = 0; - for (i = 0; i < self->size; i++) { - v = self->data[i]; - off = i * ((tsk_size_t) TSK_BIT_ARRAY_NUM_BITS); + for (i = 0; i < self->row_len; i++) { + v = self_d[i]; + off = i * TSK_BITSET_BITS; if (v == 0) { continue; } @@ -1381,7 +1404,7 @@ tsk_bit_array_get_items( } void -tsk_bit_array_free(tsk_bit_array_t *self) +tsk_bitset_free(tsk_bitset_t *self) { tsk_safe_free(self->data); } diff --git a/treerec/tskit/core.h b/treerec/tskit/core.h index 7dd24eba..481905b7 100644 --- a/treerec/tskit/core.h +++ b/treerec/tskit/core.h @@ -147,7 +147,7 @@ sizes and types of externally visible structs. The library minor version. Incremented when non-breaking backward-compatible changes to the API or ABI are introduced, i.e., the addition of a new function. */ -#define TSK_VERSION_MINOR 2 +#define TSK_VERSION_MINOR 3 /** The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. @@ -803,6 +803,14 @@ More than 2147483647 alleles were specified. A user-specified allele map was used, but it contained zero alleles. */ #define TSK_ERR_ZERO_ALLELES -1103 +/** +An allele used when decoding alignments had length other than one. +*/ +#define TSK_ERR_BAD_ALLELE_LENGTH -1104 +/** +An allele used when decoding alignments matched the missing data character. 
+*/ +#define TSK_ERR_MISSING_CHAR_COLLISION -1105 /** @} */ /** @@ -1104,29 +1112,31 @@ FILE *tsk_get_debug_stream(void); /* Bit Array functionality */ -typedef uint32_t tsk_bit_array_value_t; +// define a 32-bit chunk size for our bitsets. +// this means we'll be able to hold 32 distinct items in each 32 bit uint +#define TSK_BITSET_BITS ((tsk_size_t) 32) +typedef uint32_t tsk_bitset_val_t; + typedef struct { - tsk_size_t size; // Number of chunks per row - tsk_bit_array_value_t *data; // Array data -} tsk_bit_array_t; - -#define TSK_BIT_ARRAY_CHUNK 5U -#define TSK_BIT_ARRAY_NUM_BITS (1U << TSK_BIT_ARRAY_CHUNK) - -int tsk_bit_array_init(tsk_bit_array_t *self, tsk_size_t num_bits, tsk_size_t length); -void tsk_bit_array_free(tsk_bit_array_t *self); -void tsk_bit_array_get_row( - const tsk_bit_array_t *self, tsk_size_t row, tsk_bit_array_t *out); -void tsk_bit_array_intersect( - const tsk_bit_array_t *self, const tsk_bit_array_t *other, tsk_bit_array_t *out); -void tsk_bit_array_subtract(tsk_bit_array_t *self, const tsk_bit_array_t *other); -void tsk_bit_array_add(tsk_bit_array_t *self, const tsk_bit_array_t *other); -void tsk_bit_array_add_bit(tsk_bit_array_t *self, const tsk_bit_array_value_t bit); -bool tsk_bit_array_contains( - const tsk_bit_array_t *self, const tsk_bit_array_value_t bit); -tsk_size_t tsk_bit_array_count(const tsk_bit_array_t *self); -void tsk_bit_array_get_items( - const tsk_bit_array_t *self, tsk_id_t *items, tsk_size_t *n_items); + tsk_size_t row_len; // Number of size TSK_BITSET_BITS chunks per row + tsk_size_t len; // Number of rows + tsk_bitset_val_t *data; +} tsk_bitset_t; + +int tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length); +void tsk_bitset_free(tsk_bitset_t *self); +void tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row, + const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out); +void tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, + const tsk_bitset_t *other, 
tsk_size_t other_row); +void tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, + tsk_size_t other_row); +void tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit); +bool tsk_bitset_contains( + const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit); +tsk_size_t tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row); +void tsk_bitset_get_items( + const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items); #ifdef __cplusplus } diff --git a/treerec/tskit/genotypes.c b/treerec/tskit/genotypes.c index c2385281..304fb3f3 100644 --- a/treerec/tskit/genotypes.c +++ b/treerec/tskit/genotypes.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -90,12 +91,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles) static int variant_init_samples_and_index_map(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, - size_t num_samples_alloc, tsk_flags_t options) + size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; - const tsk_flags_t *flags = tree_sequence->tables->nodes.flags; tsk_size_t j, num_nodes; - bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING); tsk_id_t u; num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); @@ -120,11 +119,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self, ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } - /* We can only detect missing data for samples */ - if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) { - ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES); - goto out; - } self->alt_sample_index_map[samples[j]] = (tsk_id_t) j; } out: @@ -439,6 +433,27 @@ tsk_variant_mark_missing(tsk_variant_t *self) return num_missing; } +/* Mark missing for any requested node (sample or non-sample) that is isolated + * in the current tree, i.e., has no parent and no children at this position. 
*/ +static tsk_size_t +tsk_variant_mark_missing_any(tsk_variant_t *self) +{ + tsk_size_t num_missing = 0; + int32_t *restrict genotypes = self->genotypes; + const tsk_id_t *restrict parent = self->tree.parent; + const tsk_id_t *restrict left_child = self->tree.left_child; + tsk_size_t j; + + for (j = 0; j < self->num_samples; j++) { + tsk_id_t u = self->samples[j]; + if (parent[u] == TSK_NULL && left_child[u] == TSK_NULL) { + genotypes[j] = TSK_MISSING_DATA; + num_missing++; + } + } + return num_missing; +} + static tsk_id_t tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length) { @@ -502,6 +517,10 @@ tsk_variant_decode( update_genotypes = tsk_variant_update_genotypes_sample_list; if (by_traversal) { update_genotypes = tsk_variant_update_genotypes_traversal; + /* When decoding a user-provided list of nodes (which may include + * non-samples), mark isolated nodes as missing directly by checking + * isolation status for each requested node. */ + mark_missing = tsk_variant_mark_missing_any; } if (self->user_alleles) { @@ -644,3 +663,207 @@ tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant) out: return ret; } + +static int +tsk_treeseq_decode_alignments_overlay_missing(const tsk_treeseq_t *self, + const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, + char missing_data_character, tsk_size_t L, char *alignments_out) +{ + int ret = 0; + tsk_tree_t tree; + tsk_size_t i, seg_left, seg_right; + char *row = NULL; + tsk_id_t u; + + tsk_memset(&tree, 0, sizeof(tree)); + + ret = tsk_tree_init(&tree, self, 0); + if (ret != 0) { + goto out; + } + ret = tsk_tree_seek(&tree, left, 0); + if (ret != 0) { + goto out; + } + while (tree.index != -1 && tree.interval.left < right) { + seg_left = TSK_MAX((tsk_size_t) tree.interval.left, (tsk_size_t) left); + seg_right = TSK_MIN((tsk_size_t) tree.interval.right, (tsk_size_t) right); + if (seg_right > seg_left) { + for (i = 0; i < num_nodes; i++) { + u = nodes[i]; + if 
(tree.parent[u] == TSK_NULL && tree.left_child[u] == TSK_NULL) { + row = alignments_out + i * L; + /* memset takes an `int`, `missing_data_character` is a `char` which + * can be signed or unsigned depending on the platform, so we need to + * cast. Some tools/compilers will warn if we just cast + * to `unsigned char` and leave the cast to `int` as implicit, hence + * the double cast. */ + tsk_memset(row + (seg_left - (tsk_size_t) left), + (int) (unsigned char) missing_data_character, + seg_right - seg_left); + } + } + } + ret = tsk_tree_next(&tree); + if (ret < 0) { + goto out; + } + } + + /* On success we should return 0, not TSK_TREE_OK from the last tsk_tree_next */ + ret = 0; +out: + tsk_tree_free(&tree); + return ret; +} + +static int +tsk_treeseq_decode_alignments_overlay_sites(const tsk_treeseq_t *self, + const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, + char missing_data_character, tsk_size_t L, char *alignments_out, tsk_flags_t options) +{ + int ret = 0; + tsk_variant_t var; + tsk_id_t site_id; + tsk_site_t site; + char *allele_byte = NULL; + tsk_size_t allele_cap = 0; + tsk_size_t i, j; + char *row = NULL; + int32_t g; + char c; + char *tmp = NULL; + + tsk_memset(&var, 0, sizeof(var)); + + ret = tsk_variant_init(&var, self, nodes, num_nodes, NULL, options); + if (ret != 0) { + goto out; + } + for (site_id = 0; site_id < (tsk_id_t) tsk_treeseq_get_num_sites(self); site_id++) { + ret = tsk_treeseq_get_site(self, site_id, &site); + if (ret != 0) { + goto out; + } + if (site.position < left) { + continue; + } + if (site.position >= right) { + break; + } + ret = tsk_variant_decode(&var, site_id, 0); + if (ret != 0) { + goto out; + } + if (var.num_alleles > 0) { + if (var.num_alleles > allele_cap) { + tmp = tsk_realloc(allele_byte, var.num_alleles * sizeof(*allele_byte)); + if (tmp == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + allele_byte = tmp; + allele_cap = var.num_alleles; + } + for (j = 0; j < 
var.num_alleles; j++) { + if (var.allele_lengths[j] != 1) { + ret = tsk_trace_error(TSK_ERR_BAD_ALLELE_LENGTH); + goto out; + } + allele_byte[j] = var.alleles[j][0]; + if (allele_byte[j] == missing_data_character) { + ret = tsk_trace_error(TSK_ERR_MISSING_CHAR_COLLISION); + goto out; + } + } + for (i = 0; i < num_nodes; i++) { + row = alignments_out + i * L; + g = var.genotypes[i]; + c = missing_data_character; + if (g != TSK_MISSING_DATA) { + tsk_bug_assert(g >= 0); + tsk_bug_assert((tsk_size_t) g < var.num_alleles); + c = allele_byte[g]; + } + row[((tsk_size_t) site.position) - (tsk_size_t) left] = (char) c; + } + } + } + +out: + tsk_safe_free(allele_byte); + tsk_variant_free(&var); + return ret; +} + +/* NOTE: We usually keep functions with a tsk_treeseq_t signature in trees.c. + * tsk_treeseq_decode_alignments is implemented here instead because it + * depends directly on tsk_variant_t and the genotype/allele machinery in + * this file (and thus on genotypes.h). This slightly breaks that layering + * convention but keeps the implementation close to the variant code. 
*/ +int +tsk_treeseq_decode_alignments(const tsk_treeseq_t *self, const char *ref_seq, + tsk_size_t ref_seq_length, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, + double right, char missing_data_character, char *alignments_out, tsk_flags_t options) +{ + int ret = 0; + tsk_size_t i, L; + char *row = NULL; + + if (!tsk_treeseq_get_discrete_genome(self)) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (ref_seq == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (ref_seq_length != (tsk_size_t) tsk_treeseq_get_sequence_length(self)) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (trunc(left) != left || trunc(right) != right) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + if (left < 0 || right > tsk_treeseq_get_sequence_length(self) + || (tsk_size_t) left >= (tsk_size_t) right) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + L = (tsk_size_t) right - (tsk_size_t) left; + if (num_nodes == 0) { + return 0; + } + if (nodes == NULL || alignments_out == NULL) { + ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); + goto out; + } + for (i = 0; i < num_nodes; i++) { + if (nodes[i] < 0 || nodes[i] >= (tsk_id_t) tsk_treeseq_get_num_nodes(self)) { + ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); + goto out; + } + } + + /* Fill rows with the reference slice */ + for (i = 0; i < num_nodes; i++) { + row = alignments_out + i * L; + tsk_memcpy(row, ref_seq + (tsk_size_t) left, L); + } + if (!(options & TSK_ISOLATED_NOT_MISSING)) { + ret = tsk_treeseq_decode_alignments_overlay_missing(self, nodes, num_nodes, left, + right, missing_data_character, L, alignments_out); + if (ret != 0) { + goto out; + } + } + ret = tsk_treeseq_decode_alignments_overlay_sites(self, nodes, num_nodes, left, + right, missing_data_character, L, alignments_out, options); + if (ret != 0) { + goto out; + } + +out: + return ret; +} diff --git a/treerec/tskit/tables.c 
b/treerec/tskit/tables.c index 9805d669..8b92be49 100644 --- a/treerec/tskit/tables.c +++ b/treerec/tskit/tables.c @@ -11078,7 +11078,7 @@ tsk_table_collection_check_integrity( | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; } - if (self->sequence_length <= 0) { + if (!tsk_isfinite(self->sequence_length) || self->sequence_length <= 0) { ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH); goto out; } @@ -12436,8 +12436,27 @@ tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; + tsk_mutation_table_t *mutations = &self->mutations; + tsk_id_t *parent_backup = NULL; + bool restore_parents = false; if (!(options & TSK_NO_CHECK_INTEGRITY)) { + if (mutations->num_rows > 0) { + /* We need to wipe the parent column before computing, as otherwise invalid + * parents can cause integrity checks to fail. We take a copy to restore on + * error */ + parent_backup = tsk_malloc(mutations->num_rows * sizeof(*parent_backup)); + if (parent_backup == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; + } + tsk_memcpy(parent_backup, mutations->parent, + mutations->num_rows * sizeof(*parent_backup)); + /* Set the parent pointers to TSK_NULL */ + tsk_memset(mutations->parent, 0xff, + mutations->num_rows * sizeof(*mutations->parent)); + restore_parents = true; + } /* Safe to cast here as we're not counting trees */ ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); if (ret < 0) { @@ -12452,6 +12471,11 @@ tsk_table_collection_compute_mutation_parents( } out: + if (ret != 0 && restore_parents) { + tsk_memcpy(mutations->parent, parent_backup, + mutations->num_rows * sizeof(*parent_backup)); + } + tsk_safe_free(parent_backup); return ret; } @@ -13202,6 +13226,8 @@ tsk_table_collection_union(tsk_table_collection_t *self, tsk_id_t *site_map = NULL; bool add_populations = !(options & TSK_UNION_NO_ADD_POP); bool check_shared_portion = !(options & TSK_UNION_NO_CHECK_SHARED); + bool all_edges = 
!!(options & TSK_UNION_ALL_EDGES); + bool all_mutations = !!(options & TSK_UNION_ALL_MUTATIONS); /* Not calling TSK_CHECK_TREES so casting to int is safe */ ret = (int) tsk_table_collection_check_integrity(self, 0); @@ -13285,7 +13311,7 @@ tsk_table_collection_union(tsk_table_collection_t *self, // edges for (k = 0; k < (tsk_id_t) other->edges.num_rows; k++) { tsk_edge_table_get_row_unsafe(&other->edges, k, &edge); - if ((other_node_mapping[edge.parent] == TSK_NULL) + if (all_edges || (other_node_mapping[edge.parent] == TSK_NULL) || (other_node_mapping[edge.child] == TSK_NULL)) { new_parent = node_map[edge.parent]; new_child = node_map[edge.child]; @@ -13298,14 +13324,31 @@ tsk_table_collection_union(tsk_table_collection_t *self, } } - // mutations and sites + // sites + // first do the "disjoint" (all_mutations) case, where we just add all sites; + // otherwise we want to just add sites for new mutations + if (all_mutations) { + for (k = 0; k < (tsk_id_t) other->sites.num_rows; k++) { + tsk_site_table_get_row_unsafe(&other->sites, k, &site); + ret_id = tsk_site_table_add_row(&self->sites, site.position, + site.ancestral_state, site.ancestral_state_length, site.metadata, + site.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + site_map[site.id] = ret_id; + } + } + + // mutations (and maybe sites) i = 0; for (k = 0; k < (tsk_id_t) other->sites.num_rows; k++) { tsk_site_table_get_row_unsafe(&other->sites, k, &site); while ((i < (tsk_id_t) other->mutations.num_rows) && (other->mutations.site[i] == site.id)) { tsk_mutation_table_get_row_unsafe(&other->mutations, i, &mut); - if (other_node_mapping[mut.node] == TSK_NULL) { + if (all_mutations || (other_node_mapping[mut.node] == TSK_NULL)) { if (site_map[site.id] == TSK_NULL) { ret_id = tsk_site_table_add_row(&self->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, diff --git a/treerec/tskit/tables.h b/treerec/tskit/tables.h index 85ed29d5..9523ee12 100644 
--- a/treerec/tskit/tables.h
+++ b/treerec/tskit/tables.h
@@ -858,11 +858,21 @@ equality of the subsets.
 */
 #define TSK_UNION_NO_CHECK_SHARED (1 << 0)
 /**
- By default, all nodes new to ``self`` are assigned new populations. If this
+By default, all nodes new to ``self`` are assigned new populations. If this
 option is specified, nodes that are added to ``self`` will retain the
 population IDs they have in ``other``.
 */
 #define TSK_UNION_NO_ADD_POP (1 << 1)
+/**
+By default, union only adds edges adjacent to a newly added node;
+this option adds all edges.
+ */
+#define TSK_UNION_ALL_EDGES (1 << 2)
+/**
+By default, union adds only mutations on newly added edges, and
+sites for those mutations; this option adds all mutations and all sites.
+ */
+#define TSK_UNION_ALL_MUTATIONS (1 << 3)
 /** @} */

 /**
@@ -4414,6 +4424,10 @@ that are exclusive ``other`` are added to ``self``, along with:
 By default, populations of newly added nodes are assumed to be new
 populations, and added to the population table as well.

+The behavior can be changed by the flags ``TSK_UNION_ALL_EDGES`` and
+``TSK_UNION_ALL_MUTATIONS``, which will (respectively) add *all* edges
+or *all* sites and mutations instead.
+
 This operation will also sort the resulting tables, so the tables may
 change even if nothing new is added, if the original tables were not
 sorted.
diff --git a/treerec/tskit/trees.c b/treerec/tskit/trees.c index 7a159a7f..fdc043ec 100644 --- a/treerec/tskit/trees.c +++ b/treerec/tskit/trees.c @@ -31,6 +31,7 @@ #include #include +#include static inline bool is_discrete(double x) @@ -2223,20 +2224,18 @@ tsk_treeseq_sample_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_s ***********************************/ static int -get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, - tsk_bit_array_t *out_allele_samples, tsk_size_t *out_num_alleles) +get_allele_samples(const tsk_site_t *site, tsk_size_t site_offset, + const tsk_bitset_t *state, tsk_bitset_t *out_allele_samples, + tsk_size_t *out_num_alleles) { int ret = 0; tsk_mutation_t mutation, parent_mut; - tsk_size_t mutation_index, allele, alt_allele_length; + tsk_size_t mutation_index, allele, alt_allele, alt_allele_length; /* The allele table */ tsk_size_t max_alleles = site->mutations_length + 1; const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); tsk_size_t *allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); - const char *alt_allele; - tsk_bit_array_t state_row; - tsk_bit_array_t allele_samples_row; - tsk_bit_array_t alt_allele_samples_row; + const char *alt_allele_state; tsk_size_t num_alleles = 1; if (alleles == NULL || allele_lengths == NULL) { @@ -2267,29 +2266,29 @@ get_allele_samples(const tsk_site_t *site, const tsk_bit_array_t *state, } /* Add the mutation's samples to this allele */ - tsk_bit_array_get_row(out_allele_samples, allele, &allele_samples_row); - tsk_bit_array_get_row(state, mutation_index, &state_row); - tsk_bit_array_add(&allele_samples_row, &state_row); + tsk_bitset_union( + out_allele_samples, allele + site_offset, state, mutation_index); /* Get the index for the alternate allele that we must subtract from */ - alt_allele = site->ancestral_state; + alt_allele_state = site->ancestral_state; alt_allele_length = site->ancestral_state_length; if (mutation.parent != TSK_NULL) { 
parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; - alt_allele = parent_mut.derived_state; + alt_allele_state = parent_mut.derived_state; alt_allele_length = parent_mut.derived_state_length; } - for (allele = 0; allele < num_alleles; allele++) { - if (alt_allele_length == allele_lengths[allele] - && tsk_memcmp(alt_allele, alleles[allele], allele_lengths[allele]) + for (alt_allele = 0; alt_allele < num_alleles; alt_allele++) { + if (alt_allele_length == allele_lengths[alt_allele] + && tsk_memcmp( + alt_allele_state, alleles[alt_allele], allele_lengths[alt_allele]) == 0) { break; } } tsk_bug_assert(allele < num_alleles); - tsk_bit_array_get_row(out_allele_samples, allele, &alt_allele_samples_row); - tsk_bit_array_subtract(&alt_allele_samples_row, &allele_samples_row); + tsk_bitset_subtract(out_allele_samples, alt_allele + site_offset, + out_allele_samples, allele + site_offset); } *out_num_alleles = num_alleles; out: @@ -2310,7 +2309,6 @@ norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights, for (k = 0; k < result_dim; k++) { weight_row = GET_2D_ROW(hap_weights, 3, k); n = (double) args.sample_set_sizes[k]; - // TODO: what to do when n = 0 result[k] = weight_row[0] / n; } return 0; @@ -2347,111 +2345,108 @@ norm_total_weighted(tsk_size_t result_dim, const double *TSK_UNUSED(hap_weights) tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) { tsk_size_t k; + double norm = 1 / (double) (n_a * n_b); for (k = 0; k < result_dim; k++) { - result[k] = 1 / (double) (n_a * n_b); + result[k] = norm; } return 0; } static void -get_all_samples_bits(tsk_bit_array_t *all_samples, tsk_size_t n) +get_all_samples_bits(tsk_bitset_t *all_samples, tsk_size_t n) { tsk_size_t i; - const tsk_bit_array_value_t all = ~((tsk_bit_array_value_t) 0); - const tsk_bit_array_value_t remainder_samples = n % TSK_BIT_ARRAY_NUM_BITS; + const tsk_bitset_val_t all = ~((tsk_bitset_val_t) 0); + const tsk_bitset_val_t remainder_samples = n % 
TSK_BITSET_BITS; - all_samples->data[all_samples->size - 1] + all_samples->data[all_samples->row_len - 1] = remainder_samples ? ~(all << remainder_samples) : all; - for (i = 0; i < all_samples->size - 1; i++) { + for (i = 0; i < all_samples->row_len - 1; i++) { all_samples->data[i] = all; } } +// Stores the intermediate values for computing two-locus statistics. +typedef struct { + double *weights; + double *norm; + double *result_tmp; + tsk_bitset_t AB_samples; +} two_locus_work_t; + static int -compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, - const tsk_bit_array_t *site_b_state, tsk_size_t num_a_alleles, - tsk_size_t num_b_alleles, tsk_size_t num_samples, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, - sample_count_stat_params_t *f_params, norm_func_t *norm_f, bool polarised, - double *result) +two_locus_work_init(tsk_size_t max_alleles, tsk_size_t num_samples, + tsk_size_t result_dim, tsk_size_t state_dim, two_locus_work_t *out) { int ret = 0; - tsk_bit_array_t A_samples, B_samples; - // ss_ prefix refers to a sample set - tsk_bit_array_t ss_row; - tsk_bit_array_t ss_A_samples, ss_B_samples, ss_AB_samples, AB_samples; - // Sample sets and b sites are rows, a sites are columns - // b1 b2 b3 - // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] - // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] - // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] - tsk_size_t k, mut_a, mut_b; - tsk_size_t result_row_len = num_b_alleles * result_dim; - tsk_size_t w_A = 0, w_B = 0, w_AB = 0; - uint8_t polarised_val = polarised ? 
1 : 0; - double *hap_weight_row; - double *result_tmp_row; - double *weights = tsk_malloc(3 * state_dim * sizeof(*weights)); - double *norm = tsk_malloc(result_dim * sizeof(*norm)); - double *result_tmp - = tsk_malloc(result_row_len * num_a_alleles * sizeof(*result_tmp)); - - tsk_memset(&ss_A_samples, 0, sizeof(ss_A_samples)); - tsk_memset(&ss_B_samples, 0, sizeof(ss_B_samples)); - tsk_memset(&ss_AB_samples, 0, sizeof(ss_AB_samples)); - tsk_memset(&AB_samples, 0, sizeof(AB_samples)); - - if (weights == NULL || norm == NULL || result_tmp == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - ret = tsk_bit_array_init(&ss_A_samples, num_samples, 1); - if (ret != 0) { - goto out; - } - ret = tsk_bit_array_init(&ss_B_samples, num_samples, 1); - if (ret != 0) { - goto out; - } - ret = tsk_bit_array_init(&ss_AB_samples, num_samples, 1); - if (ret != 0) { + out->weights = tsk_malloc(3 * state_dim * sizeof(*out->weights)); + out->norm = tsk_malloc(result_dim * sizeof(*out->norm)); + out->result_tmp + = tsk_malloc(result_dim * max_alleles * max_alleles * sizeof(*out->result_tmp)); + tsk_memset(&out->AB_samples, 0, sizeof(out->AB_samples)); + if (out->weights == NULL || out->norm == NULL || out->result_tmp == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } - ret = tsk_bit_array_init(&AB_samples, num_samples, 1); + ret = tsk_bitset_init(&out->AB_samples, num_samples, 1); if (ret != 0) { goto out; } +out: + return ret; +} + +static void +two_locus_work_free(two_locus_work_t *work) +{ + tsk_safe_free(work->weights); + tsk_safe_free(work->norm); + tsk_safe_free(work->result_tmp); + tsk_bitset_free(&work->AB_samples); +} - for (mut_a = polarised_val; mut_a < num_a_alleles; mut_a++) { +static int +compute_general_normed_two_site_stat_result(const tsk_bitset_t *state, + const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, + tsk_size_t num_a_alleles, tsk_size_t num_b_alleles, tsk_size_t state_dim, + tsk_size_t result_dim, 
general_stat_func_t *f, sample_count_stat_params_t *f_params, + norm_func_t *norm_f, bool polarised, two_locus_work_t *restrict work, double *result) +{ + int ret = 0; + // Sample sets and b sites are rows, a sites are columns + // b1 b2 b3 + // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] + tsk_size_t k, mut_a, mut_b, result_row_len = num_b_alleles * result_dim; + uint8_t is_polarised = polarised ? 1 : 0; + double *restrict hap_row, *restrict result_tmp_row; + double *restrict norm = work->norm; + double *restrict weights = work->weights; + double *restrict result_tmp = work->result_tmp; + tsk_bitset_t AB_samples = work->AB_samples; + + for (mut_a = is_polarised; mut_a < num_a_alleles; mut_a++) { result_tmp_row = GET_2D_ROW(result_tmp, result_row_len, mut_a); - for (mut_b = polarised_val; mut_b < num_b_alleles; mut_b++) { - tsk_bit_array_get_row(site_a_state, mut_a, &A_samples); - tsk_bit_array_get_row(site_b_state, mut_b, &B_samples); - tsk_bit_array_intersect(&A_samples, &B_samples, &AB_samples); + for (mut_b = is_polarised; mut_b < num_b_alleles; mut_b++) { for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row(sample_sets, k, &ss_row); - hap_weight_row = GET_2D_ROW(weights, 3, k); - - tsk_bit_array_intersect(&A_samples, &ss_row, &ss_A_samples); - tsk_bit_array_intersect(&B_samples, &ss_row, &ss_B_samples); - tsk_bit_array_intersect(&AB_samples, &ss_row, &ss_AB_samples); - - w_AB = tsk_bit_array_count(&ss_AB_samples); - w_A = tsk_bit_array_count(&ss_A_samples); - w_B = tsk_bit_array_count(&ss_B_samples); - - hap_weight_row[0] = (double) w_AB; - hap_weight_row[1] = (double) (w_A - w_AB); // w_Ab - hap_weight_row[2] = (double) (w_B - w_AB); // w_aB + tsk_bitset_intersect(state, a_off + (mut_a * state_dim) + k, state, + b_off + (mut_b * state_dim) + k, &AB_samples); + hap_row = GET_2D_ROW(weights, 3, k); + hap_row[0] = (double) tsk_bitset_count(&AB_samples, 0); + 
hap_row[1] = (double) allele_counts[a_off + (mut_a * state_dim) + k] + - hap_row[0]; + hap_row[2] = (double) allele_counts[b_off + (mut_b * state_dim) + k] + - hap_row[0]; } ret = f(state_dim, weights, result_dim, result_tmp_row, f_params); if (ret != 0) { goto out; } - ret = norm_f(result_dim, weights, num_a_alleles - polarised_val, - num_b_alleles - polarised_val, norm, f_params); + ret = norm_f(result_dim, weights, num_a_alleles - is_polarised, + num_b_alleles - is_polarised, norm, f_params); if (ret != 0) { goto out; } @@ -2461,15 +2456,38 @@ compute_general_two_site_stat_result(const tsk_bit_array_t *site_a_state, result_tmp_row += result_dim; // Advance to the next column } } +out: + return ret; +} + +static int +compute_general_two_site_stat_result(const tsk_bitset_t *state, + const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, + tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, + sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, + double *result) +{ + int ret = 0; + tsk_size_t k; + tsk_bitset_t AB_samples = work->AB_samples; + tsk_size_t mut_a = 1, mut_b = 1; + double *restrict hap_row, *restrict weights = work->weights; + for (k = 0; k < state_dim; k++) { + tsk_bitset_intersect(state, a_off + (mut_a * state_dim) + k, state, + b_off + (mut_b * state_dim) + k, &AB_samples); + hap_row = GET_2D_ROW(weights, 3, k); + hap_row[0] = (double) tsk_bitset_count(&AB_samples, 0); + hap_row[1] + = (double) allele_counts[a_off + (mut_a * state_dim) + k] - hap_row[0]; + hap_row[2] + = (double) allele_counts[b_off + (mut_b * state_dim) + k] - hap_row[0]; + } + ret = f(state_dim, weights, result_dim, result, f_params); + if (ret != 0) { + goto out; + } out: - tsk_safe_free(weights); - tsk_safe_free(norm); - tsk_safe_free(result_tmp); - tsk_bit_array_free(&ss_A_samples); - tsk_bit_array_free(&ss_B_samples); - tsk_bit_array_free(&ss_AB_samples); - tsk_bit_array_free(&AB_samples); return ret; } @@ -2520,7 +2538,7 @@ 
get_site_row_col_indices(tsk_size_t n_rows, const tsk_id_t *row_sites, tsk_size_ static int get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t n_sites, - tsk_size_t *num_alleles, tsk_bit_array_t *allele_samples) + tsk_size_t *num_alleles, tsk_bitset_t *allele_samples) { int ret = 0; const tsk_flags_t *restrict flags = ts->tables->nodes.flags; @@ -2528,7 +2546,7 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t const tsk_size_t *restrict site_muts_len = ts->site_mutations_length; tsk_site_t site; tsk_tree_t tree; - tsk_bit_array_t all_samples_bits, mut_samples, mut_samples_row, out_row; + tsk_bitset_t all_samples_bits, mut_samples; tsk_size_t max_muts_len, site_offset, num_nodes, site_idx, s, m, n; tsk_id_t node, *nodes = NULL; void *tmp_nodes; @@ -2538,16 +2556,14 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t max_muts_len = 0; for (s = 0; s < n_sites; s++) { - if (site_muts_len[sites[s]] > max_muts_len) { - max_muts_len = site_muts_len[sites[s]]; - } + max_muts_len = TSK_MAX(site_muts_len[sites[s]], max_muts_len); } // Allocate a bit array of size max alleles for all sites - ret = tsk_bit_array_init(&mut_samples, num_samples, max_muts_len); + ret = tsk_bitset_init(&mut_samples, num_samples, max_muts_len); if (ret != 0) { goto out; } - ret = tsk_bit_array_init(&all_samples_bits, num_samples, 1); + ret = tsk_bitset_init(&all_samples_bits, num_samples, 1); if (ret != 0) { goto out; } @@ -2572,15 +2588,11 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t goto out; } nodes = tmp_nodes; - - tsk_bit_array_get_row(allele_samples, site_offset, &out_row); - tsk_bit_array_add(&out_row, &all_samples_bits); - + tsk_bitset_union(allele_samples, site_offset, &all_samples_bits, 0); // Zero out results before the start of each iteration tsk_memset(mut_samples.data, 0, - mut_samples.size * max_muts_len * sizeof(tsk_bit_array_value_t)); + 
mut_samples.row_len * max_muts_len * sizeof(tsk_bitset_val_t));
         for (m = 0; m < site.mutations_length; m++) {
-            tsk_bit_array_get_row(&mut_samples, m, &mut_samples_row);
             node = site.mutations[m].node;
             ret = tsk_tree_preorder_from(&tree, node, nodes, &num_nodes);
             if (ret != 0) {
@@ -2589,43 +2601,77 @@ get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t
             for (n = 0; n < num_nodes; n++) {
                 node = nodes[n];
                 if (flags[node] & TSK_NODE_IS_SAMPLE) {
-                    tsk_bit_array_add_bit(&mut_samples_row,
-                        (tsk_bit_array_value_t) ts->sample_index_map[node]);
+                    tsk_bitset_set_bit(
+                        &mut_samples, m, (tsk_bitset_val_t) ts->sample_index_map[node]);
                 }
             }
         }
+        get_allele_samples(
+            &site, site_offset, &mut_samples, allele_samples, &(num_alleles[site_idx]));
         site_offset += site.mutations_length + 1;
-        get_allele_samples(&site, &mut_samples, &out_row, &(num_alleles[site_idx]));
     }
     // if adding code below, check ret before continuing
 out:
     tsk_safe_free(nodes);
     tsk_tree_free(&tree);
-    tsk_bit_array_free(&mut_samples);
-    tsk_bit_array_free(&all_samples_bits);
+    tsk_bitset_free(&mut_samples);
+    tsk_bitset_free(&all_samples_bits);
     return ret == TSK_TREE_OK ? 0 : ret;
 }

+// Given the samples under each allele's node and the sample sets, get the samples under
+// each allele's node for each sample set. We pack this data into a bitset
+// (`allele_sample_sets`) that is size m x n, where m is (n_alleles * num_sample_sets)
+// and n is the size of the largest sample set. In addition, we compute the number of
+// samples contained in the intersection of each allele's samples and each sample set in
+// an array (`allele_sample_set_counts`) of length (n_alleles * num_sample_sets).
+static void +get_mutation_sample_sets(const tsk_bitset_t *allele_samples, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + const tsk_id_t *sample_index_map, tsk_bitset_t *allele_sample_sets, + tsk_size_t *allele_sample_set_counts) +{ + tsk_bitset_val_t k, sample; + tsk_size_t i, j, ss_off; + + for (i = 0; i < allele_samples->len; i++) { + ss_off = 0; + for (j = 0; j < num_sample_sets; j++) { + for (k = 0; k < sample_set_sizes[j]; k++) { + sample = (tsk_bitset_val_t) sample_index_map[sample_sets[k + ss_off]]; + if (tsk_bitset_contains(allele_samples, i, sample)) { + tsk_bitset_set_bit(allele_sample_sets, j + i * num_sample_sets, k); + allele_sample_set_counts[j + i * num_sample_sets]++; + } + } + ss_off += sample_set_sizes[j]; + } + } +} + static int tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *norm_f, tsk_size_t n_rows, const tsk_id_t *row_sites, tsk_size_t n_cols, const tsk_id_t *col_sites, tsk_flags_t options, double *result) { - int ret = 0; - tsk_bit_array_t allele_samples, c_state, r_state; - bool polarised = false; + tsk_bitset_t allele_samples, allele_sample_sets; + bool polarised = options & TSK_STAT_POLARISED; tsk_id_t *sites; - tsk_size_t r, c, s, n_alleles, n_sites, *row_idx, *col_idx; + tsk_size_t i, j, n_sites, *row_idx, *col_idx; double *result_row; const tsk_size_t num_samples = self->num_samples; - tsk_size_t *num_alleles = NULL, *site_offsets = NULL; + tsk_size_t *num_alleles = NULL, *site_offsets = NULL, *allele_counts = NULL; tsk_size_t result_row_len = n_cols * result_dim; + tsk_size_t max_ss_size = 0, max_alleles = 0, n_alleles = 0; + two_locus_work_t work; + tsk_memset(&work, 0, 
sizeof(work)); tsk_memset(&allele_samples, 0, sizeof(allele_samples)); - + tsk_memset(&allele_sample_sets, 0, sizeof(allele_sample_sets)); sites = tsk_malloc(self->tables->sites.num_rows * sizeof(*sites)); row_idx = tsk_malloc(self->tables->sites.num_rows * sizeof(*row_idx)); col_idx = tsk_malloc(self->tables->sites.num_rows * sizeof(*col_idx)); @@ -2635,44 +2681,67 @@ tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, } get_site_row_col_indices( n_rows, row_sites, n_cols, col_sites, sites, &n_sites, row_idx, col_idx); - - // We rely on n_sites to allocate these arrays, which are initialized - // to NULL for safe deallocation if the previous allocation fails + // depends on n_sites num_alleles = tsk_malloc(n_sites * sizeof(*num_alleles)); site_offsets = tsk_malloc(n_sites * sizeof(*site_offsets)); if (num_alleles == NULL || site_offsets == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } - - n_alleles = 0; - for (s = 0; s < n_sites; s++) { - site_offsets[s] = n_alleles; - n_alleles += self->site_mutations_length[sites[s]] + 1; + for (i = 0; i < n_sites; i++) { + site_offsets[i] = n_alleles * num_sample_sets; + n_alleles += self->site_mutations_length[sites[i]] + 1; + max_alleles = TSK_MAX(self->site_mutations_length[sites[i]], max_alleles); } - ret = tsk_bit_array_init(&allele_samples, num_samples, n_alleles); + max_alleles++; // add 1 for the ancestral allele + // depends on n_alleles + ret = tsk_bitset_init(&allele_samples, num_samples, n_alleles); if (ret != 0) { goto out; } - ret = get_mutation_samples(self, sites, n_sites, num_alleles, &allele_samples); + for (i = 0; i < num_sample_sets; i++) { + max_ss_size = TSK_MAX(sample_set_sizes[i], max_ss_size); + } + // depend on n_alleles and max_ss_size + ret = tsk_bitset_init(&allele_sample_sets, max_ss_size, n_alleles * num_sample_sets); if (ret != 0) { goto out; } - - if (options & TSK_STAT_POLARISED) { - polarised = true; + allele_counts = tsk_calloc(n_alleles * 
num_sample_sets, sizeof(*allele_counts)); + if (allele_counts == NULL) { + ret = tsk_trace_error(TSK_ERR_NO_MEMORY); + goto out; } - + // depends on max_ss_size and max_alleles + ret = two_locus_work_init(max_alleles, max_ss_size, result_dim, state_dim, &work); + if (ret != 0) { + goto out; + } + // we track the number of alleles to account for backmutations + ret = get_mutation_samples(self, sites, n_sites, num_alleles, &allele_samples); + if (ret != 0) { + goto out; + } + get_mutation_sample_sets(&allele_samples, num_sample_sets, sample_set_sizes, + sample_sets, self->sample_index_map, &allele_sample_sets, allele_counts); // For each row/column pair, fill in the sample set in the result matrix. - for (r = 0; r < n_rows; r++) { - result_row = GET_2D_ROW(result, result_row_len, r); - for (c = 0; c < n_cols; c++) { - tsk_bit_array_get_row(&allele_samples, site_offsets[row_idx[r]], &r_state); - tsk_bit_array_get_row(&allele_samples, site_offsets[col_idx[c]], &c_state); - ret = compute_general_two_site_stat_result(&r_state, &c_state, - num_alleles[row_idx[r]], num_alleles[col_idx[c]], num_samples, state_dim, - sample_sets, result_dim, f, f_params, norm_f, polarised, - &(result_row[c * result_dim])); + for (i = 0; i < n_rows; i++) { + result_row = GET_2D_ROW(result, result_row_len, i); + for (j = 0; j < n_cols; j++) { + if (num_alleles[row_idx[i]] == 2 && num_alleles[col_idx[j]] == 2) { + // both sites are biallelic + ret = compute_general_two_site_stat_result(&allele_sample_sets, + allele_counts, site_offsets[row_idx[i]], site_offsets[col_idx[j]], + state_dim, result_dim, f, f_params, &work, + &(result_row[j * result_dim])); + } else { + // at least one site is multiallelic + ret = compute_general_normed_two_site_stat_result(&allele_sample_sets, + allele_counts, site_offsets[row_idx[i]], site_offsets[col_idx[j]], + num_alleles[row_idx[i]], num_alleles[col_idx[j]], state_dim, + result_dim, f, f_params, norm_f, polarised, &work, + &(result_row[j * result_dim])); + } if 
(ret != 0) { goto out; } @@ -2685,37 +2754,37 @@ tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_safe_free(col_idx); tsk_safe_free(num_alleles); tsk_safe_free(site_offsets); - tsk_bit_array_free(&allele_samples); + tsk_safe_free(allele_counts); + two_locus_work_free(&work); + tsk_bitset_free(&allele_samples); + tsk_bitset_free(&allele_sample_sets); return ret; } static int -sample_sets_to_bit_array(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, +sample_sets_to_bitset(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_sample_sets, - tsk_bit_array_t *sample_sets_bits) + tsk_bitset_t *sample_sets_bits) { int ret; - tsk_bit_array_t bits_row; tsk_size_t j, k, l; tsk_id_t u, sample_index; - ret = tsk_bit_array_init(sample_sets_bits, self->num_samples, num_sample_sets); + ret = tsk_bitset_init(sample_sets_bits, self->num_samples, num_sample_sets); if (ret != 0) { return ret; } - j = 0; for (k = 0; k < num_sample_sets; k++) { - tsk_bit_array_get_row(sample_sets_bits, k, &bits_row); for (l = 0; l < sample_set_sizes[k]; l++) { u = sample_sets[j]; sample_index = self->sample_index_map[u]; - if (tsk_bit_array_contains( - &bits_row, (tsk_bit_array_value_t) sample_index)) { + if (tsk_bitset_contains( + sample_sets_bits, k, (tsk_bitset_val_t) sample_index)) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } - tsk_bit_array_add_bit(&bits_row, (tsk_bit_array_value_t) sample_index); + tsk_bitset_set_bit(sample_sets_bits, k, (tsk_bitset_val_t) sample_index); j++; } } @@ -2852,7 +2921,7 @@ get_index_counts( typedef struct { tsk_tree_t tree; - tsk_bit_array_t *node_samples; + tsk_bitset_t *node_samples; tsk_id_t *parent; tsk_id_t *edges_out; tsk_id_t *edges_in; @@ -2876,7 +2945,7 @@ iter_state_init(iter_state *self, const tsk_treeseq_t *ts, tsk_size_t state_dim) ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } - ret = tsk_bit_array_init(self->node_samples, 
ts->num_samples, state_dim * num_nodes); + ret = tsk_bitset_init(self->node_samples, ts->num_samples, state_dim * num_nodes); if (ret != 0) { goto out; } @@ -2895,29 +2964,25 @@ iter_state_init(iter_state *self, const tsk_treeseq_t *ts, tsk_size_t state_dim) static int get_node_samples(const tsk_treeseq_t *ts, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_bit_array_t *node_samples) + const tsk_bitset_t *sample_sets, tsk_bitset_t *node_samples) { int ret = 0; tsk_size_t n, k; - tsk_bit_array_t sample_set_row, node_samples_row; tsk_size_t num_nodes = ts->tables->nodes.num_rows; - tsk_bit_array_value_t sample; + tsk_bitset_val_t sample; const tsk_id_t *restrict sample_index_map = ts->sample_index_map; const tsk_flags_t *restrict flags = ts->tables->nodes.flags; - ret = tsk_bit_array_init(node_samples, ts->num_samples, num_nodes * state_dim); + ret = tsk_bitset_init(node_samples, ts->num_samples, num_nodes * state_dim); if (ret != 0) { goto out; } for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row(sample_sets, k, &sample_set_row); for (n = 0; n < num_nodes; n++) { if (flags[n] & TSK_NODE_IS_SAMPLE) { - sample = (tsk_bit_array_value_t) sample_index_map[n]; - if (tsk_bit_array_contains(&sample_set_row, sample)) { - tsk_bit_array_get_row( - node_samples, (state_dim * n) + k, &node_samples_row); - tsk_bit_array_add_bit(&node_samples_row, sample); + sample = (tsk_bitset_val_t) sample_index_map[n]; + if (tsk_bitset_contains(sample_sets, k, sample)) { + tsk_bitset_set_bit(node_samples, (state_dim * n) + k, sample); } } } @@ -2928,7 +2993,7 @@ get_node_samples(const tsk_treeseq_t *ts, tsk_size_t state_dim, static void iter_state_clear(iter_state *self, tsk_size_t state_dim, tsk_size_t num_nodes, - const tsk_bit_array_t *node_samples) + const tsk_bitset_t *node_samples) { self->n_edges_out = 0; self->n_edges_in = 0; @@ -2938,14 +3003,14 @@ iter_state_clear(iter_state *self, tsk_size_t state_dim, tsk_size_t num_nodes, tsk_memset(self->edges_in, TSK_NULL, 
num_nodes * sizeof(*self->edges_in)); tsk_memset(self->branch_len, 0, num_nodes * sizeof(*self->branch_len)); tsk_memcpy(self->node_samples->data, node_samples->data, - node_samples->size * state_dim * num_nodes * sizeof(*node_samples->data)); + node_samples->row_len * state_dim * num_nodes * sizeof(*node_samples->data)); } static void iter_state_free(iter_state *self) { tsk_tree_free(&self->tree); - tsk_bit_array_free(self->node_samples); + tsk_bitset_free(self->node_samples); tsk_safe_free(self->node_samples); tsk_safe_free(self->parent); tsk_safe_free(self->edges_out); @@ -3025,41 +3090,26 @@ static int compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, const iter_state *A_state, const iter_state *B_state, tsk_size_t state_dim, tsk_size_t result_dim, int sign, general_stat_func_t *f, - sample_count_stat_params_t *f_params, double *result) + sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, + double *result) { int ret = 0; double a_len, b_len; double *restrict B_branch_len = B_state->branch_len; - double *weights = NULL, *weights_row, *result_tmp = NULL; + double *weights_row; tsk_size_t n, k, a_row, b_row; - tsk_bit_array_t A_samples, B_samples, AB_samples, B_samples_tmp; const double *restrict A_branch_len = A_state->branch_len; - const tsk_bit_array_t *restrict A_state_samples = A_state->node_samples; - const tsk_bit_array_t *restrict B_state_samples = B_state->node_samples; - tsk_size_t num_samples = ts->num_samples; + const tsk_bitset_t *restrict A_state_samples = A_state->node_samples; + const tsk_bitset_t *restrict B_state_samples = B_state->node_samples; tsk_size_t num_nodes = ts->tables->nodes.num_rows; + double *weights = work->weights; + double *result_tmp = work->result_tmp; + tsk_bitset_t AB_samples = work->AB_samples; + b_len = B_branch_len[c] * sign; if (b_len == 0) { return ret; } - - tsk_memset(&AB_samples, 0, sizeof(AB_samples)); - tsk_memset(&B_samples_tmp, 0, sizeof(B_samples_tmp)); - - weights = 
tsk_calloc(3 * state_dim, sizeof(*weights)); - result_tmp = tsk_calloc(result_dim, sizeof(*result_tmp)); - if (weights == NULL || result_tmp == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - ret = tsk_bit_array_init(&AB_samples, num_samples, 1); - if (ret != 0) { - goto out; - } - ret = tsk_bit_array_init(&B_samples_tmp, num_samples, 1); - if (ret != 0) { - goto out; - } for (n = 0; n < num_nodes; n++) { a_len = A_branch_len[n]; if (a_len == 0) { @@ -3068,15 +3118,14 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, for (k = 0; k < state_dim; k++) { a_row = (state_dim * n) + k; b_row = (state_dim * (tsk_size_t) c) + k; - tsk_bit_array_get_row(A_state_samples, a_row, &A_samples); - tsk_bit_array_get_row(B_state_samples, b_row, &B_samples); - tsk_bit_array_intersect(&A_samples, &B_samples, &AB_samples); weights_row = GET_2D_ROW(weights, 3, k); - weights_row[0] = (double) tsk_bit_array_count(&AB_samples); // w_AB + tsk_bitset_intersect( + A_state_samples, a_row, B_state_samples, b_row, &AB_samples); + weights_row[0] = (double) tsk_bitset_count(&AB_samples, 0); weights_row[1] - = (double) tsk_bit_array_count(&A_samples) - weights_row[0]; // w_Ab + = (double) tsk_bitset_count(A_state_samples, a_row) - weights_row[0]; weights_row[2] - = (double) tsk_bit_array_count(&B_samples) - weights_row[0]; // w_aB + = (double) tsk_bitset_count(B_state_samples, b_row) - weights_row[0]; } ret = f(state_dim, weights, result_dim, result_tmp, f_params); if (ret != 0) { @@ -3087,10 +3136,6 @@ compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, } } out: - tsk_safe_free(weights); - tsk_safe_free(result_tmp); - tsk_bit_array_free(&AB_samples); - tsk_bit_array_free(&B_samples_tmp); return ret; } @@ -3106,10 +3151,17 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, const tsk_id_t *restrict edges_child = ts->tables->edges.child; const tsk_id_t *restrict edges_parent = 
ts->tables->edges.parent; const tsk_size_t num_nodes = ts->tables->nodes.num_rows; - tsk_bit_array_t updates, row, ec_row, *r_samples = r_state->node_samples; + tsk_bitset_t updates, *r_samples = r_state->node_samples; + two_locus_work_t work; + tsk_memset(&work, 0, sizeof(work)); tsk_memset(&updates, 0, sizeof(updates)); - ret = tsk_bit_array_init(&updates, num_nodes, 1); + // only two alleles are possible for branch stats + ret = two_locus_work_init(2, ts->num_samples, result_dim, state_dim, &work); + if (ret != 0) { + goto out; + } + ret = tsk_bitset_init(&updates, num_nodes, 1); if (ret != 0) { goto out; } @@ -3126,18 +3178,18 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, c = edges_child[e]; // Identify affected nodes above child while (p != TSK_NULL) { - tsk_bit_array_add_bit(&updates, (tsk_bit_array_value_t) c); + tsk_bitset_set_bit(&updates, 0, (tsk_bitset_val_t) c); c = p; p = r_state->parent[p]; } } // Subtract the whole contribution from the child node - tsk_bit_array_get_items(&updates, updated_nodes, &n_updates); + tsk_bitset_get_items(&updates, 0, updated_nodes, &n_updates); while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; - compute_two_tree_branch_state_update( - ts, c, l_state, r_state, state_dim, result_dim, -1, f, f_params, result); + compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, + result_dim, -1, f, f_params, &work, result); } // Remove samples under nodes from removed edges to parent nodes for (j = 0; j < r_state->n_edges_out; j++) { @@ -3146,10 +3198,8 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, ec = edges_child[e]; // edge child while (p != TSK_NULL) { for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) ec) + k, &ec_row); - tsk_bit_array_get_row(r_samples, (state_dim * (tsk_size_t) p) + k, &row); - tsk_bit_array_subtract(&row, &ec_row); + tsk_bitset_subtract(r_samples, 
(state_dim * (tsk_size_t) p) + k, + r_samples, (state_dim * (tsk_size_t) ec) + k); } p = r_state->parent[p]; } @@ -3164,12 +3214,10 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, r_state->branch_len[c] = time[p] - time[c]; r_state->parent[c] = p; while (p != TSK_NULL) { - tsk_bit_array_add_bit(&updates, (tsk_bit_array_value_t) c); + tsk_bitset_set_bit(&updates, 0, (tsk_bitset_val_t) c); for (k = 0; k < state_dim; k++) { - tsk_bit_array_get_row( - r_samples, (state_dim * (tsk_size_t) ec) + k, &ec_row); - tsk_bit_array_get_row(r_samples, (state_dim * (tsk_size_t) p) + k, &row); - tsk_bit_array_add(&row, &ec_row); + tsk_bitset_union(r_samples, (state_dim * (tsk_size_t) p) + k, r_samples, + (state_dim * (tsk_size_t) ec) + k); } c = p; p = r_state->parent[p]; @@ -3177,22 +3225,24 @@ compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, } // Update all affected child nodes (fully subtracted, deferred from addition) n_updates = 0; - tsk_bit_array_get_items(&updates, updated_nodes, &n_updates); + tsk_bitset_get_items(&updates, 0, updated_nodes, &n_updates); while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; - compute_two_tree_branch_state_update( - ts, c, l_state, r_state, state_dim, result_dim, +1, f, f_params, result); + compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, + result_dim, +1, f, f_params, &work, result); } out: tsk_safe_free(updated_nodes); - tsk_bit_array_free(&updates); + two_locus_work_free(&work); + tsk_bitset_free(&updates); return ret; } static int tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, - const tsk_bit_array_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, + tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *TSK_UNUSED(norm_f), tsk_size_t n_rows, const 
double *row_positions, tsk_size_t n_cols, const double *col_positions, tsk_flags_t TSK_UNUSED(options), double *result) @@ -3201,11 +3251,12 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di int r, c; tsk_id_t *row_indexes = NULL, *col_indexes = NULL; tsk_size_t i, j, k, row, col, *row_repeats = NULL, *col_repeats = NULL; - tsk_bit_array_t node_samples; + tsk_bitset_t node_samples, sample_sets_bits; iter_state l_state, r_state; double *result_tmp = NULL, *result_row; const tsk_size_t num_nodes = self->tables->nodes.num_rows; + tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); tsk_memset(&node_samples, 0, sizeof(node_samples)); tsk_memset(&l_state, 0, sizeof(l_state)); tsk_memset(&r_state, 0, sizeof(r_state)); @@ -3222,6 +3273,11 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di if (ret != 0) { goto out; } + ret = sample_sets_to_bitset( + self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); + if (ret != 0) { + goto out; + } ret = positions_to_tree_indexes(self, row_positions, n_rows, &row_indexes); if (ret != 0) { goto out; @@ -3238,7 +3294,7 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di if (ret != 0) { goto out; } - ret = get_node_samples(self, state_dim, sample_sets, &node_samples); + ret = get_node_samples(self, state_dim, &sample_sets_bits, &node_samples); if (ret != 0) { goto out; } @@ -3289,7 +3345,42 @@ tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_di tsk_safe_free(col_repeats); iter_state_free(&l_state); iter_state_free(&r_state); - tsk_bit_array_free(&node_samples); + tsk_bitset_free(&node_samples); + tsk_bitset_free(&sample_sets_bits); + return ret; +} + +static int +check_sample_set_dups(tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, + const tsk_id_t *sample_sets, const tsk_id_t *restrict sample_index_map, + tsk_size_t num_samples) +{ + int ret; + tsk_size_t j, k, l; + 
tsk_id_t u, sample_index; + tsk_bitset_t tmp; + + tsk_memset(&tmp, 0, sizeof(tmp)); + ret = tsk_bitset_init(&tmp, num_samples, 1); + if (ret != 0) { + goto out; + } + j = 0; + for (k = 0; k < num_sample_sets; k++) { + tsk_memset(tmp.data, 0, sizeof(*tmp.data) * tmp.row_len); + for (l = 0; l < sample_set_sizes[k]; l++) { + u = sample_sets[j]; + sample_index = sample_index_map[u]; + if (tsk_bitset_contains(&tmp, 0, (tsk_bitset_val_t) sample_index)) { + ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); + goto out; + } + tsk_bitset_set_bit(&tmp, 0, (tsk_bitset_val_t) sample_index); + j++; + } + } +out: + tsk_bitset_free(&tmp); return ret; } @@ -3304,7 +3395,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl // TODO: generalize this function if we ever decide to do weighted two_locus stats. // We only implement count stats and therefore we don't handle weights. int ret = 0; - tsk_bit_array_t sample_sets_bits; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); tsk_size_t state_dim = num_sample_sets; @@ -3313,8 +3403,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl .sample_set_sizes = sample_set_sizes, .set_indexes = set_indexes }; - tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); - // We do not support two-locus node stats if (!!(options & TSK_STAT_NODE)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); @@ -3338,12 +3426,6 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl ret = tsk_trace_error(TSK_ERR_BAD_RESULT_DIMS); goto out; } - ret = sample_sets_to_bit_array( - self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); - if (ret != 0) { - goto out; - } - if (stat_site) { ret = check_sites(row_sites, out_rows, self->tables->sites.num_rows); if (ret != 0) { @@ -3353,9 +3435,14 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl if (ret != 0) { goto out; } 
- ret = tsk_treeseq_two_site_count_stat(self, state_dim, &sample_sets_bits, - result_dim, f, &f_params, norm_f, out_rows, row_sites, out_cols, col_sites, - options, result); + ret = check_sample_set_dups(num_sample_sets, sample_set_sizes, sample_sets, + self->sample_index_map, self->num_samples); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_two_site_count_stat(self, state_dim, num_sample_sets, + sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + row_sites, out_cols, col_sites, options, result); } else if (stat_branch) { ret = check_positions( row_positions, out_rows, tsk_treeseq_get_sequence_length(self)); @@ -3367,13 +3454,11 @@ tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sampl if (ret != 0) { goto out; } - ret = tsk_treeseq_two_branch_count_stat(self, state_dim, &sample_sets_bits, - result_dim, f, &f_params, norm_f, out_rows, row_positions, out_cols, - col_positions, options, result); + ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets, + sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, + row_positions, out_cols, col_positions, options, result); } - out: - tsk_bit_array_free(&sample_sets_bits); return ret; } @@ -10687,6 +10772,9 @@ tsk_matvec_calculator_run(tsk_matvec_calculator_t *self) tsk_matvec_calculator_print_state(self, tsk_get_debug_stream()); } } + if (!!(self->options & TSK_STAT_SPAN_NORMALISE)) { + span_normalise(self->num_windows, windows, out_size, self->result); + } /* out: */ return ret; diff --git a/treerec/tskit/trees.h b/treerec/tskit/trees.h index 21495edb..2e495a40 100644 --- a/treerec/tskit/trees.h +++ b/treerec/tskit/trees.h @@ -933,17 +933,20 @@ path from `p` to `c`. For instance, if `p` is the parent of `n` and `n` is the parent of `c`, then the span of the edges from `p` to `n` and `n` to `c` are extended, and the span of the edge from `p` to `c` is reduced. However, any edges whose child node is a sample are not -modified. 
The `node` of certain mutations may also be remapped; to do this +modified. See Fritze et al. (2025): +https://doi.org/10.1093/genetics/iyaf198 for more details. + +The method works by iterating over the genome to look for edges that can +be extended in this way; the maximum number of such iterations is +controlled by ``max_iter``. + +The `node` of certain mutations may also be remapped; to do this unambiguously we need to know mutation times. If mutations times are unknown, use `tsk_table_collection_compute_mutation_times` first. The method will not affect any tables except the edge table, or the node column in the mutation table. -The method works by iterating over the genome to look for edges that can -be extended in this way; the maximum number of such iterations is -controlled by ``max_iter``. - @rst **Options**: None currently defined. @@ -966,6 +969,52 @@ int tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); +/** +@brief Decode full-length alignments for specified nodes over an interval. + +@rst +Fills a caller-provided buffer with per-node sequence alignments for the interval +``[left, right)``. Each row is exactly ``L = right - left`` bytes with no trailing +terminator, and rows are tightly packed in row-major order in the output buffer. + +The output at non-site positions comes from the provided ``ref_seq`` slice +(``ref_seq[left:right]``); per-site alleles are overlaid onto this for each node. + +If the :c:macro:`TSK_ISOLATED_NOT_MISSING` option is +not set, nodes that are isolated (no parent and no children) within a tree +interval in ``[left, right)`` are rendered as the ``missing_data_character`` for +that interval. At site positions, decoded genotypes override any previous value; +if a genotype is missing (``TSK_MISSING_DATA``), the ``missing_data_character`` is +overlaid onto the reference base. 
+ +Requirements and validation: + +- The tree sequence must have a discrete genome. +- ``left`` and ``right`` must be integers with ``0 <= left < right <= sequence_length``. +- ``ref_seq`` must be non-NULL and ``ref_seq_length == sequence_length``. +- Each allele at a site must be exactly one byte; alleles equal to + ``missing_data_character`` are not permitted. + +@endrst + +@param self A pointer to a :c:type:`tsk_treeseq_t` object. +@param ref_seq Pointer to a reference sequence buffer of length ``ref_seq_length``. +@param ref_seq_length The total length of ``ref_seq``; must equal the tree sequence +length. +@param nodes Array of node IDs to decode (may include non-samples). +@param num_nodes The number of nodes in ``nodes`` and rows in the output. +@param left The inclusive-left genomic coordinate of the output interval. +@param right The exclusive-right genomic coordinate of the output interval. +@param missing_data_character The byte to use for missing data. +@param alignments_out Output buffer of size at least ``num_nodes * (right - left)``. +@param options Bitwise option flags; supports :c:macro:`TSK_ISOLATED_NOT_MISSING`. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_treeseq_decode_alignments(const tsk_treeseq_t *self, const char *ref_seq, + tsk_size_t ref_seq_length, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, + double right, char missing_data_character, char *alignments_out, + tsk_flags_t options); + int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output); int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output);