Skip to content

Commit a05b7ad

Browse files
committed
Add 'mutation' mode; switch genetic_relatedness_matrix and
divergence_matrix to this mode; document divergence_matrix
1 parent dacbf38 commit a05b7ad

File tree

10 files changed

+193
-138
lines changed

10 files changed

+193
-138
lines changed

c/tests/test_stats.c

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,6 +1315,12 @@ test_general_stat_input_errors(void)
13151315
&ts, 1, &W, 0, general_stat_sum, NULL, 0, NULL, 0, &result);
13161316
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_RESULT_DIMS);
13171317

1318+
/* Unsupported mode */
1319+
/* TODO: change when STAT_MUTATION is supported */
1320+
ret = tsk_treeseq_general_stat(
1321+
&ts, 1, &W, 1, general_stat_sum, NULL, 0, NULL, TSK_STAT_MUTATION, &result);
1322+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE);
1323+
13181324
/* Multiple stats*/
13191325
ret = tsk_treeseq_general_stat(&ts, 1, &W, 1, general_stat_sum, NULL, 0, NULL,
13201326
TSK_STAT_SITE | TSK_STAT_BRANCH, &result);
@@ -1464,18 +1470,26 @@ test_single_tree_divergence_matrix(void)
14641470
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
14651471
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
14661472

1473+
ret = tsk_treeseq_divergence_matrix(
1474+
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
1475+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE);
1476+
1477+
ret = tsk_treeseq_divergence_matrix(
1478+
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_NODE, result);
1479+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE);
1480+
14671481
ret = tsk_treeseq_divergence_matrix(
14681482
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result);
14691483
CU_ASSERT_EQUAL_FATAL(ret, 0);
14701484
assert_arrays_almost_equal(16, result, D_branch);
14711485

14721486
ret = tsk_treeseq_divergence_matrix(
1473-
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
1487+
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_MUTATION, result);
14741488
CU_ASSERT_EQUAL_FATAL(ret, 0);
14751489
assert_arrays_almost_equal(16, result, D_site);
14761490

14771491
ret = tsk_treeseq_divergence_matrix(
1478-
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_SITE, result);
1492+
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_MUTATION, result);
14791493
CU_ASSERT_EQUAL_FATAL(ret, 0);
14801494

14811495
ret = tsk_treeseq_divergence_matrix(
@@ -1488,15 +1502,15 @@ test_single_tree_divergence_matrix(void)
14881502
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_BRANCH, result);
14891503
CU_ASSERT_EQUAL_FATAL(ret, 0);
14901504
ret = tsk_treeseq_divergence_matrix(
1491-
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_SITE, result);
1505+
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_MUTATION, result);
14921506
CU_ASSERT_EQUAL_FATAL(ret, 0);
14931507

14941508
/* assert_arrays_almost_equal(4, result, D_site); */
14951509

14961510
verify_divergence_matrix(&ts, TSK_STAT_BRANCH);
14971511
verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE);
1498-
verify_divergence_matrix(&ts, TSK_STAT_SITE);
1499-
verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE);
1512+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION);
1513+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION | TSK_STAT_SPAN_NORMALISE);
15001514

15011515
tsk_treeseq_free(&ts);
15021516
}
@@ -1542,7 +1556,7 @@ test_single_tree_divergence_matrix_internal_samples(void)
15421556
CU_ASSERT_EQUAL_FATAL(ret, 0);
15431557
assert_arrays_almost_equal(16, result, D);
15441558
ret = tsk_treeseq_divergence_matrix(
1545-
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
1559+
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_MUTATION, result);
15461560
CU_ASSERT_EQUAL_FATAL(ret, 0);
15471561
assert_arrays_almost_equal(16, result, D);
15481562

@@ -1551,7 +1565,7 @@ test_single_tree_divergence_matrix_internal_samples(void)
15511565
CU_ASSERT_EQUAL_FATAL(ret, 0);
15521566
assert_arrays_almost_equal(16, result, D);
15531567
ret = tsk_treeseq_divergence_matrix(
1554-
&ts, 4, sizes, samples, 0, NULL, TSK_STAT_SITE, result);
1568+
&ts, 4, sizes, samples, 0, NULL, TSK_STAT_MUTATION, result);
15551569
CU_ASSERT_EQUAL_FATAL(ret, 0);
15561570
assert_arrays_almost_equal(16, result, D);
15571571

@@ -1560,14 +1574,14 @@ test_single_tree_divergence_matrix_internal_samples(void)
15601574
CU_ASSERT_EQUAL_FATAL(ret, 0);
15611575
assert_arrays_almost_equal(16, result, D);
15621576
ret = tsk_treeseq_divergence_matrix(
1563-
&ts, 4, NULL, samples, 0, NULL, TSK_STAT_SITE, result);
1577+
&ts, 4, NULL, samples, 0, NULL, TSK_STAT_MUTATION, result);
15641578
CU_ASSERT_EQUAL_FATAL(ret, 0);
15651579
assert_arrays_almost_equal(16, result, D);
15661580

15671581
verify_divergence_matrix(&ts, TSK_STAT_BRANCH);
15681582
verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE);
1569-
verify_divergence_matrix(&ts, TSK_STAT_SITE);
1570-
verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE);
1583+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION);
1584+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION | TSK_STAT_SPAN_NORMALISE);
15711585

15721586
tsk_treeseq_free(&ts);
15731587
free(result);
@@ -1616,8 +1630,8 @@ test_single_tree_divergence_matrix_multi_root(void)
16161630

16171631
verify_divergence_matrix(&ts, TSK_STAT_BRANCH);
16181632
verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE);
1619-
verify_divergence_matrix(&ts, TSK_STAT_SITE);
1620-
verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE);
1633+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION);
1634+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION | TSK_STAT_SPAN_NORMALISE);
16211635

16221636
tsk_treeseq_free(&ts);
16231637
}
@@ -2557,8 +2571,8 @@ test_paper_ex_divergence_matrix(void)
25572571

25582572
verify_divergence_matrix(&ts, TSK_STAT_BRANCH);
25592573
verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE);
2560-
verify_divergence_matrix(&ts, TSK_STAT_SITE);
2561-
verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE);
2574+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION);
2575+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION | TSK_STAT_SPAN_NORMALISE);
25622576

25632577
tsk_treeseq_free(&ts);
25642578
}
@@ -3856,7 +3870,7 @@ test_simplest_divergence_matrix(void)
38563870
assert_arrays_almost_equal(4, D_site, result);
38573871

38583872
ret = tsk_treeseq_divergence_matrix(
3859-
&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_SITE, result);
3873+
&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_MUTATION, result);
38603874
CU_ASSERT_EQUAL_FATAL(ret, 0);
38613875
assert_arrays_almost_equal(4, D_site, result);
38623876

@@ -3866,7 +3880,7 @@ test_simplest_divergence_matrix(void)
38663880
assert_arrays_almost_equal(4, D_branch, result);
38673881

38683882
ret = tsk_treeseq_divergence_matrix(
3869-
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
3883+
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_MUTATION, result);
38703884
CU_ASSERT_EQUAL_FATAL(ret, 0);
38713885
assert_arrays_almost_equal(4, D_site, result);
38723886

@@ -3879,7 +3893,7 @@ test_simplest_divergence_matrix(void)
38793893
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_POLARISED_UNSUPPORTED);
38803894

38813895
ret = tsk_treeseq_divergence_matrix(
3882-
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE | TSK_STAT_BRANCH, result);
3896+
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_MUTATION | TSK_STAT_BRANCH, result);
38833897
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES);
38843898

38853899
sample_ids[0] = -1;
@@ -3935,7 +3949,7 @@ test_simplest_divergence_matrix_windows(void)
39353949

39363950
/* Windows for the second half */
39373951
ret = tsk_treeseq_divergence_matrix(
3938-
&ts, 2, NULL, sample_ids, 1, windows + 1, TSK_STAT_SITE, result);
3952+
&ts, 2, NULL, sample_ids, 1, windows + 1, TSK_STAT_MUTATION, result);
39393953
CU_ASSERT_EQUAL_FATAL(ret, 0);
39403954
assert_arrays_almost_equal(4, D_site, result);
39413955
ret = tsk_treeseq_divergence_matrix(
@@ -3985,7 +3999,7 @@ test_simplest_divergence_matrix_internal_sample(void)
39853999
assert_arrays_almost_equal(9, D_branch, result);
39864000

39874001
ret = tsk_treeseq_divergence_matrix(
3988-
&ts, 3, NULL, sample_ids, 0, NULL, TSK_STAT_SITE, result);
4002+
&ts, 3, NULL, sample_ids, 0, NULL, TSK_STAT_MUTATION, result);
39894003
CU_ASSERT_EQUAL_FATAL(ret, 0);
39904004
assert_arrays_almost_equal(9, D_site, result);
39914005

@@ -4002,8 +4016,8 @@ test_multiroot_divergence_matrix(void)
40024016

40034017
verify_divergence_matrix(&ts, TSK_STAT_BRANCH);
40044018
verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE);
4005-
verify_divergence_matrix(&ts, TSK_STAT_SITE);
4006-
verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE);
4019+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION);
4020+
verify_divergence_matrix(&ts, TSK_STAT_MUTATION | TSK_STAT_SPAN_NORMALISE);
40074021

40084022
tsk_treeseq_free(&ts);
40094023
}

c/tskit/trees.c

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2041,13 +2041,18 @@ tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
20412041
bool stat_site = !!(options & TSK_STAT_SITE);
20422042
bool stat_branch = !!(options & TSK_STAT_BRANCH);
20432043
bool stat_node = !!(options & TSK_STAT_NODE);
2044+
bool stat_mutation = !!(options & TSK_STAT_MUTATION);
20442045
double default_windows[] = { 0, self->tables->sequence_length };
20452046
tsk_size_t row_size;
20462047

20472048
/* If no mode is specified, we default to site mode */
2048-
if (!(stat_site || stat_branch || stat_node)) {
2049+
if (!(stat_site || stat_branch || stat_node || stat_mutation)) {
20492050
stat_site = true;
20502051
}
2052+
if (stat_mutation) {
2053+
ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE);
2054+
goto out;
2055+
}
20512056
/* It's an error to specify more than one mode */
20522057
if (stat_site + stat_branch + stat_node > 1) {
20532058
ret = tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES);
@@ -8751,11 +8756,11 @@ remap_to_sample_sets(const tsk_size_t num_samples, const tsk_id_t *restrict samp
87518756
}
87528757

87538758
static int
8754-
tsk_treeseq_divergence_matrix_site(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
8755-
const tsk_id_t *restrict sample_set_index_map, const tsk_size_t num_samples,
8756-
const tsk_id_t *restrict samples, tsk_size_t num_windows,
8757-
const double *restrict windows, tsk_flags_t TSK_UNUSED(options),
8758-
double *restrict result)
8759+
tsk_treeseq_divergence_matrix_mutation(const tsk_treeseq_t *self,
8760+
tsk_size_t num_sample_sets, const tsk_id_t *restrict sample_set_index_map,
8761+
const tsk_size_t num_samples, const tsk_id_t *restrict samples,
8762+
tsk_size_t num_windows, const double *restrict windows,
8763+
tsk_flags_t TSK_UNUSED(options), double *restrict result)
87598764
{
87608765
int ret = 0;
87618766
tsk_size_t i;
@@ -8913,20 +8918,21 @@ tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_s
89138918
bool stat_site = !!(options & TSK_STAT_SITE);
89148919
bool stat_branch = !!(options & TSK_STAT_BRANCH);
89158920
bool stat_node = !!(options & TSK_STAT_NODE);
8921+
bool stat_mutation = !!(options & TSK_STAT_MUTATION);
89168922
tsk_id_t *sample_set_index_map
89178923
= tsk_malloc(num_nodes * sizeof(*sample_set_index_map));
89188924
tsk_size_t j;
89198925

8920-
if (stat_node) {
8926+
if (stat_node || stat_site) {
89218927
ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE);
89228928
goto out;
89238929
}
8924-
/* If no mode is specified, we default to site mode */
8925-
if (!(stat_site || stat_branch)) {
8926-
stat_site = true;
8930+
/* If no mode is specified, we default to mutation mode */
8931+
if (!(stat_mutation || stat_branch)) {
8932+
stat_mutation = true;
89278933
}
89288934
/* It's an error to specify more than one mode */
8929-
if (stat_site + stat_branch > 1) {
8935+
if (stat_mutation + stat_branch > 1) {
89308936
ret = tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES);
89318937
goto out;
89328938
}
@@ -8982,8 +8988,8 @@ tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_s
89828988
ret = tsk_treeseq_divergence_matrix_branch(self, N, sample_set_sizes,
89838989
sample_sets, num_windows, windows, options, result);
89848990
} else {
8985-
tsk_bug_assert(stat_site);
8986-
ret = tsk_treeseq_divergence_matrix_site(self, N, sample_set_index_map,
8991+
tsk_bug_assert(stat_mutation);
8992+
ret = tsk_treeseq_divergence_matrix_mutation(self, N, sample_set_index_map,
89878993
total_samples, sample_sets, num_windows, windows, options, result);
89888994
}
89898995
if (ret != 0) {

c/tskit/trees.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ extern "C" {
4747
#define TSK_STAT_SITE (1 << 0)
4848
#define TSK_STAT_BRANCH (1 << 1)
4949
#define TSK_STAT_NODE (1 << 2)
50+
#define TSK_STAT_MUTATION (1 << 3)
5051

5152
/* Leave room for other stat types */
5253
#define TSK_STAT_POLARISED (1 << 10)

docs/python-api.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,7 @@ Single site
323323
TreeSequence.genetic_relatedness_weighted
324324
TreeSequence.genetic_relatedness_vector
325325
TreeSequence.genetic_relatedness_matrix
326+
TreeSequence.divergence_matrix
326327
TreeSequence.general_stat
327328
TreeSequence.segregating_sites
328329
TreeSequence.sample_count_stat

docs/stats.md

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -204,29 +204,43 @@ statistics API in use.
204204

205205
#### Mode
206206

207-
There are three **modes** of statistic: `site`, `branch`, and `node`,
207+
There are four **modes** of statistic: `site`, `branch`, `mutation`, and `node`,
208208
that each summarize aspects of the tree sequence in different but related ways.
209209
Roughly speaking, these answer the following sorts of question:
210210

211211
site
212-
: How many mutations differentiate these two genomes?
212+
: At how many sites do these two genomes differ?
213213

214214
branch
215215
: How long since these genomes' common ancestor?
216216

217+
branch
218+
: How many mutations have occurred since these genomes' common ancestor?
219+
217220
node
218221
: On how much of the genome is each node an ancestor of only one of these genomes, but not both?
219222

220-
These three examples can all be answered in the same way with the tree sequence:
223+
These examples can all be answered in the same way with the tree sequence:
221224
first, draw all the paths from one genome to the other through the tree sequence
222225
(back up to their common ancestor and back down in each marginal tree).
223226
Then,
224-
(`site`) count the number of mutations falling on the paths,
227+
(`site`) check if the allelic state at the endpoints is the same, or
225228
(`branch`) measure the length of the paths, or
229+
(`mutation`) count the number of mutations falling on the paths, or
226230
(`node`) count how often the path goes through each node.
227231
There is more discussion of this correspondence in the paper describing these statistics,
228232
and precise definitions are given in each statistic.
229233

234+
Furthermore, since mutations are rare, if two genomes differ in state at a site,
235+
then it's probably because there was a single mutation.
236+
This means that `site` and `mutation` are *almost* the same:
237+
and, they will be identical if there is at most one mutation per site
238+
and no mutations are silent.
239+
So, we tend to think of `site` in the same way, as counting numbers of mutations;
240+
with a small correction due to multiple mutations.
241+
Furthermore, `mutation` mode is not implemented for many statistics at this point,
242+
so use `site` to answer the same questions.
243+
230244
Here's an example of using the {meth}`~TreeSequence.diversity` statistic to return the
231245
average branch length between all pairs of samples:
232246

@@ -235,7 +249,7 @@ ts.diversity(mode="branch")
235249
```
236250

237251
One important thing to know is that `node` statistics have somewhat different output.
238-
While `site` and `branch` statistics naturally return one number
252+
While `site`, `branch`, and `mutation` statistics naturally return one number
239253
for each portion of the genome (and thus incorporates information about many nodes: see below),
240254
the `node` statistics return one number **for each node** in the tree sequence (and for each window).
241255
There can be a lot of nodes in the tree sequence, so beware.
@@ -795,8 +809,9 @@ pairwise) and Patterson's f statistics (which are four-way).
795809
### Derived statistics
796810

797811
Most statistics have the property that `mode="branch"` and
798-
`mode="site"` are "dual" in the sense that they are equal, on average, under
799-
a high neutral mutation rate. {meth}`~TreeSequence.Fst` and {meth}`~TreeSequence.Tajimas_D`
812+
`mode="mutation"` are "dual" in the sense that they are equal, on average,
813+
and so is `mode="site"` under the infinite sites model of mutation.
814+
{meth}`~TreeSequence.Fst` and {meth}`~TreeSequence.Tajimas_D`
800815
do not have this property (since both are ratios of statistics that do have this property).
801816

802817

python/CHANGELOG.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
is not the case.
1515
(:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`).
1616

17+
- The previous behavior of ``genetic_relatedness_matrix`` with ``mode="site"`` is
18+
now provided by ``mode="branch"``, and ``mode="site"`` is not supported.
19+
(:user:`petrelharp`, :pr:`3314`)
20+
1721
- Drop Python 3.9 support, require Python >= 3.10 (:pr:`3267`, :user:`benjeffery`)
1822

1923

@@ -59,6 +63,12 @@
5963
allowing greater flexibility in "disjoint union" situations.
6064
(:user:`hyanwong`, :user:`petrelharp`, :issue:`3181`)
6165

66+
- Add a new statistics mode, ``mode="mutation"``, that counts numbers of mutations
67+
rather than branch length (``mode="branch"``) or allelic state (``mode="site"``)
68+
(:user:`petrelharp`, :pr:`3314`).
69+
70+
- Add ``TreeSequence.divergence_matrix``, which was previously undocumented.
71+
6272
**Bugfixes**
6373

6474
- In some tables with mutations out-of-order ``TableCollection.sort`` did not re-order
@@ -102,6 +112,7 @@
102112
- ``ltrim``, ``rtrim``, ``trim`` and ``shift`` raise an error if used on a tree sequence
103113
containing a reference sequence (:user:`hyanwong`, :pr:`3210`, :issue:`2091`)
104114

115+
105116
--------------------
106117
[0.6.4] - 2025-05-21
107118
--------------------

python/_tskitmodule.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6442,6 +6442,8 @@ parse_stats_mode(char *mode, tsk_flags_t *ret)
64426442
value = TSK_STAT_BRANCH;
64436443
} else if (strcmp(mode, "node") == 0) {
64446444
value = TSK_STAT_NODE;
6445+
} else if (strcmp(mode, "mutation") == 0) {
6446+
value = TSK_STAT_MUTATION;
64456447
} else {
64466448
PyErr_SetString(PyExc_ValueError, "Unrecognised stats mode");
64476449
return -1;

0 commit comments

Comments
 (0)