@@ -16,6 +16,34 @@ kernelspec:
1616
1717(sec_counting_topologies)=
1818
19+ ``` {code-cell} ipython3
20+ :tags: [remove-cell]
21+ import msprime
22+ import stdpopsim
23+
24+ def topologies_sim_speciestree():
25+ newick_species_tree = "((A:100.0,B:100.0):100.0,C:200.0)"
26+ demography = msprime.Demography.from_species_tree(newick_species_tree, initial_size=100)
27+ ts = msprime.sim_ancestry({0: 2, 1: 2, 2: 2}, demography=demography, random_seed=321)
28+ ts.dump("data/topologies_sim_speciestree.trees")
29+
30+ def topologies_sim_stdpopsim():
31+ species = stdpopsim.get_species("HomSap")
32+ model = species.get_demographic_model("OutOfAfrica_3G09")
33+ contig = species.get_contig("chr1", length_multiplier=0.0002, mutation_rate=model.mutation_rate)
34+ samples = {"YRI": 1000, "CEU": 1000, "CHB": 1000}
35+ engine = stdpopsim.get_engine("msprime")
36+ ts = engine.simulate(model, contig, samples, seed=321)
37+ ts.dump("data/topologies_sim_stdpopsim.trees")
38+
39+
40+ def create_notebook_data():
41+ topologies_sim_speciestree()
42+ topologies_sim_stdpopsim()
43+
44+ # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
45+ ```
46+
1947# Counting topologies
2048
2149** Yan Wong**
@@ -225,21 +253,22 @@ one tip from each group using the {meth}`Tree.count_topologies` method.
225253
226254``` {code-cell}
227255:tags: [hide-input]
228-
229- newick_species_tree = "((A:100.0,B:100.0):100.0,C:200.0)"
230- demography = msprime.Demography.from_species_tree(newick_species_tree, initial_size=100)
231- ts = msprime.sim_ancestry({0: 2, 1: 2, 2: 2}, demography=demography, random_seed=321)
232- big_tree = ts.first()
256+ big_tree = tskit.load("data/topologies_sim_speciestree.trees").first()
257+ # Check all observed topologies have the same counts
258+ assert list(big_tree.count_topologies()[0, 1, 2].values()) == [32, 32]
233259styles = [
234- f".node.sample.p{p.id} > .sym " + "{" + f"fill: {colour}" + "}"
235- for colour, p in zip(['red', 'green', 'blue'], big_tree.tree_sequence.populations())
260+ f".node.sample.p{p.id} > .sym " + "{" + f"fill: {colour}" + "}"
261+ for colour, p in zip(['red', 'green', 'blue'], big_tree.tree_sequence.populations())
236262]
237263big_tree.draw_svg(style="".join(styles), node_labels={}, time_scale="rank", x_label="big_tree")
238264```
239265
240- In this tree, it is fairly clear that the red and green tips cluster together (although not exclusively),
241- so that if we took one red, one blue, and one green tip at random, we nearly always see the
242- same embedded topology. The {meth}` Tree.count_topologies ` method does this exhaustively:
266+ In this tree, it is clear that the green and blue tips never cluster together.
267+ The {meth}` Tree.count_topologies ` method exhaustively looks at all
268+ combinations of one red, one blue, and one green tip, and confirms that we never see
269+ the topology grouping green and blue. However, as might be expected from
270+ examination of the plot above, a red tip is equally likely to be a sister to a
271+ green tip as to a blue tip:
243272
244273``` {code-cell}
245274# By default `count_topologies` chooses one tip from each population, like setting
@@ -268,27 +297,24 @@ do this [incrementally](sec_incremental), without having to re-count the topolog
268297independently in each tree.
269298
270299``` {code-cell}
271- import stdpopsim
272- species = stdpopsim.get_species("HomSap")
273- model = species.get_demographic_model("OutOfAfrica_3G09")
274- contig = species.get_contig("chr1", length_multiplier=0.0002, mutation_rate=model.mutation_rate)
275- samples = {"YRI": 1000, "CEU": 1000, "CHB": 1000}
276- engine = stdpopsim.get_engine("msprime")
277- ts = engine.simulate(model, contig, samples)
278- print("Simulated", ts.num_trees, "African+European+Chinese trees, each with", ts.num_samples, "tips")
300+ :tags: [hide-input]
301+ from myst_nb import glue
302+ ts = tskit.load("data/topologies_sim_stdpopsim.trees")
303+ print(f"Loaded a stdpopsim of {ts.num_trees} African+European+Chinese trees, each with {ts.num_samples} tips")
304+ glue("seq_len", int(ts.sequence_length/1000), display=False)
279305```
280306
281307Although the trees in this tree sequence are very large, counting the embedded topologies is
282- quite doable (for speed we are only simulating 0.02% of chromosome 1 , but calculating the
283- average over an entire chromosome simply takes a little longer)
308+ quite doable (for speed in this demo we are only simulating {glue:} ` seq_len ` kilobases , but
309+ calculating the average over an entire chromosome simply takes a little longer)
284310
285311``` {code-cell}
286312from datetime import datetime
287313names = {"YRI": "African", "CEU": "European", "CHB": "Chinese"}
288314colours = {"YRI": "yellow", "CEU": "green", "CHB": "blue"}
289315
290316population_map = {p.metadata["id"]: p.id for p in ts.populations()}
291- sample_populations = [population_map[name] for name in samples.keys()]
317+ sample_populations = list(sorted({ts.node(u).population for u in ts.samples()}))
292318topology_span = {tree.rank(): 0 for tree in tskit.all_trees(len(sample_populations))}
293319
294320start = datetime.now()
@@ -320,5 +346,9 @@ for rank, weight in topology_span.items():
320346 display(embedded_tree.draw_svg(size=(160, 150), style="".join(styles), node_labels=node_labels, x_label=label))
321347```
322348
349+ Perhaps unsurprisingly, the most common topology is the one that groups the non-African
350+ populations together (although there are many trees of the other two topologies,
351+ mostly reflecting genetic divergence prior to the emergence of humans out of Africa).
352+
323353For an example with real data, see {ref}` sec_popgen_topological `
324354in the {ref}` sec_intro_popgen ` tutorial.
0 commit comments