@@ -2031,6 +2031,10 @@ def haplotypes(self, samples=None, sites=None):
20312031 ``None``, return haplotypes for all sample nodes, otherwise this may be a
20322032 numpy array (or array-like) object (converted to dtype=np.int32).
20332033 :param array sites: A numpy array of sites to use.
2034+
2035+
2036+ :return: An iterator returning sucessive instances of (sample_id, haplotype).
2037+ :rtype: iter(int, numpy.ndarray(dtype=int8))
20342038 """
20352039 if samples is None :
20362040 samples = np .arange (self .num_samples )
@@ -2123,6 +2127,7 @@ class Ancestor:
21232127 time = attr .ib ()
21242128 focal_sites = attr .ib ()
21252129 haplotype = attr .ib ()
2130+ sample_id = attr .ib ()
21262131
21272132 def __eq__ (self , other ):
21282133 return (
@@ -2170,7 +2175,7 @@ class AncestorData(DataContainer):
21702175 """
21712176
21722177 FORMAT_NAME = "tsinfer-ancestor-data"
2173- FORMAT_VERSION = (3 , 0 )
2178+ FORMAT_VERSION = (3 , 1 )
21742179
21752180 def __init__ (self , sample_data , ** kwargs ):
21762181 super ().__init__ (** kwargs )
@@ -2229,6 +2234,13 @@ def __init__(self, sample_data, **kwargs):
22292234 dtype = "array:i1" ,
22302235 compressor = self ._compressor ,
22312236 )
2237+ self .data .create_dataset (
2238+ "ancestors/sample_id" ,
2239+ shape = (0 ,),
2240+ chunks = chunks ,
2241+ compressor = self ._compressor ,
2242+ dtype = np .int32 ,
2243+ )
22322244
22332245 self ._alloc_ancestor_writer ()
22342246
@@ -2244,6 +2256,7 @@ def _alloc_ancestor_writer(self):
22442256 "time" : self .ancestors_time ,
22452257 "focal_sites" : self .ancestors_focal_sites ,
22462258 "haplotype" : self .ancestors_haplotype ,
2259+ "sample_id" : self .ancestors_sample_id ,
22472260 },
22482261 num_threads = self ._num_flush_threads ,
22492262 )
@@ -2265,6 +2278,7 @@ def __str__(self):
22652278 ("ancestors/time" , zarr_summary (self .ancestors_time )),
22662279 ("ancestors/focal_sites" , zarr_summary (self .ancestors_focal_sites )),
22672280 ("ancestors/haplotype" , zarr_summary (self .ancestors_haplotype )),
2281+ ("ancestors/sample_id" , zarr_summary (self .ancestors_sample_id )),
22682282 ]
22692283 return super ().__str__ () + self ._format_str (values )
22702284
@@ -2289,6 +2303,9 @@ def data_equal(self, other):
22892303 self .ancestors_focal_sites [:], other .ancestors_focal_sites [:]
22902304 )
22912305 and np_obj_equal (self .ancestors_haplotype [:], other .ancestors_haplotype [:])
2306+ and np .array_equal (
2307+ self .ancestors_sample_id [:], other .ancestors_sample_id [:]
2308+ )
22922309 )
22932310
22942311 @property
@@ -2340,6 +2357,10 @@ def ancestors_focal_sites(self):
23402357 def ancestors_haplotype (self ):
23412358 return self .data ["ancestors/haplotype" ]
23422359
2360+ @property
2361+ def ancestors_sample_id (self ):
2362+ return self .data ["ancestors/sample_id" ]
2363+
23432364 @property
23442365 def ancestors_length (self ):
23452366 """
@@ -2358,6 +2379,7 @@ def insert_proxy_samples(
23582379 * ,
23592380 sample_ids = None ,
23602381 epsilon = None ,
2382+ map_ancestors = False ,
23612383 allow_mutation = False ,
23622384 require_same_sample_data = True ,
23632385 ** kwargs ,
@@ -2370,7 +2392,8 @@ def insert_proxy_samples(
23702392
23712393 A *proxy sample ancestor* is an ancestor based upon a known sample. At
23722394 sites used in the full inference process, the haplotype of this ancestor
2373- is identical to that of the sample on which it is based. The time of the
2395+ is identical to that of the sample on which it is based, and the
2396+ The time of the
23742397 ancestor is taken to be a fraction ``epsilon`` older than the sample on
23752398 which it is based.
23762399
@@ -2384,11 +2407,11 @@ def insert_proxy_samples(
23842407
23852408 .. note::
23862409
2387- The proxy sample ancestors inserted here will correspond to extra nodes
2388- in the inferred tree sequence. At sites which are not used in the full
2410+ The proxy sample ancestors inserted here will end up as extra nodes
2411+ in the inferred tree sequence, but at sites which are not used in the full
23892412 inference process (e.g. sites unique to a single historical sample),
2390- these proxy sample ancestor nodes may have a different genotype from
2391- their corresponding sample.
2413+ it is possible for these proxy sample ancestor nodes to have a different
2414+ genotype from their corresponding sample.
23922415
23932416 :param SampleData sample_data: The :class:`.SampleData` instance
23942417 from which to select the samples used to create extra ancestors.
@@ -2423,7 +2446,8 @@ def insert_proxy_samples(
24232446 to ensure that the encoding of alleles in ``sample_data`` matches the
24242447 encoding in the current :class:`AncestorData` instance (i.e. that in the
24252448 original :class:`.SampleData` instance on which the current ancestors
2426- are based).
2449+ are based). Note that in this case, the sample_id is not recorded in the
2450+ returned object.
24272451 :param \\ **kwargs: Further arguments passed to the constructor when creating
24282452 the new :class:`AncestorData` instance which will be returned.
24292453
@@ -2521,7 +2545,11 @@ def insert_proxy_samples(
25212545 time = proxy_time ,
25222546 focal_sites = [],
25232547 haplotype = haplotype ,
2548+ sample_id = sample_id
2549+ if sample_data .uuid == self .sample_data_uuid
2550+ else tskit .NULL ,
25242551 )
2552+
25252553 # Add any ancestors remaining in the current instance
25262554 while ancestor is not None :
25272555 other .add_ancestor (** attr .asdict (ancestor , filter = exclude_id ))
@@ -2603,7 +2631,6 @@ def truncate_ancestors(
26032631 start = self .ancestors_start [:]
26042632 end = self .ancestors_end [:]
26052633 time = self .ancestors_time [:]
2606- focal_sites = self .ancestors_focal_sites [:]
26072634 haplotypes = self .ancestors_haplotype [:]
26082635 if upper_time_bound > np .max (time ) or lower_time_bound > np .max (time ):
26092636 raise ValueError ("Time bounds cannot be greater than older ancestor" )
@@ -2641,16 +2668,12 @@ def truncate_ancestors(
26412668 )
26422669 start [anc .id ] = insert_pos_start
26432670 end [anc .id ] = insert_pos_end
2644- time [anc .id ] = anc .time
2645- focal_sites [anc .id ] = anc .focal_sites
26462671 haplotypes [anc .id ] = anc .haplotype [
26472672 insert_pos_start - anc .start : insert_pos_end - anc .start
26482673 ]
26492674 # TODO - record truncation in ancestors' metadata when supported
26502675 truncated .ancestors_start [:] = start
26512676 truncated .ancestors_end [:] = end
2652- truncated .ancestors_time [:] = time
2653- truncated .ancestors_focal_sites [:] = focal_sites
26542677 truncated .ancestors_haplotype [:] = haplotypes
26552678 truncated .record_provenance (command = "truncate_ancestors" )
26562679 truncated .finalise ()
@@ -2671,6 +2694,12 @@ def set_inference_sites(self, site_ids):
26712694 sites in the sample data file, and the IDs must be in increasing order.
26722695
26732696 This must be called before the first call to :meth:`.add_ancestor`.
2697+
2698+ .. note::
2699+ To obtain a list of which sites in a sample data or a tree sequence have
2700+ been placed into the ancestors file for use in inference, you can apply
2701+ :func:`numpy.isin` to the list of positions, e.g.
2702+ ``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
26742703 """
26752704 self ._check_build_mode ()
26762705 position = self .sample_data .sites_position [:][site_ids ]
@@ -2679,12 +2708,18 @@ def set_inference_sites(self, site_ids):
26792708 array [:] = position
26802709 self ._num_alleles = self .sample_data .num_alleles (site_ids )
26812710
2682- def add_ancestor (self , start , end , time , focal_sites , haplotype ):
2711+ def add_ancestor (
2712+ self , start , end , time , focal_sites , haplotype , sample_id = tskit .NULL
2713+ ):
26832714 """
26842715 Adds an ancestor with the specified haplotype, with ancestral material over the
26852716 interval [start:end], that is associated with the specified timepoint and has new
2686- mutations at the specified list of focal sites. Ancestors should be added in time
2687- order, with the oldest first. The id of the added ancestor is returned.
2717+ mutations at the specified list of focal sites. If this ancestor is based on a
2718+ specific sample from the associated sample_data file (i.e. a historical sample)
2719+ then the ``sample_id`` in the sample data file can also be passed as a parameter.
2720+
2721+ The Ancestors should be added in time order, with the oldest first. The id of
2722+ the added ancestor is returned.
26882723 """
26892724 self ._check_build_mode ()
26902725 haplotype = tskit .util .safe_np_int_cast (haplotype , dtype = np .int8 , copy = True )
@@ -2714,6 +2749,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
27142749 time = time ,
27152750 focal_sites = focal_sites ,
27162751 haplotype = haplotype ,
2752+ sample_id = sample_id ,
27172753 )
27182754
27192755 def finalise (self ):
@@ -2739,6 +2775,7 @@ def ancestor(self, id_):
27392775 time = self .ancestors_time [id_ ],
27402776 focal_sites = self .ancestors_focal_sites [id_ ],
27412777 haplotype = self .ancestors_haplotype [id_ ],
2778+ sample_id = self .ancestors_sample_id [id_ ],
27422779 )
27432780
27442781 def ancestors (self ):
@@ -2750,6 +2787,7 @@ def ancestors(self):
27502787 end = self .ancestors_end [:]
27512788 time = self .ancestors_time [:]
27522789 focal_sites = self .ancestors_focal_sites [:]
2790+ sample_id = self .ancestors_sample_id [:]
27532791 for j , h in enumerate (chunk_iterator (self .ancestors_haplotype )):
27542792 yield Ancestor (
27552793 id = j ,
@@ -2758,6 +2796,7 @@ def ancestors(self):
27582796 time = time [j ],
27592797 focal_sites = focal_sites [j ],
27602798 haplotype = h ,
2799+ sample_id = sample_id [j ],
27612800 )
27622801
27632802
0 commit comments