Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.venv
.vscode
130 changes: 130 additions & 0 deletions figure_one/ancestors_number.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import msprime
import concurrent
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np

sample_size = 10000
r = 1e-9
Ne = 1e6
L = 1e6
filename = 'lineages.csv'

models = {'Hudson':'Hudson',
'SMCK(1)': msprime.SMCK(1),
}

def csv(x):
return ",".join(map(str, x)) + "\n"

def save(name):
plt.tight_layout()
plt.savefig(f"figures/{name}.pdf")

def get_ancestors_after_n_generations(params):
model = params[0]; generations = params[1]

model_class = models[model]
sim = msprime.ancestry._parse_simulate(
sample_size=sample_size,
recombination_rate=r,
Ne=Ne,
length=L,
model=model_class,
end_time=generations,
)
sim.run()
ancestors = len(sim.ancestors)
finish_time = sim.time
return [Ne, L, r, sample_size, str(model), generations, finish_time, ancestors]

def generate_data(gens_list=[1,10,100,1000,10000,100000,100000,1000000, None], reps=25):
tasks = []
for gen in gens_list:
for model in models.keys():
for rep in range(reps):
tasks.append((model, gen))

with open(filename, "w") as f:
f.write(csv(['N', 'L', 'r','num_samples', 'model', 'gens', 'finish_time', 'ancestors']))


with concurrent.futures.ProcessPoolExecutor(max_workers=12) as executor:
# Submit all tasks and get futures
future_results = {executor.submit(get_ancestors_after_n_generations, params): params for params in tasks}

# Process results as they complete
for future in concurrent.futures.as_completed(future_results):
try:
result = future.result()
f.write(csv(result))
f.flush()
except Exception as exc:
params = future_results[future]
print(f"Task {params} generated an exception: {exc}")

def plot():
df = pd.read_csv(filename)

models = sorted(df['model'].unique())



# Define a colormap for sample sizes
cmap = plt.get_cmap("viridis").resampled(len(models))

# Create the plot
plt.figure(figsize=(10, 6))

for i, model in enumerate(models):
subset = df[df['model'] == model]
subset = subset[subset['ancestors'] != 0]
grouped_data = subset.groupby('gens')['ancestors'].mean().reset_index()
grouped_data = grouped_data.sort_values('gens')
xs = list(grouped_data['gens'])
ys = list(grouped_data['ancestors'])

# deal with simulations to the end of time
subset = df[df['gens'].isna()]
grouped_data = (
subset[subset['model'] == model]
.groupby('model')[['ancestors', 'finish_time']]
.mean()
.reset_index()
)
grouped_data = grouped_data.sort_values('finish_time')

xs += list(grouped_data['finish_time'])
ys += list(grouped_data['ancestors'])


plt.plot(xs, ys, linestyle='-', color=cmap(i),
linewidth=2, marker='o', markersize=5, label=f'{model}', alpha=0.5)


plt.xscale('log')
#plt.yscale('log')

# Add labels and title
plt.xlabel('generations')
plt.title(f'Number of Ancestors after n generations. Ne:{Ne}, L:{L}, r:{r}, num_samples:{sample_size}')

# Add legend
plt.legend()

# Add grid for better readability
plt.grid(True, which="both", ls="--", alpha=0.3)

# Save the plot
plt.tight_layout()
save(filename.split('.')[0])
plt.close()
#plt.show()

if __name__ == "__main__":
gens_list= np.logspace(0, 4, num=8, dtype=int).tolist() + \
np.logspace(4, 6, num=6, dtype=int).tolist() +\
np.logspace(6, 7, num=5, dtype=int).tolist() + [None]
generate_data(gens_list=gens_list, reps=25)
plot()
41 changes: 41 additions & 0 deletions figure_one/archive/draw-figure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import msprime
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures

# Parameters
sample_size = 10
sequence_length = 1e6 # total length of genome
recombination_rate = 1e-2
num_replicates = 100

def simulate_tmrcas(model):
ts = msprime.sim_ancestry(
samples=sample_size,
sequence_length=sequence_length,
recombination_rate=recombination_rate,
model=model,
)
# Return the TMRCA for all trees in this replicate
return [tree.time(tree.root) for tree in ts.trees()]

def parallel_tmrcas(model):
tmrcas = []

with concurrent.futures.ProcessPoolExecutor() as executor:
results = executor.map(simulate_tmrcas, [model] * num_replicates)
for tmrcas_per_replicate in results:
tmrcas.extend(tmrcas_per_replicate)
return tmrcas

# Standard model
tmrcas_standard = parallel_tmrcas(msprime.StandardCoalescent()) # standard coalescent
print(f"Standard model finished")
# SMC' model
tmrcas_smck = parallel_tmrcas(msprime.SmcKApproxCoalescent())

# Box plot
plt.boxplot([tmrcas_standard, tmrcas_smck], labels=["Standard", "SMC'"], showfliers=False)
plt.ylabel("TMRCA")
plt.title("Distribution of Coalescence Times")
plt.show()
82 changes: 82 additions & 0 deletions figure_one/archive/generate-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import msprime
import numpy as np

# Parameters
sample_size = 10
sequence_length = 1e5 # total length of genome
recombination_rate = 1e-3
mutation_rate = 0 # not needed here
distance = 1000 # distance between sites (in bp)
num_replicates = 1000 # to average over simulations

def average_coalescence_time_at_distance_d(ts, d):
times = []
positions = np.arange(0, ts.sequence_length - d, d)
for pos in positions:
left_ts = ts.at(pos)
right_ts = ts.at(pos + d)

for tree in left_ts.trees():
if tree in right_ts:
print(f"Tree {tree} is in both left and right trees")

# Ensure both positions are covered by trees
if left_tree is None or right_tree is None:
continue

# Pick pairs of samples
for i in range(0, ts.num_samples, 2):
if i + 1 >= ts.num_samples:
break
n1, n2 = i, i + 1

# Get TMRCA at the two positions
t1 = left_tree.tmrca(n1, n2)
t2 = right_tree.tmrca(n1, n2)

# Average TMRCA for the pair of positions
times.append((t1 + t2) / 2)

return np.mean(times) if times else np.nan

def _average_coalescence_time_at_distance_d(ts, d):
times = []
positions = np.arange(0, ts.sequence_length - d, d)
for pos in positions:
left_tree = ts.at(pos)
right_tree = ts.at(pos + d)

# Ensure both positions are covered by trees
if left_tree is None or right_tree is None:
continue

# Pick pairs of samples
for i in range(0, ts.num_samples, 2):
if i + 1 >= ts.num_samples:
break
n1, n2 = i, i + 1

# Get TMRCA at the two positions
t1 = left_tree.tmrca(n1, n2)
t2 = right_tree.tmrca(n1, n2)

# Average TMRCA for the pair of positions
times.append((t1 + t2) / 2)

return np.mean(times) if times else np.nan

# Run simulation
avg_tmrcas = []
for _ in range(num_replicates):
ts = msprime.sim_ancestry(
samples=sample_size,
sequence_length=sequence_length,
recombination_rate=recombination_rate,
random_seed=None
)
avg_tmrcas.append(average_coalescence_time_at_distance_d(ts, distance))

# Filter out None values
avg_tmrcas = [tmrca for tmrca in avg_tmrcas if tmrca is not None]
overall_average = np.nanmean(avg_tmrcas)
print(f"Average coalescence time for site pairs {distance} bp apart: {overall_average}")
Loading