This notebook shows the recreation of plots showing the theoretical impact of time-stratified sampling
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.stats import linregress
import pandas as pd
import sys
sys.path.append('../../src/')
from coal_cov import *
from seg_sites_covar import CorrSegSites
from draw_demography import *
from plot_utils import *
%matplotlib inline
%load_ext autoreload
%autoreload 2
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable
import os
main_figdir = '../../plots/two_locus_stats/'
supp_figdir = '../../plots/supp_figs/two_locus_stats/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)
def estNe(t2s):
"""Given modern estimate of t2s calculate Ne."""
return(np.mean(t2s)/2./2.)
def plot_bot_demo(ax, N0=1e4, T_bot=100, b=0.1, **kwargs):
"""Plot bottleneck demography."""
xs = np.arange(0, int(10*T_bot))
ns = np.repeat(N0, xs.size)
ns[T_bot:] = N0*b
ax.plot(xs,ns, **kwargs)
def corr_bl_2_corr_pi_kb(ta, corr_bl, theta=0.4, Ne=1e4):
"""Compute the correlation in pairwise diversity using the correlation in branch length."""
coeff = 1. / (1. + (2 + (ta /2./Ne)/theta))
corr_pi = coeff * corr_bl
return(corr_pi, coeff)
def linear_appx_exp_slope(rho=1e-4):
"""Compute the expected slope """
return(-rho/2. * (rho + 12.)/(rho**2 + 13*rho + 18.))
def intercept_appx(rho):
"""Approximatethe intecept in the constant sized case """
return((rho + 18.)/(rho**2 + 13.*rho+ 18.))
# Plotting just the first two plots
fig, axs = plt.subplots(1,1, figsize=(5,3), tight_layout=True)
# Plot Empirical Simulations + overlayed theory
colors = {0.0: 'green', 0.1:'blue', 1.0 : 'orange'}
tas = [0.0, 0.10,1.0]
#Paired correlation estimates
corr_s1_s2_df = pd.read_csv('../../results/two_loci/theory_mut_corr.csv')
for t in tqdm(tas):
df = corr_s1_s2_df[(corr_s1_s2_df.ta == t) & (corr_s1_s2_df.nreps == 5000)]
rec_rate_mean = df.rec_rate
corr_s1_s2 = df.corr_piApiB
se_r = df.se_corr_piApiB
axs.errorbar(rec_rate_mean/2, corr_s1_s2, yerr=2*se_r,
capsize=2, ls='none', alpha=0.25, color=colors[t])
axs.plot(np.sort(rec_rate_mean)/2, TwoLocusTheoryConstant._corrSASB(4*np.sort(rec_rate_mean), ta=t/2, theta=4*0.4),
zorder=10, color=colors[t], lw=1, label=r'$t_a = %d$' % int(t*1e4))
axs.legend(fontsize=10, labelspacing=-0.0)
axs.set_xscale('log')
axs.set_xlabel(r'$\rho$', fontsize=14)
axs.legend(fontsize=8, labelspacing=-0.0)
axs.set_ylabel(r'$Corr(\pi_A, \pi_B)$', fontsize=14)
debox(axs);
# axs.set_xlim(1e-5,1e-1)
plt.tight_layout()
plt.savefig(main_figdir + 'corr_seg_sites_theory_sims_fixed.pdf', dpi=300, bbox_inches='tight')
## ------ Figure of correlation in branch length in various demographies ------- ##
fig, axs = plt.subplots(2,3, figsize=(7, 3), sharex='col')
# Part 1: Constant population size
ax3 = axs[0,0]; debox(ax3)
ax4 = axs[1,0]; debox(ax4)
# Loading in the metadata for correlation in branch length
corr_bl_df = pd.read_csv('../../results/two_loci/multi_scenario_branch_len.csv')
corr_bl_df = corr_bl_df.dropna()
# compute the asymptotic standard errors (Note - these are similar to bootstrapped values)
corr_bl_df['se_r'] = np.sqrt((1. - corr_bl_df.corr_bl.values ** 2) / (corr_bl_df.Nreps.values - 2.))
# Plotting constant tests
Ns = [5000,10000,20000]
colors = ['blue','orange', 'green']
mu = 1e-8
# This is what we have simulated ...
tas = np.arange(0,501,50)
i = 0
for n in tqdm(Ns):
cur_df = corr_bl_df[(corr_bl_df.Ne == n) & (corr_bl_df.scenario == 'SerialConstant')]
# use the pairwise times to approximate Ne
mean_et2 = np.mean(cur_df.exp_bl)
Ne_hat = mean_et2/2./2.
print("Ne (Constant-Size): %0.2f" % Ne_hat)
ax3.plot(tas, np.repeat(n,tas.size), lw=3, color=colors[i])
# Note - this is a fixed theta here ...
theta = 4 * cur_df.Ne.values[0] * 1e3 * 1e-8
rho = 2*Ne_hat*1e-4
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl)
ax4.errorbar(cur_df.ta, cur_df.corr_bl, yerr=2*cur_df.se_r, capsize=2, color=colors[i])
i += 1
ax5 = axs[0,1]; debox(ax5)
# Plot both demographies
demo_model1_file = {'Tennessen (2012)': '../../data/demo_models/tennessen_european.txt',
'IBDNe UK10K (2015)': '../../data/demo_models/uk10k.IBDNe.txt'}
i = 0
for x in demo_model1_file:
_,demo = read_demography(demo_model1_file[x])
t,nt = generate_demography(demo)
ax5.plot(t,nt,lw=3,label=x, color=colors[i])
i += 1
ax5.set_yscale('log')
ax5.set_ylim(1e1,1e7)
ax5.set_xlim(0, 500)
ax5.legend(fontsize=8, labelspacing=-0.0)
ax6 = axs[1,1]; debox(ax6)
# Plot the Tennessen et al model...
i = 0
cur_df = corr_bl_df[corr_bl_df.scenario == 'Tennessen']
mean_et2 = np.mean(cur_df.exp_bl)
Ne_hat = mean_et2/2./2.
print('Ne (Tennessen Model) : %0.2f' % Ne_hat)
theta = 4 * Ne_hat * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl, Ne=Ne_hat)
ax6.errorbar(cur_df.ta, cur_df.corr_bl, yerr=2*cur_df.se_r, capsize=2, color=colors[i])
# Plot the UK10K results
i = 1
cur_df = corr_bl_df[corr_bl_df.scenario == 'IBDNeUK10K']
mean_et2 = np.mean(cur_df.exp_bl)
Ne_hat = mean_et2/2./2.
print('Ne (Browning Model): %0.2f' % Ne_hat)
theta = 4 * Ne_hat * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl, Ne=Ne_hat)
ax6.errorbar(cur_df.ta, cur_df.corr_bl, yerr=2*cur_df.se_r, capsize=2, color=colors[i])
ax7 = axs[0,2]; debox(ax7)
tbot = [100,200,400]
i = 0
for tb in tbot:
plot_bot_demo(ax7, N0=1e6, T_bot=tb, b=0.0001, lw=2, color=colors[i])
i += 1
ax7.set_yscale('log')
ax7.set_ylim(1e1,1e7)
ax7.set_xlim(0,500)
ax8 = axs[1,2]; debox(ax8)
i = 0
for x in tqdm([7,8,9]):
cur_df = corr_bl_df[corr_bl_df.scenario == 'InstantGrowth%d' % x]
mean_et2 = np.mean(cur_df.exp_bl)
Ne_hat = mean_et2/2./2.
print("Ne (Instant Growth %d): %0.2f" % (x, Ne_hat))
theta = 4 * Ne_hat * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl, Ne=Ne_hat)
ax8.errorbar(cur_df.ta, cur_df.corr_bl, yerr=2*cur_df.se_r, capsize=2, color=colors[i])
i += 1
# Labeling all of the axes
for (axi,lbl) in zip([axs[0,0], axs[0,1], axs[0,2]],['A','B','C']):
axi.text(-0.06, 1.25, lbl, fontsize=14,
fontweight='bold', va='top', ha='right', transform=axi.transAxes);
axs[0,0].set_ylabel(r'$N_t$', fontsize=14)
axs[1,0].set_ylabel(r'$Corr(L_A,L_B)$', fontsize=14)
plt.tight_layout()
fig.text(0.45, -0.025, r'Sample age (generations)', fontsize=14)
plt.savefig(main_figdir + 'full_demography.corr_branch_len.pdf', dpi=300, bbox_inches='tight')
# Checking the linear approximation for constant-size
corr_bl_df = pd.read_csv('../../results/two_loci/multi_scenario_branch_len.csv')
corr_bl_df = corr_bl_df.dropna()
# Compute the asymptotic standard errors (Note - these are very similar to bootstrapped values)
corr_bl_df['se_r'] = np.sqrt((1. - corr_bl_df.corr_bl.values ** 2) / (corr_bl_df.Nreps.values - 2.))
# Plotting constant tests
Ns = [5000,10000,20000]
colors = ['blue','orange', 'green']
mu = 1e-8
# This is what we have simulated ...
tas = np.arange(0,501,50)
i = 0
for n in Ns:
print("Ne = %d" % n)
cur_df = corr_bl_df[(corr_bl_df.Ne == n) & (corr_bl_df.scenario == 'SerialConstant')]
# use the pairwise times to approximate Ne
mean_et2 = np.mean(cur_df.exp_bl)
Ne_hat = mean_et2/2./2.
# Note - this is a fixed theta here ...
rho = 4*(2*cur_df.Ne.values[0])*1e-4
print("Predicted Slope: %0.4f; Predicted Intercept: %0.4f"% (linear_appx_exp_slope(rho), intercept_appx(rho)))
slope, intercept, _, pval, stderr = linregress(cur_df.ta/(2.*cur_df.Ne.values[0]), cur_df.corr_bl)
print("Estimated Slope: %0.4f; Estimated Intercept: %0.4f; p-value: %0.6f" % (slope, intercept, pval))
# Verify that the product of ta*rho << 1
print("ta * rho: %0.4f" % (np.max(cur_df.ta)/(2.*n) * rho))
print("") # empty line for style, becuase ...
# Checking the linear approximation for constant-size
corr_bl_df = pd.read_csv('../../results/two_loci/multi_scenario_branch_len.csv')
corr_bl_df = corr_bl_df.dropna()
# Compute the asymptotic standard errors (Note - these are very similar to bootstrapped values)
corr_bl_df['se_r'] = np.sqrt((1. - corr_bl_df.corr_bl.values ** 2) / (corr_bl_df.Nreps.values - 2.))
# Plotting constant tests
Ns = [5000,10000,20000]
colors = ['blue','orange', 'green']
mu = 1e-8
# This is what we have simulated ...
tas = np.arange(0,501,50)
i = 0
for n in Ns:
print("Ne = %d" % n)
cur_df = corr_bl_df[(corr_bl_df.Ne == n) & (corr_bl_df.scenario == 'SerialConstant')]
# use the pairwise times to approximate Ne
mean_et2 = np.mean(cur_df.exp_bl)
Ne_hat = mean_et2/2./2.
# Note - this is a fixed theta here ...
rho = 4*(2*cur_df.Ne.values[0])*1e-4
theta = 4 * cur_df.Ne.values[0] * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl)
# print(coeff)
print("Predicted Slope: %0.4f; Predicted Intercept: %0.4f"% (linear_appx_exp_slope(rho), intercept_appx(rho)))
# NOTE: this doesn't make sense to compute becuase its not an approximation to the correlation in branch length
slope, intercept, _, pval, stderr = linregress(cur_df.ta/(2.*cur_df.Ne.values[0]), corr_pi)
print("Estimated Slope: %0.4f; Estimated Intercept: %0.4f; p-value: %0.6f" % (slope, intercept, pval))
# Verify that the product of ta*rho << 1
print("ta * rho: %0.4f" % (np.max(cur_df.ta)/(2.*n) * rho))
print("")
## ------ Testing figure ------- ##
fig, axs = plt.subplots(2,3, figsize=(7, 3), sharex='col')
# Part 1: Constant population size
ax3 = axs[0,0]; debox(ax3)
ax4 = axs[1,0]; debox(ax4)
# Loading in the metadata for correlation in branch length
corr_bl_df = pd.read_csv('../../results/two_loci/multi_scenario_branch_len.csv')
corr_bl_df = corr_bl_df.dropna()
# compute the asymptotic standard errors (Note - these are similar to bootstrapped values)
corr_bl_df['se_r'] = np.sqrt((1. - corr_bl_df.corr_bl.values ** 2) / (corr_bl_df.Nreps.values - 2.))
# Plotting constant tests
Ns = [5000,10000,20000]
colors = ['blue','orange', 'green']
mu = 1e-8
# This is what we have simulated ...
tas = np.arange(0,501,50)
i = 0
for n in tqdm(Ns):
print("Ne = %d Constant Model" % n)
cur_df = corr_bl_df[(corr_bl_df.Ne == n) & (corr_bl_df.scenario == 'SerialConstant')]
# use the pairwise times to approximate Ne
mean_et2 = np.mean(cur_df.exp_bl)
ax3.plot(tas, np.repeat(n,tas.size), lw=3, color=colors[i])
# Note - this is a fixed theta here ...
theta = 4 * cur_df.Ne.values[0] * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl)
slope, intercept, _, pval, stderr = linregress(cur_df.ta/(2.*cur_df.Ne.values[0]), corr_pi)
print("Estimated Slope: %0.4f; Estimated Intercept: %0.4f; p-value: %0.6f" % (slope, intercept, pval))
# Note - we linearly scale the standard error here ...
ax4.errorbar(cur_df.ta, corr_pi, yerr=2*cur_df.se_r*coeff, capsize=2, color=colors[i])
i += 1
ax5 = axs[0,1]; debox(ax5)
# Plot both demographies
demo_model1_file = {'Tennessen et al (2012)': '../../data/demo_models/tennessen_european.txt',
'IBDNe UK10K (2015)': '../../data/demo_models/uk10k.IBDNe.txt'}
i = 0
for x in demo_model1_file:
_,demo = read_demography(demo_model1_file[x])
t,nt = generate_demography(demo)
ax5.plot(t,nt,lw=3,label=x, color=colors[i])
i += 1
ax5.set_yscale('log')
ax5.set_ylim(1e1,1e7)
ax5.set_xlim(0, 500)
ax5.legend(fontsize=8, labelspacing=-0.0)
ax6 = axs[1,1]; debox(ax6)
# Plot the Tennessen et al model...
print("Tennessen et al 2012 Model")
i = 0
cur_df = corr_bl_df[corr_bl_df.scenario == 'Tennessen']
mean_et2 = np.mean(cur_df.exp_bl)
# Ne_hat = mean_et2/2./2.
Ne_hat = cur_df.Ne_est.values[0]
theta = 4 * Ne_hat * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl)
slope, intercept, _, pval, stderr = linregress(cur_df.ta/(2.*cur_df.Ne.values[0]), corr_pi)
print("Estimated Slope: %0.4f; Estimated Intercept: %0.4f; p-value: %0.6f" % (slope, intercept, pval))
ax6.errorbar(cur_df.ta, corr_pi, yerr=2*cur_df.se_r*coeff, capsize=2, color=colors[i])
# Plot the UK10K results
print("UK10K IBDNe Model")
i = 1
cur_df = corr_bl_df[corr_bl_df.scenario == 'IBDNeUK10K']
mean_et2 = np.mean(cur_df.exp_bl)
# Ne_hat = mean_et2/2./2.
Ne_hat = cur_df.Ne_est.values[0]
# Ne_hat = 1e4
theta = 4 * Ne_hat * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl)
slope, intercept, _, pval, stderr = linregress(cur_df.ta/(2.*cur_df.Ne.values[0]), corr_pi)
print("Estimated Slope: %0.4f; Estimated Intercept: %0.4f; p-value: %0.6f" % (slope, intercept, pval))
ax6.errorbar(cur_df.ta, corr_pi, yerr=2*cur_df.se_r*coeff, capsize=2, color=colors[i])
ax7 = axs[0,2]; debox(ax7)
tbot = [100,200,400]
i = 0
for tb in tbot:
plot_bot_demo(ax7, N0=1e6, T_bot=tb, b=0.0001, lw=2, color=colors[i])
i += 1
ax7.set_yscale('log')
ax7.set_ylim(1e1,1e7)
ax7.set_xlim(0,500)
ax8 = axs[1,2]; debox(ax8)
i = 0
for x in tqdm([7,8,9]):
print("InstantGrowth%d Model" % x)
cur_df = corr_bl_df[corr_bl_df.scenario == 'InstantGrowth%d' % x]
mean_et2 = np.mean(cur_df.exp_bl)
# Ne_hat = mean_et2/2./2.
Ne_hat = cur_df.Ne_est.values[0]
theta = 4 * Ne_hat * 1e3 * 1e-8
corr_pi, coeff = corr_bl_2_corr_pi_kb(cur_df.ta, theta=theta, corr_bl=cur_df.corr_bl)
slope, intercept, _, pval, stderr = linregress(cur_df.ta/(2.*cur_df.Ne.values[0]), corr_pi)
print("Estimated Slope: %0.4f; Estimated Intercept: %0.4f; p-value: %0.6f" % (slope, intercept, pval))
ax8.errorbar(cur_df.ta, corr_pi, yerr=2*cur_df.se_r*coeff, capsize=2, color=colors[i])
i += 1
# Labeling all of the axes
for (axi,lbl) in zip([axs[0,0], axs[0,1], axs[0,2]],['A','B','C']):
axi.text(-0.06, 1.25, lbl, fontsize=14,
fontweight='bold', va='top', ha='right', transform=axi.transAxes);
# Setting the y-limit to make things more comparable on bottom row
for x in axs[1,:]:
x.set_ylim(0,0.35);
axs[0,0].set_ylabel(r'$N_t$', fontsize=14)
axs[1,0].set_ylabel(r'$Corr(\pi_A, \pi_B)$', fontsize=14)
plt.tight_layout()
fig.text(0.55, -0.025, r'Sample age (generations)', ha='center', fontsize=14)
plt.savefig(main_figdir + 'full_demography.corr_piA_piB.pdf', dpi=300, bbox_inches='tight')
test_df = pd.read_csv('../../results/corr_seg_sites/monte_carlo_sims_sA_sB_demography.csv')
test_df.head()
filt_df1 = test_df[(test_df.scenario == 'SerialConstant') & (test_df.ta == 0) & (test_df.seed == 42)]
filt_df2 = test_df[(test_df.scenario == 'TennessenEuropean') & (test_df.ta == 0) & (test_df.seed == 42)]
fig, ax = plt.subplots(1,2,figsize=(5,2.5), sharex=True, sharey=True)
ax[0].errorbar(filt_df1.rec_rate_mean, filt_df1.corr_s1_s2,
yerr=2*filt_df1.se_corr, capsize=2, lw=2, label=r'Constant Size')
ax[0].errorbar(filt_df2.rec_rate_mean, filt_df2.corr_s1_s2,
yerr=2*filt_df2.se_corr, capsize=2, lw=2, label=r'Tennessen et al. (2012)')
ax[0].set_xscale('log')
ax[0].set_title(r'$t_a = %d$' % 0, fontsize=14)
# Making the plot with new timepoints
filt_df1 = test_df[(test_df.scenario == 'SerialConstant') & (test_df.ta == 1000) & (test_df.seed == 42)]
filt_df2 = test_df[(test_df.scenario == 'TennessenEuropean') & (test_df.ta == 1000) & (test_df.seed == 42)]
ax[1].errorbar(filt_df1.rec_rate_mean, filt_df1.corr_s1_s2,
yerr=2*filt_df1.se_corr, capsize=2, lw=2, label=r'Constant Size')
ax[1].errorbar(filt_df2.rec_rate_mean, filt_df2.corr_s1_s2,
yerr=2*filt_df2.se_corr, capsize=2, lw=2, label=r'Tennessen et al. (2012)')
ax[1].set_title(r'$t_a = %d$' % 1000, fontsize=14)
# Plotting aesthetics
debox(ax[0]); debox(ax[1]);
ax[1].legend(fontsize=8)
# Setting labels
ax[0].set_ylabel(r'$Corr(\pi_A,\pi_B)$',fontsize=14);
fig.text(0.45, -0.025, r'Morgans', fontsize=14);
plt.savefig(supp_figdir + 'demo_test_v_constant.corr_piA_piB.pdf', dpi=300, bbox_inches='tight')