In [1]:
import numpy as np
from sympy import *
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from scipy.stats import binned_statistic, describe

import sys
sys.path.append('../../src/')
from aDNA_coal_sim import *
from plot_utils import *

%matplotlib inline
In [2]:
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable

import os
main_figdir = '../../plots/jointLD_stats/'
supp_figdir = '../../plots/supp_figs/jointLD_stats/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)

Deriving the joint LD statistics

In [3]:
# Define variables in sympy... 
rho,t,eta,gamma = symbols('rho t \eta \gamma')

gamma = rho*(1  - exp(-t *(rho/2 + 1)))/(rho + 2)
eta = 2*(1  - exp(-t *(rho/2 + 1)))/(rho + 2)

eT_AT_B_200 = (rho**2 + 14*rho + 36)/(rho**2 + 13*rho + 18)
eT_AT_B_111 = (rho**2 + 13*rho + 24)/(rho**2 + 13*rho + 18)
eT_AT_B_022 = (rho**2 + 13*rho + 22)/(rho**2 + 13*rho + 18)
In [4]:
# Defining the staggered statistics (from the note)
e_TATB_200_anc = (1 - gamma)*eT_AT_B_200 + gamma*eT_AT_B_111
e_TATB_022_anc = (1 - eta).simplify() * eT_AT_B_022 + eta*eT_AT_B_111
e_TATB_111_mod_coupled = (1-gamma)*eT_AT_B_111 + gamma*eT_AT_B_022
e_TATB_111_mod_uncoupled = (1-eta)*eT_AT_B_111 + eta*eT_AT_B_200

# Simplifying the statistics
e_TATB_200_anc = e_TATB_200_anc.factor().simplify()
e_TATB_022_anc = e_TATB_022_anc.factor().simplify()
e_TATB_111_mod_coupled = e_TATB_111_mod_coupled.factor().simplify()
e_TATB_111_mod_uncoupled = e_TATB_111_mod_uncoupled.factor().simplify()


e_TATB_022_anc
Out[4]:
$\displaystyle \frac{\left(\rho^{3} e^{\frac{t \left(\rho + 2\right)}{2}} + 15 \rho^{2} e^{\frac{t \left(\rho + 2\right)}{2}} + 48 \rho e^{\frac{t \left(\rho + 2\right)}{2}} + 48 e^{\frac{t \left(\rho + 2\right)}{2}} - 4\right) e^{- \frac{t \left(\rho + 2\right)}{2}}}{\left(\rho + 2\right) \left(\rho^{2} + 13 \rho + 18\right)}$
In [5]:
eD0DT_numerator = (e_TATB_200_anc - e_TATB_111_mod_coupled - e_TATB_111_mod_uncoupled + e_TATB_022_anc)

e_D0DT_numerator = eD0DT_numerator.factor().simplify()
e_D0DT_numerator
Out[5]:
$\displaystyle \frac{\left(\rho + 10\right) e^{- \frac{t \left(\rho + 2\right)}{2}}}{\rho^{2} + 13 \rho + 18}$
In [6]:
e_D0Dt_norm = e_D0DT_numerator / e_TATB_022_anc
e_D0Dt_norm # Note this does not actually go "negative"
Out[6]:
$\displaystyle \frac{\left(\rho + 2\right) \left(\rho + 10\right)}{\rho^{3} e^{\frac{t \left(\rho + 2\right)}{2}} + 15 \rho^{2} e^{\frac{t \left(\rho + 2\right)}{2}} + 48 \rho e^{\frac{t \left(\rho + 2\right)}{2}} + 48 e^{\frac{t \left(\rho + 2\right)}{2}} - 4}$
In [7]:
def eD0Dt_norm_eval(r,ta):
    """Function just to evaluate the expression..."""
    return(e_D0Dt_norm.evalf(subs={rho: r, t:ta}))

fig, ax = plt.subplots(1,1,figsize=(4,3))
rhos = np.logspace(-3, 3, 50)
tas = [0, 1e-2, 1e-1]
for ta in tqdm(tas):
    cur_d0dt_norm = np.array([eD0Dt_norm_eval(r,ta) for r in rhos])
    ax.plot(rhos, cur_d0dt_norm, lw=2, label=r'$t_a = %0.2f$' % ta)

ax.legend(fontsize=10)
ax.set_xscale('log')
ax.set_xlabel(r'$\rho$', fontsize=14)
ax.set_ylabel(r'$\frac{\mathbb{E}[D^{(0)}D^{(t)}]}{\mathbb{E}[p^{(0)}_A(1 - p^{(t)}_A)p_B^{(0)}(1 - p_B^{(t)}]}$', fontsize=14)
debox(ax);
plt.savefig(main_figdir + 'eD0Dt_theory.pdf', dpi=300, bbox_inches='tight')
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.40it/s]

Corroborating theory w/ simulation results

In [8]:
ld_raw_test = pd.read_csv('../../results/ld_stats_raw/ld_stats_time_sep_raw.csv.gz')
ld_raw_test['ed0dt'] = ld_raw_test['d0dt']/ld_raw_test['pApB']
ld_raw_test.head()
Out[8]:
d0dt std_d0dt pApB std_pApB rec_dist ta scenario maf seed ed0dt
0 0.001312 0.005836 0.005246 0.011681 0.000050 0 SerialConstant 0.0 11 0.250146
1 0.000558 0.002567 0.004923 0.010981 0.000149 0 SerialConstant 0.0 11 0.113276
2 0.000378 0.001777 0.004866 0.010911 0.000249 0 SerialConstant 0.0 11 0.077621
3 0.000297 0.001414 0.004862 0.010842 0.000348 0 SerialConstant 0.0 11 0.061098
4 0.000229 0.001069 0.004790 0.010660 0.000448 0 SerialConstant 0.0 11 0.047914
In [9]:
def eD0Dt_norm_test(r,t):
    """ Revamped version of that function ... """
    return(((r+2)*(r+10))/((r**3 + 15*r**2 + 48*r + 48)*np.exp(t*(r+2)/2) - 4.0))

fig, ax = plt.subplots(1,1,figsize=(4,4))
for x in tqdm([0,100, 1000]):
    ld_raw_df_filt = ld_raw_test[(ld_raw_test['ta'] == x) & (ld_raw_test.maf == 0.00)]
    ax.scatter(ld_raw_df_filt.rec_dist*1e4, ld_raw_df_filt.ed0dt, s=5)
    z = x/(1e4)
    # Dealing with some rescaling between sims vs. theory (factors of two here ... )
    cur_d0dt_norm = np.array([eD0Dt_norm_test(y,z/2) for y in np.unique(ld_raw_df_filt.rec_dist*1e4*4)])
    ax.plot(np.unique(ld_raw_df_filt.rec_dist*1e4), cur_d0dt_norm, lw=2, label=r'$t_a = %d$' % int(z*1e4))

ax.legend()
ax.set_xscale('log')
ax.set_xlabel(r'$\rho$', fontsize=14)
ax.set_ylabel(r'$\sigma^2_t$', fontsize=14)
debox(ax);
ax.set_xlim(0.9, 1.5e2)
ax.set_ylim(-0.01,0.175)
plt.savefig(main_figdir + 'eD0Dt_theory_sims.pdf', dpi=300, bbox_inches='tight')
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 70.96it/s]
In [ ]: