In [1]:
# Exploring the covariance in segregating sites
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Import the relevant functions / custom classes
import sys
sys.path.insert(0, '../../src/')
from seg_sites_covar import *
from coal_cov import TwoLocusSimulation, TwoLocusTheoryConstant
from fit_corr_segsites import *
from plot_utils import *

import os
from tqdm import tqdm
from glob import glob

%load_ext autoreload
%autoreload 2
In [2]:
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable

main_figdir = '../../plots/two_locus_stats/corrSASB_realdata/'
supp_figdir = '../../plots/supp_figs/two_locus_stats/corrSASB_realdata/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)

%matplotlib inline

Monte-Carlo Estimation for Haploid Samples

In [3]:
# Reading in the data
rec_rate_corr_piA_piB = pd.read_csv('../../results/corr_seg_sites/monte_carlo_est_LBK_UstIshim_modern.csv')
rec_rate_corr_piA_piB.head()
Out[3]:
ANC_ID MOD_ID rec_rate_mean rec_rate_se corr_piA_piB se_corr_piA_piB
0 LBK NA18486 0.000011 4.968652e-07 0.249415 0.006133
1 LBK NA18486 0.000013 5.821711e-07 0.244006 0.006105
2 LBK NA18486 0.000015 6.855675e-07 0.194202 0.006082
3 LBK NA18486 0.000017 7.973908e-07 0.197501 0.005991
4 LBK NA18486 0.000020 9.310998e-07 0.179601 0.005952
In [4]:
# Reading in a lot of VCF
pop_1kg_df = pd.read_table('../../data/raw_data/1kg_panel/integrated_call_samples_v3.20130502.ALL.panel')

test_yri_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'YRI']['sample'].values[:3]
test_chb_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CHB']['sample'].values[:3]
test_ceu_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CEU']['sample'].values[:3]
test_gih_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'GIH']['sample'].values[:3]
test_indivs_1kg = np.hstack([test_yri_1kg, test_chb_1kg, test_ceu_1kg, test_gih_1kg])
In [5]:
mod_1kg = 'NA12383'
anc_ids = ['LBK', 'UstIshim']
In [6]:
fig, ax = plt.subplots(1,1,figsize=(5,3))
axins = ax.inset_axes([0.5, 0.6, 0.47, 0.4])

# Make theory blue and orange ... 
colors = ['blue', 'orange']
idx = 0
for x in anc_ids:
    cur_df = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB['ANC_ID'] == x) & (rec_rate_corr_piA_piB['MOD_ID'] == mod_1kg)]
    ax.errorbar(rand_jitter(cur_df.rec_rate_mean, scale=1e-3), cur_df.corr_piA_piB,
                xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
                capsize=2, alpha=0.75, color=colors[idx], label=r'%s' % anc_ids[idx])
    axins.errorbar(rand_jitter(cur_df.rec_rate_mean, scale=1e-3), cur_df.corr_piA_piB,
                   xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
                   capsize=2, alpha=0.75, color=colors[idx])
    idx += 1


# Use wattersons estimator on a sample of size 3? 
Ne = 1e4
gen_time = 30.
rec_rates = np.logspace(-5, -3, 1000)
idx = 0
for t in [7e3/gen_time, 45e3/gen_time]:
    corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
    ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r'Theory ($t_a$ = %d gen)' % t)
    axins.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r'Theory ($t_a$ = %d gen)' % t)
    idx += 1

# sub region of the original image
x1, x2, y1, y2 = 5e-5, 3e-4, 0.025, 0.1
axins.set_xlim(x1, x2)
axins.set_ylim(y1, y2)
axins.set_xticklabels('')
axins.set_yticklabels('')
ax.indicate_inset_zoom(axins, label='')

ax.legend(loc=3, fontsize=10, handletextpad=0.1);
ax.set_xscale('log')
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A,\pi_B)$', fontsize=14)
debox(ax);
plt.savefig(supp_figdir + 'ceu_ustishim_LBK_single_CEU.pdf', dpi=300, bbox_inches='tight')
In [7]:
fig, ax = plt.subplots(1,1,figsize=(5,3))
axins = ax.inset_axes([0.5, 0.6, 0.47, 0.4])

# Make theory blue and orange ... 
colors = ['green', 'blue', 'orange']
mod_id = 'NA12750'
idx = 0
for x in ['NA06984', 'LBK','UstIshim']:
    cur_df = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB['ANC_ID'] == x) & (rec_rate_corr_piA_piB['MOD_ID'] == mod_1kg)]
    ax.errorbar(cur_df.rec_rate_mean, cur_df.corr_piA_piB,
                xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
                capsize=2, alpha=0.75, color=colors[idx], label=r'%s' % x)
    axins.errorbar(cur_df.rec_rate_mean, cur_df.corr_piA_piB,
                   xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
                   capsize=2, alpha=0.75, color=colors[idx])
    idx += 1


# Use wattersons estimator on a sample of size 3? 
Ne = 1e4
gen_time = 30.
rec_rates = np.logspace(-5, -3, 1000)
idx = 0
for t in [0.0, 7e3/gen_time, 45e3/gen_time]:
    corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
    ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r' $t_a$ = %d gen' % t)
    axins.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r' $t_a$ = %d gen' % t)
    idx += 1

# sub region of the original image
x1, x2, y1, y2 = 5e-5, 3e-4, 0.025, 0.1
axins.set_xlim(x1, x2)
axins.set_ylim(y1, y2)
axins.set_xticklabels('')
axins.set_yticklabels('')
ax.indicate_inset_zoom(axins, label='')

ax.legend(loc=3, fontsize=8, handletextpad=0.0);
ax.set_xscale('log')
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A,\pi_B)$', fontsize=14)
debox(ax);
plt.savefig(supp_figdir + 'ceu_ustishim_LBK_single_CEU_w_modern.pdf', dpi=300, bbox_inches='tight')

Averaging over many modern CEU samples

In [8]:
fig, ax = plt.subplots(1,1,figsize=(5,3))
# axins = ax.inset_axes([0.5, 0.6, 0.47, 0.4])

test_ceu_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CEU']['sample'].values
anc_ids = ['NA06984','LBK', 'UstIshim']

# Make theory blue and orange ... 
colors = ['green','blue', 'orange']
idx = 0
for a in anc_ids:
    # Getting the 
    cur_df_test = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB.ANC_ID == a) & (np.isin(rec_rate_corr_piA_piB.MOD_ID,test_ceu_1kg))]
    rec_rate_mean = [df_region.rec_rate_mean.values for _, df_region in cur_df_test.groupby('MOD_ID')]
    corr_piA_piB = [df_region.corr_piA_piB.values for _, df_region in cur_df_test.groupby('MOD_ID')]
    mean_rec_rates = np.vstack(rec_rate_mean)
    mean_corr_s1_s2 = np.vstack(corr_piA_piB)

    ax.scatter(np.mean(mean_rec_rates, axis=0), np.mean(mean_corr_s1_s2,axis=0), marker='x', s = 20, color=colors[idx], label=r'%s' % a)
    idx += 1


Ne = 1e4
gen_time = 30.
rec_rates = np.logspace(-5, -3, 1000)
idx = 0
for t in [0.0/gen_time, 7e3/gen_time, 45e3/gen_time]:
    corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
    ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r'$t_a$ = %d gen' % t)
    idx += 1


ax.legend(fontsize=10, labelspacing = 0.1);
ax.set_xscale('log')
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A,\pi_B)$', fontsize=14);
plt.tight_layout(); debox(ax);
plt.savefig(main_figdir + 'ceu_ustishim_LBK_trial_all_ceu_samples.pdf', dpi=300, bbox_inches='tight')

Generating a table for comparing differences in the correlation of pairwise differences

In [9]:
from scipy.stats import wilcoxon
from scipy.stats import binom

def sign_test_corr_piA_piB(x, y, test='binom', **kwargs):
    """
        Function to compute a non-parametric sign test
    """
    assert(x.size == y.size)
    diff = x - y
    if test == 'wilcoxon':
        stat, pval = wilcoxon(x, y, **kwargs)
    elif test == 'binom':
        npos = np.sum(diff > 0)
        nneg = np.sum(diff < 0)
        n = x.size
        stat = np.mean(diff)
        pval = binom.cdf(np.min([nneg, npos]), n=n, p=0.5) + (1. - binom.cdf(np.max([npos, nneg]), n=n, p=0.5))
    else:
        raise ValueError('Test is not supported!')
    return(diff, stat, pval)
In [10]:
for a in anc_ids:
    # Getting the 
    cur_df_test = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB.ANC_ID == a) & (np.isin(rec_rate_corr_piA_piB.MOD_ID,test_ceu_1kg))]
    rec_rate_mean = np.array([df_region.rec_rate_mean.values for _, df_region in cur_df_test.groupby('MOD_ID')])
    corr_piA_piB = np.array([df_region.corr_piA_piB.values for _, df_region in cur_df_test.groupby('MOD_ID')])
    # Check lower and upper tails here 
    corrs_theory_lower = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rate_mean[rec_rate_mean < 1e-4], ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
    corrs_theory_upper = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rate_mean[rec_rate_mean > 1e-4], ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
    _,_,p_val_binom_lower = sign_test_corr_piA_piB(corrs_theory_lower, corr_piA_piB[rec_rate_mean < 1e-4])
    _,_,p_val_binom_upper = sign_test_corr_piA_piB(corrs_theory_upper, corr_piA_piB[rec_rate_mean > 1e-4])
    print(a, p_val_binom_lower, p_val_binom_upper)
NA06984 6.576674038205957e-259 0.0
LBK 7.478985727921435e-08 0.0
UstIshim 3.367332435000667e-24 0.0
In [11]:
# Comparing the estimates across the mean-averaged versions
test_ceu_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CEU']['sample'].values
anc_ids = ['LBK', 'UstIshim', 'NA06984']

colors = ['blue', 'orange', 'green']
corr_piA_piB_dict = {}
idx = 0
for a in anc_ids:
    # Getting the 
    cur_df_test = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB.ANC_ID == a) & (np.isin(rec_rate_corr_piA_piB.MOD_ID,test_ceu_1kg))]
    rec_rate_mean = [df_region.rec_rate_mean.values for _, df_region in cur_df_test.groupby('MOD_ID')]
    corr_piA_piB = [df_region.corr_piA_piB.values for _, df_region in cur_df_test.groupby('MOD_ID')]
    mean_rec_rates = np.vstack(rec_rate_mean)
    mean_corr_s1_s2 = np.vstack(corr_piA_piB)
    corr_piA_piB_dict[a] = np.mean(mean_corr_s1_s2, axis=0)

collapsed_rows = []
for k in corr_piA_piB_dict.keys():
    for j in corr_piA_piB_dict.keys():
        if j != k:
            _, _, pval_wilcoxon = sign_test_corr_piA_piB(corr_piA_piB_dict[j], corr_piA_piB_dict[k], test='wilcoxon')
            _, stat, pval_binom = sign_test_corr_piA_piB(corr_piA_piB_dict[j], corr_piA_piB_dict[k], test='binom')
            cur_row = [j, k, pval_wilcoxon, pval_binom]
            collapsed_rows.append(cur_row)

df = pd.DataFrame(collapsed_rows)
df.columns = ['ID1', 'ID2', 'p-value (Wilcoxon)', 'p-value (Binomial)']
df.to_csv('../../results/corr_seg_sites/pvals_mean_LBK_UstIshim_CEU_1kg_indivs.csv', index=False)
df[:4]
Out[11]:
ID1 ID2 p-value (Wilcoxon) p-value (Binomial)
0 UstIshim LBK 0.705129 3.615946e-01
1 NA06984 LBK 0.000003 1.862645e-09
2 LBK UstIshim 0.705129 3.615946e-01
3 NA06984 UstIshim 0.000003 1.862645e-09
In [12]:
collapsed_rows = []
print(test_indivs_1kg)
for sampID in tqdm(test_indivs_1kg):
    pop = pop_1kg_df[pop_1kg_df['sample'] == sampID]['pop'].values[0]
    filt_df = rec_rate_corr_piA_piB[rec_rate_corr_piA_piB['MOD_ID'] == sampID]
    if filt_df.size > 0:
        df1 = filt_df[filt_df.ANC_ID == 'LBK']
        df2 = filt_df[filt_df.ANC_ID == 'UstIshim']

        _, _, pval_wilcoxon = sign_test_corr_piA_piB(df1['corr_piA_piB'], df2['corr_piA_piB'], test='wilcoxon')
        _, stat, pval_binom = sign_test_corr_piA_piB(df1['corr_piA_piB'].values, df2['corr_piA_piB'].values, test='binom')
        cur_row = [sampID, pop, np.round(pval_wilcoxon,5), np.round(pval_binom,5)]
        collapsed_rows.append(cur_row)

# testing this?
df = pd.DataFrame(collapsed_rows)
df.columns = ['Sample ID', 'Population','p-value (Wilcoxon)', 'p-value (Binomial)']
df = df.sort_values('Population')
df.to_csv('../../results/corr_seg_sites/pvals_LBK_UstIshim_1kg_indivs.csv', index=False)
df
['NA18486' 'NA18488' 'NA18489' 'NA18525' 'NA18526' 'NA18528' 'NA06984'
 'NA06985' 'NA06986' 'NA20845' 'NA20846' 'NA20847']
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 213.20it/s]
Out[12]:
Sample ID Population p-value (Wilcoxon) p-value (Binomial)
6 NA06984 CEU 0.62660 0.58466
7 NA06985 CEU 0.83724 0.85554
8 NA06986 CEU 0.32519 0.58466
3 NA18525 CHB 0.02522 0.36159
4 NA18526 CHB 0.00008 0.00001
5 NA18528 CHB 0.00052 0.00522
9 NA20845 GIH 0.40513 0.85554
10 NA20846 GIH 0.00710 0.04277
11 NA20847 GIH 0.00029 0.00006
0 NA18486 YRI 0.11199 0.20049
1 NA18488 YRI 0.00977 0.00143
2 NA18489 YRI 0.90533 0.36159
In [13]:
test_indivs_ceu = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB['ANC_ID'] == 'NA06984')]['MOD_ID'].unique()
In [14]:
collapsed_rows = []
# print(test_indivs_ceu)
for sampID in tqdm(test_indivs_ceu):
    if sampID != 'NA06984':
        pop = pop_1kg_df[pop_1kg_df['sample'] == sampID]['pop'].values[0]
        filt_df = rec_rate_corr_piA_piB[rec_rate_corr_piA_piB['MOD_ID'] == sampID]
        if filt_df.size > 0:
            df1 = filt_df[filt_df.ANC_ID == 'NA06984']
            df2 = filt_df[filt_df.ANC_ID == 'UstIshim']
            _, _, pval_wilcoxon = sign_test_corr_piA_piB(df1['corr_piA_piB'], df2['corr_piA_piB'], test='wilcoxon')
            _, stat, pval_binom = sign_test_corr_piA_piB(df1['corr_piA_piB'].values, df2['corr_piA_piB'].values, test='binom')
            cur_row = [sampID, pop, np.round(pval_wilcoxon,5), np.round(pval_binom,5)]
            collapsed_rows.append(cur_row)

# testing this?
df = pd.DataFrame(collapsed_rows)
df.columns = ['Sample ID', 'Population','p-value (Wilcoxon)', 'p-value (Binomial)']
df = df.sort_values('Population')
df.to_csv('../../results/corr_seg_sites/pvals_NA06984_UstIshim_1kg_indivs.csv', index=False)
df
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<00:00, 176.00it/s]
Out[14]:
Sample ID Population p-value (Wilcoxon) p-value (Binomial)
0 NA06985 CEU 0.00001 0.00000
70 NA12748 CEU 0.01790 0.01612
69 NA12718 CEU 0.00000 0.00000
68 NA12717 CEU 0.00012 0.00006
67 NA12716 CEU 0.00005 0.00032
... ... ... ... ...
28 NA11932 CEU 0.00011 0.00001
27 NA11931 CEU 0.00002 0.00001
26 NA11930 CEU 0.00123 0.00143
24 NA11919 CEU 0.00123 0.00522
97 NA12890 CEU 0.00003 0.00006

98 rows × 4 columns

$Corr(\pi_A, \pi_B)$ under Tennessen et al Demography

In [15]:
df = pd.read_csv('../../results/two_loci/tennessen_real_data_branch_len.csv')
fig, (ax,ax2) = plt.subplots(1,2,figsize=(8,4), sharey=True)

def corr_bl_2_corr_pi_kb(ta, corr_bl, theta=0.4, Ne=1e4):
    """Compute the correlation in pairwise diversity using the correlation in branch length."""
    coeff = 1. / (1. + (2 + (ta /2./Ne)/theta))
    corr_pi = coeff * corr_bl
    return(corr_pi, coeff)

colors = ['blue', 'orange', 'green']
Ne_hats = []
for i,ta in enumerate([0, 233,1500]):
    rec_rates = 10**(-df[(df.ta == ta) & (df.scenario == "Tennessen")].rec_rate.values)
    corr_bl = df[(df.ta == ta) & (df.scenario == "Tennessen")].corr_bl.values
    Ne_hat = df[(df.ta == ta) & (df.scenario == "Tennessen")].Ne_est.values[0]
    Ne_hats.append(Ne_hat)
    theta = 4.0 * Ne_hat * 1e3 * 2e-8
    corr_pi = corr_bl_2_corr_pi_kb(ta, corr_bl, theta=theta, Ne=Ne_hat)
    ax.scatter(rec_rates, corr_pi[0], marker='x', color=colors[i])

rec_rates = np.logspace(-5, -3, 1000)
for i,t in enumerate([0,233,1500]):
    Ne = Ne_hats[i]
    corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne, theta=4.0*Ne*1e3*1.2e-8)
    ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[i], label=r'$t_a$ = %d (Constant)' % t)
    idx += 1

ax.legend()
ax.set_xscale('log')
ax.set_ylabel(r'$Corr(\pi_A, \pi_B)$', fontsize=14)
ax.set_xlabel(r'Morgans', fontsize=14)
debox(ax);


colors=['blue', 'orange', 'green']
for i,t in enumerate([100, 500, 1000]):
    df = np.load('/Users/aabiddanda/Workdir/aDNA_LD/corr_seg_sites/test_corr_haphap/autosomes.paired_seg_sites.hap0.ta%d.seed1.monte_carlo_L1000.N200.npz' % t)
    ax2.scatter(df['rec_rate_mean'], df['corr_s1_s2'], marker='x', color=colors[i], label=r'$t_a=%d$' % t)

Ne_hats = [1e4, 1e4, 1e4]
rec_rates = np.logspace(-5, -2, 1000)
for i,t in enumerate([100, 500, 1000]):
    Ne = Ne_hats[i]
    corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne, theta=4.0*Ne*1e3*1.2e-8)
    ax2.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[i])
    idx += 1

ax2.set_xscale('log')
ax2.legend(frameon=False)
ax2.set_xlabel(r'Morgans', fontsize=14)
ax2.axhline(y=0, linestyle='--', color='black')
debox(ax2);

label_multipanel([ax, ax2], ['A','B'], yoff=1.1,
                 fontsize=14, fontweight='bold', va='top', ha='right')
plt.tight_layout()
plt.savefig(supp_figdir + 'corr_piApiB_tennessen_demography.pdf', dpi=300, bbox_inches='tight')
In [16]:
# DANGER: testing out with some simulated data
fig, ax = plt.subplots(1,1,figsize=(4,4))

colors=['blue', 'orange', 'green']
for i,t in enumerate([100, 500, 1000]):
    df = np.load('/Users/aabiddanda/Workdir/aDNA_LD/corr_seg_sites/test_corr_haphap/autosomes.paired_seg_sites.hap0.ta%d.seed1.monte_carlo_L1000.N200.npz' % t)
    ax.scatter(df['rec_rate_mean'], df['corr_s1_s2'], marker='x', color=colors[i], label=r'$t_a=%d$' % t)


Ne_hats = [1e4, 1e4, 1e4]
rec_rates = np.logspace(-5, -2, 1000)
for i,t in enumerate([100, 500, 1000]):
    Ne = Ne_hats[i]
    corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne, theta=4.0*Ne*1e3*1.2e-8)
    ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[i])
    idx += 1


ax.set_xscale('log')
ax.legend(frameon=False)
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A, \pi_B)$', fontsize=14)
ax.axhline(y=0, linestyle='--', color='black')
debox(ax);
ax.set_title(r'Estimation from simulations (not real-data)')
# ax.set_yscale('log')
plt.savefig(supp_figdir + 'corr_piA_piB_test_sims_w_realdataestimation.pdf', bbox_inches='tight')