# Exploring the covariance in segregating sites
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Import the relevant functions / custom classes
import sys
sys.path.insert(0, '../../src/')
from seg_sites_covar import *
from coal_cov import TwoLocusSimulation, TwoLocusTheoryConstant
from fit_corr_segsites import *
from plot_utils import *
import os
from tqdm import tqdm
from glob import glob
%load_ext autoreload
%autoreload 2
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable
main_figdir = '../../plots/two_locus_stats/corrSASB_realdata/'
supp_figdir = '../../plots/supp_figs/two_locus_stats/corrSASB_realdata/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)
%matplotlib inline
# Reading in the data
rec_rate_corr_piA_piB = pd.read_csv('../../results/corr_seg_sites/monte_carlo_est_LBK_UstIshim_modern.csv')
rec_rate_corr_piA_piB.head()
# Reading in a lot of VCF
pop_1kg_df = pd.read_table('../../data/raw_data/1kg_panel/integrated_call_samples_v3.20130502.ALL.panel')
test_yri_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'YRI']['sample'].values[:3]
test_chb_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CHB']['sample'].values[:3]
test_ceu_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CEU']['sample'].values[:3]
test_gih_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'GIH']['sample'].values[:3]
test_indivs_1kg = np.hstack([test_yri_1kg, test_chb_1kg, test_ceu_1kg, test_gih_1kg])
mod_1kg = 'NA12383'
anc_ids = ['LBK', 'UstIshim']
fig, ax = plt.subplots(1,1,figsize=(5,3))
axins = ax.inset_axes([0.5, 0.6, 0.47, 0.4])
# Make theory blue and orange ...
colors = ['blue', 'orange']
idx = 0
for x in anc_ids:
cur_df = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB['ANC_ID'] == x) & (rec_rate_corr_piA_piB['MOD_ID'] == mod_1kg)]
ax.errorbar(rand_jitter(cur_df.rec_rate_mean, scale=1e-3), cur_df.corr_piA_piB,
xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
capsize=2, alpha=0.75, color=colors[idx], label=r'%s' % anc_ids[idx])
axins.errorbar(rand_jitter(cur_df.rec_rate_mean, scale=1e-3), cur_df.corr_piA_piB,
xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
capsize=2, alpha=0.75, color=colors[idx])
idx += 1
# Use wattersons estimator on a sample of size 3?
Ne = 1e4
gen_time = 30.
rec_rates = np.logspace(-5, -3, 1000)
idx = 0
for t in [7e3/gen_time, 45e3/gen_time]:
corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r'Theory ($t_a$ = %d gen)' % t)
axins.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r'Theory ($t_a$ = %d gen)' % t)
idx += 1
# sub region of the original image
x1, x2, y1, y2 = 5e-5, 3e-4, 0.025, 0.1
axins.set_xlim(x1, x2)
axins.set_ylim(y1, y2)
axins.set_xticklabels('')
axins.set_yticklabels('')
ax.indicate_inset_zoom(axins, label='')
ax.legend(loc=3, fontsize=10, handletextpad=0.1);
ax.set_xscale('log')
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A,\pi_B)$', fontsize=14)
debox(ax);
plt.savefig(supp_figdir + 'ceu_ustishim_LBK_single_CEU.pdf', dpi=300, bbox_inches='tight')
fig, ax = plt.subplots(1,1,figsize=(5,3))
axins = ax.inset_axes([0.5, 0.6, 0.47, 0.4])
# Make theory blue and orange ...
colors = ['green', 'blue', 'orange']
mod_id = 'NA12750'
idx = 0
for x in ['NA06984', 'LBK','UstIshim']:
cur_df = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB['ANC_ID'] == x) & (rec_rate_corr_piA_piB['MOD_ID'] == mod_1kg)]
ax.errorbar(cur_df.rec_rate_mean, cur_df.corr_piA_piB,
xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
capsize=2, alpha=0.75, color=colors[idx], label=r'%s' % x)
axins.errorbar(cur_df.rec_rate_mean, cur_df.corr_piA_piB,
xerr=2*cur_df.rec_rate_se, yerr=2*cur_df.se_corr_piA_piB,
capsize=2, alpha=0.75, color=colors[idx])
idx += 1
# Use wattersons estimator on a sample of size 3?
Ne = 1e4
gen_time = 30.
rec_rates = np.logspace(-5, -3, 1000)
idx = 0
for t in [0.0, 7e3/gen_time, 45e3/gen_time]:
corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r' $t_a$ = %d gen' % t)
axins.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r' $t_a$ = %d gen' % t)
idx += 1
# sub region of the original image
x1, x2, y1, y2 = 5e-5, 3e-4, 0.025, 0.1
axins.set_xlim(x1, x2)
axins.set_ylim(y1, y2)
axins.set_xticklabels('')
axins.set_yticklabels('')
ax.indicate_inset_zoom(axins, label='')
ax.legend(loc=3, fontsize=8, handletextpad=0.0);
ax.set_xscale('log')
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A,\pi_B)$', fontsize=14)
debox(ax);
plt.savefig(supp_figdir + 'ceu_ustishim_LBK_single_CEU_w_modern.pdf', dpi=300, bbox_inches='tight')
fig, ax = plt.subplots(1,1,figsize=(5,3))
# axins = ax.inset_axes([0.5, 0.6, 0.47, 0.4])
test_ceu_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CEU']['sample'].values
anc_ids = ['NA06984','LBK', 'UstIshim']
# Make theory blue and orange ...
colors = ['green','blue', 'orange']
idx = 0
for a in anc_ids:
# Getting the
cur_df_test = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB.ANC_ID == a) & (np.isin(rec_rate_corr_piA_piB.MOD_ID,test_ceu_1kg))]
rec_rate_mean = [df_region.rec_rate_mean.values for _, df_region in cur_df_test.groupby('MOD_ID')]
corr_piA_piB = [df_region.corr_piA_piB.values for _, df_region in cur_df_test.groupby('MOD_ID')]
mean_rec_rates = np.vstack(rec_rate_mean)
mean_corr_s1_s2 = np.vstack(corr_piA_piB)
ax.scatter(np.mean(mean_rec_rates, axis=0), np.mean(mean_corr_s1_s2,axis=0), marker='x', s = 20, color=colors[idx], label=r'%s' % a)
idx += 1
Ne = 1e4
gen_time = 30.
rec_rates = np.logspace(-5, -3, 1000)
idx = 0
for t in [0.0/gen_time, 7e3/gen_time, 45e3/gen_time]:
corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[idx], label=r'$t_a$ = %d gen' % t)
idx += 1
ax.legend(fontsize=10, labelspacing = 0.1);
ax.set_xscale('log')
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A,\pi_B)$', fontsize=14);
plt.tight_layout(); debox(ax);
plt.savefig(main_figdir + 'ceu_ustishim_LBK_trial_all_ceu_samples.pdf', dpi=300, bbox_inches='tight')
from scipy.stats import wilcoxon
from scipy.stats import binom
def sign_test_corr_piA_piB(x, y, test='binom', **kwargs):
"""
Function to compute a non-parametric sign test
"""
assert(x.size == y.size)
diff = x - y
if test == 'wilcoxon':
stat, pval = wilcoxon(x, y, **kwargs)
elif test == 'binom':
npos = np.sum(diff > 0)
nneg = np.sum(diff < 0)
n = x.size
stat = np.mean(diff)
pval = binom.cdf(np.min([nneg, npos]), n=n, p=0.5) + (1. - binom.cdf(np.max([npos, nneg]), n=n, p=0.5))
else:
raise ValueError('Test is not supported!')
return(diff, stat, pval)
for a in anc_ids:
# Getting the
cur_df_test = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB.ANC_ID == a) & (np.isin(rec_rate_corr_piA_piB.MOD_ID,test_ceu_1kg))]
rec_rate_mean = np.array([df_region.rec_rate_mean.values for _, df_region in cur_df_test.groupby('MOD_ID')])
corr_piA_piB = np.array([df_region.corr_piA_piB.values for _, df_region in cur_df_test.groupby('MOD_ID')])
# Check lower and upper tails here
corrs_theory_lower = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rate_mean[rec_rate_mean < 1e-4], ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
corrs_theory_upper = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rate_mean[rec_rate_mean > 1e-4], ta=t/Ne/2, theta=4*Ne*1e3*1.2e-8)
_,_,p_val_binom_lower = sign_test_corr_piA_piB(corrs_theory_lower, corr_piA_piB[rec_rate_mean < 1e-4])
_,_,p_val_binom_upper = sign_test_corr_piA_piB(corrs_theory_upper, corr_piA_piB[rec_rate_mean > 1e-4])
print(a, p_val_binom_lower, p_val_binom_upper)
# Comparing the estimates across the mean-averaged versions
test_ceu_1kg = pop_1kg_df[pop_1kg_df['pop'] == 'CEU']['sample'].values
anc_ids = ['LBK', 'UstIshim', 'NA06984']
colors = ['blue', 'orange', 'green']
corr_piA_piB_dict = {}
idx = 0
for a in anc_ids:
# Getting the
cur_df_test = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB.ANC_ID == a) & (np.isin(rec_rate_corr_piA_piB.MOD_ID,test_ceu_1kg))]
rec_rate_mean = [df_region.rec_rate_mean.values for _, df_region in cur_df_test.groupby('MOD_ID')]
corr_piA_piB = [df_region.corr_piA_piB.values for _, df_region in cur_df_test.groupby('MOD_ID')]
mean_rec_rates = np.vstack(rec_rate_mean)
mean_corr_s1_s2 = np.vstack(corr_piA_piB)
corr_piA_piB_dict[a] = np.mean(mean_corr_s1_s2, axis=0)
collapsed_rows = []
for k in corr_piA_piB_dict.keys():
for j in corr_piA_piB_dict.keys():
if j != k:
_, _, pval_wilcoxon = sign_test_corr_piA_piB(corr_piA_piB_dict[j], corr_piA_piB_dict[k], test='wilcoxon')
_, stat, pval_binom = sign_test_corr_piA_piB(corr_piA_piB_dict[j], corr_piA_piB_dict[k], test='binom')
cur_row = [j, k, pval_wilcoxon, pval_binom]
collapsed_rows.append(cur_row)
df = pd.DataFrame(collapsed_rows)
df.columns = ['ID1', 'ID2', 'p-value (Wilcoxon)', 'p-value (Binomial)']
df.to_csv('../../results/corr_seg_sites/pvals_mean_LBK_UstIshim_CEU_1kg_indivs.csv', index=False)
df[:4]
collapsed_rows = []
print(test_indivs_1kg)
for sampID in tqdm(test_indivs_1kg):
pop = pop_1kg_df[pop_1kg_df['sample'] == sampID]['pop'].values[0]
filt_df = rec_rate_corr_piA_piB[rec_rate_corr_piA_piB['MOD_ID'] == sampID]
if filt_df.size > 0:
df1 = filt_df[filt_df.ANC_ID == 'LBK']
df2 = filt_df[filt_df.ANC_ID == 'UstIshim']
_, _, pval_wilcoxon = sign_test_corr_piA_piB(df1['corr_piA_piB'], df2['corr_piA_piB'], test='wilcoxon')
_, stat, pval_binom = sign_test_corr_piA_piB(df1['corr_piA_piB'].values, df2['corr_piA_piB'].values, test='binom')
cur_row = [sampID, pop, np.round(pval_wilcoxon,5), np.round(pval_binom,5)]
collapsed_rows.append(cur_row)
# testing this?
df = pd.DataFrame(collapsed_rows)
df.columns = ['Sample ID', 'Population','p-value (Wilcoxon)', 'p-value (Binomial)']
df = df.sort_values('Population')
df.to_csv('../../results/corr_seg_sites/pvals_LBK_UstIshim_1kg_indivs.csv', index=False)
df
test_indivs_ceu = rec_rate_corr_piA_piB[(rec_rate_corr_piA_piB['ANC_ID'] == 'NA06984')]['MOD_ID'].unique()
collapsed_rows = []
# print(test_indivs_ceu)
for sampID in tqdm(test_indivs_ceu):
if sampID != 'NA06984':
pop = pop_1kg_df[pop_1kg_df['sample'] == sampID]['pop'].values[0]
filt_df = rec_rate_corr_piA_piB[rec_rate_corr_piA_piB['MOD_ID'] == sampID]
if filt_df.size > 0:
df1 = filt_df[filt_df.ANC_ID == 'NA06984']
df2 = filt_df[filt_df.ANC_ID == 'UstIshim']
_, _, pval_wilcoxon = sign_test_corr_piA_piB(df1['corr_piA_piB'], df2['corr_piA_piB'], test='wilcoxon')
_, stat, pval_binom = sign_test_corr_piA_piB(df1['corr_piA_piB'].values, df2['corr_piA_piB'].values, test='binom')
cur_row = [sampID, pop, np.round(pval_wilcoxon,5), np.round(pval_binom,5)]
collapsed_rows.append(cur_row)
# testing this?
df = pd.DataFrame(collapsed_rows)
df.columns = ['Sample ID', 'Population','p-value (Wilcoxon)', 'p-value (Binomial)']
df = df.sort_values('Population')
df.to_csv('../../results/corr_seg_sites/pvals_NA06984_UstIshim_1kg_indivs.csv', index=False)
df
df = pd.read_csv('../../results/two_loci/tennessen_real_data_branch_len.csv')
fig, (ax,ax2) = plt.subplots(1,2,figsize=(8,4), sharey=True)
def corr_bl_2_corr_pi_kb(ta, corr_bl, theta=0.4, Ne=1e4):
"""Compute the correlation in pairwise diversity using the correlation in branch length."""
coeff = 1. / (1. + (2 + (ta /2./Ne)/theta))
corr_pi = coeff * corr_bl
return(corr_pi, coeff)
colors = ['blue', 'orange', 'green']
Ne_hats = []
for i,ta in enumerate([0, 233,1500]):
rec_rates = 10**(-df[(df.ta == ta) & (df.scenario == "Tennessen")].rec_rate.values)
corr_bl = df[(df.ta == ta) & (df.scenario == "Tennessen")].corr_bl.values
Ne_hat = df[(df.ta == ta) & (df.scenario == "Tennessen")].Ne_est.values[0]
Ne_hats.append(Ne_hat)
theta = 4.0 * Ne_hat * 1e3 * 2e-8
corr_pi = corr_bl_2_corr_pi_kb(ta, corr_bl, theta=theta, Ne=Ne_hat)
ax.scatter(rec_rates, corr_pi[0], marker='x', color=colors[i])
rec_rates = np.logspace(-5, -3, 1000)
for i,t in enumerate([0,233,1500]):
Ne = Ne_hats[i]
corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne, theta=4.0*Ne*1e3*1.2e-8)
ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[i], label=r'$t_a$ = %d (Constant)' % t)
idx += 1
ax.legend()
ax.set_xscale('log')
ax.set_ylabel(r'$Corr(\pi_A, \pi_B)$', fontsize=14)
ax.set_xlabel(r'Morgans', fontsize=14)
debox(ax);
colors=['blue', 'orange', 'green']
for i,t in enumerate([100, 500, 1000]):
df = np.load('/Users/aabiddanda/Workdir/aDNA_LD/corr_seg_sites/test_corr_haphap/autosomes.paired_seg_sites.hap0.ta%d.seed1.monte_carlo_L1000.N200.npz' % t)
ax2.scatter(df['rec_rate_mean'], df['corr_s1_s2'], marker='x', color=colors[i], label=r'$t_a=%d$' % t)
Ne_hats = [1e4, 1e4, 1e4]
rec_rates = np.logspace(-5, -2, 1000)
for i,t in enumerate([100, 500, 1000]):
Ne = Ne_hats[i]
corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne, theta=4.0*Ne*1e3*1.2e-8)
ax2.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[i])
idx += 1
ax2.set_xscale('log')
ax2.legend(frameon=False)
ax2.set_xlabel(r'Morgans', fontsize=14)
ax2.axhline(y=0, linestyle='--', color='black')
debox(ax2);
label_multipanel([ax, ax2], ['A','B'], yoff=1.1,
fontsize=14, fontweight='bold', va='top', ha='right')
plt.tight_layout()
plt.savefig(supp_figdir + 'corr_piApiB_tennessen_demography.pdf', dpi=300, bbox_inches='tight')
# DANGER: testing out with some simulated data
fig, ax = plt.subplots(1,1,figsize=(4,4))
colors=['blue', 'orange', 'green']
for i,t in enumerate([100, 500, 1000]):
df = np.load('/Users/aabiddanda/Workdir/aDNA_LD/corr_seg_sites/test_corr_haphap/autosomes.paired_seg_sites.hap0.ta%d.seed1.monte_carlo_L1000.N200.npz' % t)
ax.scatter(df['rec_rate_mean'], df['corr_s1_s2'], marker='x', color=colors[i], label=r'$t_a=%d$' % t)
Ne_hats = [1e4, 1e4, 1e4]
rec_rates = np.logspace(-5, -2, 1000)
for i,t in enumerate([100, 500, 1000]):
Ne = Ne_hats[i]
corrs_test1 = TwoLocusTheoryConstant._corrSASB(4*Ne*rec_rates, ta=t/Ne, theta=4.0*Ne*1e3*1.2e-8)
ax.plot(rec_rates, corrs_test1, lw=2, zorder=-1, color=colors[i])
idx += 1
ax.set_xscale('log')
ax.legend(frameon=False)
ax.set_xlabel(r'Morgans', fontsize=14)
ax.set_ylabel(r'$Corr(\pi_A, \pi_B)$', fontsize=14)
ax.axhline(y=0, linestyle='--', color='black')
debox(ax);
ax.set_title(r'Estimation from simulations (not real-data)')
# ax.set_yscale('log')
plt.savefig(supp_figdir + 'corr_piA_piB_test_sims_w_realdataestimation.pdf', bbox_inches='tight')