# Exploring the covariance in segregating sites
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

import warnings
warnings.filterwarnings("ignore")

import re

import os
import sys
sys.path.append('../../src/')
from plot_utils import *

from tqdm import tqdm
import cartopy
import cartopy.mpl.geoaxes
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy.stats import linregress
from geopy.distance import geodesic

from patsy import dmatrices
from statsmodels.api import OLS

%load_ext autoreload
%autoreload 2

# NOTE : this should be from the plot utils 
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.ticker as mticker


main_figdir = '../../plots/hap_copying/jump_rate_chrXdata/'
supp_figdir = '../../plots/supp_figs/hap_copying/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)

%matplotlib inline

Obtaining Raw Statistics from a 1KG Panel¶

total_raw_stats_df = pd.read_csv('../../results/hap_copying/chrX_male_analysis/mle_est_real_1kg/chrX_filt.total.ls_stats.merged.csv')
total_raw_stats_df.head(10)

def quality_control_panel_select(df, panel='ceu', nsnps=8000, dups=True):
    """Filter the data frame by the panel"""
    pass_indices = np.array(['PASS' in df['Assessment'].values[i] for i in range(df.shape[0])])
    pass_indivs_raw = df.iloc[np.where(pass_indices)[0]]
    pass_indivs_filt_nsnps = pass_indivs_raw[(pass_indivs_raw['nsnps'] >= nsnps)]
    pass_indivs = pass_indivs_filt_nsnps[pass_indivs_filt_nsnps.panelID == panel]
    if dups:
        ids = []
        for i in pass_indivs.indivID.unique():
            ids.append(re.split('_|\.SG|\.DG', i)[0])

        pass_indivs['true_id'] = ids
        pass_indivs = pass_indivs.sort_values('Cov_Autosomes', ascending=False).drop_duplicates(['true_id'])
    return(pass_indivs)


def spatial_restrict(df, x=48., y=6., km_thresh=1500., age_thresh=1.5e4):
    """Restrict to samples with a given Lat/Long within a distance from a centroid."""

    global_lats = df.Lat
    global_longs = df.Long
    idx = (global_lats != '..') & (global_longs != '..')
    test_longs = global_longs[idx].values.astype(float)
    test_lats = global_lats[idx].values.astype(float)

    assert(test_longs.size == test_lats.size)

    # Setting coordinates for the middle of Europe
    centroid_coords = (x, y)
    geo_dist_centroid = np.zeros(test_longs.size)
    for i in range(test_longs.size):
        geo_dist_centroid[i] = geodesic(centroid_coords, (test_lats[i], test_longs[i])).kilometers

    # Putting in the distances from centroid here ... 
    dist = np.zeros(df.shape[0])
    dist[idx] = geo_dist_centroid
    df['centroid_dist'] = dist

    filt_df = df[(df.centroid_dist != 0) & (df.centroid_dist <= km_thresh) & (df.AvgDate_calBP <= 1.5e4)]
    filt_df = filt_df.astype({'Lat': 'float64', 'Long': 'float64', 'AvgDate_calBP': 'float64', 'Cov_Autosomes': 'float64'})
    return(filt_df)

# Do a little filtering on the number of snps per kb
snps_total = 49704
tot_len = 155234382 - 990180
nsnps = 8000
snps_per_kb = nsnps  / (tot_len / 1e3)
# print(nsnps, snps_per_kb, snps_per_kb*25)

# options for panel: (ceu|eur|fullkg)
panel = 'ceu'

# Make sure the individuals "PASS" according to the Reich-Lab assessment
pass_indivs = quality_control_panel_select(total_raw_stats_df, panel=panel, nsnps=nsnps);
# NOTE: should have a way to filter duplicate samples (and only take the higher coverage sample ... )
pass_indivs.head()
print(nsnps, snps_per_kb, snps_per_kb*25, pass_indivs.shape[0])

8000 0.05186580692349137 1.296645173087284 344

cov_auto = pass_indivs['Cov_Autosomes']
print(np.nanmean(cov_auto), np.nanstd(cov_auto), np.median(cov_auto), cov_auto.size)
fig, ax = plt.subplots(1,1,figsize=(3,3))
ax.hist(cov_auto, bins=100);
debox(ax);
ax.set_xscale('log')
ax.set_xlabel(r'Autosomal Coverage', fontsize=14);
ax.set_ylabel(r'Number of Samples', fontsize=14);

3.0218699040697676 2.906627422504752 2.3082450000000003 344

mean_age = pass_indivs.groupby('Country').mean()['AvgDate_calBP']
std_age = pass_indivs.groupby('Country').std()['AvgDate_calBP']
min_age = pass_indivs.groupby('Country')['AvgDate_calBP'].min()
max_age = pass_indivs.groupby('Country')['AvgDate_calBP'].max()

countries, cnts = np.unique(pass_indivs.Country, return_counts=True)
# Return countries in reverse chronological order 
countries_srt = countries[np.argsort(cnts)[::-1]]
cnts_srt = np.sort(cnts)[::-1]

# Make a plot of this information
fig, ax = plt.subplots(2,1,figsize=(8,5))
ax[1].bar(countries_srt, cnts_srt, color='blue')

for tick in ax[1].get_xticklabels():
    tick.set_rotation(90)
    tick.set_fontsize(8);

delta=0.35
i = 0
for c in countries_srt:
    cur_min_age = min_age[c]
    cur_max_age = max_age[c]
    ax[0].plot([i,i], [min_age[c]/1e3, max_age[c]/1e3], color='blue', solid_capstyle='butt')
    ax[0].plot([i-delta,i+delta], [min_age[c]/1e3, min_age[c]/1e3], color='blue', solid_capstyle='butt')
    ax[0].plot([i-delta,i+delta], [max_age[c]/1e3, max_age[c]/1e3], color='blue', solid_capstyle='butt')
    i += 1
ax[0].set_xticks([])
ax[0].set_ylabel(r'Age Range (kya)', fontsize=12)
ax[1].set_ylabel(r'Number of samples', fontsize=12)

fig.text(0.5, 0.00, 'Country', fontsize=14)
debox(ax[1]); debox(ax[0]);

for i, label in enumerate(('A', 'B')):
    ax[i].text(-0.05, 1.15, label, fontsize=16,
               fontweight='bold', va='top', ha='right', transform=ax[i].transAxes);

plt.tight_layout()

global_lats = pass_indivs.Lat
global_longs = pass_indivs.Long
global_lambda = pass_indivs.scale_marginal
global_lambda_se = pass_indivs.se_scale_marginal_fd
global_age = pass_indivs.AvgDate_calBP


# Some filters here with the distance here 
idx = (global_lats != '..') & (global_longs != '..') & (global_lambda < 8e2)
fig = plt.figure(figsize=(6,3))

# ax = fig.add_subplot(1,1,1, projection=ccrs.PlateCarree())
ax = plt.axes(projection=ccrs.Robinson())
# ax.set_global()
ax.coastlines(lw=0.5)
im = ax.scatter(global_longs[idx].values.astype(float),
           global_lats[idx].values.astype(float),
           c=global_lambda[idx].values.astype(float),
           transform=ccrs.PlateCarree(), s=10, alpha=0.75)

ax.set_aspect('auto')
ax.outline_patch.set_visible(False)

fig.canvas.draw()
# ax.set_title(r'Geographic Variation in $\hat{\lambda}$', fontsize=14)
plt.colorbar(im, label=r'$\hat{\lambda}$')
plt.tight_layout()
plt.savefig(supp_figdir + 'eurasia_panel_%s_global_lambda_%d.pdf' % (panel, nsnps), bbox_inches='tight')

# Calculate Distance from central europe (ideally add as another annotation)
test_longs = global_longs[idx].values.astype(float)
test_lats = global_lats[idx].values.astype(float)
test_lambda = global_lambda[idx].values.astype(float)
test_lambda_se = global_lambda_se[idx].values.astype(float)
test_ages = global_age[idx].values.astype(float)

assert(test_longs.size == test_lats.size)
assert(test_longs.size == test_lambda.size)

# Setting coordinates for the middle of Europe
ceu_coords = (48., 6.)
geo_dist_ceu = np.zeros(test_longs.size)
for i in tqdm(range(test_longs.size)):
    geo_dist_ceu[i] = geodesic(ceu_coords, (test_lats[i], test_longs[i])).kilometers

# Putting in the distances from CEU coordinates here ... 
dist = np.zeros(pass_indivs.shape[0])
dist[idx] = geo_dist_ceu
pass_indivs['CEU_dist'] = dist

100%|██████████| 344/344 [00:00<00:00, 1647.87it/s]

#Do the slight filtering here ... 
km_thresh = 1500
gen_time = 30.
test_df = pass_indivs[(pass_indivs.CEU_dist != 0) & (pass_indivs.CEU_dist <= km_thresh) & (pass_indivs.AvgDate_calBP <= 1.5e4)]
test_df = test_df.astype({'Lat': 'float64', 'Long': 'float64', 'AvgDate_calBP': 'float64', 'Cov_Autosomes': 'float64'})
test_df['AvgDateGenerations'] = test_df.AvgDate_calBP.values / gen_time

print(test_df.shape[0])

fig = plt.figure(figsize=(7.5,3.5))
gs = fig.add_gridspec(1, 6)


# age_gen = test_df.AvgDateGenerations.values.astype(float)
age_gen = test_df.AvgDateGenerations.values.astype(float)
age_years = test_df.AvgDate_calBP.values.astype(float)

ax1 = fig.add_subplot(gs[:, :3], projection=ccrs.PlateCarree())
ax1.coastlines(resolution='auto', color='k')
im = ax1.scatter(test_df.Long.values.astype(float),
           test_df.Lat.values.astype(float),
           transform=ccrs.PlateCarree(), c=age_years, s=10, alpha=0.75, rasterized=True)

fig.colorbar(im, ax=ax1, orientation='vertical', pad=0.01, fraction=0.2, label=r'Sample age (Years)')

ax1.set_aspect('auto')
ax1.outline_patch.set_visible(False)
debox(ax1);


# Linear Model to check for effect of age 
y, X = dmatrices('scale_marginal ~ AvgDateGenerations + Lat + Long + Lat:Long',
                 data=test_df,
                 return_type='dataframe')

res = sm.OLS(y,X).fit()
# print(res.summary())
intercept = res.params['Intercept']
beta_age = res.params['AvgDateGenerations']
p_value = res.pvalues['AvgDateGenerations']

marginal_lambda = test_df.scale_marginal.values
marginal_lambda_se = test_df.se_marginal.values
idx = (marginal_lambda_se < 100)

ax3 = fig.add_subplot(gs[:, 3:])
im2 = ax3.errorbar(rand_jitter(age_years[idx], scale=1e-3), marginal_lambda[idx],
                   yerr=2*marginal_lambda_se[idx],  color='blue', alpha=0.5,
                   linestyle='none', markersize=2, marker='o', capsize=2)

# Print out the results of the linear regression
print(intercept, beta_age, p_value)

ax3.set_ylabel(r'$\hat{\lambda}$  (Morgans)', fontsize=14)
ax3.set_xlabel(r'Sample age (Years)', fontsize=14)
debox(ax3);


ax1.text(-0.05, 1.1, 'A', fontsize=14, fontweight='bold', va='top', ha='right', transform=ax1.transAxes);
ax3.text(-0.05, 1.1, 'B', fontsize=14, fontweight='bold', va='top', ha='right', transform=ax3.transAxes);

fig.canvas.draw()
plt.tight_layout()
# plt.savefig(supp_figdir + 'geography_dist_CEU_thresh_%d_%s_%d.pdf' % (km_thresh, panel, nsnps), bbox_inches='tight', dpi=300)

344
606.1874580716019 -0.5260316866436665 1.2890955212357306e-30

fig, ax = plt.subplots(2,1,figsize=(4,4), sharex=True)
ax[0].scatter(test_df.AvgDate_calBP.values.astype(float), test_df.eps_jt.values, s=10 )
ax[1].scatter(test_df.AvgDate_calBP.values.astype(float), test_df.scale_jt.values, s=10)
debox_all(ax);
ax[0].set_ylabel(r'$\epsilon$')
ax[1].set_ylabel(r'$\lambda$')
ax[1].set_xlabel(r'Age (years)')

Text(0.5, 0, 'Age (years)')

Plotting results of simulated ChrX with realistic time-points¶

We essentially want to compare realistically simulated real data with the 1240K real data. Here we have focused on using the same panel size as with the real data ($K = 49$) and using variants ascertained to have $MAF > 5\%$

# Reading in the CSV with all of the simulation results
chrX_hapsim_df = pd.read_csv('../../results/hap_copying/simulations/jump_rate_est_sims_ceu_real_chrX.csv')
chrX_hapsim_df.head()

import cartopy
import cartopy.mpl.geoaxes
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig, ax = plt.subplots(1,3, figsize=(7,2.5), sharex=True, sharey=True)


# Axis 1: The real data 
axins = inset_axes(ax[0], width="60%", height="60%", loc="upper right", borderpad=0.0,
                   axes_class=cartopy.mpl.geoaxes.GeoAxes,
                   axes_kwargs=dict(map_projection=cartopy.crs.PlateCarree()))

axins.coastlines(resolution='auto', lw=0.5, color='k')

im = axins.scatter(test_df.Long.values.astype(float),
           test_df.Lat.values.astype(float),
           transform=ccrs.PlateCarree(), c=age_years, s=5, alpha=0.75)
axins.spines['geo'].set_edgecolor('white')

im2 = ax[0].errorbar(rand_jitter(age_gen, scale=1e-3),
                     marginal_lambda,
                     yerr=2*marginal_lambda_se,
                     color='blue', linestyle='none', capsize=2, alpha=0.3)

z = lowess(marginal_lambda, rand_jitter(age_gen), frac=1/3)
ax[0].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--', zorder=100)


# AXIS 1 : Tennessen et al model
filt_df = chrX_hapsim_df[chrX_hapsim_df.scenario == 'TennessenEuropeanXchr']
ax[1].errorbar(rand_jitter(filt_df.ta), filt_df.scale_marginal,
               yerr=2*filt_df.se_scale_marginal, capsize=2, linestyle='none', color='blue', alpha=0.3)

z = lowess(filt_df.scale_marginal, rand_jitter(filt_df.ta), frac=1/3)
ax[1].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--')


# AXIS 2: 
filt_df = chrX_hapsim_df[chrX_hapsim_df.scenario == 'IBDNeUK10KXchr']
ax[2].errorbar(rand_jitter(filt_df.ta), filt_df.scale_marginal,
               yerr=2*filt_df.se_scale_marginal, capsize=2, linestyle='none', color='blue', alpha=0.3)

z = lowess(filt_df.scale_marginal, rand_jitter(filt_df.ta), frac=1/3)
ax[2].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--')


# Debox all of the axes
debox_all(ax);

# Setting some labeling
ax[0].set_ylim(200,1000)
ax[0].set_xlim(0,500)
label_multipanel(ax, ['A','B','C'], fontsize=14, fontweight='bold', xoff=-0.06, yoff=1.2, va='top', ha='right')
ax[1].set_title(r'Tennessen et al (2012)', fontsize=12)
ax[2].set_title(r'IBDNe UK10K (2015)', fontsize=12)
ax[0].set_title(r'1240K Ancient Data', fontsize=12)

ax[0].set_ylabel(r'$\hat{\lambda}$ (per Morgan)', fontsize=14)
ax[1].set_xlabel('Sample age (Generations)', fontsize=14);
plt.tight_layout()
plt.savefig(main_figdir + 'copying_rate_demography_chrX_experiment_panel_%s_%d.pdf' % (panel,nsnps), dpi=300, bbox_inches='tight')

Comparing Different reference panels for estimation¶

# Running the filtering step
res_df = quality_control_panel_select(total_raw_stats_df);
euro_ceu_ancients = spatial_restrict(res_df)

res1_df = quality_control_panel_select(total_raw_stats_df, panel='eur');
euro_eur_ancients = spatial_restrict(res1_df)

res2_df = quality_control_panel_select(total_raw_stats_df, panel='fullkg');
euro_fullkg_ancients = spatial_restrict(res2_df)

fig, axs = plt.subplots(1,3, figsize=(7,2.5), sharex=True, sharey=True)


axs[0].errorbar(euro_ceu_ancients.AvgDate_calBP.values,
                euro_ceu_ancients.scale_marginal.values,
                yerr=2*euro_ceu_ancients.se_scale_marginal_fd.values,
                marker='o', markersize=2, linestyle='none', rasterized=True)

axs[1].errorbar(euro_eur_ancients.AvgDate_calBP.values,
                euro_eur_ancients.scale_marginal.values,
                yerr=2*euro_eur_ancients.se_scale_marginal_fd.values,
                marker='o', markersize=2, linestyle='none', rasterized=True)

axs[2].errorbar(euro_fullkg_ancients.AvgDate_calBP.values,
                euro_fullkg_ancients.scale_marginal.values,
                yerr=2*euro_fullkg_ancients.se_scale_marginal_fd.values,
                marker='o', markersize=2, linestyle='none')


axs[0].set_ylim(100,1200)
axs[0].set_xlim(0,1.5e4)
axs[0].set_ylabel(r'$\hat{\lambda}$ (per Morgan)', fontsize=14)
axs[1].set_xlabel(r'Sample age (years)', fontsize=14)


# Setting titles
axs[0].set_title(r'CEU', fontsize=14)
axs[1].set_title(r'EUR', fontsize=14)
axs[2].set_title(r'Full-1KG', fontsize=14)


label_multipanel(axs, ['A','B','C'], fontsize=14,
                 fontweight='bold', xoff=-0.06, yoff=1.2, va='top', ha='right')

debox_all(axs);
plt.savefig(supp_figdir + 'compare_panels_1500_central_europe.pdf', bbox_inches='tight')

	indivID	Country	Location	Lat	Long	AvgDate_calBP	Sex	Cov_Autosomes	Data_Type	Publication	...	panelID	scale_marginal	se_marginal	se_scale_marginal_fd	scale_jt	se_scale_jt	eps_jt	se_eps_jt	nsnps	nref_haps
0	I0172	Germany	Esperstedt	51.420	11.6800	5173.0	M	25.372	1240K	MathiesonNature2015 (1240k of same same sample...	...	ceu	445.274956	3.982112	3.971789	414.761336	3.625205	0.004298	0.000113	22919	49
1	I0172	Germany	Esperstedt	51.420	11.6800	5173.0	M	25.372	1240K	MathiesonNature2015 (1240k of same same sample...	...	eur	246.763477	1.149983	1.150651	243.505637	34.648345	0.001696	0.000853	26103	240
2	I0172	Germany	Esperstedt	51.420	11.6800	5173.0	M	25.372	1240K	MathiesonNature2015 (1240k of same same sample...	...	fullkg	189.884152	0.413236	0.411298	185.781447	0.319851	0.001817	0.000012	35889	1233
3	Loschbour_snpAD.DG	Luxembourg	Echternach	49.810	6.4000	8050.0	M	22.000	Shotgun.diploid	Pruefer2017	...	ceu	414.610688	3.855402	3.800422	400.689829	138.162974	0.002128	0.001628	25426	49
4	Loschbour_snpAD.DG	Luxembourg	Echternach	49.810	6.4000	8050.0	M	22.000	Shotgun.diploid	Pruefer2017	...	eur	241.639812	1.101647	1.098853	243.994774	360.262569	0.000461	0.003016	28922	240
5	Loschbour_snpAD.DG	Luxembourg	Echternach	49.810	6.4000	8050.0	M	22.000	Shotgun.diploid	Pruefer2017	...	fullkg	183.563112	0.376346	0.377115	185.471361	53.034588	0.000467	0.008552	39852	1233
6	I0585	Spain	Leon, La Brana-Arintero	42.911	-5.3778	7815.0	M	19.536	1240K	MathiesonNature2015 (1240k of same same sample...	...	ceu	462.674994	4.597107	4.607756	439.568829	753.624343	0.003314	0.016738	17521	49
7	I0585	Spain	Leon, La Brana-Arintero	42.911	-5.3778	7815.0	M	19.536	1240K	MathiesonNature2015 (1240k of same same sample...	...	eur	265.638400	1.275114	1.285644	259.841874	4352.181904	0.001969	0.033179	19899	240
8	I0585	Spain	Leon, La Brana-Arintero	42.911	-5.3778	7815.0	M	19.536	1240K	MathiesonNature2015 (1240k of same same sample...	...	fullkg	193.494748	0.421991	0.418317	191.104493	151.577852	0.001511	0.002733	27230	1233
9	BR2.SG	Hungary	Ludas-Varju-Dulo	47.820	19.9500	3140.0	M	19.164	Shotgun	GambaNatureCommunications2014	...	ceu	548.043198	4.583140	4.600606	505.970151	747.623704	0.004622	0.012192	29116	49

	scenario	Ne	scale_marginal	se_scale_marginal	scale_jt	eps_jt	se_scale_jt	se_eps_jt	n_panel	n_snps	ta	min_maf	seed
0	SerialConstant	10000	809.836054	4.179890	811.423864	0.011611	137.981242	0.000206	49.0	187165.0	10.0	5	42
1	SerialConstant	10000	870.825554	4.403957	871.474276	0.006551	99.463677	0.000124	49.0	187165.0	15.0	5	42
2	SerialConstant	10000	790.228755	4.049053	791.268439	0.008469	123.242022	0.000174	49.0	187165.0	17.0	5	42
3	SerialConstant	10000	839.006868	4.239120	839.933998	0.009002	103.383608	0.000201	49.0	187165.0	19.0	5	42
4	SerialConstant	10000	782.446199	4.070916	783.354878	0.007652	119.227691	0.000201	49.0	187165.0	25.0	5	42