In [1]:
# Exploring the covariance in segregating sites
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

import warnings
warnings.filterwarnings("ignore")

import re

import os
import sys
sys.path.append('../../src/')
from plot_utils import *

from tqdm import tqdm
import cartopy
import cartopy.mpl.geoaxes
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy.stats import linregress
from geopy.distance import geodesic

from patsy import dmatrices
from statsmodels.api import OLS

%load_ext autoreload
%autoreload 2
In [2]:
# NOTE : this should be from the plot utils 
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.ticker as mticker


main_figdir = '../../plots/hap_copying/jump_rate_chrXdata/'
supp_figdir = '../../plots/supp_figs/hap_copying/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)

%matplotlib inline

Obtaining Raw Statistics from a 1KG Panel

In [3]:
total_raw_stats_df = pd.read_csv('../../results/hap_copying/chrX_male_analysis/mle_est_real_1kg/chrX_filt.total.ls_stats.merged.csv')
total_raw_stats_df.head(10)
Out[3]:
indivID Country Location Lat Long AvgDate_calBP Sex Cov_Autosomes Data_Type Publication ... panelID scale_marginal se_marginal se_scale_marginal_fd scale_jt se_scale_jt eps_jt se_eps_jt nsnps nref_haps
0 I0172 Germany Esperstedt 51.420 11.6800 5173.0 M 25.372 1240K MathiesonNature2015 (1240k of same same sample... ... ceu 445.274956 3.982112 3.971789 414.761336 3.625205 0.004298 0.000113 22919 49
1 I0172 Germany Esperstedt 51.420 11.6800 5173.0 M 25.372 1240K MathiesonNature2015 (1240k of same same sample... ... eur 246.763477 1.149983 1.150651 243.505637 34.648345 0.001696 0.000853 26103 240
2 I0172 Germany Esperstedt 51.420 11.6800 5173.0 M 25.372 1240K MathiesonNature2015 (1240k of same same sample... ... fullkg 189.884152 0.413236 0.411298 185.781447 0.319851 0.001817 0.000012 35889 1233
3 Loschbour_snpAD.DG Luxembourg Echternach 49.810 6.4000 8050.0 M 22.000 Shotgun.diploid Pruefer2017 ... ceu 414.610688 3.855402 3.800422 400.689829 138.162974 0.002128 0.001628 25426 49
4 Loschbour_snpAD.DG Luxembourg Echternach 49.810 6.4000 8050.0 M 22.000 Shotgun.diploid Pruefer2017 ... eur 241.639812 1.101647 1.098853 243.994774 360.262569 0.000461 0.003016 28922 240
5 Loschbour_snpAD.DG Luxembourg Echternach 49.810 6.4000 8050.0 M 22.000 Shotgun.diploid Pruefer2017 ... fullkg 183.563112 0.376346 0.377115 185.471361 53.034588 0.000467 0.008552 39852 1233
6 I0585 Spain Leon, La Brana-Arintero 42.911 -5.3778 7815.0 M 19.536 1240K MathiesonNature2015 (1240k of same same sample... ... ceu 462.674994 4.597107 4.607756 439.568829 753.624343 0.003314 0.016738 17521 49
7 I0585 Spain Leon, La Brana-Arintero 42.911 -5.3778 7815.0 M 19.536 1240K MathiesonNature2015 (1240k of same same sample... ... eur 265.638400 1.275114 1.285644 259.841874 4352.181904 0.001969 0.033179 19899 240
8 I0585 Spain Leon, La Brana-Arintero 42.911 -5.3778 7815.0 M 19.536 1240K MathiesonNature2015 (1240k of same same sample... ... fullkg 193.494748 0.421991 0.418317 191.104493 151.577852 0.001511 0.002733 27230 1233
9 BR2.SG Hungary Ludas-Varju-Dulo 47.820 19.9500 3140.0 M 19.164 Shotgun GambaNatureCommunications2014 ... ceu 548.043198 4.583140 4.600606 505.970151 747.623704 0.004622 0.012192 29116 49

10 rows × 23 columns

In [4]:
def quality_control_panel_select(df, panel='ceu', nsnps=8000, dups=True):
    """Filter the data frame by the panel"""
    pass_indices = np.array(['PASS' in df['Assessment'].values[i] for i in range(df.shape[0])])
    pass_indivs_raw = df.iloc[np.where(pass_indices)[0]]
    pass_indivs_filt_nsnps = pass_indivs_raw[(pass_indivs_raw['nsnps'] >= nsnps)]
    pass_indivs = pass_indivs_filt_nsnps[pass_indivs_filt_nsnps.panelID == panel]
    if dups:
        ids = []
        for i in pass_indivs.indivID.unique():
            ids.append(re.split('_|\.SG|\.DG', i)[0])

        pass_indivs['true_id'] = ids
        pass_indivs = pass_indivs.sort_values('Cov_Autosomes', ascending=False).drop_duplicates(['true_id'])
    return(pass_indivs)


def spatial_restrict(df, x=48., y=6., km_thresh=1500., age_thresh=1.5e4):
    """Restrict to samples with a given Lat/Long within a distance from a centroid."""

    global_lats = df.Lat
    global_longs = df.Long
    idx = (global_lats != '..') & (global_longs != '..')
    test_longs = global_longs[idx].values.astype(float)
    test_lats = global_lats[idx].values.astype(float)

    assert(test_longs.size == test_lats.size)

    # Setting coordinates for the middle of Europe
    centroid_coords = (x, y)
    geo_dist_centroid = np.zeros(test_longs.size)
    for i in range(test_longs.size):
        geo_dist_centroid[i] = geodesic(centroid_coords, (test_lats[i], test_longs[i])).kilometers

    # Putting in the distances from centroid here ... 
    dist = np.zeros(df.shape[0])
    dist[idx] = geo_dist_centroid
    df['centroid_dist'] = dist

    filt_df = df[(df.centroid_dist != 0) & (df.centroid_dist <= km_thresh) & (df.AvgDate_calBP <= 1.5e4)]
    filt_df = filt_df.astype({'Lat': 'float64', 'Long': 'float64', 'AvgDate_calBP': 'float64', 'Cov_Autosomes': 'float64'})
    return(filt_df)
In [5]:
# Do a little filtering on the number of snps per kb
snps_total = 49704
tot_len = 155234382 - 990180
nsnps = 8000
snps_per_kb = nsnps  / (tot_len / 1e3)
# print(nsnps, snps_per_kb, snps_per_kb*25)

# options for panel: (ceu|eur|fullkg)
panel = 'ceu'

# Make sure the individuals "PASS" according to the Reich-Lab assessment
pass_indivs = quality_control_panel_select(total_raw_stats_df, panel=panel, nsnps=nsnps);
# NOTE: should have a way to filter duplicate samples (and only take the higher coverage sample ... )
pass_indivs.head()
print(nsnps, snps_per_kb, snps_per_kb*25, pass_indivs.shape[0])
8000 0.05186580692349137 1.296645173087284 344
In [6]:
cov_auto = pass_indivs['Cov_Autosomes']
print(np.nanmean(cov_auto), np.nanstd(cov_auto), np.median(cov_auto), cov_auto.size)
fig, ax = plt.subplots(1,1,figsize=(3,3))
ax.hist(cov_auto, bins=100);
debox(ax);
ax.set_xscale('log')
ax.set_xlabel(r'Autosomal Coverage', fontsize=14);
ax.set_ylabel(r'Number of Samples', fontsize=14);
3.0218699040697676 2.906627422504752 2.3082450000000003 344
In [7]:
mean_age = pass_indivs.groupby('Country').mean()['AvgDate_calBP']
std_age = pass_indivs.groupby('Country').std()['AvgDate_calBP']
min_age = pass_indivs.groupby('Country')['AvgDate_calBP'].min()
max_age = pass_indivs.groupby('Country')['AvgDate_calBP'].max()
In [8]:
countries, cnts = np.unique(pass_indivs.Country, return_counts=True)
# Return countries in reverse chronological order 
countries_srt = countries[np.argsort(cnts)[::-1]]
cnts_srt = np.sort(cnts)[::-1]

# Make a plot of this information
fig, ax = plt.subplots(2,1,figsize=(8,5))
ax[1].bar(countries_srt, cnts_srt, color='blue')

for tick in ax[1].get_xticklabels():
    tick.set_rotation(90)
    tick.set_fontsize(8);

delta=0.35
i = 0
for c in countries_srt:
    cur_min_age = min_age[c]
    cur_max_age = max_age[c]
    ax[0].plot([i,i], [min_age[c]/1e3, max_age[c]/1e3], color='blue', solid_capstyle='butt')
    ax[0].plot([i-delta,i+delta], [min_age[c]/1e3, min_age[c]/1e3], color='blue', solid_capstyle='butt')
    ax[0].plot([i-delta,i+delta], [max_age[c]/1e3, max_age[c]/1e3], color='blue', solid_capstyle='butt')
    i += 1
ax[0].set_xticks([])
ax[0].set_ylabel(r'Age Range (kya)', fontsize=12)
ax[1].set_ylabel(r'Number of samples', fontsize=12)

fig.text(0.5, 0.00, 'Country', fontsize=14)
debox(ax[1]); debox(ax[0]);

for i, label in enumerate(('A', 'B')):
    ax[i].text(-0.05, 1.15, label, fontsize=16,
               fontweight='bold', va='top', ha='right', transform=ax[i].transAxes);

plt.tight_layout()
In [9]:
global_lats = pass_indivs.Lat
global_longs = pass_indivs.Long
global_lambda = pass_indivs.scale_marginal
global_lambda_se = pass_indivs.se_scale_marginal_fd
global_age = pass_indivs.AvgDate_calBP


# Some filters here with the distance here 
idx = (global_lats != '..') & (global_longs != '..') & (global_lambda < 8e2)
fig = plt.figure(figsize=(6,3))

# ax = fig.add_subplot(1,1,1, projection=ccrs.PlateCarree())
ax = plt.axes(projection=ccrs.Robinson())
# ax.set_global()
ax.coastlines(lw=0.5)
im = ax.scatter(global_longs[idx].values.astype(float),
           global_lats[idx].values.astype(float),
           c=global_lambda[idx].values.astype(float),
           transform=ccrs.PlateCarree(), s=10, alpha=0.75)

ax.set_aspect('auto')
ax.outline_patch.set_visible(False)

fig.canvas.draw()
# ax.set_title(r'Geographic Variation in $\hat{\lambda}$', fontsize=14)
plt.colorbar(im, label=r'$\hat{\lambda}$')
plt.tight_layout()
plt.savefig(supp_figdir + 'eurasia_panel_%s_global_lambda_%d.pdf' % (panel, nsnps), bbox_inches='tight')
In [10]:
# Calculate Distance from central europe (ideally add as another annotation)
test_longs = global_longs[idx].values.astype(float)
test_lats = global_lats[idx].values.astype(float)
test_lambda = global_lambda[idx].values.astype(float)
test_lambda_se = global_lambda_se[idx].values.astype(float)
test_ages = global_age[idx].values.astype(float)

assert(test_longs.size == test_lats.size)
assert(test_longs.size == test_lambda.size)

# Setting coordinates for the middle of Europe
ceu_coords = (48., 6.)
geo_dist_ceu = np.zeros(test_longs.size)
for i in tqdm(range(test_longs.size)):
    geo_dist_ceu[i] = geodesic(ceu_coords, (test_lats[i], test_longs[i])).kilometers

# Putting in the distances from CEU coordinates here ... 
dist = np.zeros(pass_indivs.shape[0])
dist[idx] = geo_dist_ceu
pass_indivs['CEU_dist'] = dist
100%|██████████| 344/344 [00:00<00:00, 1647.87it/s]
In [11]:
#Do the slight filtering here ... 
km_thresh = 1500
gen_time = 30.
test_df = pass_indivs[(pass_indivs.CEU_dist != 0) & (pass_indivs.CEU_dist <= km_thresh) & (pass_indivs.AvgDate_calBP <= 1.5e4)]
test_df = test_df.astype({'Lat': 'float64', 'Long': 'float64', 'AvgDate_calBP': 'float64', 'Cov_Autosomes': 'float64'})
test_df['AvgDateGenerations'] = test_df.AvgDate_calBP.values / gen_time

print(test_df.shape[0])

fig = plt.figure(figsize=(7.5,3.5))
gs = fig.add_gridspec(1, 6)


# age_gen = test_df.AvgDateGenerations.values.astype(float)
age_gen = test_df.AvgDateGenerations.values.astype(float)
age_years = test_df.AvgDate_calBP.values.astype(float)

ax1 = fig.add_subplot(gs[:, :3], projection=ccrs.PlateCarree())
ax1.coastlines(resolution='auto', color='k')
im = ax1.scatter(test_df.Long.values.astype(float),
           test_df.Lat.values.astype(float),
           transform=ccrs.PlateCarree(), c=age_years, s=10, alpha=0.75, rasterized=True)

fig.colorbar(im, ax=ax1, orientation='vertical', pad=0.01, fraction=0.2, label=r'Sample age (Years)')

ax1.set_aspect('auto')
ax1.outline_patch.set_visible(False)
debox(ax1);


# Linear Model to check for effect of age 
y, X = dmatrices('scale_marginal ~ AvgDateGenerations + Lat + Long + Lat:Long',
                 data=test_df,
                 return_type='dataframe')

res = sm.OLS(y,X).fit()
# print(res.summary())
intercept = res.params['Intercept']
beta_age = res.params['AvgDateGenerations']
p_value = res.pvalues['AvgDateGenerations']

marginal_lambda = test_df.scale_marginal.values
marginal_lambda_se = test_df.se_marginal.values
idx = (marginal_lambda_se < 100)

ax3 = fig.add_subplot(gs[:, 3:])
im2 = ax3.errorbar(rand_jitter(age_years[idx], scale=1e-3), marginal_lambda[idx],
                   yerr=2*marginal_lambda_se[idx],  color='blue', alpha=0.5,
                   linestyle='none', markersize=2, marker='o', capsize=2)

# Print out the results of the linear regression
print(intercept, beta_age, p_value)

ax3.set_ylabel(r'$\hat{\lambda}$  (Morgans)', fontsize=14)
ax3.set_xlabel(r'Sample age (Years)', fontsize=14)
debox(ax3);


ax1.text(-0.05, 1.1, 'A', fontsize=14, fontweight='bold', va='top', ha='right', transform=ax1.transAxes);
ax3.text(-0.05, 1.1, 'B', fontsize=14, fontweight='bold', va='top', ha='right', transform=ax3.transAxes);

fig.canvas.draw()
plt.tight_layout()
# plt.savefig(supp_figdir + 'geography_dist_CEU_thresh_%d_%s_%d.pdf' % (km_thresh, panel, nsnps), bbox_inches='tight', dpi=300)
344
606.1874580716019 -0.5260316866436665 1.2890955212357306e-30
In [12]:
fig, ax = plt.subplots(2,1,figsize=(4,4), sharex=True)
ax[0].scatter(test_df.AvgDate_calBP.values.astype(float), test_df.eps_jt.values, s=10 )
ax[1].scatter(test_df.AvgDate_calBP.values.astype(float), test_df.scale_jt.values, s=10)
debox_all(ax);
ax[0].set_ylabel(r'$\epsilon$')
ax[1].set_ylabel(r'$\lambda$')
ax[1].set_xlabel(r'Age (years)')
Out[12]:
Text(0.5, 0, 'Age (years)')

Plotting results of simulated ChrX with realistic time-points

We essentially want to compare realistically simulated real data with the 1240K real data. Here we have focused on using the same panel size as with the real data ($K = 49$) and using variants ascertained to have $MAF > 5\%$

In [13]:
# Reading in the CSV with all of the simulation results
chrX_hapsim_df = pd.read_csv('../../results/hap_copying/simulations/jump_rate_est_sims_ceu_real_chrX.csv')
chrX_hapsim_df.head()
Out[13]:
scenario Ne scale_marginal se_scale_marginal scale_jt eps_jt se_scale_jt se_eps_jt n_panel n_snps ta min_maf seed
0 SerialConstant 10000 809.836054 4.179890 811.423864 0.011611 137.981242 0.000206 49.0 187165.0 10.0 5 42
1 SerialConstant 10000 870.825554 4.403957 871.474276 0.006551 99.463677 0.000124 49.0 187165.0 15.0 5 42
2 SerialConstant 10000 790.228755 4.049053 791.268439 0.008469 123.242022 0.000174 49.0 187165.0 17.0 5 42
3 SerialConstant 10000 839.006868 4.239120 839.933998 0.009002 103.383608 0.000201 49.0 187165.0 19.0 5 42
4 SerialConstant 10000 782.446199 4.070916 783.354878 0.007652 119.227691 0.000201 49.0 187165.0 25.0 5 42
In [14]:
import cartopy
import cartopy.mpl.geoaxes
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

fig, ax = plt.subplots(1,3, figsize=(7,2.5), sharex=True, sharey=True)


# Axis 1: The real data 
axins = inset_axes(ax[0], width="60%", height="60%", loc="upper right", borderpad=0.0,
                   axes_class=cartopy.mpl.geoaxes.GeoAxes,
                   axes_kwargs=dict(map_projection=cartopy.crs.PlateCarree()))

axins.coastlines(resolution='auto', lw=0.5, color='k')

im = axins.scatter(test_df.Long.values.astype(float),
           test_df.Lat.values.astype(float),
           transform=ccrs.PlateCarree(), c=age_years, s=5, alpha=0.75)
axins.spines['geo'].set_edgecolor('white')

im2 = ax[0].errorbar(rand_jitter(age_gen, scale=1e-3),
                     marginal_lambda,
                     yerr=2*marginal_lambda_se,
                     color='blue', linestyle='none', capsize=2, alpha=0.3)

z = lowess(marginal_lambda, rand_jitter(age_gen), frac=1/3)
ax[0].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--', zorder=100)


# AXIS 1 : Tennessen et al model
filt_df = chrX_hapsim_df[chrX_hapsim_df.scenario == 'TennessenEuropeanXchr']
ax[1].errorbar(rand_jitter(filt_df.ta), filt_df.scale_marginal,
               yerr=2*filt_df.se_scale_marginal, capsize=2, linestyle='none', color='blue', alpha=0.3)

z = lowess(filt_df.scale_marginal, rand_jitter(filt_df.ta), frac=1/3)
ax[1].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--')


# AXIS 2: 
filt_df = chrX_hapsim_df[chrX_hapsim_df.scenario == 'IBDNeUK10KXchr']
ax[2].errorbar(rand_jitter(filt_df.ta), filt_df.scale_marginal,
               yerr=2*filt_df.se_scale_marginal, capsize=2, linestyle='none', color='blue', alpha=0.3)

z = lowess(filt_df.scale_marginal, rand_jitter(filt_df.ta), frac=1/3)
ax[2].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--')


# Debox all of the axes
debox_all(ax);

# Setting some labeling
ax[0].set_ylim(200,1000)
ax[0].set_xlim(0,500)
label_multipanel(ax, ['A','B','C'], fontsize=14, fontweight='bold', xoff=-0.06, yoff=1.2, va='top', ha='right')
ax[1].set_title(r'Tennessen et al (2012)', fontsize=12)
ax[2].set_title(r'IBDNe UK10K (2015)', fontsize=12)
ax[0].set_title(r'1240K Ancient Data', fontsize=12)

ax[0].set_ylabel(r'$\hat{\lambda}$ (per Morgan)', fontsize=14)
ax[1].set_xlabel('Sample age (Generations)', fontsize=14);
plt.tight_layout()
plt.savefig(main_figdir + 'copying_rate_demography_chrX_experiment_panel_%s_%d.pdf' % (panel,nsnps), dpi=300, bbox_inches='tight')

Comparing Different reference panels for estimation

In [15]:
# Running the filtering step
res_df = quality_control_panel_select(total_raw_stats_df);
euro_ceu_ancients = spatial_restrict(res_df)

res1_df = quality_control_panel_select(total_raw_stats_df, panel='eur');
euro_eur_ancients = spatial_restrict(res1_df)

res2_df = quality_control_panel_select(total_raw_stats_df, panel='fullkg');
euro_fullkg_ancients = spatial_restrict(res2_df)
In [16]:
fig, axs = plt.subplots(1,3, figsize=(7,2.5), sharex=True, sharey=True)


axs[0].errorbar(euro_ceu_ancients.AvgDate_calBP.values,
                euro_ceu_ancients.scale_marginal.values,
                yerr=2*euro_ceu_ancients.se_scale_marginal_fd.values,
                marker='o', markersize=2, linestyle='none', rasterized=True)

axs[1].errorbar(euro_eur_ancients.AvgDate_calBP.values,
                euro_eur_ancients.scale_marginal.values,
                yerr=2*euro_eur_ancients.se_scale_marginal_fd.values,
                marker='o', markersize=2, linestyle='none', rasterized=True)

axs[2].errorbar(euro_fullkg_ancients.AvgDate_calBP.values,
                euro_fullkg_ancients.scale_marginal.values,
                yerr=2*euro_fullkg_ancients.se_scale_marginal_fd.values,
                marker='o', markersize=2, linestyle='none')


axs[0].set_ylim(100,1200)
axs[0].set_xlim(0,1.5e4)
axs[0].set_ylabel(r'$\hat{\lambda}$ (per Morgan)', fontsize=14)
axs[1].set_xlabel(r'Sample age (years)', fontsize=14)


# Setting titles
axs[0].set_title(r'CEU', fontsize=14)
axs[1].set_title(r'EUR', fontsize=14)
axs[2].set_title(r'Full-1KG', fontsize=14)


label_multipanel(axs, ['A','B','C'], fontsize=14,
                 fontweight='bold', xoff=-0.06, yoff=1.2, va='top', ha='right')

debox_all(axs);
plt.savefig(supp_figdir + 'compare_panels_1500_central_europe.pdf', bbox_inches='tight')