# Exploring the covariance in segregating sites
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
import warnings
warnings.filterwarnings("ignore")
import re
import os
import sys
sys.path.append('../../src/')
from plot_utils import *
from tqdm import tqdm
import cartopy
import cartopy.mpl.geoaxes
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy.stats import linregress
from geopy.distance import geodesic
from patsy import dmatrices
from statsmodels.api import OLS
%load_ext autoreload
%autoreload 2
# NOTE : this should be from the plot utils
plt.rcParams['font.sans-serif'] = "Arial"
plt.rcParams['figure.facecolor'] = "w"
plt.rcParams['figure.autolayout'] = True
plt.rcParams['pdf.fonttype'] = 3
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.ticker as mticker
main_figdir = '../../plots/hap_copying/jump_rate_chrXdata/'
supp_figdir = '../../plots/supp_figs/hap_copying/'
os.makedirs(main_figdir, exist_ok=True)
os.makedirs(supp_figdir, exist_ok=True)
%matplotlib inline
total_raw_stats_df = pd.read_csv('../../results/hap_copying/chrX_male_analysis/mle_est_real_1kg/chrX_filt.total.ls_stats.merged.csv')
total_raw_stats_df.head(10)
def quality_control_panel_select(df, panel='ceu', nsnps=8000, dups=True):
"""Filter the data frame by the panel"""
pass_indices = np.array(['PASS' in df['Assessment'].values[i] for i in range(df.shape[0])])
pass_indivs_raw = df.iloc[np.where(pass_indices)[0]]
pass_indivs_filt_nsnps = pass_indivs_raw[(pass_indivs_raw['nsnps'] >= nsnps)]
pass_indivs = pass_indivs_filt_nsnps[pass_indivs_filt_nsnps.panelID == panel]
if dups:
ids = []
for i in pass_indivs.indivID.unique():
ids.append(re.split('_|\.SG|\.DG', i)[0])
pass_indivs['true_id'] = ids
pass_indivs = pass_indivs.sort_values('Cov_Autosomes', ascending=False).drop_duplicates(['true_id'])
return(pass_indivs)
def spatial_restrict(df, x=48., y=6., km_thresh=1500., age_thresh=1.5e4):
"""Restrict to samples with a given Lat/Long within a distance from a centroid."""
global_lats = df.Lat
global_longs = df.Long
idx = (global_lats != '..') & (global_longs != '..')
test_longs = global_longs[idx].values.astype(float)
test_lats = global_lats[idx].values.astype(float)
assert(test_longs.size == test_lats.size)
# Setting coordinates for the middle of Europe
centroid_coords = (x, y)
geo_dist_centroid = np.zeros(test_longs.size)
for i in range(test_longs.size):
geo_dist_centroid[i] = geodesic(centroid_coords, (test_lats[i], test_longs[i])).kilometers
# Putting in the distances from centroid here ...
dist = np.zeros(df.shape[0])
dist[idx] = geo_dist_centroid
df['centroid_dist'] = dist
filt_df = df[(df.centroid_dist != 0) & (df.centroid_dist <= km_thresh) & (df.AvgDate_calBP <= 1.5e4)]
filt_df = filt_df.astype({'Lat': 'float64', 'Long': 'float64', 'AvgDate_calBP': 'float64', 'Cov_Autosomes': 'float64'})
return(filt_df)
# Do a little filtering on the number of snps per kb
snps_total = 49704
tot_len = 155234382 - 990180
nsnps = 8000
snps_per_kb = nsnps / (tot_len / 1e3)
# print(nsnps, snps_per_kb, snps_per_kb*25)
# options for panel: (ceu|eur|fullkg)
panel = 'ceu'
# Make sure the individuals "PASS" according to the Reich-Lab assessment
pass_indivs = quality_control_panel_select(total_raw_stats_df, panel=panel, nsnps=nsnps);
# NOTE: should have a way to filter duplicate samples (and only take the higher coverage sample ... )
pass_indivs.head()
print(nsnps, snps_per_kb, snps_per_kb*25, pass_indivs.shape[0])
cov_auto = pass_indivs['Cov_Autosomes']
print(np.nanmean(cov_auto), np.nanstd(cov_auto), np.median(cov_auto), cov_auto.size)
fig, ax = plt.subplots(1,1,figsize=(3,3))
ax.hist(cov_auto, bins=100);
debox(ax);
ax.set_xscale('log')
ax.set_xlabel(r'Autosomal Coverage', fontsize=14);
ax.set_ylabel(r'Number of Samples', fontsize=14);
mean_age = pass_indivs.groupby('Country').mean()['AvgDate_calBP']
std_age = pass_indivs.groupby('Country').std()['AvgDate_calBP']
min_age = pass_indivs.groupby('Country')['AvgDate_calBP'].min()
max_age = pass_indivs.groupby('Country')['AvgDate_calBP'].max()
countries, cnts = np.unique(pass_indivs.Country, return_counts=True)
# Return countries in reverse chronological order
countries_srt = countries[np.argsort(cnts)[::-1]]
cnts_srt = np.sort(cnts)[::-1]
# Make a plot of this information
fig, ax = plt.subplots(2,1,figsize=(8,5))
ax[1].bar(countries_srt, cnts_srt, color='blue')
for tick in ax[1].get_xticklabels():
tick.set_rotation(90)
tick.set_fontsize(8);
delta=0.35
i = 0
for c in countries_srt:
cur_min_age = min_age[c]
cur_max_age = max_age[c]
ax[0].plot([i,i], [min_age[c]/1e3, max_age[c]/1e3], color='blue', solid_capstyle='butt')
ax[0].plot([i-delta,i+delta], [min_age[c]/1e3, min_age[c]/1e3], color='blue', solid_capstyle='butt')
ax[0].plot([i-delta,i+delta], [max_age[c]/1e3, max_age[c]/1e3], color='blue', solid_capstyle='butt')
i += 1
ax[0].set_xticks([])
ax[0].set_ylabel(r'Age Range (kya)', fontsize=12)
ax[1].set_ylabel(r'Number of samples', fontsize=12)
fig.text(0.5, 0.00, 'Country', fontsize=14)
debox(ax[1]); debox(ax[0]);
for i, label in enumerate(('A', 'B')):
ax[i].text(-0.05, 1.15, label, fontsize=16,
fontweight='bold', va='top', ha='right', transform=ax[i].transAxes);
plt.tight_layout()
global_lats = pass_indivs.Lat
global_longs = pass_indivs.Long
global_lambda = pass_indivs.scale_marginal
global_lambda_se = pass_indivs.se_scale_marginal_fd
global_age = pass_indivs.AvgDate_calBP
# Some filters here with the distance here
idx = (global_lats != '..') & (global_longs != '..') & (global_lambda < 8e2)
fig = plt.figure(figsize=(6,3))
# ax = fig.add_subplot(1,1,1, projection=ccrs.PlateCarree())
ax = plt.axes(projection=ccrs.Robinson())
# ax.set_global()
ax.coastlines(lw=0.5)
im = ax.scatter(global_longs[idx].values.astype(float),
global_lats[idx].values.astype(float),
c=global_lambda[idx].values.astype(float),
transform=ccrs.PlateCarree(), s=10, alpha=0.75)
ax.set_aspect('auto')
ax.outline_patch.set_visible(False)
fig.canvas.draw()
# ax.set_title(r'Geographic Variation in $\hat{\lambda}$', fontsize=14)
plt.colorbar(im, label=r'$\hat{\lambda}$')
plt.tight_layout()
plt.savefig(supp_figdir + 'eurasia_panel_%s_global_lambda_%d.pdf' % (panel, nsnps), bbox_inches='tight')
# Calculate Distance from central europe (ideally add as another annotation)
test_longs = global_longs[idx].values.astype(float)
test_lats = global_lats[idx].values.astype(float)
test_lambda = global_lambda[idx].values.astype(float)
test_lambda_se = global_lambda_se[idx].values.astype(float)
test_ages = global_age[idx].values.astype(float)
assert(test_longs.size == test_lats.size)
assert(test_longs.size == test_lambda.size)
# Setting coordinates for the middle of Europe
ceu_coords = (48., 6.)
geo_dist_ceu = np.zeros(test_longs.size)
for i in tqdm(range(test_longs.size)):
geo_dist_ceu[i] = geodesic(ceu_coords, (test_lats[i], test_longs[i])).kilometers
# Putting in the distances from CEU coordinates here ...
dist = np.zeros(pass_indivs.shape[0])
dist[idx] = geo_dist_ceu
pass_indivs['CEU_dist'] = dist
#Do the slight filtering here ...
km_thresh = 1500
gen_time = 30.
test_df = pass_indivs[(pass_indivs.CEU_dist != 0) & (pass_indivs.CEU_dist <= km_thresh) & (pass_indivs.AvgDate_calBP <= 1.5e4)]
test_df = test_df.astype({'Lat': 'float64', 'Long': 'float64', 'AvgDate_calBP': 'float64', 'Cov_Autosomes': 'float64'})
test_df['AvgDateGenerations'] = test_df.AvgDate_calBP.values / gen_time
print(test_df.shape[0])
fig = plt.figure(figsize=(7.5,3.5))
gs = fig.add_gridspec(1, 6)
# age_gen = test_df.AvgDateGenerations.values.astype(float)
age_gen = test_df.AvgDateGenerations.values.astype(float)
age_years = test_df.AvgDate_calBP.values.astype(float)
ax1 = fig.add_subplot(gs[:, :3], projection=ccrs.PlateCarree())
ax1.coastlines(resolution='auto', color='k')
im = ax1.scatter(test_df.Long.values.astype(float),
test_df.Lat.values.astype(float),
transform=ccrs.PlateCarree(), c=age_years, s=10, alpha=0.75, rasterized=True)
fig.colorbar(im, ax=ax1, orientation='vertical', pad=0.01, fraction=0.2, label=r'Sample age (Years)')
ax1.set_aspect('auto')
ax1.outline_patch.set_visible(False)
debox(ax1);
# Linear Model to check for effect of age
y, X = dmatrices('scale_marginal ~ AvgDateGenerations + Lat + Long + Lat:Long',
data=test_df,
return_type='dataframe')
res = sm.OLS(y,X).fit()
# print(res.summary())
intercept = res.params['Intercept']
beta_age = res.params['AvgDateGenerations']
p_value = res.pvalues['AvgDateGenerations']
marginal_lambda = test_df.scale_marginal.values
marginal_lambda_se = test_df.se_marginal.values
idx = (marginal_lambda_se < 100)
ax3 = fig.add_subplot(gs[:, 3:])
im2 = ax3.errorbar(rand_jitter(age_years[idx], scale=1e-3), marginal_lambda[idx],
yerr=2*marginal_lambda_se[idx], color='blue', alpha=0.5,
linestyle='none', markersize=2, marker='o', capsize=2)
# Print out the results of the linear regression
print(intercept, beta_age, p_value)
ax3.set_ylabel(r'$\hat{\lambda}$ (Morgans)', fontsize=14)
ax3.set_xlabel(r'Sample age (Years)', fontsize=14)
debox(ax3);
ax1.text(-0.05, 1.1, 'A', fontsize=14, fontweight='bold', va='top', ha='right', transform=ax1.transAxes);
ax3.text(-0.05, 1.1, 'B', fontsize=14, fontweight='bold', va='top', ha='right', transform=ax3.transAxes);
fig.canvas.draw()
plt.tight_layout()
# plt.savefig(supp_figdir + 'geography_dist_CEU_thresh_%d_%s_%d.pdf' % (km_thresh, panel, nsnps), bbox_inches='tight', dpi=300)
fig, ax = plt.subplots(2,1,figsize=(4,4), sharex=True)
ax[0].scatter(test_df.AvgDate_calBP.values.astype(float), test_df.eps_jt.values, s=10 )
ax[1].scatter(test_df.AvgDate_calBP.values.astype(float), test_df.scale_jt.values, s=10)
debox_all(ax);
ax[0].set_ylabel(r'$\epsilon$')
ax[1].set_ylabel(r'$\lambda$')
ax[1].set_xlabel(r'Age (years)')
We essentially want to compare realistically simulated real data with the 1240K real data. Here we have focused on using the same panel size as with the real data ($K = 49$) and using variants ascertained to have $MAF > 5\%$
# Reading in the CSV with all of the simulation results
chrX_hapsim_df = pd.read_csv('../../results/hap_copying/simulations/jump_rate_est_sims_ceu_real_chrX.csv')
chrX_hapsim_df.head()
import cartopy
import cartopy.mpl.geoaxes
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
fig, ax = plt.subplots(1,3, figsize=(7,2.5), sharex=True, sharey=True)
# Axis 1: The real data
axins = inset_axes(ax[0], width="60%", height="60%", loc="upper right", borderpad=0.0,
axes_class=cartopy.mpl.geoaxes.GeoAxes,
axes_kwargs=dict(map_projection=cartopy.crs.PlateCarree()))
axins.coastlines(resolution='auto', lw=0.5, color='k')
im = axins.scatter(test_df.Long.values.astype(float),
test_df.Lat.values.astype(float),
transform=ccrs.PlateCarree(), c=age_years, s=5, alpha=0.75)
axins.spines['geo'].set_edgecolor('white')
im2 = ax[0].errorbar(rand_jitter(age_gen, scale=1e-3),
marginal_lambda,
yerr=2*marginal_lambda_se,
color='blue', linestyle='none', capsize=2, alpha=0.3)
z = lowess(marginal_lambda, rand_jitter(age_gen), frac=1/3)
ax[0].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--', zorder=100)
# AXIS 1 : Tennessen et al model
filt_df = chrX_hapsim_df[chrX_hapsim_df.scenario == 'TennessenEuropeanXchr']
ax[1].errorbar(rand_jitter(filt_df.ta), filt_df.scale_marginal,
yerr=2*filt_df.se_scale_marginal, capsize=2, linestyle='none', color='blue', alpha=0.3)
z = lowess(filt_df.scale_marginal, rand_jitter(filt_df.ta), frac=1/3)
ax[1].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--')
# AXIS 2:
filt_df = chrX_hapsim_df[chrX_hapsim_df.scenario == 'IBDNeUK10KXchr']
ax[2].errorbar(rand_jitter(filt_df.ta), filt_df.scale_marginal,
yerr=2*filt_df.se_scale_marginal, capsize=2, linestyle='none', color='blue', alpha=0.3)
z = lowess(filt_df.scale_marginal, rand_jitter(filt_df.ta), frac=1/3)
ax[2].plot(z[:,0], z[:,1], color='orange', lw=3, linestyle='--')
# Debox all of the axes
debox_all(ax);
# Setting some labeling
ax[0].set_ylim(200,1000)
ax[0].set_xlim(0,500)
label_multipanel(ax, ['A','B','C'], fontsize=14, fontweight='bold', xoff=-0.06, yoff=1.2, va='top', ha='right')
ax[1].set_title(r'Tennessen et al (2012)', fontsize=12)
ax[2].set_title(r'IBDNe UK10K (2015)', fontsize=12)
ax[0].set_title(r'1240K Ancient Data', fontsize=12)
ax[0].set_ylabel(r'$\hat{\lambda}$ (per Morgan)', fontsize=14)
ax[1].set_xlabel('Sample age (Generations)', fontsize=14);
plt.tight_layout()
plt.savefig(main_figdir + 'copying_rate_demography_chrX_experiment_panel_%s_%d.pdf' % (panel,nsnps), dpi=300, bbox_inches='tight')
# Running the filtering step
res_df = quality_control_panel_select(total_raw_stats_df);
euro_ceu_ancients = spatial_restrict(res_df)
res1_df = quality_control_panel_select(total_raw_stats_df, panel='eur');
euro_eur_ancients = spatial_restrict(res1_df)
res2_df = quality_control_panel_select(total_raw_stats_df, panel='fullkg');
euro_fullkg_ancients = spatial_restrict(res2_df)
fig, axs = plt.subplots(1,3, figsize=(7,2.5), sharex=True, sharey=True)
axs[0].errorbar(euro_ceu_ancients.AvgDate_calBP.values,
euro_ceu_ancients.scale_marginal.values,
yerr=2*euro_ceu_ancients.se_scale_marginal_fd.values,
marker='o', markersize=2, linestyle='none', rasterized=True)
axs[1].errorbar(euro_eur_ancients.AvgDate_calBP.values,
euro_eur_ancients.scale_marginal.values,
yerr=2*euro_eur_ancients.se_scale_marginal_fd.values,
marker='o', markersize=2, linestyle='none', rasterized=True)
axs[2].errorbar(euro_fullkg_ancients.AvgDate_calBP.values,
euro_fullkg_ancients.scale_marginal.values,
yerr=2*euro_fullkg_ancients.se_scale_marginal_fd.values,
marker='o', markersize=2, linestyle='none')
axs[0].set_ylim(100,1200)
axs[0].set_xlim(0,1.5e4)
axs[0].set_ylabel(r'$\hat{\lambda}$ (per Morgan)', fontsize=14)
axs[1].set_xlabel(r'Sample age (years)', fontsize=14)
# Setting titles
axs[0].set_title(r'CEU', fontsize=14)
axs[1].set_title(r'EUR', fontsize=14)
axs[2].set_title(r'Full-1KG', fontsize=14)
label_multipanel(axs, ['A','B','C'], fontsize=14,
fontweight='bold', xoff=-0.06, yoff=1.2, va='top', ha='right')
debox_all(axs);
plt.savefig(supp_figdir + 'compare_panels_1500_central_europe.pdf', bbox_inches='tight')