This notebook explores a verification of our Li-Stephens Model implementation. Specifically we focus on: (1) estimation of the haplotype copying jump rate when the error rate is given and (2) joint estimation of the jump-rate and the error probability via numerical approximation.
For the joint estimation of the jump-rate and the error probability we are able to obtain the standard errors of each estimate as well by taking the square root of the diagonal of the Hessian matrix. In the case of the marginal jump rate estimation the error probability was fixed at $\epsilon = 10^{-2}$.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import msprime as msp
plt.rcParams['pdf.fonttype'] = 3
import sys
sys.path.append('../../src/')
from plot_utils import *
%matplotlib inline
# Reading in the CSV of results from our various simulations ...
results_csv = '../../results/ls_verify/ls_simulations_100.csv'
ls_verify_df = pd.read_csv(results_csv)
ls_verify_df.head()
fig, ax = plt.subplots(2,1,figsize=(4,4), sharex=True, sharey=True)
ax[0].scatter(ls_verify_df['scales_true'].values,
ls_verify_df['scales_marg_hat'].values, s=10);
ax[1].scatter(ls_verify_df['scales_true'].values,
ls_verify_df['scales_jt_hat'].values, s=10);
plot_yx(ax[0], linestyle='--', color='black')
plot_yx(ax[1], linestyle='--', color='black')
ax[1].set_xlabel(r'$\lambda_{true}$', fontsize=14);
ax[1].set_ylabel(r'$\hat{\lambda}_{jt}$', fontsize=14);
ax[0].set_ylabel(r'$\hat{\lambda}_{marginal}$', fontsize=14);
debox(ax[0]); debox(ax[1]);
fig.tight_layout();
fig, ax = plt.subplots(1,1,figsize=(3,3))
ax.scatter(ls_verify_df['scales_marg_hat'],
ls_verify_df['scales_jt_hat'], s=10)
debox(ax);
plot_yx(ax, color='black', linestyle='--');
ax.set_ylabel(r'$\hat{\lambda}_{jt}$', fontsize=14);
ax.set_xlabel(r'$\hat{\lambda}_{marginal}$', fontsize=14);
fig, ax = plt.subplots(1,1,figsize=(3,3))
ax.scatter(ls_verify_df['scales_jt_hat'],
ls_verify_df['eps_jt_hat'], s=10)
debox(ax);
# plot_yx(ax, color='black', linestyle='--');
ax.set_xlabel(r'$\hat{\lambda}_{jt}$', fontsize=14);
ax.set_ylabel(r'$\hat{\epsilon}_{jt}$', fontsize=14);
from li_stephens import LiStephensHMM
from tqdm import tqdm
mut_rate = 1e-8
rec_rate = 2e-8
length = 10e6
seed = 24
n = 50
ts = msp.simulate(Ne=1e4, sample_size=n,
mutation_rate=mut_rate,
recombination_rate=rec_rate, length=length, random_seed=seed)
# Generating the haplotype reference panel...
geno = ts.genotype_matrix().T
phys_pos = np.array([v.position for v in ts.variants()])
rec_pos = phys_pos * rec_rate
ac = np.sum(geno, axis=0)
mac = np.minimum(ac, n - ac)
maf = mac / n
idx = np.where(maf > 0.02)[0]
print(idx.size)
ls_model = LiStephensHMM(haps=geno[:,idx], positions=rec_pos[idx])
test_hap,_ = ls_model._sim_haplotype(scale=1.5e2, eps=1e-2, seed=42)
x = 20
M = np.zeros(shape=(x,x))
scales=np.linspace(5, 300, x)
eps = np.linspace(5e-3, 5e-2, x)
for i,s in tqdm(enumerate(scales)):
for j,e in enumerate(eps):
M[i,j]=ls_model._negative_logll(test_hap, scale=s, eps=e)
bounds = [(10., 1e4), (1e-3, 0.2)]
start_scale = np.random.uniform(low=10, high=1e4, size=3)
start_eps = np.random.uniform(low=1e-3, high=0.2, size=3)
%time res1 = ls_model._infer_params(test_hap, x0=[start_scale[0], start_eps[0]], bounds=bounds)
%time res2 = ls_model._infer_params(test_hap, x0=[start_scale[1], start_eps[1]], bounds=bounds)
%time res3 = ls_model._infer_params(test_hap, x0=[start_scale[2], start_eps[2]], bounds=bounds)
fig, ax = plt.subplots(1,1,figsize=(5,4))
im = ax.imshow(M, extent=[5e-3, 5e-2, 5, 300], interpolation='Gaussian', origin='lower', aspect='auto')
ax.scatter(1e-2, 1.5e2, marker='x', color='red')
ax.scatter(res1.x[1], res1.x[0], marker='+', color='green')
ax.scatter(res2.x[1], res2.x[0], marker='+', color='green')
ax.scatter(res3.x[1], res3.x[0], marker='+', color='green')
# ax.axvline(x=1e-2, color='red')
ax.set_ylabel(r'$\lambda$', fontsize=14)
ax.set_xlabel(r'$\epsilon$', fontsize=14)
fig.colorbar(im)