Estimating migration surfaces

This tutorial demonstrates how to estimate a spatially varying migration rate surface from coalescence time data, similar to the EEMS method (Petkova, Novembre & Stephens, 2016) but using phasic’s exact phase-type distributions and SVGD inference.

The approach parameterizes the migration surface using radial basis functions (RBFs):

m_{i \to j} = \exp\!\Bigl(-\sum_k \theta_k \, \phi_k(\bar{x}_{ij})\Bigr)

where \phi_k are RBFs centered at K locations across the landscape, and \bar{x}_{ij} is the midpoint of the edge between cells i and j. Positive \theta_k values create barriers (low migration), negative values create corridors (high migration), and \theta_k = 0 gives a baseline rate of 1.

We simulate coalescence times from a known resistance surface with a barrier, then recover it with SVGD.

from phasic import (
    Graph, HexGrid, StateIndexer, Property, GaussPrior
) # ALWAYS import phasic first to set jax backend correctly
import numpy as np
import jax.numpy as jnp
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import RegularPolygon
from matplotlib.collections import PatchCollection
import seaborn as sns
%config InlineBackend.figure_format = 'svg'
from vscodenb import set_vscode_theme

np.random.seed(42)
set_vscode_theme()
sns.set_palette('tab10')

Hex grid over Africa

url = 'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'
world = gpd.read_file(url)
africa = world[world['CONTINENT'] == 'Africa'].dissolve()
africa.to_file('/tmp/africa.shp')

grid = HexGrid.from_shapefile('/tmp/africa.shp', hex_size=5)
valid_cells = grid.valid_cells()
print(f'Grid: {grid.n_rows}x{grid.n_cols}, valid cells: {len(valid_cells)}')

RBF basis functions

We place RBF centers at a subset of cell locations across the continent. Each basis function \phi_k(x) = \exp(-\|x - c_k\|^2 / 2\sigma^2) is a Gaussian bump centered at c_k.

# Place RBF centers at every 4th valid cell
rbf_center_cells = valid_cells[::4]
rbf_centers = [grid.rowcol_to_coords(r, c) for r, c in rbf_center_cells]
K = len(rbf_centers)
rbf_sigma = 10.0  # Bandwidth in degrees

print(f'Number of RBF centers (parameters): {K}')
print(f'RBF bandwidth: {rbf_sigma}°')

def plot_grid_values(ax, grid, values, title, cmap='viridis', vmin=None, vmax=None):
    """Plot values on a hex grid."""
    africa.boundary.plot(ax=ax, color='black', linewidth=1)
    patches, vals = [], []
    for r, c in grid.valid_cells():
        x, y = grid.rowcol_to_coords(r, c)
        patches.append(RegularPolygon(
            (x, y), numVertices=6, radius=grid.hex_size, orientation=0
        ))
        vals.append(values[r, c])
    pc = PatchCollection(patches, cmap=cmap, linewidths=0.3, edgecolors='black')
    pc.set_array(np.array(vals))
    if vmin is not None: pc.set_clim(vmin, vmax)
    ax.add_collection(pc)
    ax.set_aspect('equal')
    ax.set_title(title)
    return plt.colorbar(pc, ax=ax, shrink=0.7)

# Visualize the basis functions
fig, axes = plt.subplots(2, (K + 1) // 2, figsize=(3 * ((K + 1) // 2), 6))
axes = axes.flatten()

for k in range(K):
    cx, cy = rbf_centers[k]
    phi = np.zeros((grid.n_rows, grid.n_cols))
    for r, c in valid_cells:
        x, y = grid.rowcol_to_coords(r, c)
        phi[r, c] = np.exp(-((x - cx)**2 + (y - cy)**2) / (2 * rbf_sigma**2))
    plot_grid_values(axes[k], grid, phi, f'$\\phi_{{{k+1}}}$', vmin=0, vmax=1)
    axes[k].plot(cx, cy, 'r+', markersize=10, markeredgewidth=2)

for k in range(K, len(axes)):
    axes[k].set_visible(False)

plt.tight_layout() ;

True resistance surface

We define a “true” parameter vector \theta with a barrier in the Sahara region (northern Africa). RBF centers with latitude above 20°N get positive \theta_k values, creating low migration rates. Centers in sub-Saharan Africa get \theta_k \approx 0, leaving migration at the baseline rate.

# Define true theta: barrier where RBF centers are in northern Africa
true_theta = np.zeros(K)
for k, (cx, cy) in enumerate(rbf_centers):
    if cy > 18:  # Sahara region
        true_theta[k] = 3.0

print(f'True theta: {true_theta}')
print(f'Barrier centers: {[k for k in range(K) if true_theta[k] > 0]}')

def compute_migration_surface(theta, grid, rbf_centers, rbf_sigma):
    """Compute migration rate at each cell: m(x) = exp(-sum_k theta_k phi_k(x))."""
    m = np.zeros((grid.n_rows, grid.n_cols))
    for r, c in grid.valid_cells():
        x, y = grid.rowcol_to_coords(r, c)
        resistance = sum(
            theta[k] * np.exp(-((x - cx)**2 + (y - cy)**2) / (2 * rbf_sigma**2))
            for k, (cx, cy) in enumerate(rbf_centers)
        )
        m[r, c] = np.exp(-resistance)
    return m

true_surface = compute_migration_surface(true_theta, grid, rbf_centers, rbf_sigma)

fig, ax = plt.subplots(figsize=(5, 5))
plot_grid_values(ax, grid, true_surface, 'True migration rate surface', vmin=0, vmax=1) ;

Building the parameterized graph

We build a two-lineage spatial coalescent where migration edge weights are parameterized by RBF coefficients. The graph uses manual construction because migration edges (parameterized) and coalescence edges (constant) must both be parameterized — we use zero coefficients for coalescence so that exp(-dot([0,...,0], theta)) = 1 regardless of \theta.

indexer = StateIndexer(
    cell=grid.properties() + [Property('lineage', min_value=1, max_value=2)]
)

def compute_rbf_coeffs(r1, c1, r2, c2):
    """RBF coefficient vector for the edge midpoint between two cells."""
    x1, y1 = grid.rowcol_to_coords(r1, c1)
    x2, y2 = grid.rowcol_to_coords(r2, c2)
    mx, my = (x1 + x2) / 2, (y1 + y2) / 2
    return [np.exp(-((mx - cx)**2 + (my - cy)**2) / (2 * rbf_sigma**2))
            for cx, cy in rbf_centers]

# Sampling locations
west_cell = grid.coords_to_rowcol(-5, 10)
east_cell = grid.coords_to_rowcol(35, 0)
print(f'West Africa: cell {west_cell}')
print(f'East Africa: cell {east_cell}')

%%time

# Build graph with manual construction
graph = Graph(indexer.state_length)
sv = graph.starting_vertex()

# Initial state: lineage 1 in west, lineage 2 in east
initial = np.zeros(indexer.state_length, dtype=int)
initial[indexer.cell.p2i(row=west_cell[0], col=west_cell[1], lineage=1)] = 1
initial[indexer.cell.p2i(row=east_cell[0], col=east_cell[1], lineage=2)] = 1

v_init = graph.find_or_create_vertex(list(initial))
sv.add_edge(v_init, 1.0)

# Absorbing state and zero coefficients for constant-rate edges
zero_state = [0] * indexer.state_length
v_absorb = graph.find_or_create_vertex(zero_state)
zero_coeffs = [0.0] * K

# BFS: visit all reachable states
idx = 1
while idx < graph.vertices_length():
    v = graph.vertex_at(idx)
    state = np.array(v.state())

    # Find lineage positions
    pos = {}
    for i in range(indexer.cell.state_length):
        if state[i] == 1:
            props = indexer.cell.i2p(i)
            pos[props.lineage] = (props.row, props.col)

    if len(pos) == 2:
        # Coalescence: both lineages in same cell -> absorb at rate 1
        if pos[1] == pos[2]:
            v.add_edge(v_absorb, zero_coeffs)

        # Migration: each lineage moves to a neighbor
        for lin_id in [1, 2]:
            r, c = pos[lin_id]
            for nr, nc in grid.neighbors(r, c):
                child = state.copy()
                child[indexer.cell.p2i(row=r, col=c, lineage=lin_id)] = 0
                child[indexer.cell.p2i(row=nr, col=nc, lineage=lin_id)] = 1
                v_child = graph.find_or_create_vertex(list(child))
                v.add_edge(v_child, compute_rbf_coeffs(r, c, nr, nc))

    idx += 1

print(f'Graph vertices: {graph.vertices_length()}')
print(f'Parameters: {K}')

Weight callback

The callback computes migration rates from the resistance parameterization. For each edge, the stored coefficients are the RBF values at the edge midpoint, and the weight is exp(-dot(coefficients, theta)).

def resistance_weight(theta, coeffs):
    """Migration rate = exp(-resistance) where resistance = dot(coeffs, theta)."""
    return float(np.exp(-np.dot(coeffs, theta)))

Simulating data

We apply the true parameters to simulate coalescence times between the West and East Africa lineage pair.

# Apply true theta via callback and sample
graph.update_weights(true_theta, callback=resistance_weight)

E_true = graph.expectation()
print(f'Expected coalescence time (with barrier): {E_true:.1f}')

# Compare with no barrier
graph.update_weights(np.zeros(K), callback=resistance_weight)
E_no_barrier = graph.expectation()
print(f'Expected coalescence time (no barrier): {E_no_barrier:.1f}')
print(f'Barrier increases coalescence time by {E_true / E_no_barrier:.1f}x')

graph.update_weights(true_theta, callback=resistance_weight)
observations = graph.sample(1000)

fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(observations, bins=50, density=True, alpha=0.6)
ax.axvline(np.mean(observations), color='red', linestyle='--', label=f'Mean = {np.mean(observations):.1f}')
ax.set_xlabel('Coalescence time')
ax.set_ylabel('Density')
ax.legend() ;

SVGD inference

We set graph.weight_callback so that the SVGD pipeline uses the resistance parameterization. The prior is a Gaussian centered at 0 (no barrier), which encodes the null hypothesis of uniform migration.

graph.weight_callback = resistance_weight

prior = [GaussPrior(mean=0, std=3)] * K

svgd = graph.svgd(
    observations,
    prior=prior,
    n_particles=30,
    n_iterations=100,
    progress=True,
)

svgd.summary()

svgd.plot_convergence()

Comparing true and estimated surfaces

estimated_theta = svgd.theta_mean

print('Parameter estimates vs truth:')
for k in range(K):
    cx, cy = rbf_centers[k]
    print(f'  theta[{k}]: estimated={estimated_theta[k]:+.2f}, true={true_theta[k]:+.2f}  (center at {cx:.0f}°E, {cy:.0f}°N)')

estimated_surface = compute_migration_surface(estimated_theta, grid, rbf_centers, rbf_sigma)

vmin = min(true_surface[true_surface > 0].min(), estimated_surface[estimated_surface > 0].min())
vmax = max(true_surface.max(), estimated_surface.max())

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
plot_grid_values(ax1, grid, true_surface, 'True migration surface', vmin=vmin, vmax=vmax)
plot_grid_values(ax2, grid, estimated_surface, 'Estimated migration surface', vmin=vmin, vmax=vmax)
plt.tight_layout() ;

Summary

This tutorial demonstrated:

RBF-parameterized migration surfaces — a smooth, low-dimensional parameterization of spatially varying migration rates using radial basis functions.
Log-linear resistance model — migration rates as \exp(-\text{resistance}) where resistance is a weighted sum of basis functions. Barriers have positive weights, corridors negative.
Weight callbacks — graph.weight_callback enables arbitrary non-linear weight functions in the JAX/SVGD pipeline.
Simulation-recovery — simulating from a known surface and recovering it with SVGD.

Compared to EEMS (Petkova, Novembre & Stephens, 2016):

Exact distributions: phasic computes exact coalescence time distributions, not pairwise distance summaries.
Gradient-based inference: SVGD is faster than reversible-jump MCMC.
Smooth surfaces: RBFs produce smooth migration surfaces without the discontinuities of Voronoi tessellations.
Fewer parameters: K \approx 10 RBF weights vs hundreds of Voronoi tiles.