Overdispersion 1

Poisson model alternative for counts with excess variance

bayes
stan
counts
glm
Author

maj

Published

July 9, 2025

Modified

July 10, 2025

Setup and dependencies
library(data.table)

library(ggplot2)
library(ggh4x)
library(gt)
suppressPackageStartupMessages(library(cmdstanr))
suppressPackageStartupMessages(library(brms))
suppressPackageStartupMessages(library(mgcv))

# devtools::install_github("thomasp85/patchwork")
library(patchwork)



toks <- unlist(tstrsplit(getwd(), "/")) 
if(toks[length(toks)] == "maj-biostat.github.io"){
  prefix_stan <- "./stan"
} else {
  prefix_stan <- "../stan"
}

Introduction

For Poisson, if \(y\) is the counts, we have probability density

\[ \begin{align*} f(y) = \frac{\mu^y \exp(-\mu)}{y!} \end{align*} \]

and \(\mu\) is the expected number of occurrences, which also equals the variance. Sometimes, you think about \(\mu\) as a rate, e.g. average number of car crashes per 1000 population, per 1000 licensed drivers, per 10000 km travelled etc. In the latter view, rate is in terms of an exposure.

If \(y_i\) denote the number of events over exposure \(n_i\) for the \(i^\text{th}\) covariate pattern, the expected value is

\[ \begin{align*} E[y_i] = \mu_i = n_i \lambda_i \end{align*} \]

Explanatory variables are usually modelled via

\[ \begin{align*} \lambda_i = \exp(X \beta) \end{align*} \]

and so the GLM is

\[ \begin{align*} y_i &\sim \text{Pois}(\mu_i) \\ \mu_i &= n_i \exp(X \beta) \end{align*} \]

the natural link is \(\log\)

\[ \begin{align*} \log(\mu_i) &= \log(n_i) + X \beta \end{align*} \]

The inclusion of \(\log(n_i)\) is the offset, a known constant whereas the \(X\) and \(\beta\) represent the usual covariate pattern and parameters.

When over-dispersion is present (can indicate something is missing from the linear predictor or independence between observations is questionable) we can switch to a Negative binomial distribution for the likelihood. This has a number of parameterisations, one of which uses the mean directly then controls the overdispersion relative to the square of the mean.

\[ \begin{align*} y_i &\sim \text{NegBin}(\mu_i, \phi) \end{align*} \]

with variance

\[ \begin{align*} \text{Var}(y_i) = \mu + \frac{\mu^2}{\phi} \end{align*} \]

i.e. \(\frac{\mu^2}{\phi}\) is the additional variance over that for the Poisson.

I believe this is actually the poisson gamma mixture but it is worth noting that there are other options for incorporating the excess variance. Some of those I have come across include a poisson-lognormal (https://www.sciencedirect.com/science/article/pii/S2001037025000856 and https://solomonkurz.netlify.app/blog/2021-07-12-got-overdispersion-try-observation-level-random-effects-with-the-poisson-lognormal-mixture/) and additive random effects (https://pmc.ncbi.nlm.nih.gov/articles/PMC4194460/).

Overdispersed data

Simulate overdispersed counts associated with 3 vaccine groups and where counts are represented per 35k but our interest is per million.

Simulate overdispersed counts
# we have 20 observations per vax, each having a count and a somewhat distinct 
# covariate pattern (which I am ignoring for now)
n_per_group <- 50
cells_per_well <- 35000

# arbitrary values
true_rates <- c(vaxA = 5, vaxB = 10, vaxC = 15) / cells_per_well  
# overdispersion (smaller => more dispersion)
phi <- 2  

# Simulate data
d_sim <- rbindlist(lapply(names(true_rates), function(vaccine) {
  mu <- true_rates[vaccine] * cells_per_well
  data.table(
    vaccine = vaccine,
    response = rnbinom(n_per_group, mu = mu, size = phi),
    # technically this could be varying for each obs
    cells = cells_per_well
  )
}))
d_sim[, group := as.integer(factor(vaccine))]
d_sim[, log_cells := log(cells)]

head(d_sim)
   vaccine response cells group log_cells
    <char>    <num> <num> <int>     <num>
1:    vaxA        4 35000     1   10.4631
2:    vaxA       12 35000     1   10.4631
3:    vaxA        7 35000     1   10.4631
4:    vaxA        2 35000     1   10.4631
5:    vaxA        4 35000     1   10.4631
6:    vaxA        2 35000     1   10.4631

Stan model implementation

Code
cat(readLines(paste0(prefix_stan, "/negbin-1.stan")), sep = "\n")
data {
  int<lower=1> N;
  int<lower=1> G;
  array[N] int<lower=1, upper=G> group;
  array[N] int<lower=0> y;
  vector[N] log_cells;
}
parameters {
  vector[G] b0;               // log(rate per cell for each group)
  real<lower=0> phi;             // overdispersion parameter
}

model {
  target += normal_lpdf(b0 | 0, 2);
  target += gamma_lpdf(phi | 2, 0.1);
  
  for (i in 1:N){
    target += neg_binomial_2_log_lpmf(y[i] | b0[group[i]] + log_cells[i], phi);
  } 
}

generated quantities {
  vector[N] y_rep;
  vector[G] mu;
  
  for (i in 1:G){
    mu[i] = exp(b0[i]) * pow(10,6);
  }
  for (i in 1:N){
    y_rep[i] = neg_binomial_2_log_rng(b0[group[i]] + log(pow(10,6)), phi);  
  }
  
    
}

Fit with a stan model, compute the expected values per million in the generated quantities block. The reference values are 143, 286, 429 for groups A, B and C.

Fit model with stan
m1 <- cmdstanr::cmdstan_model(paste0(prefix_stan, "/negbin-1.stan"))

# Prepare Stan data
ld <- list(
  N = nrow(d_sim),
  G = length(unique(d_sim$group)),
  group = d_sim$group,
  y = d_sim$response,
  log_cells = d_sim$log_cells
)

f1 <- m1$sample(
  ld, iter_warmup = 1000, iter_sampling = 1000,
  parallel_chains = 1, chains = 1, refresh = 0, show_exceptions = F,
  max_treedepth = 10)
Running MCMC with 1 chain...

Chain 1 finished in 0.2 seconds.
Fit model with stan
# mu is scaled up to per 1 million cells
f1$summary(variables = c("mu"))
# A tibble: 3 × 10
  variable  mean median    sd   mad    q5   q95  rhat ess_bulk ess_tail
  <chr>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 mu[1]     146.   144.  19.0  17.0  120.  178.  1.00    1011.     647.
2 mu[2]     274.   270.  31.2  30.7  227.  326.  1.00    1129.     656.
3 mu[3]     405.   402.  44.4  41.6  337.  488.  1.00    1041.     720.

Represent the posterior view of the means (per million cells). These are the expected counts, i.e. on average (central tendency) we expect blah from vaccine A, B, C.

Posterior means
d_post <- data.table(f1$draws(variables = c("mu"), format = "matrix"))
d_post <- melt(d_post, measure.vars = names(d_post))

d_fig_1 <- copy(d_post)
d_fig_1[, group := gsub("mu\\[", "", variable)]
d_fig_1[, group := as.factor(gsub("\\]", "", group))]

d_fig_2 <- data.table(
  group = factor(1:3),
  tru = true_rates * 1e6
)


ggplot(d_fig_1, aes(x = value, group = group, col = group)) +
  geom_vline(
    data = d_fig_2, 
    aes(xintercept = tru, group = group, col = group),
    lwd = 0.2
  ) +
  geom_density() +
  theme_bw()
Figure 1: Posterior means per million cells by group
Posterior predictive implementation
d_post <- data.table(f1$draws(variables = c("y_rep"), format = "matrix"))
d_post[, ix := 1:.N]
d_post <- melt(d_post, id.vars = "ix")

ix_rnd <- sort(sample(1:max(d_post$ix), size = 10, replace = F))
d_fig_1 <- copy(d_post[ix %in% ix_rnd])
d_fig_1[, id := gsub("y_rep\\[", "", variable)]
d_fig_1[, id := as.integer(gsub("\\]", "", id))]
d_fig_1[, group := d_sim[id, group]]


ggplot(d_fig_1, aes(x = value)) +
  geom_histogram(
    fill = "white", col = "black", bins = 17
  ) +
  geom_histogram(
    data = d_sim,
    aes(x = 1e6 * response / cells_per_well),
    fill = "red", alpha = 0.3, bins = 15, lwd = 0.3
  ) +
  theme_bw() +
  ggh4x::facet_grid2(ix ~ group, scales = "free_x", labeller = label_both)
Figure 2: Posterior predictive (counts per million) based on 10 samples of equal size to what we observed (in red) by vaccine group