GPDD and Kalman Exploration, Continued

Load libraries and data as before

library("ggplot2")
library("dplyr")
library("tidyr")
library("knitcitations")
library("rgpdd")
library("FKF")

Parallel version of dplyr::do

do_parallel <- function(df, f, ...){

  # supports only one group for now

  require("parallel")
  require("lazyeval")
  require("reshape2")
  options(mc.cores = detectCores())

  grps <- groups(df)
  ## unique values of the (single) grouping variable
  ids <- sapply(grps, function(i) unique(df[[as.character(i)]]))
  names(ids) <- as.character(ids)
  ## turn grouped data.frame to a list of data.frames by MainID
  list_data <- lapply(ids, 
                      function(id){ 
                        .dots <- list(interp(~y == x, .values = list(y = grps[[1]], x = id)))
                        filter_(df, .dots = .dots)
                      })

  ## Actually do the fitting in parallel
  list_out <- mclapply(list_data, f, ...)

  ## reshape the list of outputs back into a single data.frame,
  ## restoring the grouping column name
  melt(list_out, id = names(list_out[[1]])) %>%
    rename_(.dots = setNames(list("L1"), as.character(grps[[1]]))) %>%
    as_data_frame()
}
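
Usage mirrors dplyr::do(); for example (illustrative only, with f any function mapping a single-group data frame to a data frame):

df %>% group_by(MainID) %>% do_parallel(f)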

Prepare data

As before, we filter the GPDD main table on the stated criteria

gpdd_main %>% 
  filter(SamplingProtocol == "Count",
         SourceDimension %in% c("Count", "Index"), 
         SamplingFrequency == "1",
         DatasetLength >= 15) %>%
  select(MainID) %>%
  arrange(MainID) ->
filtered

and select the data matching this filter. We add a column for the log of the population size and group by MainID:

gpdd_data %>% 
  filter(MainID %in% filtered$MainID) %>%
  select(MainID, Population, SampleYear) %>%
  group_by(MainID) %>% 
  mutate(logN = log(Population)) ->
df
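
As a quick sanity check (illustrative only; output not shown) we can inspect the grouped table:

df %>% head()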

Lastly, we replace -Inf values (introduced by log(0) terms) with one less than the smallest finite value observed. (This choice is arbitrary; the authors do not specify how these values are handled.)

i <- which(df$logN == -Inf)
df$logN[i] <- min(df$logN[-i])-1

We may test on a subset of the data first:

## not run: test on a random subset of series first
some <- sample(unique(df$MainID), 10)
df %>% filter(MainID %in% some) -> df

Import function definitions

We import our previous model definitions (among them the robust_fit() and kalman() functions used below):

downloader::download("https://github.com/ropensci/rgpdd/raw/master/inst/scripts/knape-de-valpine.R", "knape-de-valpine.R")
source("knape-de-valpine.R")
unlink("knape-de-valpine.R")

Simulating

The FKF package does not provide a simulation method, but we can define one directly from the state equations. A C implementation would be preferable, but fitting, not simulation, will always be the rate-limiting step. (We also ignore the multivariate case for simplicity.)
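
In FKF's notation (scalar case, with the drift terms folded into the noise means, as the code below does), the equations we simulate from are:

\[ a_{t+1} = T_t\, a_t + \eta_t, \qquad \eta_t \sim N(d_t,\, H_t H_t^\prime), \]
\[ y_t = Z_t\, a_t + \epsilon_t, \qquad \epsilon_t \sim N(c_t,\, G_t G_t^\prime). \]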

## return x, falling back to default when x is NULL
use <- function(x, default){
  if (is.null(x))
    default
  else
    x
}

sim_fkf <- function(fit){
  n <- fit[["n"]]
  dt <- fit[["dt"]]
  HHt <- fit[["HHt"]]
  Tt <- use(fit[["Tt"]], 1)
  GGt <- use(fit[["GGt"]], 0)
  a0 <- fit[["a0"]]
  ct <- 0
  Zt <- 1

  ## process noise (drift dt folded into the mean) and observation noise
  eta <- rnorm(n, dt, sqrt(HHt))
  epsilon <- rnorm(n, ct, sqrt(GGt))

  ## iterate the state equation, then observe every state
  a <- numeric(n)
  a[1] <- a0
  for (t in 1:(n - 1))
    a[t + 1] <- Tt * a[t] + eta[t]
  Zt * a + epsilon
}
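
As a quick, illustrative check (not part of the original analysis), we can fit the SSRW model to the first series and overlay one simulated trajectory:

y <- df$logN[df$MainID == df$MainID[[1]]]
fit <- robust_fit("ssrw", y, N = 3)
plot(y, type = "l")
lines(sim_fkf(fit), col = "red")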

The study also creates simulated datasets based on the real data, constructed explicitly under an assumption of either density independence (DI) or density dependence (DD). For each real dataset, a density-independent simulated dataset is created by simulating under the fitted SSRW model. The density-dependent dataset is created by explicitly fixing the density-dependence parameter (\(c\) in the language of the paper, Tt in FKF notation) to 0.8 and estimating the other parameters of this modified SSG model.
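
In the paper's notation the Gompertz state equation is \(x_{t+1} = a + c\,x_t + \eta_t\), so (on our reading) \(c\) corresponds to Tt and \(a\) to dt in FKF terms. We can define this model analogously to the others, only this time fixing Tt = 0.8: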

fit_dd <- function(y,
                   init = c(dt = mean(y), HHt = log(var(y)/2), GGt = log(var(y)/2)),
                   ...){
  ## HHt and GGt are optimized on the log scale to keep the variances positive
  o <- optim(init,
             fn = function(par, ...)
               -fkf(dt = matrix(par[1]), HHt = matrix(exp(par[2])),
                    GGt = matrix(exp(par[3])), ...)$logLik,
             Tt = matrix(0.8), a0 = y[1], P0 = matrix(10),
             ct = matrix(0), Zt = matrix(1), yt = rbind(y),
             check.input = FALSE, ...)
  ## back-transform the variance estimates
  o$par[["HHt"]] <- exp(o$par[["HHt"]])
  o$par[["GGt"]] <- exp(o$par[["GGt"]])
  c(o, list(a0 = y[1], n = length(y)))
}
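
As a hedged usage sketch (the sourced robust_fit() presumably performs a similar reshaping): fit_dd() returns the raw optim() result, so to simulate from it we flatten the parameter vector and supply the fixed Tt ourselves:

fit <- fit_dd(df$logN[df$MainID == df$MainID[[1]]])
fit$par  # dt, plus back-transformed HHt and GGt
sim <- sim_fkf(c(as.list(fit$par), list(Tt = 0.8, a0 = fit$a0, n = fit$n)))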

The script adds a method for this model to robust_fit() as well, though given the computational cost it is not clear whether a robust fit is actually used in generating the simulated data.

sim_di <- function(df) data.frame(logN = sim_fkf(robust_fit("ssrw", df$logN, N = 3)))
sim_dd <- function(df) data.frame(logN = sim_fkf(robust_fit("dd", df$logN, N = 3)))
system.time(
df %>% group_by(MainID) %>% do_parallel(sim_di) -> DI
)
   user  system elapsed 
123.079   7.039   8.836 
system.time(
  df %>% group_by(MainID) %>% do_parallel(sim_dd) -> DD
)
   user  system elapsed 
120.578   5.209   8.023 

We can then use these two collections of datasets just as before:

system.time(
  DD %>% group_by(MainID) %>% do_parallel(kalman, method = "BFGS") -> DD_fits
)
    user   system  elapsed 
9218.572  248.985  324.883 
system.time(
  DI %>% group_by(MainID) %>% do_parallel(kalman, method = "BFGS") -> DI_fits
)
    user   system  elapsed 
8074.330  230.513  280.989 

and, as before, fit the real data:

system.time(
df %>% group_by(MainID) %>% do_parallel(kalman, method = "BFGS") -> fits
)
    user   system  elapsed 
7761.519  201.922  256.863 

Figure 2

From these simulations and the corresponding estimates of the density-dependence parameter, we can create our version of Figure 2:

type_order <- c("real", "independent", "dependent")

combined <- rbind(
  DD_fits %>%
    filter(model %in% c("ssg", "g"), parameter == "Tt") %>%
    select(model, value, MainID) %>%
    mutate(type = factor("dependent", levels = type_order)),

  DI_fits %>%
    filter(model %in% c("ssg", "g"), parameter == "Tt") %>%
    select(model, value, MainID) %>%
    mutate(type = factor("independent", levels = type_order)),

  fits %>%
    filter(model %in% c("ssg", "g"), parameter == "Tt") %>%
    select(model, value, MainID) %>%
    mutate(type = factor("real", levels = type_order))) %>%

  ungroup() %>%

  transmute(uncertainty =
              plyr::revalue(model,
                            c(ssg = "accounting for uncertainty",
                              g = "ignoring uncertainty")),
            type, value, MainID)

## mean/median reference lines are computed from the full combined data,
## so the same lines appear in every facet
ggplot(combined) +
  geom_histogram(aes(value), binwidth = 0.2) +
  facet_grid(uncertainty ~ type) +
  geom_vline(aes(xintercept = mean(value)), lwd = .5) +
  geom_vline(aes(xintercept = median(value)), lwd = .5, col = "grey") +
  xlim(c(-1.1, 1.1)) +
  xlab("c value") +
  theme_bw(16)

Bootstrapping

With fitting and simulating functions in place, defining the bootstrap is straightforward. We define it separately for the state-space Gompertz (ssg; the model with both density dependence and observation error) and the Gompertz (g; density dependence but no observation error). In each case we compare against simulations from the corresponding model without density dependence.

bootstrap <- function(df, null_model = "ssrw", test_model = "ssg", N = 100){
  y <- df$logN

  ssg <- robust_fit(test_model, y)
  ssrw <- robust_fit(null_model, y)
  sims <- as.data.frame(t(replicate(N, sim_fkf(ssrw))))

  ## refit the null model to each simulated dataset, using a relaxed
  ## robust_fit (N = 3); note that the test-model likelihood from the
  ## real data is reused below rather than refit to each simulation
  sims %>%
    rowwise() %>%
    do(robust_fit(null_model, y = as.numeric(.), N = 3)) %>%
    select(mloglik) -> null

  ## p value: fraction of null-distribution LR statistics at least as
  ## large as the observed LR statistic
  lr <- 2 * (ssrw$mloglik - ssg$mloglik)
  null_dist <- 2 * (null$mloglik - ssg$mloglik)
  data.frame(p = sum(null_dist >= lr) / N)
}
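
As an illustrative smoke test (hypothetical; not part of the original analysis), we can bootstrap a single series with a small N before launching the full run:

df %>%
  ungroup() %>%
  filter(MainID == MainID[[1]]) %>%
  bootstrap("ssrw", "ssg", N = 10)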

With these functions defined, we can perform the actual analysis.

system.time(
  df %>% group_by(MainID) %>% do_parallel(bootstrap, "ssrw", "ssg") -> ssg_p_values
  )
     user    system   elapsed 
18552.515   509.365   630.503 
system.time(
  df %>% group_by(MainID) %>% do_parallel(bootstrap, "rw", "g") -> g_p_values
  )
    user   system  elapsed 
7958.974  215.100  259.174 

We can also do the bootstrapping for the simulated data:

system.time(
 DD %>% group_by(MainID) %>% do_parallel(bootstrap, "ssrw", "ssg") -> dd_ssg_p_values
)
     user    system   elapsed 
20739.616   533.335   704.335 
system.time(
 DD %>% group_by(MainID) %>% do_parallel(bootstrap, "rw", "g") -> dd_g_p_values
)
    user   system  elapsed 
9970.254  285.473  332.748 
system.time(
 DI %>% group_by(MainID) %>% do_parallel(bootstrap, "ssrw", "ssg") -> di_ssg_p_values
)
     user    system   elapsed 
18631.331   479.485   622.542 
system.time(
  DI %>% group_by(MainID) %>% do_parallel(bootstrap, "rw", "g") -> di_g_p_values
)
    user   system  elapsed 
8021.423  227.704  264.113 
P <- rbind(
  ssg_p_values %>% mutate(data = "real", model = "ssg"),
  g_p_values %>% mutate(data = "real", model = "g"),
  di_ssg_p_values %>% mutate(data = "DI", model = "ssg"),
  di_g_p_values %>% mutate(data = "DI", model = "g"),
  dd_g_p_values %>% mutate(data = "DD", model = "g"),
  dd_ssg_p_values %>% mutate(data = "DD", model = "ssg"))


ggplot(P) + 
  geom_histogram(aes(p)) + 
  facet_grid(model ~ data) + 
  theme_bw(16)