Intro to GAMs
March 11, 2024
A-gam-basics.Rmd
Example set-up and context:
For this example, let's say we want to assess walleye pollock in the eastern Bering Sea (EBS).
In this example, we will use data from NOAA Fisheries' eastern Bering Sea (EBS) bottom trawl survey. The Resource Assessment and Conservation Engineering (RACE) Division Groundfish Assessment Program (GAP) of the Alaska Fisheries Science Center (AFSC) conducts fisheries-independent bottom trawl surveys to assess the populations of demersal fish and crab stocks of Alaska. Data presented here are presence-only (non-zero) observations from those surveys and therefore CANNOT be aggregated.
For the sake of a simple example, we will only assess data from 2015 to 2021.
SPECIES <- c("walleye pollock")
YEARS <- 2015:2021
SRVY <- "EBS"
1. What data are we using?
Here, we use the public-facing data from the NOAA AFSC Bering Sea groundfish bottom trawl survey. For more information about how these data were compiled, see the afsc-gap-products GitHub repo.
dat <- sdmgamindex::noaa_afsc_public_foss %>%
dplyr::filter(srvy == SRVY &
year %in% YEARS &
common_name %in% SPECIES) %>%
dplyr::mutate(hauljoin = paste0(stratum, "_", station, "_", date_time)) %>%
dplyr::select(
year, date_time, latitude_dd, longitude_dd, # spatiotemporal data
cpue_kgha, common_name, # catch data
bottom_temperature_c, depth_m, # possible covariate data
srvy, area_swept_ha, duration_hr, vessel_id, hauljoin # haul/effort data
)
head(dat)
2. Prepare the data for sdmgamindex::get_surveyidx():
# project spatial data
crs_proj <- "EPSG:3338" # NAD83 / Alaska Albers
crs_latlon <- "+proj=longlat +datum=WGS84" # decimal degrees
ll <- sdmgamindex::convert_crs(
x = dat$longitude_dd,
y = dat$latitude_dd,
crs_in = crs_latlon,
crs_out = crs_proj)
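If you prefer not to use the helper, the same transformation can be done directly with {sf}; a minimal sketch, assuming convert_crs() is a thin wrapper around sf::st_transform() (pts and xy are throwaway names):
pts <- sf::st_as_sf(dat,
coords = c("longitude_dd", "latitude_dd"),
crs = crs_latlon, remove = FALSE)
xy <- sf::st_coordinates(sf::st_transform(pts, crs = crs_proj))
head(xy) # columns X and Y are eastings/northings in meters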
YEARS <- sort(unique(dat$year))
# The sdmgamindex::get_surveyidx() expects some columns to be named in a specific way
dat_wrangled <- dat %>%
dplyr::rename(
Year = year,
wCPUE = cpue_kgha,
COMMON_NAME = common_name,
GEAR_TEMPERATURE = bottom_temperature_c,
BOTTOM_DEPTH = depth_m,
HaulDur = duration_hr,
EFFORT = area_swept_ha,
Ship = vessel_id) %>%
dplyr::mutate(
# create some other vars
Lon = longitude_dd,
Lat = latitude_dd,
lon = ll$X,
lat = ll$Y,
sx = ((longitude_dd - mean(longitude_dd, na.rm = TRUE))/1000), # coarsely scaled coordinates for the spatial smooths;
sy = ((latitude_dd - mean(latitude_dd, na.rm = TRUE))/1000), # scaling must match the prediction grid covariates (section 4)
ctime = as.numeric(as.character(Year)),
date_time = as.POSIXct(x = date_time, format = "%m/%d/%Y %H:%M:%S"), # POSIXct (not Date) keeps the time component so hour/minute can be extracted below
hour = as.numeric(format(date_time,"%H")),
minute = as.numeric(format(date_time,"%M")),
day = as.numeric(format(date_time,"%d")),
month = as.numeric(format(date_time,"%m")),
TimeShotHour = hour + minute/60,
timeOfYear = (month - 1) * 1/12 + (day - 1)/365,
# add some dummy vars and create some other vars
Country = "USA",
Gear = "dummy",
Quarter = "2") %>%
dplyr::mutate(dplyr::across(dplyr::all_of(c("Year", "Ship", "COMMON_NAME")), as.factor)) %>%
dplyr::select(wCPUE, GEAR_TEMPERATURE, BOTTOM_DEPTH, COMMON_NAME, EFFORT,
Year, Ship, Lon, Lat, lat, lon, sx, sy,
ctime, TimeShotHour, timeOfYear, Gear, Quarter, HaulDur, hauljoin)
head(dat_wrangled)
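A quick spot check that the date parsing behaved (exact values will depend on your data pull):
range(dat_wrangled$ctime) # should span 2015-2021
summary(dat_wrangled$timeOfYear) # should fall within [0, 1)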
3. Define representative station points to fit and predict the model at
Since surveys are not conducted at exactly the same locations each year (that is the intention, but it is impossible in practice), we need to define representative latitudes and longitudes at which to predict.
These are the same prediction grids AFSC used for their 2021 VAST model-based indices (subject to change - do not use this grid without asking/checking that it is still current!).
pred_grid <- sdmgamindex::pred_grid_ebs
ll <- sdmgamindex::convert_crs(
x = pred_grid$lon,
y = pred_grid$lat,
crs_in = crs_latlon,
crs_out = crs_proj)
pred_grid <- pred_grid %>%
dplyr::mutate(
lon = ll$X,
lat = ll$Y,
sx = ((lon - mean(lon, na.rm = TRUE))/1000),
sy = ((lat - mean(lat, na.rm = TRUE))/1000))
head(pred_grid)
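A quick base-graphics sanity plot of the grid extent, using the scaled coordinates just computed:
plot(pred_grid$sx, pred_grid$sy, pch = ".", asp = 1,
xlab = "sx (scaled eastings)", ylab = "sy (scaled northings)",
main = "EBS prediction grid")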
4. Prepare covariate data
Here we want to match covariate data to the prediction grid.
dat_cov <- sdmgamindex::pred_grid_ebs %>%
dplyr::select(-Shape_Area) %>%
dplyr::mutate(
sx = ((lon - mean(lon, na.rm = TRUE))/1000),
sy = ((lat - mean(lat, na.rm = TRUE))/1000))
sp_extrap_raster <- sp::SpatialPoints(
coords = as.matrix(dat_cov[,c("lon", "lat")]),
proj4string = sp::CRS(crs_latlon))
4a. Data that varies over only space (depth)
Here in the Bering Sea, depth rarely changes over time. A modeler working in, say, the Gulf of Alaska or the Aleutian Islands, where currents and island formation can markedly change depth, may want to make this variable time-varying as well.
For this, we are going to create a raster of depth in the Bering Sea from the survey data so we can merge it into the dataset at the prediction grid lat/lons.
x <- dat_wrangled %>%
dplyr::select(Lon, Lat, BOTTOM_DEPTH) %>%
stats::na.omit() %>%
sf::st_as_sf(x = .,
coords = c(x = "Lon", y = "Lat"),
crs = sf::st_crs(crs_latlon))
idw_fit <- gstat::gstat(formula = BOTTOM_DEPTH ~ 1, # intercept-only = inverse-distance interpolation
locations = x,
nmax = 4) # interpolate from the 4 nearest stations
extrap_data0 <- predict( # gstat's predict() method performs the IDW interpolation
idw_fit, newdata = sf::st_as_sf(sp_extrap_raster)) %>%
sf::st_as_sf() %>%
sf::st_transform(crs = crs_latlon) %>%
stars::st_rasterize()
extrap_data <- stars::st_extract(x = extrap_data0,
at = as.matrix(dat_cov[,c("lon", "lat")]))
# to make future runs of this faster:
save(extrap_data0, extrap_data,
file = paste0("../inst/VigA_bottom_depth_raster_",
min(YEARS),"-",max(YEARS), ".rdata"))
# Just so we can see what we are looking at:
plot(extrap_data0, main = "Interpolated Bottom Depths")
dat_cov <- cbind.data.frame(dat_cov,
"BOTTOM_DEPTH" = extrap_data$var1.pred) %>%
stats::na.omit()
head(dat_cov)
4b. Data that varies over space and time (bottom temperature)
Here, bottom temperature, and thereby the cold pool extent, have been shown to drive the distribution of many species. This is especially true for walleye pollock.
For this, we are going to lean on the validated, pre-prepared {coldpool} R package (S. Rohan, L. Barnett, and N. Charriere), developed in-house at AFSC. These data are interpolated over the whole survey area, so there are no missing values.
# Just so we can see what we are looking at:
plot(coldpool::ebs_bottom_temperature[[1]])
tmp <- c()
for (i in 1:length(YEARS)) {
tmp <- c(tmp,
grep(pattern = YEARS[i], x = names(coldpool::ebs_bottom_temperature)))
}
extrap_data0 <- coldpool::ebs_bottom_temperature[[tmp]] %>%
as(., Class = "SpatialPointsDataFrame") %>%
sf::st_as_sf() %>%
sf::st_transform(crs = crs_latlon) %>%
stars::st_rasterize() %>%
stars::st_extract(x = .,
at = as.matrix(dat_cov[,c("lon", "lat")]))
names(extrap_data0) <- paste0("GEAR_TEMPERATURE", YEARS)
dat_cov <- dplyr::bind_cols(dat_cov, extrap_data0) %>%
na.omit()
head(dat_cov)
5. DATRASraw structure
5a. Catch Data
Now we need to fill the catch data in with zeros, since the public data include only positive (presence-only) observations!
# Identify vars that will be used --------------------------------------------
varsbyyr <- unique( # c("GEAR_TEMPERATURE", "cpi")
gsub(pattern = "[0-9]+",
replacement = "",
x = names(dat_cov)[grepl(names(dat_cov),
pattern = "[0-9]+")]))
vars <- unique( # c("BOTTOM_DEPTH")
names(dat_cov)[!grepl(names(dat_cov),
pattern = "[0-9]+")])
vars <- vars[!(vars %in% c("LONG", "LAT", "lon", "lat", "sx", "sy"))]
## Fill catch data with zeros ---------------------------------------------------------
data_hauls <- dat_wrangled %>%
dplyr::select(Year, sx, sy,
dplyr::all_of(varsbyyr), dplyr::all_of(vars),
Ship, hauljoin,
lat, lon, Lat, Lon,
ctime, TimeShotHour, timeOfYear, Gear, Quarter,
EFFORT, HaulDur) %>%
# dplyr::filter(!is.na(GEAR_TEMPERATURE)) %>%
na.omit() %>%
dplyr::distinct()
data_catch <- dat_wrangled %>%
dplyr::select(COMMON_NAME, wCPUE, hauljoin)
dat_catch_haul <- dplyr::left_join(x = data_hauls,
y = data_catch,
by = c("hauljoin")) %>%
dplyr::mutate(wCPUE = ifelse(is.na(wCPUE), 0, wCPUE))
head(dat_catch_haul)
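As a quick check that the zero-filling worked, the proportion of zero-catch hauls in each year should now be non-trivial:
dat_catch_haul %>%
dplyr::group_by(Year) %>%
dplyr::summarise(n_hauls = dplyr::n(),
prop_zero = mean(wCPUE == 0))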
allpd <- lapply(YEARS, FUN = sdmgamindex::get_prediction_grid, x = dat_cov,
vars = vars, varsbyyr = varsbyyr)
names(allpd) <- as.character(YEARS)
head(allpd[[1]])
5b. Covariate Data
## split data by species, make into DATRASraw + Nage matrix
ds <- split(dat_catch_haul, dat_catch_haul$COMMON_NAME)
ds <- lapply(ds, sdmgamindex::get_datrasraw)
## NB: the response is added here in the "Nage" matrix -- use wCPUE
ds <- lapply(ds, function(x) { x[[2]]$Nage <- matrix(x$wCPUE, ncol = 1); colnames(x[[2]]$Nage) <- 1; x })
ds
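The response now lives in the Nage matrix of each DATRASraw object; for example:
head(ds[["walleye pollock"]][[2]]$Nage)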
6. Formulas
fm <- list(
# Null models: spatial, temporal, and spatiotemporal, each with a Year effect
"fm_0_s" = "Year +
s(sx,sy,bs=c('ts'),k=376)",
"fm_0_t" = "Year +
s(Year,bs=c('ts'),k=10)",
"fm_0_st" = "Year +
s(sx,sy,bs=c('ts'),k=10,by=Year)",
"fm_1_s_t_st" = "Year +
s(sx,sy,bs=c('ts'),k=376) +
s(sx,sy,bs=c('ts'),k=10,by=Year)",
# Model with simple covariates
"fm_2_cov" =
"s(BOTTOM_DEPTH,bs='ts',k=10) +
s(log(GEAR_TEMPERATURE+3),bs='ts',k=10)"
)
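Before fitting the full set, it can help to sanity-check one smooth directly in mgcv on the zero-filled data; a minimal sketch (test_fit is a throwaway name, and k = 50 is an arbitrary choice for speed here, not a recommendation):
test_fit <- mgcv::gam(wCPUE ~ Year + s(sx, sy, bs = "ts", k = 50),
data = dat_catch_haul,
family = mgcv::tw())
mgcv::gam.check(test_fit) # k-index diagnostics: is k large enough?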
7. Fit the Model
Here are all of the models we want to try fitting:
comb <- tidyr::crossing(
"SPECIES" = SPECIES,
"fm_name" = gsub(pattern = " ", replacement = "_", x = names(fm))) %>%
dplyr::left_join(
x = .,
y = data.frame("fm" = gsub(pattern = "\n", replacement = "",
x = unlist(fm), fixed = TRUE),
"fm_name" = gsub(pattern = " ", replacement = "_",
x = names(fm))),
by = "fm_name")
comb
models <- fittimes <- list()
for (i in 1:nrow(comb)) {
cat("Fitting ", comb$SPECIES[i], "\n", comb$fm_name[i], ": ", comb$fm[i], "\n")
temp <- paste0(comb$SPECIES[i], " ", comb$fm_name[i])
# Fit via sdmgamindex::get_surveyidx() (a wrapper around mgcv::gam()) so the
# later sections can use the $idx, $lo, $up, and $pModels elements it returns
fittimes[[ temp ]] <-
system.time( models[[ temp ]] <-
sdmgamindex::get_surveyidx(
x = ds[[ comb$SPECIES[i] ]],
ages = 1,
myids = NULL,
predD = allpd,
cutOff = 0,
fam = "Tweedie",
modelP = comb$fm[i],
gamma = 1) )
}
save(models, fittimes, file = paste0("../inst/VigA_simple_gam_model_fits.Rdata"))
# Lesson 8: Modeling
# Created by: Emily Markowitz
# Contact: Emily.Markowitz@noaa.gov
# Created: 2020-12-18
# Modified: 2021-02-17
# tasks -------------------------------------------------------------------
# Be creative!
# Make a lm(), glm(), and gam() using either of these datasets to answer a
# question you have about the data. Be prepared to share your cool code with
# the class!
# 3. lm() models ---------------
# Quickly, I am going to show you all of the combinations using purrr::map() again:
library(tidyverse) # for purrr::map() and %>% (if not already loaded)
lm_mods <- map(dat, ~ lm(dat$wtcpue ~ .x, data = dat) %>%
broom::tidy())
lm_mods
# The best model looks to be the one with longitude!
# Another way of looking at this:
# p-values (element 8 of each coefficients matrix is the slope's p-value)
dat %>%
map(~lm(dat$wtcpue ~ .x, data = dat)) %>%
map(summary) %>%
map(c("coefficients")) %>%
map_dbl(8)
# r2
dat %>%
map(~lm(dat$wtcpue ~ .x, data = dat)) %>%
map(summary) %>%
map(c("r.squared")) %>%
unlist()
# 4. glm() models ---------------
glm_fit1 <- glm(wtcpue ~ longitude,
family = gaussian(link = "identity"), # same as an lm()
# family = "gaussian", # *same as line above
data = dat)
glm_fit2 <- glm(wtcpue ~ longitude,
family = Gamma(),
data = dat)
glm_fit3 <- glm(wtcpue ~ longitude + latitude,
family = gaussian(link = "identity"), # same as an lm()
# family = "gaussian", # *same as line above
data = dat)
glm_fit4 <- glm(wtcpue ~ longitude + latitude,
family = Gamma(),
data = dat)
glm_fit5 <- glm(wtcpue ~ longitude + latitude + year,
family = gaussian(link = "identity"), # same as an lm()
# family = "gaussian", # *same as line above
data = dat)
glm_fit6 <- glm(wtcpue ~ longitude + latitude + year,
family = Gamma(),
data = dat)
glm_fit7 <- glm(wtcpue ~ longitude + latitude + year + bot_temp,
family = gaussian(link = "identity"), # same as an lm()
# family = "gaussian", # *same as line above
data = dat)
glm_fit8 <- glm(wtcpue ~ longitude + latitude + year + bot_temp,
family = Gamma(),
data = dat)
AIC(glm_fit1, glm_fit2, glm_fit3, glm_fit4,
glm_fit5, glm_fit6, glm_fit7, glm_fit8)
# Model 6 has the lowest AIC and is the most parsimonious!
# bot_temp did not improve the model at all here, so why include it?
# AIC is not the only metric to consider here, but I'll let you read up on that!
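# For example, deviance explained is a quick pseudo-R^2 for a GLM,
# computed from standard glm output:
1 - glm_fit6$deviance / glm_fit6$null.deviance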
# we can see that this model has room for improvement from looking at the plots:
plot(glm_fit6)
# Now let's predict our outputs
# make up x
pred<-data.frame("longitude" = rnorm(n = 30,
mean = mean(dat$longitude),
sd = sd(dat$longitude)),
"latitude" = rnorm(n = 30,
mean = mean(dat$latitude),
sd = sd(dat$latitude)),
"year" = rep_len(x = c(2016, 2017, 2018),
length.out = 10))
# predict y (wtcpue) with your equation
pred$wtcpue <- predict(object = glm_fit6,
newdata = pred,
type = "response")
pred
# 5. gam() models ---------------
library(mgcv)
# Create our gam models
gam_fit1 <- gam(
wtcpue ~ s(longitude),
data = dat
)
gam_fit2 <- gam(
wtcpue ~ s(longitude),
family = Gamma(link=log),
data = dat
)
gam_fit3 <- gam(
wtcpue ~ s(longitude) + s(latitude),
data = dat
)
gam_fit4 <- gam(
wtcpue ~ s(longitude) + s(latitude),
family = Gamma(link=log),
data = dat
)
gam_fit5 <- gam(
wtcpue ~ s(longitude) + s(latitude) + s(year, k = 2),
data = dat
)
gam_fit6 <- gam(
wtcpue ~ s(longitude) + s(latitude) + s(year, k = 2),
family = Gamma(link=log),
data = dat
)
gam_fit7 <- gam(
wtcpue ~ s(longitude, latitude, year),
data = dat
)
gam_fit8 <- gam(
wtcpue ~ s(longitude, latitude, year),
family = Gamma(link=log),
data = dat
)
AIC(gam_fit1, gam_fit2, gam_fit3, gam_fit4,
gam_fit5, gam_fit6, gam_fit7, gam_fit8)
# Model 8 has the lowest AIC!
# By explicitly making a spatio-temporal term (as opposed to assessing
# each dimension separately) we were able to obtain a better model.
# Again, AIC is not the only metric to consider here, but I'll let you read up on that!
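# For example, residual diagnostics and deviance explained:
gam.check(gam_fit8) # QQ plot, residuals, and k-index diagnostics
summary(gam_fit8)$dev.expl # proportion of deviance explained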
# crazy, just for giggles (aka an abridged model I am playing with in real life!)
gam_fit9 <- gam(
wtcpue ~ year + # a linear year term
s(longitude, latitude, bs = c('ts'), k = 379) + # 'ts' = thin plate regression spline with shrinkage; k = basis dimension (here, roughly the number of stations)
s(longitude, latitude, bs = c('ts'), k = 50, by = year, id = 1), # the same smooth, but varying by year
family = Gamma(link = log),
data = dat
)
# Will this more developed model be better than our gam_fit8?
AIC(gam_fit8, gam_fit9)
# Our new gam_fit9 is just that much better than our gam_fit8!
# Back in the vignette: compare two of the survey-index model fits from section 7
AIC(models$`walleye pollock fm_1_s_t_st`$pModels[[1]],
models$`walleye pollock fm_2_cov`$pModels[[1]])
8. Indices of Abundance
dat_design <- dplyr::bind_rows(read.csv(file = system.file("YFS_10210_estimate_summary.csv",
package = "sdmgamindex" )) %>%
dplyr::mutate(common_name = "yellowfin sole"),
read.csv(file = system.file("WEP_21740_estimate_summary.csv",
package = "sdmgamindex" )) %>%
dplyr::mutate(common_name = "walleye pollock"),
read.csv(file = system.file("RKC_Table_for_SS3.csv",
package = "sdmgamindex" )) %>%
dplyr::rename(design_mt = Estimate_metric_tons,
design_se = SD_mt) %>%
dplyr::mutate(design_se = (design_se)^2,
design_CV = NA,
VAST_mt = NA,
VAST_se = NA,
VAST_CV = NA,
common_name = "red king crab") %>%
dplyr::select(-Unit, -Fleet, -SD_log))
dat <- data.frame()
for (i in 1:length(models)){
temp <- models[[i]]
dat0 <- data.frame(idx = temp$idx[,1],
lo = temp$lo[,1],
up = temp$up[,1],
Year = rownames(temp$idx),
group = names(models)[i],
formula = paste0("cpue_kgha ~ ",
as.character(temp$pModels[[1]]$formula)[[3]]))
dat <- dplyr::bind_rows(dat, dat0)
}
dat$common_name <- paste0(sapply(X = strsplit(x = dat$group, split = " fm"), `[`, 1))
dat <- dplyr::bind_rows(dat %>%
dplyr::mutate(Year = as.numeric(Year)) %>%
dplyr::select(-group),
dat_design %>%
dplyr::select(design_mt, common_name, Year) %>%
dplyr::rename(idx = design_mt) %>%
dplyr::mutate(lo = NA,
up = NA,
formula = "design")) %>%
dplyr::filter(Year %in% YEARS)
dat[dat$Year == 2020, c("idx", "up", "lo")] <- NA # no EBS survey in 2020 (COVID-19 cancellation)
ggplot2::ggplot(data = dat,
mapping = aes(x = Year,
y = idx,
group = formula,
color = formula)) +
geom_line(size = 1.5) +
geom_point(size = 2) +
geom_ribbon(aes(ymin = lo, ymax = up, fill = formula),
alpha=0.1,
linetype="dashed",
color="grey") +
ggtitle("Annual Index Model Results") +
facet_wrap(vars(common_name), scales = "free", ncol = 1) +
theme(legend.position = "bottom",
legend.direction = "vertical")
9. Predict and plot
dat_pred <- dat_catch_haul %>%
dplyr::select(Year, sx, sy, Lon, Lat, GEAR_TEMPERATURE, BOTTOM_DEPTH)
dat <- data.frame()
for (i in 1:length(models)) {
temp <- models[[i]]
dat0 <- data.frame(idx =
predict.gam(
object = temp$pModels[[1]],
newdata = dat_pred),
group = names(models)[i],
formula = paste0("cpue_kgha ~ ",
as.character(temp$pModels[[1]]$formula)[[3]])
)
dat00 <- dplyr::bind_cols(dat0, dat_pred)
dat <- dplyr::bind_rows(dat, dat00)
# dat_r <- raster::rasterFromXYZ(xyz = dat00[,c("lon", "lat", "idx")])
}
dat$facet_group <- paste0(sapply(X = strsplit(x = dat$group, split = " fm"), `[`, 1))
for (i in 1:length(unique(dat$facet_group))) {
print( # print() is needed to render ggplot objects inside a for loop
ggplot2::ggplot(data = dat %>%
dplyr::filter(facet_group == unique(dat$facet_group)[i]),
mapping = aes(x = Lon,
y = Lat,
group = group,
color = idx)) +
scale_color_viridis_c(option = "D") +
geom_point() +
ggtitle(paste0("Annual Index Model Results for ", unique(dat$facet_group)[i])) +
facet_grid(cols = vars(group),
rows = vars(Year)) +
theme_bw())
}
sdmgamindex::plot_surveyidx(
x = models,
dat = ds,
myids = NULL,
predD = allpd)
10. Simulations
REPS <- 4
ests <- list()
for (i in 1:nrow(comb)) {
cat("Simulating ", comb$SPECIES[i], "\n", comb$fm_name[i], ": ", comb$fm[i], "\n")
temp <- paste0(comb$SPECIES[i], " ", comb$fm_name[i])
ests[[ temp ]] <- list()
## simulate data conditional on the fitted model
csim <- sdmgamindex::get_surveyidx_sim(models[[temp]], ds[[comb$SPECIES[i]]])
sims <- lapply(1:REPS, function(j) sdmgamindex::get_surveyidx_sim(
model = models[[temp]],
d = ds[[comb$SPECIES[i]]],
sampleFit = FALSE,
condSim = csim))
## re-estimate the index from each simulated data set
tmp <- ds[[comb$SPECIES[i]]]
for (j in 1:REPS) { # use j so the outer loop index i is not clobbered
tmp[[2]]$Nage <- matrix(sims[[j]][[1]][,1], ncol = 1)
colnames(tmp[[2]]$Nage) <- 1
ests[[ temp ]][[j]] <-
sdmgamindex::get_surveyidx(
x = tmp,
ages = 1,
myids = NULL,
predD = allpd,
cutOff = 0,
fam = "Tweedie",
modelP = comb$fm[i],
gamma = 1,
control = list(trace = TRUE, maxit = 10))
# cat(j, " done.\n")
}
}
png("simest.png",width=640*pngscal,height=480*pngscal)
par(mfrow=c(2,2))
for(i in 1:nrow(comb)){
# for(SPECIES in specLevels){
sdmgamindex::plot_simulation_list(
x = ests[[temp]],
base=models[[temp]],
main=temp,
lwd=2)
}
dev.off()