Course Notes: Spatial Statistics in R

    Use this workspace to take notes, store code snippets, or build your own interactive cheatsheet! The datasets used in this course are available in the datasets folder.

    # Import any packages you want to use here
    library(spatstat)       # point patterns: ppp objects, marks(), split(), density()
    library(raster)         # plotRGB() for the OpenStreetMap background maps
    library(spatialkernel)  # spseg(), plotcv(), plotmc() for spatial segregation

    Take Notes

    # Add your code snippets here
    # Get some summary information on the dataset
    summary(preston_crime)
    
    # Get a table of marks
    table(marks(preston_crime))
    
    # Define a function to create a map
    preston_map <- function(cols = c("green","red"), cex = c(1, 1), pch = c(1, 1)) {
      plotRGB(preston_osm) # from the raster package
      plot(preston_crime, cols = cols, pch = pch, cex = cex, add = TRUE, show.window = TRUE)
    }
    
    # Draw the map with colors, sizes and plot character
    preston_map(
      cols = c("black", "red"), 
      cex = c(0.5, 1), 
      pch = c(19, 19)
    )
    
    # Scan from 500m to 1000m in steps of 50m
    bw_choice <- spseg(
        preston_crime, 
        h = seq(500, 1000, by = 50),
        opt = 1)
    
    # Plot the results and highlight the best bandwidth
    plotcv(bw_choice); abline(v = bw_choice$hcv, lty = 2, col = "red")
    
    # Print the best bandwidth
    print(bw_choice$hcv)
    
    # Set the correct bandwidth and run for 10 simulations only
    seg10 <- spseg(
        pts = preston_crime, 
        h = 800,
        opt = 3,
        ntest = 10, 
        proc = FALSE)
    # Plot the segregation map for violent crime
    plotmc(seg10, "Violent crime")
    
    # Plot seg, the result of running 1000 simulations
    plotmc(seg, "Violent crime")
    
    # Inspect the structure of the spatial segregation object
    str(seg)
    
    # Get the number of columns in the data so we can rearrange to a grid
    ncol <- length(seg$gridx)
    
    # Rearrange the probability column into a grid
    prob_violent <- list(x = seg$gridx,
                         y = seg$gridy,
                         z = matrix(seg$p[, "Violent crime"],
                                    ncol = ncol))
    image(prob_violent)
    
    # Rearrange the p-values, but choose a p-value threshold
    p_value <- list(x = seg$gridx,
                    y = seg$gridy,
                    z = matrix(seg$stpvalue[, "Violent crime"] < 0.05,
                               ncol = ncol))
    image(p_value)
    
    # Create a mapping function
    segmap <- function(prob_list, pv_list, low, high){
    
      # background map
      plotRGB(preston_osm)
    
      # p-value areas
      image(pv_list, 
            col = c("#00000000", "#FF808080"), add = TRUE) 
    
      # probability contours
      contour(prob_list,
              levels = c(low, high),
              col = c("#206020", "red"),
              labels = c("Low", "High"),
              add = TRUE)
    
      # boundary window
      plot(Window(preston_crime), add = TRUE)
    }
    
    # Map the probability and p-value
    segmap(prob_violent, p_value, 0.05, 0.15)
    
    # Show the available marks
    names(marks(sasq))
    
    # Histogram the dates of the sightings, grouped by year
    hist(marks(sasq)$date, "years", freq = TRUE)
    
    # Plot and tabulate the calendar month of all the sightings
    plot(table(marks(sasq)$month))
    
    # Split on the month mark
    sasq_by_month <- split(sasq, "month", un = TRUE)
    
    # Plot monthly maps
    plot(sasq_by_month)
    
    # Plot smoothed versions of the above split maps
    plot(density(sasq_by_month))
    
    # Get a matrix of event coordinates
    sasq_xy <- as.matrix(coords(sasq))
    
    # Check the matrix has two columns
    dim(sasq_xy)
    
    # Get a vector of event times
    sasq_t <- marks(sasq)$date
    
    # Extract a two-column matrix from the ppp object
    sasq_poly <- as.matrix(as.data.frame(Window(sasq)))
    dim(sasq_poly)
    
    # Set the time limit to 1 day before and 1 day after the range of times
    tlimits <- range(sasq_t) + c(-1, 1)
    
    # Scan over 400m intervals from 100m to 20km
    s <- seq(100, 20000, by = 400)
    
    # Scan over 14 day intervals from one week to 31 weeks
    tm <- seq(7, 217, by = 14)
    
    # Run 999 simulations (stmctest() is from the splancs package)
    library(splancs)
    sasq_mc <- stmctest(sasq_xy, sasq_t, sasq_poly, tlimits, s, tm, nsim = 999, quiet = TRUE)
    names(sasq_mc)
    
    # Histogram the simulated statistics and add a line at the data value
    library(ggplot2)
    ggplot(data.frame(sasq_mc), aes(x = t)) +
      geom_histogram(binwidth = 1e13) +
      geom_vline(aes(xintercept = t0))
    
    # Compute the p-value as the proportion of simulated statistics greater
    # than the observed value (dividing by nsim + 1 = 1000)
    sum(sasq_mc$t > sasq_mc$t0) / 1000
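    
    # A common variant (a sketch, not from the course): counting the observed
    # statistic as one of the draws keeps the Monte Carlo p-value strictly
    # positive; with nsim = 999 the denominator is again 1000
    (1 + sum(sasq_mc$t > sasq_mc$t0)) / (length(sasq_mc$t) + 1)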
    
    # Chapter 3
    # See what information we have for each borough
    summary(london_ref)
    
    # Which boroughs voted to "Leave"?
    london_ref$NAME[london_ref$Leave > london_ref$Remain]
    
    # Plot a map of the percentage that voted "Remain"
    spplot(london_ref, zcol = "Pct_Remain")
    
    # Use the cartogram and rgeos packages
    library(cartogram)
    library(rgeos)
    
    # Make a scatterplot of electorate vs borough area
    names(london_ref)
    plot(london_ref$Electorate, gArea(london_ref, byid = TRUE))
    
    # Make a cartogram, scaling the area to the electorate
    carto_ref <- cartogram(london_ref, "Electorate")
    plot(carto_ref)
    
    # Check the linearity of the electorate-area plot
    plot(carto_ref$Electorate, gArea(carto_ref, byid = TRUE))
    
    # Make a fairer map of the Remain percentage
    spplot(carto_ref, "Pct_Remain")
    
    # Use the spdep package
    library(spdep)
    
    # Make neighbor list
    borough_nb <- poly2nb(london_ref)
    
    # Get center points of each borough
    borough_centers <- coordinates(london_ref)
    
    # Show the connections
    plot(london_ref); plot(borough_nb, borough_centers, add = TRUE)
    
    # Map the total pop'n
    spplot(london_ref, zcol = "TOTAL_POP")
    
    # Run a Moran I test on total pop'n
    moran.test(
      london_ref$TOTAL_POP, 
      nb2listw(borough_nb)
    )
    
    # Map % Remain
    spplot(london_ref, zcol = "Pct_Remain")
    
    # Run a Moran I MC test on % Remain
    moran.mc(
      london_ref$Pct_Remain, 
      nb2listw(borough_nb), 
      nsim = 999
    )
    
    # Get a summary of the data set
    summary(london)
    
    # Map the OBServed number of flu reports
    spplot(london, "Flu_OBS")
    
    # Compute and print the overall incidence of flu
    r <- sum(london$Flu_OBS) / sum(london$TOTAL_POP)
    r
    
    # Calculate the expected number for each borough
    london$Flu_EXP <- london$TOTAL_POP * r
    
    # Calculate the ratio of OBServed to EXPected
    london$Flu_SMR <- london$Flu_OBS / london$Flu_EXP
    
    # Map the SMR
    spplot(london, "Flu_SMR")
    
    # For the binomial statistics function
    library(epitools)
    
    # Get CI from binomial distribution
    flu_ci <- binom.exact(london$Flu_OBS, london$TOTAL_POP)
    
    # Add borough names
    flu_ci$NAME <- london$NAME
    
    # Calculate London rate, then compute SMR
    r <- sum(london$Flu_OBS) / sum(london$TOTAL_POP)
    flu_ci$SMR <- flu_ci$proportion / r
    
    # Subset the high SMR data
    flu_high <- flu_ci[flu_ci$SMR > 1, ]
    
    # Plot estimates with CIs
    library(ggplot2)
    ggplot(flu_high, aes(x = NAME, y = proportion / r,
                         ymin = lower / r, ymax = upper / r)) +
      geom_pointrange() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    
    # Fit a Poisson GLM.
    model_flu <- glm(
      Flu_OBS ~ HealthDeprivation, 
      offset = log(TOTAL_POP), 
      data = london, 
      family = poisson)
    
    # Is HealthDeprivation significant?
    summary(model_flu)
    
    # Put residuals into the spatial data.
    london$Flu_Resid <- residuals(model_flu)
    
    # Map the residuals using spplot
    spplot(london, "Flu_Resid")
    
    # Use R2BayesX
    library(R2BayesX)
    
    # Fit a GLM
    model_flu <- glm(Flu_OBS ~ HealthDeprivation, offset = log(TOTAL_POP),
                    data = london, family = poisson)
                        
    # Summarize it                    
    summary(model_flu)
    
    # Calculate coefficient confidence intervals
    confint(model_flu)
    
    # Fit a Bayesian GLM
    bayes_flu <- bayesx(Flu_OBS ~ HealthDeprivation, offset = log(london$TOTAL_POP), 
                        family = "poisson", data = data.frame(london), 
                        control = bayesx.control(seed = 17610407))
                        
    # Summarize it                    
    summary(bayes_flu)
    
    # Look at the samples from the Bayesian model
    plot(samples(bayes_flu))
    
    # Compute adjacency objects
    borough_nb <- poly2nb(london)
    borough_gra <- nb2gra(borough_nb)
    
    # Fit a spatial model: sx(i, bs = "spatial") adds a spatially structured
    # random effect (i is assumed here to be an index column in the data that
    # matches the rows of borough_gra)
    flu_spatial <- bayesx(
      Flu_OBS ~ HealthDeprivation + sx(i, bs = "spatial", map = borough_gra),
      offset = log(london$TOTAL_POP),
      family = "poisson", data = data.frame(london), 
      control = bayesx.control(seed = 17610407)
    )
    
    # Summarize the model
    summary(flu_spatial)
    
    # Map the fitted spatial term only
    london$spatial <- fitted(flu_spatial, term = "sx(i):mrf")[, "Mean"]
    spplot(london, zcol = "spatial")
    
    # Map the residuals
    london$spatial_resid <- residuals(flu_spatial)[, "mu"]
    spplot(london, zcol = "spatial_resid")
    
    # Test residuals for spatial correlation
    moran.mc(london$spatial_resid, nb2listw(borough_nb), nsim = 999)

    The london data set has been loaded.

    The R2BayesX package provides an interface to the BayesX code. Fit the GLM for flu as before, show its summary, and get the confidence intervals of its coefficients with confint(). Check the model coefficients and standard errors for significance.

    The syntax for bayesx() is similar, but the offset has to be specified explicitly from the data frame, the family name is in quotes, and the spatial data frame needs to be turned into a plain data frame. Run the model fitting and inspect it with summary().

    Plot the samples from the Bayesian model. On the left is the "trace" of samples in sequential order, and on the right is the parameter density. For this model there is an intercept and a slope for the Health Deprivation score; the parameter density should correspond with the parameter summary.
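    
    As a quick sanity check (a sketch, not part of the course notes; it assumes model_flu and bayes_flu have been fitted as above), the frequentist estimates and intervals from the GLM should broadly agree with the posterior Mean and 2.5%/97.5% quantiles reported by summary(bayes_flu):

    # GLM point estimates with 95% confidence intervals, side by side
    cbind(estimate = coef(model_flu), confint(model_flu))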

    Adding a spatially autocorrelated effect

    You've fitted a non-spatial GLM with BayesX. You can include a spatially correlated term based on the adjacency structure by adding a term to the formula that specifies a spatially correlated model.

    The spatial data object, london, is already loaded.

    Use poly2nb() to compute the neighborhood structure of london as an nb object. R2BayesX uses its own objects for the adjacency, so convert the nb object to a gra object. The sx() function specifies additional terms for bayesx(); create a term using the "spatial" basis and the gra object for the boroughs to define the map. Print a summary of the model object. You should see a table of coefficients for the parametric part of the model, as in the previous exercise, and then a table of "Smooth terms variance" with one row for the spatial term. Note that BayesX can fit many different forms in its sx terms, and most of them, like the spatial model here, cannot be summarized with a parameter or two. This table shows the variance of the random effects; for further explanation, consult a text on random-effects modeling.
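    
    A minimal sketch of that conversion (assuming london is loaded; the comment about the matrix layout is my reading of the gra format, so verify on your own data):

    # Build the adjacency and peek at the gra object BayesX will use
    library(spdep)
    library(R2BayesX)
    borough_nb  <- poly2nb(london)     # neighbour list from the polygons
    borough_gra <- nb2gra(borough_nb)  # convert to BayesX gra format
    dim(borough_gra)                   # square: one row/column per borough
    borough_gra[1:4, 1:4]              # neighbour counts on the diagonal,
                                       # -1 marking adjacent pairs (assumed)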

    Mapping the spatial effects

    As with glm, you can get the fitted values and residuals from your model using the fitted and residuals functions. bayesx models are a bit more complex, since you have the linear predictor and terms from sx elements, such as the spatially correlated term.

    The summary function will show you information for the linear model terms and the smoothing terms in two separate tables. The spatial term is called "sx(i):mrf" - standing for "Markov Random Field".

    Bayesian analysis returns samples from a distribution for our S(x) term at each of the London boroughs. The fitted() function for bayesx models returns summary statistics of that distribution for each borough; you'll just look at the mean for now.
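    
    Beyond the mean, fitted() for the spatial term usually carries further posterior summaries. A hypothetical sketch (the quantile column names "2.5%" and "97.5%" are an assumption; check colnames() first):

    # See which summary columns are available for the spatial term
    sp_term <- fitted(flu_spatial, term = "sx(i):mrf")
    colnames(sp_term)

    # If present, map the lower and upper posterior quantiles alongside the mean
    london$sp_lo <- sp_term[, "2.5%"]   # assumed column name
    london$sp_hi <- sp_term[, "97.5%"]  # assumed column name
    spplot(london, zcol = c("sp_lo", "sp_hi"))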

    The model from the BayesX output is available as flu_spatial.

    Get a summary of the model and see where the parameter information is. Does the Health Deprivation parameter look significant? Add a column named spatial to london with the mean of the distribution of the fitted spatial term, and map this. Add another column, named spatial_resid, with the residuals. Use the "mu" column from residuals(), as this is based on the rate rather than the number of cases, so it can be compared across areas with different populations. Plot a map of spatial_resid using spplot(). Run a Moran statistic Monte Carlo test on the residuals; do they show spatial correlation? Call moran.mc(), passing the residual vector as the first argument.
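    
    A small sketch of reading off the test result (the mc name is mine; moran.mc() returns an htest-style object):

    # Store the test and inspect the observed statistic and p-value
    mc <- moran.mc(london$spatial_resid, nb2listw(borough_nb), nsim = 999)
    mc$statistic  # observed Moran's I for the residuals
    mc$p.value    # a large p-value suggests no remaining spatial correlation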