Species distribution modelling in R
On this page
- Getting started
- Load libraries
- Download WorldClim climate data
- Crop to your area of interest
- Or crop to country borders
- Extract raster values to points
- Use your own tabular data
- Get data from GBIF
- Working with species occurence and environmental data tables
- Get your table into the right format
- Multi-site generalised dissimilarity modelling for a set of environmental variables and distances
Author: Sandra MacFadyen
Getting started
This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
Load libraries
# See https://rsh249.github.io/bioinformatics/spatial.html
# install.packages("ENMeval", INSTALL_opts="--no-multiarch")
library(dismo) # Species Distribution Modeling tools
library(raster) # R tools for GIS raster files >>> USE terra{} instead
library(spocc) # Access species occurrence data from GBIF/iNaturalist
library(ENMeval) # Tuning SDMs
# library(mapr) # Quick mapping tools
library(sf)
library(ggplot2)
library(geodata)
library(sfheaders)
library(dplyr)
library(tidyr) ## spread()
library(reshape2) ## dcast(), melt()
# library(gdm)
library(zetadiv)
Download WorldClim climate data
# Download the WorldClim Bioclimatic variables for the world at a 10 arc-minute resolution
bio_10m = getData('worldclim', var='bio', res=10, path='D:/Workshops/Zeta_MSGDM/Data') # Set your own path directory
# Use geodata package instead
bio_10m_rsa = worldclim_country("South Africa", var="tmin", path='D:/Workshops/Zeta_MSGDM/Data')
summary(bio_10m)
bio_10m[[1]] #BIO1 = Annual Mean Temperature
Read descriptions/definitions of Bioclimatic variables.
Crop to your area of interest
# Define 'extent' of boundary
# South Africa
rsa_ext = extent(16, 33, -35, -22)
# Crop Bioclimatic variables to extent of South African boundary
rsaExt_bio_10m = crop(bio_10m, rsa_ext)
plot(rsaExt_bio_10m[[1]]) # Basic plotting
Or crop to country borders
# Using GADM (gives a SpatialPolygonsDataFrame)
rsa = getData('GADM', country='South Africa', level=0, path='D:/Workshops/Zeta_MSGDM/Data')
# rsa = sf::st_as_sf(rsa)
# OR use geodata instead
# rsa = gadm(country="ZA", level=1, path='D:/Workshops/Zeta_MSGDM/Data') # or path=tempdir()
# Use country_codes() to find correct code
# cc = country_codes()
# View(cc)
# Basic National map
ggplot()+
geom_sf(data=sf::st_as_sf(rsa))+
ggtitle("South Africa")
# Crop Bioclimatic variables to South African border
rsa_bio_10m = crop(bio_10m, rsa)
plot(rsa_bio_10m[[1]]) # See result
# OR use geodata package instead
# rsa_bio_10m = worldclim_country("South Africa", var="tmin", path='D:/Workshops/Zeta_MSGDM/Data')
Extract raster values to points
# Create 100 random points across South Africa
random_pts = spsample(rsa, n=100, type="random")
# See randomly generate points on RSA map
plot(rsa_bio_10m[[1]])
plot(random_pts, add=T)
# Extract data to points
bio_values_1 = extract(rsa_bio_10m, random_pts)
# Use cbind.data.frame to get results as data.frame
bio_values_df = cbind.data.frame(coordinates(random_pts), bio_values_1)
head(bio_values_df)
Use your own tabular data
# Read directly from csv file
all_lepidoptera.sf = st_as_sf(read.csv('D:/Workshops/Zeta_MSGDM/Data/0097482-230224095556074.csv'), coords = c("x", "y"), crs = 4326)
plot(all_lepidoptera.sf['family'])
###############################################################################
# EXTRACT RASTER DATA TO POINT LOCALITIES
# Extract raster value by points
lepidop_enviro = raster::extract(rsa_bio_10m, all_lepidoptera.sf)
lepidop_enviro
# Combine raster values with point and save as a CSV file.
lepidop_enviro.ptData = cbind(all_lepidoptera.sf, lepidop_enviro)
# Check output
names(lepidop_enviro.ptData)
head(lepidop_enviro.ptData)
Get data from GBIF
See https://poldham.github.io/abs/gbif.html
First sign-up for a free account.
library(dplyr)
library(readr)
library(rgbif) # for occ_download
# Fill in your gbif.org credentials
user = "xxxxxx" # your gbif.org username
pwd = "xxxxxx" # your gbif.org password
email = "xxxx@xxxxx" # your email
The main functions related to downloads are:
occ_download()
: start a download on GBIF servers.
occ_download_prep()
: preview a download request before sending to GBIF.
occ_download_get()
: retrieve a download from GBIF to your computer.
occ_download_import()
: load a download from your computer to R.
gbif_download = occ_data(scientificName='Lepidoptera',
country='ZA',
hasCoordinate=TRUE,
hasGeospatialIssue=FALSE,
limit=500)
lepidoptera.df = as.data.frame(gbif_download$data)
head(lepidoptera.df)
lepidop.df = lepidoptera.df[,c(1,3,4,16,31:33,42:46)]
# head(lepidop.df)
lepidop.sf = st_as_sf(lepidop.df, coords = c("decimalLongitude", "decimalLatitude"), crs = 4326)
plot(lepidop.sf['stateProvince'])
Citing your download
If you end up using your download in a research paper, you will want to cite the download’s DOI. Please see these citation guidelines for properly citing your download. When using this dataset please use the following citation:
GBIF.org (16 March 2023) GBIF Occurrence Download https://doi.org/10.15468/dl.uvu2qm
Working with species occurence and environmental data tables
# CONVERT TO DATAFRAME
# all_lepidoptera.df = as.data.frame(all_lepidoptera.sf)
all_lepidoptera.df = as.data.frame(cbind(all_lepidoptera.sf, st_coordinates(all_lepidoptera.sf)))[,-11]
head(all_lepidoptera.df)
# lepidop_enviro.df = as.data.frame(lepidop_enviro.ptData)
lepidop_enviro.df = as.data.frame(cbind(lepidop_enviro.ptData, st_coordinates(lepidop_enviro.ptData)))[,-30]
head(lepidop_enviro.df)
# OR use sfheaders{}
# lepidop_enviro.df = sf_to_df(lepidop_enviro.ptData, fill=TRUE)
# head(lepidop_enviro.df)
# str(lepidop_enviro.df)
# summary(lepidop_enviro.df)
# SET COLUMN FORMATS
lepidop_enviro.df$nameF = as.factor(lepidop_enviro.df$scientific)
lepidop_enviro.df$dateT = as.Date(lepidop_enviro.df$date)
head(lepidop_enviro.df)
# Create unique SITE IDs
lepidop_enviro.df$unqIDF = as.factor(as.integer(as.factor(paste(lepidop_enviro.df$X,lepidop_enviro.df$Y, sep='_'))))
head(lepidop_enviro.df)
str(lepidop_enviro.df)
Get your table into the right format
# str(lepidop_enviro.df)
head(lepidop_enviro.df)
lepidop_enviro.pa =lepidop_enviro.df %>%
group_by(unqIDF, across(28:29), nameF, across(9:27)) %>%
tally() %>%
spread(nameF, n, fill=0)
head(lepidop_enviro.pa)
# View(lepidop_enviro.pa)
Multi-site generalised dissimilarity modelling for a set of environmental variables and distances
How to Compute Compositional Turnover Using Zeta Diversity
Using zetadiv: https://rdrr.io/cran/zetadiv
lepidop_enviro.pa_noNA = lepidop_enviro.pa[complete.cases(lepidop_enviro.pa), ]
Sitexy = as.data.frame(lepidop_enviro.pa_noNA[,1:3])
# xy = st_as_sf(Sitexy, coords = c("X", "Y"), crs = 4326)
xy = as.data.frame(lepidop_enviro.pa_noNA[,2:3])
envRast = rsa_bio_10m
envTab = as.data.frame(lepidop_enviro.pa_noNA[,c(1:22)])
sppTab = as.data.frame(lepidop_enviro.pa_noNA[,c(1:3,23:1109)])
# envRast
# str(envTab) # 5430 obs(sites) of 20 variables(enviro)
# str(sppTab) # 5430 obs(sites) of 1088 variables(species)
# ##site-species, table-table
# exFormat1a = formatsitepair(sppTab, 1, siteColumn="unqIDF", XColumn="X", YColumn="Y", predData=envTab)
# exFormat1a
#
# ##site-species, table-raster
# exFormat1b = formatsitepair(sppTab, 1, siteColumn="unqIDF", XColumn="X", YColumn="Y", predData=envRast)
# exFormat1b
#
# # plot(exFormat1b, plot.layout=c(2,3))
# plot(exFormat1b[[1]])
# OR USE tidyr
long = sppTab %>%
pivot_longer(!c(unqIDF,X,Y), names_to = "nameF", values_to = "value")
# Dealing with biases associated with presence-only data
#--------------------------------------
# weight by site richness
# long[complete.cases(long), ]
gdmTab = formatsitepair(long, bioFormat=2, XColumn="X", YColumn="Y",abundColumn='value',
sppColumn="nameF", siteColumn="unqIDF", predData=envTab)
gdmTab.rw = formatsitepair(long, bioFormat=2, XColumn="X", YColumn="Y",
sppColumn="nameF", siteColumn="unqIDF",abundColumn='value',
predData=envTab, weightType="richness")
# weights based on richness (number of species records)
gdmTab.rw$weights[1:5]
# remove sites with < 10 species records
gdmTab.sf = formatsitepair(long, bioFormat=2, XColumn="X", YColumn="Y",
sppColumn="nameF", siteColumn="unqIDF",abundColumn='value',
predData=envTab, sppFilter=10)
gdm
: Generalized Dissimilarity Modeling
Read more about gdm
analysis here
gdm.1 = gdm(gdmTab.sf, geo=T)
# summary(gdm.1)
str(gdm.1)
# gdm plots
#--------------------------------------------------------
length(gdm.1$predictors) # get idea of number of panels
plot(gdm.1, plot.layout=c(4,3))
gdm.1.splineDat = isplineExtract(gdm.1)
str(gdm.1.splineDat)
par(mfrow=c(1,1))
plot(gdm.1.splineDat$x[,"Geographic"], gdm.1.splineDat$y[,"Geographic"], lwd=3,
type="l", xlab="Geographic distance", ylab="Partial ecological distance")
zeta.ispline = Zeta.msgdm(sppTab[,c(4:1088)], envTab[,c(4:20)], xy, order=2,
rescale = TRUE,
rescale.pred = TRUE,
method = "mean",
normalize = "Jaccard",
reg.type = "ispline",
sam = 100)
# zeta.ispline.r = Return.ispline(zeta.ispline, envTab[,c(4:20)], distance = TRUE)
# zeta.ispline.r
dev.new()
Plot.ispline(msgdm = zeta.ispline, data.env = envTab[c(4:20)], distance = TRUE)