Converting GBIF data cubes to cloud-native data formats
Suggested citation:
Trekels M (2025). Converting GBIF data cubes in cloud native data formats. https://docs.b-cubed.eu/tutorials/conversion-cube-to-zarr/
Building cloud-native GBIF occurrence cubes
This tutorial goes step by step through the creation of two additional data formats for storing B3 data cubes. We focus on GeoParquet and Zarr as potential candidates, since both can store the geometry within the files themselves.
In this example we use the GPKG files for the Extended Quarter Degree Grid Cells (EQDGC) that were created by GBIF: https://download.gbif.org/grids/EQDGC/
Initializing the environment
Loading the Python packages
import os
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd
import xarray as xr
import s3fs
import zarr
from shapely import wkt
Setting environment variables
# File paths to the stored files
gbif_cube = "/location/of/gbif/download.csv"
geometry_file = "/location/of/grid.gpkg"
'''
REMARK: in this example we use pre-generated GPKG files of the geospatial grids.
However, it is possible to convert any geospatial file format to GPKG using GDAL
(see the sketch after this code block).
'''
# Environment variables
os.environ["AWS_ACCESS_KEY_ID"] = "your AWS access key ID"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your AWS secret access key"
# S3 region
s3_region = "region"  # e.g. eu-north-1
# Location to which the GeoParquet file needs to be stored
geoparquet_path = "/path/to/cube.parquet"
# Link to the S3 bucket to store the Zarr file
s3_path = "s3://your/S3/bucket/cubeName.zarr"
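As mentioned in the remark above, the grid does not have to be a GPKG file from the start. Below is a minimal sketch of such a conversion, assuming a hypothetical shapefile as input; GeoPandas uses the GDAL/OGR drivers under the hood, and the same conversion can also be done on the command line with the GDAL ogr2ogr utility.

# Hypothetical example: convert another geospatial format (here a shapefile) to GPKG
grid = gpd.read_file("/location/of/grid.shp")  # any format supported by GDAL/OGR
grid.to_file("/location/of/grid.gpkg", driver="GPKG")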
Loading the data into a GeoDataFrame
# Load CSV
data = pd.read_csv(gbif_cube, sep='\t')
# Load grid (GeoPackage)
qdgc_ref = gpd.read_file(geometry_file, engine='pyogrio', use_arrow=False)
# Ensure CRS is set (modify CRS if another datum is used!)
if qdgc_ref.crs is None:
    qdgc_ref.set_crs("EPSG:4326", inplace=True)
# Merge the data; in this step you need to check the columns on which to perform the matching
test_merge = pd.merge(data, qdgc_ref, left_on='eqdgccellcode', right_on='cellCode')
gdf = gpd.GeoDataFrame(test_merge, geometry='geometry')
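The merge uses pandas' default inner join, so cube records without a matching grid cell are silently dropped. A quick sanity check, assuming each cellCode occurs only once in the grid file:

# Optional sanity check: count cube records that did not match a grid cell
unmatched = len(data) - len(test_merge)
print(f"{unmatched} of {len(data)} cube records had no matching grid cell")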
Exporting the data to GeoParquet
gdf.to_parquet(geoparquet_path, geometry_encoding='WKB')
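As a quick verification (not part of the conversion itself), the GeoParquet file can be read back into a GeoDataFrame; the geometry column and CRS are restored from the file metadata:

# Read the GeoParquet file back to check the export
gdf_check = gpd.read_parquet(geoparquet_path)
print(gdf_check.crs, len(gdf_check))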
Exporting the data to Zarr in an AWS S3 bucket
# Convert to Dask DataFrame
ddf = dd.from_pandas(gdf, npartitions=max(1, gdf["specieskey"].nunique() // 1000))  # Dynamic partitioning

columns_to_compute = ["yearmonth", "eqdgccellcode", "familykey", "family", "specieskey",
                      "species", "occurrences", "distinctobservers", "familycount", "geometry"]
# Materialize the selected columns as a pandas DataFrame
pdf = ddf[columns_to_compute].compute()
# Ensure geometry is still a GeoSeries before conversion
if not isinstance(pdf["geometry"], gpd.GeoSeries):
    pdf["geometry"] = gpd.GeoSeries(pdf["geometry"], crs="EPSG:4326")
# Convert geometry column to WKT (text format for serialization)
pdf["geometry"] = pdf["geometry"].apply(lambda geom: geom.wkt if geom and geom is not None else "")
# Ensure all other columns have appropriate types
for col in pdf.columns:
    if pdf[col].dtype.name == "string[pyarrow]":
        pdf[col] = pdf[col].astype(str).fillna("")  # Convert to string and replace NaN
    elif pdf[col].dtype.kind in ['i', 'f']:
        pdf[col] = pdf[col].fillna(0)  # Replace NaN with 0 for numbers
    elif pdf[col].dtype == "object":
        pdf[col] = pdf[col].astype(str).fillna("")  # Ensure object columns are converted to string
# Convert to Xarray
ds = xr.Dataset.from_dataframe(pdf)
ds = ds.chunk({"index": 10000})  # Optimize chunking for large datasets
# S3 config (credentials are read from the environment variables set above)
s3_kwargs = {
    "key": os.getenv("AWS_ACCESS_KEY_ID"),
    "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
}
fs = s3fs.S3FileSystem(**s3_kwargs, client_kwargs={'region_name': s3_region})
# Ensure no existing file conflict
if fs.exists(s3_path):
    fs.rm(s3_path, recursive=True)
# Save to Zarr (local write first, then move to S3)
try:
    ds.to_zarr("local_temp.zarr", mode="w")
    fs.put("local_temp.zarr", s3_path, recursive=True, batch_size=50)
    print("Zarr store written to S3 successfully.")
except Exception as e:
    print(f"Error writing to Zarr: {e}")
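Once the store is on S3, the cube can be opened again with xarray without downloading it first. A minimal sketch, reusing the s3fs filesystem created above:

# Open the Zarr store directly from S3 (lazy, chunked read)
store = fs.get_mapper(s3_path)
cube = xr.open_zarr(store)
print(cube)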