High-Performance Geospatial Analytics with DuckDB

Contents

25. High-Performance Geospatial Analytics with DuckDB#

25.1. Introduction#

25.1.1. What Makes DuckDB Special for Geospatial Work?#

25.2. Learning Objectives#

25.3. Installation and Setup#

25.3.1. Installing Required Packages#

# %pip install duckdb pygis

25.3.2. Library Import and Configuration#

# Configure jupysql for optimal output
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
import duckdb
import leafmap
import pandas as pd

# Import jupysql Jupyter extension for SQL magic commands
%load_ext sql

25.3.3. Understanding DuckDB Connections#

# Connect to in-memory database
con = duckdb.connect()

# Connect to persistent file-based database
con = duckdb.connect("filename.db")

# Connect using SQL magic for jupyter notebooks
%sql duckdb:///:memory:

25.3.4. Installing and Loading Extensions#

# Install and load essential extensions
con.install_extension("httpfs")  # For remote file access
con.load_extension("httpfs")

con.install_extension("spatial")  # For spatial operations
con.load_extension("spatial")
con.sql("FROM duckdb_extensions();")
con.sql("FROM duckdb_extensions();").df()

25.4. SQL Basics for Spatial Analysis#

25.4.1. Understanding SQL for Geospatial Work#

25.4.2. Sample Datasets#

25.4.3. Reading Data Directly from URLs#

%%sql
SELECT * FROM 'https://opengeos.org/data/duckdb/cities.csv';
%%sql
SELECT * FROM 'https://opengeos.org/data/duckdb/countries.csv';

25.4.4. Creating Persistent Tables#

%%sql
CREATE OR REPLACE TABLE cities AS SELECT * FROM 'https://opengeos.org/data/duckdb/cities.csv';
%%sql
CREATE OR REPLACE TABLE countries AS SELECT * FROM 'https://opengeos.org/data/duckdb/countries.csv';

25.4.5. Viewing Your Data#

%%sql
FROM cities;
%%sql
FROM countries;

25.4.6. Essential SQL Commands#

25.4.6.1. Selecting Data#

%%sql
SELECT * FROM cities LIMIT 10;

25.4.6.2. Choosing Specific Columns#

%%sql
SELECT name, country FROM cities LIMIT 10;

25.4.6.3. Finding Unique Values#

%%sql
SELECT DISTINCT country FROM cities LIMIT 10;

25.4.6.4. Counting and Aggregation#

%%sql
SELECT COUNT(*) FROM cities;
%%sql
SELECT COUNT(DISTINCT country) FROM cities;
%%sql
SELECT MAX(population) FROM cities;
%%sql
SELECT MIN(population) FROM cities;
%%sql
SELECT AVG(population) FROM cities;

25.4.7. Filtering and Sorting Data#

25.4.7.1. Filtering with WHERE Clauses#

%%sql
SELECT * FROM cities WHERE population > 1000000 LIMIT 10;

25.4.7.2. Sorting Your Results#

%%sql
SELECT * FROM cities ORDER BY population DESC LIMIT 10;

25.4.7.3. Grouping and Aggregating Data#

%%sql
SELECT country, COUNT(*) as city_count, AVG(population) as avg_population
FROM cities
GROUP BY country
ORDER BY city_count DESC
LIMIT 10;

25.5. Python API Integration#

25.5.1. Understanding Result Formats#

# Execute SQL and get different result formats
con.sql("SELECT 42").fetchall()  # Python objects
con.sql("SELECT 42").df()  # Pandas DataFrame
con.sql("SELECT 42").fetchnumpy()  # NumPy Arrays

25.5.2. Seamless DataFrame Integration#

# Query Pandas DataFrames directly
pandas_df = pd.DataFrame({"a": [42]})
con.sql("SELECT * FROM pandas_df")
# Convert remote data to DataFrame
df = con.read_csv("https://opengeos.org/data/duckdb/cities.csv").df()
df.head()

25.5.3. Result Conversion and Export#

# Write results to various formats
con.sql("SELECT 42").write_parquet("out.parquet")
con.sql("SELECT 42").write_csv("out.csv")
con.sql("COPY (SELECT 42) TO 'out.parquet'")

25.5.4. Persistent Storage#

# Create persistent database connection
# create a connection to a file called 'file.db'
con_persistent = duckdb.connect("file.db")
# create a table and load data into it
con_persistent.sql(
    'CREATE TABLE IF NOT EXISTS cities AS FROM read_csv_auto("https://opengeos.org/data/duckdb/cities.csv")'
)
# query the table
con_persistent.table("cities").show()
# explicitly close the connection
con_persistent.close()
with duckdb.connect("file.db") as con:
    con.sql(
        'CREATE TABLE IF NOT EXISTS cities AS FROM read_csv_auto("https://opengeos.org/data/duckdb/cities.csv")'
    )
    con.table("cities").show(max_rows=5)
    # the context manager closes the connection automatically

25.6. Data Import#

25.6.1. Understanding Data Formats in Geospatial Work#

25.6.2. Downloading Sample Data#

url = "https://opengeos.org/data/duckdb/cities.zip"
leafmap.download_file(url, unzip=True)

25.6.3. Working with CSV Files#

# Read CSV with auto-detection
con = duckdb.connect()
con.read_csv("cities.csv")
# Read CSV with specific options
con.read_csv("cities.csv", header=True, sep=",")
# Use parallel CSV reader
con.read_csv("cities.csv", parallel=True)
# Read CSV directly in SQL
con.sql("SELECT * FROM 'cities.csv'")
# Call read_csv from within SQL
con.sql("SELECT * FROM read_csv_auto('cities.csv')")

25.6.4. JSON Files#

# Read JSON files
con.read_json("cities.json")
# Read JSON directly in SQL
con.sql("SELECT * FROM 'cities.json'")
# Call read_json from within SQL
con.sql("SELECT * FROM read_json_auto('cities.json')")

25.6.5. DataFrames#

df = pd.read_csv("cities.csv")
df
con.sql("SELECT * FROM df").fetchall()

25.6.6. Parquet Files#

# Read from a single Parquet file
con.read_parquet("cities.parquet")
# Read Parquet directly in SQL
con.sql("SELECT * FROM 'cities.parquet'")
# Call read_parquet from within SQL
con.sql("SELECT * FROM read_parquet('cities.parquet')")

25.6.7. Spatial Data Formats#

con.load_extension("spatial")
con.sql("SELECT * FROM ST_Drivers()")
# Read GeoJSON
con.sql("SELECT * FROM ST_Read('cities.geojson')")
# Create spatial table from GeoJSON
con.sql("CREATE TABLE cities AS SELECT * FROM ST_Read('cities.geojson')")
con.table("cities")
con.sql("SELECT * FROM ST_Read('cities.shp')")
con.sql(
    """
        CREATE TABLE IF NOT EXISTS cities2 AS
        SELECT * FROM ST_Read('cities.shp')
        """
)
con.table("cities2")

25.7. Data Export#

25.7.1. Sample Spatial Data Setup#

con = duckdb.connect()
con.load_extension("spatial")
con.sql(
    """
CREATE TABLE IF NOT EXISTS cities AS
FROM 'https://opengeos.org/data/duckdb/cities.parquet'
"""
)
con.table("cities").show()

25.7.2. Export to DataFrames#

con.table("cities").df()

25.7.3. Export to Files#

# Export to CSV
con.sql("COPY cities TO 'cities.csv' (HEADER, DELIMITER ',')")
con.sql(
    "COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.csv' (HEADER, DELIMITER ',')"
)

25.7.4. Export to JSON#

con.sql("COPY cities TO 'cities.json'")
con.sql("COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.json'")

25.7.5. Export to Excel#

con.sql(
    "COPY (SELECT * EXCLUDE geometry FROM cities) TO 'cities.xlsx' WITH (FORMAT GDAL, DRIVER 'XLSX')"
)

25.7.6. Export to Parquet#

con.sql("COPY cities TO 'cities.parquet' (FORMAT PARQUET)")
con.sql(
    "COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.parquet' (FORMAT PARQUET)"
)

25.7.7. Export Spatial Formats#

# Export to GeoJSON
con.sql("COPY cities TO 'cities.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')")
con.sql(
    "COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')"
)
# Export to Shapefile
con.sql("COPY cities TO 'cities.shp' WITH (FORMAT GDAL, DRIVER 'ESRI Shapefile')")
# Export to GeoPackage
con.sql("COPY cities TO 'cities.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG')")

25.8. Working with Geometries#

25.8.1. Understanding Spatial Data Types#

25.8.2. Sample Data Setup#

# Download NYC sample data
url = "https://opengeos.org/data/duckdb/nyc_data.db.zip"
leafmap.download_file(url, unzip=True)
# Connect to spatial database
con = duckdb.connect("nyc_data.db")
con.install_extension("spatial")
con.load_extension("spatial")
con.sql("SHOW TABLES;")

25.8.3. Creating and Understanding Geometries#

con.sql(
    """

CREATE or REPLACE TABLE samples (name VARCHAR, geom GEOMETRY);

INSERT INTO samples VALUES
  ('Point', ST_GeomFromText('POINT(-100 40)')),
  ('Linestring', ST_GeomFromText('LINESTRING(0 0, 1 1, 2 1, 2 2)')),
  ('Polygon', ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))')),
  ('PolygonWithHole', ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0),(1 1, 1 2, 2 2, 2 1, 1 1))')),
  ('Collection', ST_GeomFromText('GEOMETRYCOLLECTION(POINT(2 0),POLYGON((0 0, 1 0, 1 1, 0 1, 0 0)))'));

SELECT * FROM samples;

  """
)
con.sql("SELECT name, ST_AsText(geom) AS geometry FROM samples;")
con.sql(
    """
COPY samples TO 'samples.geojson' (FORMAT GDAL, DRIVER GeoJSON);
"""
)

25.8.4. Working with Points#

con.sql(
    """
SELECT ST_AsText(geom)
  FROM samples
  WHERE name = 'Point';
"""
)
con.sql(
    """
SELECT ST_X(geom), ST_Y(geom)
  FROM samples
  WHERE name = 'Point';
"""
)
con.sql(
    """
SELECT * FROM nyc_subway_stations
"""
)
con.sql(
    """
SELECT name, ST_AsText(geom)
  FROM nyc_subway_stations
  LIMIT 10;
"""
)

25.8.5. Working with LineStrings#

con.sql(
    """
SELECT ST_AsText(geom)
  FROM samples
  WHERE name = 'Linestring';
"""
)
con.sql(
    """
SELECT ST_Length(geom)
  FROM samples
  WHERE name = 'Linestring';
"""
)

25.8.6. Working with Polygons#

con.sql(
    """
SELECT ST_AsText(geom)
  FROM samples
  WHERE name = 'Polygon';
"""
)
con.sql(
    """
SELECT
    ST_Area(geom) as area,
    ST_Perimeter(geom) as perimeter
FROM samples
WHERE name = 'Polygon';
"""
)

25.9. Spatial Relationships#

25.9.1. Understanding Spatial Predicates#

25.9.2. Working with Real Spatial Data#

# Find subway station
con.sql(
    """
SELECT name, geom
FROM nyc_subway_stations
WHERE name = 'Broad St';
"""
)

25.9.3. Testing Spatial Equality#

# Test spatial equality
con.sql(
    """
SELECT name
FROM nyc_subway_stations
WHERE ST_Equals(geom, ST_GeomFromText('POINT (583571.9059213118 4506714.341192182)'));
"""
)

25.9.4. Point-in-Polygon Analysis#

con.sql("FROM nyc_neighborhoods LIMIT 5")
# Find neighborhood containing a point
con.sql(
    """
SELECT name, boroname
FROM nyc_neighborhoods
WHERE ST_Intersects(geom, ST_GeomFromText('POINT(583571 4506714)'));
"""
)

25.9.5. Proximity Analysis with Distance#

# Find features within distance
con.sql(
    """
SELECT COUNT(*) as nearby_stations
FROM nyc_subway_stations
WHERE ST_DWithin(geom, ST_GeomFromText('POINT(583571 4506714)'), 500);
"""
)

25.10. Spatial Joins#

25.10.1. Understanding Spatial vs. Regular Joins#

25.10.2. Exploring Our Data#

con.sql("FROM nyc_neighborhoods SELECT * LIMIT 5;")
con.sql("FROM nyc_subway_stations SELECT * LIMIT 5;")

25.10.3. Your First Spatial Join#

# Find subway stations and their neighborhoods
con.sql(
    """
SELECT
  subways.name AS subway_name,
  neighborhoods.name AS neighborhood_name,
  neighborhoods.boroname AS borough
FROM nyc_neighborhoods AS neighborhoods
JOIN nyc_subway_stations AS subways
ON ST_Intersects(neighborhoods.geom, subways.geom)
WHERE subways.NAME = 'Broad St';
"""
)

25.10.4. Advanced Spatial Analysis#

con.sql(
    """
SELECT DISTINCT COLOR FROM nyc_subway_stations;
"""
)
# Find RED subway stations and their neighborhoods
con.sql(
    """
SELECT
  subways.name AS subway_name,
  neighborhoods.name AS neighborhood_name,
  neighborhoods.boroname AS borough
FROM nyc_neighborhoods AS neighborhoods
JOIN nyc_subway_stations AS subways
ON ST_Intersects(neighborhoods.geom, subways.geom)
WHERE subways.color = 'RED';
"""
)

25.10.5. Distance-Based Analysis#

# Get baseline demographics
con.sql(
    """
SELECT
  100.0 * Sum(popn_white) / Sum(popn_total) AS white_pct,
  100.0 * Sum(popn_black) / Sum(popn_total) AS black_pct,
  Sum(popn_total) AS popn_total
FROM nyc_census_blocks;
"""
)
con.sql(
    """
SELECT DISTINCT routes
FROM nyc_subway_stations AS subways
WHERE strpos(subways.routes,'A') > 0;
"""
)
# Demographics within 200 meters of A-train
con.sql(
    """
SELECT
  100.0 * Sum(popn_white) / Sum(popn_total) AS white_pct,
  100.0 * Sum(popn_black) / Sum(popn_total) AS black_pct,
  Sum(popn_total) AS popn_total
FROM nyc_census_blocks AS census
JOIN nyc_subway_stations AS subways
ON ST_DWithin(census.geom, subways.geom, 200)
WHERE strpos(subways.routes,'A') > 0;
"""
)

25.10.6. Advanced Multi-Table Joins#

# Create subway lines reference table
con.sql(
    """
CREATE OR REPLACE TABLE subway_lines ( route char(1) );
INSERT INTO subway_lines (route) VALUES
  ('A'),('B'),('C'),('D'),('E'),('F'),('G'),
  ('J'),('L'),('M'),('N'),('Q'),('R'),('S'),
  ('Z'),('1'),('2'),('3'),('4'),('5'),('6'),
  ('7');
"""
)
# Analyze demographics by subway line
con.sql(
    """
SELECT
  lines.route,
  100.0 * Sum(popn_white) / Sum(popn_total) AS white_pct,
  100.0 * Sum(popn_black) / Sum(popn_total) AS black_pct,
  Sum(popn_total) AS popn_total
FROM nyc_census_blocks AS census
JOIN nyc_subway_stations AS subways
ON ST_DWithin(census.geom, subways.geom, 200)
JOIN subway_lines AS lines
ON strpos(subways.routes, lines.route) > 0
GROUP BY lines.route
ORDER BY black_pct DESC;
"""
)

25.11. Large-Scale Data Analysis#

25.11.1. Analyzing the National Wetlands Inventory#

# Connect and prepare for large-scale analysis
con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")
# Analyze single state data
state = "DC"  # Change to the US State of your choice
url = f"https://data.source.coop/giswqs/nwi/wetlands/{state}_Wetlands.parquet"
con.sql(f"SELECT * FROM '{url}'")
# Inspect the table schema
con.sql(f"DESCRIBE FROM '{url}'")

25.11.2. Scaling Up: Nationwide Wetland Analysis#

25.11.2.1. Count the Total Number of Wetlands#

con.sql(
    f"""
SELECT COUNT(*) AS Count
FROM 's3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet'
"""
)

25.11.2.2. Count Wetlands by State#

# Count wetlands by state using filename
df = con.sql(
    f"""
SELECT filename, COUNT(*) AS Count
FROM read_parquet('s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet', filename=true)
GROUP BY filename
ORDER BY COUNT(*) DESC;
"""
).df()
df.head()
# Extract state codes from filenames
count_df = con.sql(
    f"""
SELECT SUBSTRING(filename, LENGTH(filename) - 18, 2) AS State, COUNT(*) AS Count
FROM read_parquet('s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet', filename=true)
GROUP BY State
ORDER BY COUNT(*) DESC;
"""
).df()
count_df.head(10)
# Create a wetlands table from the DataFrame
con.sql("CREATE OR REPLACE TABLE wetlands AS FROM count_df")
con.sql("FROM wetlands")

25.11.3. Mapping Wetland Counts by State#

# Create states table with geometries
url = "https://opengeos.org/data/us/us_states.parquet"
con.sql(
    f"""
CREATE OR REPLACE TABLE states AS
SELECT * FROM '{url}'
"""
)
con.sql(
    """
SELECT * FROM states INNER JOIN wetlands ON states.id = wetlands.State
"""
)
# Join wetlands count with state geometries for visualization
file_path = "states_with_wetlands.geojson"
con.sql(
    f"""
    COPY (
    SELECT s.name, s.id, w.Count, s.geometry
    FROM states s
    JOIN wetlands w ON s.id = w.State
    ORDER BY w.Count DESC
    ) TO '{file_path}' WITH (FORMAT GDAL, DRIVER 'GeoJSON')
"""
)
m = leafmap.Map()
m.add_data(
    file_path,
    column="Count",
    scheme="Quantiles",
    cmap="Greens",
    legend_title="Wetland Count",
)
m

25.11.4. Wetland Distribution Charts#

25.11.4.1. Pie Chart of Wetlands by State#

leafmap.pie_chart(
    count_df, "State", "Count", height=800, title="Number of Wetlands by State"
)

25.11.4.2. Bar Chart of Wetlands by State#

leafmap.bar_chart(count_df, "State", "Count", title="Number of Wetlands by State")

25.11.5. Wetland Area Analysis#

25.11.5.1. Total Wetland Area in the U.S.#

con.sql(
    f"""
SELECT SUM(Shape_Area) /  1000000 AS Area_SqKm
FROM 's3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet'
"""
)

25.11.5.2. Wetland Area by State#

area_df = con.sql(
    f"""
SELECT SUBSTRING(filename, LENGTH(filename) - 18, 2) AS State, SUM(Shape_Area) /  1000000 AS Area_SqKm
FROM read_parquet('s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet', filename=true)
GROUP BY State
ORDER BY COUNT(*) DESC;
"""
).df()
area_df.head(10)

25.11.5.3. Pie Chart of Wetland Area by State#

leafmap.pie_chart(
    area_df, "State", "Area_SqKm", height=850, title="Wetland Area by State"
)

25.11.5.4. Bar Chart of Wetland Area by State#

leafmap.bar_chart(area_df, "State", "Area_SqKm", title="Wetland Area by State")

25.12. Key Takeaways#

25.12.1. Fundamental Concepts You’ve Learned#

25.12.2. Why This Approach is Powerful#

25.12.3. Building Your Spatial Analysis Skills#

25.13. Exercises#

25.13.1. Exercise 1: Creating Tables#

25.13.2. Exercise 2: Column Filtering#

25.13.3. Exercise 3: Row Filtering#

25.13.4. Exercise 4: Sorting Results#

25.13.5. Exercise 5: Unique Values#

25.13.6. Exercise 6: Counting Rows#

25.13.7. Exercise 7: Aggregating Data#

25.13.8. Exercise 8: Joining Tables#

25.13.9. Exercise 9: String Manipulation#

25.13.10. Exercise 10: Filtering with Multiple Conditions#

25.13.11. Exercise 11: Basic Setup and Data Loading#

25.13.12. Exercise 12: Spatial Relationships Analysis#

25.13.13. Exercise 13: Advanced Spatial Joins#

25.13.14. Exercise 14: Data Import and Export#

25.13.15. Exercise 15: Large-Scale Analysis#