Introduction¶
What Makes DuckDB Special for Geospatial Work?¶
Learning Objectives¶
Installation and Setup¶
Installing Required Packages¶
# %pip install duckdb pygis

Library Import and Configuration¶
# Configure jupysql for optimal output
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

import duckdb
import leafmap
import pandas as pd
# Import jupysql Jupyter extension for SQL magic commands
%load_ext sql

Understanding DuckDB Connections¶
# Connect to in-memory database
con = duckdb.connect()
# Connect to persistent file-based database
con = duckdb.connect('filename.db')
# Connect using SQL magic for jupyter notebooks
%sql duckdb:///:memory:

Installing and Loading Extensions¶
# Install and load essential extensions
con.install_extension("httpfs") # For remote file access
con.load_extension("httpfs")
con.install_extension("spatial") # For spatial operations
con.load_extension("spatial")

con.sql("FROM duckdb_extensions();")

con.sql("FROM duckdb_extensions();").df()

SQL Basics for Spatial Analysis¶
Understanding SQL for Geospatial Work¶
Sample Datasets¶
Reading Data Directly from URLs¶
%%sql
SELECT * FROM 'https://opengeos.org/data/duckdb/cities.csv';

%%sql
SELECT * FROM 'https://opengeos.org/data/duckdb/countries.csv';

Creating Persistent Tables¶
%%sql
CREATE OR REPLACE TABLE cities AS SELECT * FROM 'https://opengeos.org/data/duckdb/cities.csv';

%%sql
CREATE OR REPLACE TABLE countries AS SELECT * FROM 'https://opengeos.org/data/duckdb/countries.csv';

Viewing Your Data¶
%%sql
FROM cities;

%%sql
FROM countries;

Essential SQL Commands¶
Selecting Data¶
%%sql
SELECT * FROM cities LIMIT 10;

Choosing Specific Columns¶
%%sql
SELECT name, country FROM cities LIMIT 10;

Finding Unique Values¶
%%sql
SELECT DISTINCT country FROM cities LIMIT 10;

Counting and Aggregation¶
%%sql
SELECT COUNT(*) FROM cities;

%%sql
SELECT COUNT(DISTINCT country) FROM cities;

%%sql
SELECT MAX(population) FROM cities;

%%sql
SELECT MIN(population) FROM cities;

%%sql
SELECT AVG(population) FROM cities;

Filtering and Sorting Data¶
Filtering with WHERE Clauses¶
%%sql
SELECT * FROM cities WHERE population > 1000000 LIMIT 10;

Sorting Your Results¶
%%sql
SELECT * FROM cities ORDER BY population DESC LIMIT 10;

Grouping and Aggregating Data¶
%%sql
SELECT country, COUNT(*) as city_count, AVG(population) as avg_population
FROM cities
GROUP BY country
ORDER BY city_count DESC
LIMIT 10;

Python API Integration¶
Understanding Result Formats¶
# Execute SQL and get different result formats
con.sql('SELECT 42').fetchall()    # Python objects

con.sql('SELECT 42').df()          # Pandas DataFrame

con.sql('SELECT 42').fetchnumpy()  # NumPy Arrays

Seamless DataFrame Integration¶
# Query Pandas DataFrames directly
pandas_df = pd.DataFrame({'a': [42]})
con.sql('SELECT * FROM pandas_df')

# Convert remote data to DataFrame
df = con.read_csv('https://opengeos.org/data/duckdb/cities.csv').df()
df.head()

Result Conversion and Export¶
# Write results to various formats
con.sql('SELECT 42').write_parquet('out.parquet')
con.sql('SELECT 42').write_csv('out.csv')
con.sql("COPY (SELECT 42) TO 'out.parquet'")

Persistent Storage¶
# Create persistent database connection
# create a connection to a file called 'file.db'
con_persistent = duckdb.connect('file.db')
# create a table and load data into it
con_persistent.sql(
'CREATE TABLE IF NOT EXISTS cities AS FROM read_csv_auto("https://opengeos.org/data/duckdb/cities.csv")'
)
# query the table
con_persistent.table('cities').show()

# explicitly close the connection
con_persistent.close()

with duckdb.connect('file.db') as con:
con.sql(
'CREATE TABLE IF NOT EXISTS cities AS FROM read_csv_auto("https://opengeos.org/data/duckdb/cities.csv")'
)
con.table('cities').show(max_rows=5)
# the context manager closes the connection automatically

Data Import¶
Understanding Data Formats in Geospatial Work¶
Downloading Sample Data¶
url = "https://opengeos.org/data/duckdb/cities.zip"
leafmap.download_file(url, unzip=True)

Working with CSV Files¶
# Read CSV with auto-detection
con = duckdb.connect()
con.read_csv('cities.csv')

# Read CSV with specific options
con.read_csv('cities.csv', header=True, sep=',')

# Use parallel CSV reader
con.read_csv('cities.csv', parallel=True)

# Read CSV directly in SQL
con.sql("SELECT * FROM 'cities.csv'")

# Call read_csv from within SQL
con.sql("SELECT * FROM read_csv_auto('cities.csv')")

JSON Files¶
# Read JSON files
con.read_json('cities.json')

# Read JSON directly in SQL
con.sql("SELECT * FROM 'cities.json'")

# Call read_json from within SQL
con.sql("SELECT * FROM read_json_auto('cities.json')")

DataFrames¶
df = pd.read_csv('cities.csv')
df

con.sql('SELECT * FROM df').fetchall()

Parquet Files¶
# Read from a single Parquet file
con.read_parquet('cities.parquet')

# Read Parquet directly in SQL
con.sql("SELECT * FROM 'cities.parquet'")

# Call read_parquet from within SQL
con.sql("SELECT * FROM read_parquet('cities.parquet')")

Spatial Data Formats¶
con.load_extension('spatial')

con.sql('SELECT * FROM ST_Drivers()')

# Read GeoJSON
con.sql("SELECT * FROM ST_Read('cities.geojson')")

# Create spatial table from GeoJSON
con.sql("CREATE TABLE cities AS SELECT * FROM ST_Read('cities.geojson')")

con.table('cities')

con.sql("SELECT * FROM ST_Read('cities.shp')")

con.sql(
"""
CREATE TABLE IF NOT EXISTS cities2 AS
SELECT * FROM ST_Read('cities.shp')
"""
)

con.table('cities2')

Data Export¶
Sample Spatial Data Setup¶
con = duckdb.connect()
con.load_extension('spatial')
con.sql("""
CREATE TABLE IF NOT EXISTS cities AS
FROM 'https://opengeos.org/data/duckdb/cities.parquet'
""")

con.table("cities").show()

Export to DataFrames¶
con.table("cities").df()

Export to Files¶
# Export to CSV
con.sql("COPY cities TO 'cities.csv' (HEADER, DELIMITER ',')")

con.sql(
"COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.csv' (HEADER, DELIMITER ',')"
)

Export to JSON¶
con.sql("COPY cities TO 'cities.json'")

con.sql("COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.json'")

Export to Excel¶
con.sql(
"COPY (SELECT * EXCLUDE geometry FROM cities) TO 'cities.xlsx' WITH (FORMAT GDAL, DRIVER 'XLSX')"
)

Export to Parquet¶
con.sql("COPY cities TO 'cities.parquet' (FORMAT PARQUET)")

con.sql(
"COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.parquet' (FORMAT PARQUET)"
)

Export Spatial Formats¶
# Export to GeoJSON
con.sql("COPY cities TO 'cities.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')")

con.sql(
"COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')"
)

# Export to Shapefile
con.sql("COPY cities TO 'cities.shp' WITH (FORMAT GDAL, DRIVER 'ESRI Shapefile')")

# Export to GeoPackage
con.sql("COPY cities TO 'cities.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG')")

Working with Geometries¶
Understanding Spatial Data Types¶
Sample Data Setup¶
# Download NYC sample data
url = "https://opengeos.org/data/duckdb/nyc_data.db.zip"
leafmap.download_file(url, unzip=True)

# Connect to spatial database
con = duckdb.connect("nyc_data.db")
con.install_extension("spatial")
con.load_extension("spatial")

con.sql("SHOW TABLES;")

Creating and Understanding Geometries¶
con.sql("""
CREATE or REPLACE TABLE samples (name VARCHAR, geom GEOMETRY);
INSERT INTO samples VALUES
('Point', ST_GeomFromText('POINT(-100 40)')),
('Linestring', ST_GeomFromText('LINESTRING(0 0, 1 1, 2 1, 2 2)')),
('Polygon', ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))')),
('PolygonWithHole', ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0),(1 1, 1 2, 2 2, 2 1, 1 1))')),
('Collection', ST_GeomFromText('GEOMETRYCOLLECTION(POINT(2 0),POLYGON((0 0, 1 0, 1 1, 0 1, 0 0)))'));
SELECT * FROM samples;
""")

con.sql("SELECT name, ST_AsText(geom) AS geometry FROM samples;")

con.sql("""
COPY samples TO 'samples.geojson' (FORMAT GDAL, DRIVER GeoJSON);
""")

Working with Points¶
con.sql("""
SELECT ST_AsText(geom)
FROM samples
WHERE name = 'Point';
""")

con.sql("""
SELECT ST_X(geom), ST_Y(geom)
FROM samples
WHERE name = 'Point';
""")

con.sql("""
SELECT * FROM nyc_subway_stations
""")

con.sql("""
SELECT name, ST_AsText(geom)
FROM nyc_subway_stations
LIMIT 10;
""")

Working with LineStrings¶
con.sql("""
SELECT ST_AsText(geom)
FROM samples
WHERE name = 'Linestring';
""")

con.sql("""
SELECT ST_Length(geom)
FROM samples
WHERE name = 'Linestring';
""")

Working with Polygons¶
con.sql("""
SELECT ST_AsText(geom)
FROM samples
WHERE name = 'Polygon';
""")

con.sql("""
SELECT
ST_Area(geom) as area,
ST_Perimeter(geom) as perimeter
FROM samples
WHERE name = 'Polygon';
""")

Spatial Relationships¶
Understanding Spatial Predicates¶
Working with Real Spatial Data¶
# Find subway station
con.sql("""
SELECT name, geom
FROM nyc_subway_stations
WHERE name = 'Broad St';
""")

Testing Spatial Equality¶
# Test spatial equality
con.sql("""
SELECT name
FROM nyc_subway_stations
WHERE ST_Equals(geom, ST_GeomFromText('POINT (583571.9059213118 4506714.341192182)'));
""")

Point-in-Polygon Analysis¶
con.sql("FROM nyc_neighborhoods LIMIT 5")

# Find neighborhood containing a point
con.sql("""
SELECT name, boroname
FROM nyc_neighborhoods
WHERE ST_Intersects(geom, ST_GeomFromText('POINT(583571 4506714)'));
""")

Proximity Analysis with Distance¶
# Find features within distance
con.sql("""
SELECT COUNT(*) as nearby_stations
FROM nyc_subway_stations
WHERE ST_DWithin(geom, ST_GeomFromText('POINT(583571 4506714)'), 500);
""")

Spatial Joins¶
Understanding Spatial vs. Regular Joins¶
Exploring Our Data¶
con.sql("FROM nyc_neighborhoods SELECT * LIMIT 5;")

con.sql("FROM nyc_subway_stations SELECT * LIMIT 5;")

Your First Spatial Join¶
# Find subway stations and their neighborhoods
con.sql("""
SELECT
subways.name AS subway_name,
neighborhoods.name AS neighborhood_name,
neighborhoods.boroname AS borough
FROM nyc_neighborhoods AS neighborhoods
JOIN nyc_subway_stations AS subways
ON ST_Intersects(neighborhoods.geom, subways.geom)
WHERE subways.NAME = 'Broad St';
""")

Advanced Spatial Analysis¶
con.sql("""
SELECT DISTINCT COLOR FROM nyc_subway_stations;
""")

# Find RED subway stations and their neighborhoods
con.sql("""
SELECT
subways.name AS subway_name,
neighborhoods.name AS neighborhood_name,
neighborhoods.boroname AS borough
FROM nyc_neighborhoods AS neighborhoods
JOIN nyc_subway_stations AS subways
ON ST_Intersects(neighborhoods.geom, subways.geom)
WHERE subways.color = 'RED';
""")

Distance-Based Analysis¶
# Get baseline demographics
con.sql("""
SELECT
100.0 * Sum(popn_white) / Sum(popn_total) AS white_pct,
100.0 * Sum(popn_black) / Sum(popn_total) AS black_pct,
Sum(popn_total) AS popn_total
FROM nyc_census_blocks;
""")

con.sql("""
SELECT DISTINCT routes
FROM nyc_subway_stations AS subways
WHERE strpos(subways.routes,'A') > 0;
""")

# Demographics within 200 meters of A-train
con.sql("""
SELECT
100.0 * Sum(popn_white) / Sum(popn_total) AS white_pct,
100.0 * Sum(popn_black) / Sum(popn_total) AS black_pct,
Sum(popn_total) AS popn_total
FROM nyc_census_blocks AS census
JOIN nyc_subway_stations AS subways
ON ST_DWithin(census.geom, subways.geom, 200)
WHERE strpos(subways.routes,'A') > 0;
""")

Advanced Multi-Table Joins¶
# Create subway lines reference table
con.sql("""
CREATE OR REPLACE TABLE subway_lines ( route char(1) );
INSERT INTO subway_lines (route) VALUES
('A'),('B'),('C'),('D'),('E'),('F'),('G'),
('J'),('L'),('M'),('N'),('Q'),('R'),('S'),
('Z'),('1'),('2'),('3'),('4'),('5'),('6'),
('7');
""")

# Analyze demographics by subway line
con.sql("""
SELECT
lines.route,
100.0 * Sum(popn_white) / Sum(popn_total) AS white_pct,
100.0 * Sum(popn_black) / Sum(popn_total) AS black_pct,
Sum(popn_total) AS popn_total
FROM nyc_census_blocks AS census
JOIN nyc_subway_stations AS subways
ON ST_DWithin(census.geom, subways.geom, 200)
JOIN subway_lines AS lines
ON strpos(subways.routes, lines.route) > 0
GROUP BY lines.route
ORDER BY black_pct DESC;
""")

Large-Scale Data Analysis¶
Analyzing the National Wetlands Inventory¶
# Connect and prepare for large-scale analysis
con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")

# Analyze single state data
state = "DC" # Change to the US State of your choice
url = f"https://data.source.coop/giswqs/nwi/wetlands/{state}_Wetlands.parquet"
con.sql(f"SELECT * FROM '{url}'")

# Inspect the table schema
con.sql(f"DESCRIBE FROM '{url}'")

Scaling Up: Nationwide Wetland Analysis¶
Count the Total Number of Wetlands¶
con.sql(f"""
SELECT COUNT(*) AS Count
FROM 's3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet'
""")

Count Wetlands by State¶
# Count wetlands by state using filename
df = con.sql(f"""
SELECT filename, COUNT(*) AS Count
FROM read_parquet('s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet', filename=true)
GROUP BY filename
ORDER BY COUNT(*) DESC;
""").df()
df.head()

# Extract state codes from filenames
count_df = con.sql(f"""
SELECT SUBSTRING(filename, LENGTH(filename) - 18, 2) AS State, COUNT(*) AS Count
FROM read_parquet('s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet', filename=true)
GROUP BY State
ORDER BY COUNT(*) DESC;
""").df()
count_df.head(10)

# Create a wetlands table from the DataFrame
con.sql("CREATE OR REPLACE TABLE wetlands AS FROM count_df")
con.sql("FROM wetlands")

Mapping Wetland Counts by State¶
# Create states table with geometries
url = 'https://opengeos.org/data/us/us_states.parquet'
con.sql(
f"""
CREATE OR REPLACE TABLE states AS
SELECT * FROM '{url}'
"""
)

con.sql(
"""
SELECT * FROM states INNER JOIN wetlands ON states.id = wetlands.State
"""
)

# Join wetlands count with state geometries for visualization
file_path = "states_with_wetlands.geojson"
con.sql(f"""
COPY (
SELECT s.name, s.id, w.Count, s.geometry
FROM states s
JOIN wetlands w ON s.id = w.State
ORDER BY w.Count DESC
) TO '{file_path}' WITH (FORMAT GDAL, DRIVER 'GeoJSON')
""")

m = leafmap.Map()
m.add_data(
file_path, column="Count", scheme="Quantiles", cmap="Greens", legend_title="Wetland Count"
)
m

Wetland Distribution Charts¶
Pie Chart of Wetlands by State¶
leafmap.pie_chart(
count_df, "State", "Count", height=800, title="Number of Wetlands by State"
)

Bar Chart of Wetlands by State¶
leafmap.bar_chart(count_df, "State", "Count", title="Number of Wetlands by State")

Wetland Area Analysis¶
Total Wetland Area in the U.S.¶
con.sql(
f"""
SELECT SUM(Shape_Area) / 1000000 AS Area_SqKm
FROM 's3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet'
"""
)

Wetland Area by State¶
area_df = con.sql(
f"""
SELECT SUBSTRING(filename, LENGTH(filename) - 18, 2) AS State, SUM(Shape_Area) / 1000000 AS Area_SqKm
FROM read_parquet('s3://us-west-2.opendata.source.coop/giswqs/nwi/wetlands/*.parquet', filename=true)
GROUP BY State
ORDER BY COUNT(*) DESC;
"""
).df()
area_df.head(10)

Pie Chart of Wetland Area by State¶
leafmap.pie_chart(
area_df, "State", "Area_SqKm", height=850, title="Wetland Area by State"
)

Bar Chart of Wetland Area by State¶
leafmap.bar_chart(area_df, "State", "Area_SqKm", title="Wetland Area by State")

Key Takeaways¶
Fundamental Concepts You’ve Learned¶
Why This Approach is Powerful¶
Building Your Spatial Analysis Skills¶
Exercises¶
Exercise 1: Creating Tables¶
Exercise 2: Column Filtering¶
Exercise 3: Row Filtering¶
Exercise 4: Sorting Results¶
Exercise 5: Unique Values¶
Exercise 6: Counting Rows¶
Exercise 7: Aggregating Data¶
Exercise 8: Joining Tables¶
Exercise 9: String Manipulation¶
Exercise 10: Filtering with Multiple Conditions¶
Exercise 11: Basic Setup and Data Loading¶
Exercise 12: Spatial Relationships Analysis¶
Exercise 13: Advanced Spatial Joins¶
Exercise 14: Data Import and Export¶
Exercise 15: Large-Scale Analysis¶