16. Vector Data Analysis with GeoPandas#
16.1. Introduction#
16.2. Learning Objectives#
16.3. Core Concepts#
16.3.1. GeoDataFrame and GeoSeries#
16.3.2. Active Geometry Concept#
16.4. Installing GeoPandas#
# %pip install geopandas pygis
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
16.5. Creating GeoDataFrames#
16.5.1. Creating Points from Coordinate Data#
# Creating a GeoDataFrame from coordinate data
data = {
"City": ["Tokyo", "New York", "London", "Paris"],
"Latitude": [35.6895, 40.7128, 51.5074, 48.8566],
"Longitude": [139.6917, -74.0060, -0.1278, 2.3522],
}
# First create a regular pandas DataFrame
df = pd.DataFrame(data)
# Convert to GeoDataFrame by creating Point geometries from coordinates
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude))
gdf
16.6. Reading and Writing Geospatial Data#
16.6.1. Understanding Geospatial File Formats#
16.6.2. Reading a GeoJSON File#
url = "https://github.com/opengeos/datasets/releases/download/vector/nybb.geojson"
gdf = gpd.read_file(url)
gdf.head()
16.6.3. Writing Geospatial Data#
output_file = "nyc_boroughs.geojson"
gdf.to_file(output_file, driver="GeoJSON")
print(f"GeoDataFrame has been written to {output_file}")
# Save as Shapefile (traditional GIS format)
output_file = "nyc_boroughs.shp"
gdf.to_file(output_file)
# Save as GeoPackage (modern, single-file format)
output_file = "nyc_boroughs.gpkg"
gdf.to_file(output_file, driver="GPKG")
16.7. Projections and Coordinate Reference Systems (CRS)#
16.7.1. Understanding Coordinate Systems#
16.7.2. Checking and Understanding CRS#
print(f"Current CRS: {gdf.crs}")
16.7.3. Reprojecting to Different Coordinate Systems#
# Reproject to WGS84 (latitude/longitude) for global compatibility
gdf_4326 = gdf.to_crs(epsg=4326)
print(f"Reprojected CRS: {gdf_4326.crs}")
gdf_4326.head()
16.8. Spatial Measurements and Analysis#
16.8.1. Preparing Data for Accurate Measurements#
# Reproject to Web Mercator for accurate area calculations in square meters
gdf = gdf.to_crs("EPSG:3857")
# Set BoroName as index for easier data access
gdf = gdf.set_index("BoroName")
print(f"Now using CRS: {gdf.crs}")
16.8.2. Calculating Areas#
# Calculate area in square meters
gdf["area"] = gdf.area
# Convert to more readable units (square kilometers)
gdf["area_km2"] = gdf["area"] / 1_000_000
# Display results sorted by area
gdf[["area", "area_km2"]].sort_values("area_km2", ascending=False)
16.8.3. Extracting Geometric Features#
# Extract boundary lines from polygons
gdf["boundary"] = gdf.boundary
# Calculate centroids (geometric centers)
gdf["centroid"] = gdf.centroid
# Display the geometric features
gdf[["boundary", "centroid"]].head()
16.8.4. Distance Calculations#
# Use Manhattan's centroid as the reference point
manhattan_centroid = gdf.loc["Manhattan", "centroid"]
# Calculate distance from each borough centroid to Manhattan
gdf["distance_to_manhattan"] = gdf["centroid"].distance(manhattan_centroid)
# Convert to kilometers and display results
gdf["distance_to_manhattan_km"] = gdf["distance_to_manhattan"] / 1000
gdf[["distance_to_manhattan_km"]].sort_values("distance_to_manhattan_km")
16.8.5. Statistical Analysis of Spatial Data#
# Calculate summary statistics
mean_distance = gdf["distance_to_manhattan_km"].mean()
max_distance = gdf["distance_to_manhattan_km"].max()
total_area = gdf["area_km2"].sum()
print(f"Mean distance to Manhattan: {mean_distance:.2f} km")
print(f"Maximum distance to Manhattan: {max_distance:.2f} km")
print(f"Total NYC area: {total_area:.2f} km²")
16.9. Visualizing Geospatial Data#
16.9.1. Setting Up Plotting Environment#
import matplotlib.pyplot as plt
# Set high resolution for better quality plots
plt.rcParams["figure.dpi"] = 150
16.9.2. Thematic Mapping#
# Create a choropleth map showing borough areas
fig, ax = plt.subplots(figsize=(10, 6))
gdf.plot(
column="area_km2",
ax=ax,
legend=True,
cmap="YlOrRd", # Yellow-Orange-Red colormap
edgecolor="black",
linewidth=0.5,
)
plt.title("NYC Boroughs by Area (km²)", fontsize=16, fontweight="bold")
plt.axis("off") # Remove coordinate axes for cleaner appearance
plt.tight_layout()
plt.show()
16.9.3. Multi-Layer Visualization#
# Create a comprehensive map with multiple layers
fig, ax = plt.subplots(figsize=(10, 6))
# Plot borough boundaries as base layer
gdf["geometry"].plot(
ax=ax, color="lightblue", edgecolor="navy", linewidth=1.5, alpha=0.7
)
# Add centroids as point layer
gdf["centroid"].plot(
ax=ax, color="red", markersize=80, edgecolor="darkred", linewidth=1
)
# Add borough labels
for idx, row in gdf.iterrows():
# Get centroid coordinates for label placement
x = row.centroid.x
y = row.centroid.y
ax.annotate(
idx,
(x, y),
xytext=(5, 5),
textcoords="offset points",
fontsize=10,
fontweight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
)
plt.title("NYC Borough Boundaries and Centroids", fontsize=16, fontweight="bold")
plt.axis("off")
plt.tight_layout()
plt.show()
16.9.4. Interactive Visualization#
# Create an interactive map using Folium integration
m = gdf.explore(
column="area_km2",
cmap="YlOrRd",
tooltip=["area_km2", "distance_to_manhattan_km"],
popup=True,
legend=True,
)
m
16.10. Advanced Geometric Operations#
16.10.1. Buffer Analysis#
# Create 3-kilometer buffer zones around each borough
buffer_distance = 3000 # meters
gdf["buffered"] = gdf.buffer(buffer_distance)
print(f"Created {buffer_distance/1000} km buffer zones around each borough")
# Visualize original vs buffered geometries
fig, ax = plt.subplots(figsize=(10, 6))
# Plot buffered areas first (background)
gdf["buffered"].plot(
ax=ax,
alpha=0.3,
color="orange",
edgecolor="red",
linewidth=1,
label="3km Buffer Zone",
)
# Plot original geometries on top
gdf["geometry"].plot(
ax=ax,
color="lightblue",
edgecolor="navy",
linewidth=1.5,
label="Original Boundaries",
)
plt.title("NYC Boroughs: Original vs 3km Buffer Zones", fontsize=16, fontweight="bold")
plt.legend(loc="upper right")
plt.axis("off")
plt.tight_layout()
plt.show()
16.10.2. Convex Hull Analysis#
# Calculate convex hulls for each borough
gdf["convex_hull"] = gdf.convex_hull
# Compare areas between original shapes and convex hulls
gdf["convex_hull_area"] = gdf["convex_hull"].area / 1_000_000 # Convert to km²
gdf["area_ratio"] = gdf["convex_hull_area"] / gdf["area_km2"]
print("Convex Hull Analysis:")
print(gdf[["area_km2", "convex_hull_area", "area_ratio"]].round(2))
# Create comparison visualization
fig, ax = plt.subplots(figsize=(10, 6))
# Plot original geometries
gdf["geometry"].plot(
ax=ax, color="lightblue", edgecolor="navy", linewidth=2, label="Original Shape"
)
# Plot convex hulls as outlines
gdf["convex_hull"].plot(
ax=ax,
facecolor="none",
edgecolor="red",
linewidth=2,
linestyle="--",
label="Convex Hull",
)
plt.title(
"NYC Boroughs: Original Shapes vs Convex Hulls", fontsize=16, fontweight="bold"
)
plt.legend(loc="upper right")
plt.axis("off")
plt.tight_layout()
plt.show()
16.11. Spatial Relationships and Queries#
16.11.1. Intersection Analysis#
# Test which buffered boroughs intersect with Manhattan's original boundary
manhattan_geom = gdf.loc["Manhattan", "geometry"]
gdf["intersects_manhattan"] = gdf["buffered"].intersects(manhattan_geom)
gdf["touches_manhattan"] = gdf["geometry"].touches(manhattan_geom)
# Display results
intersection_results = gdf[["intersects_manhattan", "touches_manhattan"]]
intersection_results
16.11.2. Containment and Spatial Validation#
# Verify that centroids fall within their respective borough boundaries
gdf["centroid_within_borough"] = gdf["centroid"].within(gdf["geometry"])
# Check for any anomalies
anomalies = gdf[~gdf["centroid_within_borough"]]
if len(anomalies) > 0:
print("Warning: Some centroids fall outside their borough boundaries")
print(anomalies.index.tolist())
else:
print("✓ All centroids correctly fall within their borough boundaries")