import xarray as xr
import rasterio
import rasterio.mask
import geopandas as gpd
import matplotlib.pyplot as plt
import pygris
import numpy as np
# Fetch Detroit metro area counties using pygris (TIGER/Line county polygons)
metro_counties = pygris.counties(state="MI", year=2022)
detroit_metro = metro_counties[metro_counties['NAME'].isin([
    'Wayne', 'Oakland', 'Macomb', 'Livingston', 'St. Clair', 'Lapeer'
])]

# Dissolve the counties into a single polygon (one row keyed by state FIPS)
detroit_metro = detroit_metro.dissolve(by='STATEFP')

# Convert to GeoDataFrame with an explicit CRS (NAD83, the TIGER/Line native CRS)
detroit_metro = gpd.GeoDataFrame(detroit_metro, geometry='geometry', crs='EPSG:4269')

# Specify the TIF files (overall SVI plus its four theme sub-layers)
tif_files = [
    "data/svi/svi_2020_tract_overall_wgs84.tif",
    "data/svi/svi_2020_tract_minority_wgs84.tif",
    "data/svi/svi_2020_tract_socioeconomic_wgs84.tif",
    "data/svi/svi_2020_tract_housing_wgs84.tif",
    "data/svi/svi_2020_tract_household_wgs84.tif"
]

# Create an empty list to store the individual DataArrays
data_arrays = []

# Read each TIF file, clip it to Detroit metro's extent, and append it to the list
for file in tif_files:
    with rasterio.open(file) as src:
        # Reproject Detroit metro boundary to match the raster CRS
        metro_reprojected = detroit_metro.to_crs(src.crs)

        # Clip the raster to Detroit metro's geometry
        out_image, out_transform = rasterio.mask.mask(src, metro_reprojected.geometry, crop=True)
        out_meta = src.meta.copy()

        # Update the metadata to describe the clipped (smaller) raster
        out_meta.update({"driver": "GTiff",
                         "height": out_image.shape[1],
                         "width": out_image.shape[2],
                         "transform": out_transform})

        # Create coordinates: one (row, col) pair per cell, converted to map x/y
        height = out_meta['height']
        width = out_meta['width']
        cols, rows = np.meshgrid(np.arange(width), np.arange(height))
        xs, ys = rasterio.transform.xy(out_transform, rows, cols)

        # Convert lists to numpy arrays
        xs = np.array(xs)
        ys = np.array(ys)

        # Reshape coordinates to match dimensions of the raster
        xs = xs.reshape(height, width)
        ys = ys.reshape(height, width)

        # Create a DataArray from the clipped data
        da = xr.DataArray(out_image[0],  # Use the first band
                          coords={'y': ('y', ys[:, 0]),
                                  'x': ('x', xs[0, :])},
                          dims=['y', 'x'])
        da.attrs['crs'] = str(src.crs)  # Convert CRS to string
        da.attrs['transform'] = out_transform

        data_arrays.append(da)

# Combine all DataArrays into a single DataArray stacked along a new 'layer' dim
ds = xr.concat(data_arrays, dim='layer')

# Rename the layers using the appropriate dimension (order matches tif_files)
ds = ds.assign_coords(layer=('layer', ['Overall', 'Minority', 'Socioeconomic', 'Housing', 'Household']))

# Define the colorbar limits (SVI percentile ranks are 0-1)
vmin, vmax = 0, 1

# Create a multipanel plot (3x2 grid; the 6th panel is removed below)
fig, axes = plt.subplots(3, 2, figsize=(15, 20))
axes = axes.flatten()

# Plot each layer
for i, layer in enumerate(ds.layer.values):
    # Plot with custom color limits
    im = ds.sel(layer=layer).plot(ax=axes[i], add_colorbar=False, vmin=vmin, vmax=vmax, cmap='viridis')
    axes[i].set_title(layer)
    # Plot Detroit metro boundary
    metro_reprojected.boundary.plot(ax=axes[i], color='red', linewidth=1)

# Remove the extra subplot (5 layers, 6 panels)
fig.delaxes(axes[5])

# Add a single colorbar shared by all panels
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
cbar = fig.colorbar(im, cax=cbar_ax, label='SVI Score')

plt.tight_layout()
plt.show()
SVI, TRI, and Health Outcomes
Overview
In this lesson, you will use….
Learning Objectives
After completing this lesson, you should be able to:
- Manipulate and process raster data, mainly loading and cropping.
- Explore SVI raster data by mapping and processing its sub-datasets (overall/racial/socioeconomic).
- Connect to APIs for pollution and public health data.
- Geolocate tabular data with latitude and longitude.
- Join tabular data to spatial data (polygons).
- Connect to the OpenStreetMap API for basemaps.
- Create regional basemaps.
- Layer spatial data on basemaps: semi-transparent SVI, TRI points, PLACES boundaries.
- Create maps with graduated point symbols for point-source pollution releases.
- Create choropleths with census tract polygons for health outcomes data.
- Compute zonal statistics with census tracts for:
  Mean SVI values
  Health outcome summaries weighted by SVI.
- (Maybe) Interpolate a pollution load raster surface from point data.
- Perform spatial correlation analysis between SVI, point-source pollution, and health outcomes.
Introduction
Air Quality and Environmental Justice:
In many urban areas, air quality issues disproportionately affect low-income communities and communities of color. This disparity is a key focus of environmental justice efforts. In cities like Detroit and Chicago, industrial facilities, highways, and other pollution sources are often concentrated in disadvantaged neighborhoods.
NIMBY and Environmental Justice:
The NIMBY phenomenon can sometimes conflict with environmental justice goals. While wealthier communities may successfully oppose new polluting facilities in their areas, this can lead to these facilities being placed in less affluent neighborhoods with less political clout, exacerbating environmental injustices.
Community-Driven Science:
In response to these challenges, community-driven science initiatives have emerged. These efforts involve local residents in collecting data on air quality and other environmental factors, often using low-cost sensors and mobile monitoring techniques. This approach helps fill gaps in official monitoring networks and empowers communities to advocate for themselves.
Detroit and Chicago Examples:
Detroit: The city has a history of industrial pollution and is working to address air quality issues in areas like Southwest Detroit, where residents face higher exposure to pollutants from nearby industries and heavy truck traffic.
Chicago: The city has seen community efforts to address air pollution in areas like the Southeast Side, where residents have fought against polluting industries and advocated for stricter regulations.
These topics are interconnected, as community-driven science often arises in response to environmental justice concerns, while NIMBY attitudes can influence the distribution of pollution sources across urban areas.
EPA (Environmental Protection Agency):
The EPA is the primary federal agency responsible for environmental protection in the United States. It sets and enforces air quality standards, including the National Ambient Air Quality Standards (NAAQS) for six criteria pollutants. The EPA also maintains the AirNow system, which provides real-time air quality data to the public. CDC (Centers for Disease Control and Prevention): While primarily focused on public health, the CDC plays a crucial role in understanding the health impacts of air pollution. It conducts research on the relationships between air quality and various health outcomes, and provides guidance on protecting public health from environmental hazards. Importance of Open Data: Open data is crucial for community-driven solutions in several ways:
Transparency: Open data allows communities to verify official reports and hold authorities accountable. Accessibility: When air quality data is freely available, communities can use it to inform local decision-making and advocacy efforts. Innovation: Open data enables researchers, activists, and tech developers to create new tools and analyses that can benefit communities. Collaboration: Open data facilitates collaboration between communities, scientists, and policymakers, leading to more effective solutions.
Examples of Open Data in Action:
The EPA’s Air Quality System (AQS) database is publicly accessible, allowing researchers and community groups to analyze historical air quality trends.
In Chicago, the Array of Things project has installed sensors throughout the city, providing open data on various environmental factors including air quality.
Detroit’s Community Air Monitoring Project uses low-cost sensors to collect and share air quality data in areas underserved by official monitoring stations.
Community-Driven Solutions: With access to open data, communities can:
Identify local air quality hotspots that may be missed by sparse official monitoring networks.
Correlate air quality data with health outcomes to strengthen advocacy efforts.
Develop targeted interventions, such as promoting indoor air filtration on high-pollution days.
Create custom alerts and information systems tailored to local needs.
Challenges and Opportunities: While open data provides many benefits, challenges remain:
Ensuring data quality and consistency, especially when integrating data from various sources.
Addressing the “digital divide” to ensure all community members can access and use the data. Balancing the need for detailed local data with privacy concerns.
Building capacity within communities to effectively use and interpret complex environmental data.
The EPA, CDC, and other agencies are increasingly recognizing the value of community-driven science and open data. This has led to initiatives like the EPA’s Community-Focused Exposure and Risk Screening Tool (C-FERST) and the CDC’s Environmental Public Health Tracking Network, which aim to make environmental health data more accessible to communities.
EGLE and Environmental Justice in Detroit
Detroit, Michigan has seen significant environmental justice efforts, particularly in collaboration with EGLE (Michigan Department of Environment, Great Lakes, and Energy). Here are some examples:
Detroit Environmental Agenda: This community-led initiative works closely with EGLE to address environmental concerns in Detroit. It focuses on issues like air quality, water quality, and waste management.
48217 Community Air Monitoring Project: Named after the zip code of a heavily industrialized area in Southwest Detroit, this project involves community members working with EGLE to monitor air quality using low-cost sensors.
Detroit Climate Action Plan: Developed in partnership with EGLE, this plan addresses climate change impacts on the city, with a focus on vulnerable communities.
Delray Neighborhood Initiatives: EGLE has been involved in efforts to address air quality concerns in the Delray neighborhood, which is impacted by industrial emissions and heavy truck traffic.
Green Door Initiative: This Detroit-based organization collaborates with EGLE on various environmental justice projects, including lead abatement and air quality improvement efforts.
Detroit River Sediment Cleanup: EGLE has been involved in efforts to clean up contaminated sediments in the Detroit River, which disproportionately affects nearby low-income communities.
Asthma Prevention Programs: EGLE supports community-based asthma prevention programs in Detroit, recognizing the link between air quality and asthma rates in disadvantaged neighborhoods.
These examples demonstrate the ongoing collaboration between community groups, local government, and EGLE to address environmental justice concerns in Detroit. However, as with many environmental justice efforts, challenges remain and work is ongoing.
Load the Data
SVI and Detroit Metro
SVI has a primary overall SVI score, but also provides sublayers. These include minority, socioeconomic, housing, and household data.
TRI API Column Definitions
| Column Name | Description |
|---|---|
| FACILITY_NAME | The name of the facility reporting to TRI |
| TRI_FACILITY_ID | A unique identifier for the facility in the TRI database |
| STREET_ADDRESS | The street address of the facility |
| CITY_NAME | The city where the facility is located |
| COUNTY_NAME | The county where the facility is located |
| STATE_ABBR | The two-letter abbreviation for the state where the facility is located |
| ZIP_CODE | The ZIP code of the facility’s location |
| PREF_LATITUDE | The preferred latitude coordinate of the facility |
| PREF_LONGITUDE | The preferred longitude coordinate of the facility |
| PARENT_CO_NAME | The name of the parent company, if applicable |
| INDUSTRY_SECTOR_CODE | A code representing the industry sector of the facility |
| INDUSTRY_SECTOR | A description of the industry |
Note: The availability and exact names of these columns may vary depending on the specific TRI API endpoint and query parameters used. Always refer to the official EPA TRI documentation for the most up-to-date and comprehensive information.
Important Considerations
- Not all columns may be present in every API response.
- Column names may have slight variations (e.g., with or without underscores).
- The EPA occasionally updates their API and data structure.
- Some columns related to chemical releases and waste management may have additional variations or breakdowns.
- Numeric values (like release amounts) are typically reported in pounds, but always verify the units.
- For coordinates (`PREF_LATITUDE` and `PREF_LONGITUDE`), be aware that these are the preferred coordinates, which may have been adjusted for accuracy or privacy reasons.
ENVIROFACTS TRI Facilities w/ API
import xarray as xr
import rasterio
import rasterio.mask
import geopandas as gpd
import matplotlib.pyplot as plt
import pygris
import numpy as np
from shapely.geometry import box
import pandas as pd
import requests
import contextily as ctx
# Fetch Detroit metro area counties using pygris
metro_counties = pygris.counties(state="MI", year=2022)
detroit_metro = metro_counties[metro_counties['NAME'].isin([
    'Wayne', 'Oakland', 'Macomb', 'Livingston', 'St. Clair', 'Lapeer'
])]

# Dissolve the counties into a single polygon
detroit_metro = detroit_metro.dissolve(by='STATEFP')

# Get the bounding box (minx, miny, maxx, maxy)
bbox = detroit_metro.total_bounds

# Print the bounding box coordinates
print("Bounding Box:")
print(f"Minimum X (Longitude): {bbox[0]}")
print(f"Minimum Y (Latitude): {bbox[1]}")
print(f"Maximum X (Longitude): {bbox[2]}")
print(f"Maximum Y (Latitude): {bbox[3]}")

# Create the bounding box as a polygon
bbox_polygon = gpd.GeoDataFrame(
    geometry=[box(*bbox)],
    crs=detroit_metro.crs
)

# Fetch TRI facility data from the EPA Envirofacts REST API for each county
counties = ['Wayne', 'Oakland', 'Macomb', 'Livingston', 'St. Clair', 'Lapeer']
tri_data = []

for county in counties:
    api_url = f"https://data.epa.gov/efservice/tri_facility/state_abbr/MI/county_name/{county}/JSON"
    response = requests.get(api_url)
    if response.status_code == 200:
        county_data = response.json()
        tri_data.extend(county_data)
    else:
        print(f"Failed to fetch data for {county} County. Status code: {response.status_code}")

# Convert TRI data to a DataFrame
tri_df = pd.DataFrame(tri_data)
print(f"Number of facilities fetched: {len(tri_df)}")

# Create a copy of the dataframe to avoid SettingWithCopyWarning
tri_df_clean = tri_df.copy()

# Remove facilities with empty latitude or longitude values
tri_df_clean = tri_df_clean.dropna(subset=['pref_latitude', 'pref_longitude'])
print(f"Number of facilities after removing empty coordinates: {len(tri_df_clean)}")

# Convert latitude and longitude to numeric type
tri_df_clean['pref_latitude'] = pd.to_numeric(tri_df_clean['pref_latitude'], errors='coerce')
tri_df_clean['pref_longitude'] = pd.to_numeric(tri_df_clean['pref_longitude'], errors='coerce')


def correct_longitude(lon):
    """Force longitudes into the western hemisphere (Michigan longitudes are negative)."""
    if lon > 0:
        return -lon
    return lon


# Apply longitude correction
tri_df_clean['pref_longitude'] = tri_df_clean['pref_longitude'].apply(correct_longitude)

# Calculate IQR for longitude to drop badly geocoded points
Q1 = tri_df_clean['pref_longitude'].quantile(0.25)
Q3 = tri_df_clean['pref_longitude'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds for outliers (standard 1.5 * IQR fence)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
tri_df_clean = tri_df_clean[(tri_df_clean['pref_longitude'] >= lower_bound) &
                            (tri_df_clean['pref_longitude'] <= upper_bound)]
print(f"Number of facilities after removing longitude outliers: {len(tri_df_clean)}")

# Create a GeoDataFrame from the cleaned TRI data (x = longitude, y = latitude)
detroit_tri = gpd.GeoDataFrame(
    tri_df_clean,
    geometry=gpd.points_from_xy(tri_df_clean.pref_longitude, tri_df_clean.pref_latitude),
    crs="EPSG:4326"
)

# Reproject data to Web Mercator for contextily basemap tiles
detroit_metro = detroit_metro.to_crs(epsg=3857)
bbox_polygon = bbox_polygon.to_crs(epsg=3857)
detroit_tri = detroit_tri.to_crs(epsg=3857)

# Create the plot
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the metro area and bounding box
detroit_metro.plot(ax=ax, color='lightblue', edgecolor='black', alpha=0.5)
bbox_polygon.boundary.plot(ax=ax, color='red', linewidth=2)

# Plot TRI facilities
detroit_tri.plot(ax=ax, color='red', markersize=50, alpha=0.7)

# Add the basemap
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

# Set the extent of the map to the bounding box
ax.set_xlim(bbox_polygon.total_bounds[0], bbox_polygon.total_bounds[2])
ax.set_ylim(bbox_polygon.total_bounds[1], bbox_polygon.total_bounds[3])

# Remove axes
ax.set_axis_off()
plt.title("Detroit Metro Area TRI Facilities", fontsize=16)

plt.tight_layout()
plt.show()

print(f"Final number of TRI facilities in the Detroit metro area: {len(detroit_tri)}")
Regulated Facilities w/ ICIS-AIR
import pandas as pd
import requests
import time
# Base URL for ECHO ICIS-AIR API
base_url = "https://echodata.epa.gov/echo/air_rest_services"

# Parameters for the initial API call (all Michigan facilities)
params = {
    "output": "JSON",
    "p_st": "MI"
}


def get_facilities():
    """Start an ECHO query and return its Query ID (QID), or None on failure."""
    response = requests.get(f"{base_url}.get_facilities", params=params)
    if response.status_code == 200:
        data = response.json()
        if 'Results' in data:
            qid = data['Results']['QueryID']
            print(f"Query ID: {qid}")
            print(f"Total Facilities: {data['Results']['QueryRows']}")
            return qid
    print("Failed to get facilities and QID")
    return None


def get_facility_data(qid):
    """Page through the stored query results for `qid` and return all facility dicts."""
    all_facilities = []
    page = 1
    while True:
        params = {"qid": qid, "pageno": page, "output": "JSON"}
        response = requests.get(f"{base_url}.get_qid", params=params)
        if response.status_code == 200:
            data = response.json()
            if 'Results' in data and 'Facilities' in data['Results']:
                facilities = data['Results']['Facilities']
                if not facilities:  # No more facilities to retrieve
                    break
                all_facilities.extend(facilities)
                print(f"Retrieved page {page}")
                page += 1
            else:
                break
        else:
            print(f"Failed to retrieve page {page}")
            break
    return all_facilities


# Step 1: Get the Query ID
qid = get_facilities()

if qid:
    # Step 2: Use get_qid to retrieve all facility data
    print("Retrieving facility data...")
    facilities = get_facility_data(qid)

    # Convert to DataFrame
    df_icis_air = pd.DataFrame(facilities)

    print(f"\nSuccessfully retrieved {len(df_icis_air)} ICIS-AIR facilities for Michigan")
    print("\nColumns in the dataset:")
    print(df_icis_air.columns)

    # Display the first few rows
    print("\nFirst few rows of the data:")
    print(df_icis_air.head())
else:
    print("Failed to retrieve facility data")
import pandas as pd

# List of Detroit metro counties
metro_counties = ['Wayne', 'Oakland', 'Macomb', 'Livingston', 'St. Clair', 'Lapeer']

# Subset the dataframe to include only the Detroit metro counties
df_detroit_metro = df_icis_air[df_icis_air['AIRCounty'].isin(metro_counties)]

# Print information about the subset
print(f"Total ICIS-AIR facilities in Michigan: {len(df_icis_air)}")
print(f"ICIS-AIR facilities in Detroit metro area: {len(df_detroit_metro)}")

# Display the count of facilities in each metro county
print("\nFacilities per county:")
print(df_detroit_metro['AIRCounty'].value_counts())

# Display the first few rows of the subset
print("\nFirst few rows of the Detroit metro ICIS-AIR facilities:")
print(df_detroit_metro.head())

# Additional information: unique values in AIRCounty (sanity-check spellings)
print("\nUnique values in AIRCounty column:")
print(df_icis_air['AIRCounty'].unique())
Create geopandas and plot.
# Count records with missing coordinate values
missing_coords = df_detroit_metro[(df_detroit_metro['FacLat'].isnull()) | (df_detroit_metro['FacLong'].isnull())]
print(f"Number of ICIS-AIR records with missing coordinates: {len(missing_coords)}")

# Remove records with missing coordinates
df_detroit_metro = df_detroit_metro.dropna(subset=['FacLat', 'FacLong'])

# Create a GeoDataFrame for ICIS-AIR facilities (x = longitude, y = latitude)
gdf_icis_air = gpd.GeoDataFrame(
    df_detroit_metro,
    geometry=gpd.points_from_xy(df_detroit_metro.FacLong, df_detroit_metro.FacLat),
    crs="EPSG:4326"
)

# Reproject ICIS-AIR data to Web Mercator to match the basemap
gdf_icis_air = gdf_icis_air.to_crs(epsg=3857)

# Create the plot
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the metro area and bounding box (reusing objects from earlier)
detroit_metro.plot(ax=ax, facecolor='none', edgecolor='blue', linewidth=2)
bbox_polygon.boundary.plot(ax=ax, color='red', linewidth=2)

# Plot ICIS-AIR facilities
gdf_icis_air.plot(ax=ax, color='cyan', markersize=50, alpha=0.7, label='ICIS-AIR Facilities')

# Plot TRI facilities (reusing the detroit_tri object from earlier)
detroit_tri.plot(ax=ax, color='purple', markersize=50, alpha=0.7, label='TRI Facilities')

# Add the basemap
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

# Set the extent of the map to the bounding box
ax.set_xlim(bbox_polygon.total_bounds[0], bbox_polygon.total_bounds[2])
ax.set_ylim(bbox_polygon.total_bounds[1], bbox_polygon.total_bounds[3])

# Remove axes
ax.set_axis_off()

# Add legend
ax.legend()
plt.title("Detroit Metro Area TRI and ICIS-AIR Facilities", fontsize=16)

plt.tight_layout()
plt.show()

print(f"Number of TRI facilities plotted: {len(detroit_tri)}")
print(f"Number of ICIS-AIR facilities plotted: {len(gdf_icis_air)}")
Custom TRI Form A Search
# URL of the CSV file (custom TRI Form A search export)
url = "https://dmap-epa-enviro-prod-export.s3.amazonaws.com/338211710.CSV"

# Read the CSV file directly with pandas
df_tri_custom = pd.read_csv(url)
print(f"Successfully read CSV. Number of records: {len(df_tri_custom)}")

# Display information about the dataset
print("\nColumns in the dataset:")
print(df_tri_custom.columns)

# Assuming the latitude and longitude columns are named 'LATITUDE' and 'LONGITUDE'
# Adjust these names if they're different in your CSV
lat_col = 'LATITUDE'
lon_col = 'LONGITUDE'
release_col = 'AIR_TOTAL_RELEASE'  # Adjust this to the actual column name for air releases

# Remove records with missing coordinates or air release data
df_tri_clean = df_tri_custom.dropna(subset=[lat_col, lon_col, release_col])
print(f"\nNumber of records after removing missing data: {len(df_tri_clean)}")

# Create a GeoDataFrame (x = longitude, y = latitude)
gdf_tri_custom = gpd.GeoDataFrame(
    df_tri_clean,
    geometry=gpd.points_from_xy(df_tri_clean[lon_col], df_tri_clean[lat_col]),
    crs="EPSG:4326"
)

# Reproject to Web Mercator to match the basemap
gdf_tri_custom = gdf_tri_custom.to_crs(epsg=3857)

# Create the plot
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the metro area and bounding box (reusing objects from earlier)
detroit_metro.plot(ax=ax, facecolor='none', edgecolor='blue', linewidth=2)
bbox_polygon.boundary.plot(ax=ax, color='orangered', linewidth=2)

# Plot TRI facilities with graduated symbols based on air releases
scatter = ax.scatter(gdf_tri_custom.geometry.x, gdf_tri_custom.geometry.y,
                     # s=gdf_tri_custom[release_col]/100,  # Adjust the scaling factor as needed
                     c='orangered',      # Static fill color
                     edgecolor='yellow',  # Outline color
                     linewidth=1,         # Adjust the outline width as needed
                     alpha=0.7)

# Add the basemap
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

# Set the extent of the map to the bounding box
ax.set_xlim(bbox_polygon.total_bounds[0], bbox_polygon.total_bounds[2])
ax.set_ylim(bbox_polygon.total_bounds[1], bbox_polygon.total_bounds[3])

# Remove axes
ax.set_axis_off()

# Add a legend for symbol sizes
legend_sizes = [1000, 10000, 100000]  # Example sizes, adjust based on your data
legend_elements = [plt.scatter([], [], s=size/100, c='orangered', edgecolor='yellow',
                               linewidth=1, alpha=1, label=f'{size:,}')
                   for size in legend_sizes]
ax.legend(handles=legend_elements, title='Total Air Releases (lbs)',
          loc='lower right', title_fontsize=12, fontsize=10)

plt.title("Detroit Metro Area TRI Facilities - Total Air Releases (Custom Data)", fontsize=16)

plt.tight_layout()
plt.show()

print(f"\nNumber of TRI facilities plotted: {len(gdf_tri_custom)}")
print(f"Total air releases: {gdf_tri_custom[release_col].sum():,.2f} lbs")
print(f"Average air release per facility: {gdf_tri_custom[release_col].mean():,.2f} lbs")
Rasterizing Pollution Sums
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import rasterio
from rasterio import features
from rasterio.transform import from_origin
from matplotlib.colors import BoundaryNorm, ListedColormap
import xarray as xr
import rioxarray # This imports rioxarray and adds the .rio accessor to xarray objects
# Ensure gdf_tri_custom and detroit_metro are in the correct CRS (EPSG:3857, Web Mercator)
gdf_tri_custom = gdf_tri_custom.to_crs(epsg=3857)
detroit_metro = detroit_metro.to_crs(epsg=3857)

# Get the bounds of the Detroit metro area
minx, miny, maxx, maxy = detroit_metro.total_bounds

# Define the cell size in metres (5 km)
resolution = 5000

# Calculate the number of cells
nx = int((maxx - minx) / resolution)
ny = int((maxy - miny) / resolution)

# Create the transform for the raster (origin at the top-left corner)
transform = from_origin(minx, maxy, resolution, resolution)

# Prepare (geometry, value) pairs for rasterization
shapes = ((geom, value) for geom, value in zip(gdf_tri_custom.geometry, gdf_tri_custom.AIR_TOTAL_RELEASE))

# Rasterize the point data; MergeAlg.add sums releases of facilities sharing a cell
raster = features.rasterize(shapes=shapes,
                            out_shape=(ny, nx),
                            transform=transform,
                            fill=0,
                            all_touched=True,
                            merge_alg=rasterio.enums.MergeAlg.add)

# Convert the raster to an xarray DataArray
# Note: We use ny and nx here to ensure the coordinates match the raster shape
raster_da = xr.DataArray(raster,
                         coords={'y': np.linspace(maxy, miny, ny),
                                 'x': np.linspace(minx, maxx, nx)},
                         dims=['y', 'x'])
raster_da.rio.write_crs(detroit_metro.crs, inplace=True)

# Clip the raster with the Detroit metro boundary
clipped_raster = raster_da.rio.clip(detroit_metro.geometry.values, detroit_metro.crs, drop=False, all_touched=True)

# Define the breaks for the discrete scale
breaks = [0, 1, 10, 100, 1000, 10000, 100000, 250000, 500000]

# Create a custom colormap (white through yellow-orange-red)
colors = ['#FFFFFF', '#FFFFCC', '#FFEDA0', '#FED976', '#FEB24C', '#FD8D3C', '#FC4E2A', '#E31A1C', '#B10026']
cmap = ListedColormap(colors)

# Create a normalization based on the breaks
norm = BoundaryNorm(breaks, cmap.N)

# Create the plot
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the clipped raster with the custom colormap and norm
im = ax.imshow(clipped_raster, extent=[minx, maxx, miny, maxy], origin='upper',
               cmap=cmap, norm=norm)

# Add colorbar with discrete labels
cbar = plt.colorbar(im, ax=ax, extend='max',
                    label='Total Air Releases (pounds)',
                    ticks=breaks)
cbar.ax.set_yticklabels([f'{b:,}' for b in breaks])

# Plot the TRI facility points
gdf_tri_custom.plot(ax=ax, color='blue', markersize=20, alpha=0.7)

# Plot the Detroit metro boundary
detroit_metro.boundary.plot(ax=ax, color='black', linewidth=2)

# Set the extent to match the Detroit metro area
ax.set_xlim(minx, maxx)
ax.set_ylim(miny, maxy)

# Add title and labels (title reflects the actual cell size)
ax.set_title(f'TRI Air Total Release ({resolution} m resolution sum) with Facility Locations', fontsize=16)
ax.set_xlabel('X Coordinate')
ax.set_ylabel('Y Coordinate')

plt.tight_layout()
plt.show()

print(f"Number of TRI facilities plotted: {len(gdf_tri_custom)}")
print(f"Total air releases: {gdf_tri_custom['AIR_TOTAL_RELEASE'].sum():,.2f}")
print(f"Maximum cell value in raster: {clipped_raster.max().values:,.2f}")
Air Release Vulnerability Index
import numpy as np
import xarray as xr
import rioxarray
import geopandas as gpd
import matplotlib.pyplot as plt
from rasterio.enums import Resampling
from scipy import stats
# Select the 'Overall' layer from the SVI stack built earlier
svi_overall = ds.sel(layer='Overall')

# Convert to rioxarray for geospatial operations (source rasters are WGS84)
svi_overall = svi_overall.rio.write_crs("EPSG:4326")

# Reproject SVI to match the CRS/grid of the air release raster
svi_reprojected = svi_overall.rio.reproject_match(clipped_raster)

# Clip SVI raster to the Detroit metro boundary
svi_clipped = svi_reprojected.rio.clip(detroit_metro.geometry.values, detroit_metro.crs, drop=True, all_touched=True)

# Disaggregate the air release data to match the resolution of the SVI data
air_release_disaggregated = clipped_raster.rio.reproject_match(
    svi_clipped,
    resampling=Resampling.bilinear
)

# Calculate raster correlation between SVI overall and air release within Detroit metro boundary
svi_flat = svi_clipped.values.flatten()
air_flat = air_release_disaggregated.values.flatten()

# Remove NaN values (cells must be valid in both rasters)
mask = ~np.isnan(svi_flat) & ~np.isnan(air_flat)
svi_flat = svi_flat[mask]
air_flat = air_flat[mask]

correlation, p_value = stats.pearsonr(svi_flat, air_flat)
print(f"Raster Correlation between SVI Overall and Air Release within Detroit metro: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")

# New correlation analysis
# Ensure gdf_tri_custom is in the same CRS as svi_clipped
gdf_tri_custom = gdf_tri_custom.to_crs(svi_clipped.rio.crs)

# Extract SVI values at TRI facility locations (nearest-cell lookup)
svi_values = []
for point in gdf_tri_custom.geometry:
    svi_value = svi_clipped.sel(x=point.x, y=point.y, method="nearest").values
    svi_values.append(svi_value)

# Add SVI values to gdf_tri_custom
gdf_tri_custom['SVI_OVERALL'] = svi_values

# Perform correlation analysis
correlation_points, p_value_points = stats.pearsonr(gdf_tri_custom['AIR_TOTAL_RELEASE'], gdf_tri_custom['SVI_OVERALL'])
print(f"Point-based Correlation between SVI Overall and AIR_TOTAL_RELEASE: {correlation_points:.4f}")
print(f"P-value: {p_value_points:.4f}")

# Log1p transform the air release data and scale to 0-1
air_release_log = np.log1p(air_release_disaggregated)
air_release_scaled = (air_release_log - air_release_log.min()) / (air_release_log.max() - air_release_log.min())

# Multiply scaled air release data with SVI data to form the vulnerability indicator
vulnerability_indicator = air_release_scaled * svi_clipped

# Create the plots
fig, axs = plt.subplots(3, 1, figsize=(15, 45))

# Plot SVI Overall
im1 = svi_clipped.plot(ax=axs[0], cmap='viridis', vmin=0, vmax=1, add_colorbar=False)
plt.colorbar(im1, ax=axs[0], label='SVI Overall')
axs[0].set_title('Social Vulnerability Index (Overall)', fontsize=16)
detroit_metro.boundary.plot(ax=axs[0], color='black', linewidth=2)

# Plot Original Air Release (log-transformed for better visualization)
im2 = np.log1p(air_release_disaggregated).plot(ax=axs[1], cmap='YlOrRd', add_colorbar=False)
plt.colorbar(im2, ax=axs[1], label='Log(Air Release + 1)')
axs[1].set_title('Air Release (Log-transformed)', fontsize=16)
detroit_metro.boundary.plot(ax=axs[1], color='black', linewidth=2)

# Plot Air Release Vulnerability Indicator
im3 = vulnerability_indicator.plot(ax=axs[2], cmap='YlOrRd', vmin=0, vmax=1, add_colorbar=False)
plt.colorbar(im3, ax=axs[2], label='Air Release Vulnerability Indicator')
axs[2].set_title('Air Release Vulnerability Indicator\n(Scaled Air Release * SVI)', fontsize=16)
detroit_metro.boundary.plot(ax=axs[2], color='black', linewidth=2)

for ax in axs:
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_xlim(svi_clipped.x.min(), svi_clipped.x.max())
    ax.set_ylim(svi_clipped.y.min(), svi_clipped.y.max())

plt.tight_layout()
plt.show()

# Print some statistics
print(f"Minimum vulnerability indicator: {vulnerability_indicator.min().values:.4f}")
print(f"Maximum vulnerability indicator: {vulnerability_indicator.max().values:.4f}")
print(f"Mean vulnerability indicator: {vulnerability_indicator.mean().values:.4f}")

# Convert the vulnerability indicator to a pandas DataFrame
vulnerability_df = vulnerability_indicator.to_dataframe(name='index').reset_index()

# Sort by index value and get the top 10
top_10 = vulnerability_df.sort_values('index', ascending=False).head(10)

# Create points from the coordinates
top_10['geometry'] = gpd.points_from_xy(top_10.x, top_10.y)
top_10_gdf = gpd.GeoDataFrame(top_10, geometry='geometry', crs=vulnerability_indicator.rio.crs)

# Create the final map
fig, ax = plt.subplots(figsize=(15, 15))

# Plot the Detroit metro boundary
detroit_metro.boundary.plot(ax=ax, color='black', linewidth=2)

# Plot the top 10 points
top_10_gdf.plot(ax=ax, color='blue', markersize=100, alpha=0.7)

# Add labels to the points
for idx, row in top_10_gdf.iterrows():
    ax.annotate(f"#{idx+1}", (row.geometry.x, row.geometry.y),
                xytext=(3, 3), textcoords="offset points",
                color='white', fontweight='bold')

# Add a basemap
ctx.add_basemap(ax, crs=vulnerability_indicator.rio.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# Set the extent to match the Detroit metro area
ax.set_xlim(vulnerability_indicator.x.min(), vulnerability_indicator.x.max())
ax.set_ylim(vulnerability_indicator.y.min(), vulnerability_indicator.y.max())

ax.set_title('Top 10 Areas with Highest Air Release Vulnerability Index', fontsize=16)
ax.set_axis_off()

plt.tight_layout()
plt.show()

# Print the coordinates of the top 10 points
print("Coordinates of the top 10 points:")
for idx, row in top_10_gdf.iterrows():
    print(f"#{idx+1}: ({row.geometry.x}, {row.geometry.y})")
PLACES
The CDC PLACES (Population Level Analysis and Community Estimates) dataset is a collaboration between the Centers for Disease Control and Prevention (CDC), the Robert Wood Johnson Foundation, and the CDC Foundation. It provides model-based population-level analysis and community estimates of health indicators for all counties, places (incorporated and census designated places), census tracts, and ZIP Code Tabulation Areas (ZCTAs) across the United States.
Key Points
- Geographic Coverage: Entire United States, including all 50 states, the District of Columbia, and Puerto Rico.
- Geographic Granularity: Multiple levels including counties, cities/towns, census tracts, and ZIP codes.
- Health Indicators: Wide range of chronic disease measures related to health outcomes, prevention, and health risk behaviors.
- Data Sources:
- Behavioral Risk Factor Surveillance System (BRFSS)
- U.S. Census Bureau’s American Community Survey (ACS)
- Methodology: Uses small area estimation methods for small geographic areas.
- Health Measures Include:
- Chronic diseases: e.g., asthma, COPD, heart disease, diabetes
- Health risk behaviors: e.g., smoking, physical inactivity, binge drinking
- Prevention practices: e.g., health insurance coverage, dental visits, cholesterol screening
- Socioeconomic Data: Includes some socioeconomic and demographic variables.
- Annual Updates: Providing recent estimates for local areas.
This dataset is particularly valuable for public health researchers, policymakers, and community organizations.
It provides a standardized way to compare health indicators across different geographic areas and can be used to inform targeted interventions and policy decisions, especially in addressing health disparities at a local level.
Processing
import io

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import requests
import contextily as ctx

# --- Fetch CDC PLACES tract-level data for the Detroit metro area -----------
# GeoJSON endpoint for the PLACES census-tract release (Socrata SODA API).
url = "https://data.cdc.gov/resource/cwsq-ngmh.geojson"

# Define the Detroit metro area counties.
detroit_counties = ['Wayne', 'Oakland', 'Macomb', 'Livingston', 'St. Clair', 'Lapeer']

# Build a SoQL filter matching any of the metro counties.
county_filter = " OR ".join(f"countyname = '{county}'" for county in detroit_counties)

# Query parameters: Michigan tracts in the selected counties.
params = {
    "$where": f"stateabbr = 'MI' AND ({county_filter})",
    "$limit": 50000,  # Adjust if necessary
}

# Make the API request.
response = requests.get(url, params=params)

if response.status_code == 200:
    data = response.json()
    print("Successfully retrieved data")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")
    print(response.text)
    # Stop here on failure: previously execution fell through and tried to
    # parse the error body as GeoJSON, producing a cryptic parser error.
    response.raise_for_status()

# Convert to GeoDataFrame. Wrap the payload in a file-like object; passing a
# raw GeoJSON string straight to read_file is deprecated in geopandas.
gdf = gpd.read_file(io.StringIO(response.text))

# Print available health measures.
print("\nAvailable health measures:")
print(gdf['measure'].unique())

# Print the first few rows to see the structure of the data.
print("\nFirst few rows of the GeoDataFrame:")
print(gdf.head())

# Print basic information about the GeoDataFrame.
print("\nGeoDataFrame Info:")
print(gdf.info())

# --- Sample map for one health measure (current asthma among adults) --------
fig, ax = plt.subplots(figsize=(15, 15))

# Filter for the specific measure and ensure data_value is numeric
# (the API returns it as text; errors='coerce' turns bad values into NaN).
gdf_asthma = gdf[gdf['measure'] == 'Current asthma among adults'].copy()
gdf_asthma['data_value'] = pd.to_numeric(gdf_asthma['data_value'], errors='coerce')

# Plot the asthma data; tracts with missing values are drawn light grey.
gdf_asthma.plot(column='data_value',
                ax=ax,
                legend=True,
                legend_kwds={'label': 'Asthma Prevalence (%)', 'orientation': 'horizontal'},
                cmap='YlOrRd',
                missing_kwds={'color': 'lightgrey'})

# Add county boundaries
#gdf_asthma.dissolve(by='countyname').boundary.plot(ax=ax, color='black', linewidth=0.5)

# Add a web-tile basemap in the data's CRS.
ctx.add_basemap(ax, crs=gdf_asthma.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# Set the extent to match the Detroit metro area.
# total_bounds is [minx, miny, maxx, maxy].
ax.set_xlim(gdf_asthma.total_bounds[0], gdf_asthma.total_bounds[2])
ax.set_ylim(gdf_asthma.total_bounds[1], gdf_asthma.total_bounds[3])

plt.title('Asthma Prevalence in Detroit Metro Area', fontsize=16)
ax.axis('off')
plt.tight_layout()
plt.show()

# Print some statistics for the asthma data.
print("\nAsthma Statistics:")
print(f"Average asthma prevalence: {gdf_asthma['data_value'].mean():.2f}%")
print(f"Minimum asthma prevalence: {gdf_asthma['data_value'].min():.2f}%")
print(f"Maximum asthma prevalence: {gdf_asthma['data_value'].max():.2f}%")

# Print the number of census tracts per county.
print("\nNumber of census tracts per county:")
print(gdf_asthma['countyname'].value_counts())
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from scipy.interpolate import griddata
import rasterio
from rasterio.transform import from_origin

# --- Interpolate tract-level asthma prevalence onto a regular grid ----------
# Assuming gdf_asthma is already created and contains the asthma data.
# Ensure gdf_asthma is in EPSG:4326 (lon/lat).
gdf_asthma = gdf_asthma.to_crs(epsg=4326)

# Extract point coordinates and values.
X = gdf_asthma.geometry.x.values
Y = gdf_asthma.geometry.y.values
Z = gdf_asthma['data_value'].values

# Remove any NaN values; griddata cannot handle NaNs in the input.
mask = ~np.isnan(Z)
X, Y, Z = X[mask], Y[mask], Z[mask]

# Create a regular lon/lat grid covering the data extent.
grid_resolution = 0.025  # in degrees
x_min, y_min, x_max, y_max = gdf_asthma.total_bounds
grid_x = np.arange(x_min, x_max, grid_resolution)
grid_y = np.arange(y_min, y_max, grid_resolution)
grid_xx, grid_yy = np.meshgrid(grid_x, grid_y)

# Interpolate onto the grid.
# NOTE(review): griddata(method='linear') is piecewise-linear (barycentric)
# interpolation over a Delaunay triangulation, NOT inverse-distance weighting,
# despite the "IDW" naming used for the title and output file below.
points = np.column_stack((X, Y))
grid_z = griddata(points, Z, (grid_xx, grid_yy), method='linear')

# --- Plot the interpolated surface ------------------------------------------
fig, ax = plt.subplots(figsize=(15, 15))

# grid_z row 0 is the southern edge (grid_y ascends), hence origin='lower'.
im = ax.imshow(grid_z, extent=[x_min, x_max, y_min, y_max],
               origin='lower', cmap='YlOrRd', alpha=0.7)

# Add colorbar.
cbar = plt.colorbar(im, ax=ax, label='Asthma Prevalence')

# Add basemap.
ctx.add_basemap(ax, crs=gdf_asthma.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# Set the extent to match the Detroit metro area.
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

plt.title('IDW Interpolated Asthma Prevalence in Detroit Metro Area', fontsize=16)
ax.axis('off')
plt.tight_layout()
plt.show()

# --- Save the interpolated raster as a GeoTIFF ------------------------------
# from_origin anchors the transform at the TOP-LEFT corner (x_min, y_max), so
# raster row 0 must be the northernmost row. grid_z row 0 is the *southern*
# edge, so flip it vertically before writing; writing grid_z directly would
# produce a vertically mirrored GeoTIFF.
transform = from_origin(x_min, y_max, grid_resolution, grid_resolution)
with rasterio.open('idw_asthma.tif', 'w',
                   driver='GTiff',
                   height=grid_z.shape[0],
                   width=grid_z.shape[1],
                   count=1,
                   dtype=grid_z.dtype,
                   crs='EPSG:4326',  # Explicitly set the CRS
                   transform=transform) as dst:
    dst.write(np.flipud(grid_z), 1)

# Print some statistics about the interpolated data. NaN-aware: cells outside
# the convex hull of the input points are NaN after linear interpolation.
print(f"Minimum interpolated value: {np.nanmin(grid_z):.2f}")
print(f"Maximum interpolated value: {np.nanmax(grid_z):.2f}")
print(f"Mean interpolated value: {np.nanmean(grid_z):.2f}")
Deployment
Congratulations! Now you should be able to:
- Test test…
Lesson 3
In this lesson, we explored ….