Washington Landslide Data Preprocessing#
Overview#
This notebook processes landslide data from the Washington Geological Survey (WGS) into a standardized format for integration with other regional datasets. We primarily look at the deposits and the landslide compilation layer of the bigger dataset.
The workflow includes:
Data Loading: Load GeoDataFrame layers from WGS Landslides.gdb geodatabase
Layer Analysis: Examine deposits and compilation layers structure and content
Data Standardization: Standardize confidence levels, movement types, and material classifications
Reference Integration: Create proper citation references for the dataset
Data Combination: Merge deposits and compilation layers with priority handling
Column Selection: Select and rename columns to match unified schema
Export: Save processed data as GeoJSON for further analysis
Input: WGS_Landslides.gdb (deposits and landslide_compilation layers)
Output: washington_landslides_processed_v2.geojson
Obtain the Washington Geological Survey Landslide data from the official website: https://dnr.wa.gov/washington-geological-survey/geologic-hazards-and-environment/landslides
https://dnr.wa.gov/washington-geological-survey/publications-and-data/geology-gis-data-and-databases
Create a directory 'Data' in your project's root folder and place the downloaded files inside it.
Also create a directory 'ProcessedDataSets' inside your PreProcessing directory.
Initialization#
import fiona
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Path to the WGS geodatabase (downloaded per the instructions above).
gdb_dir = "../data/ger_portal_landslides_ds29_v1.4_Apr2024/WGS_Landslides.gdb"

# Enumerate the layers in the geodatabase so we know what is available.
layers = fiona.listlayers(gdb_dir)
print("Layers in geodatabase:")
for layer_name in layers:
    print(" -", layer_name)
Initial Inspection of Deposits Layer#
# Load the mapped landslide deposits layer from the geodatabase.
deposits = gpd.read_file(gdb_dir, layer="landslide_deposit")

# Quick structural overview: row/column counts and field dtypes.
print(deposits.shape)
print(deposits.dtypes)

# First look at the spatial distribution of the deposits.
fig, ax = plt.subplots(figsize=(12, 8))
deposits.plot(ax=ax, alpha=0.7, color='red', markersize=0.5)
ax.set_title("Washington State Landslide Deposits", fontsize=16, fontweight='bold')
ax.set(xlabel="Longitude", ylabel="Latitude")
plt.tight_layout()
plt.show()
import matplotlib.pyplot as plt
import contextily as ctx

# Re-project to Web Mercator so the layer lines up with the OSM tile basemap.
deposits_webmerc = deposits.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))
deposits_webmerc.plot(ax=ax, edgecolor="k", linewidth=0.2, alpha=0.7)
# Axis data is already in EPSG:3857, so the crs argument can be omitted.
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
plt.title('Washington Landslides Deposits Layer', fontsize=16)
ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
plt.tight_layout()
plt.show()
Inspect Landslide Compilation Dataset#
# Load landslide compilation layer
# Load the landslide compilation layer (legacy mapping compiled from many sources).
compilation = gpd.read_file(gdb_dir, layer="landslide_compilation")

# Structural overview of the compilation dataframe.
print("Landslide compilation shape:", compilation.shape)
print("\nColumn names:")
for column in compilation.columns:
    print(f" - {column}")
print("\nData types:")
print(compilation.dtypes)
print("\nFirst few rows:")
compilation.head()
import matplotlib.pyplot as plt
import contextily as ctx

# Web-Mercator copy of the compilation layer for plotting on OSM tiles.
compilation_webmerc = compilation.to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(15, 15))
compilation_webmerc.plot(ax=ax, edgecolor="k", linewidth=0.2, alpha=0.7)
# Axis data is already in EPSG:3857, so the crs argument can be omitted.
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
plt.title('Washington Landslides Compilation Layer', fontsize=16)
ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
plt.tight_layout()
plt.show()

print("Columns in compilation:")
for column in compilation.columns:
    print(f" - {column}")
# Compare column names and values between the deposits and compilation layers.
print("Deposits columns:")
for column in deposits.columns:
    print(f" - {column}")
print("\nCompilation columns:")
for column in compilation.columns:
    print(f" - {column}")

# Columns shared by both layers are candidates for direct alignment.
print("\nPotential matching columns:")
common_cols = set(deposits.columns) & set(compilation.columns)
print("Common columns:", list(common_cols))

# Confidence is encoded differently in each layer; inspect the raw values.
print("\nUnique values in deposits CONFIDENCE:")
print(deposits['CONFIDENCE'].value_counts())
if 'LOCATION_CONFIDENCE' in compilation.columns:
    print("\nUnique values in compilation LOCATION_CONFIDENCE:")
    print(compilation['LOCATION_CONFIDENCE'].value_counts())
if 'DATA_CONFIDENCE' in compilation.columns:
    print("\nUnique values in compilation DATA_CONFIDENCE:")
    print(compilation['DATA_CONFIDENCE'].value_counts())
# Check current DATA_CONFIDENCE values in compilation
print("Current DATA_CONFIDENCE values in compilation:")
print(compilation['DATA_CONFIDENCE'].value_counts())

# Collapse the five WGS confidence labels onto the three-level scheme used by
# the deposits layer; mixed labels ('Low-Moderate', 'Moderate-High') fold into
# 'Moderate'.
confidence_mapping = {
    'Moderate-High': 'Moderate',
    'Low': 'Low',
    'Moderate': 'Moderate',
    'Low-Moderate': 'Moderate',
    'High': 'High'
}

# Series.map turns any value missing from the mapping into NaN.
compilation['CONFIDENCE'] = compilation['DATA_CONFIDENCE'].map(confidence_mapping)

# Surface unmapped values explicitly instead of letting NaNs slip through silently.
unmapped = compilation.loc[
    compilation['DATA_CONFIDENCE'].notna() & compilation['CONFIDENCE'].isna(),
    'DATA_CONFIDENCE',
].unique()
if len(unmapped):
    print("WARNING: unmapped DATA_CONFIDENCE values:", list(unmapped))

# Check the results
print("\nStandardized confidence distribution:")
print(compilation['CONFIDENCE'].value_counts())

# Inspect SLOPE_MORPHOLOGY in compilation layer
print("SLOPE_MORPHOLOGY values in compilation:")
print(compilation['SLOPE_MORPHOLOGY'].value_counts())
Create References for Compilation and Deposits Layer#
# Inspect unique values in SOURCE_INFORMATION column from compilation layer
print("Unique SOURCE_INFORMATION values in compilation:")
unique_sources = compilation['SOURCE_INFORMATION'].value_counts()
print(unique_sources)
print(f"\nTotal number of unique sources: {len(unique_sources)}")

# Look at some sample source information entries
print("\nSample SOURCE_INFORMATION entries:")
sample_sources = compilation['SOURCE_INFORMATION'].dropna().head(10)
for i, source in enumerate(sample_sources, 1):
    print(f"{i}. {source}")

# The compilation layer cites the original mapping source per feature, so that
# citation is carried through to the unified schema's reference column.
compilation['filter_REFERENCE'] = compilation['SOURCE_INFORMATION']

# Verify the new column was created (message fixed: the column is
# 'filter_REFERENCE', not 'Reference' as the original print claimed).
print("New 'filter_REFERENCE' column created successfully")
print("Sample filter_REFERENCE values:")
print(compilation['filter_REFERENCE'].head())

# Deposits were mapped by WGS directly, so they all share a single reference.
deposits['filter_REFERENCE'] = 'WGS'
Add Dataset Link and Origin#
# Record the regional origin on both layers so rows stay traceable after merging.
for frame in (compilation, deposits):
    frame['filter_ORIGIN'] = 'WASHINGTON'
print("Added ORIGIN column to both datasets")
print("Compilation ORIGIN values:", compilation['filter_ORIGIN'].unique())
print("Deposits ORIGIN values:", deposits['filter_ORIGIN'].unique())

# Only the deposits layer gets a direct link back to the source dataset page.
wgs_landslide_url = 'https://dnr.wa.gov/washington-geological-survey/geologic-hazards-and-environment/landslides'
deposits['filter_DATASET_LINK'] = wgs_landslide_url
print("Added DATASET_LINK column to deposits layer")
print("Sample DATASET_LINK values:", deposits['filter_DATASET_LINK'].head())
Combine Landslide Compilation and Deposits layer#
Remove compilation data from areas where lidar mapping has been done#
(avoid duplicates, keep the best data)
Read the study_areas layer.
Check that each study area actually contains landslide deposit data (some study areas contain other data but no landslide deposits, so we want to keep the compilation data there).
Apply the resulting mask to the compilation data.
# Load the lidar study-area polygons; these delimit where detailed deposit
# mapping has been completed (used below to de-duplicate the compilation layer).
study_areas = gpd.read_file(gdb_dir, layer="study_areas")
print(study_areas.shape)
print(study_areas.dtypes)
print(study_areas.crs)
study_areas
# Bring the study areas into the deposits CRS before any spatial predicate.
if deposits.crs != study_areas.crs:
    study_areas = study_areas.to_crs(deposits.crs)
# Repair invalid polygons so sjoin/dissolve below don't fail on topology errors.
study_areas["geometry"] = study_areas.geometry.make_valid()
deposits["geometry"] = deposits.geometry.make_valid()
# Index both layers so we can count, per study area, how many deposit polygons
# it fully covers.
sa = study_areas[["DESCRIPTION", "geometry"]].reset_index().rename(columns={"index": "study_idx"})
dep = deposits[["geometry"]].reset_index().rename(columns={"index": "deposit_idx"})

# 'covers' keeps deposits fully inside a study area (boundary contact is ok).
hits = gpd.sjoin(
    sa,
    dep,
    how="left",
    predicate="covers",
)

# Number of distinct deposits covered by each study area.
counts = (
    hits.loc[hits["deposit_idx"].notna()]
    .groupby("study_idx")["deposit_idx"]
    .nunique()
    .rename("n_deposits_covered")
    .reset_index()
)

study_areas2 = sa.merge(counts, on="study_idx", how="left")
# Areas with no covered deposits get NaN from the left merge; treat as 0.
study_areas2["n_deposits_covered"] = study_areas2["n_deposits_covered"].fillna(0).astype(int)

# Keep only study areas covering at least 2 deposits; this threshold also
# excludes the Mt. Rainier study area.
MIN_DEPOSITS_COVERED = 2
study_areas_filtered = study_areas2.loc[
    study_areas2["n_deposits_covered"] >= MIN_DEPOSITS_COVERED
].copy()

print("Study areas (original):", len(study_areas2))
print("Study areas (kept, cover >=2 deposits):", len(study_areas_filtered))
# Fixed report: the original printed only the count of areas covering 0
# deposits, but everything covering fewer than 2 is dropped.
print(
    "Study areas (dropped, cover <2 deposits):",
    int((study_areas2["n_deposits_covered"] < MIN_DEPOSITS_COVERED).sum()),
)
study_areas2
study_areas_3857 = study_areas_filtered.to_crs(epsg=3857)
deposit_3857 = deposits.to_crs(epsg=3857)
compilation_3857 = compilation.to_crs(epsg=3857)

# One zoomed map per retained study area: deposits (default blue) and
# compilation polygons (orange) against the study-area outline (red).
for _, area in study_areas_3857.iterrows():
    study_idx = area["study_idx"]
    desc = area["DESCRIPTION"]

    fig, ax = plt.subplots(figsize=(7, 7))

    # Plot ALL features; the axis limits below restrict the view.
    deposit_3857.plot(ax=ax, alpha=0.6, linewidth=0.3)
    compilation_3857.plot(ax=ax, alpha=0.6, linewidth=0.3, color="orange")

    # Outline of the current study area only.
    current = study_areas_3857.loc[study_areas_3857["study_idx"] == study_idx]
    current.plot(ax=ax, facecolor="none", edgecolor="red", linewidth=2.0)

    # Zoom to the area's bounds with an 8% margin (2 km pad for degenerate bounds).
    minx, miny, maxx, maxy = area.geometry.bounds
    pad_x = 0.08 * (maxx - minx) if maxx > minx else 2000
    pad_y = 0.08 * (maxy - miny) if maxy > miny else 2000
    ax.set_xlim(minx - pad_x, maxx + pad_x)
    ax.set_ylim(miny - pad_y, maxy + pad_y)

    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
    ax.set_axis_off()
    ax.set_title(f"study_idx={study_idx}\n{desc}", fontsize=11)
    ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
    plt.tight_layout()
    plt.show()
# Dissolve all kept study areas into a single exclusion geometry.
mask = study_areas_filtered.dissolve()
mask

# Web-Mercator copies for the preview map.
compilation_webmerc = compilation.to_crs(epsg=3857)
mask_3857 = mask.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))
# Full compilation layer, before any filtering.
compilation_webmerc.plot(
    ax=ax,
    color="steelblue",
    edgecolor="k",
    linewidth=0.2,
    alpha=0.5,
    label="Kept",
)
# Red outline of the exclusion mask.
mask_3857.plot(
    ax=ax,
    facecolor="none",
    edgecolor="red",
    linewidth=2.0,
    label="mask detailed",
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
ax.legend()
ax.set_title("Preview of Landslides in the deposits", fontsize=16)
ax.set_axis_off()
plt.tight_layout()
plt.show()
# Drop every compilation polygon that touches the lidar-mapped exclusion area;
# inside it the deposits layer provides the better data.
applied_mask = compilation.intersects(mask.geometry.iloc[0])
compilation_filtered = compilation.loc[~applied_mask].copy()

# Visual check of the compilation layer after filtering.
filtered_webmerc = compilation_filtered.to_crs(epsg=3857)
mask_3857 = mask.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))
filtered_webmerc.plot(
    ax=ax,
    color="steelblue",
    edgecolor="k",
    linewidth=0.2,
    alpha=0.5,
    label="Kept",
)
# Mask outline for reference.
mask_3857.plot(
    ax=ax,
    facecolor="none",
    edgecolor="red",
    linewidth=2.0,
    label="Exclusion area",
)
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
ax.legend()
ax.set_title("Preview of Landslides compilation after filtering", fontsize=16)
ax.set_axis_off()
plt.tight_layout()
plt.show()
# Stack deposits on top of the filtered compilation; shared columns align by
# name, layer-specific columns are NaN-filled for the other layer.
combined_landslides = gpd.GeoDataFrame(
    pd.concat([deposits, compilation_filtered], ignore_index=True),
    crs=deposits.crs,
)
print("Combined dataset shape:", combined_landslides.shape)
print("Deposits shape:", deposits.shape)
print("Compilation shape:", compilation_filtered.shape)
print("Columns in combined dataset:")
for column in combined_landslides.columns:
    print(f" - {column}")
New DataFrame Creation#
# Create a new GeoDataFrame
# (re-wrap to make the active geometry column and CRS explicit on the result).
washington_landslides = gpd.GeoDataFrame(combined_landslides, geometry='geometry', crs=deposits.crs)
Inspect the new DataFrame#
print("New GeoDataFrame shape:", washington_landslides.shape)
print("\nColumn names:")
for column in washington_landslides.columns:
    print(f" - {column}")
washington_landslides.head()

# Duplicate the movement, material and confidence columns under the unified
# 'filter_' naming scheme used by the downstream integration.
washington_landslides['filter_MOVEMENT'] = washington_landslides['MOVEMENT']
washington_landslides['filter_MATERIAL'] = washington_landslides['MATERIAL']
washington_landslides['filter_CONFIDENCE'] = washington_landslides['CONFIDENCE']
washington_landslides.dtypes
Save the DataFrame#
# Write and immediately re-read the processed dataset as a round-trip check.
# BUG FIX: the original saved "..._processed_v2.geojson" but read back
# "..._processed.geojson", so the check inspected a stale or missing file.
# Use a single path for both operations.
output_path = "./processed_geojson/washington_landslides_processed_v2.geojson"
washington_landslides.to_file(output_path, driver="GeoJSON")

loaded_landslides = gpd.read_file(output_path)
print("Loaded GeoDataFrame shape:", loaded_landslides.shape)
loaded_landslides.head()
import matplotlib.pyplot as plt
import contextily as ctx

# Final overview: the combined dataset with the exclusion mask outlined in red.
final_webmerc = washington_landslides.to_crs(epsg=3857)
fig, ax = plt.subplots(figsize=(15, 15))
final_webmerc.plot(ax=ax, edgecolor="k", linewidth=0.2, alpha=0.7)
mask_3857.plot(
    ax=ax,
    facecolor="none",
    edgecolor="red",
    linewidth=2.0,
    label="Exclusion area",
)
# Axis data is already in EPSG:3857, so the crs argument can be omitted.
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)
plt.title('Washington Landslides', fontsize=16)
ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
plt.tight_layout()
plt.show()
fig.savefig("./out.png")