Washington Landslide Data Preprocessing#

Overview#

This notebook processes landslide data from the Washington Geological Survey (WGS) into a standardized format for integration with other regional datasets. We primarily look at the deposits and the landslide compilation layer of the bigger dataset.

The workflow includes:

  1. Data Loading: Load GeoDataFrame layers from the WGS_Landslides.gdb geodatabase

  2. Layer Analysis: Examine deposits and compilation layers structure and content

  3. Data Standardization: Standardize confidence levels, movement types, and material classifications

  4. Reference Integration: Create proper citation references for the dataset

  5. Data Combination: Merge deposits and compilation layers with priority handling

  6. Column Selection: Select and rename columns to match unified schema

  7. Export: Save processed data as GeoJSON for further analysis

Input: WGS_Landslides.gdb (deposits and landslide_compilation layers)

Output: washington_landslides_processed_v2.geojson

Obtain the Washington Geological Survey Landslide data from the official website: https://dnr.wa.gov/washington-geological-survey/geologic-hazards-and-environment/landslides

https://dnr.wa.gov/washington-geological-survey/publications-and-data/geology-gis-data-and-databases

Create a directory ‘data’ in your project's root folder and place the downloaded files inside it (the code below expects the geodatabase under `../data/`).

Also create a directory ‘processed_geojson’ inside your preprocessing directory (the export cell writes there)

Initialization#

import fiona
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Path to the WGS landslide file geodatabase (download instructions above).
gdb_dir = "../data/ger_portal_landslides_ds29_v1.4_Apr2024/WGS_Landslides.gdb"

# Enumerate every layer contained in the geodatabase so we know what to load.
layers = fiona.listlayers(gdb_dir)
print("Layers in geodatabase:")
for layer_name in layers:
    print("  -", layer_name)

Initial Inspection of Deposits Layer#

# Load the landslide deposit polygons from the geodatabase.
deposits = gpd.read_file(gdb_dir, layer="landslide_deposit")

# Basic info: row/column count and the field schema.
print(deposits.shape)      # (n_deposits, n_fields)
print(deposits.dtypes)     # field names and types

# Quick-look map of the deposits in their native CRS.
fig, ax = plt.subplots(figsize=(12, 8))
deposits.plot(ax=ax, alpha=0.7, color='red', markersize=0.5)
ax.set_title("Washington State Landslide Deposits", fontsize=16, fontweight='bold')
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt
import contextily as ctx

# Reproject to Web Mercator so the polygons line up with web basemap tiles.
deposits_webmerc = deposits.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))
deposits_webmerc.plot(ax=ax, edgecolor="k", linewidth=0.2, alpha=0.7)

# Axis data is already in EPSG:3857, so the crs argument can be omitted.
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

plt.title('Washington Landslides Deposits Layer', fontsize=16)
ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
plt.tight_layout()
plt.show()

Inspect Landslide Compilation Dataset#

# Load landslide compilation layer
compilation = gpd.read_file(gdb_dir, layer="landslide_compilation")

# Inspect the compilation dataframe: size, schema, and a sample of rows.
print("Landslide compilation shape:", compilation.shape)
print("\nColumn names:")
for column in compilation.columns:
    print(f"  - {column}")

print("\nData types:")
print(compilation.dtypes)

print("\nFirst few rows:")
compilation.head()

import matplotlib.pyplot as plt
import contextily as ctx

# Reproject to Web Mercator for plotting over basemap tiles.
comp_webmerc = compilation.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))
comp_webmerc.plot(ax=ax, edgecolor="k", linewidth=0.2, alpha=0.7)

# Axis data is already in EPSG:3857, so the crs argument can be omitted.
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

plt.title('Washington Landslides Compilation Layer', fontsize=16)
ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
plt.tight_layout()
plt.show()

print("Columns in compilation:")
for column in compilation.columns:
    print(f"  - {column}")
# Side-by-side comparison of the two layers' schemas.
print("Deposits columns:")
for name in deposits.columns:
    print(f"  - {name}")

print("\nCompilation columns:")
for name in compilation.columns:
    print(f"  - {name}")

# Columns shared by both layers are candidates for the unified schema.
print("\nPotential matching columns:")
shared_columns = set(deposits.columns) & set(compilation.columns)
print("Common columns:", list(shared_columns))

# Peek at the confidence vocabularies before standardizing them.
print("\nUnique values in deposits CONFIDENCE:")
print(deposits['CONFIDENCE'].value_counts())

for conf_col in ('LOCATION_CONFIDENCE', 'DATA_CONFIDENCE'):
    if conf_col in compilation.columns:
        print(f"\nUnique values in compilation {conf_col}:")
        print(compilation[conf_col].value_counts())
    
# Look at the raw DATA_CONFIDENCE vocabulary before collapsing it.
print("Current DATA_CONFIDENCE values in compilation:")
print(compilation['DATA_CONFIDENCE'].value_counts())

# Collapse the five WGS labels onto a three-level scale: the two mixed
# classes ('Low-Moderate', 'Moderate-High') both fold into 'Moderate';
# the pure classes map to themselves.
confidence_mapping = {
    'Low': 'Low',
    'Low-Moderate': 'Moderate',
    'Moderate': 'Moderate',
    'Moderate-High': 'Moderate',
    'High': 'High',
}

# .map() turns any label missing from the dict into NaN, which would be
# visible in the value_counts below.
compilation['CONFIDENCE'] = compilation['DATA_CONFIDENCE'].map(confidence_mapping)

print("\nStandardized confidence distribution:")
print(compilation['CONFIDENCE'].value_counts())

# Inspect SLOPE_MORPHOLOGY in compilation layer
print("SLOPE_MORPHOLOGY values in compilation:")
print(compilation['SLOPE_MORPHOLOGY'].value_counts())

Create References for Compilation and Deposits Layer#

# Survey the provenance strings before turning them into references.
print("Unique SOURCE_INFORMATION values in compilation:")
source_counts = compilation['SOURCE_INFORMATION'].value_counts()
print(source_counts)

print(f"\nTotal number of unique sources: {len(source_counts)}")

# Show a handful of raw entries to get a feel for the formatting.
print("\nSample SOURCE_INFORMATION entries:")
for idx, entry in enumerate(compilation['SOURCE_INFORMATION'].dropna().head(10), 1):
    print(f"{idx}. {entry}")

# Carry the per-record provenance into the unified 'filter_REFERENCE' column.
compilation['filter_REFERENCE'] = compilation['SOURCE_INFORMATION']

print("New 'Reference' column created successfully")
print("Sample Reference values:")
print(compilation['filter_REFERENCE'].head())

# Every deposit-layer record is attributed directly to WGS.
deposits['filter_REFERENCE'] = 'WGS'

Combine Landslide Compilation and Deposits layer#

remove compilation data from areas where lidar mapping has been done#

(avoid duplicates, keep best data)

  • read the study_area

  • Check that the study area contains landslide deposits data (some study areas contain other data but no landslide deposits, so we want to keep the compilation data there)

  • Apply mask to compilation data

# Load the lidar study-area polygons; these mark where detailed deposit
# mapping exists and the coarser compilation data should be dropped.
study_areas = gpd.read_file(gdb_dir, layer="study_areas")
print(study_areas.shape)
print(study_areas.dtypes)
print(study_areas.crs)
study_areas
# Align CRSs before any spatial operation.
if deposits.crs != study_areas.crs:
    study_areas = study_areas.to_crs(deposits.crs)

# Repair self-intersections etc. so the spatial join doesn't fail on
# invalid geometries.
study_areas["geometry"] = study_areas.geometry.make_valid()
deposits["geometry"] = deposits.geometry.make_valid()

# Slim working copies with stable integer ids for the join bookkeeping.
sa = study_areas[["DESCRIPTION", "geometry"]].reset_index().rename(columns={"index": "study_idx"})
dep = deposits[["geometry"]].reset_index().rename(columns={"index": "deposit_idx"})

# Left join so study areas with zero covered deposits still appear (with NaN).
hits = gpd.sjoin(
    sa,
    dep,
    how="left",
    predicate="covers",   # study covers deposit (deposit fully inside; boundary-touch ok)
)

# Count distinct deposits covered by each study area.
counts = (
    hits.loc[hits["deposit_idx"].notna()]
        .groupby("study_idx")["deposit_idx"]
        .nunique()
        .rename("n_deposits_covered")
        .reset_index()
)

# Attach counts back to the study areas; areas with no hits get 0.
study_areas2 = sa.merge(counts, on="study_idx", how="left")
study_areas2["n_deposits_covered"] = study_areas2["n_deposits_covered"].fillna(0).astype(int)

# Keep only areas covering >= 2 deposits (the threshold excludes Mt Rainier,
# which covers a single deposit).
study_areas_filtered = study_areas2.loc[study_areas2["n_deposits_covered"] > 1].copy() # using 2 here to exclude Mt rainier

print("Study areas (original):", len(study_areas2))
print("Study areas (kept, cover >=2 deposit):", len(study_areas_filtered))
# NOTE(review): this line counts only areas with exactly 0 covered deposits;
# areas covering exactly 1 are also dropped by the filter above but are not
# included in this figure — confirm whether that is intended.
print("Study areas (dropped, cover 0 deposits):", int((study_areas2["n_deposits_covered"] == 0).sum()))
study_areas2
# One zoomed-in map per kept study area: deposits (default blue),
# compilation polygons (orange), and the study-area outline (red).
areas_webmerc = study_areas_filtered.to_crs(epsg=3857)
deposits_webmerc = deposits.to_crs(epsg=3857)
compilation_webmerc = compilation.to_crs(epsg=3857)

for _, area in areas_webmerc.iterrows():
    study_idx = area["study_idx"]
    desc = area["DESCRIPTION"]

    fig, ax = plt.subplots(figsize=(7, 7))

    # Plot every polygon; the axis limits below do the zooming.
    deposits_webmerc.plot(ax=ax, alpha=0.6, linewidth=0.3)
    compilation_webmerc.plot(ax=ax, alpha=0.6, linewidth=0.3, color="orange")

    # Outline of the current study area.
    outline = areas_webmerc.loc[areas_webmerc["study_idx"] == study_idx]
    outline.plot(ax=ax, facecolor="none", edgecolor="red", linewidth=2.0)

    # Zoom tightly to this study area, padding by 8% of the extent
    # (or 2 km when the bounds are degenerate).
    minx, miny, maxx, maxy = area.geometry.bounds
    pad_x = 0.08 * (maxx - minx) if maxx > minx else 2000
    pad_y = 0.08 * (maxy - miny) if maxy > miny else 2000
    ax.set_xlim(minx - pad_x, maxx + pad_x)
    ax.set_ylim(miny - pad_y, maxy + pad_y)

    # basemap
    ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

    ax.set_axis_off()
    ax.set_title(f"study_idx={study_idx}\n{desc}", fontsize=11)
    ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")

    plt.tight_layout()
    plt.show()
# Dissolve the kept study areas into a single exclusion geometry.
mask = study_areas_filtered.dissolve()
mask

comp_webmerc = compilation.to_crs(epsg=3857)
mask_webmerc = mask.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))

# Compilation polygons (candidates to keep).
comp_webmerc.plot(
    ax=ax,
    color="steelblue",
    edgecolor="k",
    linewidth=0.2,
    alpha=0.5,
    label="Kept",
)

# Red outline of the lidar-mapped exclusion area.
mask_webmerc.plot(
    ax=ax,
    facecolor="none",
    edgecolor="red",
    linewidth=2.0,
    label="mask detailed",
)

ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

ax.legend()
ax.set_title("Preview of Landslides in the deposits", fontsize=16)
ax.set_axis_off()

plt.tight_layout()
plt.show()
# Drop every compilation polygon that intersects the exclusion area —
# those locations are already covered by the detailed deposits layer.
applied_mask = compilation.intersects(mask.geometry.iloc[0])
compilation_filtered = compilation.loc[~applied_mask].copy()

# Preview the compilation layer after the exclusion filter.
gdf_3857 = compilation_filtered.to_crs(epsg=3857)
mask_3857 = mask.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))

gdf_3857.plot(
    ax=ax,
    color="steelblue",
    edgecolor="k",
    linewidth=0.2,
    alpha=0.5,
    label="Kept",
)

# Red outline of the excluded region.
mask_3857.plot(
    ax=ax,
    facecolor="none",
    edgecolor="red",
    linewidth=2.0,
    label="Exclusion area",
)

ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

ax.legend()
ax.set_title("Preview of Landslides compilation after filtering", fontsize=16)
ax.set_axis_off()

plt.tight_layout()
plt.show()

# Stack the lidar-mapped deposits on top of the filtered compilation records.
combined_landslides = gpd.GeoDataFrame(
    pd.concat([deposits, compilation_filtered], ignore_index=True),
    crs=deposits.crs,
)

print("Combined dataset shape:", combined_landslides.shape)
print("Deposits shape:", deposits.shape)
print("Compilation shape:", compilation_filtered.shape)
print("Columns in combined dataset:")
for column in combined_landslides.columns:
    print(f"  - {column}")

New DataFrame Creation#

# Wrap the combined frame as a GeoDataFrame with an explicit geometry column and CRS.
washington_landslides = gpd.GeoDataFrame(combined_landslides, geometry='geometry', crs=deposits.crs)

print("New GeoDataFrame shape:", washington_landslides.shape)
print("\nColumn names:")
for column in washington_landslides.columns:
    print(f"  - {column}")

washington_landslides.head()

# Duplicate the classification columns under 'filter_*' names for downstream filtering.
for attribute in ('MOVEMENT', 'MATERIAL', 'CONFIDENCE'):
    washington_landslides[f'filter_{attribute}'] = washington_landslides[attribute]

washington_landslides.dtypes

Save the DataFrame#

# Export the unified dataset, then reload it to verify the round trip.
#
# Bug fix: the notebook previously saved to "washington_landslides_processed_v2.geojson"
# but reloaded "washington_landslides_processed.geojson", so the sanity check
# read a stale (or missing) file instead of the one just written. A single
# Path object is now used for both operations.
from pathlib import Path

output_path = Path("./processed_geojson/washington_landslides_processed_v2.geojson")
output_path.parent.mkdir(parents=True, exist_ok=True)  # robust even if the dir wasn't pre-created

washington_landslides.to_file(output_path, driver="GeoJSON")

# Round-trip check: read back the exact file we just wrote.
loaded_landslides = gpd.read_file(output_path)
print("Loaded GeoDataFrame shape:", loaded_landslides.shape)
loaded_landslides.head()

import matplotlib.pyplot as plt
import contextily as ctx

# Final overview map: every landslide plus the exclusion-area outline.
gdf_3857 = washington_landslides.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(15, 15))
gdf_3857.plot(ax=ax, edgecolor="k", linewidth=0.2, alpha=0.7)
mask_3857.plot(  # mask_3857 comes from the filtering cell above
    ax=ax,
    facecolor="none",
    edgecolor="red",
    linewidth=2.0,
    label="Exclusion area",
)

# Axis data is already in EPSG:3857, so the crs argument can be omitted.
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

plt.title('Washington Landslides', fontsize=16)
ctx.add_attribution(ax, "Map tiles: © OpenStreetMap contributors")
plt.tight_layout()
plt.show()
fig.savefig("./out.png")