%%capture

import warnings
warnings.filterwarnings("ignore")

import altair as alt
import branca.colormap as cm
import folium
import pandas as pd

import calitp_data_analysis.magics
import gt_extras as gte

from great_tables import GT, md

import chart_utils_for_operators as chart_utils
import prep_operator_data
import report_utils
import _color_palette
from rt_msa_utils import operator_report_month

# Switch Altair to its "vegafusion" data transformer for the charts in this report.
alt.data_transformers.enable("vegafusion")

# Report month, parsed with pandas; it is used below both as a dataset filter
# value and (via strftime) as the subtitle shown in the tables.
one_month = pd.to_datetime(operator_report_month)

# parameters cell: example values, left commented out.
#name = "Antelope Valley Transit Authority Trip Updates"
#schedule_name = "Antelope Valley Transit Authority Schedule"

# Parameters
# NOTE(review): presumably injected per operator by the notebook runner — confirm.
# `name` is matched against the `tu_name` column in the filters below.
name = "Culver City TripUpdates"

%%capture_parameters

#name = "Antelope Valley Transit Authority Trip Updates"
# NOTE(review): schedule_name is hard-coded in this cell and does not follow the
# `name` parameter — confirm before relying on it downstream.
schedule_name = "Antelope Valley Transit Authority Schedule"
date_format = "%b %Y" # gtfs_digest/_new_operator_report_utils.py
# Human-readable report month (abbreviated month name + 4-digit year).
one_month_formatted = one_month.strftime(date_format)

# Final expression of the cell; presumably what the capture_parameters magic
# records — keep the tuple contents and order as-is.
name, schedule_name, one_month_formatted

Culver City TripUpdates¶

GTFS Stop Time Updates Jan 2026 Summary¶

Generally, we want a better transit user experience. Specifically, the performance metrics we can derive from GTFS RT Trip Updates distill into the following objectives:

Increase prediction reliability and accuracy
Increase the availability and completeness of GTFS RT
Decrease the inconsistency and fluctuations of predictions

# Column groups used to slice the operator-level dataframe for each table.

# Identifier column shown first in every table.
operator_cols = ["day_type"]

# Scheduled-service summary columns.
schedule_cols = [
    "daily_trips",
    "daily_service_hours",
    "n_routes",
    "n_shapes",
    "n_stops",
    "num_stop_times",
    "daily_arrivals",
    "n_days_schedule_and_rt",
]

# Vehicle positions feed columns ('daily_vp_trips' intentionally left out).
vp_cols = [
    "vp_messages_per_minute",
    "pct_vp_trips",
    "pct_vp_routes",
]

# Trip updates feed columns ('daily_tu_trips' intentionally left out).
tu_cols = [
    "tu_messages_per_minute",
    "pct_tu_trips",
    "pct_tu_routes",
]

# Trip updates prediction metrics.
tu_prediction_cols = [
    # both are percents
    "bus_catch_likelihood",
    "pct_tu_complete_minutes",
    # percentile columns
    "p25",
    "p75",
    "iqr",
    "p50",
    "n_predictions",
    "prediction_padding_minutes",
    "avg_prediction_spread_minutes",
]

# Operator-level rows for this report: the report month and the trip updates
# feed named by `name`.
operator_month_filters = [[
    ("month_first_day", "==", one_month),
    ("tu_name", "==", name),
]]

# Load the filtered rows, then hand them to the percentile-merge helper.
df = prep_operator_data.merge_in_operator_percentiles(
    report_utils.import_operator_df(filters=operator_month_filters)
)

Schedule + RT Summary Stats¶

# Schedule summary table: the day_type column followed by the schedule columns,
# with integer formatting for counts and one decimal for service hours.
schedule_table = (
    GT(df[operator_cols + schedule_cols])
    .cols_label(
        daily_trips = "Daily Trips",
        daily_service_hours = "Daily Service Hours",
        n_routes = "# Routes",
        n_shapes = "# Shapes",
        n_stops = "# Stops",
        num_stop_times = "Total Scheduled Arrivals",
        daily_arrivals = "Daily Scheduled Arrivals",
        # Label fixed: the previous text ("# days with both RT") dropped the
        # word "Schedule" that the column name n_days_schedule_and_rt implies.
        n_days_schedule_and_rt = "# Days with Schedule + RT",
    ).fmt_integer(
        columns = [
            "daily_trips", "n_routes", "n_shapes", "n_stops",
            "num_stop_times", "daily_arrivals", "n_days_schedule_and_rt"]
    ).fmt_number(
        columns = ["daily_service_hours"], decimals=1
    ).tab_spanner(
        label="Schedule",
        columns = schedule_cols
    ).tab_header(
        title = "Schedule + RT Summary Metrics",
        subtitle = f"{one_month_formatted}"
    )
)

# Apply the shared report styling; as the last expression it is the cell output.
chart_utils.format_great_table(schedule_table)

# Columns shared by the number formatting and the colouring steps below.
rt_rate_cols = ["tu_messages_per_minute", "vp_messages_per_minute"]
rt_pct_cols = ["pct_tu_trips", "pct_tu_routes", "pct_vp_trips", "pct_vp_routes"]

# Realtime summary table: vehicle positions and trip updates side by side,
# each group under its own spanner.
rt_table = (
    GT(df[operator_cols + vp_cols + tu_cols])
    .cols_label(
        vp_messages_per_minute="Vehicle Positions per Minute",
        pct_vp_trips="% Trips",
        pct_vp_routes="% Routes",
        tu_messages_per_minute="Trip Updates per Minute",
        pct_tu_trips="% Trips",
        pct_tu_routes="% Routes",
    )
    .fmt_number(columns=rt_rate_cols, decimals=1)
    .fmt_percent(columns=rt_pct_cols, decimals=1)
    .tab_spanner(label="Trip Updates", columns=tu_cols)
    .tab_spanner(label="Vehicle Positions", columns=vp_cols)
)

# Shared report styling first, then colour boxes for the message rates and a
# two-colour gradient for the percent columns.
rt_display = chart_utils.format_great_table(rt_table)
rt_display = gte.gt_color_box(
    rt_display,
    columns=rt_rate_cols,
    palette="Blues",
    domain=[1, 3],
)
rt_display = gte.gt_hulk_col_numeric(
    rt_display,
    columns=rt_pct_cols,
    palette=["#FFEC8B", "#E5F5E0"],  # [light goldenrod1, white alyssum (from greens)]
    domain=[0, 1],
    alpha=0.1,
)
rt_display

# Column groups that share a number format in the prediction table.
prediction_pct_cols = ["bus_catch_likelihood", "pct_tu_complete_minutes"]
prediction_minute_cols = [
    "p50",
    "avg_prediction_spread_minutes",
    "prediction_padding_minutes",
]

# Trip updates prediction metrics table, wrapped in the shared report styling.
table = chart_utils.format_great_table(
    GT(df[operator_cols + tu_prediction_cols])
    .cols_label(
        bus_catch_likelihood="% Early + On-time",
        pct_tu_complete_minutes="% Minutes with 2+ Predictions",
        p50="Prediction Error",
        n_predictions="# Predictions",
        prediction_padding_minutes="Prediction Padding",
        avg_prediction_spread_minutes="Prediction Spread / Wobble",
        iqr="",
    )
    .fmt_percent(columns=prediction_pct_cols, decimals=1)
    .fmt_number(columns=prediction_minute_cols, decimals=1)
    .fmt_integer(columns=["n_predictions"])
    .tab_header(
        title="Trip Update Prediction Accuracy Metrics",
        subtitle="units are in minutes",
    )
)

# Draw the p25-p75 range as a dumbbell plot.
prediction_display = gte.gt_plt_dumbbell(
    table,
    col1="p25",
    col2="p75",
    label="IQR",
    num_decimals=1,
    col1_color="purple",
    col2_color="orange",
    width=100,
    height=50,
    font_size=8,
)

# Two-colour gradient on the percent columns.
prediction_display = gte.gt_hulk_col_numeric(
    prediction_display,
    columns=["bus_catch_likelihood", "pct_tu_complete_minutes"],
    palette=["#FFEC8B", "#E5F5E0"],  # [light goldenrod1, white alyssum (from greens)]
    domain=[0, 1],
    alpha=0.1,
)

# Give every listed metric column the same width; last expression = cell output.
prediction_display.cols_width(
    cases=dict.fromkeys(
        [
            "bus_catch_likelihood",
            "pct_tu_complete_minutes",
            "prediction_padding_minutes",
            "avg_prediction_spread_minutes",
            "n_predictions",
            "iqr",
            "p50",
        ],
        "10%",
    )
)

Prediction Error Percentiles¶

The ratio of the 10th to 50th percentiles provides a sense of accuracy loss.

Newmark’s paper on a small sample of transit agencies suggests that the positive prediction errors typically have ratios of 4.
Late predictions (negative prediction errors) have ratios around 3.

# Columns needed for the prediction-error percentile chart.
decile_cols = [
    # identifiers
    "month_first_day",
    "day_type",
    "schedule_name",
    "tu_name",
    # array columns
    "pos_prediction_error_sec_array",
    "pos_prediction_error_sec_percentile_array",
    "neg_prediction_error_sec_array",
    # NOTE(review): unlike the three columns above, this one carries no
    # pos_/neg_ prefix — confirm it is the intended column.
    "prediction_error_sec_percentile_array",
]

# Same month + feed selection as the operator summary tables.
decile_filters = [[
    ("month_first_day", "==", one_month),
    ("tu_name", "==", name),
]]

operator_deciles_df = prep_operator_data.operator_deciles_for_chart(
    report_utils.import_operator_df(filters=decile_filters, columns=decile_cols)
)

percentile_chart = chart_utils.fig5and6_prediction_error_plots(operator_deciles_df)
percentile_chart

operator_cols = ["day_type"]
percentile_chart_cols = ["pos_error_ratio", "neg_error_ratio"]

# Keep only the report month's rows and the columns shown in the mini table.
is_report_month = df.month_first_day == one_month
error_ratio_df = df.loc[is_report_month, operator_cols + percentile_chart_cols]

# Small companion table for the percentile chart: one ratio column for early
# predictions and one for late predictions.
mini_p10_p50_table = (
    GT(error_ratio_df)
    .cols_label(
        neg_error_ratio="Late Predictions",
        pos_error_ratio="Early Predictions",
    )
    .fmt_number(columns=["pos_error_ratio", "neg_error_ratio"], decimals=1)
    .tab_header(
        title="Accuracy Loss",
        subtitle="Ratio of 10th to 50th Percentile (lower = less accuracy loss)",
    )
)

# Shared report styling, narrowed to half width; last expression = cell output.
chart_utils.format_great_table(mini_p10_p50_table).tab_options(table_width="50%")

Map of Routes¶

The following layers are available:

Average prediction error (minutes) for all routes
Routes with <90% update completeness. Providing complete real-time information for all routes is the crucial foundation.
Highly Variable Routes (IQR ≥ 3 minutes) that could benefit from transit-supportive policies (signal priority, bus lanes). The variability in prediction accuracy here could be due to the local traffic conditions.
Routes with low Bus Catch Likelihood (early + on-time accuracy < 75%), i.e., routes where predictions are late more than 25% of the time.

# Weekday route geometries and prediction metrics for the map layers.
# The rows are selected by the report's trip updates feed (`tu_name == name`),
# the same key used for the operator tables and the route summary table.
# Filtering on the separately hard-coded `schedule_name` could map a
# different operator's routes than the one this report is about.
route_gdf = report_utils.import_route_df(
    filters = [[
        ("month_first_day", "==", one_month),
        ("tu_name", "==", name),
        ("day_type", "==", "Weekday")
    ]],
    columns = [
        "schedule_name", "tu_name",
        "route_dir_name",
        "avg_prediction_error_minutes", "prediction_error_label",
        "pct_tu_complete_minutes",
        "iqr", "bus_catch_likelihood",
        "geometry"
    ]
).drop_duplicates().reset_index(drop=True)  # de-duplicate, then renumber rows 0..n-1

# Colour palette and legend settings for the prediction-error map and table.
PREDICTION_ERROR_COLORS = [*_color_palette.PREDICTION_ERROR_COLOR_PALETTE.values()]
PREDICTION_ERROR_INDEX = [-5, -3, -1, 1, 3, 5]
PREDICTION_ERROR_LEGEND_CAPTION = "minutes (negative = late; positive = early)"

POS_BAR_COLOR = _color_palette.get_color("blueberry")
NEG_BAR_COLOR = _color_palette.get_color("vivid_cerise")

# Boolean row masks over route_gdf, one per optional map layer.
condition_completeness = route_gdf["pct_tu_complete_minutes"].lt(0.9)
condition_variability = route_gdf["iqr"].ge(3)
condition_likelihood = route_gdf["bus_catch_likelihood"].lt(0.75)

# Stepped colour scale for average prediction error, clipped to -5..5 minutes.
prediction_error_cmap = cm.StepColormap(
    colors=PREDICTION_ERROR_COLORS,
    index=PREDICTION_ERROR_INDEX,
    vmin=-5,
    vmax=5,
    tick_labels=PREDICTION_ERROR_INDEX,
    caption=PREDICTION_ERROR_LEGEND_CAPTION,
)

# Base layer: every route, coloured by its average prediction error.
m = route_gdf.explore(
    "avg_prediction_error_minutes",
    name="All Routes",
    tiles="CartoDB Positron",
    cmap=prediction_error_cmap,
    marker_kwds={"fill": True},
    style_kwds={"opacity": 0.5, "fillOpacity": 0.3},
)

# Optional layers, added in this order and only when at least one route matches:
# (row mask, column to colour by, extra keyword arguments for explore).
optional_layers = [
    (
        condition_completeness,
        "route_dir_name",
        {
            # color by route-dir name, same as stop report
            "name": "< 90% update completeness",
            "categorical": True,
            "legend": False,
        },
    ),
    (
        condition_variability,
        "iqr",
        {
            "name": "High Variability (IQR 3+ minutes) Routes",
            "categorical": False,
            "legend": True,
            "cmap": "viridis",
        },
    ),
    (
        condition_likelihood,
        "bus_catch_likelihood",
        {
            "name": "<75% Bus Catch Likelihood",
            "categorical": False,
            "legend": True,
            "cmap": "cividis",
        },
    ),
]

for layer_mask, layer_column, layer_kwargs in optional_layers:
    layer_gdf = route_gdf[layer_mask]
    if layer_gdf.empty:
        # No matching routes: skip the layer entirely.
        continue
    m = layer_gdf.explore(
        layer_column,
        m=m,
        tiles="CartoDB Positron",
        **layer_kwargs,
    )

# Layer toggle so readers can switch between the layers added above.
folium.LayerControl().add_to(m)

<folium.map.LayerControl at 0x78add03665d0>

Route Summary¶

Prediction accuracy varies by route. In particular, routes with high variability (high IQR) may have local traffic conditions that confound the prediction algorithm. For these routes, improving service reliability through additional infrastructure (signal priority, bus lanes) or other transit planning and policies could be explored.

Negative 25th percentiles (more than 1 minute late) mean that riders miss the bus (late predictions). These routes may benefit from service reliability improvements for riders.

# Weekday route-level rows for this feed and report month.
route_iqr_filters = [[
    ("tu_name", "==", name),
    ("month_first_day", "==", one_month),
    ("day_type", "==", "Weekday"),
]]

# Columns shown in the route summary table.
# does putting IQR help in interpretation?
route_iqr_cols = [
    "route_dir_name",
    "avg_prediction_error_minutes",
    "n_predictions",
    "p25",
    "p75",
    "iqr",
    "scaled_p25",
    "scaled_p75",
]

# Largest IQR first, so the most variable routes are at the top.
route_iqr_df = report_utils.import_route_df(
    filters=route_iqr_filters,
    columns=route_iqr_cols,
).sort_values(by="iqr", ascending=False)

# Subtitle text is kept exactly as written (including its leading whitespace).
# NOTE(review): the 12-space indentation inside the literal may be rendered by
# Markdown as an indented code block — confirm the rendered subtitle looks as intended.
route_summary_subtitle = md(
            """
            High IQR = variability -> focus on service reliability through transit planning and policies. 
            Variability could be due to local traffic conditions. 
            Negative 25th percentiles = riders miss bus."""
)

# Route-level summary table (dumbbell plots and colour boxes are added afterwards).
route_iqr_table = (
    GT(route_iqr_df)
    .cols_label(
        route_dir_name="Route-Direction",
        avg_prediction_error_minutes="Prediction Error (minutes)",
        n_predictions="# Predictions",
        iqr="",
    )
    .fmt_integer(columns=["n_predictions"])
    .tab_header(
        title="Route Summary Metrics",
        subtitle=route_summary_subtitle,
    )
    .tab_options(table_font_size="11px", table_width="100%")
)

# Size and colours shared by both dumbbell plots.
dumbbell_style = {
    "width": 200,
    "height": 50,
    "col1_color": _color_palette.get_color("valentino"),
    "col2_color": _color_palette.get_color("lizard_green"),
    "font_size": 8,
}

# p25-p75 range in minutes.
route_summary = gte.gt_plt_dumbbell(
    route_iqr_table,
    col1="p25",
    col2="p75",
    label="IQR (minutes)",
    num_decimals=1,
    **dumbbell_style,
)

# Same range on the scaled columns.
route_summary = gte.gt_plt_dumbbell(
    route_summary,
    col1="scaled_p25",
    col2="scaled_p75",
    label="scaled IQR",
    num_decimals=3,
    **dumbbell_style,
)

# Colour boxes: prediction error uses the report palette on a fixed -5..5
# domain; IQR uses the "YlOrRd" palette with no explicit domain.
route_summary = gte.gt_color_box(
    route_summary,
    columns=["avg_prediction_error_minutes"],
    palette=PREDICTION_ERROR_COLORS,
    domain=[-5, 5],
)
route_summary = gte.gt_color_box(
    route_summary,
    columns=["iqr"],
    palette="YlOrRd",
)

# Column widths and alignment; last expression = cell output.
(
    route_summary
    .cols_width(
        cases=dict.fromkeys(
            ["avg_prediction_error_minutes", "n_predictions", "iqr"],
            "10%",
        )
    )
    .cols_align("center")
    .cols_align("left", columns="route_dir_name")
)