%%capture
import warnings
warnings.filterwarnings("ignore")
import altair as alt
import branca.colormap as cm
import folium
import gt_extras as gte
import itables
import pandas as pd
import polars as pl
import calitp_data_analysis.magics
from datetime import datetime
from great_tables import GT, nanoplot_options
from IPython.display import HTML
import _color_palette
import chart_utils_for_stops as chart_utils
import report_utils
from rt_msa_utils import stop_report_month
# Switch Altair's data transformer to VegaFusion and turn on dataset consolidation.
alt.data_transformers.enable("vegafusion")
alt.data_transformers.enable(consolidate_datasets=True)

# Report month as abbreviated month + year, e.g. "Jan 2026".
formatted_date = format(
    datetime.strptime(stop_report_month, "%Y-%m-%d"),
    "%b %Y",
)
# This is the `parameters`-tagged cell. The example values below are kept
# commented out (presumably so they do not compete with the injected values
# that follow — confirm against the notebook runner).
#schedule_name = "Big Blue Bus Schedule"
#name = "Big Blue Bus TripUpdates"
# Parameters
# Value supplied for this particular report run.
name = "Antelope Valley Transit Authority Trip Updates"
Schedule: Antelope Valley Transit Authority Schedule
Data reflecting: Jan 2026 weekdays
One of the most common transit user behaviors is to consult an app (Google Maps, Apple Maps, NextBus, etc.) to find out when the bus or train is going to arrive.
That widely desired piece of information is powered by GTFS Real-Time Trip Updates, specifically the Stop Time Updates specification. The underlying data produced here is huge. Imagine every instance a bus arrives at a stop in California. Multiply that by 30 for the 30 minutes before the bus arrives, and that’s the dataset we’re working to distill into usable performance metrics for all transit operators.
Generally, we want a better transit user experience. Specifically, the performance metrics we can derive from GTFS RT Trip Updates distill into the following objectives:
Increase prediction reliability and accuracy
Increase the availability and completeness of GTFS RT
Decrease the inconsistency and fluctuations of predictions
# Shared color settings, defined once so the maps, the route dropdown charts,
# and the great tables all draw from the same values.
PREDICTION_ERROR_COLORS = [*_color_palette.PREDICTION_ERROR_COLOR_PALETTE.values()]
PREDICTION_ERROR_INDEX = [-5, -3, -1, 1, 3, 5]
PREDICTION_ERROR_LEGEND_CAPTION = "minutes (negative = late; positive = early)"

# Colors for positive / negative bars.
POS_BAR_COLOR = _color_palette.get_color("blueberry")
NEG_BAR_COLOR = _color_palette.get_color("vivid_cerise")
Due to size limitations, the routes that have at least one priority stop are presented in an interactive table.
# Initialize itables for this notebook (with the library defaults, DataFrames
# are presumably rendered as interactive tables from here on — confirm
# against the installed itables version).
itables.init_notebook_mode()

# Columns shown in the stop table, in display order.
display_cols = [
    "route_dir_name", "stop_id", "stop_name",
    "daily_stop_arrivals",
    "avg_prediction_error_minutes",
    "pct_tu_complete_minutes",
    "bus_catch_likelihood",
    "n_predictions", "n_tu_trips",
]

# Every displayed column except the route label is used as a grouping key.
group_cols = [c for c in display_cols if c != "route_dir_name"]

# Show each stop only once, even if multiple routes serve it: collapse the
# route labels into a list per stop. The labels are de-duplicated and sorted
# so the rendered table is identical between runs (plain set iteration order
# for strings can change from one Python process to the next); `key=str`
# keeps the sort safe if a label is missing.
# NOTE(review): groupby drops rows whose grouping keys contain NaN by default
# (dropna=True), so a stop with a missing metric would not appear — confirm
# that is intended.
display_df = (
    plot_gdf
    .groupby(group_cols)
    .agg({"route_dir_name": lambda x: sorted(set(x), key=str)})
    .reset_index()
    # Busiest stops first.
    .sort_values("daily_stop_arrivals", ascending=False)
)
Loading...
Loading...
# Interactive stop table: the route label leads, followed by the stop-level
# columns in their grouping order.
itables.show(
    display_df[["route_dir_name", *group_cols]],
    caption="Stops",
    classes="compact",
    pageLength=2,
    maxBytes="5MB",  # size cap; tunable
    search={"caseInsensitive": True},
    showIndex=False,
    fixedColumns={"start": 3, "end": 0},
    scrollX=True,
    buttons=[
        "pageLength",
        "columnsToggle",
        "copyHtml5",
        "csvHtml5",
        "excelHtml5",
    ],
)
Loading...
# https://mwouts.github.io/itables/apps/notebook.html
# Turn itables' interactive display back off (all_interactive=False), so the
# outputs that follow presumably fall back to the regular notebook rendering.
itables.init_notebook_mode(all_interactive=False)
# The table has been rendered; drop the name so the frame can be released.
del display_df
Average prediction error: The accuracy of the predictions is predicted arrival - actual arrival. Closer to zero or small positive values are better, as you are more likely to catch the bus with minimal wait time.
Goal 1: fewer stops with negative prediction errors. We would rather have transit users follow the predictions and wait for the bus.
Goal 2: tighten the IQR (interquartile range) of prediction errors and have the range move closer to zero for shorter expected wait times.
Update completeness: The % of minutes with 2+ predictions available. Higher is better.
Goal: 90% or above
Bus catch likelihood: The % of predictions that are early or on-time; it captures whether you’re likely to catch the bus by following the prediction exactly. Higher is better.
Goal: 80% or above
Prediction spread / wobble: this metric tracks whether the predicted arrival time was consistent before the bus arrived. Lower is better: it means the predictions are not fluctuating wildly and frustrating the rider.
# Shared settings for the interactive Altair charts: chart size, plus the
# column names used for stop ordering and for direction facets.
HEIGHT = 150
WIDTH = 200
stop_col = "stop_rank"
direction_col = "direction_id"

# Route picker: a dropdown listing every route name alphabetically, bound to
# a point selection on `route_name`.
list_of_routes = sorted(stop_gdf["route_name"].unique().tolist())
dropdown = alt.binding_select(options=list_of_routes, name="Route ")
selection = alt.selection_point(fields=["route_name"], bind=dropdown)

# Second selection on the same field, driven by the chart legend.
legend_selection = alt.selection_point(fields=["route_name"], bind="legend")

# Column subset handed to Altair (it can't have timestamp or geometry columns).
stop_df = stop_gdf[
    [
        # identifiers and ordering
        "stop_id",
        "stop_name",
        "route_name",
        "direction_id",
        "stop_rank",
        # these are the metrics
        "avg_prediction_error_minutes",
        "prediction_error_label",
        "pct_tu_complete_minutes",
        "bus_catch_likelihood",
        "avg_prediction_spread_minutes",
        "n_predictions",
        "n_tu_trips",
    ]
]
# Bar chart of average prediction error, built on the shared
# chart_ordered_by_stop helper with the route dropdown attached; faceting is
# requested from the helper via `is_faceted`.
prediction_error_bar = chart_utils.chart_ordered_by_stop(
stop_df,
y_col = "avg_prediction_error_minutes",
dropdown_selection = selection,
is_faceted = True
).mark_bar().encode(
# Two-way color rule: values at or above zero get POS_BAR_COLOR, everything
# else gets NEG_BAR_COLOR.
# Not sure whether when-then can support multiple conditions yet; it seems to
# pick up only the first condition plus the otherwise branch.
# https://github.com/vega/altair/issues/2759
color = alt.when(
alt.datum.avg_prediction_error_minutes >= 0, empty=False
).then(alt.value(POS_BAR_COLOR))
.otherwise(alt.value(NEG_BAR_COLOR)),
).properties(
title = "Avg Prediction Error (minutes)",
height = HEIGHT, width = WIDTH
)
# Stacked bar of stop counts per prediction error category (per the title),
# built by the chart_utils helper and wired to both the route dropdown and the
# legend-driven selection.
error_stacked_bar = chart_utils.prediction_error_categories_stacked_bar(
stop_df,
dropdown_selection = selection,
legend_selection = legend_selection
).properties(
title = "# Stops by Prediction Error Category",
height = HEIGHT, width = WIDTH
)
# Update-completeness line chart, one facet column per direction.
# Note the order: define chart height/width first, then facet, then add the
# title (so it appears over the faceted chart).
# https://stackoverflow.com/questions/52872927/altair-cant-facet-layered-plots
# horiz_y_value = 0.9 lines up with the 90% completeness goal stated in the
# report text (presumably drawn by the helper as a horizontal reference line
# — confirm in chart_utils).
# NOTE(review): the title reads "2+ Prediction" (singular) while the report
# text says "2+ predictions" — confirm which wording is wanted.
completeness_chart = chart_utils.pct_completeness_line_chart(
stop_df,
y_col = "pct_tu_complete_minutes",
dropdown_selection = selection,
horiz_y_value = 0.9
).properties(
height = HEIGHT, width = WIDTH
).facet(column = direction_col, title = "Direction").properties(title = "% Minutes with 2+ Prediction")
# Bus-catch-likelihood line chart, one facet column per direction.
# Faceting this way allows layering, but the title can't be overwritten correctly.
# horiz_y_value = 0.8 lines up with the 80% goal stated in the report text
# (presumably drawn by the helper as a horizontal reference line — confirm
# in chart_utils).
bus_catch_chart = chart_utils.bus_catch_likelihood_line_chart(
stop_df,
y_col = "bus_catch_likelihood",
dropdown_selection = selection,
horiz_y_value = 0.8
).properties(
height = HEIGHT, width = WIDTH
).facet(column = direction_col, title = "Direction").properties(title = "% Early + On-time Predictions")
# Prediction spread / wobble line chart, one facet column per direction.
# Same build order as the other faceted line charts: size the chart, facet by
# direction, then set the overall title. No goal value is passed to this helper.
prediction_spread_line = chart_utils.prediction_spread_line_chart(
stop_df,
y_col = "avg_prediction_spread_minutes",
dropdown_selection = selection,
).properties(
height = HEIGHT, width = WIDTH
).facet(column = direction_col, title = "Direction").properties(title = "Prediction Spread / Wobble (minutes)")
This table shows all the routes side-by-side, in ascending order by prediction error (more late stops shown first).
The nanoplots show prediction error by stop order, showing there is quite a bit of variation even along the same route.
Average prediction error: The accuracy of the predictions is predicted arrival - actual arrival. Closer to zero or small positive values are better, as you are more likely to catch the bus with minimal wait time.
Goal 1: fewer stops with negative prediction errors, lower bound of IQR not negative. We would rather have transit users follow the predictions and wait for the bus.
Goal 2: a small IQR (interquartile range) of prediction errors, with the range moving closer to zero for shorter expected wait times.
# Combine the route-level and stop-level frames into the input for the
# nanoplot table.
nanoplot_df = report_utils.merge_route_to_stop_for_nanoplot(route_df, stop_gdf)

# TODO: check torrance schedule, how it behaves, because maybe will have to pick 1

# Keep only the columns the table needs, then order the rows: lowest (most
# negative) average prediction error first, then routes with more late stops,
# with route name (descending) as the final tie-breaker. The sorted pandas
# frame is then converted to polars.
nanoplot_pl = pl.from_pandas(
    nanoplot_df[
        [
            "route_dir_name",
            "avg_prediction_error_minutes",
            # nanoplot columns
            "prediction_error_by_stop",
            "p25",
            "p75",
            # "early_late_stop_counts"
            "n_early_stops",
            "n_late_stops",
        ]
    ].sort_values(
        by=["avg_prediction_error_minutes", "n_late_stops", "route_dir_name"],
        ascending=[True, False, False],
    )
)