
Update to use pandas v2.* #932


Merged
merged 34 commits into main from pandas-2 on May 20, 2025
Commits (34)
7b850ca
updates for pandas 2.2
jpn-- Mar 18, 2024
002604d
pytables 3.9
jpn-- Mar 18, 2024
8819b8c
input checker message failbacks
jpn-- Mar 19, 2024
be5c024
fix veh type categoricals
jpn-- Mar 19, 2024
98bc2e4
restore original pandas read_csv NaNs
jpn-- Mar 19, 2024
5beffda
is_monotonic_increasing
jpn-- Mar 19, 2024
9b67fec
fix disagg acc sorting
jpn-- Mar 19, 2024
234a420
drop unused indexes
jpn-- Mar 20, 2024
58003ed
update pipeline ref
jpn-- Mar 20, 2024
012e92e
temporarily disable sharrow in vehicle alloc
jpn-- Mar 22, 2024
c6975a4
fix dtype problem
jpn-- Mar 23, 2024
2a899e5
ensure MAX index does not overflow
jpn-- Mar 25, 2024
a752ea4
sort on join to preserve index ordering from old pandas
jpn-- Mar 25, 2024
543b19a
local compute test simplifies debugging
jpn-- Mar 26, 2024
8ed8fb9
Merge branch 'main' into depend-pandas-2
jpn-- Mar 26, 2024
50c9f6d
more robust conversion to pyarrow
jpn-- Mar 28, 2024
a393dbd
Merge branch 'main' into depend-pandas-2
jpn-- Apr 1, 2024
c06d737
Merge branch 'main' into depend-pandas-2
jpn-- Apr 3, 2024
cd38e57
Merge branch 'main' into pandas-2
jpn-- Mar 3, 2025
4f89ef6
rewrite df.eval to fast_eval
jpn-- Mar 3, 2025
59872fc
change xarray pin
jpn-- Mar 3, 2025
5191684
fix zarr pin
jpn-- Mar 3, 2025
8091bd5
update numpy and dask pins
jpn-- Mar 3, 2025
cf9fb21
wrap raw fast_eval in pd.Series
jpn-- Mar 3, 2025
7becbca
don't skip sharrow in veh alloc
jpn-- Mar 5, 2025
8be8e0d
rebuild ref pipeline
jpn-- Mar 5, 2025
804e780
Merge commit 'c59dc4cdf66e3f53816b00ca28fdbc2ca4fd0c8a' into pandas-2
jpn-- Mar 18, 2025
b019c4b
make fast_eval more robust
jpn-- Mar 18, 2025
501e249
revise external targets
jpn-- Mar 18, 2025
a0b3c27
prefer public API
jpn-- Mar 19, 2025
560db6b
Merge branch 'main' into pandas-2
jpn-- Apr 24, 2025
61a97b1
Update activitysim-dev-base.yml
jpn-- Apr 24, 2025
4b4906e
add note about why fast_eval exists and how to undo it
jpn-- May 20, 2025
d780be6
Merge branch 'main' into pandas-2
jpn-- May 20, 2025
4 changes: 2 additions & 2 deletions .github/workflows/core_tests.yml
@@ -275,11 +275,11 @@ jobs:
- region: Standard 1-Zone Example (MTC)
region-org: ActivitySim
region-repo: activitysim-prototype-mtc
region-branch: extended
region-branch: pandas2
- region: Standard 2-Zone Example (SANDAG)
region-org: ActivitySim
region-repo: sandag-abm3-example
region-branch: main
region-branch: pandas2
fail-fast: false
defaults:
run:
9 changes: 6 additions & 3 deletions activitysim/abm/models/disaggregate_accessibility.py
@@ -158,7 +158,7 @@ class DisaggregateAccessibilitySettings(PydanticReadable, extra="forbid"):
"""
Disaggreate accessibility table is grouped by the "by" cols above and the KEEP_COLS are averaged
across the group. Initializing the below as NA if not in the auto ownership level, they are skipped
in the groupby mean and the values are correct.
(It's a way to avoid having to update code to reshape the table and introduce new functionality there.)
If none, will keep all of the columns with "accessibility" in the name.
"""
@@ -581,7 +581,7 @@ def expand_template_zones(self, tables):
_expanded = pd.DataFrame(util.named_product(**index_params)).set_index("index")

# Use result to join template onto expanded table of zones
ex_table = _expanded.join(master_template).reset_index()
ex_table = _expanded.join(master_template).sort_index().reset_index()

# Concatenate a new unique set of ids
cols = ["home_zone_id", "proto_household_id", "proto_person_id"]
@@ -654,7 +654,9 @@ def create_proto_pop(self):
.set_index("index")
.rename(columns={"hhid": hhid})
)
persons = rep.join(persons).sort_values(hhid).reset_index(drop=True)
persons = (
rep.join(persons, sort=True).sort_values(hhid).reset_index(drop=True)
)
persons[perid] = persons.index + 1

# Assign persons to tours
@@ -730,6 +732,7 @@ def merge_persons(self):

perid = self.params["proto_persons"]["index_col"]
persons_merged.set_index(perid, inplace=True, drop=True)
persons_merged = persons_merged.sort_index()
self.proto_pop["proto_persons_merged"] = persons_merged

# Store in pipeline
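
Note on the sorting changes above: the commit message "sort on join to preserve index ordering from old pandas" indicates older pandas returned these joins in index order, while pandas 2 keeps the caller's row order. A minimal sketch with illustrative data (not the model's real tables):

import pandas as pd

_expanded = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 0, 1])
master_template = pd.DataFrame({"y": ["a", "b", "c"]}, index=[0, 1, 2])

# join follows _expanded's row order (2, 0, 1); sort_index() restores the
# index-ordered result that downstream code expects
ex_table = _expanded.join(master_template).sort_index().reset_index()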
60 changes: 34 additions & 26 deletions activitysim/abm/models/input_checker.py
@@ -301,34 +301,42 @@ def report_errors(state, input_checker_settings, v_warnings, v_errors):

for warn in warns:
if "dataframe validator" in str(warn.message):
file_logger.warning(
"Failed dataframe validator: "
+ str(warn.message).split("\n")[-1]
)
elif "element-wise validator" in str(warn.message):
if "DataFrameSchema" in str(warn.message):
file_logger.warning(
"Failed element-wise validator: <"
+ str(warn.message).split("\n")[0].split(" ")[1]
+ table_name
+ ")>\n\t"
+ str(warn.message)
.split("failure cases:\n")[0]
.split("\n")[-2]
+ "\n\tfailure cases:\n\t"
+ "\n\t".join(
str(warn.message)
.split("failure cases:\n")[1]
.split("\n")
)
)
else:
try:
file_logger.warning(
"Failed element-wise validator: <"
+ " ".join(str(warn.message).split("\n")[0].split(" ")[1:3])
+ "\n\t"
+ "\n\t".join(str(warn.message).split("\n")[1:])
"Failed dataframe validator: "
+ str(warn.message).split("\n")[-1]
)
except Exception:
file_logger.warning(warn)
elif "element-wise validator" in str(warn.message):
try:
if "DataFrameSchema" in str(warn.message):
file_logger.warning(
"Failed element-wise validator: <"
+ str(warn.message).split("\n")[0].split(" ")[1]
+ table_name
+ ")>\n\t"
+ str(warn.message)
.split("failure cases:\n")[0]
.split("\n")[-2]
+ "\n\tfailure cases:\n\t"
+ "\n\t".join(
str(warn.message)
.split("failure cases:\n")[1]
.split("\n")
)
)
else:
file_logger.warning(
"Failed element-wise validator: <"
+ " ".join(
str(warn.message).split("\n")[0].split(" ")[1:3]
)
+ "\n\t"
+ "\n\t".join(str(warn.message).split("\n")[1:])
)
except Exception:
file_logger.warning(warn)
else:
file_logger.warning(warn)
file_logger.warning("\n")
2 changes: 1 addition & 1 deletion activitysim/abm/models/school_escorting.py
@@ -634,7 +634,7 @@ def school_escorting(
state.add_table("tours", tours)
state.get_rn_generator().drop_channel("tours")
state.get_rn_generator().add_channel("tours", tours)
state.add_table("escort_bundles", escort_bundles)
state.add_table("escort_bundles", escort_bundles.reset_index(drop=True))
# save school escorting tours and trips in pipeline so we can overwrite results from downstream models
state.add_table("school_escort_tours", school_escort_tours)
state.add_table("school_escort_trips", school_escort_trips)
2 changes: 1 addition & 1 deletion activitysim/abm/models/trip_departure_choice.py
@@ -404,7 +404,7 @@ def apply_stage_two_model(
trace_label: str,
compute_settings: ComputeSettings | None = None,
):
if not trips.index.is_monotonic:
if not trips.index.is_monotonic_increasing:
trips = trips.sort_index()

# Assign the duration of the appropriate leg to the trip
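
Context for the one-line change above: Index.is_monotonic was deprecated in pandas 1.5 and removed in pandas 2.0 in favor of the explicit is_monotonic_increasing. A quick sketch:

import pandas as pd

trips = pd.DataFrame({"leg": [1, 2]}, index=[10, 3])  # illustrative data
if not trips.index.is_monotonic_increasing:  # was .is_monotonic before pandas 2
    trips = trips.sort_index()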
4 changes: 2 additions & 2 deletions activitysim/abm/models/util/school_escort_tours_trips.py
@@ -353,7 +353,7 @@ def create_chauf_escort_trips(bundles):
"outbound",
"purpose",
]
).reset_index()
).reset_index(drop=True)

# numbering trips such that outbound escorting trips must come first and inbound trips must come last
outbound_trip_num = -1 * (
@@ -539,7 +539,7 @@ def create_escortee_trips(bundles):
# create a new trip for each escortee destination
escortee_trips = escortee_trips.explode(
["destination", "escort_participants", "school_escort_trip_num", "purpose"]
).reset_index()
).reset_index(drop=True)

# numbering trips such that outbound escorting trips must come first and inbound trips must come last
# this comes in handy when merging trips to others in the tour decided downstream
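
For context on the drop=True additions above: explode repeats the original index label for each exploded row, and a bare reset_index() would promote those duplicated labels into a column; drop=True discards them instead. A toy sketch (illustrative data):

import pandas as pd

escortee_trips = pd.DataFrame({"destination": [[10, 11], [12]]}, index=[5, 7])
# explode yields index labels 5, 5, 7; drop=True throws them away rather
# than promoting them to an 'index' column
escortee_trips = escortee_trips.explode("destination").reset_index(drop=True)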
1 change: 1 addition & 0 deletions activitysim/abm/models/vehicle_allocation.py
@@ -261,6 +261,7 @@ def vehicle_allocation(
]

# set choice for non-household vehicle option
choices["choice"] = choices["choice"].astype(veh_choice_dtype)
choices.loc[
choices["alt_choice"] == alts_from_spec[-1], "choice"
] = alts_from_spec[-1]
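
A plausible reading of the added astype call, assuming the choice column is categorical (the surrounding code is not shown in full): pandas refuses to set a value outside a Categorical's categories, so the column must be widened to a dtype that includes the non-household-vehicle alternative before the .loc assignment. A hedged sketch with made-up names:

import pandas as pd

choices = pd.DataFrame({"choice": pd.Categorical(["car1", "car2"])})
# choices.loc[0, "choice"] = "non_hh_veh"  # TypeError: new category
veh_choice_dtype = pd.CategoricalDtype(["car1", "car2", "non_hh_veh"])
choices["choice"] = choices["choice"].astype(veh_choice_dtype)
choices.loc[0, "choice"] = "non_hh_veh"  # now a known category, so it succeeds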
17 changes: 10 additions & 7 deletions activitysim/cli/create.py
@@ -2,6 +2,7 @@

import glob
import hashlib
import importlib.resources
import logging
import os
import shutil
@@ -21,14 +22,15 @@

def _example_path(resource):
resource = os.path.join(EXAMPLES_DIR, resource)
path = pkg_resources.resource_filename(PACKAGE, resource)

return path
return importlib.resources.as_file(
importlib.resources.files(PACKAGE).joinpath(resource)
)


def _load_manifest():
with open(_example_path(MANIFEST), "r") as f:
manifest = yaml.safe_load(f.read())
with _example_path(MANIFEST) as f_pth:
with open(f_pth, "r") as f:
manifest = yaml.safe_load(f.read())

assert manifest, f"error: could not load {MANIFEST}"
return {example["name"]: example for example in manifest}
@@ -177,8 +179,9 @@ def get_example(
)

else:
for asset_path in glob.glob(_example_path(assets)):
copy_asset(asset_path, target_path, dirs_exist_ok=True)
with _example_path(assets) as pth:
for asset_path in glob.glob(str(pth)):
copy_asset(asset_path, target_path, dirs_exist_ok=True)

print(f"copied! new project files are in {os.path.abspath(dest_path)}")

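
The pattern above replaces the deprecated pkg_resources API with the stdlib importlib.resources. The key difference at the call sites: as_file() returns a context manager that materializes the resource to a real filesystem path (extracting from a zip if necessary), so callers must use a with-block instead of holding a bare path. A minimal sketch:

import importlib.resources

def resource_path(package: str, resource: str):
    # files() returns a Traversable; as_file() yields a concrete path
    # valid for the duration of the with-block
    return importlib.resources.as_file(
        importlib.resources.files(package).joinpath(resource)
    )

# usage (package/resource names hypothetical):
# with resource_path("activitysim", "examples/manifest.yaml") as pth:
#     text = open(pth).read()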
31 changes: 30 additions & 1 deletion activitysim/core/assign.py
@@ -96,7 +96,36 @@ def read_assignment_spec(
"""

try:
cfg = pd.read_csv(file_name, comment="#")
# we use an explicit list of na_values, these are the values that
# Pandas version 1.5 recognized as NaN by default. Notably absent is
# 'None' which is used in some spec files to be the object `None` not
# the float value NaN.
cfg = pd.read_csv(
file_name,
comment="#",
na_values=[
"",
"#N/A",
"#N/A N/A",
"#NA",
"-1.#IND",
"-1.#QNAN",
"-NaN",
"-nan",
"1.#IND",
"1.#QNAN",
"<NA>",
"N/A",
"NA",
"NULL",
"NaN",
"n/a",
"nan",
"null",
],
keep_default_na=False,
)

except Exception as e:
logger.error(f"Error reading spec file: {file_name}")
logger.error(str(e))
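
The mechanics of the na_values change above, sketched with a two-row file: an explicit na_values list together with keep_default_na=False means only the listed strings become NaN, so a literal 'None' in a spec cell survives as a string (and can later evaluate to the Python object None):

import io
import pandas as pd

csv = io.StringIO("target,expression\na,None\nb,NA\n")  # illustrative spec
cfg = pd.read_csv(csv, comment="#", na_values=["NA"], keep_default_na=False)
cfg["expression"].tolist()  # ['None', nan]; 'None' is kept as text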
106 changes: 106 additions & 0 deletions activitysim/core/fast_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pandas as pd
from pandas import eval as _eval

if TYPE_CHECKING:
from collections.abc import Hashable, Iterator, Mapping, Sequence

from pandas._typing import ArrayLike


def _get_cleaned_column_resolvers(
df: pd.DataFrame, raw: bool = True
) -> dict[Hashable, ArrayLike | pd.Series]:
"""
Return the special character free column resolvers of a dataframe.

Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas import Series
from pandas.core.computation.parsing import clean_column_name

if isinstance(df, pd.Series):
return {clean_column_name(df.name): df}

# CHANGED FROM PANDAS: do not even convert the arrays to pd.Series, just
# give the raw arrays to the compute engine. This is potentially a breaking
# change if any of the operations in the eval string require a pd.Series.
if raw:
# Performance tradeoff: in the dict below, we iterate over `df.items`,
# which yields tuples of (column_name, data as pd.Series). This is marginally
# slower than iterating over `df.columns` and `df._iter_column_arrays()`,
# but the latter is not in Pandas' public API, and may be removed in the future.
return {
clean_column_name(k): v for k, v in df.items() if not isinstance(k, int)
}

# CHANGED FROM PANDAS: do not call df.dtype inside the dict comprehension loop
# This update has been made in https://github.com/pandas-dev/pandas/pull/59573,
# but appears not to have been released yet as of pandas 2.2.3
dtypes = df.dtypes

return {
clean_column_name(k): Series(
v, copy=False, index=df.index, name=k, dtype=dtypes[k]
).__finalize__(df)
for k, v in zip(df.columns, df._iter_column_arrays())
if not isinstance(k, int)
}


def fast_eval(df: pd.DataFrame, expr: str, **kwargs) -> Any | None:
"""
Evaluate a string describing operations on DataFrame columns.

Operates on columns only, not specific rows or elements. This allows
`eval` to run arbitrary code, which can make you vulnerable to code
injection if you pass user input to this function.

This function is a wrapper that replaces :meth:`~pandas.DataFrame.eval`
with a more efficient version than in the default pandas library (as
of pandas 2.2.3). It is recommended to use this function instead of
:meth:`~pandas.DataFrame.eval` for better performance. However, if you
encounter issues with this function, you can switch back to the default
pandas eval by changing the function call from `fast_eval(df, ...)` to
`df.eval(...)`.

Parameters
----------
expr : str
The expression string to evaluate.
**kwargs
See the documentation for :meth:`~pandas.DataFrame.eval` for complete
details on the keyword arguments accepted.

Returns
-------
ndarray, scalar, or pandas object
The result of the evaluation.
"""

inplace = False
kwargs["level"] = kwargs.pop("level", 0) + 1
index_resolvers = df._get_index_resolvers()
column_resolvers = _get_cleaned_column_resolvers(df)
resolvers = column_resolvers, index_resolvers
if "target" not in kwargs:
kwargs["target"] = df
kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers

try:
return pd.Series(
_eval(expr, inplace=inplace, **kwargs), index=df.index, name=expr
).__finalize__(df)
except Exception as e:
# Initially assume that the exception is caused by the potentially
# breaking change in _get_cleaned_column_resolvers, and try again
# TODO: what kind of exception should be caught here so it is less broad
column_resolvers = _get_cleaned_column_resolvers(df, raw=False)
resolvers = column_resolvers, index_resolvers
kwargs["resolvers"] = kwargs["resolvers"][:-2] + resolvers
return _eval(expr, inplace=inplace, **kwargs)
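
A minimal usage sketch of the wrapper defined above; calls are one-for-one with DataFrame.eval, which the docstring names as the fallback if fast_eval misbehaves:

import pandas as pd
from activitysim.core.fast_eval import fast_eval

df = pd.DataFrame({"income": [30000, 80000], "hhsize": [1, 3]})
v1 = fast_eval(df, "income / hhsize")  # Series aligned to df.index
v2 = df.eval("income / hhsize")        # stock-pandas escape hatch
assert (v1 == v2).all()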
5 changes: 3 additions & 2 deletions activitysim/core/interaction_simulate.py
@@ -14,6 +14,7 @@

from activitysim.core import chunk, logit, simulate, tracing, util, workflow
from activitysim.core.configuration.base import ComputeSettings
from activitysim.core.fast_eval import fast_eval

logger = logging.getLogger(__name__)

@@ -287,7 +288,7 @@ def to_series(x):
if expr.startswith("@"):
v = to_series(eval(expr[1:], globals(), locals_d))
else:
v = df.eval(expr, resolvers=[locals_d])
v = fast_eval(df, expr, resolvers=[locals_d])

if check_for_variability and v.std() == 0:
logger.info(
@@ -556,7 +557,7 @@ def to_series(x):
if expr.startswith("@"):
v = to_series(eval(expr[1:], globals(), locals_d))
else:
v = df.eval(expr, resolvers=[locals_d])
v = fast_eval(df, expr, resolvers=[locals_d])
if check_for_variability and v.std() == 0:
logger.info(
"%s: no variability (%s) in: %s"
10 changes: 9 additions & 1 deletion activitysim/core/los.py
@@ -780,7 +780,15 @@ def get_mazpairs(self, omaz, dmaz, attribute):
self.maz_ceiling
) + np.asanyarray(dmaz, dtype=np.int64)
else:
i = np.asanyarray(omaz) * self.maz_ceiling + np.asanyarray(dmaz)
# if we have less than a 32-bit index, it will
# overflow so we need to upgrade to at least 32 bit
omaz_as_array = np.asanyarray(omaz)
if omaz_as_array.dtype not in (np.int32, np.int64):
omaz_as_array = omaz_as_array.astype(np.int32)
dmaz_as_array = np.asanyarray(dmaz)
if dmaz_as_array.dtype not in (np.int32, np.int64):
dmaz_as_array = dmaz_as_array.astype(np.int32)
i = omaz_as_array * self.maz_ceiling + dmaz_as_array
s = util.quick_loc_df(i, self.maz_to_maz_df, attribute)

# FIXME - no point in returning series?
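
Why the dtype guard above matters: NumPy integer multiplication wraps silently on overflow, so MAZ ids stored in a narrow dtype can produce garbage pair indexes. A hedged sketch (dtype and values illustrative):

import numpy as np

omaz = np.array([1200], dtype=np.int16)
maz_ceiling = 30000
omaz * maz_ceiling                     # int16 math overflows and wraps, silently
omaz.astype(np.int32) * maz_ceiling    # 36_000_000, the intended pair index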