
Update to use pandas v2.* #932


Merged
merged 34 commits into main from pandas-2 on May 20, 2025
Commits (34)
7b850ca
updates for pandas 2.2
jpn-- Mar 18, 2024
002604d
pytables 3.9
jpn-- Mar 18, 2024
8819b8c
input checker message failbacks
jpn-- Mar 19, 2024
be5c024
fix veh type categoricals
jpn-- Mar 19, 2024
98bc2e4
restore original pandas read_csv NaNs
jpn-- Mar 19, 2024
5beffda
is_monotonic_increasing
jpn-- Mar 19, 2024
9b67fec
fix disagg acc sorting
jpn-- Mar 19, 2024
234a420
drop unused indexes
jpn-- Mar 20, 2024
58003ed
update pipeline ref
jpn-- Mar 20, 2024
012e92e
temporarily disable sharrow in vehicle alloc
jpn-- Mar 22, 2024
c6975a4
fix dtype problem
jpn-- Mar 23, 2024
2a899e5
ensure MAX index does not overflow
jpn-- Mar 25, 2024
a752ea4
sort on join to preserve index ordering from old pandas
jpn-- Mar 25, 2024
543b19a
local compute test simplifies debugging
jpn-- Mar 26, 2024
8ed8fb9
Merge branch 'main' into depend-pandas-2
jpn-- Mar 26, 2024
50c9f6d
more robust conversion to pyarrow
jpn-- Mar 28, 2024
a393dbd
Merge branch 'main' into depend-pandas-2
jpn-- Apr 1, 2024
c06d737
Merge branch 'main' into depend-pandas-2
jpn-- Apr 3, 2024
cd38e57
Merge branch 'main' into pandas-2
jpn-- Mar 3, 2025
4f89ef6
rewrite df.eval to fast_eval
jpn-- Mar 3, 2025
59872fc
change xarray pin
jpn-- Mar 3, 2025
5191684
fix zarr pin
jpn-- Mar 3, 2025
8091bd5
update numpy and dask pins
jpn-- Mar 3, 2025
cf9fb21
wrap raw fast_eval in pd.Series
jpn-- Mar 3, 2025
7becbca
don't skip sharrow in veh alloc
jpn-- Mar 5, 2025
8be8e0d
rebuild ref pipeline
jpn-- Mar 5, 2025
804e780
Merge commit 'c59dc4cdf66e3f53816b00ca28fdbc2ca4fd0c8a' into pandas-2
jpn-- Mar 18, 2025
b019c4b
make fast_eval more robust
jpn-- Mar 18, 2025
501e249
revise external targets
jpn-- Mar 18, 2025
a0b3c27
prefer public API
jpn-- Mar 19, 2025
560db6b
Merge branch 'main' into pandas-2
jpn-- Apr 24, 2025
61a97b1
Update activitysim-dev-base.yml
jpn-- Apr 24, 2025
4b4906e
add note about why fast_eval exists and how to undo it
jpn-- May 20, 2025
d780be6
Merge branch 'main' into pandas-2
jpn-- May 20, 2025
4 changes: 2 additions & 2 deletions .github/workflows/core_tests.yml
@@ -275,11 +275,11 @@ jobs:
- region: Standard 1-Zone Example (MTC)
region-org: ActivitySim
region-repo: activitysim-prototype-mtc
region-branch: extended
region-branch: pandas2
- region: Standard 2-Zone Example (SANDAG)
region-org: ActivitySim
region-repo: sandag-abm3-example
region-branch: main
region-branch: pandas2
fail-fast: false
defaults:
run:
9 changes: 6 additions & 3 deletions activitysim/abm/models/disaggregate_accessibility.py
@@ -158,7 +158,7 @@ class DisaggregateAccessibilitySettings(PydanticReadable, extra="forbid"):
"""
Disaggreate accessibility table is grouped by the "by" cols above and the KEEP_COLS are averaged
across the group. Initializing the below as NA if not in the auto ownership level, they are skipped
in the groupby mean and the values are correct.
(It's a way to avoid having to update code to reshape the table and introduce new functionality there.)
If none, will keep all of the columns with "accessibility" in the name.
"""
@@ -581,7 +581,7 @@ def expand_template_zones(self, tables):
_expanded = pd.DataFrame(util.named_product(**index_params)).set_index("index")

# Use result to join template onto expanded table of zones
ex_table = _expanded.join(master_template).reset_index()
ex_table = _expanded.join(master_template).sort_index().reset_index()

# Concatenate a new unique set of ids
cols = ["home_zone_id", "proto_household_id", "proto_person_id"]
@@ -654,7 +654,9 @@ def create_proto_pop(self):
.set_index("index")
.rename(columns={"hhid": hhid})
)
persons = rep.join(persons).sort_values(hhid).reset_index(drop=True)
persons = (
rep.join(persons, sort=True).sort_values(hhid).reset_index(drop=True)
)
persons[perid] = persons.index + 1

# Assign persons to tours
@@ -730,6 +732,7 @@ def merge_persons(self):

perid = self.params["proto_persons"]["index_col"]
persons_merged.set_index(perid, inplace=True, drop=True)
persons_merged = persons_merged.sort_index()
self.proto_pop["proto_persons_merged"] = persons_merged

# Store in pipeline
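
Note on the sorting changes above: the commit message "sort on join to preserve index ordering from old pandas" indicates older pandas returned these joins in index order, while pandas 2 keeps the caller's row order. A minimal sketch with illustrative data (not the model's real tables):

import pandas as pd

_expanded = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 0, 1])
master_template = pd.DataFrame({"y": ["a", "b", "c"]}, index=[0, 1, 2])

# join follows _expanded's row order (2, 0, 1); sort_index() restores the
# index-ordered result that downstream code expects
ex_table = _expanded.join(master_template).sort_index().reset_index()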
60 changes: 34 additions & 26 deletions activitysim/abm/models/input_checker.py
@@ -301,34 +301,42 @@ def report_errors(state, input_checker_settings, v_warnings, v_errors):

for warn in warns:
if "dataframe validator" in str(warn.message):
file_logger.warning(
"Failed dataframe validator: "
+ str(warn.message).split("\n")[-1]
)
elif "element-wise validator" in str(warn.message):
if "DataFrameSchema" in str(warn.message):
file_logger.warning(
"Failed element-wise validator: <"
+ str(warn.message).split("\n")[0].split(" ")[1]
+ table_name
+ ")>\n\t"
+ str(warn.message)
.split("failure cases:\n")[0]
.split("\n")[-2]
+ "\n\tfailure cases:\n\t"
+ "\n\t".join(
str(warn.message)
.split("failure cases:\n")[1]
.split("\n")
)
)
else:
try:
file_logger.warning(
"Failed element-wise validator: <"
+ " ".join(str(warn.message).split("\n")[0].split(" ")[1:3])
+ "\n\t"
+ "\n\t".join(str(warn.message).split("\n")[1:])
"Failed dataframe validator: "
+ str(warn.message).split("\n")[-1]
)
except Exception:
file_logger.warning(warn)
elif "element-wise validator" in str(warn.message):
try:
if "DataFrameSchema" in str(warn.message):
file_logger.warning(
"Failed element-wise validator: <"
+ str(warn.message).split("\n")[0].split(" ")[1]
+ table_name
+ ")>\n\t"
+ str(warn.message)
.split("failure cases:\n")[0]
.split("\n")[-2]
+ "\n\tfailure cases:\n\t"
+ "\n\t".join(
str(warn.message)
.split("failure cases:\n")[1]
.split("\n")
)
)
else:
file_logger.warning(
"Failed element-wise validator: <"
+ " ".join(
str(warn.message).split("\n")[0].split(" ")[1:3]
)
+ "\n\t"
+ "\n\t".join(str(warn.message).split("\n")[1:])
)
except Exception:
file_logger.warning(warn)
else:
file_logger.warning(warn)
file_logger.warning("\n")
2 changes: 1 addition & 1 deletion activitysim/abm/models/school_escorting.py
@@ -634,7 +634,7 @@ def school_escorting(
state.add_table("tours", tours)
state.get_rn_generator().drop_channel("tours")
state.get_rn_generator().add_channel("tours", tours)
state.add_table("escort_bundles", escort_bundles)
state.add_table("escort_bundles", escort_bundles.reset_index(drop=True))
# save school escorting tours and trips in pipeline so we can overwrite results from downstream models
state.add_table("school_escort_tours", school_escort_tours)
state.add_table("school_escort_trips", school_escort_trips)
2 changes: 1 addition & 1 deletion activitysim/abm/models/trip_departure_choice.py
@@ -404,7 +404,7 @@ def apply_stage_two_model(
trace_label: str,
compute_settings: ComputeSettings | None = None,
):
if not trips.index.is_monotonic:
if not trips.index.is_monotonic_increasing:
trips = trips.sort_index()

# Assign the duration of the appropriate leg to the trip
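
Context for the one-line change above: Index.is_monotonic was deprecated in pandas 1.5 and removed in pandas 2.0 in favor of the explicit is_monotonic_increasing. A quick sketch:

import pandas as pd

trips = pd.DataFrame({"leg": [1, 2]}, index=[10, 3])  # illustrative data
if not trips.index.is_monotonic_increasing:  # was .is_monotonic before pandas 2
    trips = trips.sort_index()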
4 changes: 2 additions & 2 deletions activitysim/abm/models/util/school_escort_tours_trips.py
@@ -353,7 +353,7 @@ def create_chauf_escort_trips(bundles):
"outbound",
"purpose",
]
).reset_index()
).reset_index(drop=True)

# numbering trips such that outbound escorting trips must come first and inbound trips must come last
outbound_trip_num = -1 * (
@@ -539,7 +539,7 @@ def create_escortee_trips(bundles):
# create a new trip for each escortee destination
escortee_trips = escortee_trips.explode(
["destination", "escort_participants", "school_escort_trip_num", "purpose"]
).reset_index()
).reset_index(drop=True)

# numbering trips such that outbound escorting trips must come first and inbound trips must come last
# this comes in handy when merging trips to others in the tour decided downstream
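
For context on the drop=True additions above: explode repeats the original index label for each exploded row, and a bare reset_index() would promote those duplicated labels into a column; drop=True discards them instead. A toy sketch (illustrative data):

import pandas as pd

escortee_trips = pd.DataFrame({"destination": [[10, 11], [12]]}, index=[5, 7])
# explode yields index labels 5, 5, 7; drop=True throws them away rather
# than promoting them to an 'index' column
escortee_trips = escortee_trips.explode("destination").reset_index(drop=True)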
1 change: 1 addition & 0 deletions activitysim/abm/models/vehicle_allocation.py
@@ -261,6 +261,7 @@ def vehicle_allocation(
]

# set choice for non-household vehicle option
choices["choice"] = choices["choice"].astype(veh_choice_dtype)
choices.loc[
choices["alt_choice"] == alts_from_spec[-1], "choice"
] = alts_from_spec[-1]
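
A plausible reading of the added astype call, assuming the choice column is categorical (the surrounding code is not shown in full): pandas refuses to set a value outside a Categorical's categories, so the column must be widened to a dtype that includes the non-household-vehicle alternative before the .loc assignment. A hedged sketch with made-up names:

import pandas as pd

choices = pd.DataFrame({"choice": pd.Categorical(["car1", "car2"])})
# choices.loc[0, "choice"] = "non_hh_veh"  # TypeError: new category
veh_choice_dtype = pd.CategoricalDtype(["car1", "car2", "non_hh_veh"])
choices["choice"] = choices["choice"].astype(veh_choice_dtype)
choices.loc[0, "choice"] = "non_hh_veh"  # now a known category, so it succeeds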
17 changes: 10 additions & 7 deletions activitysim/cli/create.py
@@ -2,6 +2,7 @@

import glob
import hashlib
import importlib.resources
import logging
import os
import shutil
@@ -21,14 +22,15 @@

def _example_path(resource):
resource = os.path.join(EXAMPLES_DIR, resource)
path = pkg_resources.resource_filename(PACKAGE, resource)

return path
return importlib.resources.as_file(
importlib.resources.files(PACKAGE).joinpath(resource)
)


def _load_manifest():
with open(_example_path(MANIFEST), "r") as f:
manifest = yaml.safe_load(f.read())
with _example_path(MANIFEST) as f_pth:
with open(f_pth, "r") as f:
manifest = yaml.safe_load(f.read())

assert manifest, f"error: could not load {MANIFEST}"
return {example["name"]: example for example in manifest}
@@ -177,8 +179,9 @@ def get_example(
)

else:
for asset_path in glob.glob(_example_path(assets)):
copy_asset(asset_path, target_path, dirs_exist_ok=True)
with _example_path(assets) as pth:
for asset_path in glob.glob(str(pth)):
copy_asset(asset_path, target_path, dirs_exist_ok=True)

print(f"copied! new project files are in {os.path.abspath(dest_path)}")

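
The pattern above replaces the deprecated pkg_resources API with the stdlib importlib.resources. The key difference at the call sites: as_file() returns a context manager that materializes the resource to a real filesystem path (extracting from a zip if necessary), so callers must use a with-block instead of holding a bare path. A minimal sketch:

import importlib.resources

def resource_path(package: str, resource: str):
    # files() returns a Traversable; as_file() yields a concrete path
    # valid for the duration of the with-block
    return importlib.resources.as_file(
        importlib.resources.files(package).joinpath(resource)
    )

# usage (package/resource names hypothetical):
# with resource_path("activitysim", "examples/manifest.yaml") as pth:
#     text = open(pth).read()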
31 changes: 30 additions & 1 deletion activitysim/core/assign.py
@@ -96,7 +96,36 @@ def read_assignment_spec(
"""

try:
cfg = pd.read_csv(file_name, comment="#")
# we use an explicit list of na_values, these are the values that
# Pandas version 1.5 recognized as NaN by default. Notably absent is
# 'None' which is used in some spec files to be the object `None` not
# the float value NaN.
cfg = pd.read_csv(
file_name,
comment="#",
na_values=[
"",
"#N/A",
"#N/A N/A",
"#NA",
"-1.#IND",
"-1.#QNAN",
"-NaN",
"-nan",
"1.#IND",
"1.#QNAN",
"<NA>",
"N/A",
"NA",
"NULL",
"NaN",
"n/a",
"nan",
"null",
],
keep_default_na=False,
)

except Exception as e:
logger.error(f"Error reading spec file: {file_name}")
logger.error(str(e))
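
The mechanics of the na_values change above, sketched with a two-row file: an explicit na_values list together with keep_default_na=False means only the listed strings become NaN, so a literal 'None' in a spec cell survives as a string (and can later evaluate to the Python object None):

import io
import pandas as pd

csv = io.StringIO("target,expression\na,None\nb,NA\n")  # illustrative spec
cfg = pd.read_csv(csv, comment="#", na_values=["NA"], keep_default_na=False)
cfg["expression"].tolist()  # ['None', nan]; 'None' is kept as text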
106 changes: 106 additions & 0 deletions activitysim/core/fast_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

import pandas as pd
from pandas import eval as _eval

if TYPE_CHECKING:
from collections.abc import Hashable, Iterator, Mapping, Sequence

from pandas._typing import ArrayLike


def _get_cleaned_column_resolvers(
df: pd.DataFrame, raw: bool = True
) -> dict[Hashable, ArrayLike | pd.Series]:
"""
Return the special character free column resolvers of a dataframe.

Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas import Series
from pandas.core.computation.parsing import clean_column_name

if isinstance(df, pd.Series):
return {clean_column_name(df.name): df}

# CHANGED FROM PANDAS: do not even convert the arrays to pd.Series, just
# give the raw arrays to the compute engine. This is potentially a breaking
# change if any of the operations in the eval string require a pd.Series.
if raw:
# Performance tradeoff: in the dict below, we iterate over `df.items`,
# which yields tuples of (column_name, data as pd.Series). This is marginally
# slower than iterating over `df.columns` and `df._iter_column_arrays()`,
# but the latter is not in Pandas' public API, and may be removed in the future.
return {
clean_column_name(k): v for k, v in df.items() if not isinstance(k, int)
}

# CHANGED FROM PANDAS: do not call df.dtype inside the dict comprehension loop
# This update has been made in https://github.com/pandas-dev/pandas/pull/59573,
# but appears not to have been released yet as of pandas 2.2.3
dtypes = df.dtypes

return {
clean_column_name(k): Series(
v, copy=False, index=df.index, name=k, dtype=dtypes[k]
).__finalize__(df)
for k, v in zip(df.columns, df._iter_column_arrays())
if not isinstance(k, int)
}


def fast_eval(df: pd.DataFrame, expr: str, **kwargs) -> Any | None:
"""
Evaluate a string describing operations on DataFrame columns.

Operates on columns only, not specific rows or elements. This allows
`eval` to run arbitrary code, which can make you vulnerable to code
injection if you pass user input to this function.

This function is a wrapper that replaces :meth:`~pandas.DataFrame.eval`
with a more efficient version than in the default pandas library (as
of pandas 2.2.3). It is recommended to use this function instead of
:meth:`~pandas.DataFrame.eval` for better performance. However, if you
encounter issues with this function, you can switch back to the default
pandas eval by changing the function call from `fast_eval(df, ...)` to
`df.eval(...)`.

Parameters
----------
expr : str
The expression string to evaluate.
**kwargs
See the documentation for :meth:`~pandas.DataFrame.eval` for complete
details on the keyword arguments accepted.

Returns
-------
ndarray, scalar, or pandas object
The result of the evaluation.
"""

inplace = False
kwargs["level"] = kwargs.pop("level", 0) + 1
index_resolvers = df._get_index_resolvers()
column_resolvers = _get_cleaned_column_resolvers(df)
resolvers = column_resolvers, index_resolvers
if "target" not in kwargs:
kwargs["target"] = df
kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers

try:
return pd.Series(
_eval(expr, inplace=inplace, **kwargs), index=df.index, name=expr
).__finalize__(df)
except Exception as e:
# Initially assume that the exception is caused by the potentially
# breaking change in _get_cleaned_column_resolvers, and try again
# TODO: what kind of exception should be caught here so it is less broad
column_resolvers = _get_cleaned_column_resolvers(df, raw=False)
resolvers = column_resolvers, index_resolvers
kwargs["resolvers"] = kwargs["resolvers"][:-2] + resolvers
return _eval(expr, inplace=inplace, **kwargs)
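
A minimal usage sketch of the wrapper defined above; calls are one-for-one with DataFrame.eval, which the docstring names as the fallback if fast_eval misbehaves:

import pandas as pd
from activitysim.core.fast_eval import fast_eval

df = pd.DataFrame({"income": [30000, 80000], "hhsize": [1, 3]})
v1 = fast_eval(df, "income / hhsize")  # Series aligned to df.index
v2 = df.eval("income / hhsize")        # stock-pandas escape hatch
assert (v1 == v2).all()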
5 changes: 3 additions & 2 deletions activitysim/core/interaction_simulate.py
@@ -14,6 +14,7 @@

from activitysim.core import chunk, logit, simulate, tracing, util, workflow
from activitysim.core.configuration.base import ComputeSettings
from activitysim.core.fast_eval import fast_eval

logger = logging.getLogger(__name__)

@@ -287,7 +288,7 @@ def to_series(x):
if expr.startswith("@"):
v = to_series(eval(expr[1:], globals(), locals_d))
else:
v = df.eval(expr, resolvers=[locals_d])
v = fast_eval(df, expr, resolvers=[locals_d])

if check_for_variability and v.std() == 0:
logger.info(
@@ -556,7 +557,7 @@ def to_series(x):
if expr.startswith("@"):
v = to_series(eval(expr[1:], globals(), locals_d))
else:
v = df.eval(expr, resolvers=[locals_d])
v = fast_eval(df, expr, resolvers=[locals_d])
if check_for_variability and v.std() == 0:
logger.info(
"%s: no variability (%s) in: %s"
10 changes: 9 additions & 1 deletion activitysim/core/los.py
@@ -780,7 +780,15 @@ def get_mazpairs(self, omaz, dmaz, attribute):
self.maz_ceiling
) + np.asanyarray(dmaz, dtype=np.int64)
else:
i = np.asanyarray(omaz) * self.maz_ceiling + np.asanyarray(dmaz)
# if we have less than a 32-bit index, it will
# overflow so we need to upgrade to at least 32 bit
omaz_as_array = np.asanyarray(omaz)
if omaz_as_array.dtype not in (np.int32, np.int64):
omaz_as_array = omaz_as_array.astype(np.int32)
dmaz_as_array = np.asanyarray(dmaz)
if dmaz_as_array.dtype not in (np.int32, np.int64):
dmaz_as_array = dmaz_as_array.astype(np.int32)
i = omaz_as_array * self.maz_ceiling + dmaz_as_array
s = util.quick_loc_df(i, self.maz_to_maz_df, attribute)

# FIXME - no point in returning series?
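
Why the dtype guard above matters: NumPy integer multiplication wraps silently on overflow, so MAZ ids stored in a narrow dtype can produce garbage pair indexes. A hedged sketch (dtype and values illustrative):

import numpy as np

omaz = np.array([1200], dtype=np.int16)
maz_ceiling = 30000
omaz * maz_ceiling                     # int16 math overflows and wraps, silently
omaz.astype(np.int32) * maz_ceiling    # 36_000_000, the intended pair index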