25 changes: 25 additions & 0 deletions skore/src/skore/_sklearn/_comparison/metrics_accessor.py
@@ -187,6 +187,8 @@ def _compute_metric_scores(
aggregate: Aggregate | None = ("mean", "std"),
**metric_kwargs: Any,
):
assert report_metric_name == "summarize"

# build the cache key components to finally create a tuple that will be used
# to check if the metric has already been computed
cache_key_parts: list[Any] = [
@@ -232,11 +234,34 @@ def _compute_metric_scores(
data_source=data_source,
X=X,
y=y,
# To avoid crashing if one of the sub-reports does not support a metric
on_unavailable_metric="nan",
**metric_kwargs,
)
if self._parent._reports_type == "CrossValidationReport":
kwargs["aggregate"] = None

# Check whether metrics are available in sub-reports
# FIXME: These checks are already done by the sub-reports

if isinstance(kwargs["scoring"], str):
scorings = [kwargs["scoring"]]
elif kwargs["scoring"] is None or callable(kwargs["scoring"]):
scorings = [] # No processing needed
else:
scorings = kwargs["scoring"]

for scoring in scorings:
if isinstance(scoring, str):
if any(
hasattr(report.metrics, scoring)
for report in self._parent.reports_
):
continue
raise ValueError(
f"None of the compared reports support metric '{scoring}'"
)

generator = parallel(
joblib.delayed(getattr(report.metrics, report_metric_name))(**kwargs)
for report in self._parent.reports_
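The hunk above normalizes `scoring` into a list and raises early when a string metric is supported by none of the compared sub-reports. A minimal, self-contained sketch of that check, with `reports` standing in for `self._parent.reports_` (the names here are illustrative, not skore's API):

from typing import Any


def check_scoring_available(scoring: Any, reports: list[Any]) -> None:
    """Raise if a string metric is supported by none of the given reports."""
    # Normalize `scoring` into a list of entries to inspect.
    if isinstance(scoring, str):
        scorings = [scoring]
    elif scoring is None or callable(scoring):
        scorings = []  # nothing to check for None or a callable scorer
    else:
        scorings = list(scoring)

    for entry in scorings:
        # Only string entries map to named metric methods on the sub-reports;
        # callables inside a list are left for the sub-reports to validate.
        if isinstance(entry, str) and not any(
            hasattr(report.metrics, entry) for report in reports
        ):
            raise ValueError(
                f"None of the compared reports support metric '{entry}'"
            )
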
4 changes: 4 additions & 0 deletions skore/src/skore/_sklearn/_comparison/utils.py
@@ -328,7 +328,11 @@ def sort_by_split(df: pd.DataFrame) -> pd.DataFrame:
values="Value",
aggfunc=aggregate,
sort=False,
dropna=False,
)

# Remove rows that are entirely NaN
df = df.dropna(axis="index", how="all")
else:
df = sort_by_split(df)
df = df.set_index(list(df.columns.drop("Value")))
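A small illustration of the pivot pattern changed above, using toy data rather than skore's internal frame: `dropna=False` keeps all-NaN rows and columns instead of silently dropping them, and the explicit `dropna(axis="index", how="all")` afterwards removes only the rows where every column is NaN.

import numpy as np
import pandas as pd

# Toy long-format scores: LinearSVC has no predict_proba, so its Brier score
# is NaN (as produced by on_unavailable_metric="nan").
df = pd.DataFrame(
    {
        "Metric": ["Accuracy", "Accuracy", "Brier score", "Brier score"],
        "Estimator": ["DummyClassifier", "LinearSVC", "DummyClassifier", "LinearSVC"],
        "Value": [0.5, 0.9, 0.25, np.nan],
    }
)

table = df.pivot_table(
    index="Metric",
    columns="Estimator",
    values="Value",
    aggfunc="mean",
    sort=False,
    dropna=False,  # keep all-NaN rows/columns instead of dropping them
)

# Only rows where every estimator is NaN are removed; the partially supported
# "Brier score" row is kept with a NaN cell for LinearSVC.
table = table.dropna(axis="index", how="all")
print(table)
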
14 changes: 13 additions & 1 deletion skore/src/skore/_sklearn/_cross_validation/metrics_accessor.py
@@ -62,6 +62,7 @@ def summarize(
indicator_favorability: bool = False,
flat_index: bool = False,
aggregate: Aggregate | None = ("mean", "std"),
on_unavailable_metric: Literal["raise", "nan"] = "raise",
) -> MetricsSummaryDisplay:
"""Report a set of metrics for our estimator.

@@ -125,6 +126,11 @@ class is set to the one provided when creating the report. If `None`,
Function to aggregate the scores across the cross-validation splits.
None will return the scores for each split.

on_unavailable_metric : {"raise", "nan"}, default="raise"
Whether to raise or return `numpy.nan` when the metric cannot be computed.
For example, "brier_score" cannot be computed for estimators without a
`predict_proba` method.

Returns
-------
MetricsSummaryDisplay
@@ -163,6 +169,7 @@ class is set to the one provided when creating the report. If `None`,
scoring_kwargs=scoring_kwargs,
scoring_names=scoring_names,
indicator_favorability=indicator_favorability,
on_unavailable_metric=on_unavailable_metric,
)
if flat_index:
if isinstance(results.columns, pd.MultiIndex):
@@ -184,6 +191,7 @@ def _compute_metric_scores(
X: ArrayLike | None = None,
y: ArrayLike | None = None,
aggregate: Aggregate | None = None,
on_unavailable_metric: Literal["raise", "nan"] = "raise",
**metric_kwargs: Any,
) -> pd.DataFrame:
if data_source == "X_y":
@@ -231,7 +239,11 @@
)
generator = parallel(
delayed(getattr(report.metrics, report_metric_name))(
data_source=data_source, X=X, y=y, **metric_kwargs
data_source=data_source,
X=X,
y=y,
on_unavailable_metric=on_unavailable_metric,
**metric_kwargs,
)
for report in self._parent.estimator_reports_
)
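A hedged usage sketch of the new `on_unavailable_metric` parameter at the cross-validation level. The dataset and estimator choices are illustrative, and `scoring` as a list of metric names is assumed from the comparison code above; the intent is that "brier_score" for LinearSVC, which has no `predict_proba`, comes back as NaN instead of raising:

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

from skore import CrossValidationReport

X, y = make_classification(n_samples=200, random_state=0)
report = CrossValidationReport(LinearSVC(), X, y)

# With the default "raise", an unsupported metric raises; with "nan" it is
# reported as NaN for every split.
summary = report.metrics.summarize(
    scoring=["accuracy", "brier_score"],
    on_unavailable_metric="nan",
)
print(summary.frame())
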
25 changes: 24 additions & 1 deletion skore/src/skore/_sklearn/_estimator/metrics_accessor.py
@@ -44,6 +44,14 @@
DataSource = Literal["test", "train", "X_y"]


def _nan(*args, **kwargs):
"""Return NaN.

Used when a metric is unavailable and `on_unavailable_metric` is set to "nan".
"""
return np.nan


class _MetricsAccessor(
_BaseMetricsAccessor, _BaseAccessor["EstimatorReport"], DirNamesMixin
):
@@ -72,6 +80,7 @@ def summarize(
pos_label: PositiveLabel | None = _DEFAULT,
indicator_favorability: bool = False,
flat_index: bool = False,
on_unavailable_metric: Literal["raise", "nan"] = "raise",
) -> MetricsSummaryDisplay:
"""Report a set of metrics for our estimator.

@@ -130,6 +139,11 @@ class is set to the one provided when creating the report. If `None`,
Whether to flatten the multi-index columns. Flat index will always be lower
case, will not include spaces and will remove the hash symbol to ease indexing.

on_unavailable_metric : {"raise", "nan"}, default="raise"
Whether to raise or return `numpy.nan` when the metric cannot be computed.
For example, "brier_score" cannot be computed for estimators without a
`predict_proba` method.

Returns
-------
MetricsSummaryDisplay
@@ -288,7 +302,16 @@ class is set to the one provided when creating the report. If `None`,

# Handle built-in metrics (without underscore prefix)
elif metric in self._score_or_loss_info:
metric_fn = getattr(self, f"_{metric}")
try:
metric_fn = getattr(self, f"_{metric}")
except AttributeError as e:
if on_unavailable_metric == "nan":
metric_fn = _nan # type: ignore
else:
raise AttributeError(
f"This estimator does not support metric '{metric}'"
) from e

metrics_kwargs = {"data_source_hash": data_source_hash}
if metric_name is None:
metric_name = f"{self._score_or_loss_info[metric]['name']}"
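The change above swaps in a NaN-returning callable when the accessor does not expose the requested metric. A minimal, self-contained sketch of that fallback pattern, with generic names rather than skore's internals:

import numpy as np


def _nan(*args, **kwargs):
    """Stand-in metric that always returns NaN."""
    return np.nan


def resolve_metric(accessor, metric, on_unavailable_metric="raise"):
    """Return the private metric method, or a NaN fallback when allowed."""
    try:
        return getattr(accessor, f"_{metric}")
    except AttributeError as exc:
        if on_unavailable_metric == "nan":
            return _nan
        raise AttributeError(
            f"This estimator does not support metric '{metric}'"
        ) from exc


# Example: an accessor-like object without a `_brier_score` method.
class _Accessor:
    def _accuracy(self):
        return 0.9


acc = _Accessor()
print(resolve_metric(acc, "accuracy")())            # 0.9
print(resolve_metric(acc, "brier_score", "nan")())  # nan
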
157 changes: 157 additions & 0 deletions skore/tests/unit/reports/comparison/test_metrics.py
@@ -2,8 +2,10 @@
Common test for the metrics accessor of a ComparisonReport.
"""

import numpy as np
import pytest
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from skore import ComparisonReport, CrossValidationReport, EstimatorReport
@@ -46,3 +48,158 @@ def test_favorability_undefined_metrics(report):
expected_values = {"(↗︎)", "(↘︎)"}
actual_values = set(metrics_df["Favorability"].to_numpy())
assert actual_values.issubset(expected_values)


class TestUnsupportedMetric:
"""Test the behaviour of ComparisonReport metrics when some or none of the compared
reports support the requested metric.

Originates from <https://github.com/probabl-ai/skore/issues/1473>
"""

@pytest.fixture
def estimator_reports_no_brier_score(self, binary_classification_train_test_split):
"""Reports where no estimator supports Brier score."""
X_train, X_test, y_train, y_test = binary_classification_train_test_split
report_1 = EstimatorReport(
LinearSVC(),
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
)
report_2 = EstimatorReport(
LinearSVC(),
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
)
return report_1, report_2

@pytest.fixture
def estimator_reports_some_brier_score(
self, binary_classification_train_test_split
):
"""Reports where some estimator supports Brier score."""
X_train, X_test, y_train, y_test = binary_classification_train_test_split
report_1 = EstimatorReport(
DummyClassifier(strategy="uniform", random_state=0),
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
)
report_2 = EstimatorReport(
LinearSVC(),
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
)
return report_1, report_2

@pytest.fixture
def cv_reports_no_brier_score(self, binary_classification_data):
"""Reports where no estimator supports Brier score."""
X, y = binary_classification_data
report_1 = CrossValidationReport(LinearSVC(), X, y)
report_2 = CrossValidationReport(LinearSVC(), X, y)
return report_1, report_2

@pytest.fixture
def cv_reports_some_brier_score(self, binary_classification_data):
"""Reports where some estimator supports Brier score."""
X, y = binary_classification_data
report_1 = CrossValidationReport(
DummyClassifier(strategy="uniform", random_state=0), X, y
)
report_2 = CrossValidationReport(LinearSVC(), X, y)
return report_1, report_2

@pytest.mark.parametrize(
"compared_report", [EstimatorReport, CrossValidationReport]
)
def test_no_report_supports_metric(
self,
compared_report,
estimator_reports_no_brier_score,
cv_reports_no_brier_score,
):
"""If you call Brier score and none of the sub-reports support it,
you should get an AttributeError."""
if compared_report is EstimatorReport:
report_1, report_2 = estimator_reports_no_brier_score
else:
report_1, report_2 = cv_reports_no_brier_score
report = ComparisonReport([report_1, report_2])

with pytest.raises(AttributeError):
report.metrics.brier_score()

@pytest.mark.parametrize(
"compared_report", [EstimatorReport, CrossValidationReport]
)
def test_summarize_no_report_supports_metric(
self,
compared_report,
estimator_reports_no_brier_score,
cv_reports_no_brier_score,
):
"""If you call `summarize` with Brier score and none of the sub-reports support
it, you should get an AttributeError"""
if compared_report is EstimatorReport:
report_1, report_2 = estimator_reports_no_brier_score
else:
report_1, report_2 = cv_reports_no_brier_score
report = ComparisonReport([report_1, report_2])

with pytest.raises(ValueError):
report.metrics.summarize(scoring="brier_score")

@pytest.mark.parametrize(
"compared_report", [EstimatorReport, CrossValidationReport]
)
def test_some_reports_support_metric(
self,
compared_report,
estimator_reports_some_brier_score,
cv_reports_some_brier_score,
):
"""If you call `brier_score` and some of the sub-reports support it,
you should get a dataframe with NaN"""
if compared_report is EstimatorReport:
report_1, report_2 = estimator_reports_some_brier_score
else:
report_1, report_2 = cv_reports_some_brier_score
report = ComparisonReport([report_1, report_2])
summary = report.metrics.brier_score()

if compared_report is EstimatorReport:
assert np.isnan(summary.loc["Brier score"]["LinearSVC"])
else:
assert np.isnan(summary.loc["Brier score"][("mean", "LinearSVC")])

@pytest.mark.parametrize(
"compared_report", [EstimatorReport, CrossValidationReport]
)
def test_summarize_some_reports_support_metric(
self,
compared_report,
estimator_reports_some_brier_score,
cv_reports_some_brier_score,
):
"""If you call `summarize` with Brier score and some of the sub-reports
support it, you should get a dataframe with NaN"""
if compared_report is EstimatorReport:
report_1, report_2 = estimator_reports_some_brier_score
else:
report_1, report_2 = cv_reports_some_brier_score

report = ComparisonReport([report_1, report_2])
summary = report.metrics.summarize(scoring="brier_score")

if compared_report is EstimatorReport:
assert np.isnan(summary.frame().loc["Brier score"]["LinearSVC"])
else:
assert np.isnan(summary.frame().loc["Brier score"][("mean", "LinearSVC")])
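For reference, the end-user effect exercised by these tests, as a hedged sketch with the fixture data replaced by a toy split: comparing an estimator with `predict_proba` against one without should yield a Brier score row containing a NaN cell rather than an error.

from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from skore import ComparisonReport, EstimatorReport

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

report_1 = EstimatorReport(
    DummyClassifier(strategy="uniform", random_state=0),
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)
report_2 = EstimatorReport(
    LinearSVC(),
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)

comparison = ComparisonReport([report_1, report_2])

# LinearSVC has no predict_proba, so its Brier score comes back as NaN
# instead of the whole call raising.
print(comparison.metrics.brier_score())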