diff --git a/skore/src/skore/_sklearn/_comparison/metrics_accessor.py b/skore/src/skore/_sklearn/_comparison/metrics_accessor.py
index 94a4c8f731..d578c381a4 100644
--- a/skore/src/skore/_sklearn/_comparison/metrics_accessor.py
+++ b/skore/src/skore/_sklearn/_comparison/metrics_accessor.py
@@ -187,6 +187,8 @@ def _compute_metric_scores(
         aggregate: Aggregate | None = ("mean", "std"),
         **metric_kwargs: Any,
     ):
+        assert report_metric_name == "summarize"
+
         # build the cache key components to finally create a tuple that will be used
         # to check if the metric has already been computed
         cache_key_parts: list[Any] = [
@@ -232,11 +234,34 @@
             data_source=data_source,
             X=X,
             y=y,
+            # To avoid crashing if one of the sub-reports does not support a metric
+            on_unavailable_metric="nan",
             **metric_kwargs,
         )
         if self._parent._reports_type == "CrossValidationReport":
             kwargs["aggregate"] = None
 
+        # Check whether metrics are available in sub-reports
+        # FIXME: These checks are already done by the sub-reports
+
+        if isinstance(kwargs["scoring"], str):
+            scorings = [kwargs["scoring"]]
+        elif kwargs["scoring"] is None or callable(kwargs["scoring"]):
+            scorings = []  # No processing needed
+        else:
+            scorings = kwargs["scoring"]
+
+        for scoring in scorings:
+            if isinstance(scoring, str):
+                if any(
+                    hasattr(report.metrics, scoring)
+                    for report in self._parent.reports_
+                ):
+                    continue
+                raise ValueError(
+                    f"None of the compared reports support metric '{scoring}'"
+                )
+
         generator = parallel(
             joblib.delayed(getattr(report.metrics, report_metric_name))(**kwargs)
             for report in self._parent.reports_
diff --git a/skore/src/skore/_sklearn/_comparison/utils.py b/skore/src/skore/_sklearn/_comparison/utils.py
index bc6305e44f..cf998c700d 100644
--- a/skore/src/skore/_sklearn/_comparison/utils.py
+++ b/skore/src/skore/_sklearn/_comparison/utils.py
@@ -328,7 +328,11 @@ def sort_by_split(df: pd.DataFrame) -> pd.DataFrame:
             values="Value",
             aggfunc=aggregate,
             sort=False,
+            dropna=False,
         )
+
+        # Remove NaN rows
+        df = df.dropna(axis="index", how="all")
     else:
         df = sort_by_split(df)
         df = df.set_index(list(df.columns.drop("Value")))
diff --git a/skore/src/skore/_sklearn/_cross_validation/metrics_accessor.py b/skore/src/skore/_sklearn/_cross_validation/metrics_accessor.py
index 34f1c7d30e..923cbd8c5a 100644
--- a/skore/src/skore/_sklearn/_cross_validation/metrics_accessor.py
+++ b/skore/src/skore/_sklearn/_cross_validation/metrics_accessor.py
@@ -62,6 +62,7 @@ def summarize(
         indicator_favorability: bool = False,
         flat_index: bool = False,
         aggregate: Aggregate | None = ("mean", "std"),
+        on_unavailable_metric: Literal["raise", "nan"] = "raise",
     ) -> MetricsSummaryDisplay:
         """Report a set of metrics for our estimator.
 
@@ -125,6 +126,11 @@ class is set to the one provided when creating the report. If `None`,
             Function to aggregate the scores across the cross-validation splits.
             None will return the scores for each split.
 
+        on_unavailable_metric : {"raise", "nan"}, default="raise"
+            Whether to raise or return `numpy.nan` when the metric cannot be computed.
+            For example, "brier_score" cannot be computed for estimators without a
+            `predict_proba` method.
+
         Returns
         -------
         MetricsSummaryDisplay
@@ -163,6 +169,7 @@ class is set to the one provided when creating the report. If `None`,
             scoring_kwargs=scoring_kwargs,
             scoring_names=scoring_names,
             indicator_favorability=indicator_favorability,
+            on_unavailable_metric=on_unavailable_metric,
         )
         if flat_index:
             if isinstance(results.columns, pd.MultiIndex):
@@ -184,6 +191,7 @@ def _compute_metric_scores(
         X: ArrayLike | None = None,
         y: ArrayLike | None = None,
         aggregate: Aggregate | None = None,
+        on_unavailable_metric: Literal["raise", "nan"] = "raise",
         **metric_kwargs: Any,
     ) -> pd.DataFrame:
         if data_source == "X_y":
@@ -231,7 +239,11 @@ def _compute_metric_scores(
         )
         generator = parallel(
             delayed(getattr(report.metrics, report_metric_name))(
-                data_source=data_source, X=X, y=y, **metric_kwargs
+                data_source=data_source,
+                X=X,
+                y=y,
+                on_unavailable_metric=on_unavailable_metric,
+                **metric_kwargs,
             )
             for report in self._parent.estimator_reports_
         )
diff --git a/skore/src/skore/_sklearn/_estimator/metrics_accessor.py b/skore/src/skore/_sklearn/_estimator/metrics_accessor.py
index 0b61753096..baa49d2bab 100644
--- a/skore/src/skore/_sklearn/_estimator/metrics_accessor.py
+++ b/skore/src/skore/_sklearn/_estimator/metrics_accessor.py
@@ -44,6 +44,14 @@
 DataSource = Literal["test", "train", "X_y"]
 
 
+def _nan(*args, **kwargs):
+    """Return NaN.
+
+    Used when the metric is not available and on_unavailable_metric is set to "nan".
+    """
+    return np.nan
+
+
 class _MetricsAccessor(
     _BaseMetricsAccessor, _BaseAccessor["EstimatorReport"], DirNamesMixin
 ):
@@ -72,6 +80,7 @@ def summarize(
         pos_label: PositiveLabel | None = _DEFAULT,
         indicator_favorability: bool = False,
         flat_index: bool = False,
+        on_unavailable_metric: Literal["raise", "nan"] = "raise",
     ) -> MetricsSummaryDisplay:
         """Report a set of metrics for our estimator.
 
@@ -130,6 +139,11 @@ class is set to the one provided when creating the report. If `None`,
             Whether to flatten the multi-index columns. Flat index will always be lower
             case, do not include spaces and remove the hash symbol to ease indexing.
 
+        on_unavailable_metric : {"raise", "nan"}, default="raise"
+            Whether to raise or return `numpy.nan` when the metric cannot be computed.
+            For example, "brier_score" cannot be computed for estimators without a
+            `predict_proba` method.
+
         Returns
         -------
         MetricsSummaryDisplay
@@ -288,7 +302,16 @@ class is set to the one provided when creating the report. If `None`,
 
             # Handle built-in metrics (without underscore prefix)
             elif metric in self._score_or_loss_info:
-                metric_fn = getattr(self, f"_{metric}")
+                try:
+                    metric_fn = getattr(self, f"_{metric}")
+                except AttributeError as e:
+                    if on_unavailable_metric == "nan":
+                        metric_fn = _nan  # type: ignore
+                    else:
+                        raise AttributeError(
+                            f"This estimator does not support metric '{metric}'"
+                        ) from e
+
                 metrics_kwargs = {"data_source_hash": data_source_hash}
                 if metric_name is None:
                     metric_name = f"{self._score_or_loss_info[metric]['name']}"
diff --git a/skore/tests/unit/reports/comparison/test_metrics.py b/skore/tests/unit/reports/comparison/test_metrics.py
index 5e843a85f1..54416b66e0 100644
--- a/skore/tests/unit/reports/comparison/test_metrics.py
+++ b/skore/tests/unit/reports/comparison/test_metrics.py
@@ -2,8 +2,10 @@
 Common test for the metrics accessor of a ComparisonReport.
 """
 
+import numpy as np
 import pytest
 from sklearn.datasets import make_classification
+from sklearn.dummy import DummyClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 from skore import ComparisonReport, CrossValidationReport, EstimatorReport
@@ -46,3 +48,158 @@ def test_favorability_undefined_metrics(report):
     expected_values = {"(↗︎)", "(↘︎)"}
     actual_values = set(metrics_df["Favorability"].to_numpy())
     assert actual_values.issubset(expected_values)
+
+
+class TestUnsupportedMetric:
+    """Test the behaviour of ComparisonReport metrics when some or none of the compared
+    reports support the requested metric.
+
+    Originates from
+    """
+
+    @pytest.fixture
+    def estimator_reports_no_brier_score(self, binary_classification_train_test_split):
+        """Reports where no estimator supports Brier score."""
+        X_train, X_test, y_train, y_test = binary_classification_train_test_split
+        report_1 = EstimatorReport(
+            LinearSVC(),
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            y_test=y_test,
+        )
+        report_2 = EstimatorReport(
+            LinearSVC(),
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            y_test=y_test,
+        )
+        return report_1, report_2
+
+    @pytest.fixture
+    def estimator_reports_some_brier_score(
+        self, binary_classification_train_test_split
+    ):
+        """Reports where some estimator supports Brier score."""
+        X_train, X_test, y_train, y_test = binary_classification_train_test_split
+        report_1 = EstimatorReport(
+            DummyClassifier(strategy="uniform", random_state=0),
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            y_test=y_test,
+        )
+        report_2 = EstimatorReport(
+            LinearSVC(),
+            X_train=X_train,
+            y_train=y_train,
+            X_test=X_test,
+            y_test=y_test,
+        )
+        return report_1, report_2
+
+    @pytest.fixture
+    def cv_reports_no_brier_score(self, binary_classification_data):
+        """Reports where no estimator supports Brier score."""
+        X, y = binary_classification_data
+        report_1 = CrossValidationReport(LinearSVC(), X, y)
+        report_2 = CrossValidationReport(LinearSVC(), X, y)
+        return report_1, report_2
+
+    @pytest.fixture
+    def cv_reports_some_brier_score(self, binary_classification_data):
+        """Reports where some estimator supports Brier score."""
+        X, y = binary_classification_data
+        report_1 = CrossValidationReport(
+            DummyClassifier(strategy="uniform", random_state=0), X, y
+        )
+        report_2 = CrossValidationReport(LinearSVC(), X, y)
+        return report_1, report_2
+
+    @pytest.mark.parametrize(
+        "compared_report", [EstimatorReport, CrossValidationReport]
+    )
+    def test_no_report_supports_metric(
+        self,
+        compared_report,
+        estimator_reports_no_brier_score,
+        cv_reports_no_brier_score,
+    ):
+        """If you call `brier_score` and none of the sub-reports support it,
+        you should get an AttributeError."""
+        if compared_report is EstimatorReport:
+            report_1, report_2 = estimator_reports_no_brier_score
+        else:
+            report_1, report_2 = cv_reports_no_brier_score
+        report = ComparisonReport([report_1, report_2])
+
+        with pytest.raises(AttributeError):
+            report.metrics.brier_score()
+
+    @pytest.mark.parametrize(
+        "compared_report", [EstimatorReport, CrossValidationReport]
+    )
+    def test_summarize_no_report_supports_metric(
+        self,
+        compared_report,
+        estimator_reports_no_brier_score,
+        cv_reports_no_brier_score,
+    ):
+        """If you call `summarize` with Brier score and none of the sub-reports support
+        it, you should get a ValueError."""
+        if compared_report is EstimatorReport:
+            report_1, report_2 = estimator_reports_no_brier_score
+        else:
+            report_1, report_2 = cv_reports_no_brier_score
+        report = ComparisonReport([report_1, report_2])
+
+        with pytest.raises(ValueError):
+            report.metrics.summarize(scoring="brier_score")
+
+    @pytest.mark.parametrize(
+        "compared_report", [EstimatorReport, CrossValidationReport]
+    )
+    def test_some_reports_support_metric(
+        self,
+        compared_report,
+        estimator_reports_some_brier_score,
+        cv_reports_some_brier_score,
+    ):
+        """If you call `brier_score` and some of the sub-reports support it,
+        you should get a dataframe with NaN."""
+        if compared_report is EstimatorReport:
+            report_1, report_2 = estimator_reports_some_brier_score
+        else:
+            report_1, report_2 = cv_reports_some_brier_score
+        report = ComparisonReport([report_1, report_2])
+        summary = report.metrics.brier_score()
+
+        if compared_report is EstimatorReport:
+            assert np.isnan(summary.loc["Brier score"]["LinearSVC"])
+        else:
+            assert np.isnan(summary.loc["Brier score"][("mean", "LinearSVC")])
+
+    @pytest.mark.parametrize(
+        "compared_report", [EstimatorReport, CrossValidationReport]
+    )
+    def test_summarize_some_reports_support_metric(
+        self,
+        compared_report,
+        estimator_reports_some_brier_score,
+        cv_reports_some_brier_score,
+    ):
+        """If you call `summarize` with Brier score and some of the sub-reports
+        support it, you should get a dataframe with NaN."""
+        if compared_report is EstimatorReport:
+            report_1, report_2 = estimator_reports_some_brier_score
+        else:
+            report_1, report_2 = cv_reports_some_brier_score
+
+        report = ComparisonReport([report_1, report_2])
+        summary = report.metrics.summarize(scoring="brier_score")
+
+        if compared_report is EstimatorReport:
+            assert np.isnan(summary.frame().loc["Brier score"]["LinearSVC"])
+        else:
+            assert np.isnan(summary.frame().loc["Brier score"][("mean", "LinearSVC")])