From 7b259fda675f64f62782244e971de15c8c9ba9a8 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 20:49:23 +0000 Subject: [PATCH 01/37] require at least py39, fix nodeps tests collection --- .github/workflows/python-package.yml | 25 +++++++------- CHANGES.rst | 6 ++++ _ci/runtests_nodeps.sh | 13 -------- eli5/lime/lime.py | 49 +++++++++++----------------- eli5/lime/utils.py | 31 +++--------------- eli5/sklearn/utils.py | 47 +++++++++----------------- requirements.txt | 2 +- setup.py | 10 +++--- tests/test_sklearn_unhashing.py | 9 ++--- tests/utils.py | 10 ++---- tox.ini | 48 ++++++++++----------------- 11 files changed, 82 insertions(+), 168 deletions(-) delete mode 100644 _ci/runtests_nodeps.sh diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b13a2c5a..89fbf692 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,26 +17,23 @@ jobs: fail-fast: false matrix: include: - - python-version: '3.6' + - python-version: '3.12' tox-env: 'mypy' - - python-version: '3.9' + - python-version: '3.12' tox-env: 'docs' - - python-version: '3.6' - tox-env: 'py36' - - python-version: '3.6' - tox-env: 'py36-nodeps' - - python-version: '3.6' - tox-env: 'py36-extra' - - python-version: '3.7' - tox-env: 'py37' - - python-version: '3.8' - tox-env: 'py38' - - python-version: '3.8' - tox-env: 'py38-nodeps' - python-version: '3.9' tox-env: 'py39' - python-version: '3.9' tox-env: 'py39-nodeps' + - python-version: '3.9' + tox-env: 'py39-extra' + - python-version: '3.10' + tox-env: 'py310' + - python-version: '3.11' + tox-env: 'py311' + - python-version: '3.12' + tox-env: 'py313' + - python-version: '3.13' steps: - uses: actions/checkout@v2 diff --git a/CHANGES.rst b/CHANGES.rst index a733d762..856644ce 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changelog ========= +0.14.0 (?) +------------------- + +* drop support for python 3.6, 3.7, 3.8 +* add support for python 3.11, 3.12, 3.13 + 0.13.0 (2022-05-11) ------------------- diff --git a/_ci/runtests_nodeps.sh b/_ci/runtests_nodeps.sh deleted file mode 100644 index 36074072..00000000 --- a/_ci/runtests_nodeps.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -py.test --doctest-modules \ - --ignore eli5/lightning.py \ - --ignore eli5/sklearn_crfsuite \ - --ignore eli5/ipython.py \ - --ignore eli5/xgboost.py \ - --ignore eli5/lightgbm.py \ - --ignore eli5/catboost.py \ - --ignore eli5/keras \ - --ignore eli5/formatters/as_dataframe.py \ - --ignore eli5/formatters/image.py \ - --ignore tests/utils_image.py \ - --cov=eli5 --cov-report=html --cov-report=term "$@" diff --git a/eli5/lime/lime.py b/eli5/lime/lime.py index 2968da04..9a6e08d7 100644 --- a/eli5/lime/lime.py +++ b/eli5/lime/lime.py @@ -1,10 +1,8 @@ -# -*- coding: utf-8 -*- """ An impementation of LIME (http://arxiv.org/abs/1602.04938), an algorithm to explain predictions of black-box models. """ -from __future__ import absolute_import -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Optional import numpy as np from sklearn.feature_extraction.text import CountVectorizer @@ -14,7 +12,6 @@ from sklearn.base import clone, BaseEstimator import eli5 -from eli5.sklearn.utils import sklearn_version from eli5.lime.samplers import BaseSampler from eli5.lime.textutils import DEFAULT_TOKEN_PATTERN, CHAR_TOKEN_PATTERN from eli5.lime.samplers import MaskingTextSamplers @@ -139,18 +136,17 @@ class TextExplainer(BaseEstimator): Only available after :func:`fit`. 
""" def __init__(self, - n_samples=5000, # type: int - char_based=None, # type: bool + n_samples: int = 5000, + char_based: bool = None, clf=None, vec=None, - sampler=None, # type: BaseSampler - position_dependent=False, # type: bool - rbf_sigma=None, # type: float + sampler: BaseSampler = None, + position_dependent: bool = False, + rbf_sigma: float = None, random_state=None, - expand_factor=10, # type: Optional[int] - token_pattern=None, # type: Optional[str] - ): - # type: (...) -> None + expand_factor: Optional[int] = 10, + token_pattern: Optional[str] = None, + ) -> None: self.n_samples = n_samples self.random_state = random_state self.expand_factor = expand_factor @@ -161,8 +157,8 @@ def __init__(self, if char_based is None: if token_pattern is None: - self.char_based = False # type: Optional[bool] - self.token_pattern = DEFAULT_TOKEN_PATTERN # type: str + self.char_based: Optional[bool] = False + self.token_pattern: str = DEFAULT_TOKEN_PATTERN else: self.char_based = None self.token_pattern = token_pattern @@ -203,11 +199,7 @@ def __init__(self, ) self.vec = vec - def fit(self, - doc, # type: str - predict_proba, # type: Callable[[Any], Any] - ): - # type: (...) -> TextExplainer + def fit(self, doc: str, predict_proba: Callable[[Any], Any]) -> 'TextExplainer': """ Explain ``predict_proba`` probabilistic classification function for the ``doc`` example. This method fits a local classification @@ -323,23 +315,20 @@ def _default_clf(self): loss='log', penalty='elasticnet', alpha=1e-3, - random_state=self.rng_ + random_state=self.rng_, + tol=1e-3, ) - if sklearn_version() >= '0.19': - kwargs['tol'] = 1e-3 return SGDClassifier(**kwargs) - def _train_local_classifier(estimator, samples, - similarity, # type: np.ndarray - y_proba, # type: np.ndarray - expand_factor=10, # type: Optional[int] - test_size=0.3, # type: float + similarity: np.ndarray, + y_proba: np.ndarray, + expand_factor: Optional[int] = 10, + test_size: float = 0.3, random_state=None, - ): - # type: (...) -> Dict[str, float] + ) -> dict[str, float]: rng = check_random_state(random_state) (X_train, X_test, diff --git a/eli5/lime/utils.py b/eli5/lime/utils.py index 120dbfbd..3e21722c 100644 --- a/eli5/lime/utils.py +++ b/eli5/lime/utils.py @@ -1,16 +1,11 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from typing import List, Any - import numpy as np from scipy.stats import entropy +from scipy.sparse import issparse from sklearn.pipeline import Pipeline -from sklearn.utils import check_random_state, issparse -from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils import check_random_state from sklearn.utils import shuffle as _shuffle from eli5.utils import vstack -from eli5.sklearn.utils import sklearn_version def fit_proba(clf, X, y_proba, expand_factor=10, sample_weight=None, @@ -48,11 +43,8 @@ def with_sample_weight(clf, sample_weight, fit_params): return params -def fix_multiclass_predict_proba(y_proba, # type: np.ndarray - seen_classes, - complete_classes - ): - # type: (...) -> np.ndarray +def fix_multiclass_predict_proba( + y_proba: np.ndarray, seen_classes, complete_classes) -> np.ndarray: """ Add missing columns to predict_proba result. @@ -70,22 +62,7 @@ def fix_multiclass_predict_proba(y_proba, # type: np.ndarray return y_proba_fixed -class _PipelinePatched(Pipeline): - # Patch from https://github.com/scikit-learn/scikit-learn/pull/7723; - # only needed for scikit-learn < 0.19. 
- @if_delegate_has_method(delegate='_final_estimator') - def score(self, X, y=None, **score_params): - Xt = X - for name, transform in self.steps[:-1]: - if transform is not None: - Xt = transform.transform(Xt) - return self.steps[-1][-1].score(Xt, y, **score_params) - - def score_with_sample_weight(estimator, X, y=None, sample_weight=None): - if sklearn_version() < '0.19': - if isinstance(estimator, Pipeline) and sample_weight is not None: - estimator = _PipelinePatched(estimator.steps) if sample_weight is None: return estimator.score(X, y) return estimator.score(X, y, sample_weight=sample_weight) diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index 286d078a..b5cd3fae 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -1,7 +1,4 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from distutils.version import LooseVersion -from typing import Any, Optional, List, Tuple +from typing import Any, Optional import numpy as np import scipy.sparse as sp @@ -11,16 +8,14 @@ from eli5._feature_names import FeatureNames -def is_multiclass_classifier(clf): - # type: (Any) -> bool +def is_multiclass_classifier(clf) -> bool: """ Return True if a classifier is multiclass or False if it is binary. """ return clf.coef_.shape[0] > 1 -def is_multitarget_regressor(clf): - # type: (Any) -> bool +def is_multitarget_regressor(clf) -> bool: """ Return True if a regressor is multitarget or False if it predicts a single target. @@ -28,8 +23,7 @@ def is_multitarget_regressor(clf): return len(clf.coef_.shape) > 1 and clf.coef_.shape[0] > 1 -def is_probabilistic_classifier(clf): - # type: (Any) -> bool +def is_probabilistic_classifier(clf) -> bool: """ Return True if a classifier can return probabilities """ if not hasattr(clf, 'predict_proba'): return False @@ -40,8 +34,7 @@ def is_probabilistic_classifier(clf): return True -def predict_proba(estimator, X): - # type: (Any, Any) -> Optional[np.ndarray] +def predict_proba(estimator, X) -> Optional[np.ndarray]: """ Return result of predict_proba, if an estimator supports it, or None. """ if is_probabilistic_classifier(estimator): @@ -54,8 +47,7 @@ def predict_proba(estimator, X): return None -def has_intercept(estimator): - # type: (Any) -> bool +def has_intercept(estimator) -> bool: """ Return True if an estimator has intercept fit. """ if hasattr(estimator, 'fit_intercept'): return estimator.fit_intercept @@ -68,8 +60,7 @@ def has_intercept(estimator): def get_feature_names(clf, vec=None, bias_name='', feature_names=None, - num_features=None, estimator_feature_names=None): - # type: (Any, Any, Optional[str], Any, int, Any) -> FeatureNames + num_features=None, estimator_feature_names=None) -> FeatureNames: """ Return a FeatureNames instance that holds all feature names and a bias feature. @@ -112,11 +103,11 @@ def get_feature_names(clf, vec=None, bias_name='', feature_names=None, return FeatureNames(feature_names, bias_name=bias_name) -def get_feature_names_filtered(clf, vec=None, bias_name='', - feature_names=None, num_features=None, - feature_filter=None, feature_re=None, - estimator_feature_names=None): - # type: (...) 
-> Tuple[FeatureNames, List[int]] +def get_feature_names_filtered( + clf, vec=None, bias_name='', + feature_names=None, num_features=None, + feature_filter=None, feature_re=None, + estimator_feature_names=None) -> tuple[FeatureNames, list[int]]: feature_names = get_feature_names( clf=clf, vec=vec, @@ -247,8 +238,9 @@ def get_X0(X): return x -def handle_vec(clf, doc, vec, vectorized, feature_names, num_features=None): - # type: (...) -> Tuple[Any, FeatureNames] +def handle_vec( + clf, doc, vec, vectorized, feature_names, num_features=None, + ) -> tuple[Any, FeatureNames]: if not vectorized: vec = invert_hashing_and_fit(vec, [doc]) if (vec is None and feature_names is None and @@ -270,12 +262,3 @@ def add_intercept(X): return sp.hstack([X, intercept]).tocsr() else: return np.hstack([X, intercept]) - - -def sklearn_version(): - """Return sklearn version object which can be used for comparison. Usage: - >>> sklearn_version() > '0.17' - True - """ - from sklearn import __version__ - return LooseVersion(__version__) diff --git a/requirements.txt b/requirements.txt index a66f621d..6b1f7ff4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy >= 1.9.0 scipy singledispatch >= 3.4.0.3 -scikit-learn >= 0.20 +scikit-learn >= 1.0 attrs > 16.0.0 jinja2 >= 3.0.0 pip >= 8.1 diff --git a/setup.py b/setup.py index ee0b26fa..94c30ce9 100755 --- a/setup.py +++ b/setup.py @@ -36,11 +36,11 @@ def get_long_description(): 'numpy >= 1.9.0', 'scipy', 'six', - 'scikit-learn >= 0.20', + 'scikit-learn >= 1.0', 'graphviz', 'tabulate>=0.7.7', ], - python_requires=">=3.6", + python_requires=">=3.9", classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', @@ -48,10 +48,10 @@ def get_long_description(): 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ], ) diff --git a/tests/test_sklearn_unhashing.py b/tests/test_sklearn_unhashing.py index 51c3f174..dcdfc165 100644 --- a/tests/test_sklearn_unhashing.py +++ b/tests/test_sklearn_unhashing.py @@ -6,7 +6,6 @@ from sklearn.feature_extraction.text import HashingVectorizer from eli5.sklearn.unhashing import InvertableHashingVectorizer -from eli5.sklearn.utils import sklearn_version @pytest.mark.parametrize( ['always_signed', 'binary', 'alternate_sign'], [ @@ -22,12 +21,8 @@ def test_invertable_hashing_vectorizer(always_signed, binary, alternate_sign): n_features = 8 n_words = 4 * n_features - kwargs = dict(n_features=n_features, binary=binary) - if sklearn_version() < '0.19': - kwargs['non_negative'] = not alternate_sign - else: - kwargs['alternate_sign'] = alternate_sign - vec = HashingVectorizer(**kwargs) + vec = HashingVectorizer( + n_features=n_features, binary=binary, alternate_sign=alternate_sign) words = ['word_{}'.format(i) for i in range(n_words)] corpus = [w for i, word in enumerate(words, 1) for w in repeat(word, i)] split = len(corpus) // 2 diff --git a/tests/utils.py b/tests/utils.py index 5eb6c3a9..8eb31e5a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import print_function import os import inspect import json @@ -13,12 +11,9 @@ from eli5.formatters 
import format_as_text, format_as_html, format_as_dict from eli5.formatters.html import html_escape from eli5.formatters.text import format_signed -from eli5.sklearn.utils import sklearn_version -SGD_KWARGS = {'random_state': 42} -if sklearn_version() >= '0.19': - SGD_KWARGS['tol'] = 1e-3 +SGD_KWARGS = {'random_state': 42, 'tol': 1e-3} def rnd_len_arrays(dtype, min_len=0, max_len=3, elements=None): @@ -90,8 +85,7 @@ def get_names_coefs(feature_weights): for fw in feature_weights] -def check_targets_scores(explanation, atol=1e-8): - # type: (Explanation, float) -> None +def check_targets_scores(explanation: Explanation, atol: float = 1e-8) -> None: """ Check that feature weights sum to target score or proba, if both proba and score are present they match, and that there are no "remaining" features. diff --git a/tox.ini b/tox.ini index a5e3960d..be7c26bb 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ [tox] ; if adding or removing an environment, please also update .github/workflows/python-package.yml -envlist = docs,mypy,py36,py36-nodeps,py36-extra,py37,py38,py38-nodeps,py39,py39-nodeps +envlist = docs,mypy,py39,py310,py310-nodeps,py310-extra,py311,py312,py313 [base] deps= @@ -38,8 +38,8 @@ commands= ; bash _ci/runtests_default_with_crfsuite.sh {posargs: eli5 tests} -[testenv:py36-extra] -basepython=python3.6 +[testenv:py310-extra] +basepython=python3.10 deps= {[testenv]deps} xgboost @@ -55,42 +55,28 @@ commands= ; run tests for extra dependencies bash _ci/runtests_extra.sh {posargs: eli5 tests} -[testenv:py27-extra] -basepython=python2.7 -deps= - {[testenv]deps} - xgboost - lightgbm < 3.2.0 - catboost - tensorflow - keras - matplotlib - Pillow -commands={[testenv:py36-extra]commands} - -[testenv:py36-nodeps] +[testenv:py310-nodeps] deps= {[base]deps} commands= ; without lightning as it is optional pip install -e . 
- bash _ci/runtests_nodeps.sh {posargs: eli5 tests} - -[testenv:py38-nodeps] -basepython=python3.8 -deps={[base]deps} -commands={[testenv:py36-nodeps]commands} - - -[testenv:py39-nodeps] -basepython=python3.9 -deps={[base]deps} -commands={[testenv:py36-nodeps]commands} - + py.test --doctest-modules \ + --ignore eli5/lightning.py \ + --ignore eli5/sklearn_crfsuite \ + --ignore eli5/ipython.py \ + --ignore eli5/xgboost.py \ + --ignore eli5/lightgbm.py \ + --ignore eli5/catboost.py \ + --ignore eli5/keras \ + --ignore eli5/formatters/as_dataframe.py \ + --ignore eli5/formatters/image.py \ + --ignore tests/utils_image.py \ + --cov=eli5 --cov-report=html --cov-report=term {posargs: eli5 tests} [testenv:mypy] -basepython=python3.6 +basepython=python3.12 deps= {[testenv]deps} mypy == 0.750 From 220a256ccf924e64ac5987ad8b336c04258d5924 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 20:55:58 +0000 Subject: [PATCH 02/37] fix typos in github workflows --- .github/workflows/python-package.yml | 11 ++++++----- tox.ini | 5 ++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 89fbf692..832512a1 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -23,17 +23,18 @@ jobs: tox-env: 'docs' - python-version: '3.9' tox-env: 'py39' - - python-version: '3.9' - tox-env: 'py39-nodeps' - - python-version: '3.9' - tox-env: 'py39-extra' - python-version: '3.10' tox-env: 'py310' + - python-version: '3.10' + tox-env: 'py310-nodeps' + - python-version: '3.10' + tox-env: 'py310-extra' - python-version: '3.11' tox-env: 'py311' - python-version: '3.12' - tox-env: 'py313' + tox-env: 'py312' - python-version: '3.13' + tox-env: 'py313' steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index be7c26bb..66a6c71b 100644 --- a/tox.ini +++ b/tox.ini @@ -28,7 +28,6 @@ deps= {[base]deps} ipython pandas - commands= ; to install lightning numpy must be installed first pip install joblib "sklearn-contrib-lightning >= 0.4" @@ -49,16 +48,15 @@ deps= keras matplotlib Pillow - commands= pip install -e . ; run tests for extra dependencies bash _ci/runtests_extra.sh {posargs: eli5 tests} + [testenv:py310-nodeps] deps= {[base]deps} - commands= ; without lightning as it is optional pip install -e . @@ -75,6 +73,7 @@ commands= --ignore tests/utils_image.py \ --cov=eli5 --cov-report=html --cov-report=term {posargs: eli5 tests} + [testenv:mypy] basepython=python3.12 deps= From 1a3834127ffb7a0f5e6648cdc63ca6ed187eab55 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 21:00:24 +0000 Subject: [PATCH 03/37] fix tests for indices_to_bool_mask --- eli5/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eli5/utils.py b/eli5/utils.py index e5f669f8..f9d62595 100644 --- a/eli5/utils.py +++ b/eli5/utils.py @@ -56,10 +56,10 @@ def is_sparse_vector(x): def indices_to_bool_mask(indices, size): """ Convert indices to a boolean (integer) mask. 
- >>> list(indices_to_bool_mask(np.array([2, 3]), 4)) + >>> list(map(bool, indices_to_bool_mask(np.array([2, 3]), 4))) [False, False, True, True] - >>> list(indices_to_bool_mask([2, 3], 4)) + >>> list(map(bool, indices_to_bool_mask([2, 3], 4))) [False, False, True, True] >>> indices_to_bool_mask(np.array([5]), 2) From fed9fedf7da6974f9bc4acacd3c6ab6f80513b31 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 21:06:43 +0000 Subject: [PATCH 04/37] fix has_intercept for OneVsRestClassifier --- eli5/sklearn/utils.py | 2 ++ tests/test_sklearn_utils.py | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index b5cd3fae..d769222e 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -49,6 +49,8 @@ def predict_proba(estimator, X) -> Optional[np.ndarray]: def has_intercept(estimator) -> bool: """ Return True if an estimator has intercept fit. """ + if isinstance(estimator, OneVsRestClassifier): + estimator = estimator.estimator if hasattr(estimator, 'fit_intercept'): return estimator.fit_intercept if hasattr(estimator, 'intercept_'): diff --git a/tests/test_sklearn_utils.py b/tests/test_sklearn_utils.py index b9a34904..96f45189 100644 --- a/tests/test_sklearn_utils.py +++ b/tests/test_sklearn_utils.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - import numpy as np import pytest from sklearn.datasets import make_classification, make_regression From c6948986eca48b3ade5aa464b3b3344d882ee3cf Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 21:10:34 +0000 Subject: [PATCH 05/37] fix get_feature_names --- eli5/sklearn/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index d769222e..e6152edd 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -66,15 +66,15 @@ def get_feature_names(clf, vec=None, bias_name='', feature_names=None, """ Return a FeatureNames instance that holds all feature names and a bias feature. - If vec is None or doesn't have get_feature_names() method, + If vec is None or doesn't have get_feature_names_out() method, features are named x0, x1, x2, etc. """ if not has_intercept(clf): bias_name = None if feature_names is None: - if vec and hasattr(vec, 'get_feature_names'): - return FeatureNames(vec.get_feature_names(), bias_name=bias_name) + if vec and hasattr(vec, 'get_feature_names_out'): + return FeatureNames(vec.get_feature_names_out(), bias_name=bias_name) else: if estimator_feature_names is None: num_features = num_features or get_num_features(clf) From f8990b680f2ceeff0ee2a47efb59ea603ddc2f72 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 21:27:35 +0000 Subject: [PATCH 06/37] rename get_feature_names methods to get_feature_names_out --- eli5/_feature_names.py | 2 +- eli5/lime/_vectorizer.py | 19 +++---- eli5/sklearn/explain_weights.py | 10 ++-- eli5/sklearn/unhashing.py | 77 ++++++++++----------------- eli5/transform.py | 6 +-- tests/test_sklearn_explain_weights.py | 6 +-- tests/test_sklearn_transform.py | 21 ++------ tests/test_sklearn_unhashing.py | 2 +- tests/test_sklearn_vectorizers.py | 6 +-- 9 files changed, 52 insertions(+), 97 deletions(-) diff --git a/eli5/_feature_names.py b/eli5/_feature_names.py index ff1fd80c..ee734f7c 100644 --- a/eli5/_feature_names.py +++ b/eli5/_feature_names.py @@ -14,7 +14,7 @@ class FeatureNames(Sized, Iterable): A list-like object with feature names. 
It allows feature names for unknown features to be generated using a provided template, and to avoid making copies of large objects - in get_feature_names. + in get_feature_names_out. """ def __init__(self, feature_names=None, diff --git a/eli5/lime/_vectorizer.py b/eli5/lime/_vectorizer.py index 5356d6cd..ce1aca0b 100644 --- a/eli5/lime/_vectorizer.py +++ b/eli5/lime/_vectorizer.py @@ -1,6 +1,4 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from typing import Tuple, Callable, Dict, Optional, List +from typing import Callable import numpy as np from sklearn.base import BaseEstimator, TransformerMixin @@ -29,11 +27,10 @@ def transform(self, X): return np.ones(len(self.text_.tokens)).reshape((1, -1)) def get_doc_weighted_spans(self, - doc, # type: str - feature_weights, # type: FeatureWeights - feature_fn # type: Callable[[str], str] - ): - # type: (...) -> Tuple[Dict[Tuple[str, int], float], DocWeightedSpans] + doc: str, + feature_weights: FeatureWeights, + feature_fn: Callable[[str], str], + ) -> tuple[dict[tuple[str, int], float], DocWeightedSpans]: feature_weights_dict = _get_feature_weights_dict(feature_weights, feature_fn) spans = [] @@ -53,11 +50,9 @@ def get_doc_weighted_spans(self, ) return found_features, doc_weighted_spans - def _featname(self, idx, token): - # type: (int, str) -> str + def _featname(self, idx: int, token: str) -> str: return "[{}] {}".format(idx, token) - def get_feature_names(self): - # type: () -> List[str] + def get_feature_names_out(self) -> list[str]: return [self._featname(idx, token) for idx, token in enumerate(self.text_.tokens)] diff --git a/eli5/sklearn/explain_weights.py b/eli5/sklearn/explain_weights.py index 49010fb4..099110ad 100644 --- a/eli5/sklearn/explain_weights.py +++ b/eli5/sklearn/explain_weights.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - import numpy as np from sklearn.base import BaseEstimator, RegressorMixin @@ -38,7 +35,7 @@ OneClassSVM, ) # TODO: see https://github.com/scikit-learn/scikit-learn/pull/2250 -from sklearn.naive_bayes import BernoulliNB, MultinomialNB +# from sklearn.naive_bayes import BernoulliNB, MultinomialNB from sklearn.ensemble import ( GradientBoostingClassifier, GradientBoostingRegressor, @@ -54,11 +51,10 @@ DecisionTreeRegressor, ) -from eli5.base import ( - Explanation, TargetExplanation, FeatureImportances) +from eli5.base import Explanation, TargetExplanation from eli5.base_utils import singledispatch from eli5._feature_weights import get_top_features -from eli5.utils import argsort_k_largest_positive, get_target_display_names +from eli5.utils import get_target_display_names from eli5.sklearn.unhashing import handle_hashing_vec, is_invhashing from eli5.sklearn.treeinspect import get_tree_info from eli5.sklearn.utils import ( diff --git a/eli5/sklearn/unhashing.py b/eli5/sklearn/unhashing.py index f4f79b8d..6b1392de 100644 --- a/eli5/sklearn/unhashing.py +++ b/eli5/sklearn/unhashing.py @@ -1,14 +1,11 @@ -# -*- coding: utf-8 -*- """ Utilities to reverse transformation done by FeatureHasher or HashingVectorizer. 
""" -from __future__ import absolute_import from collections import defaultdict, Counter from itertools import chain -from typing import List, Iterable, Any, Dict, Tuple, Union +from typing import Iterable, Union import numpy as np -import six from sklearn.base import BaseEstimator, TransformerMixin from sklearn.feature_extraction.text import ( HashingVectorizer, @@ -30,7 +27,7 @@ class InvertableHashingVectorizer(BaseEstimator, TransformerMixin): Unlike HashingVectorizer it can be fit. During fitting :class:`~.InvertableHashingVectorizer` learns which input terms map to which feature columns/signs; this allows to provide more meaningful - :meth:`get_feature_names`. The cost is that it is no longer stateless. + :meth:`get_feature_names_out`. The cost is that it is no longer stateless. You can fit :class:`~.InvertableHashingVectorizer` on a random sample of documents (not necessarily on the whole training and testing data), @@ -41,16 +38,14 @@ class InvertableHashingVectorizer(BaseEstimator, TransformerMixin): :meth:`transform` works the same as HashingVectorizer.transform. """ - def __init__(self, vec, - unkn_template="FEATURE[%d]"): - # type: (HashingVectorizer, str) -> None + def __init__(self, vec: HashingVectorizer, unkn_template="FEATURE[%d]"): self.vec = vec self.unkn_template = unkn_template self.unhasher = FeatureUnhasher( hasher=vec._get_hasher(), unkn_template=unkn_template, ) - self.n_features = vec.n_features # type: int + self.n_features: int = vec.n_features def fit(self, X, y=None): """ Extract possible terms from documents """ @@ -64,8 +59,7 @@ def partial_fit(self, X): def transform(self, X): return self.vec.transform(X) - def get_feature_names(self, always_signed=True): - # type: (bool) -> FeatureNames + def get_feature_names_out(self, always_signed=True) -> FeatureNames: """ Return feature names. This is a best-effort function which tries to reconstruct feature @@ -79,7 +73,7 @@ def get_feature_names(self, always_signed=True): unprocessed classifier coefficients, and always_signed=False if you've taken care of :attr:`column_signs_`. """ - return self.unhasher.get_feature_names( + return self.unhasher.get_feature_names_out( always_signed=always_signed, always_positive=self._always_positive(), ) @@ -105,8 +99,7 @@ def column_signs_(self): self.unhasher.recalculate_attributes() return self.unhasher.column_signs_ - def _always_positive(self): - # type: () -> bool + def _always_positive(self) -> bool: return ( self.vec.binary or getattr(self.vec, 'non_negative', False) @@ -118,32 +111,28 @@ class FeatureUnhasher(BaseEstimator): """ Class for recovering a mapping used by FeatureHasher. """ - def __init__(self, hasher, unkn_template="FEATURE[%d]"): - # type: (FeatureHasher, str) -> None + def __init__(self, hasher: FeatureHasher, unkn_template="FEATURE[%d]"): if hasher.input_type != 'string': raise ValueError("FeatureUnhasher only supports hashers with " "input_type 'string', got %r." 
% hasher.input_type) self.hasher = hasher - self.n_features = self.hasher.n_features # type: int + self.n_features: int = self.hasher.n_features self.unkn_template = unkn_template self._attributes_dirty = True - self._term_counts = Counter() # type: Counter + self._term_counts = Counter() - def fit(self, X, y=None): - # type: (Iterable[str], Any) -> FeatureUnhasher + def fit(self, X: Iterable[str], y=None) -> 'FeatureUnhasher': self._term_counts.clear() self.partial_fit(X, y) self.recalculate_attributes(force=True) return self - def partial_fit(self, X, y=None): - # type: (Iterable[str], Any) -> FeatureUnhasher + def partial_fit(self, X: Iterable[str], y=None) -> 'FeatureUnhasher': self._term_counts.update(X) self._attributes_dirty = True return self - def get_feature_names(self, always_signed=True, always_positive=False): - # type: (bool, bool) -> FeatureNames + def get_feature_names_out(self, always_signed=True, always_positive=False) -> FeatureNames: self.recalculate_attributes() # lists of names with signs of known features @@ -164,23 +153,18 @@ def get_feature_names(self, always_signed=True, always_positive=False): unkn_template=self.unkn_template) def recalculate_attributes(self, force=False): - # type: (bool) -> None """ Update all computed attributes. It is only needed if you need to access computed attributes after :meth:`patrial_fit` was called. """ if not self._attributes_dirty and not force: return - terms = [term for term, _ in self._term_counts.most_common()] - if six.PY2: - terms = np.array(terms, dtype=np.object) - else: - terms = np.array(terms) + terms = np.array([term for term, _ in self._term_counts.most_common()]) if len(terms): indices, signs = _get_indices_and_signs(self.hasher, terms) else: indices, signs = np.array([]), np.array([]) - self.terms_ = terms # type: np.ndarray + self.terms_: np.ndarray = terms self.term_columns_ = indices self.term_signs_ = signs self.collisions_ = _get_collisions(indices) @@ -197,8 +181,7 @@ def _get_column_signs(self): colums_signs[hash_id] = 1 return colums_signs - def _get_collision_info(self): - # type: () -> Tuple[List[int], List[np.ndarray], List[np.ndarray]] + def _get_collision_info(self) -> tuple[list[int], list[np.ndarray], list[np.ndarray]]: column_ids, term_names, term_signs = [], [], [] for column_id, _term_ids in self.collisions_.items(): column_ids.append(column_id) @@ -207,13 +190,12 @@ def _get_collision_info(self): return column_ids, term_names, term_signs -def _get_collisions(indices): - # type: (...) -> Dict[int, List[int]] +def _get_collisions(indices) -> dict[int, list[int]]: """ Return a dict ``{column_id: [possible term ids]}`` with collision information. """ - collisions = defaultdict(list) # type: Dict[int, List[int]] + collisions: dict[int, list[int]] = defaultdict(list) for term_id, hash_id in enumerate(indices): collisions[hash_id].append(term_id) return dict(collisions) @@ -247,12 +229,12 @@ def is_invhashing(vec): def handle_hashing_vec(vec, feature_names, coef_scale, with_coef_scale=True): """ Return feature_names and coef_scale (if with_coef_scale is True), - calling .get_feature_names for invhashing vectorizers. + calling .get_feature_names_out for invhashing vectorizers. 
""" needs_coef_scale = with_coef_scale and coef_scale is None if is_invhashing(vec): if feature_names is None: - feature_names = vec.get_feature_names(always_signed=False) + feature_names = vec.get_feature_names_out(always_signed=False) if needs_coef_scale: coef_scale = vec.column_signs_ elif (isinstance(vec, FeatureUnion) and @@ -266,15 +248,15 @@ def handle_hashing_vec(vec, feature_names, coef_scale, with_coef_scale=True): return (feature_names, coef_scale) if with_coef_scale else feature_names -def _invhashing_union_feature_names_scale(vec_union): - # type: (FeatureUnion) -> Tuple[FeatureNames, np.ndarray] - feature_names_store = {} # type: Dict[int, Union[str, List]] +def _invhashing_union_feature_names_scale( + vec_union: FeatureUnion) -> tuple[FeatureNames, np.ndarray]: + feature_names_store: dict[int, Union[str, list]] = {} unkn_template = None shift = 0 coef_scale_values = [] for vec_name, vec in vec_union.transformer_list: if isinstance(vec, InvertableHashingVectorizer): - vec_feature_names = vec.get_feature_names(always_signed=False) + vec_feature_names = vec.get_feature_names_out(always_signed=False) unkn_template = vec_feature_names.unkn_template for idx, fs in vec_feature_names.feature_names.items(): new_fs = [] @@ -286,7 +268,7 @@ def _invhashing_union_feature_names_scale(vec_union): coef_scale_values.append((shift, vec.column_signs_)) shift += vec_feature_names.n_features else: - vec_feature_names = vec.get_feature_names() + vec_feature_names = vec.get_feature_names_out() feature_names_store.update( (shift + idx, '{}__{}'.format(vec_name, fname)) for idx, fname in enumerate(vec_feature_names)) @@ -303,10 +285,8 @@ def _invhashing_union_feature_names_scale(vec_union): def invert_hashing_and_fit( - vec, # type: Union[FeatureUnion, HashingVectorizer] - docs - ): - # type: (...) -> Union[FeatureUnion, InvertableHashingVectorizer] + vec: Union[FeatureUnion, HashingVectorizer], docs, + ) -> Union[FeatureUnion, InvertableHashingVectorizer]: """ Create an :class:`~.InvertableHashingVectorizer` from hashing vectorizer vec and fit it on docs. If vec is a FeatureUnion, do it for all hashing vectorizers in the union. @@ -323,8 +303,7 @@ def invert_hashing_and_fit( return vec -def _fit_invhashing_union(vec_union, docs): - # type: (FeatureUnion, Any) -> FeatureUnion +def _fit_invhashing_union(vec_union: FeatureUnion, docs) -> FeatureUnion: """ Fit InvertableHashingVectorizer on doc inside a FeatureUnion. """ return FeatureUnion( diff --git a/eli5/transform.py b/eli5/transform.py index d79082a8..d89b52b2 100644 --- a/eli5/transform.py +++ b/eli5/transform.py @@ -12,7 +12,7 @@ def transform_feature_names(transformer, in_names=None): transformations for each class of transformer. If there is no ``singledispatch`` handler registered for a transformer - class, ``transformer.get_feature_names()`` method is called; if there is + class, ``transformer.get_feature_names_out()`` method is called; if there is no such method then feature names are not supported and this function raises an exception. 
@@ -28,7 +28,7 @@ def transform_feature_names(transformer, in_names=None): ------- feature_names : list of str """ - if hasattr(transformer, 'get_feature_names'): - return transformer.get_feature_names() + if hasattr(transformer, 'get_feature_names_out'): + return transformer.get_feature_names_out() raise NotImplementedError('transform_feature_names not available for ' '{}'.format(transformer)) diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index 7f5469c2..cd0acb04 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from functools import partial import re @@ -120,7 +118,7 @@ def assert_explained_weights_linear_classifier( X = vec.fit_transform(docs) if add_bias: X = sp.hstack([X, np.ones((X.shape[0], 1))]) - feature_names = vec.get_feature_names() + ['BIAS'] + feature_names = vec.get_feature_names_out() + ['BIAS'] else: feature_names = None @@ -281,7 +279,7 @@ def test_explain_linear_hashed_pos_neg(newsgroups_train, pass_feature_weights): if pass_feature_weights: res = explain_weights( clf, top=(10, 10), target_names=target_names, - feature_names=ivec.get_feature_names(always_signed=False), + feature_names=ivec.get_feature_names_out(always_signed=False), coef_scale=ivec.column_signs_) else: res = explain_weights( diff --git a/tests/test_sklearn_transform.py b/tests/test_sklearn_transform.py index 1d25466c..d1b2cc8c 100644 --- a/tests/test_sklearn_transform.py +++ b/tests/test_sklearn_transform.py @@ -6,9 +6,9 @@ from sklearn.feature_selection import ( SelectPercentile, SelectKBest, - SelectFpr, # TODO: add tests and document - SelectFdr, # TODO: add tests and document - SelectFwe, # TODO: add tests and document + # SelectFpr, # TODO: add tests and document + # SelectFdr, # TODO: add tests and document + # SelectFwe, # TODO: add tests and document GenericUnivariateSelect, VarianceThreshold, RFE, @@ -16,17 +16,6 @@ SelectFromModel, ) from sklearn.linear_model import LogisticRegression -_additional_test_cases = [] -try: - from sklearn.linear_model import ( # type: ignore - RandomizedLogisticRegression, - RandomizedLasso, # TODO: add tests and document - ) - _additional_test_cases.append( - (RandomizedLogisticRegression(random_state=42), - ['', '', ''])) -except ImportError: # Removed in scikit-learn 0.21 - pass from sklearn.preprocessing import ( MinMaxScaler, StandardScaler, @@ -46,7 +35,7 @@ def fit(self, X, y=None): def transform(self, X): return X[:, :3] - def get_feature_names(self): + def get_feature_names_out(self): return ['f1', 'f2', 'f3'] @@ -95,7 +84,7 @@ def selection_score_func(X, y): ['', '']), (RFECV(LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), cv=3), ['', '', '', '']), -] + _additional_test_cases) +]) def test_transform_feature_names_iris(transformer, expected, iris_train): X, y, _, _ = iris_train transformer.fit(X, y) diff --git a/tests/test_sklearn_unhashing.py b/tests/test_sklearn_unhashing.py index dcdfc165..83de1c3e 100644 --- a/tests/test_sklearn_unhashing.py +++ b/tests/test_sklearn_unhashing.py @@ -44,7 +44,7 @@ def test_invertable_hashing_vectorizer(always_signed, binary, alternate_sign): def check_feature_names(vec, ivec, always_signed, corpus, alternate_sign): - feature_names = ivec.get_feature_names(always_signed=always_signed) + feature_names = ivec.get_feature_names_out(always_signed=always_signed) seen_words = set() counts = Counter(corpus) for idx, collisions in 
enumerate(feature_names): diff --git a/tests/test_sklearn_vectorizers.py b/tests/test_sklearn_vectorizers.py index 0575c8b1..47e8bd23 100644 --- a/tests/test_sklearn_vectorizers.py +++ b/tests/test_sklearn_vectorizers.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from pprint import pprint import attr @@ -79,7 +77,7 @@ def test_explain_hashing_vectorizer(newsgroups_train_binary): assert res_vectorized == _without_weighted_spans(res) assert res == get_res( - feature_names=ivec.get_feature_names(always_signed=False)) + feature_names=ivec.get_feature_names_out(always_signed=False)) def _without_weighted_spans(res): @@ -105,7 +103,7 @@ def test_explain_linear_dense(): [test_day_vec] = vec.transform(test_day) res2 = explain_prediction( clf, test_day_vec, target_names=target_names, - vectorized=True, feature_names=vec.get_feature_names()) + vectorized=True, feature_names=vec.get_feature_names_out()) assert res1 == res2 From 75da85bd2190dea6726629566327715ef552046b Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 21:44:29 +0000 Subject: [PATCH 07/37] pass test_explain_linear --- eli5/sklearn/utils.py | 16 ++++++++++++---- tests/test_sklearn_explain_weights.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index e6152edd..c6254820 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -12,7 +12,7 @@ def is_multiclass_classifier(clf) -> bool: """ Return True if a classifier is multiclass or False if it is binary. """ - return clf.coef_.shape[0] > 1 + return len(clf.classes_) > 1 def is_multitarget_regressor(clf) -> bool: @@ -146,7 +146,13 @@ def get_coef(clf, label_id, scale=None): ``scale`` (optional) is a scaling vector; coef_[i] => coef[i] * scale[i] if scale[i] is not nan. Intercept is not scaled. 
""" - if len(clf.coef_.shape) == 2: + if isinstance(clf, OneVsRestClassifier): + coef = clf.estimators_[label_id].coef_ + if len(coef.shape) == 2 and coef.shape[0] == 1: + coef = coef[0] + if len(coef.shape) != 1: + raise ValueError(f'Unexpected coef shape: {coef.shape}') + elif len(clf.coef_.shape) == 2: # Most classifiers (even in binary case) and regressors coef = _dense_1d(clf.coef_[label_id]) elif len(clf.coef_.shape) == 1: @@ -159,7 +165,7 @@ def get_coef(clf, label_id, scale=None): # Lasso with one feature: 0D array coef = np.array([clf.coef_]) else: - raise ValueError('Unexpected clf.coef_ shape: %s' % clf.coef_.shape) + raise ValueError(f'Unexpected coef shape: {clf.coef_.shape}') if scale is not None: if coef.shape != scale.shape: @@ -173,7 +179,9 @@ def get_coef(clf, label_id, scale=None): if not has_intercept(clf): return coef - if label_id == 0 and not isinstance(clf.intercept_, np.ndarray): + if isinstance(clf, OneVsRestClassifier): + bias = clf.estimators_[label_id].intercept_ + elif label_id == 0 and not isinstance(clf.intercept_, np.ndarray): bias = clf.intercept_ else: bias = clf.intercept_[label_id] diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index cd0acb04..80eb4aaa 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -162,7 +162,7 @@ def assert_explained_weights_linear_regressor(boston_train, reg, has_bias=True): [RidgeClassifier(random_state=42)], [RidgeClassifierCV()], [SGDClassifier(**SGD_KWARGS)], - [SGDClassifier(loss='log', **SGD_KWARGS)], + [SGDClassifier(loss='log_loss', **SGD_KWARGS)], [PassiveAggressiveClassifier(random_state=42)], [Perceptron(random_state=42)], [LinearSVC(random_state=42)], From 0c492af655bc59a680737cff2c82b9a19d4f63ca Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 22:02:54 +0000 Subject: [PATCH 08/37] fix lime, fix is_multiclass_classifier from the previous commit --- eli5/lime/lime.py | 2 +- eli5/lime/textutils.py | 77 ++++++++++++++++++------------------------ eli5/sklearn/utils.py | 4 ++- tests/test_lime.py | 11 ++---- 4 files changed, 38 insertions(+), 56 deletions(-) diff --git a/eli5/lime/lime.py b/eli5/lime/lime.py index 9a6e08d7..6bbf6c38 100644 --- a/eli5/lime/lime.py +++ b/eli5/lime/lime.py @@ -312,7 +312,7 @@ def _fix_target_names(self, kwargs): def _default_clf(self): kwargs = dict( - loss='log', + loss='log_loss', penalty='elasticnet', alpha=1e-3, random_state=self.rng_, diff --git a/eli5/lime/textutils.py b/eli5/lime/textutils.py index 98da0428..362e23b6 100644 --- a/eli5/lime/textutils.py +++ b/eli5/lime/textutils.py @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- """ Utilities for text generation. """ -from __future__ import absolute_import import re import math from typing import List, Tuple, Union, Optional @@ -14,22 +12,21 @@ # the same as scikit-learn token pattern, but allows single-char tokens -DEFAULT_TOKEN_PATTERN = r'(?u)\b\w+\b' +DEFAULT_TOKEN_PATTERN = r'\b\w+\b' # non-whitespace chars CHAR_TOKEN_PATTERN = r'[^\s]' -def generate_samples(text, # type: TokenizedText - n_samples=500, # type: int - bow=True, # type: bool +def generate_samples(text: 'TokenizedText', + n_samples=500, + bow=True, random_state=None, - replacement='', # type: str - min_replace=1, # type: Union[int, float] - max_replace=1.0, # type: Union[int, float] - group_size=1, # type: int - ): - # type: (...) 
-> Tuple[List[str], np.ndarray, np.ndarray] + replacement='', + min_replace=1.0, + max_replace=1.0, + group_size=1, + ) -> Tuple[List[str], np.ndarray, np.ndarray]: """ Return ``n_samples`` changed versions of text (with some words removed), along with distances between the original text and a generated @@ -66,21 +63,19 @@ def cosine_similarity_vec(num_tokens, num_removed_vec): class TokenizedText(object): - def __init__(self, text, token_pattern=DEFAULT_TOKEN_PATTERN): - # type: (str, str) -> None + def __init__(self, text: str, token_pattern=DEFAULT_TOKEN_PATTERN): self.text = text self.split = SplitResult.fromtext(text, token_pattern) - self._vocab = None # type: Optional[List[str]] + self._vocab: Optional[list[str]] = None def replace_random_tokens(self, - n_samples, # type: int - replacement='', # type: str + n_samples: int, + replacement='', random_state=None, - min_replace=1, # type: Union[int, float] - max_replace=1.0, # type: Union[int, float] - group_size=1 # type: int - ): - # type: (...) -> List[Tuple[str, int, np.ndarray]] + min_replace=1.0, + max_replace=1.0, + group_size=1, + ) -> list[tuple[str, int, np.ndarray]]: """ Return a list of ``(text, replaced_count, mask)`` tuples with n_samples versions of text with some words replaced. @@ -110,13 +105,12 @@ def replace_random_tokens(self, return res def replace_random_tokens_bow(self, - n_samples, # type: int - replacement='', # type: str + n_samples: int, + replacement='', random_state=None, - min_replace=1, # type: Union[int, float] - max_replace=1.0, # type: Union[int, float] - ): - # type: (...) -> List[Tuple[str, int, np.ndarray]] + min_replace=1.0, + max_replace=1.0, + ) -> list[tuple[str, int, np.ndarray]]: """ Return a list of ``(text, replaced_words_count, mask)`` tuples with n_samples versions of text with some words replaced. @@ -144,11 +138,10 @@ def replace_random_tokens_bow(self, return res def _get_min_max(self, - min_replace, # type: Union[int, float] - max_replace, # type: Union[int, float] - hard_maximum # type: int - ): - # type: (...) 
-> Tuple[int, int] + min_replace: Union[int, float], + max_replace: Union[int, float], + hard_maximum: int, + ) -> tuple[int, int]: if isinstance(min_replace, float): min_replace = int(math.floor(hard_maximum * min_replace)) or 1 if isinstance(max_replace, float): @@ -158,8 +151,7 @@ def _get_min_max(self, return min_replace, max_replace @property - def vocab(self): - # type: () -> List[str] + def vocab(self) -> list[str]: if self._vocab is None: self._vocab = sorted(set(self.tokens)) return self._vocab @@ -180,8 +172,7 @@ def __init__(self, parts): self.starts = self.lenghts.cumsum() @classmethod - def fromtext(cls, text, token_pattern=DEFAULT_TOKEN_PATTERN): - # type: (str, str) -> SplitResult + def fromtext(cls, text: str, token_pattern=DEFAULT_TOKEN_PATTERN) -> 'SplitResult': token_pattern = u"(%s)" % token_pattern parts = re.split(token_pattern, text) return cls(parts) @@ -195,21 +186,17 @@ def tokens(self): return self.parts[1::2] @property - def token_spans(self): - # type: () -> List[Tuple[int, int]] + def token_spans(self) -> list[tuple[int, int]]: return list(zip(self.starts[::2], self.starts[1::2])) - def copy(self): - # type: () -> SplitResult + def copy(self) -> 'SplitResult': return self.__class__(self.parts.copy()) - def masked(self, invmask, replacement=''): - # type: (Union[np.ndarray, List[int]], str) -> SplitResult + def masked(self, invmask: Union[np.ndarray, list[int]], replacement='') -> 'SplitResult': s = self.copy() s.tokens[invmask] = replacement return s @property - def text(self): - # type: () -> str + def text(self) -> str: return "".join(self.parts) diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index c6254820..20ca8f4e 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -12,7 +12,9 @@ def is_multiclass_classifier(clf) -> bool: """ Return True if a classifier is multiclass or False if it is binary. 
""" - return len(clf.classes_) > 1 + if isinstance(clf, OneVsRestClassifier): + return len(clf.estimators_) > 1 + return clf.coef_.shape[0] > 1 def is_multitarget_regressor(clf) -> bool: diff --git a/tests/test_lime.py b/tests/test_lime.py index 92d3d7f2..137e4048 100644 --- a/tests/test_lime.py +++ b/tests/test_lime.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - import numpy as np from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer from sklearn.naive_bayes import MultinomialNB @@ -15,11 +12,7 @@ def test_lime_explain_probabilistic(newsgroups_train): docs, y, target_names = newsgroups_train - try: - vec = HashingVectorizer(alternate_sign=False) - except TypeError: - # sklearn < 0.19 - vec = HashingVectorizer(non_negative=True) + vec = HashingVectorizer(alternate_sign=False) clf = MultinomialNB() X = vec.fit_transform(docs) @@ -154,7 +147,7 @@ def test_text_explainer_token_pattern(): predict_proba = substring_presence_predict_proba('bar') # a different token_pattern - te = TextExplainer(token_pattern=r'(?u)\b[-\w]+\b') + te = TextExplainer(token_pattern=r'\b[-\w]+\b') te.fit(text, predict_proba) print(te.metrics_) assert te.metrics_['score'] > 0.95 From 6573f6512fda0b956c0f864524bb1c52135c89c4 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Thu, 20 Mar 2025 22:17:41 +0000 Subject: [PATCH 09/37] fix target formatting --- eli5/formatters/text.py | 86 ++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/eli5/formatters/text.py b/eli5/formatters/text.py index e6abb286..f2e631cd 100644 --- a/eli5/formatters/text.py +++ b/eli5/formatters/text.py @@ -1,9 +1,8 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from itertools import chain -import six from tabulate import tabulate -from typing import List, Optional, Iterator +from typing import Optional, Iterator + +import numpy as np from eli5.base import Explanation, FeatureImportances from . import fields @@ -15,17 +14,16 @@ from .trees import tree2text -_PLUS_MINUS = "+-" if six.PY2 else "±" -_ELLIPSIS = '...' if six.PY2 else '…' -_SPACE = '_' if six.PY2 else '░' +_PLUS_MINUS = "±" +_ELLIPSIS = '…' +_SPACE = '░' -def format_as_text(expl, # type: Explanation +def format_as_text(expl: Explanation, show=fields.ALL, - highlight_spaces=None, # type: Optional[bool] - show_feature_values=False, # type: bool - ): - # type: (...) -> str + highlight_spaces: Optional[bool] = None, + show_feature_values: bool = False, + ) -> str: """ Format explanation as text. Parameters @@ -44,7 +42,7 @@ def format_as_text(expl, # type: Explanation When True, feature values are shown along with feature contributions. Default is False. - show : List[str], optional + show : list[str], optional List of sections to show. Allowed values: * 'targets' - per-target feature weights; @@ -59,7 +57,7 @@ def format_as_text(expl, # type: Explanation ``INFO`` (method and description), ``WEIGHTS`` (all the rest), and ``ALL`` (all). 
""" - lines = [] # type: List[str] + lines: list[str] = [] if highlight_spaces is None: highlight_spaces = should_highlight_spaces(expl) @@ -101,23 +99,20 @@ def format_as_text(expl, # type: Explanation return '\n'.join(lines) -def _method_lines(explanation): - # type: (Explanation) -> List[str] +def _method_lines(explanation: Explanation) -> list[str]: return ['Explained as: {}'.format(explanation.method)] -def _description_lines(explanation): - # type: (Explanation) -> List[str] +def _description_lines(explanation: Explanation) -> list[str]: return [explanation.description or ''] -def _error_lines(explanation): - # type: (Explanation) -> List[str] +def _error_lines(explanation: Explanation) -> list[str]: return ['Error: {}'.format(explanation.error)] -def _feature_importances_lines(explanation, hl_spaces): - # type: (Explanation, Optional[bool]) -> Iterator[str] +def _feature_importances_lines( + explanation: Explanation, hl_spaces: Optional[bool]) -> Iterator[str]: max_width = 0 assert explanation.feature_importances is not None for line in _fi_lines(explanation.feature_importances, hl_spaces): @@ -128,8 +123,9 @@ def _feature_importances_lines(explanation, hl_spaces): explanation.feature_importances.remaining, kind='', width=max_width) -def _fi_lines(feature_importances, hl_spaces): - # type: (FeatureImportances, Optional[bool]) -> Iterator[str] +def _fi_lines( + feature_importances: FeatureImportances, hl_spaces: Optional[bool], + ) -> Iterator[str]: for fw in feature_importances.importances: featname = _format_feature(fw.feature, hl_spaces) if fw.std or fw.weight: @@ -147,14 +143,12 @@ def _fi_lines(feature_importances, hl_spaces): ) -def _decision_tree_lines(explanation): - # type: (Explanation) -> List[str] +def _decision_tree_lines(explanation: Explanation) -> list[str]: assert explanation.decision_tree is not None return ["", tree2text(explanation.decision_tree)] -def _transition_features_lines(explanation): - # type: (Explanation) -> List[str] +def _transition_features_lines(explanation: Explanation) -> list[str]: tf = explanation.transition_features assert tf is not None return [ @@ -166,12 +160,11 @@ def _transition_features_lines(explanation): ] -def _targets_lines(explanation, # type: Explanation - hl_spaces, # type: Optional[bool] - show_feature_values, # type: bool - explaining_prediction, # type: bool - ): - # type: (...) 
-> List[str] +def _targets_lines(explanation: Explanation, + hl_spaces: Optional[bool], + show_feature_values: bool, + explaining_prediction: bool, + ) -> list[str]: lines = [] assert explanation.targets is not None for target in explanation.targets: @@ -181,7 +174,7 @@ def _targets_lines(explanation, # type: Explanation header = "%s%r%s top features" % ( 'y=' if not explanation.is_regression else '', - target.target, + _np_to_native(target.target), scores) lines.append(header) @@ -228,8 +221,7 @@ def _targets_lines(explanation, # type: Explanation return lines -def _format_scores(proba, score): - # type: (Optional[float], Optional[float]) -> str +def _format_scores(proba: Optional[float], score: Optional[float]) -> str: scores = [] if proba is not None: scores.append("probability=%0.3f" % proba) @@ -238,8 +230,7 @@ def _format_scores(proba, score): return ", ".join(scores) -def _format_remaining(remaining, kind, width): - # type: (int, str, int) -> str +def _format_remaining(remaining: int, kind: str, width: int) -> str: s = '{ellipsis} {remaining} more {kind}{ellipsis}'.format( ellipsis=_ELLIPSIS, remaining=remaining, @@ -248,8 +239,7 @@ def _format_remaining(remaining, kind, width): return ('{:^%d}' % width).format(s) -def _format_feature(name, hl_spaces): - # type: (...) -> str +def _format_feature(name, hl_spaces) -> str: if isinstance(name, bytes): name = name.decode('utf8') if isinstance(name, FormattedFeatureName): @@ -261,19 +251,25 @@ def _format_feature(name, hl_spaces): return _format_single_feature(name, hl_spaces=hl_spaces) -def _format_single_feature(feature, hl_spaces): - # type: (str, bool) -> str +def _format_single_feature(feature: str, hl_spaces: bool) -> str: if hl_spaces: return replace_spaces(feature, lambda n, _: _SPACE * n) else: return feature -def _format_unhashed_feature(name, hl_spaces, sep=' | '): - # type: (List, bool, str) -> str +def _format_unhashed_feature(name: list, hl_spaces: bool, sep=' | ') -> str: """ Format feature name for hashed features. """ return sep.join( format_signed(n, _format_single_feature, hl_spaces=hl_spaces) for n in name) + + +def _np_to_native(value) -> str: + if isinstance(value, np.integer): + value = int(value) + elif isinstance(value, np.str_): + value = str(value) + return value From 20475de9c6efed16d719ec4ac6260e5fe4b7e340 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 12:00:11 +0000 Subject: [PATCH 10/37] fix warnings: remove multi_class='multinomial' --- tests/test_sklearn_explain_prediction.py | 12 ++++-------- tests/test_sklearn_explain_weights.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/test_sklearn_explain_prediction.py b/tests/test_sklearn_explain_prediction.py index 277fee46..957ad5b0 100644 --- a/tests/test_sklearn_explain_prediction.py +++ b/tests/test_sklearn_explain_prediction.py @@ -1,9 +1,6 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from functools import partial from pprint import pprint import re -from typing import List import pytest import numpy as np @@ -275,12 +272,11 @@ def assert_predicted_class_used(clf, X): return assert_class_used(clf, X, y_pred) -def assert_class_used(clf, X, y, **explain_kwargs): - # type: (...) 
-> List[Explanation] +def assert_class_used(clf, X, y, **explain_kwargs) -> list[Explanation]: """ Check that classes y are used for explanations of X predictions """ explanations = [] for x, pred_target in zip(X, y): - res = explain_prediction(clf, x, **explain_kwargs) # type: Explanation + res: Explanation = explain_prediction(clf, x, **explain_kwargs) explanations.append(res) assert len(res.targets) == 1 if res.targets[0].score != 0: @@ -339,11 +335,11 @@ def _assert_feature_filter_works(get_res, x): @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], - [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')], + [LogisticRegression(random_state=42, solver='lbfgs')], [LogisticRegression(random_state=42, fit_intercept=False)], [LogisticRegressionCV(random_state=42)], [SGDClassifier(**SGD_KWARGS)], - [SGDClassifier(loss='log', **SGD_KWARGS)], + [SGDClassifier(loss='log_loss', **SGD_KWARGS)], [PassiveAggressiveClassifier(random_state=42)], [Perceptron(random_state=42)], [RidgeClassifier(random_state=42)], diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index 80eb4aaa..9f8bf13d 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -156,7 +156,7 @@ def assert_explained_weights_linear_regressor(boston_train, reg, has_bias=True): @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], - [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')], + [LogisticRegression(random_state=42, solver='lbfgs')], [LogisticRegression(random_state=42, fit_intercept=False)], [LogisticRegressionCV(random_state=42)], [RidgeClassifier(random_state=42)], From c9fe612ce6414e8133e1fe930aa143401a0fbc18 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 12:08:50 +0000 Subject: [PATCH 11/37] fix remaining tests for sklearn weight explanation make sure we converge well enough --- tests/test_sklearn_explain_weights.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index 9f8bf13d..6018de63 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -479,14 +479,14 @@ def test_unsupported(): [ElasticNetCV(random_state=42)], [HuberRegressor()], [Lars()], - [LarsCV(max_n_alphas=10)], + [LarsCV(max_n_alphas=100)], [Lasso(random_state=42)], [LassoCV(random_state=42)], [LassoLars(alpha=0.01)], [LassoLarsCV(max_n_alphas=10)], [LassoLarsIC()], [OrthogonalMatchingPursuit(n_nonzero_coefs=10)], - [OrthogonalMatchingPursuitCV()], + [OrthogonalMatchingPursuitCV(max_iter=10)], [PassiveAggressiveRegressor(C=0.1, random_state=42)], [Ridge(random_state=42)], [RidgeCV()], From f0d0f2e28a2d4c47c9e3b559b94b81b3e47dcfc9 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 12:25:28 +0000 Subject: [PATCH 12/37] fix explain prediction for gradient boosting due to loss refactoring --- eli5/sklearn/explain_prediction.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/eli5/sklearn/explain_prediction.py b/eli5/sklearn/explain_prediction.py index 18dcc36f..2ce8616f 100644 --- a/eli5/sklearn/explain_prediction.py +++ b/eli5/sklearn/explain_prediction.py @@ -3,7 +3,7 @@ import numpy as np import scipy.sparse as sp -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, is_classifier from sklearn.ensemble import ( ExtraTreesClassifier, 
ExtraTreesRegressor, @@ -12,6 +12,7 @@ RandomForestClassifier, RandomForestRegressor, ) +from sklearn.ensemble._gb import _init_raw_predictions from sklearn.linear_model import ( ElasticNet, # includes Lasso, MultiTaskElasticNet, etc. ElasticNetCV, @@ -581,9 +582,11 @@ def _trees_feature_weights(clf, X, feature_names, num_targets): if hasattr(clf, 'init_'): if clf.init_ == 'zero': bias_init = 0 - elif is_grad_boost and hasattr(clf.loss_, 'get_init_raw_predictions'): - bias_init = clf.loss_.get_init_raw_predictions( - X, clf.init_).astype(np.float64)[0] + elif is_grad_boost: + bias_init = _init_raw_predictions( + X, clf.init_, clf._loss, is_classifier(clf) + ) + bias_init = bias_init.astype(np.float64)[0] else: bias_init = clf.init_.predict(X)[0] feature_weights[feature_names.bias_idx] += bias_init From ac59cc79187f5a9db3ec3d269dfefa7459e507e4 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 12:40:02 +0000 Subject: [PATCH 13/37] fix permutation importance tests, raise min sklearn version to 1.6 --- eli5/sklearn/permutation_importance.py | 24 ++++++++++++++------ setup.py | 2 +- tests/test_sklearn_permutation_importance.py | 1 - 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py index 0c68cca4..aa448953 100644 --- a/eli5/sklearn/permutation_importance.py +++ b/eli5/sklearn/permutation_importance.py @@ -20,12 +20,18 @@ if pandas_available: import pandas as pd -def _estimator_has(attr): +def _wrapped_estimator_has(attr): def check(self): return hasattr(self.wrapped_estimator_, attr) return check +def _estimator_has(attr): + def check(self): + return hasattr(self.estimator, attr) + + return check + CAVEATS_CV_NONE = """ Feature importances are computed on the same data as used for training, i.e. 
feature importances don't reflect importance of features for @@ -202,7 +208,7 @@ def fit(self, X, y, groups=None, **fit_params): self.estimator_ = clone(self.estimator) self.estimator_.fit(X, y, **fit_params) - X = check_array(X, force_all_finite='allow-nan') + X = check_array(X, ensure_all_finite='allow-nan') if self.cv not in (None, "prefit"): si = self._cv_scores_importances(X, y, groups=groups, **fit_params) @@ -253,26 +259,30 @@ def caveats_(self): # ============= Exposed methods of a wrapped estimator: - @available_if(_estimator_has('score')) + @available_if(_wrapped_estimator_has('score')) def score(self, X, y=None, *args, **kwargs): return self.wrapped_estimator_.score(X, y, *args, **kwargs) - @available_if(_estimator_has('predict')) + @available_if(_wrapped_estimator_has('predict')) def predict(self, X): return self.wrapped_estimator_.predict(X) - @available_if(_estimator_has('predict_proba')) + @available_if(_wrapped_estimator_has('predict_proba')) def predict_proba(self, X): return self.wrapped_estimator_.predict_proba(X) - @available_if(_estimator_has('predict_log_proba')) + @available_if(_wrapped_estimator_has('predict_log_proba')) def predict_log_proba(self, X): return self.wrapped_estimator_.predict_log_proba(X) - @available_if(_estimator_has('decision_function')) + @available_if(_wrapped_estimator_has('decision_function')) def decision_function(self, X): return self.wrapped_estimator_.decision_function(X) + @available_if(_estimator_has('__sklearn_tags__')) + def __sklearn_tags__(self): + return self.estimator.__sklearn_tags__() + @property def wrapped_estimator_(self): if self.cv == "prefit" or not self.refit: diff --git a/setup.py b/setup.py index 94c30ce9..70e1e826 100755 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def get_long_description(): 'numpy >= 1.9.0', 'scipy', 'six', - 'scikit-learn >= 1.0', + 'scikit-learn >= 1.6.0', 'graphviz', 'tabulate>=0.7.7', ], diff --git a/tests/test_sklearn_permutation_importance.py b/tests/test_sklearn_permutation_importance.py index 9848f4a7..45b606f6 100644 --- a/tests/test_sklearn_permutation_importance.py +++ b/tests/test_sklearn_permutation_importance.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import pytest import numpy as np from sklearn.base import is_classifier, is_regressor From a92dab19622bb664e98465994b7a36f47e4f8fb4 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 12:53:02 +0000 Subject: [PATCH 14/37] stop using bash in tox --- _ci/runtests_default.sh | 11 ----------- _ci/runtests_default_with_crfsuite.sh | 10 ---------- _ci/runtests_extra.sh | 12 ------------ tox.ini | 28 ++++++++++++++++++++++----- 4 files changed, 23 insertions(+), 38 deletions(-) delete mode 100644 _ci/runtests_default.sh delete mode 100644 _ci/runtests_default_with_crfsuite.sh delete mode 100644 _ci/runtests_extra.sh diff --git a/_ci/runtests_default.sh b/_ci/runtests_default.sh deleted file mode 100644 index d6dbcc6d..00000000 --- a/_ci/runtests_default.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -py.test --doctest-modules \ - --ignore eli5/xgboost.py \ - --ignore eli5/lightgbm.py \ - --ignore eli5/catboost.py \ - --ignore eli5/keras \ - --ignore eli5/sklearn_crfsuite \ - --ignore eli5/formatters/image.py \ - --ignore tests/utils_image.py \ - --cov=eli5 --cov-report=html --cov-report=term "$@" diff --git a/_ci/runtests_default_with_crfsuite.sh b/_ci/runtests_default_with_crfsuite.sh deleted file mode 100644 index 48bfa4b1..00000000 --- a/_ci/runtests_default_with_crfsuite.sh +++ /dev/null @@ -1,10 +0,0 @@ 
-#!/usr/bin/env bash - -py.test --doctest-modules \ - --ignore eli5/xgboost.py \ - --ignore eli5/lightgbm.py \ - --ignore eli5/catboost.py \ - --ignore eli5/keras \ - --ignore eli5/formatters/image.py \ - --ignore tests/utils_image.py \ - --cov=eli5 --cov-report=html --cov-report=term "$@" diff --git a/_ci/runtests_extra.sh b/_ci/runtests_extra.sh deleted file mode 100644 index 6bfddbc5..00000000 --- a/_ci/runtests_extra.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -py.test --doctest-modules \ - --ignore tests/test_lime.py \ - --ignore tests/test_formatters.py \ - --ignore tests/test_samplers.py \ - --ignore tests/test_sklearn_explain_prediction.py \ - --ignore tests/test_sklearn_explain_weights.py \ - --ignore tests/test_sklearn_vectorizers.py \ - --ignore tests/test_utils.py \ - --ignore eli5/lightning.py \ - --ignore eli5/sklearn_crfsuite \ - --cov=eli5 --cov-report=html --cov-report=term "$@" diff --git a/tox.ini b/tox.ini index 66a6c71b..1a23f0b0 100644 --- a/tox.ini +++ b/tox.ini @@ -29,12 +29,20 @@ deps= ipython pandas commands= - ; to install lightning numpy must be installed first + ; to install lightning numpy and Cython (if no wheel exists) must be installed first + pip install Cython pip install joblib "sklearn-contrib-lightning >= 0.4" pip install -e . - bash _ci/runtests_default.sh {posargs: eli5 tests} - ; TODO once sklearn-crfsuite is compatible, use - ; bash _ci/runtests_default_with_crfsuite.sh {posargs: eli5 tests} + py.test --doctest-modules \ + --ignore eli5/xgboost.py \ + --ignore eli5/lightgbm.py \ + --ignore eli5/catboost.py \ + --ignore eli5/keras \ + --ignore eli5/sklearn_crfsuite \ + --ignore eli5/formatters/image.py \ + --ignore tests/utils_image.py \ + --cov=eli5 --cov-report=html --cov-report=term {posargs: eli5 tests} + ; TODO once sklearn-crfsuite is compatible, stop ignoring eli5/sklearn_crfsuite [testenv:py310-extra] @@ -51,7 +59,17 @@ deps= commands= pip install -e . 
; run tests for extra dependencies - bash _ci/runtests_extra.sh {posargs: eli5 tests} + py.test --doctest-modules \ + --ignore tests/test_lime.py \ + --ignore tests/test_formatters.py \ + --ignore tests/test_samplers.py \ + --ignore tests/test_sklearn_explain_prediction.py \ + --ignore tests/test_sklearn_explain_weights.py \ + --ignore tests/test_sklearn_vectorizers.py \ + --ignore tests/test_utils.py \ + --ignore eli5/lightning.py \ + --ignore eli5/sklearn_crfsuite \ + --cov=eli5 --cov-report=html --cov-report=term {posargs: eli5 tests} [testenv:py310-nodeps] From d19d648c46ca4bccade84065a7e40a2f9a0005e4 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 13:09:51 +0000 Subject: [PATCH 15/37] fix lightning tests, remove add_bias --- eli5/lightning.py | 3 --- tests/test_lightning.py | 8 ++------ tests/test_sklearn_explain_weights.py | 11 ++--------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/eli5/lightning.py b/eli5/lightning.py index 2f648064..643c4695 100644 --- a/eli5/lightning.py +++ b/eli5/lightning.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - from lightning.impl.base import BaseEstimator from lightning import classification, regression from sklearn.multiclass import OneVsRestClassifier diff --git a/tests/test_lightning.py b/tests/test_lightning.py index cebbb7a8..56c5a0fa 100644 --- a/tests/test_lightning.py +++ b/tests/test_lightning.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - import pytest pytest.importorskip('lightning') @@ -53,11 +50,10 @@ def test_explain_predition_classifiers_binary(newsgroups_train_binary, clf): @pytest.mark.parametrize(['clf'], _instances(_CLASSIFIERS)) def test_explain_weights_classifiers(newsgroups_train, clf): clf = OneVsRestClassifier(clf) - assert_explained_weights_linear_classifier(newsgroups_train, clf, - add_bias=True) + assert_explained_weights_linear_classifier(newsgroups_train, clf) if _CLASSIFIERS.index(type(clf.estimator)) == 0: assert_explained_weights_linear_classifier( - newsgroups_train, clf, add_bias=True, + newsgroups_train, clf, explain_weights=explain_weights_lightning) diff --git a/tests/test_sklearn_explain_weights.py b/tests/test_sklearn_explain_weights.py index 6018de63..407eb636 100644 --- a/tests/test_sklearn_explain_weights.py +++ b/tests/test_sklearn_explain_weights.py @@ -105,26 +105,19 @@ def get_result(): assert 'space' in pos pos, neg = _top('talk.religion.misc') - assert 'jesus' in pos or 'christians' in pos + assert 'jesus' in pos or 'christians' in pos or 'bible' in pos assert res == get_result() def assert_explained_weights_linear_classifier( - newsgroups_train, clf, add_bias=False, explain_weights=explain_weights, + newsgroups_train, clf, explain_weights=explain_weights, binary=False): docs, y, target_names = newsgroups_train vec = TfidfVectorizer() X = vec.fit_transform(docs) - if add_bias: - X = sp.hstack([X, np.ones((X.shape[0], 1))]) - feature_names = vec.get_feature_names_out() + ['BIAS'] - else: - feature_names = None - clf.fit(X, y) check_newsgroups_explanation_linear(clf, vec, target_names, - feature_names=feature_names, explain_weights=explain_weights, binary=binary, top=(20, 20)) From b9f1be8759678c640c9af06aa0870b8ec9c3b001 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 13:27:32 +0000 Subject: [PATCH 16/37] use a no-binary install of lightning to make sure it always builds --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini 
b/tox.ini index 1a23f0b0..5c4062a1 100644 --- a/tox.ini +++ b/tox.ini @@ -29,9 +29,9 @@ deps= ipython pandas commands= - ; to install lightning numpy and Cython (if no wheel exists) must be installed first + ; to install lightning numpy and Cython must be installed first pip install Cython - pip install joblib "sklearn-contrib-lightning >= 0.4" + pip install joblib "sklearn-contrib-lightning >= 0.4" --no-binary sklearn-contrib-lightning pip install -e . py.test --doctest-modules \ --ignore eli5/xgboost.py \ From 6b7f8f76e31edf3654529f99257e75896e8670d7 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 15:06:39 +0000 Subject: [PATCH 17/37] move lightning to extras, disable keras tests we need TF 1.x but it does not work on python 3.9+ --- .github/workflows/python-package.yml | 4 ++-- tox.ini | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 832512a1..fefa50e2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,9 +17,9 @@ jobs: fail-fast: false matrix: include: - - python-version: '3.12' + - python-version: '3.10' tox-env: 'mypy' - - python-version: '3.12' + - python-version: '3.10' tox-env: 'docs' - python-version: '3.9' tox-env: 'py39' diff --git a/tox.ini b/tox.ini index 5c4062a1..2133a713 100644 --- a/tox.ini +++ b/tox.ini @@ -28,21 +28,18 @@ deps= {[base]deps} ipython pandas + sklearn-crfsuite commands= - ; to install lightning numpy and Cython must be installed first - pip install Cython - pip install joblib "sklearn-contrib-lightning >= 0.4" --no-binary sklearn-contrib-lightning pip install -e . py.test --doctest-modules \ --ignore eli5/xgboost.py \ --ignore eli5/lightgbm.py \ --ignore eli5/catboost.py \ --ignore eli5/keras \ - --ignore eli5/sklearn_crfsuite \ + --ignore eli5/lightning.py \ --ignore eli5/formatters/image.py \ --ignore tests/utils_image.py \ --cov=eli5 --cov-report=html --cov-report=term {posargs: eli5 tests} - ; TODO once sklearn-crfsuite is compatible, stop ignoring eli5/sklearn_crfsuite [testenv:py310-extra] @@ -52,11 +49,14 @@ deps= xgboost lightgbm != 2.0.5, != 2.0.6 catboost - tensorflow - keras + # tensorflow + # keras matplotlib Pillow commands= + ; to install lightning numpy and Cython must be installed first + pip install Cython + pip install joblib "sklearn-contrib-lightning >= 0.4" --no-binary sklearn-contrib-lightning pip install -e . ; run tests for extra dependencies py.test --doctest-modules \ @@ -68,7 +68,7 @@ commands= --ignore tests/test_sklearn_vectorizers.py \ --ignore tests/test_utils.py \ --ignore eli5/lightning.py \ - --ignore eli5/sklearn_crfsuite \ + --ignore eli5/keras \ --cov=eli5 --cov-report=html --cov-report=term {posargs: eli5 tests} @@ -76,7 +76,6 @@ commands= deps= {[base]deps} commands= -; without lightning as it is optional pip install -e . 
py.test --doctest-modules \ --ignore eli5/lightning.py \ @@ -93,7 +92,7 @@ commands= [testenv:mypy] -basepython=python3.12 +basepython=python3.10 deps= {[testenv]deps} mypy == 0.750 @@ -103,6 +102,7 @@ commands= [testenv:docs] +basepython=python3.10 deps= mock==1.0.1 pillow==8.3.1 From 49effea4c48f3ee821045e81a7908929da841eba Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 17:31:12 +0000 Subject: [PATCH 18/37] try to fix lightning build on CI --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 2133a713..098d97b2 100644 --- a/tox.ini +++ b/tox.ini @@ -55,7 +55,7 @@ deps= Pillow commands= ; to install lightning numpy and Cython must be installed first - pip install Cython + pip install Cython 'setuptools < 60.0' pip install joblib "sklearn-contrib-lightning >= 0.4" --no-binary sklearn-contrib-lightning pip install -e . ; run tests for extra dependencies From 7f915b7b88f42ecd2c5c7d04db7a2d72ed3d93eb Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 17:33:24 +0000 Subject: [PATCH 19/37] fix lightgbm tests --- tests/test_lightgbm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lightgbm.py b/tests/test_lightgbm.py index da98951e..94091692 100644 --- a/tests/test_lightgbm.py +++ b/tests/test_lightgbm.py @@ -92,7 +92,7 @@ def test_explain_prediction_clf_multitarget(newsgroups_train): docs, ys, target_names = newsgroups_train from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS stop_words = set(ENGLISH_STOP_WORDS) | {'does', 'just'} - vec = CountVectorizer(stop_words=stop_words, dtype=np.float64) + vec = CountVectorizer(stop_words=list(stop_words), dtype=np.float64) xs = vec.fit_transform(docs) clf = LGBMClassifier(n_estimators=100, max_depth=2, min_child_samples=1, min_child_weight=1, From e36149f178365fec76e15dec72d839b267b96f8f Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 17:53:07 +0000 Subject: [PATCH 20/37] update type annotations for xgboost and lightgbm --- eli5/lightgbm.py | 9 +++------ eli5/xgboost.py | 33 +++++++++++++-------------------- tests/test_xgboost.py | 3 --- 3 files changed, 16 insertions(+), 29 deletions(-) diff --git a/eli5/lightgbm.py b/eli5/lightgbm.py index d3b886b5..b212b425 100644 --- a/eli5/lightgbm.py +++ b/eli5/lightgbm.py @@ -1,7 +1,5 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, division from collections import defaultdict -from typing import DefaultDict, Any, Tuple, Optional +from typing import DefaultDict, Optional import numpy as np import lightgbm @@ -204,8 +202,7 @@ def get_score_weights(_label_id): ) -def _check_booster_args(lgb, is_regression=None): - # type: (Any, Optional[bool]) -> Tuple[lightgbm.Booster, Optional[bool]] +def _check_booster_args(lgb, is_regression: Optional[bool] = None) -> tuple[lightgbm.Booster, Optional[bool]]: if isinstance(lgb, lightgbm.Booster): booster = lgb if is_regression is None: @@ -332,7 +329,7 @@ def _get_prediction_feature_weights(booster, X, n_targets): res = [] for target in range(n_targets): - feature_weights = defaultdict(float) # type: DefaultDict[Optional[str], float] + feature_weights: DefaultDict[Optional[str], float] = defaultdict(float) for info, leaf_id in zip(tree_info[:, target], pred_leafs[:, target]): leaf_index, split_index = _get_leaf_split_indices( info['tree_structure'] diff --git a/eli5/xgboost.py b/eli5/xgboost.py index 969cccf2..86bec85c 100644 --- a/eli5/xgboost.py +++ b/eli5/xgboost.py @@ -1,8 +1,6 @@ -# -*- coding: 
utf-8 -*- -from __future__ import absolute_import from functools import partial import re -from typing import Any, Dict, List, Tuple, Optional, Pattern +from typing import Any, Optional, Pattern import numpy as np import scipy.sparse as sp @@ -41,7 +39,7 @@ def explain_weights_xgboost(xgb, target_names=None, # ignored targets=None, # ignored feature_names=None, - feature_re=None, # type: Pattern[str] + feature_re: Pattern[str] = None, feature_filter=None, importance_type='gain', ): @@ -98,11 +96,11 @@ def explain_prediction_xgboost( target_names=None, targets=None, feature_names=None, - feature_re=None, # type: Pattern[str] + feature_re: Pattern[str] = None, feature_filter=None, - vectorized=False, # type: bool - is_regression=None, # type: bool - missing=None, # type: bool + vectorized: bool = False, + is_regression: bool = None, + missing: bool = None, ): """ Return an explanation of XGBoost prediction (via scikit-learn wrapper XGBClassifier or XGBRegressor, or via xgboost.Booster) as feature weights. @@ -171,7 +169,7 @@ def explain_prediction_xgboost( if isinstance(xgb, Booster): prediction = xgb.predict(dmatrix) - n_targets = prediction.shape[-1] # type: int + n_targets: int = prediction.shape[-1] if is_regression is None: # When n_targets is 1, this can be classification too, # but it's safer to assume regression. @@ -221,8 +219,7 @@ def explain_prediction_xgboost( ) -def _check_booster_args(xgb, is_regression=None): - # type: (Any, Optional[bool]) -> Tuple[Booster, Optional[bool]] +def _check_booster_args(xgb, is_regression: Optional[bool] = None) -> tuple[Booster, Optional[bool]]: if isinstance(xgb, Booster): booster = xgb else: @@ -309,8 +306,7 @@ def _indexed_leafs(parent): return indexed -def _parent_value(children): - # type: (...) -> int +def _parent_value(children) -> int: """ Value of the parent node: a weighted sum of child values. """ covers = np.array([child['cover'] for child in children]) @@ -319,8 +315,7 @@ def _parent_value(children): return np.sum(leafs * covers) -def _xgb_n_targets(xgb): - # type: (...) -> int +def _xgb_n_targets(xgb) -> int: if isinstance(xgb, XGBClassifier): return 1 if xgb.n_classes_ == 2 else xgb.n_classes_ elif isinstance(xgb, XGBRegressor): @@ -344,13 +339,12 @@ def _xgb_feature_importances(booster, importance_type, feature_names): return all_features / all_features.sum() -def _parse_tree_dump(text_dump): - # type: (str) -> Optional[Dict[str, Any]] +def _parse_tree_dump(text_dump: str) -> Optional[dict[str, Any]]: """ Parse text tree dump (one item of a list returned by Booster.get_dump()) into json format that will be used by next XGBoost release. 
""" result = None - stack = [] # type: List[Dict] + stack: list[dict] = [] for line in text_dump.split('\n'): if line: depth, node = _parse_dump_line(line) @@ -368,8 +362,7 @@ def _parse_tree_dump(text_dump): return result -def _parse_dump_line(line): - # type: (str) -> Tuple[int, Dict[str, Any]] +def _parse_dump_line(line: str) -> tuple[int, dict[str, Any]]: branch_match = re.match( r'^(\t*)(\d+):\[([^<]+)<([^\]]+)\] ' r'yes=(\d+),no=(\d+),missing=(\d+),' diff --git a/tests/test_xgboost.py b/tests/test_xgboost.py index 477ad147..6ceedb1e 100644 --- a/tests/test_xgboost.py +++ b/tests/test_xgboost.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - import pytest import numpy as np import scipy.sparse as sp From ec87bf002c0d48f42f1ec643d72f62258609ebc3 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 17:56:22 +0000 Subject: [PATCH 21/37] fix xgboost deprecation errors --- tests/test_xgboost.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_xgboost.py b/tests/test_xgboost.py index 6ceedb1e..d2bcef08 100644 --- a/tests/test_xgboost.py +++ b/tests/test_xgboost.py @@ -42,7 +42,7 @@ def test_explain_booster(newsgroups_train): vec = CountVectorizer() X = vec.fit_transform(docs) booster = xgboost.train( - params={'objective': 'multi:softprob', 'silent': True, 'max_depth': 3, + params={'objective': 'multi:softprob', 'max_depth': 3, 'num_class': len(target_names)}, dtrain=xgboost.DMatrix(X, label=y, missing=np.nan), num_boost_round=10) @@ -72,7 +72,7 @@ def test_explain_xgboost_regressor(boston_train): def test_explain_xgboost_booster(boston_train): xs, ys, feature_names = boston_train booster = xgboost.train( - params={'objective': 'reg:linear', 'silent': True}, + params={'objective': 'reg:squarederror'}, dtrain=xgboost.DMatrix(xs, label=ys), ) res = explain_weights(booster) @@ -94,9 +94,7 @@ def test_explain_prediction_clf_binary( explain_kwargs = {} if use_booster: clf = xgboost.train( - params={'objective': 'binary:logistic', - 'silent': True, - 'max_depth': 2}, + params={'objective': 'binary:logistic', 'max_depth': 2}, dtrain=xgboost.DMatrix(xs, label=ys, missing=missing), num_boost_round=100, ) @@ -156,7 +154,6 @@ def test_explain_prediction_clf_multitarget( clf = xgboost.train( params={'objective': 'multi:softprob', 'num_class': len(target_names), - 'silent': True, 'max_depth': 2}, dtrain=xgboost.DMatrix(xs, label=ys, missing=np.nan), num_boost_round=100, @@ -245,7 +242,7 @@ def test_explain_prediction_reg(boston_train): def test_explain_prediction_reg_booster(boston_train): X, y, feature_names = boston_train booster = xgboost.train( - params={'objective': 'reg:linear', 'silent': True, 'max_depth': 2}, + params={'objective': 'reg:squarederror', 'max_depth': 2}, dtrain=xgboost.DMatrix(X, label=y), ) assert_trained_linear_regression_explained( From 482c7cee510b069f76c57b96ec8f683593488967 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 18:12:18 +0000 Subject: [PATCH 22/37] update to last mypy version --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 098d97b2..df324108 100644 --- a/tox.ini +++ b/tox.ini @@ -95,7 +95,7 @@ commands= basepython=python3.10 deps= {[testenv]deps} - mypy == 0.750 + mypy == 1.15.0 lxml commands= mypy --html-report ./mypy-cov --check-untyped-defs --ignore-missing-imports eli5 From cbfa62c8de5dc9872ec47848768fa6ddc6063621 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 
18:33:09 +0000 Subject: [PATCH 23/37] remove six, more typing modernization --- eli5/_feature_names.py | 55 +++++++++---------------- eli5/formatters/as_dict.py | 30 +------------- eli5/formatters/features.py | 7 +--- eli5/formatters/text.py | 14 +------ eli5/formatters/utils.py | 69 ++++++++++++++++++------------- eli5/lime/samplers.py | 70 +++++++++++++------------------- eli5/sklearn/_span_analyzers.py | 13 +++--- setup.py | 1 - tests/test_formatters_as_dict.py | 18 +------- tests/test_formatters_utils.py | 19 ++++++++- 10 files changed, 118 insertions(+), 178 deletions(-) diff --git a/eli5/_feature_names.py b/eli5/_feature_names.py index ee734f7c..9f2658f4 100644 --- a/eli5/_feature_names.py +++ b/eli5/_feature_names.py @@ -1,9 +1,6 @@ import re -import six from typing import ( - Any, Iterable, Iterator, Tuple, Sized, List, Optional, Dict, - Union, Callable, Pattern -) + Any, Iterable, Iterator, Sized, Optional, Union, Callable, Pattern) import numpy as np import scipy.sparse as sp @@ -18,11 +15,10 @@ class FeatureNames(Sized, Iterable): """ def __init__(self, feature_names=None, - bias_name=None, # type: str - unkn_template=None, # type: str - n_features=None, # type: int + bias_name: Optional[str] = None, + unkn_template: Optional[str] = None, + n_features: Optional[int] = None, ): - # type: (...) -> None if not (feature_names is not None or (unkn_template is not None and n_features)): raise ValueError( @@ -39,20 +35,17 @@ def __init__(self, 'unkn_template should be set for sparse features') self.feature_names = feature_names self.unkn_template = unkn_template - self.n_features = n_features or len(feature_names) # type: int + self.n_features: int = n_features or len(feature_names) self.bias_name = bias_name - def __repr__(self): - # type: () -> str + def __repr__(self) -> str: return ''.format( self.n_features, 'with' if self.has_bias else 'without') - def __len__(self): - # type: () -> int + def __len__(self) -> int: return self.n_features + int(self.has_bias) - def __iter__(self): - # type: () -> Iterator[str] + def __iter__(self) -> Iterator[str]: return (self[i] for i in range(len(self))) def __getitem__(self, idx): @@ -69,8 +62,7 @@ def __getitem__(self, idx): return self.unkn_template % idx raise IndexError('Feature index out of range') - def _slice(self, aslice): - # type: (slice) -> Any + def _slice(self, aslice: slice): if isinstance(self.feature_names, (list, np.ndarray)): # Fast path without going through __getitem__ if self.has_bias: @@ -84,29 +76,26 @@ def _slice(self, aslice): return [self[idx] for idx in indices] @property - def has_bias(self): - # type: () -> bool + def has_bias(self) -> bool: return self.bias_name is not None @property - def bias_idx(self): - # type: () -> Optional[int] + def bias_idx(self) -> Optional[int]: if self.has_bias: return self.n_features return None - def filtered(self, feature_filter, x=None): - # type: (Callable, Any) -> Tuple[FeatureNames, List[int]] + def filtered(self, feature_filter: Callable, x=None) -> tuple['FeatureNames', list[int]]: """ Return feature names filtered by a regular expression ``feature_re``, and indices of filtered elements. 
""" indices = [] filtered_feature_names = [] - indexed_names = None # type: Optional[Iterable[Tuple[int, Any]]] + indexed_names: Optional[Iterable[tuple[int, Any]]] = None if isinstance(self.feature_names, (np.ndarray, list)): indexed_names = enumerate(self.feature_names) elif isinstance(self.feature_names, dict): - indexed_names = six.iteritems(self.feature_names) + indexed_names = self.feature_names.items() elif self.feature_names is None: indexed_names = [] assert indexed_names is not None @@ -116,8 +105,7 @@ def filtered(self, feature_filter, x=None): assert x.shape[0] == 1 flt = lambda nm, i: feature_filter(nm, x[0, i]) else: - # FIXME: mypy warns about x[i] because it thinks x can be None - flt = lambda nm, i: feature_filter(nm, x[i]) # type: ignore + flt = lambda nm, i: feature_filter(nm, x[i]) else: flt = lambda nm, i: feature_filter(nm) @@ -141,10 +129,9 @@ def filtered(self, feature_filter, x=None): def handle_filter(self, feature_filter, - feature_re, # type: Pattern[str] - x=None, # type: Any - ): - # type: (...) -> Tuple[FeatureNames, Union[List[int], None]] + feature_re: Pattern[str], + x=None, + ) -> tuple['FeatureNames', Union[list[int], None]]: if feature_re is not None and feature_filter: raise ValueError('pass either feature_filter or feature_re') if feature_re is not None: @@ -158,8 +145,7 @@ def handle_filter(self, else: return self, None - def add_feature(self, feature): - # type: (Any) -> int + def add_feature(self, feature) -> int: """ Add a new feature name, return it's index. """ # A copy of self.feature_names is always made, because it might be @@ -179,8 +165,7 @@ def add_feature(self, feature): return idx -def _all_feature_names(name): - # type: (Union[str, bytes, List[Dict]]) -> List[str] +def _all_feature_names(name: Union[str, bytes, list[dict]]) -> list[str]: """ All feature names for a feature: usually just the feature itself, but can be several features for unhashed features with collisions. """ diff --git a/eli5/formatters/as_dict.py b/eli5/formatters/as_dict.py index fbad5ee5..880c457f 100644 --- a/eli5/formatters/as_dict.py +++ b/eli5/formatters/as_dict.py @@ -1,38 +1,12 @@ -import six - import attr -import numpy as np -from .features import FormattedFeatureName +from .utils import numpy_to_python def format_as_dict(explanation): """ Return a dictionary representing the explanation that can be JSON-encoded. It accepts parts of explanation (for example feature weights) as well. """ - return _numpy_to_python(attr.asdict(explanation)) - + return numpy_to_python(attr.asdict(explanation)) -_numpy_string_types = (np.string_, np.unicode_) if six.PY2 else np.str_ - -def _numpy_to_python(obj): - """ Convert an nested dict/list/tuple that might contain numpy objects - to their python equivalents. Return converted object. 
- """ - if isinstance(obj, dict): - return {k: _numpy_to_python(v) for k, v in obj.items()} - elif isinstance(obj, (list, tuple, np.ndarray)): - return [_numpy_to_python(x) for x in obj] - elif isinstance(obj, FormattedFeatureName): - return obj.value - elif isinstance(obj, _numpy_string_types): - return six.text_type(obj) - elif hasattr(obj, 'dtype') and np.isscalar(obj): - if np.issubdtype(obj, np.floating): - return float(obj) - elif np.issubdtype(obj, np.integer): - return int(obj) - elif np.issubdtype(obj, np.bool_): - return bool(obj) - return obj diff --git a/eli5/formatters/features.py b/eli5/formatters/features.py index 021428c1..6750489e 100644 --- a/eli5/formatters/features.py +++ b/eli5/formatters/features.py @@ -1,13 +1,8 @@ -# -*- coding: utf-8 -*- - -import six - - class FormattedFeatureName(object): """ Feature name that does not need any additional escaping. """ def __init__(self, value): - if not isinstance(value, six.string_types): + if not isinstance(value, str): raise TypeError('"value" must be a string, got {} instead' .format(type(value))) self.value = value diff --git a/eli5/formatters/text.py b/eli5/formatters/text.py index f2e631cd..902b7a8d 100644 --- a/eli5/formatters/text.py +++ b/eli5/formatters/text.py @@ -2,15 +2,13 @@ from tabulate import tabulate from typing import Optional, Iterator -import numpy as np - from eli5.base import Explanation, FeatureImportances from . import fields from .features import FormattedFeatureName from .utils import ( format_signed, format_value, format_weight, has_any_values_for_weights, replace_spaces, should_highlight_spaces) -from .utils import tabulate as eli5_tabulate +from .utils import tabulate as eli5_tabulate, numpy_to_python from .trees import tree2text @@ -174,7 +172,7 @@ def _targets_lines(explanation: Explanation, header = "%s%r%s top features" % ( 'y=' if not explanation.is_regression else '', - _np_to_native(target.target), + numpy_to_python(target.target), scores) lines.append(header) @@ -265,11 +263,3 @@ def _format_unhashed_feature(name: list, hl_spaces: bool, sep=' | ') -> str: return sep.join( format_signed(n, _format_single_feature, hl_spaces=hl_spaces) for n in name) - - -def _np_to_native(value) -> str: - if isinstance(value, np.integer): - value = int(value) - elif isinstance(value, np.str_): - value = str(value) - return value diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py index 2e6d2d39..12ff6883 100644 --- a/eli5/formatters/utils.py +++ b/eli5/formatters/utils.py @@ -1,8 +1,6 @@ -from __future__ import absolute_import from itertools import chain import re -import six -from typing import Any, Union, List, Dict, Callable, Match, Optional +from typing import Any, Union, Callable, Match, Optional import numpy as np @@ -10,8 +8,7 @@ from .features import FormattedFeatureName -def replace_spaces(s, replacer): - # type: (str, Callable[[int, str], str]) -> str +def replace_spaces(s: str, replacer: Callable[[int, str], str]) -> str: """ >>> replace_spaces('ab', lambda n, l: '_' * n) 'ab' @@ -24,8 +21,7 @@ def replace_spaces(s, replacer): >>> replace_spaces(' a b ', lambda n, _: '0 0' * n) '0 0a0 0b0 00 0' """ - def replace(m): - # type: (Match[str]) -> str + def replace(m: Match[str]) -> str: if m.start() == 0: side = 'left' elif m.end() == len(s): @@ -37,11 +33,10 @@ def replace(m): return re.sub(r'[ ]+', replace, s) -def format_signed(feature, # type: Dict[str, Any] - formatter=None, # type: Callable[..., str] +def format_signed(feature: dict[str, Any], + formatter: Callable[..., str]=None, 
**kwargs - ): - # type: (...) -> str + ) -> str: """ Format unhashed feature with sign. @@ -53,14 +48,13 @@ def format_signed(feature, # type: Dict[str, Any] '(-)" foo"' """ txt = '' if feature['sign'] > 0 else '(-)' - name = feature['name'] # type: str + name: str = feature['name'] if formatter is not None: name = formatter(name, **kwargs) return '{}{}'.format(txt, name) -def should_highlight_spaces(explanation): - # type: (Explanation) -> bool +def should_highlight_spaces(explanation: Explanation) -> bool: hl_spaces = bool(explanation.highlight_spaces) if explanation.feature_importances: hl_spaces = hl_spaces or any( @@ -75,8 +69,7 @@ def should_highlight_spaces(explanation): return hl_spaces -def _has_invisible_spaces(name): - # type: (Union[str, List[Dict], FormattedFeatureName]) -> bool +def _has_invisible_spaces(name: Union[str, list[dict], FormattedFeatureName]) -> bool: if isinstance(name, FormattedFeatureName): return False elif isinstance(name, list): @@ -85,8 +78,7 @@ def _has_invisible_spaces(name): return name.startswith(' ') or name.endswith(' ') -def has_any_values_for_weights(explanation): - # type: (Explanation) -> bool +def has_any_values_for_weights(explanation: Explanation) -> bool: if explanation.targets: return any(fw.value is not None for t in explanation.targets @@ -97,11 +89,10 @@ def has_any_values_for_weights(explanation): return False -def tabulate(data, # type: List[List[Any]] - header=None, # type: Optional[List[Any]] - col_align=None, # type: Union[str, List[str]] - ): - # type: (...) -> List[str] +def tabulate(data: list[list[Any]], + header: Optional[list[Any]] = None, + col_align: Union[str, list[str]] = None, + ) -> list[str]: """ Format data as a table without any fancy features. col_align: l/r/c or a list/string of l/r/c. l = left, r = right, c = center Return a list of strings (lines of the table). @@ -118,7 +109,7 @@ def tabulate(data, # type: List[List[Any]] if col_align is None: col_align = ['l'] * n_cols - elif isinstance(col_align, six.string_types) and len(col_align) == 1: + elif isinstance(col_align, str) and len(col_align) == 1: col_align = [col_align] * n_cols else: col_align = list(col_align) @@ -130,7 +121,7 @@ def tabulate(data, # type: List[List[Any]] if header: data = [header] + data - data = [[six.text_type(x) for x in row] for row in data] + data = [[str(x) for x in row] for row in data] col_width = [max(len(row[col_i]) for row in data) for col_i in range(n_cols)] if header: data.insert(1, ['-' * width for width in col_width]) @@ -141,16 +132,36 @@ def tabulate(data, # type: List[List[Any]] return [line_tpl.format(*row) for row in data] -def format_weight(value): - # type: (float) -> str +def format_weight(value: float) -> str: return '{:+.3f}'.format(value) -def format_value(value): - # type: (Optional[float]) -> str +def format_value(value: Optional[float]) -> str: if value is None: return '' elif np.isnan(value): return 'Missing' else: return '{:.3f}'.format(value) + + +def numpy_to_python(obj): + """ Convert an nested dict/list/tuple that might contain numpy objects + to their python equivalents. Return converted object. 
+ """ + if isinstance(obj, dict): + return {k: numpy_to_python(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple, np.ndarray)): + return [numpy_to_python(x) for x in obj] + elif isinstance(obj, FormattedFeatureName): + return obj.value + elif isinstance(obj, np.str_): + return str(obj) + elif hasattr(obj, 'dtype') and np.isscalar(obj): + if np.issubdtype(obj, np.floating): + return float(obj) + elif np.issubdtype(obj, np.integer): + return int(obj) + elif np.issubdtype(obj, np.bool_): + return bool(obj) + return obj diff --git a/eli5/lime/samplers.py b/eli5/lime/samplers.py index 2475f883..24268823 100644 --- a/eli5/lime/samplers.py +++ b/eli5/lime/samplers.py @@ -1,9 +1,6 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import abc from functools import partial -from typing import List, Tuple, Any, Union, Dict, Optional -import six +from typing import Any, Union, Optional import numpy as np from sklearn.base import BaseEstimator, clone @@ -17,8 +14,7 @@ from .textutils import generate_samples, DEFAULT_TOKEN_PATTERN, TokenizedText -@six.add_metaclass(abc.ABCMeta) -class BaseSampler(BaseEstimator): +class BaseSampler(BaseEstimator, metaclass=abc.ABCMeta): """ Base sampler class. Sampler is an object which generates examples similar to a given example. @@ -67,15 +63,14 @@ class MaskingTextSampler(BaseSampler): Default is 1, meaning individual tokens are replaced. """ def __init__(self, - token_pattern=None, # type: Optional[str] - bow=True, # type: bool + token_pattern: Optional[str] = None, + bow: bool = True, random_state=None, - replacement='', # type: str - min_replace=1, # type: Union[int, float] - max_replace=1.0, # type: Union[int, float] - group_size=1, # type: int + replacement: str = '', + min_replace: Union[int, float] = 1, + max_replace: Union[int, float] = 1.0, + group_size: int = 1, ): - # type: (...) -> None self.token_pattern = token_pattern or DEFAULT_TOKEN_PATTERN self.bow = bow self.random_state = random_state @@ -85,18 +80,17 @@ def __init__(self, self.group_size = group_size self.rng_ = check_random_state(self.random_state) - def sample_near(self, doc, n_samples=1): - # type: (str, int) -> Tuple[List[str], np.ndarray] + def sample_near(self, doc: str, n_samples: int = 1) -> tuple[list[str], np.ndarray]: docs, similarities, mask, text = self.sample_near_with_mask( doc=doc, n_samples=n_samples ) return docs, similarities - def sample_near_with_mask(self, - doc, # type: Union[TokenizedText, str] - n_samples=1 # type: int - ): - # type: (...) -> Tuple[List[str], np.ndarray, np.ndarray, TokenizedText] + def sample_near_with_mask( + self, + doc: Union[TokenizedText, str], + n_samples: int = 1, + ) -> tuple[list[str], np.ndarray, np.ndarray, TokenizedText]: if not isinstance(doc, TokenizedText): doc = TokenizedText(doc, token_pattern=self.token_pattern) @@ -125,12 +119,11 @@ class MaskingTextSamplers(BaseSampler): with :class:`MaskingTextSampler` paremeters. """ def __init__(self, - sampler_params, # type: List[Dict[str, Any]] - token_pattern=None, # type: Optional[str] + sampler_params: list[dict[str, Any]], + token_pattern: Optional[str] = None, random_state=None, - weights=None, # type: Union[np.ndarray, List[float]] + weights: Union[np.ndarray, list[float]] = None, ): - # type: (...) 
-> None self.random_state = random_state self.rng_ = check_random_state(random_state) self.token_pattern = token_pattern @@ -141,19 +134,17 @@ def __init__(self, self.weights = np.array(weights) self.weights /= self.weights.sum() - def _create_sampler(self, extra): - # type: (Dict) -> MaskingTextSampler - params = dict( + def _create_sampler(self, extra: dict) -> MaskingTextSampler: + params: dict[str, Any] = dict( token_pattern=self.token_pattern, random_state=self.rng_, - ) # type: Dict[str, Any] + ) params.update(extra) return MaskingTextSampler(**params) - def sample_near(self, doc, n_samples=1): - # type: (str, int) -> Tuple[List[str], np.ndarray] + def sample_near(self, doc: str, n_samples: int = 1) -> tuple[list[str], np.ndarray]: assert n_samples >= 1 - all_docs = [] # type: List[str] + all_docs: list[str] = [] # type similarities = [] for sampler, freq in self._sampler_n_samples(n_samples): docs, sims = sampler.sample_near(doc, n_samples=freq) @@ -161,15 +152,13 @@ def sample_near(self, doc, n_samples=1): similarities.append(sims) return all_docs, np.hstack(similarities) - def sample_near_with_mask(self, - doc, # type: str - n_samples=1 # type: int - ): - # type: (...) -> Tuple[List[str], np.ndarray, np.ndarray, TokenizedText] + def sample_near_with_mask( + self, doc: str, n_samples: int = 1, + ) -> tuple[list[str], np.ndarray, np.ndarray, TokenizedText]: assert n_samples >= 1 assert self.token_pattern is not None text = TokenizedText(doc, token_pattern=self.token_pattern) - all_docs = [] # type: List[str] + all_docs: list[str] = [] similarities = [] masks = [] for sampler, freq in self._sampler_n_samples(n_samples): @@ -222,8 +211,7 @@ def _get_grid(self): return GridSearchCV(self.kde, param_grid=param_grid, n_jobs=self.n_jobs, cv=cv) - def _fit_kde(self, kde, X): - # type: (KernelDensity, np.ndarray) -> Tuple[GridSearchCV, KernelDensity] + def _fit_kde(self, kde: KernelDensity, X: np.ndarray) -> tuple[GridSearchCV, KernelDensity]: if self.fit_bandwidth: grid = self._get_grid() grid.fit(X) @@ -281,8 +269,8 @@ class UnivariateKernelDensitySampler(_BaseKernelDensitySampler): of the features instead of generating totally new examples. 
""" def fit(self, X, y=None): - self.kdes_ = [] # type: List[KernelDensity] - self.grids_ = [] # type: List[GridSearchCV] + self.kdes_: list[KernelDensity] = [] + self.grids_: list[GridSearchCV] = [] num_features = X.shape[-1] for i in range(num_features): grid, kde = self._fit_kde(self.kde, X[:, i].reshape(-1, 1)) diff --git a/eli5/sklearn/_span_analyzers.py b/eli5/sklearn/_span_analyzers.py index 75673440..2e4a594b 100644 --- a/eli5/sklearn/_span_analyzers.py +++ b/eli5/sklearn/_span_analyzers.py @@ -1,7 +1,4 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import re -from six.moves import xrange def build_span_analyzer(document, vec): @@ -49,9 +46,9 @@ def _word_ngrams(vec, tokens, stop_words=None): original_tokens = tokens tokens = [] n_original_tokens = len(original_tokens) - for n in xrange(min_n, + for n in range(min_n, min(max_n + 1, n_original_tokens + 1)): - for i in xrange(n_original_tokens - n + 1): + for i in range(n_original_tokens - n + 1): ngram_tokens = original_tokens[i: i + n] tokens.append(( [s for s, _ in ngram_tokens], @@ -65,8 +62,8 @@ def _char_ngrams(vec, text_document): text_len = len(text_document) ngrams = [] min_n, max_n = vec.ngram_range - for n in xrange(min_n, min(max_n + 1, text_len + 1)): - for i in xrange(text_len - n + 1): + for n in range(min_n, min(max_n + 1, text_len + 1)): + for i in range(text_len - n + 1): ngrams.append(([(i, i + n)], text_document[i: i + n])) return ngrams @@ -81,7 +78,7 @@ def _char_wb_ngrams(vec, text_document): w = m.group(0) w = ' ' + w + ' ' w_len = len(w) - for n in xrange(min_n, max_n + 1): + for n in range(min_n, max_n + 1): offset = 0 ngrams.append(( [(w_start + offset - 1, w_start + offset + n - 1)], diff --git a/setup.py b/setup.py index 70e1e826..bbdd2877 100755 --- a/setup.py +++ b/setup.py @@ -35,7 +35,6 @@ def get_long_description(): 'jinja2 >= 3.0.0', 'numpy >= 1.9.0', 'scipy', - 'six', 'scikit-learn >= 1.6.0', 'graphviz', 'tabulate>=0.7.7', diff --git a/tests/test_formatters_as_dict.py b/tests/test_formatters_as_dict.py index 3658719b..c5ce6fca 100644 --- a/tests/test_formatters_as_dict.py +++ b/tests/test_formatters_as_dict.py @@ -1,29 +1,13 @@ -import json - import numpy as np from eli5.base import ( Explanation, TargetExplanation, FeatureWeights, FeatureWeight) -from eli5.formatters.as_dict import format_as_dict, _numpy_to_python +from eli5.formatters.as_dict import format_as_dict # format_as_dict is called in eli5.tests.utils.format_as_all -def test_numpy_to_python(): - x = _numpy_to_python({ - 'x': np.int32(12), - 'y': [np.ones(2)], - 'z': {'inner': np.bool_(False)}, - }) - assert x == { - 'x': 12, - 'y': [[1.0, 1.0]], - 'z': {'inner': False}, - } - json.dumps(x) - - def test_format_as_dict(): assert format_as_dict(Explanation( estimator='some estimator', diff --git a/tests/test_formatters_utils.py b/tests/test_formatters_utils.py index 2a974e8a..f9e466ea 100644 --- a/tests/test_formatters_utils.py +++ b/tests/test_formatters_utils.py @@ -1,6 +1,9 @@ +import json + +import numpy as np import pytest -from eli5.formatters.utils import tabulate, format_value +from eli5.formatters.utils import tabulate, format_value, numpy_to_python def test_tabulate(): @@ -53,3 +56,17 @@ def test_format_value(): assert format_value(float('nan')) == 'Missing' assert format_value(12.23333334) == '12.233' assert format_value(-12.23333334) == '-12.233' + + +def test_numpy_to_python(): + x = numpy_to_python({ + 'x': np.int32(12), + 'y': [np.ones(2)], + 'z': {'inner': np.bool_(False)}, + }) + assert x == { + 'x': 
12, + 'y': [[1.0, 1.0]], + 'z': {'inner': False}, + } + json.dumps(x) From e9cf301918e70b2950a707798166465bdfccc792 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 19:59:24 +0000 Subject: [PATCH 24/37] mypy fixes --- eli5/_feature_names.py | 1 + eli5/formatters/utils.py | 4 ++-- eli5/lime/lime.py | 6 +++--- eli5/lime/samplers.py | 2 +- eli5/sklearn/unhashing.py | 4 ++-- eli5/sklearn/utils.py | 6 ++++-- eli5/xgboost.py | 8 ++++---- tox.ini | 1 + 8 files changed, 18 insertions(+), 14 deletions(-) diff --git a/eli5/_feature_names.py b/eli5/_feature_names.py index 9f2658f4..c4808db3 100644 --- a/eli5/_feature_names.py +++ b/eli5/_feature_names.py @@ -65,6 +65,7 @@ def __getitem__(self, idx): def _slice(self, aslice: slice): if isinstance(self.feature_names, (list, np.ndarray)): # Fast path without going through __getitem__ + lst: Union[list, np.ndarray] if self.has_bias: lst = list(self.feature_names) lst.append(self.bias_name) diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py index 12ff6883..bff5a001 100644 --- a/eli5/formatters/utils.py +++ b/eli5/formatters/utils.py @@ -34,7 +34,7 @@ def replace(m: Match[str]) -> str: def format_signed(feature: dict[str, Any], - formatter: Callable[..., str]=None, + formatter: Optional[Callable[..., str]]=None, **kwargs ) -> str: """ @@ -91,7 +91,7 @@ def has_any_values_for_weights(explanation: Explanation) -> bool: def tabulate(data: list[list[Any]], header: Optional[list[Any]] = None, - col_align: Union[str, list[str]] = None, + col_align: Optional[Union[str, list[str]]] = None, ) -> list[str]: """ Format data as a table without any fancy features. col_align: l/r/c or a list/string of l/r/c. l = left, r = right, c = center diff --git a/eli5/lime/lime.py b/eli5/lime/lime.py index 6bbf6c38..97cfe1ca 100644 --- a/eli5/lime/lime.py +++ b/eli5/lime/lime.py @@ -137,12 +137,12 @@ class TextExplainer(BaseEstimator): """ def __init__(self, n_samples: int = 5000, - char_based: bool = None, + char_based: Optional[bool] = None, clf=None, vec=None, - sampler: BaseSampler = None, + sampler: Optional[BaseSampler] = None, position_dependent: bool = False, - rbf_sigma: float = None, + rbf_sigma: Optional[float] = None, random_state=None, expand_factor: Optional[int] = 10, token_pattern: Optional[str] = None, diff --git a/eli5/lime/samplers.py b/eli5/lime/samplers.py index 24268823..41b01c21 100644 --- a/eli5/lime/samplers.py +++ b/eli5/lime/samplers.py @@ -122,7 +122,7 @@ def __init__(self, sampler_params: list[dict[str, Any]], token_pattern: Optional[str] = None, random_state=None, - weights: Union[np.ndarray, list[float]] = None, + weights: Optional[Union[np.ndarray, list[float]]] = None, ): self.random_state = random_state self.rng_ = check_random_state(random_state) diff --git a/eli5/sklearn/unhashing.py b/eli5/sklearn/unhashing.py index 6b1392de..4c08827c 100644 --- a/eli5/sklearn/unhashing.py +++ b/eli5/sklearn/unhashing.py @@ -119,7 +119,7 @@ def __init__(self, hasher: FeatureHasher, unkn_template="FEATURE[%d]"): self.n_features: int = self.hasher.n_features self.unkn_template = unkn_template self._attributes_dirty = True - self._term_counts = Counter() + self._term_counts: Counter[str] = Counter() def fit(self, X: Iterable[str], y=None) -> 'FeatureUnhasher': self._term_counts.clear() @@ -144,7 +144,7 @@ def get_feature_names_out(self, always_signed=True, always_positive=False) -> Fe for name in names] else: if not always_signed and _invert_signs(signs): - signs = [-sign for sign in signs] + signs = -signs 
feature_names[col_id] = [{'name': name, 'sign': sign} for name, sign in zip(names, signs)] return FeatureNames( diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py index 20ca8f4e..20a5ca71 100644 --- a/eli5/sklearn/utils.py +++ b/eli5/sklearn/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Optional +from typing import Any, Optional, Union import numpy as np import scipy.sparse as sp @@ -59,7 +59,7 @@ def has_intercept(estimator) -> bool: if estimator.intercept_ is None: return False # scikit-learn sets intercept to zero vector if it is not fit - return np.any(estimator.intercept_) + return bool(np.any(estimator.intercept_)) return False @@ -224,6 +224,7 @@ def get_num_features(estimator): def get_X(doc, vec=None, vectorized=False, to_dense=False): + X: Union[np.ndarray, sp._base._spbase] if vec is None or vectorized: if isinstance(doc, np.ndarray): X = np.array([doc]) @@ -235,6 +236,7 @@ def get_X(doc, vec=None, vectorized=False, to_dense=False): else: X = vec.transform([doc]) if to_dense and sp.issparse(X): + assert isinstance(X, sp._base._spbase) X = X.toarray() return X diff --git a/eli5/xgboost.py b/eli5/xgboost.py index 86bec85c..06847ca3 100644 --- a/eli5/xgboost.py +++ b/eli5/xgboost.py @@ -39,7 +39,7 @@ def explain_weights_xgboost(xgb, target_names=None, # ignored targets=None, # ignored feature_names=None, - feature_re: Pattern[str] = None, + feature_re: Optional[Pattern[str]] = None, feature_filter=None, importance_type='gain', ): @@ -96,11 +96,11 @@ def explain_prediction_xgboost( target_names=None, targets=None, feature_names=None, - feature_re: Pattern[str] = None, + feature_re: Optional[Pattern[str]] = None, feature_filter=None, vectorized: bool = False, - is_regression: bool = None, - missing: bool = None, + is_regression: Optional[bool] = None, + missing: Optional[bool] = None, ): """ Return an explanation of XGBoost prediction (via scikit-learn wrapper XGBClassifier or XGBRegressor, or via xgboost.Booster) as feature weights. diff --git a/tox.ini b/tox.ini index df324108..a092ae6c 100644 --- a/tox.ini +++ b/tox.ini @@ -96,6 +96,7 @@ basepython=python3.10 deps= {[testenv]deps} mypy == 1.15.0 + types-tabulate lxml commands= mypy --html-report ./mypy-cov --check-untyped-defs --ignore-missing-imports eli5 From 87d5fea611355dff5eab2ac921d0b5457f830b9f Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 20:09:12 +0000 Subject: [PATCH 25/37] fix and modernize types in base.py --- eli5/base.py | 116 ++++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 70 deletions(-) diff --git a/eli5/base.py b/eli5/base.py index 3bac3b5b..bb072499 100644 --- a/eli5/base.py +++ b/eli5/base.py @@ -1,5 +1,4 @@ -# -*- coding: utf-8 -*- -from typing import Any, List, Tuple, Union, Optional +from typing import Union, Optional import numpy as np @@ -17,19 +16,18 @@ class Explanation(object): it can either explain weights or a single prediction. 
""" def __init__(self, - estimator, # type: str - description=None, # type: Optional[str] - error=None, # type: Optional[str] - method=None, # type: Optional[str] - is_regression=False, # type: bool - targets=None, # type: Optional[List[TargetExplanation]] - feature_importances=None, # type: Optional[FeatureImportances] - decision_tree=None, # type: Optional[TreeInfo] - highlight_spaces=None, # type: Optional[bool] - transition_features=None, # type: Optional[TransitionFeatureWeights] - image=None, # type: Any + estimator: str, + description: Optional[str] = None, + error: Optional[str] = None, + method: Optional[str] = None, + is_regression: bool = False, + targets: Optional[list['TargetExplanation']] = None, + feature_importances: Optional['FeatureImportances'] = None, + decision_tree: Optional['TreeInfo'] = None, + highlight_spaces: Optional[bool] = None, + transition_features: Optional['TransitionFeatureWeights'] = None, + image=None, ): - # type: (...) -> None self.estimator = estimator self.description = description self.error = error @@ -55,9 +53,8 @@ class FeatureImportances(object): """ Feature importances with number of remaining non-zero features. """ def __init__(self, importances, remaining): - # type: (...) -> None - self.importances = importances # type: List[FeatureWeight] - self.remaining = remaining # type: int + self.importances: list[FeatureWeight] = importances + self.remaining: int = remaining @classmethod def from_names_values(cls, names, values, std=None, **kwargs): @@ -75,14 +72,13 @@ class TargetExplanation(object): Spatial values are stored in the :heatmap: attribute. """ def __init__(self, - target, # type: Union[str, int] - feature_weights=None, # type: Optional[FeatureWeights] - proba=None, # type: Optional[float] - score=None, # type: Optional[float] - weighted_spans=None, # type: Optional[WeightedSpans] - heatmap=None, # type: Optional[np.ndarray] + target: Union[str, int], + feature_weights: Optional['FeatureWeights'] = None, + proba: Optional[float] = None, + score: Optional[float] = None, + weighted_spans: Optional['WeightedSpans'] = None, + heatmap: Optional[np.ndarray] = None, ): - # type: (...) -> None self.target = target self.feature_weights = feature_weights self.proba = proba @@ -92,7 +88,7 @@ def __init__(self, # List is currently used for unhashed features -Feature = Union[str, List, FormattedFeatureName] +Feature = Union[str, list, FormattedFeatureName] @attrs @@ -103,12 +99,11 @@ class FeatureWeights(object): :pos_remaining: and :neg_remaining: attributes. """ def __init__(self, - pos, # type: List[FeatureWeight] - neg, # type: List[FeatureWeight] - pos_remaining=0, # type: int - neg_remaining=0, # type: int + pos: list['FeatureWeight'], + neg: list['FeatureWeight'], + pos_remaining: int = 0, + neg_remaining: int = 0, ): - # type: (...) -> None self.pos = pos self.neg = neg self.pos_remaining = pos_remaining @@ -117,13 +112,7 @@ def __init__(self, @attrs class FeatureWeight(object): - def __init__(self, - feature, # type: Feature - weight, # type: float - std=None, # type: float - value=None, # type: Any - ): - # type: (...) -> None + def __init__(self, feature: Feature, weight: float, std: Optional[float] = None, value=None): self.feature = feature self.weight = weight self.std = std @@ -136,17 +125,16 @@ class WeightedSpans(object): object for each vectorizer, and other features not highlighted anywhere. 
""" def __init__(self, - docs_weighted_spans, # type: List[DocWeightedSpans] - other=None, # type: FeatureWeights + docs_weighted_spans: list['DocWeightedSpans'], + other: Optional[FeatureWeights] = None, ): - # type: (...) -> None self.docs_weighted_spans = docs_weighted_spans self.other = other -WeightedSpan = Tuple[ +WeightedSpan = tuple[ Feature, - List[Tuple[int, int]], # list of spans (start, end) for this feature + list[tuple[int, int]], # list of spans (start, end) for this feature float, # feature weight ] @@ -161,12 +149,11 @@ class DocWeightedSpans(object): and to False for word features. """ def __init__(self, - document, # type: str - spans, # type: List[WeightedSpan] - preserve_density=None, # type: bool - vec_name=None, # type: str + document: str, + spans: list[WeightedSpan], + preserve_density: Optional[bool] = None, + vec_name: Optional[str] = None, ): - # type: (...) -> None self.document = document self.spans = spans self.preserve_density = preserve_density @@ -176,11 +163,7 @@ def __init__(self, @attrs class TransitionFeatureWeights(object): """ Weights matrix for transition features. """ - def __init__(self, - class_names, # type: List[str] - coef, - ): - # type: (...) -> None + def __init__(self, class_names: list[str], coef): self.class_names = class_names self.coef = coef @@ -191,13 +174,7 @@ class TreeInfo(object): the function to measure the quality of a split, :tree: holds all nodes of the tree, and :graphviz: is the tree rendered in graphviz .dot format. """ - def __init__(self, - criterion, # type: str - tree, # type: NodeInfo - graphviz, # type: str - is_classification, # type: bool - ): - # type: (...) -> None + def __init__(self, criterion: str, tree: 'NodeInfo', graphviz: str, is_classification: bool): self.criterion = criterion self.tree = tree self.graphviz = graphviz @@ -210,20 +187,19 @@ class NodeInfo(object): Pointers to left and right children are in :left: and :right: attributes. """ def __init__(self, - id, # type: int - is_leaf, # type: bool + id: int, + is_leaf: bool, value, value_ratio, - impurity, # type: float - samples, # type: int - sample_ratio, # type: float - feature_name=None, # type: str - feature_id=None, # type: int - threshold=None, # type: float - left=None, # type: NodeInfo - right=None, # type: NodeInfo + impurity: float, + samples: int, + sample_ratio: float, + feature_name: Optional[str] = None, + feature_id: Optional[int] = None, + threshold: Optional[float] = None, + left: Optional['NodeInfo'] = None, + right: Optional['NodeInfo'] = None, ): - # type: (...) 
-> None self.id = id self.is_leaf = is_leaf self.value = value From fe07ae2f0a64be718cd8dd075c36034fc458b4ad Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 20:21:33 +0000 Subject: [PATCH 26/37] fix and modernize types in eli5/formatters/image.py --- eli5/formatters/image.py | 49 +++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/eli5/formatters/image.py b/eli5/formatters/image.py index f776b2c2..07fa2113 100644 --- a/eli5/formatters/image.py +++ b/eli5/formatters/image.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from typing import Union, Optional, Callable import numpy as np @@ -9,12 +7,12 @@ from eli5.base import Explanation -def format_as_image(expl, # type: Explanation - resampling_filter=Image.LANCZOS, # type: int - colormap=matplotlib.cm.viridis, # type: Callable[[np.ndarray], np.ndarray] - alpha_limit=0.65, # type: Optional[Union[float, int]] - ): - # type: (...) -> Image +def format_as_image( + expl: Explanation, + resampling_filter: int = Image.LANCZOS, # type: ignore + colormap: Callable[[np.ndarray], np.ndarray] = matplotlib.cm.viridis, + alpha_limit: Optional[Union[float, int]] = 0.65, + ) -> Image.Image: """format_as_image(expl, resampling_filter=Image.LANCZOS, colormap=matplotlib.cm.viridis, alpha_limit=0.65) Format a :class:`eli5.base.Explanation` object as an image. @@ -106,6 +104,7 @@ def format_as_image(expl, # type: Explanation else: assert len(expl.targets) == 1 heatmap = expl.targets[0].heatmap + assert heatmap is not None _validate_heatmap(heatmap) # The order of our operations is: 1. colorize 2. resize @@ -120,13 +119,12 @@ def format_as_image(expl, # type: Explanation # cap the intensity so that it's not too opaque when near maximum value _update_alpha(heatmap, starting_array=heatvals, alpha_limit=alpha_limit) - heatmap = expand_heatmap(heatmap, image, resampling_filter=resampling_filter) - overlay = _overlay_heatmap(heatmap, image) + heatmap_image = expand_heatmap(heatmap, image, resampling_filter=resampling_filter) + overlay = _overlay_heatmap(heatmap_image, image) return overlay -def heatmap_to_image(heatmap): - # type: (np.ndarray) -> Image +def heatmap_to_image(heatmap: np.ndarray) -> Image.Image: """ Convert the numpy array ``heatmap`` to a Pillow image. @@ -185,8 +183,7 @@ def _validate_heatmap(heatmap): 'and maximum: {}'.format(mi, ma)) -def _colorize(heatmap, colormap): - # type: (np.ndarray, Callable[[np.ndarray], np.ndarray]) -> np.ndarray +def _colorize(heatmap: np.ndarray, colormap: Callable[[np.ndarray], np.ndarray]) -> np.ndarray: """ Apply the ``colormap`` function to a grayscale rank 2 ``heatmap`` array (with float values in interval [0, 1]). 
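
For reference, a minimal usage sketch of the two public helpers touched above (illustrative only, not part of the patch; the 7x7 heatmap, blank 224x224 RGBA image, target=1 and estimator='demo' are made-up inputs, and Pillow, matplotlib and numpy are assumed to be installed — in practice the Explanation would come from eli5.explain_prediction):

    # Illustrative sketch only: a hand-built Explanation passed through the formatter.
    import numpy as np
    from PIL import Image
    from eli5.base import Explanation, TargetExplanation
    from eli5.formatters.image import format_as_image, heatmap_to_image

    heatmap = np.random.RandomState(0).rand(7, 7)       # rank-2 values in [0, 1]
    image = Image.new('RGBA', (224, 224))                # stand-in for the original input image
    expl = Explanation(
        estimator='demo',
        image=image,
        targets=[TargetExplanation(target=1, heatmap=heatmap)],
    )
    overlay = format_as_image(expl, alpha_limit=0.65)    # PIL image with the heatmap overlaid
    raw = heatmap_to_image(heatmap)                      # the bare heatmap as a small PIL image
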
@@ -196,8 +193,10 @@ def _colorize(heatmap, colormap): return heatmap -def _update_alpha(image_array, starting_array=None, alpha_limit=None): - # type: (np.ndarray, Optional[np.ndarray], Optional[Union[float, int]]) -> None +def _update_alpha( + image_array: np.ndarray, + starting_array: Optional[np.ndarray] = None, + alpha_limit: Optional[Union[float, int]] = None) -> None: """ Update the alpha channel values of an RGBA rank 3 ndarray ``image_array``, optionally creating the alpha channel from rank 2 ``starting_array``, @@ -218,8 +217,7 @@ def _update_alpha(image_array, starting_array=None, alpha_limit=None): image_array[:,:,3] = alpha -def _cap_alpha(alpha_arr, alpha_limit): - # type: (np.ndarray, Union[None, float, int]) -> np.ndarray +def _cap_alpha(alpha_arr: np.ndarray, alpha_limit: Union[None, float, int]) -> np.ndarray: """ Limit the alpha values in ``alpha_arr`` by setting the maximum alpha value to ``alpha_limit``. @@ -239,8 +237,10 @@ def _cap_alpha(alpha_arr, alpha_limit): 'got: {}'.format(alpha_limit)) -def expand_heatmap(heatmap, image, resampling_filter=Image.LANCZOS): - # type: (np.ndarray, Image, Union[None, int]) -> Image +def expand_heatmap( + heatmap: np.ndarray, image: Image.Image, + resampling_filter: Optional[int] = Image.LANCZOS, # type: ignore + ) -> Image.Image: """ Resize the ``heatmap`` image array to fit over the original ``image``, using the specified ``resampling_filter`` method. @@ -271,14 +271,11 @@ def expand_heatmap(heatmap, image, resampling_filter=Image.LANCZOS): if not isinstance(image, Image.Image): raise TypeError('image must be a PIL.Image.Image instance. ' 'Got: {}'.format(image)) - heatmap = heatmap_to_image(heatmap) spatial_dimensions = (image.width, image.height) - heatmap = heatmap.resize(spatial_dimensions, resample=resampling_filter) - return heatmap + return heatmap_to_image(heatmap).resize(spatial_dimensions, resample=resampling_filter) -def _overlay_heatmap(heatmap, image): - # type: (Image, Image) -> Image +def _overlay_heatmap(heatmap: Image.Image, image: Image.Image) -> Image.Image: """ Blend (combine) ``heatmap`` over ``image``, using alpha channel values appropriately (must have mode `RGBA`). @@ -286,4 +283,4 @@ def _overlay_heatmap(heatmap, image): """ # note that the order of alpha_composite arguments matters overlayed_image = Image.alpha_composite(image, heatmap) - return overlayed_image \ No newline at end of file + return overlayed_image From 0105bfcf444c4fa51751f8a9be0776517cd179b7 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 20:27:47 +0000 Subject: [PATCH 27/37] fix types in lime/samplers.py and sklearn_crfsuite/explain_weights.py --- eli5/lime/samplers.py | 9 +++++---- eli5/sklearn_crfsuite/explain_weights.py | 9 +++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/eli5/lime/samplers.py b/eli5/lime/samplers.py index 41b01c21..5de546da 100644 --- a/eli5/lime/samplers.py +++ b/eli5/lime/samplers.py @@ -128,6 +128,7 @@ def __init__(self, self.rng_ = check_random_state(random_state) self.token_pattern = token_pattern self.samplers = list(map(self._create_sampler, sampler_params)) + self.weights: np.ndarray if weights is None: self.weights = np.ones(len(self.samplers)) else: @@ -241,7 +242,7 @@ class MultivariateKernelDensitySampler(_BaseKernelDensitySampler): It is a problem e.g. when features have different variances (e.g. some of them are one-hot encoded and other are continuous). 
""" - def fit(self, X, y=None): + def fit(self, X=None, y=None): self.grid_, self.kde_ = self._fit_kde(self.kde, X) self._set_sigma(self.kde_.bandwidth) return self @@ -268,7 +269,7 @@ class UnivariateKernelDensitySampler(_BaseKernelDensitySampler): Also, at sampling time it replaces only random subsets of the features instead of generating totally new examples. """ - def fit(self, X, y=None): + def fit(self, X=None, y=None): self.kdes_: list[KernelDensity] = [] self.grids_: list[GridSearchCV] = [] num_features = X.shape[-1] @@ -295,8 +296,8 @@ def sample_near(self, doc, n_samples=1): kde = self.kdes_[i] new_doc[i] = kde.sample(random_state=self.rng_).ravel() samples.append(new_doc) - samples = np.asarray(samples) - return samples, self._similarity(doc, samples) + samples_array = np.asarray(samples) + return samples_array, self._similarity(doc, samples_array) def _distances(doc, samples, metric): diff --git a/eli5/sklearn_crfsuite/explain_weights.py b/eli5/sklearn_crfsuite/explain_weights.py index 6007efd7..702854ee 100644 --- a/eli5/sklearn_crfsuite/explain_weights.py +++ b/eli5/sklearn_crfsuite/explain_weights.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - import numpy as np from scipy import sparse as sp from sklearn_crfsuite import CRF @@ -30,9 +27,9 @@ def explain_weights_sklearn_crfsuite(crf, transition_coef = crf_transition_coef(crf) if feature_filter is not None or feature_re is not None: - state_feature_names, flt_indices = ( + state_feature_names_obj, flt_indices = ( FeatureNames(feature_names).handle_filter(feature_filter, feature_re)) - state_feature_names = np.array(state_feature_names.feature_names) + state_feature_names = np.array(state_feature_names_obj.feature_names) state_coef = state_coef[:, flt_indices] else: state_feature_names = feature_names @@ -57,7 +54,7 @@ def _features(label_id): for label_id, label in zip(indices, names) ], transition_features=TransitionFeatureWeights( - class_names=names, + class_names=list(names), coef=transition_coef, ), estimator=repr(crf), From e3399ad5880e770ddfeeb99ecda0bc0a6cad8d93 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 20:31:33 +0000 Subject: [PATCH 28/37] ignore type errors in formatters/utils.py --- eli5/formatters/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/eli5/formatters/utils.py b/eli5/formatters/utils.py index bff5a001..7ade569b 100644 --- a/eli5/formatters/utils.py +++ b/eli5/formatters/utils.py @@ -158,10 +158,7 @@ def numpy_to_python(obj): elif isinstance(obj, np.str_): return str(obj) elif hasattr(obj, 'dtype') and np.isscalar(obj): - if np.issubdtype(obj, np.floating): - return float(obj) - elif np.issubdtype(obj, np.integer): - return int(obj) - elif np.issubdtype(obj, np.bool_): - return bool(obj) + if np.issubdtype(obj, np.floating): return float(obj) # type: ignore + elif np.issubdtype(obj, np.integer): return int(obj) # type: ignore + elif np.issubdtype(obj, np.bool_): return bool(obj) # type: ignore return obj From 408f6f3ca743dde8546e827ff2450a485de4404b Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 20:36:42 +0000 Subject: [PATCH 29/37] fix types in lightgbm, xgboost and sklean/explain_prediction --- eli5/lightgbm.py | 3 ++- eli5/sklearn/explain_prediction.py | 4 ++-- eli5/xgboost.py | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/eli5/lightgbm.py b/eli5/lightgbm.py index b212b425..0e6e72dd 100644 --- a/eli5/lightgbm.py +++ b/eli5/lightgbm.py @@ -1,5 +1,5 
@@ from collections import defaultdict -from typing import DefaultDict, Optional +from typing import DefaultDict, Optional, Union import numpy as np import lightgbm @@ -164,6 +164,7 @@ def explain_prediction_lightgbm( if is_regression is None: raise ValueError('Please specify is_regression argument') + names: Union[list[str], np.ndarray] if is_regression: names = ['y'] elif isinstance(lgb, lightgbm.Booster): diff --git a/eli5/sklearn/explain_prediction.py b/eli5/sklearn/explain_prediction.py index 2ce8616f..4a7d0bc9 100644 --- a/eli5/sklearn/explain_prediction.py +++ b/eli5/sklearn/explain_prediction.py @@ -583,10 +583,10 @@ def _trees_feature_weights(clf, X, feature_names, num_targets): if clf.init_ == 'zero': bias_init = 0 elif is_grad_boost: - bias_init = _init_raw_predictions( + bias_init_arr = _init_raw_predictions( X, clf.init_, clf._loss, is_classifier(clf) ) - bias_init = bias_init.astype(np.float64)[0] + bias_init = bias_init_arr.astype(np.float64)[0] else: bias_init = clf.init_.predict(X)[0] feature_weights[feature_names.bias_idx] += bias_init diff --git a/eli5/xgboost.py b/eli5/xgboost.py index 06847ca3..5f0f8b6b 100644 --- a/eli5/xgboost.py +++ b/eli5/xgboost.py @@ -1,6 +1,6 @@ from functools import partial import re -from typing import Any, Optional, Pattern +from typing import Any, Optional, Pattern, Union import numpy as np import scipy.sparse as sp @@ -100,7 +100,7 @@ def explain_prediction_xgboost( feature_filter=None, vectorized: bool = False, is_regression: Optional[bool] = None, - missing: Optional[bool] = None, + missing: Optional[Any] = None, ): """ Return an explanation of XGBoost prediction (via scikit-learn wrapper XGBClassifier or XGBRegressor, or via xgboost.Booster) as feature weights. @@ -187,6 +187,7 @@ def explain_prediction_xgboost( proba = predict_proba(xgb, X) n_targets = _xgb_n_targets(xgb) + names: Union[list[str], np.ndarray] if is_regression: names = ['y'] elif isinstance(xgb, Booster): From f086924cde41ec6b237a8161fc523a5b5e1128f5 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 20:42:11 +0000 Subject: [PATCH 30/37] fix and modernize types in formatters/as_dataframe.py --- eli5/formatters/as_dataframe.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/eli5/formatters/as_dataframe.py b/eli5/formatters/as_dataframe.py index 5b801e75..4d30dcf7 100644 --- a/eli5/formatters/as_dataframe.py +++ b/eli5/formatters/as_dataframe.py @@ -1,5 +1,5 @@ from itertools import chain -from typing import Any, Dict, List, Optional +from typing import Any, Optional import warnings import pandas as pd @@ -12,8 +12,7 @@ from eli5.base_utils import singledispatch -def explain_weights_df(estimator, **kwargs): - # type: (...) -> pd.DataFrame +def explain_weights_df(estimator, **kwargs) -> pd.DataFrame: """ Explain weights and export them to ``pandas.DataFrame``. All keyword arguments are passed to :func:`eli5.explain_weights`. Weights of all features are exported by default. @@ -23,8 +22,7 @@ def explain_weights_df(estimator, **kwargs): eli5.explain_weights(estimator, **kwargs)) -def explain_weights_dfs(estimator, **kwargs): - # type: (...) -> Dict[str, pd.DataFrame] +def explain_weights_dfs(estimator, **kwargs) -> dict[str, pd.DataFrame]: """ Explain weights and export them to a dict with ``pandas.DataFrame`` values (as :func:`eli5.formatters.as_dataframe.format_as_dataframes` does). All keyword arguments are passed to :func:`eli5.explain_weights`. 
@@ -35,8 +33,7 @@ def explain_weights_dfs(estimator, **kwargs): eli5.explain_weights(estimator, **kwargs)) -def explain_prediction_df(estimator, doc, **kwargs): - # type: (...) -> pd.DataFrame +def explain_prediction_df(estimator, doc, **kwargs) -> pd.DataFrame: """ Explain prediction and export explanation to ``pandas.DataFrame`` All keyword arguments are passed to :func:`eli5.explain_prediction`. Weights of all features are exported by default. @@ -46,8 +43,7 @@ def explain_prediction_df(estimator, doc, **kwargs): eli5.explain_prediction(estimator, doc, **kwargs)) -def explain_prediction_dfs(estimator, doc, **kwargs): - # type: (...) -> Dict[str, pd.DataFrame] +def explain_prediction_dfs(estimator, doc, **kwargs) -> dict[str, pd.DataFrame]: """ Explain prediction and export explanation to a dict with ``pandas.DataFrame`` values (as :func:`eli5.formatters.as_dataframe.format_as_dataframes` does). @@ -69,8 +65,7 @@ def _set_defaults(kwargs): _EXPORTED_ATTRIBUTES = ['transition_features', 'targets', 'feature_importances'] -def format_as_dataframes(explanation): - # type: (Explanation) -> Dict[str, pd.DataFrame] +def format_as_dataframes(explanation: Explanation) -> dict[str, pd.DataFrame]: """ Export an explanation to a dictionary with ``pandas.DataFrame`` values and string keys that correspond to explanation attributes. Use this method if several dataframes can be exported from a single @@ -90,8 +85,7 @@ def format_as_dataframes(explanation): @singledispatch -def format_as_dataframe(explanation): - # type: (Explanation) -> Optional[pd.DataFrame] +def format_as_dataframe(explanation) -> Optional[pd.DataFrame]: """ Export an explanation to a single ``pandas.DataFrame``. In case several dataframes could be exported by :func:`eli5.formatters.as_dataframe.format_as_dataframes`, @@ -117,8 +111,7 @@ def format_as_dataframe(explanation): @format_as_dataframe.register(FeatureImportances) -def _feature_importances_to_df(feature_importances): - # type: (FeatureImportances) -> pd.DataFrame +def _feature_importances_to_df(feature_importances: FeatureImportances) -> pd.DataFrame: weights = feature_importances.importances df = pd.DataFrame( {'feature': [fw.feature for fw in weights], @@ -133,12 +126,11 @@ def _feature_importances_to_df(feature_importances): @format_as_dataframe.register(list) -def _targets_to_df(targets): - # type: (List[TargetExplanation]) -> pd.DataFrame +def _targets_to_df(targets: list[TargetExplanation]) -> pd.DataFrame: if targets and not isinstance(targets[0], TargetExplanation): raise ValueError('Only lists of TargetExplanation are supported') columns = ['target', 'feature', 'weight', 'std', 'value'] - df_data = {f: [] for f in columns} # type: Dict[str, List[Any]] + df_data: dict[str, list[Any]] = {f: [] for f in columns} for target in targets: assert target.feature_weights is not None for fw in chain(target.feature_weights.pos, @@ -156,8 +148,7 @@ def _targets_to_df(targets): @format_as_dataframe.register(TransitionFeatureWeights) -def _transition_features_to_df(transition_features): - # type: (TransitionFeatureWeights) -> pd.DataFrame +def _transition_features_to_df(transition_features: TransitionFeatureWeights) -> pd.DataFrame: class_names = list(transition_features.class_names) return pd.DataFrame( {'from': [f for f in class_names for _ in class_names], From 91f13daf67c955230c5be11eeba0b361235e11d3 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 21:32:45 +0000 Subject: [PATCH 31/37] some doc build fixes --- docs/requirements.txt | 4 ++-- 
docs/source/conf.py | 6 +++--- tox.ini | 16 +++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index ea3b328e..940c8e0e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,5 +4,5 @@ sphinx_rtd_theme ipython scipy numpy > 1.9.0 -scikit-learn >= 0.20 -typing +pandas +scikit-learn >= 1.6.0 diff --git a/docs/source/conf.py b/docs/source/conf.py index 8a6c5723..c4ffee72 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -54,7 +54,7 @@ def __getattr__(cls, name): 'keras.models', 'keras.layers', 'keras.preprocessing.image', - 'pandas', + # 'pandas', 'PIL', 'matplotlib', 'matplotlib.pyplot', @@ -69,7 +69,7 @@ def __getattr__(cls, name): def setup(app): # see https://github.com/snide/sphinx_rtd_theme/issues/117 - app.add_stylesheet("rtfd_overrides.css") + app.add_css_file("rtfd_overrides.css") suppress_warnings = ['image.nonlocal_uri'] @@ -123,7 +123,7 @@ def setup(app): # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'en' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/tox.ini b/tox.ini index a092ae6c..8b9c767f 100644 --- a/tox.ini +++ b/tox.ini @@ -105,15 +105,13 @@ commands= [testenv:docs] basepython=python3.10 deps= - mock==1.0.1 - pillow==8.3.1 - alabaster>=0.7,<0.8,!=0.7.5 - commonmark==0.8.1 - recommonmark==0.5.0 - sphinx<2 - sphinx-rtd-theme<0.5 - readthedocs-sphinx-ext<2.2 - docutils < 0.17.0 + mock==5.2.0 + pillow==11.1.0 + commonmark==0.9.1 + recommonmark==0.7.1 + sphinx==7.1.2 + sphinx-rtd-theme==1.3.0rc1 + readthedocs-sphinx-ext==2.2.5 -rdocs/requirements.txt changedir=docs/source commands= From ffc20a247d00e8f233d18fbd022e7ca753f2304f Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sat, 22 Mar 2025 21:34:08 +0000 Subject: [PATCH 32/37] ignore warnings for now --- tox.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 8b9c767f..f128a776 100644 --- a/tox.ini +++ b/tox.ini @@ -116,4 +116,5 @@ deps= changedir=docs/source commands= pip install -e ../.. - sphinx-build -W -b html . {envtmpdir}/html + ; TODO re-enable -W + sphinx-build -b html . 
{envtmpdir}/html From 2badc3d3cf44baccd5416f49ef6a004bba3e09d6 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sun, 23 Mar 2025 12:51:06 +0000 Subject: [PATCH 33/37] a more deterministic xor test --- eli5/xgboost.py | 2 +- tests/test_xgboost.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/eli5/xgboost.py b/eli5/xgboost.py index 5f0f8b6b..735f1d23 100644 --- a/eli5/xgboost.py +++ b/eli5/xgboost.py @@ -153,7 +153,7 @@ def explain_prediction_xgboost( xgb, doc, vec, vectorized, feature_names, num_features=len(xgb_feature_names)) if feature_names.bias_name is None: - # XGBoost estimators do not have an intercept, but here we interpret + # Some XGBoost estimators do not have an intercept, but here we interpret # them as having an intercept feature_names.bias_name = '' diff --git a/tests/test_xgboost.py b/tests/test_xgboost.py index d2bcef08..7207fc26 100644 --- a/tests/test_xgboost.py +++ b/tests/test_xgboost.py @@ -179,12 +179,14 @@ def test_explain_prediction_clf_multitarget( t.proba for t in res.targets)[-2:] -def test_explain_prediction_clf_xor(): - true_xs = [[np.random.randint(2), np.random.randint(2)] for _ in range(100)] - xs = np.array([[np.random.normal(x, 0.2), np.random.normal(y, 0.2)] +@pytest.mark.parametrize('seed', [1, 2, 3]) +def test_explain_prediction_clf_xor(seed): + rng = np.random.RandomState(seed) + true_xs = [[rng.randint(2), rng.randint(2)] for _ in range(100)] + xs = np.array([[rng.normal(x, 0.2), rng.normal(y, 0.2)] for x, y in true_xs]) ys = np.array([x == y for x, y in true_xs]) - clf = XGBClassifier(n_estimators=100, max_depth=2) + clf = XGBClassifier(n_estimators=100, max_depth=2, tree_method='exact') clf.fit(xs, ys) res = explain_prediction(clf, np.array([1, 1])) format_as_all(res, clf) From ab85a1773ec7020ffacde4963be1d03706874b7c Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sun, 23 Mar 2025 12:51:46 +0000 Subject: [PATCH 34/37] use xgboost < 2.0.0 in tests, as newer xgboost does not work correctly --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index f128a776..9ddc5479 100644 --- a/tox.ini +++ b/tox.ini @@ -46,7 +46,7 @@ commands= basepython=python3.10 deps= {[testenv]deps} - xgboost + xgboost < 2.0.0 lightgbm != 2.0.5, != 2.0.6 catboost # tensorflow From cd3f5b6e96a66de5233349244b96179bdab2aa80 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Sun, 23 Mar 2025 13:10:59 +0000 Subject: [PATCH 35/37] fix is_classifier for older xgboost --- eli5/sklearn/explain_prediction.py | 6 +++--- eli5/sklearn/permutation_importance.py | 22 ++++++---------------- eli5/sklearn/utils.py | 18 ++++++++++++++++++ eli5/utils.py | 1 - 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/eli5/sklearn/explain_prediction.py b/eli5/sklearn/explain_prediction.py index 4a7d0bc9..ec530dda 100644 --- a/eli5/sklearn/explain_prediction.py +++ b/eli5/sklearn/explain_prediction.py @@ -1,9 +1,8 @@ -# -*- coding: utf-8 -*- from functools import partial import numpy as np import scipy.sparse as sp -from sklearn.base import BaseEstimator, is_classifier +from sklearn.base import BaseEstimator from sklearn.ensemble import ( ExtraTreesClassifier, ExtraTreesRegressor, @@ -54,7 +53,7 @@ from eli5.base_utils import singledispatch from eli5.utils import ( get_target_display_names, - get_binary_target_scale_label_id + get_binary_target_scale_label_id, ) from eli5.sklearn.utils import ( add_intercept, @@ -62,6 +61,7 @@ get_default_target_names, get_X, get_X0, + is_classifier, 
     is_multiclass_classifier,
     is_multitarget_regressor,
     predict_proba,
diff --git a/eli5/sklearn/permutation_importance.py b/eli5/sklearn/permutation_importance.py
index aa448953..55f841d7 100644
--- a/eli5/sklearn/permutation_importance.py
+++ b/eli5/sklearn/permutation_importance.py
@@ -1,21 +1,14 @@
-# -*- coding: utf-8 -*-
 from functools import partial
-from typing import List

 import numpy as np
 from sklearn.model_selection import check_cv
 from sklearn.utils.metaestimators import available_if
 from sklearn.utils import check_array, check_random_state
-from sklearn.base import (
-    BaseEstimator,
-    MetaEstimatorMixin,
-    clone,
-    is_classifier
-)
+from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone
 from sklearn.metrics import check_scoring

 from eli5.permutation_importance import get_score_importances
-from eli5.sklearn.utils import pandas_available
+from eli5.sklearn.utils import pandas_available, is_classifier

 if pandas_available:
     import pandas as pd
@@ -157,7 +150,6 @@ class PermutationImportance(BaseEstimator, MetaEstimatorMixin):
     """
     def __init__(self, estimator, scoring=None, n_iter=5, random_state=None,
                  cv='prefit', refit=True):
-        # type: (...) -> None
         if isinstance(cv, str) and cv != "prefit":
             raise ValueError("Invalid cv value: {!r}".format(cv))
         self.refit = refit
@@ -174,8 +166,7 @@ def pd_scorer(model, X, y):
             return base_scorer(model, X, y)
         return pd_scorer

-    def fit(self, X, y, groups=None, **fit_params):
-        # type: (...) -> PermutationImportance
+    def fit(self, X, y, groups=None, **fit_params) -> 'PermutationImportance':
         """Compute ``feature_importances_`` attribute and optionally
         fit the base estimator.

@@ -224,8 +215,8 @@ def fit(self, X, y, groups=None, **fit_params):
     def _cv_scores_importances(self, X, y, groups=None, **fit_params):
         assert self.cv is not None
         cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
-        feature_importances = []  # type: List
-        base_scores = []  # type: List[float]
+        feature_importances: list = []
+        base_scores: list[float] = []
         weights = fit_params.pop('sample_weight', None)
         fold_fit_params = fit_params.copy()
         for train, test in cv.split(X, y, groups):
@@ -249,8 +240,7 @@ def _get_score_importances(self, score_func, X, y):
                                      random_state=self.rng_)

     @property
-    def caveats_(self):
-        # type: () -> str
+    def caveats_(self) -> str:
         if self.cv == 'prefit':
             return CAVEATS_PREFIT
         elif self.cv is None:
diff --git a/eli5/sklearn/utils.py b/eli5/sklearn/utils.py
index 20a5ca71..7493b974 100644
--- a/eli5/sklearn/utils.py
+++ b/eli5/sklearn/utils.py
@@ -2,12 +2,30 @@

 import numpy as np
 import scipy.sparse as sp
+import sklearn.base
 from sklearn.multiclass import OneVsRestClassifier

 from eli5.sklearn.unhashing import invert_hashing_and_fit, handle_hashing_vec
 from eli5._feature_names import FeatureNames


+def is_classifier(estimator):
+    try:
+        return sklearn.base.is_classifier(estimator)
+    except AttributeError:
+        # old xgboost < 2.0.0 is not compatible with new sklearn here
+        try:
+            import xgboost
+        except ImportError:
+            pass
+        else:
+            if isinstance(estimator, xgboost.XGBClassifier):
+                return True
+            elif isinstance(estimator, (xgboost.XGBRanker, xgboost.XGBRegressor)):
+                return False
+        raise
+
+
 def is_multiclass_classifier(clf) -> bool:
     """
     Return True if a classifier is multiclass or False if it is binary.
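
For reference, a rough sketch of how the fallback added above behaves (illustrative only, not part of the patch; ``is_classifier`` is the new helper in eli5.sklearn.utils, and which branch runs depends on the installed xgboost and scikit-learn versions):

    # Illustrative only: the helper defers to sklearn.base.is_classifier and,
    # when old xgboost estimators make that call raise AttributeError, it falls
    # back to isinstance checks against the xgboost wrapper classes.
    from xgboost import XGBClassifier, XGBRegressor
    from eli5.sklearn.utils import is_classifier

    assert is_classifier(XGBClassifier())
    assert not is_classifier(XGBRegressor())
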
diff --git a/eli5/utils.py b/eli5/utils.py
index f9d62595..2ff37566 100644
--- a/eli5/utils.py
+++ b/eli5/utils.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 import numpy as np
 from scipy import sparse as sp


From 11f536767ff82da040e4e1864afcfe535737c215 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Sun, 23 Mar 2025 13:16:03 +0000
Subject: [PATCH 36/37] add a warning about xgboost version support

---
 docs/source/libraries/xgboost.rst | 4 +++-
 eli5/xgboost.py                   | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/docs/source/libraries/xgboost.rst b/docs/source/libraries/xgboost.rst
index ad4384f1..c1897716 100644
--- a/docs/source/libraries/xgboost.rst
+++ b/docs/source/libraries/xgboost.rst
@@ -6,7 +6,9 @@ XGBoost
 XGBoost_ is a popular Gradient Boosting library with Python interface.
 eli5 supports :func:`eli5.explain_weights` and :func:`eli5.explain_prediction`
 for XGBClassifer_, XGBRegressor_ and Booster_ estimators. It is tested for
-xgboost >= 0.6a2.
+xgboost >= 0.6a2 and < 2.0.0.
+Versions starting from 2.0.0 likely produce incorrect results in
+:func:`eli5.explain_prediction`, and will issue a warning.

 .. _XGBoost: https://github.com/dmlc/xgboost
 .. _XGBClassifer: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
diff --git a/eli5/xgboost.py b/eli5/xgboost.py
index 735f1d23..283ac228 100644
--- a/eli5/xgboost.py
+++ b/eli5/xgboost.py
@@ -1,9 +1,11 @@
-from functools import partial
 import re
+import warnings
+from functools import partial
 from typing import Any, Optional, Pattern, Union

 import numpy as np
 import scipy.sparse as sp
+import xgboost
 from xgboost import (
     XGBClassifier,
     XGBRegressor,
@@ -147,6 +149,11 @@ def explain_prediction_xgboost(
     changes from parent to child.  Weights of all features sum to the output
     score of the estimator.
     """
+    if not xgboost.__version__.startswith(('0.', '1.')):
+        warnings.warn(
+            'This explanation might be incorrect; '
+            'only xgboost < 2.0.0 is known to work correctly')
+
     booster, is_regression = _check_booster_args(xgb, is_regression)
     xgb_feature_names = _get_booster_feature_names(booster)
     vec, feature_names = handle_vec(

From 0756b9fb479efb3f828e4a5110fa6397e65cdd02 Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Sun, 23 Mar 2025 13:20:05 +0000
Subject: [PATCH 37/37] add a warning about keras requirements to the docs

---
 docs/source/libraries/keras.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/libraries/keras.rst b/docs/source/libraries/keras.rst
index c835606a..690d2863 100644
--- a/docs/source/libraries/keras.rst
+++ b/docs/source/libraries/keras.rst
@@ -8,7 +8,7 @@ Keras_ is "a high-level neural networks API, written in Python and capable of ru
 Keras can be used for many Machine Learning tasks, and it has support
 for both popular and experimental neural network architectures.

-Note: only TensorFlow 1.x is supported, recommended Keras version is 2.3.1 or earlier.
+Note: only TensorFlow 1.x is supported; the recommended Keras version is 2.3.1 or earlier, together with eli5 0.13 or earlier, because TensorFlow 1.x cannot be installed on Python 3.9+, which eli5 0.14+ requires.

 .. _Keras: https://keras.io/
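
To close the series, a hand-rolled check of the version guard added in PATCH 36 (illustrative only, not part of any patch; it assumes xgboost and scikit-learn are installed and uses synthetic data):

    # Illustrative sketch: with xgboost >= 2.0.0 installed, explain_prediction
    # should emit the UserWarning added in eli5/xgboost.py; on 0.x/1.x it stays silent.
    import warnings
    import numpy as np
    from xgboost import XGBClassifier
    import eli5

    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    y = (X[:, 0] > 0.5).astype(int)
    clf = XGBClassifier(n_estimators=10).fit(X, y)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        eli5.explain_prediction(clf, X[0])
    print([str(w.message) for w in caught])
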