AME #655

Draft: wants to merge 16 commits into base: develop

2 changes: 1 addition & 1 deletion docs/value/owen.md
@@ -95,7 +95,7 @@ probability values $q$ between 0 and 0.5 at random and then generates two samples
for each index, one using the probability $q$ for index draws, and another with
probability $1-q$.

-!!! Example
+??? Example
```python
from pydvl.valuation import AntitheticOwenSampler, ShapleyValuation, RankCorrelation
...
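The antithetic pairing described above is easy to sketch outside the library. A minimal, self-contained illustration (not pyDVL's implementation): draw $q \in [0, 0.5]$, then keep each element of the complement of the current index with probability $q$ for the first sample and $1 - q$ for the second:

```python
import numpy as np

rng = np.random.default_rng(42)
others = np.array([1, 2, 3, 4])  # complement of the current index i

q = rng.uniform(0.0, 0.5)                          # probability for the first sample
first = others[rng.random(len(others)) < q]        # keep each element with prob. q
second = others[rng.random(len(others)) < 1 - q]   # paired sample with prob. 1 - q
```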
3 changes: 3 additions & 0 deletions src/pydvl/valuation/samplers/__init__.py
@@ -156,6 +156,8 @@ def fit(self, data: Dataset):
from typing import Union

from .base import *

from .ame import * # isort: skip
from .classwise import *
from .msr import *
from .owen import *
@@ -169,6 +171,7 @@ def fit(self, data: Dataset):
StochasticSampler = Union[
UniformSampler,
PermutationSampler,
AMESampler,
AntitheticSampler,
StratifiedSampler,
OwenSampler,
115 changes: 115 additions & 0 deletions src/pydvl/valuation/samplers/ame.py
@@ -0,0 +1,115 @@
"""
AMESampler implements a generalized Average Marginal Effect sampling scheme.

For each index $i$, it first samples a probability $q$ from a user-defined distribution
$D_q$. Then it samples a subset of the complement of $i$ by including each element
independently, following a Bernoulli distribution with parameter $q$.

This two-stage process was introduced in Lin et al. (2022)[^1] together with an
approximation using Lasso. There it was shown that, under a uniform $D_q$, the Monte
Carlo sum of marginal utilities is an unbiased estimator of the Shapley value, without
the need for a correction factor. This is essentially the same as the Owen sampling
scheme.

However, one can also use other distributions for $D_q$, such as a beta distribution. In
that case the Monte Carlo sum of marginal utilities no longer converges to the Shapley
value without a correction factor, and even with the correction, the very different shape
of the sampling distribution makes this method a poor choice for approximating Shapley
values.
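
To see why no correction factor is needed under a uniform $D_q$: with $m = n - 1$
remaining indices, the probability of drawing one particular subset of size $k$ is
$\int_0^1 q^k (1-q)^{m-k} dq = 1/\left((m+1)\binom{m}{k}\right)$, which is exactly the
Shapley coefficient $1/\left(n\binom{n-1}{k}\right)$. A quick numerical check of this
identity (a sketch, assuming only scipy and the standard library):

```python
import math
from scipy.integrate import quad

n = 10      # hypothetical dataset size
m = n - 1   # size of the complement of a fixed index i
for k in range(m + 1):
    p_subset, _ = quad(lambda q: q**k * (1 - q) ** (m - k), 0, 1)
    assert math.isclose(p_subset, 1 / (n * math.comb(m, k)), rel_tol=1e-9)
```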


## References

[^1]: Lin, Jinkun, Anqi Zhang, Mathias Lécuyer, Jinyang Li, Aurojit Panda, and
Siddhartha Sen. [Measuring the Effect of Training Data on Deep Learning Predictions
via Randomized Experiments](https://proceedings.mlr.press/v162/lin22h.html). In
Proceedings of the 39th International Conference on Machine Learning, 13468–504.
PMLR, 2022.
"""

from __future__ import annotations

from typing import Type

import numpy as np
import scipy.stats.distributions as dist
from scipy.integrate import quad
from scipy.stats import rv_continuous

from pydvl.utils.numeric import complement, random_subset
from pydvl.utils.types import Seed
from pydvl.valuation.samplers.powerset import (
IndexIteration,
PowersetSampler,
SequentialIndexIteration,
)
from pydvl.valuation.samplers.utils import StochasticSamplerMixin
from pydvl.valuation.types import IndexSetT, Sample, SampleGenerator


class AMESampler(StochasticSamplerMixin, PowersetSampler):
"""AMESampler implements the two-stage Average Marginal Effect sampling scheme.

For each index i, it first samples a probability p from `p_distribution`. Then it
samples a subset from the complement of i by including each element independently
following a Bernoulli distribution with parameter p.

Args:
        p_distribution: A scipy
            [continuous random variable][scipy.stats._distn_infrastructure.rv_continuous]
            with support in $[0, 1]$. Defaults to [scipy.stats.distributions.uniform][].
batch_size: Number of samples per batch.
index_iteration: Strategy to iterate over indices.
seed: Seed or random state.
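
    A usage sketch (the beta distribution is an arbitrary, hypothetical choice; any
    frozen scipy continuous distribution with support in $[0, 1]$ will do):

    ```python
    from scipy.stats import beta

    from pydvl.valuation.samplers import AMESampler

    # Concentrates q around 0.5; the default (uniform) recovers the
    # Shapley/Owen scheme described in the module docstring.
    sampler = AMESampler(p_distribution=beta(2, 2), seed=42)
    ```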
"""

def __init__(
self,
p_distribution: rv_continuous | None = None,
batch_size: int = 1,
index_iteration: Type[IndexIteration] = SequentialIndexIteration,
seed: Seed | None = None,
):
super().__init__(
batch_size=batch_size, index_iteration=index_iteration, seed=seed
)
self.p_distribution = p_distribution or dist.uniform

def generate(self, indices: IndexSetT) -> SampleGenerator:
"""For each index i, sample p ~ p_distribution and then generate a subset of
the complement by including each element independently with probability p.
"""
for idx in self.index_iterable(indices):
_complement = complement(indices, [idx])
p = self.p_distribution.rvs(random_state=self._rng).item()
subset = random_subset(_complement, p, self._rng)
yield Sample(idx, subset)

def sample_limit(self, indices: IndexSetT) -> int | None:
"""The sample limit is determined by the outer loop over indices."""
return self._index_iterator_cls.length(len(indices))

def log_weight(self, n: int, subset_len: int) -> float:
r"""Computes the probability of a sample of a given size.

        Let $m = n - 1$ or $m = n$, depending on the index iteration strategy. For a
        given index $i$ and a sampled subset $S$ of size $k$, the probability of
        obtaining $S$ under the two-stage scheme results from marginalizing over the
        distribution of $p$:

        $$P(S) = \int_0^1 f(p)\, p^k (1-p)^{m-k}\, dp,$$

        where $f$ is the density of `p_distribution`.

Args:
n: The size of the dataset.
            subset_len: The size of the sampled subset.

        Returns:
            The natural logarithm of $P(S)$.
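
        As a sanity check, these probabilities are correctly normalized: by the
        binomial theorem, summing $\binom{m}{k} P(S)$ over all sizes $k$ gives
        $\int_0^1 f(p) dp = 1$. A sketch of the computation (independent of pyDVL,
        assuming scipy):

        ```python
        import math
        from scipy.integrate import quad
        from scipy.stats import uniform

        m = 4  # hypothetical complement size
        total = sum(
            math.comb(m, k)
            * quad(lambda p: uniform.pdf(p) * p**k * (1 - p) ** (m - k), 0, 1)[0]
            for k in range(m + 1)
        )
        assert abs(total - 1.0) < 1e-9
        ```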
"""
m = self._index_iterator_cls.complement_size(n)
integrand = (
lambda p: self.p_distribution.pdf(p)
* (p**subset_len)
* ((1 - p) ** (m - subset_len))
)
I, abserr = quad(integrand, 0, 1, epsabs=1e-12) # noqa
        if I < 0:
            raise ValueError("Computed integral is negative.")
return float(np.log(I) if I > 0 else -np.inf)
5 changes: 5 additions & 0 deletions src/pydvl/valuation/samplers/msr.py
@@ -217,6 +217,11 @@ def log_weight(self, n: int, subset_len: int) -> float:
"""
return float(-(n - 1) * np.log(2)) if n > 0 else 0.0

def sample_limit(self, indices: IndexSetT) -> int | None:
if len(indices) == 0:
return 0
return None

def make_strategy(
self,
utility: UtilityBase,
9 changes: 4 additions & 5 deletions src/pydvl/valuation/samplers/owen.py
@@ -219,11 +219,10 @@ def log_weight(self, n: int, subset_len: int) -> float:
$$ P (| S_{q_j} | = k) = \binom{n}{k} \ q_j^k (1 - q_j)^{n - k}.$$

So, if each $q_j$ is chosen with equal weight (or more generally with
-probability $p_j$),then by total probability, the overall probability of
-obtaining a subset of size $k$ is a mixture of the binomials:
-$$
-P (| S | = k) = \sum_{j = 1}^N p_j \ \binom{n}{k} \ q_j^k (1 - q_j)^{n - k}.
-$$
+probability $p_j = P(q = q_j)$), then by total probability, the overall
+probability of obtaining a subset of size $k$ is a mixture of the binomials:
+
+$$P(|S| = k) = \sum_{j = 1}^N p_j \ \binom{n}{k} \ q_j^k (1 - q_j)^{n - k}.$$

In our case $p_j = 1/N$, so that $P(|S|=k) = \frac{1}{N} \sum_{j=1}^N P (|
S_{q_j} | = k)$. For large enough $N$ this is
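For a uniform grid of $N$ values $q_j$, the mixture above tends to
$\int_0^1 \binom{n}{k} q^k (1 - q)^{n - k} dq = 1/(n + 1)$ as $N$ grows, i.e. subset
sizes become uniformly distributed. A short numerical check (a sketch, assuming only
numpy and the standard library; the sizes are hypothetical):

```python
import math

import numpy as np

n, N = 8, 2000             # set size and grid resolution (hypothetical)
qs = np.linspace(0, 1, N)  # uniform grid of q_j, each with weight p_j = 1/N
for k in range(n + 1):
    mixture = np.mean(math.comb(n, k) * qs**k * (1 - qs) ** (n - k))
    assert abs(mixture - 1 / (n + 1)) < 1e-3
```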
5 changes: 5 additions & 0 deletions src/pydvl/valuation/samplers/permutation.py
@@ -179,6 +179,11 @@ def skip_indices(self, indices: IndexSetT):
estimates if not done carefully."""
self._skip_indices = indices

def sample_limit(self, indices: IndexSetT) -> int | None:
if len(indices) == 0:
return 0
return None

def generate(self, indices: IndexSetT) -> SampleGenerator:
"""Generates the permutation samples.

12 changes: 11 additions & 1 deletion src/pydvl/valuation/samplers/powerset.py
@@ -206,7 +206,10 @@ def complement_size(n: int) -> int:


class FiniteRandomIndexIteration(FiniteIterationMixin, RandomIndexIteration):
"""Samples indices at random, once"""
"""Samples indices at random, once, without replacement.

Each index is sampled exactly once.
"""

def __iter__(self) -> Generator[IndexT, None, None]:
if len(self._indices) == 0:
@@ -585,6 +588,9 @@ def generate(self, indices: IndexSetT) -> SampleGenerator:
subset = random_subset(complement(indices, [idx]), seed=self._rng)
yield Sample(idx, subset)

def sample_limit(self, indices: IndexSetT) -> int | None:
return self._index_iterator_cls.length(len(indices))


class AntitheticSampler(StochasticSamplerMixin, PowersetSampler):
"""A sampler that draws samples uniformly, followed by their complements.
@@ -604,3 +610,7 @@ def generate(self, indices: IndexSetT) -> SampleGenerator:
subset = random_subset(_complement, seed=self._rng)
yield Sample(idx, subset)
yield Sample(idx, complement(_complement, subset))

def sample_limit(self, indices: IndexSetT) -> int | None:
len_outer = self._index_iterator_cls.length(len(indices))
return len_outer * 2 if len_outer is not None else None
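
The factor of two in `sample_limit` mirrors `generate`: each outer index yields a
uniformly random subset of its complement, followed by the complement of that subset.
A self-contained sketch (the `complement` helper is a hypothetical stand-in for
`pydvl.utils.numeric.complement`):

```python
import numpy as np

def complement(indices, exclude):
    # Hypothetical stand-in for pydvl.utils.numeric.complement
    return np.setdiff1d(indices, np.asarray(exclude))

rng = np.random.default_rng(0)
indices = np.arange(5)
idx = 2
others = complement(indices, [idx])             # [0, 1, 3, 4]
subset = others[rng.random(len(others)) < 0.5]  # uniformly random subset
paired = complement(others, subset)             # its antithetic partner
# Two samples per index iteration, hence the limit 2 * len_outer.
```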
8 changes: 7 additions & 1 deletion src/pydvl/valuation/samplers/stratified.py
@@ -725,7 +725,13 @@ def __iter__(self) -> Generator[tuple[int, int], None, None]:


class RandomSizeIteration(SampleSizeIteration):
"""Draws a set size $k$ following the distribution of sizes given by the strategy."""
"""Draws a set size $k$ following the distribution of sizes given by the strategy.

Args:
strategy: The strategy to use for computing the number of samples to take.
n_indices: The number of indices in the index set from which samples are taken.
seed: The seed for the random number generator.
"""

def __init__(
self, strategy: SampleSizeStrategy, n_indices: int, seed: Seed | None = None
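`RandomSizeIteration` draws one subset size $k$ per sample, following the size
distribution given by the strategy. The idea in isolation (a sketch with hypothetical
size weights, not the library's strategy API):

```python
import numpy as np

rng = np.random.default_rng(0)
n_indices = 5
# Hypothetical weights over sizes k = 0, ..., n_indices:
weights = np.array([1.0, 2.0, 3.0, 2.0, 1.0, 1.0])
probs = weights / weights.sum()
k = rng.choice(n_indices + 1, p=probs)  # one random subset size
```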
4 changes: 0 additions & 4 deletions src/pydvl/valuation/samplers/utils.py
@@ -9,7 +9,6 @@
import numpy as np

from pydvl.utils.types import Seed
-from pydvl.valuation.types import IndexSetT


class StochasticSamplerMixin:
@@ -22,6 +21,3 @@ def __init__(self, *args, seed: Seed | None = None, **kwargs):
def __init__(self, *args, seed: Seed | None = None, **kwargs):
super().__init__(*args, **kwargs)
self._rng = np.random.default_rng(seed)

-    def sample_limit(self, indices: IndexSetT) -> int | None:
-        return None
1 change: 1 addition & 0 deletions src/pydvl/valuation/types.py
@@ -35,6 +35,7 @@
"ValueUpdateT",
]


IndexT: TypeAlias = np.int_
IndexSetT: TypeAlias = NDArray[IndexT]
NameT: TypeAlias = Union[np.object_, np.int_, np.str_]
11 changes: 6 additions & 5 deletions tests/valuation/methods/test_semivalues.py
@@ -66,7 +66,7 @@ def log_weight(self, n: int, j: int) -> float:
)
@pytest.mark.parametrize(
"sampler_cls, sampler_kwargs",
-    deterministic_samplers() + random_samplers(proper=True),
+    deterministic_samplers() + random_samplers(proper=True, stratified=False),
)
@pytest.mark.parametrize(
"valuation_cls, valuation_kwargs, exact_values_attr",
@@ -95,10 +95,11 @@ def test_games(
sampler_cls,
sampler_kwargs,
seed=seed,
-        # For stratified samplers:
-        lower_bound=1,
-        upper_bound=None,
-        n_samples=n_samples,  # Required for cases using FiniteSequentialSizeIteration
+        # Stratified samplers are skipped here: testing them in this suite is too
+        # slow, and they are covered elsewhere.
+        # lower_bound=1,
+        # upper_bound=None,
+        # n_samples=n_samples,  # Required for cases using FiniteSequentialSizeIteration
)
valuation = valuation_cls(
utility=test_game.u,
30 changes: 26 additions & 4 deletions tests/valuation/samplers/test_sampler.py
@@ -43,6 +43,7 @@
UniformOwenStrategy,
UniformSampler,
)
from pydvl.valuation.samplers.ame import AMESampler
from pydvl.valuation.samplers.permutation import PermutationSamplerBase
from pydvl.valuation.types import IndexSetT, Sample, SampleGenerator
from pydvl.valuation.utility.base import UtilityBase
@@ -151,6 +152,19 @@ def permutation_samplers():
]


def amesampler_samplers():
return [
(
AMESampler,
{"index_iteration": RandomIndexIteration, "seed": lambda seed: seed},
),
(
AMESampler,
{"index_iteration": SequentialIndexIteration, "seed": lambda seed: seed},
),
]


def powerset_samplers():
ret = []

@@ -291,20 +305,24 @@ def owen_samplers():
]


-def random_samplers(proper: bool = False, n_samples_per_index: int = 32):
+def random_samplers(proper: bool = False, stratified: bool = True):
"""Use this as parameter values in pytest.mark.parametrize for parameters
"sampler_cls, sampler_kwargs"

    Build the objects with recursive_make(sampler_cls, sampler_kwargs, **lambda_args),
    where each lambda argument is named after the key of the dictionary entry that
    contains the lambda.
"""
-    return (
+    ret = (
        permutation_samplers()
        + powerset_samplers()
-        + stratified_samplers(n_samples_per_index=n_samples_per_index)
+        + amesampler_samplers()
        + owen_samplers()
-        + (improper_samplers() if not proper else [])
    )
+    if stratified:
+        ret += stratified_samplers()
+    if not proper:
+        ret += improper_samplers()
+    return ret


@pytest.mark.parametrize(
@@ -550,9 +568,13 @@ def test_length_of_infinite_samplers(
    # check that we can generate more samples than the size of the powerset
batches = list(islice(sampler.generate_batches(indices), n_batches))
if len(indices) > 0:
assert sampler.sample_limit(indices) is None
assert (
len(list(flatten(batches))) == n_batches * batch_size == sampler.n_samples
)
else:
assert sampler.sample_limit(indices) == 0

with pytest.raises(TypeError):
len(sampler)
