diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 9b4ad016692..92c5c7d4697 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1047,6 +1047,7 @@ def from_generator(
gen_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
split: NamedSplit = Split.TRAIN,
+ fingerprint: Optional[str] = None,
**kwargs,
):
"""Create a Dataset from a generator.
@@ -1073,6 +1074,11 @@ def from_generator(
Split name to be assigned to the dataset.
+ fingerprint (`str`, *optional*):
+ Fingerprint that will be used to generate the dataset ID.
+ By default, `fingerprint` is generated by hashing all the args, which can be slow in the case of a large dataset.
+
+
**kwargs (additional keyword arguments):
Keyword arguments to be passed to :[`GeneratorConfig`].
@@ -1110,6 +1116,7 @@ def from_generator(
gen_kwargs=gen_kwargs,
num_proc=num_proc,
split=split,
+ fingerprint=fingerprint,
**kwargs,
).read()
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index d6992b9e19d..f16b6d622f0 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -313,6 +313,7 @@ def __init__(
data_dir: Optional[str] = None,
storage_options: Optional[dict] = None,
writer_batch_size: Optional[int] = None,
+ config_id: Optional[str] = None,
**config_kwargs,
):
# DatasetBuilder name
@@ -343,6 +344,7 @@ def __init__(
self.config, self.config_id = self._create_builder_config(
config_name=config_name,
custom_features=features,
+ config_id=config_id,
**config_kwargs,
)
@@ -533,7 +535,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
def _create_builder_config(
- self, config_name=None, custom_features=None, **config_kwargs
+ self, config_name=None, custom_features=None, config_id=None, **config_kwargs
) -> tuple[BuilderConfig, str]:
"""Create and validate BuilderConfig object as well as a unique config id for this config.
Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
@@ -601,10 +603,13 @@ def _create_builder_config(
)
# compute the config id that is going to be used for caching
- config_id = builder_config.create_config_id(
- config_kwargs,
- custom_features=custom_features,
- )
+ if config_id is not None:
+ config_id = builder_config.name + "-" + config_id
+ else:
+ config_id = builder_config.create_config_id(
+ config_kwargs,
+ custom_features=custom_features,
+ )
is_custom = (config_id not in self.builder_configs) and config_id != "default"
if is_custom:
logger.info(f"Using custom data configuration {config_id}")
diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py
index b10609cac23..1f63e992600 100644
--- a/src/datasets/io/generator.py
+++ b/src/datasets/io/generator.py
@@ -16,6 +16,7 @@ def __init__(
gen_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
split: NamedSplit = Split.TRAIN,
+ fingerprint: Optional[str] = None,
**kwargs,
):
super().__init__(
@@ -32,6 +33,7 @@ def __init__(
generator=generator,
gen_kwargs=gen_kwargs,
split=split,
+ config_id=fingerprint,
**kwargs,
)