diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 9b4ad016692..92c5c7d4697 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1047,6 +1047,7 @@ def from_generator( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, + fingerprint: Optional[str] = None, **kwargs, ): """Create a Dataset from a generator. @@ -1073,6 +1074,11 @@ def from_generator( Split name to be assigned to the dataset. + fingerprint (`str`, *optional*): + Fingerprint that will be used to generate the dataset ID. + By default, `fingerprint` is generated by hashing all the args, which can be slow for a large dataset. + + **kwargs (additional keyword arguments): Keyword arguments to be passed to :[`GeneratorConfig`]. @@ -1110,6 +1116,7 @@ def from_generator( gen_kwargs=gen_kwargs, num_proc=num_proc, split=split, + fingerprint=fingerprint, **kwargs, ).read() diff --git a/src/datasets/builder.py b/src/datasets/builder.py index d6992b9e19d..f16b6d622f0 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -313,6 +313,7 @@ def __init__( data_dir: Optional[str] = None, storage_options: Optional[dict] = None, writer_batch_size: Optional[int] = None, + config_id: Optional[str] = None, **config_kwargs, ): # DatasetBuilder name @@ -343,6 +344,7 @@ def __init__( self.config, self.config_id = self._create_builder_config( config_name=config_name, custom_features=features, + config_id=config_id, **config_kwargs, ) @@ -533,7 +535,7 @@ def get_exported_dataset_info(self) -> DatasetInfo: return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo()) def _create_builder_config( - self, config_name=None, custom_features=None, **config_kwargs + self, config_name=None, custom_features=None, config_id=None, **config_kwargs ) -> tuple[BuilderConfig, str]: """Create and validate BuilderConfig object as well as a unique config id for this config. 
Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None. @@ -601,10 +603,13 @@ def _create_builder_config( ) # compute the config id that is going to be used for caching - config_id = builder_config.create_config_id( - config_kwargs, - custom_features=custom_features, - ) + if config_id is not None: + config_id = builder_config.name + "-" + config_id + else: + config_id = builder_config.create_config_id( + config_kwargs, + custom_features=custom_features, + ) is_custom = (config_id not in self.builder_configs) and config_id != "default" if is_custom: logger.info(f"Using custom data configuration {config_id}") diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py index b10609cac23..1f63e992600 100644 --- a/src/datasets/io/generator.py +++ b/src/datasets/io/generator.py @@ -16,6 +16,7 @@ def __init__( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, + fingerprint: Optional[str] = None, **kwargs, ): super().__init__( @@ -32,6 +33,7 @@ def __init__( generator=generator, gen_kwargs=gen_kwargs, split=split, + config_id=fingerprint, **kwargs, )