Add custom suffix support to from_generator

simonreise · simonreise · commit 0d4d22a879cd · 2025-04-23T23:21:43.000+04:00
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -1047,6 +1047,7 @@ def from_generator(
         gen_kwargs: Optional[dict] = None,
         num_proc: Optional[int] = None,
         split: NamedSplit = Split.TRAIN,
+        dataset_id_suffix: Optional[str] = None,
         **kwargs,
     ):
         """Create a Dataset from a generator.
@@ -1073,6 +1074,11 @@ def from_generator(
                 Split name to be assigned to the dataset.
 
                 <Added version="2.21.0"/>
+            dataset_id_suffix (`str`, *optional*):
+                Suffix used to build the dataset ID.
+                If not provided, the suffix is generated by hashing all the arguments, which can be slow for a large dataset.
+
+                <Added version="3.6.0"/>
             **kwargs (additional keyword arguments):
                 Keyword arguments to be passed to :[`GeneratorConfig`].
 
@@ -1110,6 +1116,7 @@ def from_generator(
             gen_kwargs=gen_kwargs,
             num_proc=num_proc,
             split=split,
+            dataset_id_suffix=dataset_id_suffix,
             **kwargs,
         ).read()
 
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
@@ -155,43 +155,46 @@ def create_config_id(
         """
         # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
         suffix: Optional[str] = None
-        config_kwargs_to_add_to_suffix = config_kwargs.copy()
-        # name and version are already used to build the cache directory
-        config_kwargs_to_add_to_suffix.pop("name", None)
-        config_kwargs_to_add_to_suffix.pop("version", None)
-        # data dir handling (when specified it points to the manually downloaded data):
-        # it was previously ignored before the introduction of config id because we didn't want
-        # to change the config name. Now it's fine to take it into account for the config id.
-        # config_kwargs_to_add_to_suffix.pop("data_dir", None)
-        if "data_dir" in config_kwargs_to_add_to_suffix:
-            if config_kwargs_to_add_to_suffix["data_dir"] is None:
-                config_kwargs_to_add_to_suffix.pop("data_dir", None)
-            else:
-                # canonicalize the data dir to avoid two paths to the same location having different
-                # hashes
-                data_dir = config_kwargs_to_add_to_suffix["data_dir"]
-                data_dir = os.path.normpath(data_dir)
-                config_kwargs_to_add_to_suffix["data_dir"] = data_dir
-        if config_kwargs_to_add_to_suffix:
-            # we don't care about the order of the kwargs
-            config_kwargs_to_add_to_suffix = {
-                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
-            }
-            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
-                suffix = ",".join(
-                    str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
-                )
-                if len(suffix) > 32:  # hash if too long
+        if "dataset_id_suffix" in config_kwargs and config_kwargs["dataset_id_suffix"] is not None:
+            suffix = config_kwargs["dataset_id_suffix"]
+        else:
+            config_kwargs_to_add_to_suffix = config_kwargs.copy()
+            # name and version are already used to build the cache directory
+            config_kwargs_to_add_to_suffix.pop("name", None)
+            config_kwargs_to_add_to_suffix.pop("version", None)
+            # data dir handling (when specified it points to the manually downloaded data):
+            # it was previously ignored before the introduction of config id because we didn't want
+            # to change the config name. Now it's fine to take it into account for the config id.
+            # config_kwargs_to_add_to_suffix.pop("data_dir", None)
+            if "data_dir" in config_kwargs_to_add_to_suffix:
+                if config_kwargs_to_add_to_suffix["data_dir"] is None:
+                    config_kwargs_to_add_to_suffix.pop("data_dir", None)
+                else:
+                    # canonicalize the data dir to avoid two paths to the same location having different
+                    # hashes
+                    data_dir = config_kwargs_to_add_to_suffix["data_dir"]
+                    data_dir = os.path.normpath(data_dir)
+                    config_kwargs_to_add_to_suffix["data_dir"] = data_dir
+            if config_kwargs_to_add_to_suffix:
+                # we don't care about the order of the kwargs
+                config_kwargs_to_add_to_suffix = {
+                    k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
+                }
+                if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
+                    suffix = ",".join(
+                        str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
+                    )
+                    if len(suffix) > 32:  # hash if too long
+                        suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
+                else:
                     suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
-            else:
-                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
-
-        if custom_features is not None:
-            m = Hasher()
-            if suffix:
-                m.update(suffix)
-            m.update(custom_features)
-            suffix = m.hexdigest()
+
+            if custom_features is not None:
+                m = Hasher()
+                if suffix:
+                    m.update(suffix)
+                m.update(custom_features)
+                suffix = m.hexdigest()
 
         if suffix:
             config_id = self.name + "-" + suffix
diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py
@@ -16,6 +16,7 @@ def __init__(
         gen_kwargs: Optional[dict] = None,
         num_proc: Optional[int] = None,
         split: NamedSplit = Split.TRAIN,
+        dataset_id_suffix: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(
@@ -32,6 +33,7 @@ def __init__(
             generator=generator,
             gen_kwargs=gen_kwargs,
             split=split,
+            dataset_id_suffix=dataset_id_suffix,
             **kwargs,
         )
 
diff --git a/src/datasets/packaged_modules/generator/generator.py b/src/datasets/packaged_modules/generator/generator.py
@@ -10,6 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig):
     gen_kwargs: Optional[dict] = None
     features: Optional[datasets.Features] = None
     split: datasets.NamedSplit = datasets.Split.TRAIN
+    dataset_id_suffix: Optional[str] = None
 
     def __post_init__(self):
         super().__post_init__()