Skip to content

Commit 0d4d22a

Browse files
committed
Add custom suffix support to from_generator
1 parent ba451d1 commit 0d4d22a

File tree

4 files changed

+49
-36
lines changed

4 files changed

+49
-36
lines changed

src/datasets/arrow_dataset.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,6 +1047,7 @@ def from_generator(
10471047
gen_kwargs: Optional[dict] = None,
10481048
num_proc: Optional[int] = None,
10491049
split: NamedSplit = Split.TRAIN,
1050+
dataset_id_suffix: Optional[str] = None,
10501051
**kwargs,
10511052
):
10521053
"""Create a Dataset from a generator.
@@ -1073,6 +1074,11 @@ def from_generator(
10731074
Split name to be assigned to the dataset.
10741075
10751076
<Added version="2.21.0"/>
1077+
dataset_id_suffix (`str`, *optional*):
1078+
Suffix that will be used to generate the dataset ID.
1079+
By default, `dataset_id_suffix` is generated by hashing all the arguments, which can be slow for a large dataset.
1080+
1081+
<Added version="3.6.0"/>
10761082
**kwargs (additional keyword arguments):
10771083
Keyword arguments to be passed to [`GeneratorConfig`].
10781084
@@ -1110,6 +1116,7 @@ def from_generator(
11101116
gen_kwargs=gen_kwargs,
11111117
num_proc=num_proc,
11121118
split=split,
1119+
dataset_id_suffix=dataset_id_suffix,
11131120
**kwargs,
11141121
).read()
11151122

src/datasets/builder.py

Lines changed: 39 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -155,43 +155,46 @@ def create_config_id(
155155
"""
156156
# Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
157157
suffix: Optional[str] = None
158-
config_kwargs_to_add_to_suffix = config_kwargs.copy()
159-
# name and version are already used to build the cache directory
160-
config_kwargs_to_add_to_suffix.pop("name", None)
161-
config_kwargs_to_add_to_suffix.pop("version", None)
162-
# data dir handling (when specified it points to the manually downloaded data):
163-
# it was previously ignored before the introduction of config id because we didn't want
164-
# to change the config name. Now it's fine to take it into account for the config id.
165-
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
166-
if "data_dir" in config_kwargs_to_add_to_suffix:
167-
if config_kwargs_to_add_to_suffix["data_dir"] is None:
168-
config_kwargs_to_add_to_suffix.pop("data_dir", None)
169-
else:
170-
# canonicalize the data dir to avoid two paths to the same location having different
171-
# hashes
172-
data_dir = config_kwargs_to_add_to_suffix["data_dir"]
173-
data_dir = os.path.normpath(data_dir)
174-
config_kwargs_to_add_to_suffix["data_dir"] = data_dir
175-
if config_kwargs_to_add_to_suffix:
176-
# we don't care about the order of the kwargs
177-
config_kwargs_to_add_to_suffix = {
178-
k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
179-
}
180-
if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
181-
suffix = ",".join(
182-
str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
183-
)
184-
if len(suffix) > 32: # hash if too long
158+
if "dataset_id_suffix" in config_kwargs and config_kwargs["dataset_id_suffix"] is not None:
159+
suffix = config_kwargs["dataset_id_suffix"]
160+
else:
161+
config_kwargs_to_add_to_suffix = config_kwargs.copy()
162+
# name and version are already used to build the cache directory
163+
config_kwargs_to_add_to_suffix.pop("name", None)
164+
config_kwargs_to_add_to_suffix.pop("version", None)
165+
# data dir handling (when specified it points to the manually downloaded data):
166+
# it was previously ignored before the introduction of config id because we didn't want
167+
# to change the config name. Now it's fine to take it into account for the config id.
168+
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
169+
if "data_dir" in config_kwargs_to_add_to_suffix:
170+
if config_kwargs_to_add_to_suffix["data_dir"] is None:
171+
config_kwargs_to_add_to_suffix.pop("data_dir", None)
172+
else:
173+
# canonicalize the data dir to avoid two paths to the same location having different
174+
# hashes
175+
data_dir = config_kwargs_to_add_to_suffix["data_dir"]
176+
data_dir = os.path.normpath(data_dir)
177+
config_kwargs_to_add_to_suffix["data_dir"] = data_dir
178+
if config_kwargs_to_add_to_suffix:
179+
# we don't care about the order of the kwargs
180+
config_kwargs_to_add_to_suffix = {
181+
k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
182+
}
183+
if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
184+
suffix = ",".join(
185+
str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
186+
)
187+
if len(suffix) > 32: # hash if too long
188+
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
189+
else:
185190
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
186-
else:
187-
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
188-
189-
if custom_features is not None:
190-
m = Hasher()
191-
if suffix:
192-
m.update(suffix)
193-
m.update(custom_features)
194-
suffix = m.hexdigest()
191+
192+
if custom_features is not None:
193+
m = Hasher()
194+
if suffix:
195+
m.update(suffix)
196+
m.update(custom_features)
197+
suffix = m.hexdigest()
195198

196199
if suffix:
197200
config_id = self.name + "-" + suffix

src/datasets/io/generator.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def __init__(
1616
gen_kwargs: Optional[dict] = None,
1717
num_proc: Optional[int] = None,
1818
split: NamedSplit = Split.TRAIN,
19+
dataset_id_suffix: Optional[str] = None,
1920
**kwargs,
2021
):
2122
super().__init__(
@@ -32,6 +33,7 @@ def __init__(
3233
generator=generator,
3334
gen_kwargs=gen_kwargs,
3435
split=split,
36+
dataset_id_suffix=dataset_id_suffix,
3537
**kwargs,
3638
)
3739

src/datasets/packaged_modules/generator/generator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig):
1010
gen_kwargs: Optional[dict] = None
1111
features: Optional[datasets.Features] = None
1212
split: datasets.NamedSplit = datasets.Split.TRAIN
13+
dataset_id_suffix: Optional[str] = None
1314

1415
def __post_init__(self):
1516
super().__post_init__()

0 commit comments

Comments
 (0)