Description
Describe the bug
TL;DR
`Dataset.from_generator` passes all of its arguments to `BuilderConfig.create_config_id`, including the `generator` function itself. `BuilderConfig.create_config_id` tries to hash all of these arguments, which can take a large amount of time or even cause a `MemoryError` if the dataset processed in the generator function is large enough.
Maybe we should pop `generator` from `config_kwargs_to_add_to_suffix` before hashing to avoid this.
Full description
I have a pretty large spatial imagery dataset that is generated from two `xbatcher.BatchGenerator` objects via a custom `dataset_generator` function that, simplified, looks like this:
```python
def dataset_generator():
    for index in samples:
        data_dict = {
            "key": index,
            "x": x_batches[index].data,
            "y": y_batches[index].data,
        }
        yield data_dict
```
Then I use `datasets.Dataset.from_generator` to generate the dataset itself.
```python
# Create dataset
ds = datasets.Dataset.from_generator(
    dataset_generator,
    features=feat,
    cache_dir=(output / ".cache"),
)
```
It works nicely with fairly small data, but if the dataset is huge and barely fits in memory, it crashes with a `MemoryError`:
Full stack trace
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\remote_sensing_processor\segmentation\semantic\tiles.py:248](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/remote_sensing_processor/segmentation/semantic/tiles.py#line=247), in generate_tiles(x, y, output, tile_size, shuffle, split, x_dtype, y_dtype, x_nodata, y_nodata)
245 yield data_dict
247 # Create dataset
--> 248 ds = datasets.Dataset.from_generator(
249 dataset_generator,
250 features=feat,
251 cache_dir=(output / ".cache"),
252 )
254 # Save dataset
255 ds.save_to_disk(output / name)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\arrow_dataset.py:1105](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/arrow_dataset.py#line=1104), in Dataset.from_generator(generator, features, cache_dir, keep_in_memory, gen_kwargs, num_proc, split, **kwargs)
1052 """Create a Dataset from a generator.
1053
1054 Args:
(...) 1101 ```
1102 """
1103 from .io.generator import GeneratorDatasetInputStream
-> 1105 return GeneratorDatasetInputStream(
1106 generator=generator,
1107 features=features,
1108 cache_dir=cache_dir,
1109 keep_in_memory=keep_in_memory,
1110 gen_kwargs=gen_kwargs,
1111 num_proc=num_proc,
1112 split=split,
1113 **kwargs,
1114 ).read()
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\io\generator.py:29](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/io/generator.py#line=28), in GeneratorDatasetInputStream.__init__(self, generator, features, cache_dir, keep_in_memory, streaming, gen_kwargs, num_proc, split, **kwargs)
9 def __init__(
10 self,
11 generator: Callable,
(...) 19 **kwargs,
20 ):
21 super().__init__(
22 features=features,
23 cache_dir=cache_dir,
(...) 27 **kwargs,
28 )
---> 29 self.builder = Generator(
30 cache_dir=cache_dir,
31 features=features,
32 generator=generator,
33 gen_kwargs=gen_kwargs,
34 split=split,
35 **kwargs,
36 )
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\builder.py:343](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/builder.py#line=342), in DatasetBuilder.__init__(self, cache_dir, dataset_name, config_name, hash, base_path, info, features, token, repo_id, data_files, data_dir, storage_options, writer_batch_size, **config_kwargs)
341 config_kwargs["data_dir"] = data_dir
342 self.config_kwargs = config_kwargs
--> 343 self.config, self.config_id = self._create_builder_config(
344 config_name=config_name,
345 custom_features=features,
346 **config_kwargs,
347 )
349 # prepare info: DatasetInfo are a standardized dataclass across all datasets
350 # Prefill datasetinfo
351 if info is None:
352 # TODO FOR PACKAGED MODULES IT IMPORTS DATA FROM src/packaged_modules which doesn't make sense
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\builder.py:604](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/builder.py#line=603), in DatasetBuilder._create_builder_config(self, config_name, custom_features, **config_kwargs)
598 builder_config._resolve_data_files(
599 base_path=self.base_path,
600 download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
601 )
603 # compute the config id that is going to be used for caching
--> 604 config_id = builder_config.create_config_id(
605 config_kwargs,
606 custom_features=custom_features,
607 )
608 is_custom = (config_id not in self.builder_configs) and config_id != "default"
609 if is_custom:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\builder.py:187](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/builder.py#line=186), in BuilderConfig.create_config_id(self, config_kwargs, custom_features)
185 suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
186 else:
--> 187 suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
189 if custom_features is not None:
190 m = Hasher()
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\fingerprint.py:188](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/fingerprint.py#line=187), in Hasher.hash(cls, value)
186 @classmethod
187 def hash(cls, value: Any) -> str:
--> 188 return cls.hash_bytes(dumps(value))
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:109](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=108), in dumps(obj)
107 """Pickle an object to a string."""
108 file = BytesIO()
--> 109 dump(obj, file)
110 return file.getvalue()
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:103](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=102), in dump(obj, file)
101 def dump(obj, file):
102 """Pickle an object to a file."""
--> 103 Pickler(file, recurse=True).dump(obj)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:420](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=419), in Pickler.dump(self, obj)
418 def dump(self, obj): #NOTE: if settings change, need to update attributes
419 logger.trace_setup(self)
--> 420 StockPickler.dump(self, obj)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:484](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=483), in _Pickler.dump(self, obj)
482 if self.proto >= 4:
483 self.framer.start_framing()
--> 484 self.save(obj)
485 self.write(STOP)
486 self.framer.end_framing()
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1985](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1984), in save_function(pickler, obj)
1982 if state_dict:
1983 state = state, state_dict
-> 1985 _save_with_postproc(pickler, (_create_function, (
1986 obj.__code__, globs, obj.__name__, obj.__defaults__,
1987 closure
1988 ), state), obj=obj, postproc_list=postproc_list)
1990 # Lift closure cell update to earliest function (#458)
1991 if _postproc:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1117](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1116), in _save_with_postproc(pickler, reduction, is_pickler_dill, obj, postproc_list)
1115 continue
1116 else:
-> 1117 pickler.save_reduce(*reduction)
1118 # pop None created by calling preprocessing step off stack
1119 pickler.write(POP)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:690](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=689), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
688 else:
689 save(func)
--> 690 save(args)
691 write(REDUCE)
693 if obj is not None:
694 # If the object is already in the memo, this means it is
695 # recursive. In this case, throw away everything we put on the
696 # stack, and fetch the object back from the memo.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:905](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=904), in _Pickler.save_tuple(self, obj)
903 if n <= 3 and self.proto >= 2:
904 for element in obj:
--> 905 save(element)
906 # Subtle. Same as in the big comment below.
907 if id(obj) in memo:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:715](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=714), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
713 if state is not None:
714 if state_setter is None:
--> 715 save(state)
716 write(BUILD)
717 else:
718 # If a state_setter is specified, call it instead of load_build
719 # to update obj's with its previous state.
720 # First, push state_setter and its tuple of expected arguments
721 # (obj, state) onto the stack.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
[... skipping similar frames: Pickler.save at line 70 (1 times), Pickler.save at line 414 (1 times)]
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:715](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=714), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
713 if state is not None:
714 if state_setter is None:
--> 715 save(state)
716 write(BUILD)
717 else:
718 # If a state_setter is specified, call it instead of load_build
719 # to update obj's with its previous state.
720 # First, push state_setter and its tuple of expected arguments
721 # (obj, state) onto the stack.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:905](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=904), in _Pickler.save_tuple(self, obj)
903 if n <= 3 and self.proto >= 2:
904 for element in obj:
--> 905 save(element)
906 # Subtle. Same as in the big comment below.
907 if id(obj) in memo:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:715](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=714), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
713 if state is not None:
714 if state_setter is None:
--> 715 save(state)
716 write(BUILD)
717 else:
718 # If a state_setter is specified, call it instead of load_build
719 # to update obj's with its previous state.
720 # First, push state_setter and its tuple of expected arguments
721 # (obj, state) onto the stack.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:905](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=904), in _Pickler.save_tuple(self, obj)
903 if n <= 3 and self.proto >= 2:
904 for element in obj:
--> 905 save(element)
906 # Subtle. Same as in the big comment below.
907 if id(obj) in memo:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:690](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=689), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
688 else:
689 save(func)
--> 690 save(args)
691 write(REDUCE)
693 if obj is not None:
694 # If the object is already in the memo, this means it is
695 # recursive. In this case, throw away everything we put on the
696 # stack, and fetch the object back from the memo.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:920](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=919), in _Pickler.save_tuple(self, obj)
918 write(MARK)
919 for element in obj:
--> 920 save(element)
922 if id(obj) in memo:
923 # Subtle. d was not in memo when we entered save_tuple(), so
924 # the process of saving the tuple's elements must have saved
(...) 928 # could have been done in the "for element" loop instead, but
929 # recursive tuples are a rare thing.
930 get = self.get(memo[id(obj)][0])
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:715](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=714), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
713 if state is not None:
714 if state_setter is None:
--> 715 save(state)
716 write(BUILD)
717 else:
718 # If a state_setter is specified, call it instead of load_build
719 # to update obj's with its previous state.
720 # First, push state_setter and its tuple of expected arguments
721 # (obj, state) onto the stack.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1019](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1018), in _Pickler._batch_setitems(self, items)
1017 k, v = tmp[0]
1018 save(k)
-> 1019 save(v)
1020 write(SETITEM)
1021 # else tmp is empty, and we're done
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:715](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=714), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
713 if state is not None:
714 if state_setter is None:
--> 715 save(state)
716 write(BUILD)
717 else:
718 # If a state_setter is specified, call it instead of load_build
719 # to update obj's with its previous state.
720 # First, push state_setter and its tuple of expected arguments
721 # (obj, state) onto the stack.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
[... skipping similar frames: Pickler.save at line 70 (1 times), Pickler.save at line 414 (1 times)]
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:1217](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=1216), in save_module_dict(pickler, obj)
1214 if is_dill(pickler, child=False) and pickler._session:
1215 # we only care about session the first pass thru
1216 pickler._first_pass = False
-> 1217 StockPickler.save_dict(pickler, obj)
1218 logger.trace(pickler, "# D2")
1219 return
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:990](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=989), in _Pickler.save_dict(self, obj)
987 self.write(MARK + DICT)
989 self.memoize(obj)
--> 990 self._batch_setitems(obj.items())
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:83](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=82), in Pickler._batch_setitems(self, items)
80 from datasets.fingerprint import Hasher
82 items = sorted(items, key=lambda x: Hasher.hash(x[0]))
---> 83 dill.Pickler._batch_setitems(self, items)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:1014](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=1013), in _Pickler._batch_setitems(self, items)
1012 for k, v in tmp:
1013 save(k)
-> 1014 save(v)
1015 write(SETITEMS)
1016 elif n:
[... skipping similar frames: Pickler.save at line 70 (1 times), Pickler.save at line 414 (1 times)]
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:601](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=600), in _Pickler.save(self, obj, save_persistent_id)
597 raise PicklingError("Tuple returned by %s must have "
598 "two to six elements" % reduce)
600 # Save the reduce() output and finally memoize the object
--> 601 self.save_reduce(obj=obj, *rv)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:715](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=714), in _Pickler.save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
713 if state is not None:
714 if state_setter is None:
--> 715 save(state)
716 write(BUILD)
717 else:
718 # If a state_setter is specified, call it instead of load_build
719 # to update obj's with its previous state.
720 # First, push state_setter and its tuple of expected arguments
721 # (obj, state) onto the stack.
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:920](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=919), in _Pickler.save_tuple(self, obj)
918 write(MARK)
919 for element in obj:
--> 920 save(element)
922 if id(obj) in memo:
923 # Subtle. d was not in memo when we entered save_tuple(), so
924 # the process of saving the tuple's elements must have saved
(...) 928 # could have been done in the "for element" loop instead, but
929 # recursive tuples are a rare thing.
930 get = self.get(memo[id(obj)][0])
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\datasets\utils\_dill.py:70](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/datasets/utils/_dill.py#line=69), in Pickler.save(self, obj, save_persistent_id)
68 if obj_type is FunctionType:
69 obj = getattr(obj, "_torchdynamo_orig_callable", obj)
---> 70 dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\site-packages\dill\_dill.py:414](file:///C:/ProgramData/miniforge3/envs/geo/Lib/site-packages/dill/_dill.py#line=413), in Pickler.save(self, obj, save_persistent_id)
412 msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType
413 raise PicklingError(msg)
--> 414 StockPickler.save(self, obj, save_persistent_id)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:558](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=557), in _Pickler.save(self, obj, save_persistent_id)
556 f = self.dispatch.get(t)
557 if f is not None:
--> 558 f(self, obj) # Call unbound method with explicit self
559 return
561 # Check private dispatch table if any, or else
562 # copyreg.dispatch_table
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:809](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=808), in _Pickler.save_bytes(self, obj)
806 self.save_reduce(codecs.encode,
807 (str(obj, 'latin1'), 'latin1'), obj=obj)
808 return
--> 809 self._save_bytes_no_memo(obj)
810 self.memoize(obj)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:797](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=796), in _Pickler._save_bytes_no_memo(self, obj)
795 self._write_large_bytes(BINBYTES8 + pack("<Q", n), obj)
796 elif n >= self.framer._FRAME_SIZE_TARGET:
--> 797 self._write_large_bytes(BINBYTES + pack("<I", n), obj)
798 else:
799 self.write(BINBYTES + pack("<I", n) + obj)
File [C:\ProgramData\miniforge3\envs\geo\Lib\pickle.py:254](file:///C:/ProgramData/miniforge3/envs/geo/Lib/pickle.py#line=253), in _Framer.write_large_bytes(self, header, payload)
247 # Perform direct write of the header and payload of the large binary
248 # object. Be careful not to concatenate the header and the payload
249 # prior to calling 'write' as we do not want to allocate a large
250 # temporary bytes object.
251 # We intentionally do not insert a protocol 4 frame opcode to make
252 # it possible to optimize file.read calls in the loader.
253 write(header)
--> 254 write(payload)
MemoryError:
A MemoryError is an expected type of error in such a case, but when I dug deeper I found that it occurs in a rather unexpected place: the `create_config_id` function. It tries to hash everything in `config_kwargs_to_add_to_suffix`, including the generator function itself.
I modified the `BuilderConfig.create_config_id` code as shown below to check which values are hashed and how long it takes to hash them, and ran it on a toy dataset:
print(config_kwargs_to_add_to_suffix)
start_time = time.time()
if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
suffix = ",".join(
str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
)
if len(suffix) > 32: # hash if too long
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
else:
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
end_time = time.time()
print(f"Execution time: {end_time - start_time:.4f} seconds")
print(suffix)
In my case, the content of `config_kwargs_to_add_to_suffix` was:
{'features': {'key': Value(dtype='int64', id=None), 'x': Array3D(shape=(44, 128, 128), dtype='float32', id=None), 'y_class': Array2D(shape=(128, 128), dtype='int32', id=None)}, 'gen_kwargs': None, 'generator': <function generate_tiles.<locals>.dataset_generator at 0x00000139D10D7920>, 'split': NamedSplit('train')}
I also noticed that hashing took a significant amount of time: 43.1482 seconds, while the overall function execution (data loading, batching, and saving the dataset) took 2 min 45 s. The output of `create_config_id` is just a dataset id, so this is a disproportionately large amount of time.
But when I added `config_kwargs_to_add_to_suffix.pop("generator", None)`, the hashing took only 0.0060 seconds.
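Maybe we shouldn't hash the generator function: as the traceback above shows, hashing it pickles the function's state and the objects it references, which can be very expensive in both computation and memory.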
Steps to reproduce the bug
This is a simplified example of the workflow I used to generate the dataset, but I think almost any workflow can be used to reproduce the bug.
import pystac
import pystac_client
import planetary_computer
import numpy as np
import xarray as xr
import rioxarray as rxr
import dask
import xbatcher
import datasets
# Loading a dataset, in our case - single Landsat image
catalog = pystac_client.Client.open(
"https://planetarycomputer.microsoft.com/api/stac/v1",
modifier=planetary_computer.sign_inplace,
)
brazil = [-60.2, -3.31]
time_of_interest = "2021-06-01/2021-08-31"
search = catalog.search(collections=["landsat-c2-l2"], intersects={"type": "Point", "coordinates": brazil}, datetime=time_of_interest)
items = search.item_collection()
item = min(items, key=lambda item: pystac.extensions.eo.EOExtension.ext(item).cloud_cover)
# Getting x data
bands = []
for band in ["red", "green", "blue", "nir08", "coastal", "swir16", "swir22", "lwir11"]:
with rxr.open_rasterio(item.assets[band].href, chunks=True, lock=True) as raster:
raster = raster.to_dataset('band')
#print(raster)
raster = raster.rename({1: band})
bands.append(raster)
x = xr.merge(bands).squeeze().to_array("band").persist()
# Getting y data
with rxr.open_rasterio(item.assets['qa_pixel'].href, chunks=True, lock=True) as raster:
y = raster.squeeze().persist()
# Setting up batches generators
x_batches = xbatcher.BatchGenerator(ds=x, input_dims={"x": 256, "y": 256})
y_batches = xbatcher.BatchGenerator(ds=y, input_dims={"x": 256, "y": 256})
# Filtering samples that contain only nodata
samples = list(range(len(x_batches)))
samples_filtered = []
for i in samples:
if not np.array_equal(np.unique(x_batches[i]), np.array([0.])) and not np.array_equal(np.unique(y_batches[i]), np.array([0])):
samples_filtered.append(i)
samples = samples_filtered
np.random.shuffle(samples)
# Setting up features
feat = {
"key": datasets.Value(dtype="int64"),
"x": datasets.Array3D(dtype="float32", shape=(4, 256, 256)),
"y": datasets.Array2D(dtype="int32", shape=(256, 256))
}
feat = datasets.Features(feat)
# Setting up a generator
def dataset_generator():
for index in samples:
data_dict = {
"key": index,
"x": x_batches[index].data,
"y": y_batches[index].data,
}
yield data_dict
# Create dataset
ds = datasets.Dataset.from_generator(
dataset_generator,
features=feat,
cache_dir="temp/cache",
)
Please try adding `config_kwargs_to_add_to_suffix.pop("generator", None)` to `BuilderConfig.create_config_id`, and then measure how long it takes to run the
if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
suffix = ",".join(
str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
)
if len(suffix) > 32: # hash if too long
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
else:
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
code block, both with and without `config_kwargs_to_add_to_suffix.pop("generator", None)`.
In my case, hashing took 3.3828 seconds without popping the generator function and 0.0010 seconds with it popped.
Expected behavior
Much faster hashing and no MemoryErrors
Environment info
- `datasets` version: 3.5.0
- Platform: Windows-11-10.0.26100-SP0
- Python version: 3.12.9
- `huggingface_hub` version: 0.30.1
- PyArrow version: 17.0.0
- Pandas version: 2.2.2
- `fsspec` version: 2024.12.0