diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml index a71fd602993..4b4a416d220 100644 --- a/.github/conda/meta.yaml +++ b/.github/conda/meta.yaml @@ -15,7 +15,7 @@ requirements: - python - pip - numpy >=1.17 - - pyarrow >=15.0.0 + - pyarrow >=16.0.0 - python-xxhash - dill - pandas @@ -30,7 +30,7 @@ requirements: - python - pip - numpy >=1.17 - - pyarrow >=15.0.0 + - pyarrow >=16.0.0 - python-xxhash - dill - pandas diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a4a9041fd28..3c6991b81ce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} - run: uv pip install --system pyarrow==15.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 + run: uv pip install --system pyarrow==16.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ diff --git a/setup.py b/setup.py index 5e58064212a..d8566676eac 100644 --- a/setup.py +++ b/setup.py @@ -110,8 +110,8 @@ # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling) "numpy>=1.17", # Backend and serialization. - # Minimum 15.0.0 to be able to cast dictionary types to their underlying types - "pyarrow>=15.0.0", + # Minimum 16.0.0 to support string views + "pyarrow>=16.0.0", # For smart caching dataset processing "dill>=0.3.0,<0.3.9", # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19 # For performance gains with apache arrow diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 70ea9fb5ddd..4fbd3ea7afd 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -110,6 +110,8 @@ def _arrow_to_datasets_dtype(arrow_type: pa.DataType) -> str: return "string" elif pyarrow.types.is_large_string(arrow_type): return "large_string" + elif pyarrow.types.is_string_view(arrow_type): + return "string_view" elif pyarrow.types.is_dictionary(arrow_type): return _arrow_to_datasets_dtype(arrow_type.value_type) else: @@ -508,6 +510,7 @@ class Value: - `large_binary` - `string` - `large_string` + - `string_view` Args: dtype (`str`): @@ -548,6 +551,10 @@ def encode_example(self, value): return float(value) elif pa.types.is_string(self.pa_type): return str(value) + elif pa.types.is_large_string(self.pa_type): + return str(value) + elif pa.types.is_string_view(self.pa_type): + return str(value) else: return value