Skip to content

Commit 2dcb31e

Browse files
Copilotgitosaurus
authored andcommitted
Add automatic fsspec optimization for remote storage using fsspec.parquet (#369)
1 parent c5cf5ee commit 2dcb31e

File tree

3 files changed

+167
-1
lines changed

3 files changed

+167
-1
lines changed

benchmarks/benchmarks.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,27 @@ def time_run(self):
254254
def peakmem_run(self):
255255
"""Benchmark the memory usage of read_parquet(self.path, columns=self.columns)"""
256256
self.run()
257+
258+
259+
class ReadFewColumnsHTTPSWithOptimization:
260+
"""Benchmark read_parquet("https://", columns=[...])
261+
262+
Note: fsspec optimization is now automatic for remote URLs,
263+
so this benchmark is equivalent to ReadFewColumnsHTTPS.
264+
Kept for historical comparison purposes.
265+
"""
266+
267+
path = "https://data.lsdb.io/hats/gaia_dr3/gaia/dataset/Norder=2/Dir=0/Npix=0.parquet"
268+
columns = ["_healpix_29", "ra", "astrometric_primary_flag"]
269+
270+
def run(self):
271+
"""Run the benchmark (fsspec optimization is automatic for remote URLs)."""
272+
_ = read_parquet(self.path, columns=self.columns)
273+
274+
def time_run(self):
275+
"""Benchmark the runtime with automatic fsspec optimization"""
276+
self.run()
277+
278+
def peakmem_run(self):
279+
"""Benchmark the memory usage with automatic fsspec optimization"""
280+
self.run()

src/nested_pandas/nestedframe/io.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ def read_parquet(
5757
5858
Notes
5959
-----
60+
For remote storage (S3, GCS, HTTP/HTTPS), this function automatically uses
61+
fsspec.parquet.open_parquet_file for optimized access with intelligent
62+
precaching, which can significantly improve performance compared to standard
63+
PyArrow reading.
64+
6065
pyarrow supports partial loading of nested structures from parquet, for
6166
example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will
6267
load the "a" column of the "nested" column. Standard pandas/pyarrow
@@ -94,8 +99,11 @@ def read_parquet(
9499
reject_nesting = [reject_nesting]
95100

96101
# First load through pyarrow
102+
# If data is remote, use fsspec.parquet for better performance
103+
if _should_use_fsspec_optimization(data, kwargs.get("filesystem")):
104+
table = _read_with_fsspec_optimization(data, columns, kwargs)
97105
# If `filesystem` is specified - use it
98-
if kwargs.get("filesystem") is not None:
106+
elif kwargs.get("filesystem") is not None:
99107
table = pq.read_table(data, columns=columns, **kwargs)
100108
# Otherwise convert with a special function
101109
else:
@@ -291,3 +299,88 @@ def _cast_list_cols_to_nested(df):
291299
if pa.types.is_list(dtype.pyarrow_dtype):
292300
df[col] = pack_lists(df[[col]])
293301
return df
302+
303+
304+
def _should_use_fsspec_optimization(data, explicit_filesystem):
305+
"""Determine if fsspec optimization should be used.
306+
307+
Parameters
308+
----------
309+
data : str, Path, UPath, or file-like object
310+
The data source
311+
explicit_filesystem : filesystem or None
312+
Explicitly provided filesystem
313+
314+
Returns
315+
-------
316+
bool
317+
True if fsspec optimization should be used for this data source
318+
"""
319+
# Don't use optimization if explicit filesystem is provided
320+
if explicit_filesystem is not None:
321+
return False
322+
323+
# Don't use for file-like objects
324+
if hasattr(data, "read"):
325+
return False
326+
327+
# For UPath objects, check if they're remote (check before Path since UPath inherits from Path)
328+
if isinstance(data, UPath):
329+
return data.protocol not in ("", "file")
330+
331+
# Don't use for Path objects (local files)
332+
if isinstance(data, Path):
333+
return False
334+
335+
# For strings, check if they look like remote URLs
336+
if isinstance(data, str):
337+
return data.startswith(("http://", "https://", "s3://", "gs://", "gcs://", "azure://", "adl://"))
338+
339+
return False
340+
341+
342+
def _read_with_fsspec_optimization(data, columns, kwargs):
343+
"""Read parquet using fsspec optimization for better remote storage performance.
344+
345+
Parameters
346+
----------
347+
data : str, UPath, or path-like
348+
Path to the parquet file
349+
columns : list or None
350+
Columns to read
351+
kwargs : dict
352+
Additional kwargs for reading
353+
354+
Returns
355+
-------
356+
pyarrow.Table
357+
The loaded table
358+
"""
359+
try:
360+
import fsspec.parquet
361+
except ImportError:
362+
# Fall back to regular method if fsspec.parquet not available
363+
data_converted, filesystem = _transform_read_parquet_data_arg(data)
364+
return pq.read_table(data_converted, filesystem=filesystem, columns=columns, **kwargs)
365+
366+
# Convert UPath to string if needed
367+
if isinstance(data, UPath):
368+
path_str = str(data)
369+
# Use UPath storage options for fsspec
370+
storage_options = data.storage_options if data.storage_options else None
371+
else:
372+
path_str = str(data)
373+
storage_options = None
374+
375+
# Use fsspec.parquet.open_parquet_file for optimized access
376+
try:
377+
with fsspec.parquet.open_parquet_file(
378+
path_str, columns=columns, storage_options=storage_options, engine="pyarrow"
379+
) as parquet_file:
380+
# Read the table using PyArrow with the optimized file handle
381+
table = pq.read_table(parquet_file, columns=columns, **kwargs)
382+
return table
383+
except Exception:
384+
# Fall back to regular method if optimization fails
385+
data_converted, filesystem = _transform_read_parquet_data_arg(data)
386+
return pq.read_table(data_converted, filesystem=filesystem, columns=columns, **kwargs)

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,3 +399,52 @@ def test__transform_read_parquet_data_arg():
399399
"https://data.lsdb.io/hats/gaia_dr3/gaia/dataset/Norder=2/Dir=0/Npix=0.parquet",
400400
]
401401
)
402+
403+
404+
def test_read_parquet_with_fsspec_optimization():
405+
"""Test that read_parquet automatically uses fsspec optimization for remote files."""
406+
# Test with local file (should not use fsspec optimization)
407+
local_path = "tests/test_data/nested.parquet"
408+
409+
# Test basic reading - local files should work as before
410+
nf1 = read_parquet(local_path)
411+
412+
# Test with additional kwargs
413+
nf2 = read_parquet(local_path, columns=["a", "nested.flux"], use_threads=True)
414+
415+
assert len(nf2) <= len(nf1) # filtered columns
416+
assert "a" in nf2.columns
417+
assert "nested" in nf2.columns
418+
419+
420+
def test_fsspec_optimization_path_detection():
421+
"""Test _should_use_fsspec_optimization correctly identifies remote paths."""
422+
from pathlib import Path
423+
424+
from nested_pandas.nestedframe.io import _should_use_fsspec_optimization
425+
426+
# Test cases that should NOT use optimization
427+
assert not _should_use_fsspec_optimization("local_file.parquet", None)
428+
assert not _should_use_fsspec_optimization("/path/to/file.parquet", None)
429+
assert not _should_use_fsspec_optimization(Path("local_file.parquet"), None)
430+
assert not _should_use_fsspec_optimization(UPath("local_file.parquet"), None)
431+
assert not _should_use_fsspec_optimization("https://example.com/file.parquet", "some_filesystem")
432+
433+
# Test file-like object
434+
import io
435+
436+
assert not _should_use_fsspec_optimization(io.BytesIO(b"test"), None)
437+
438+
# Test cases that SHOULD use optimization
439+
assert _should_use_fsspec_optimization("https://example.com/file.parquet", None)
440+
assert _should_use_fsspec_optimization("s3://bucket/file.parquet", None)
441+
assert _should_use_fsspec_optimization("gs://bucket/file.parquet", None)
442+
assert _should_use_fsspec_optimization(UPath("https://example.com/file.parquet"), None)
443+
assert _should_use_fsspec_optimization(UPath("s3://bucket/file.parquet"), None)
444+
445+
446+
def test_docstring_includes_fsspec_notes():
447+
"""Test that the docstring mentions the automatic fsspec optimization."""
448+
docstring = read_parquet.__doc__
449+
assert "fsspec" in docstring
450+
assert "remote" in docstring.lower()

0 commit comments

Comments
 (0)