@@ -57,6 +57,11 @@ def read_parquet(
57
57
58
58
Notes
59
59
-----
60
+ For remote storage (S3, GCS, HTTP/HTTPS), this function automatically uses
61
+ fsspec.parquet.open_parquet_file for optimized access with intelligent
62
+ precaching, which can significantly improve performance compared to standard
63
+ PyArrow reading.
64
+
60
65
pyarrow supports partial loading of nested structures from parquet, for
61
66
example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will
62
67
load the "a" column of the "nested" column. Standard pandas/pyarrow
@@ -94,8 +99,11 @@ def read_parquet(
94
99
reject_nesting = [reject_nesting ]
95
100
96
101
# First load through pyarrow
102
+ # If data is remote, use fsspec.parquet for better performance
103
+ if _should_use_fsspec_optimization (data , kwargs .get ("filesystem" )):
104
+ table = _read_with_fsspec_optimization (data , columns , kwargs )
97
105
# If `filesystem` is specified - use it
98
- if kwargs .get ("filesystem" ) is not None :
106
+ elif kwargs .get ("filesystem" ) is not None :
99
107
table = pq .read_table (data , columns = columns , ** kwargs )
100
108
# Otherwise convert with a special function
101
109
else :
@@ -291,3 +299,88 @@ def _cast_list_cols_to_nested(df):
291
299
if pa .types .is_list (dtype .pyarrow_dtype ):
292
300
df [col ] = pack_lists (df [[col ]])
293
301
return df
302
+
303
+
304
+ def _should_use_fsspec_optimization (data , explicit_filesystem ):
305
+ """Determine if fsspec optimization should be used.
306
+
307
+ Parameters
308
+ ----------
309
+ data : str, Path, UPath, or file-like object
310
+ The data source
311
+ explicit_filesystem : filesystem or None
312
+ Explicitly provided filesystem
313
+
314
+ Returns
315
+ -------
316
+ bool
317
+ True if fsspec optimization should be used for this data source
318
+ """
319
+ # Don't use optimization if explicit filesystem is provided
320
+ if explicit_filesystem is not None :
321
+ return False
322
+
323
+ # Don't use for file-like objects
324
+ if hasattr (data , "read" ):
325
+ return False
326
+
327
+ # For UPath objects, check if they're remote (check before Path since UPath inherits from Path)
328
+ if isinstance (data , UPath ):
329
+ return data .protocol not in ("" , "file" )
330
+
331
+ # Don't use for Path objects (local files)
332
+ if isinstance (data , Path ):
333
+ return False
334
+
335
+ # For strings, check if they look like remote URLs
336
+ if isinstance (data , str ):
337
+ return data .startswith (("http://" , "https://" , "s3://" , "gs://" , "gcs://" , "azure://" , "adl://" ))
338
+
339
+ return False
340
+
341
+
342
def _read_with_fsspec_optimization(data, columns, kwargs):
    """Read parquet using fsspec optimization for better remote storage performance.

    Parameters
    ----------
    data : str, UPath, or path-like
        Path to the parquet file.
    columns : list or None
        Columns to read.
    kwargs : dict
        Additional kwargs for reading.

    Returns
    -------
    pyarrow.Table
        The loaded table.
    """

    def _fallback_read():
        # Standard pyarrow path, used whenever the optimized read is
        # unavailable or fails for any reason (best-effort behavior).
        converted, filesystem = _transform_read_parquet_data_arg(data)
        return pq.read_table(converted, filesystem=filesystem, columns=columns, **kwargs)

    try:
        import fsspec.parquet
    except ImportError:
        return _fallback_read()

    # fsspec wants a plain string path; carry over UPath storage options
    # (credentials, endpoints, ...) when present.
    path_str = str(data)
    storage_options = None
    if isinstance(data, UPath) and data.storage_options:
        storage_options = data.storage_options

    try:
        # open_parquet_file precaches the byte ranges PyArrow will request,
        # then the handle is read like a regular file.
        with fsspec.parquet.open_parquet_file(
            path_str, columns=columns, storage_options=storage_options, engine="pyarrow"
        ) as handle:
            return pq.read_table(handle, columns=columns, **kwargs)
    except Exception:
        return _fallback_read()
0 commit comments