Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,7 @@ venv.bak/
*.hdf5
*.nc
*.tif

*.log
notebooks/logs
notebooks/results
23 changes: 23 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: h5cloud
channels:
- conda-forge
dependencies:
- jupyterlab
- boto3
- tqdm
- matplotlib-base
- pandas
- numpy
- s3fs
- xarray
- dask
- distributed
- geopandas
- h5py>=3.10
- zarr
- kerchunk
- h5netcdf
- pip
- pip:
- git+https://github.com/betolink/filesystem_spec.git
- git+https://github.com/ICESat2-SlideRule/h5coro.git
43 changes: 28 additions & 15 deletions h5tests/h5coro_arr_mean.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,40 @@
from .h5test import H5Test, timer_decorator
import numpy as np
import subprocess

import numpy as np
from h5test import H5Test, timer_decorator

# Import h5coro, installing it on the fly when it is missing from the
# environment.
try:
    import h5coro
except ImportError:
    # Only a missing module should trigger the install; any other error
    # raised while importing h5coro is allowed to propagate unchanged.
    # check=True turns a failed install into an immediate CalledProcessError
    # instead of a confusing second ImportError below.
    completed_process = subprocess.run(
        ["pip", "install", "git+https://github.com/ICESat2-SlideRule/h5coro.git@main"],
        check=True,
    )
    import h5coro

from h5coro import h5coro, s3driver, filedriver
h5coro.config(errorChecking=True, verbose=False, enableAttributes=False)

from h5coro import h5coro, s3driver

driver = s3driver.S3Driver


class H5CoroArrMean(H5Test):
    """Benchmark that reads one variable from every file with h5coro and averages it."""

    @timer_decorator
    def run(self, dataset="/gt1l/heights", variable="h_ph"):
        """Return the mean of ``{dataset}/{variable}`` across ``self.files``.

        Parameters
        ----------
        dataset : str
            HDF5 group path that contains the variable.
        variable : str
            Name of the variable inside ``dataset``.

        Returns
        -------
        The result of ``np.mean`` over the values read from all files,
        concatenated as float64. With no files this is the mean of an empty
        array. The value seen by callers may be wrapped by
        ``timer_decorator``.
        """
        path = f"{dataset}/{variable}"
        chunks = []

        for file in self.files:
            # h5coro expects "bucket/key", without the URL scheme.
            resource = file.replace("s3://", "")
            if file.startswith("s3://nasa-cryo-persistent/"):
                h5obj = h5coro.H5Coro(resource, s3driver.S3Driver)
            else:
                # NOTE(review): the credentials key is spelled "annon" here;
                # confirm this matches what the S3 driver actually reads.
                h5obj = h5coro.H5Coro(
                    resource,
                    s3driver.S3Driver,
                    credentials={"annon": True},
                )
            ds = h5obj.readDatasets(datasets=[path], block=True)
            # Collect per-file arrays and join them once after the loop;
            # growing an array with np.insert copies it on every iteration.
            chunks.append(np.asarray(ds[path][:], dtype=np.float64).ravel())

        values = np.concatenate(chunks) if chunks else np.array([])
        return np.mean(values)
33 changes: 19 additions & 14 deletions h5tests/h5py_arr_mean.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
from .h5test import H5Test, timer_decorator
import h5py
import numpy as np
from h5test import H5Test, fsspec_logging_decorator, timer_decorator


class H5pyArrMean(H5Test):
    """Benchmark that reads one variable from every file with h5py and averages it."""

    @timer_decorator
    @fsspec_logging_decorator
    def run(self, io_params=None, dataset="/gt1l/heights", variable="h_ph"):
        """Return the mean of ``{dataset}/{variable}`` across ``self.files``.

        Parameters
        ----------
        io_params : dict, optional
            May contain ``"fsspec_params"`` (keyword arguments forwarded to
            ``self.s3_fs.open``) and ``"h5py_params"`` (keyword arguments
            forwarded to ``h5py.File``). Missing keys, or ``None``, mean no
            extra arguments.
        dataset : str
            HDF5 group path that contains the variable.
        variable : str
            Name of the variable inside ``dataset``.

        Returns
        -------
        The result of ``np.mean`` over the values read from all files,
        concatenated as float64. The value seen by callers may be wrapped
        by the decorators.

        Side effects
        ------------
        Sets ``self.file_sizes`` to the size reported by ``self.s3_fs.info``
        for each file, and prints the h5py parameters once per file.
        """
        # A None default avoids sharing one dict object between calls.
        io_params = io_params or {}
        fsspec_params = io_params.get("fsspec_params", {})
        h5py_params = io_params.get("h5py_params", {})

        self.file_sizes = [self.s3_fs.info(file)["size"] for file in self.files]

        path = f"{dataset}/{variable}"
        chunks = []
        for file in self.files:
            with self.s3_fs.open(file, mode="rb", **fsspec_params) as fo:
                print("h5py params: ", h5py_params)
                with h5py.File(fo, **h5py_params) as f:
                    # Collect per-file arrays and join them once after the
                    # loop; np.insert would copy the array every iteration.
                    chunks.append(np.asarray(f[path][:], dtype=np.float64).ravel())

        values = np.concatenate(chunks) if chunks else np.array([])
        return np.mean(values)
44 changes: 23 additions & 21 deletions h5tests/h5py_arr_subset_mean.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,51 @@
import os
import sys

from .h5test import H5Test, timer_decorator
import h5py
import numpy as np
from h5test import H5Test, fsspec_logging_decorator, timer_decorator

current = os.path.abspath('..')
current = os.path.abspath("..")
sys.path.append(current)
from helpers.geospatial import get_subset_region, get_subset_indices
from helpers.geospatial import get_subset_indices, get_subset_region


class H5pyArrSubsetMean(H5Test):
    """Benchmark that averages a spatial subset of one variable read with h5py."""

    def __init__(self, data_format, geometry=None):
        """
        data_format : passed through to ``H5Test.__init__``
        geometry : path to geojson file containing geometry
            **Could be list containing [lonmin, lonmax, latmin, latmax]**
        """
        super().__init__(data_format)
        self.bounds = get_subset_region(geometry)

    @timer_decorator
    @fsspec_logging_decorator
    def run(self, io_params=None, dataset="/gt1l/heights", variable="h_ph"):
        """Return the mean of ``{dataset}/{variable}`` restricted to ``self.bounds``.

        For every file, ``lat_ph`` and ``lon_ph`` are read from ``dataset``
        and handed to ``get_subset_indices`` together with ``self.bounds``;
        the returned start/end indices select the slice of ``variable``
        that is accumulated.

        Parameters
        ----------
        io_params : dict, optional
            May contain ``"fsspec_params"`` (keyword arguments forwarded to
            ``self.s3_fs.open``) and ``"h5py_params"`` (keyword arguments
            forwarded to ``h5py.File``). Missing keys, or ``None``, mean no
            extra arguments.
        dataset : str
            HDF5 group path that contains the coordinates and the variable.
        variable : str
            Name of the variable inside ``dataset``.

        Returns
        -------
        The result of ``np.mean`` over the selected values from all files,
        concatenated as float64. The value seen by callers may be wrapped
        by the decorators.
        """
        # Both parameter dicts always get a value, so calling run() without
        # io_params no longer fails on an unbound local name.
        io_params = io_params or {}
        fsspec_params = io_params.get("fsspec_params", {})
        h5py_params = io_params.get("h5py_params", {})

        chunks = []
        for file in self.files:
            # The remote file object gets its own context manager so it is
            # closed explicitly once the HDF5 handle has been released.
            with self.s3_fs.open(file, "rb", **fsspec_params) as fo:
                with h5py.File(fo, **h5py_params) as f:
                    lat = f[f"{dataset}/lat_ph"][:]
                    lon = f[f"{dataset}/lon_ph"][:]

                    idx_start, idx_end = get_subset_indices(lat, lon, self.bounds)

                    # Leaving this note here so that we can create a DataFrame
                    # or Dataset at a later date. Suggest creating a dict which
                    # can be passed to xarray or (geo)pandas:
                    #   lat[idx_start:idx_end]
                    #   lon[idx_start:idx_end]

                    data = f[f"{dataset}/{variable}"][idx_start:idx_end]
                    # Collect per-file slices and join them once after the
                    # loop; np.insert would copy the array every iteration.
                    chunks.append(np.asarray(data, dtype=np.float64).ravel())

        values = np.concatenate(chunks) if chunks else np.array([])
        return np.mean(values)
Loading