Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
c9a82bc
Inserting new parameter in concatenation function to allow for ignori…
luigilcsilva Sep 26, 2025
a994992
Restoring pyproject.toml
luigilcsilva Sep 29, 2025
674c47a
Merge branch 'main' into sep26-2025-implementing-ignore-empty-margin
luigilcsilva Sep 29, 2025
14969b2
Moving margin handling to a handler in concat_catalog_data.py
luigilcsilva Sep 29, 2025
159ed97
Restoring versions in pre-commit yaml.
luigilcsilva Sep 30, 2025
26afe7f
Removing asserts from concat_catalog_data.
luigilcsilva Sep 30, 2025
788c42f
Adding tests to increase coverage.
luigilcsilva Sep 30, 2025
61ee110
Changes in code to treat types clearly. New tests.
luigilcsilva Sep 30, 2025
110c35e
Ensuring comments are in English
luigilcsilva Sep 30, 2025
6bd729a
Merge remote-tracking branch 'origin/main' into sep26-2025-implementi…
luigilcsilva Sep 30, 2025
d147b6b
Merge branch 'main' into sep26-2025-implementing-ignore-empty-margin
luigilcsilva Sep 30, 2025
c20972a
Removing cast in concat_catalog_data.py
luigilcsilva Oct 1, 2025
4f6cc38
Joining the two _asset_concat_symmetry helpers into one.
luigilcsilva Oct 1, 2025
5ed1fd9
Changing imported module names to be clearer.
luigilcsilva Oct 1, 2025
61010d0
Simplifying one of the tests, from approximation to an exact check.
luigilcsilva Oct 1, 2025
315f162
Changing the mocks to use MagicMock.
luigilcsilva Oct 1, 2025
0dc7885
Adding strict check of ra and dec columns names. Updating tests accor…
luigilcsilva Oct 1, 2025
c284f99
Merge branch 'main' into sep26-2025-implementing-ignore-empty-margin
luigilcsilva Oct 6, 2025
afe7806
Merge branch 'main' into sep26-2025-implementing-ignore-empty-margin
luigilcsilva Oct 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 25 additions & 54 deletions src/lsdb/catalog/catalog.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import warnings
from pathlib import Path
from typing import Any, Callable, Iterable, Type

Expand All @@ -24,7 +23,7 @@
from lsdb.core.crossmatch.crossmatch_algorithms import BuiltInCrossmatchAlgorithm
from lsdb.core.search.abstract_search import AbstractSearch
from lsdb.core.search.index_search import IndexSearch
from lsdb.dask.concat_catalog_data import concat_catalog_data, concat_margin_data
from lsdb.dask.concat_catalog_data import _assert_same_ra_dec, concat_catalog_data, handle_margins_for_concat
from lsdb.dask.crossmatch_catalog_data import crossmatch_catalog_data, crossmatch_catalog_data_nested
from lsdb.dask.join_catalog_data import (
join_catalog_data_nested,
Expand Down Expand Up @@ -364,68 +363,40 @@ def crossmatch_nested(
def concat(
self,
other: Catalog,
*,
ignore_empty_margins: bool = False,
**kwargs,
) -> Catalog:
"""
Concatenate two catalogs by aligned HEALPix pixels.
"""Concatenate two catalogs by aligned HEALPix pixels.

Args:
other (Catalog): The catalog to concatenate with.
**kwargs: Extra arguments forwarded to internal `pandas.concat` calls.
other (Catalog): Catalog to concatenate with.
ignore_empty_margins (bool, optional): If True, keep the available margin
when only one side has it (treated as incomplete). If False, drop
margins when only one side has them. Defaults to False.
**kwargs: Extra arguments forwarded to internal `pandas.concat`.

Returns:
Catalog: A new catalog whose partitions correspond to the OUTER pixel alignment
and whose rows are the per-pixel concatenation of both inputs. If both
inputs provide a margin, the result includes a concatenated margin
dataset as described above.
Catalog: New catalog with OUTER pixel alignment. If both inputs have a
margin — or if `ignore_empty_margins=True` and at least one side has it —
the result includes a concatenated margin dataset.

Raises:
Warning: If only one side has a margin, a warning is emitted and the result will
not include a margin dataset.

Notes:
- The main (non-margin) alignment is filtered by the catalogs’ MOCs when
available; the pixel-tree alignment itself is OUTER, so pixels present on
either side are preserved (within the MOC filter).
- This is a stacking operation, not a row-wise join or crossmatch; no
deduplication or key-based matching is applied.
- Column dtypes may be upcast by pandas to accommodate the unioned schema.
Row/column order is not guaranteed to be stable.
- `**kwargs` are forwarded to the internal pandas concatenations (e.g.,
`ignore_index`, etc.).
ValueError: If RA/Dec column names differ between the input catalogs, or
between a catalog and its own margin.
"""
# check if the catalogs have margins
margin = None
if self.margin is None and other.margin is not None:
warnings.warn(
"Left catalog has no margin, result will not include margin data.",
)

if self.margin is not None and other.margin is None:
warnings.warn(
"Right catalog has no margin, result will not include margin data.",
)

if self.margin is not None and other.margin is not None:
smallest_margin_radius = min(
self.margin.hc_structure.catalog_info.margin_threshold or 0,
other.margin.hc_structure.catalog_info.margin_threshold or 0,
)

margin_ddf, margin_ddf_map, margin_alignment = concat_margin_data(
self, other, smallest_margin_radius, **kwargs
)
margin_hc_catalog = self.margin.hc_structure.__class__(
self.margin.hc_structure.catalog_info,
margin_alignment.pixel_tree,
)
margin = self.margin._create_updated_dataset(
ddf=margin_ddf,
ddf_pixel_map=margin_ddf_map,
hc_structure=margin_hc_catalog,
updated_catalog_info_params={"margin_threshold": smallest_margin_radius},
)
# Fail fast if RA/Dec columns differ between the two catalogs.
_assert_same_ra_dec(self, other, context="Catalog concat")

# Delegate margin handling to helper (which also validates catalog vs margin)
margin = handle_margins_for_concat(
self,
other,
ignore_empty_margins=ignore_empty_margins,
**kwargs,
)

# Main catalog concatenation
ddf, ddf_map, alignment = concat_catalog_data(self, other, **kwargs)
hc_catalog = self.hc_structure.__class__(
self.hc_structure.catalog_info,
Expand Down
148 changes: 148 additions & 0 deletions src/lsdb/dask/concat_catalog_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import pandas as pd
Expand All @@ -20,6 +21,54 @@

if TYPE_CHECKING:
from lsdb.catalog.catalog import Catalog
from lsdb.catalog.margin_catalog import MarginCatalog


def _get_ra_dec_from_info(hc_dataset) -> tuple[str | None, str | None]:
    """Look up the RA/Dec column names stored in a dataset's catalog_info.

    Args:
        hc_dataset: Object exposing ``hc_structure.catalog_info``.

    Returns:
        tuple[str | None, str | None]: ``(ra_column, dec_column)``. An entry is
        None when catalog_info has no such attribute; the caller decides how
        to treat missing names.
    """
    catalog_info = hc_dataset.hc_structure.catalog_info  # type: ignore[attr-defined]
    ra_name = getattr(catalog_info, "ra_column", None)
    dec_name = getattr(catalog_info, "dec_column", None)
    return ra_name, dec_name


def _assert_same_ra_dec(
    left_ds,
    right_ds,
    *,
    context: str,
) -> None:
    """Verify that two datasets declare identical RA/Dec column names.

    Args:
        left_ds: Left dataset (Catalog or MarginCatalog).
        right_ds: Right dataset (Catalog or MarginCatalog).
        context (str): Prefix placed at the start of the error message to
            identify which comparison failed.

    Raises:
        ValueError: If any of the four column names is missing or empty, or
            if the left and right names are not equal.
    """
    left_ra, left_dec = _get_ra_dec_from_info(left_ds)
    right_ra, right_dec = _get_ra_dec_from_info(right_ds)

    # All four names must be present (truthy) before a comparison is meaningful.
    if not all((left_ra, left_dec, right_ra, right_dec)):
        raise ValueError(
            f"{context}: RA/Dec column names must be defined on both sides "
            f"(left: ra={left_ra!r}, dec={left_dec!r}; right: ra={right_ra!r}, dec={right_dec!r})."
        )

    # Exact match on both names: nothing to report.
    if (left_ra, left_dec) == (right_ra, right_dec):
        return

    raise ValueError(
        f"{context}: incompatible RA/Dec columns "
        f"(left: ra={left_ra!r}, dec={left_dec!r} vs right: ra={right_ra!r}, dec={right_dec!r}). "
        "Please rename columns so both catalogs (and their margins) share the same RA/Dec names "
        "before calling concat()."
    )


def _check_strict_column_types(meta1: pd.DataFrame, meta2: pd.DataFrame):
Expand Down Expand Up @@ -403,3 +452,102 @@ def concat_margin_data(
)

return construct_catalog_args(joined_partitions, meta_df, alignment)


# pylint: disable=too-many-locals
def handle_margins_for_concat(
    left: Catalog,
    right: Catalog,
    *,
    ignore_empty_margins: bool,
    **kwargs,
) -> MarginCatalog | None:
    """Apply the margin policy of Catalog.concat() and build the result margin.

    Outcomes by margin availability:

    - Neither side has a margin: returns None.
    - Both sides have a margin: the margins are concatenated using the smaller
      of the two ``margin_threshold`` values (a missing threshold counts as 0).
    - Exactly one side has a margin: if ``ignore_empty_margins`` is False a
      warning is emitted and None is returned; if True a warning is emitted
      and the margin is built from the existing side at its own threshold.
      How the side without a margin is treated is delegated to
      ``concat_margin_data``.

    Args:
        left (Catalog): Left catalog.
        right (Catalog): Right catalog.
        ignore_empty_margins (bool): Whether to keep the available margin when
            only one side has one.
        **kwargs: Extra keyword arguments forwarded to `concat_margin_data`.

    Returns:
        MarginCatalog | None: Concatenated margin catalog, or None if margins
        are not retained.

    Raises:
        ValueError: Propagated from `_assert_same_ra_dec` when a catalog and
            its own margin disagree on RA/Dec column names.
    """
    # Read once; helps both runtime clarity and type checkers.
    lm: MarginCatalog | None = left.margin
    rm: MarginCatalog | None = right.margin

    # Each existing margin must agree with its owning catalog on RA/Dec names.
    if lm is not None:
        _assert_same_ra_dec(left, lm, context="Left catalog vs left margin")
    if rm is not None:
        _assert_same_ra_dec(right, rm, context="Right catalog vs right margin")

    # Select the margin that supplies the class and catalog_info of the result.
    # The left margin wins when both exist. The if/elif/else form narrows the
    # type to MarginCatalog without casts, asserts, or unreachable branches.
    if lm is not None:
        template: MarginCatalog = lm
    elif rm is not None:
        template = rm
    else:
        # Neither side has a margin: nothing to do.
        return None

    if lm is None or rm is None:
        # Exactly one side has a margin.
        if not ignore_empty_margins:
            # Legacy behavior: drop margins entirely.
            warnings.warn(
                "One side has no margin; result will not include margin data. "
                "Set ignore_empty_margins=True to keep the available margin "
                "(treated as incomplete).",
            )
            return None

        # Keep the available margin, using the existing side's radius.
        radius = template.hc_structure.catalog_info.margin_threshold or 0.0
        warnings.warn(
            "ignore_empty_margins=True and only one side has a margin: the "
            "missing margin is treated as empty at the same radius. The "
            "resulting concatenated margin may be incomplete.",
        )
    else:
        # Both sides have margins: use the smallest radius of the two.
        radius = min(
            lm.hc_structure.catalog_info.margin_threshold or 0.0,
            rm.hc_structure.catalog_info.margin_threshold or 0.0,
        )

    # Single build path shared by the "both margins" and "one margin kept" cases.
    margin_ddf, margin_ddf_map, margin_alignment = concat_margin_data(left, right, radius, **kwargs)
    margin_hc_catalog = template.hc_structure.__class__(
        template.hc_structure.catalog_info,
        margin_alignment.pixel_tree,
    )
    return template._create_updated_dataset(  # pylint: disable=protected-access
        ddf=margin_ddf,
        ddf_pixel_map=margin_ddf_map,
        hc_structure=margin_hc_catalog,
        updated_catalog_info_params={"margin_threshold": radius},
    )
Loading
Loading