# Source code for sccellfie.io.segmentation

"""
Cell segmentation polygon loaders.

Generic loader for vertex-table segmentation files (Xenium parquet,
CSV/TSV, optionally gzipped) plus convenience wrappers. Column names
are auto-detected when not supplied. ``geopandas`` and ``shapely`` are
imported lazily so users who only score non-spatial data don't need
them installed.
"""
import os
from typing import Optional, Union

import numpy as np
import pandas as pd
from tqdm import tqdm


def _require_geo():
    try:
        import geopandas as gpd
        from shapely.geometry import Polygon, MultiPolygon
    except ImportError as e:
        raise ImportError(
            "Loading cell segmentations requires geopandas and shapely. "
            "Install via: pip install geopandas shapely"
        ) from e
    return gpd, Polygon, MultiPolygon



# [docs]
def load_segmentation(
    filepath: Union[str, "os.PathLike[str]"],
    cell_ids: Optional[np.ndarray] = None,
    cell_id_col: Optional[str] = None,
    vertex_x_col: Optional[str] = None,
    vertex_y_col: Optional[str] = None,
    output: str = "geodataframe",
) -> Union["gpd.GeoDataFrame", dict]:
    """
    Load cell boundary polygons from a segmentation file.

    Generic loader for any vertex-table format (one row per polygon
    vertex). Supports Xenium parquet, CSV.gz, CSV, TSV, and TSV.gz with
    auto-detection of column names.

    Rows are grouped by cell ID and each group is turned into a polygon.
    Groups with fewer than three vertices are skipped. Invalid polygons are
    repaired with ``buffer(0)``; if the result is a ``MultiPolygon`` only
    its largest-area part is kept. Cells whose geometry is empty, or whose
    construction raises, are skipped.

    Parameters
    ----------
    filepath : str or os.PathLike
        Path to the cell boundaries file. Accepted extensions (matched
        case-insensitively) are ``.parquet``, ``.csv.gz``, ``.csv``,
        ``.tsv``, and ``.tsv.gz``.

    cell_ids : np.ndarray, optional (default: None)
        If provided, only load polygons for these cell IDs.

    cell_id_col : str, optional (default: None)
        Column name for cell identifiers. Auto-detected if None. Tries
        ``"cell_id"``, ``"ID"``, ``"id"``, ``"cell_ID"`` in that order.

    vertex_x_col : str, optional (default: None)
        Column name for vertex x-coordinates. Auto-detected if None.
        Tries ``"vertex_x"``, ``"x_location"``, ``"X"``.

    vertex_y_col : str, optional (default: None)
        Column name for vertex y-coordinates. Auto-detected if None.
        Tries ``"vertex_y"``, ``"y_location"``, ``"Y"``.

    output : {"geodataframe", "dict"}, optional (default: "geodataframe")
        Return format. ``"geodataframe"`` returns a GeoDataFrame indexed
        by cell ID with ``centroid_x`` / ``centroid_y`` columns.
        ``"dict"`` returns a mapping of ``cell_id -> shapely.Polygon``.

    Returns
    -------
    geopandas.GeoDataFrame or dict
        Cell boundary polygons in the requested format.

    Raises
    ------
    ValueError
        If ``output`` is not one of the accepted values, if the file
        extension is unsupported, or if a column name cannot be
        auto-detected.
    ImportError
        If ``geopandas`` / ``shapely`` are not installed.
    """
    # Validate cheap arguments first so mistakes surface before the heavy
    # optional imports and before any file is read.
    if output not in ("geodataframe", "dict"):
        raise ValueError(
            f"Unknown output format: {output!r}. "
            "Expected 'geodataframe' or 'dict'"
        )
    path = os.fspath(filepath)  # accept pathlib.Path and other PathLike objects
    reader, reader_kwargs = _select_reader(path)

    gpd, Polygon, MultiPolygon = _require_geo()

    df = reader(path, **reader_kwargs)

    if cell_id_col is None:
        cell_id_col = _detect_column(
            df.columns, ("cell_id", "ID", "id", "cell_ID"), "cell ID"
        )
    if vertex_x_col is None:
        vertex_x_col = _detect_column(
            df.columns, ("vertex_x", "x_location", "X"), "x-coordinate"
        )
    if vertex_y_col is None:
        vertex_y_col = _detect_column(
            df.columns, ("vertex_y", "y_location", "Y"), "y-coordinate"
        )

    if cell_ids is not None:
        df = df[df[cell_id_col].isin(cell_ids)]

    polygons = {}
    for cid, group in tqdm(
        df.groupby(cell_id_col), desc="Loading segmentation", leave=False
    ):
        coords = group[[vertex_x_col, vertex_y_col]].values
        if len(coords) < 3:
            # Fewer than three vertices cannot form a polygon.
            continue
        try:
            poly = Polygon(coords)
            if not poly.is_valid:
                # buffer(0) is used here to repair invalid rings.
                poly = poly.buffer(0)
            if poly.is_empty:
                continue
            if isinstance(poly, MultiPolygon):
                # Keep a single polygon per cell: the largest part.
                poly = max(poly.geoms, key=lambda g: g.area)
            polygons[cid] = poly
        except Exception:
            # Deliberate best-effort: one malformed cell must not abort the
            # whole load, so it is skipped.
            continue

    if output == "dict":
        return polygons

    gdf = gpd.GeoDataFrame(
        geometry=list(polygons.values()),
        index=pd.Index(polygons.keys(), name=cell_id_col),
    )
    centroids = gdf.geometry.centroid  # computed once, used for both columns
    gdf["centroid_x"] = centroids.x
    gdf["centroid_y"] = centroids.y
    return gdf


def _select_reader(path: str):
    """Return ``(pandas reader, keyword arguments)`` for ``path``'s extension."""
    lower = path.lower()
    if lower.endswith(".parquet"):
        return pd.read_parquet, {}
    if lower.endswith((".csv.gz", ".csv")):
        return pd.read_csv, {}
    if lower.endswith((".tsv.gz", ".tsv")):
        return pd.read_csv, {"sep": "\t"}
    raise ValueError(
        f"Unsupported file format: {path}. "
        "Expected .parquet, .csv, .csv.gz, .tsv, or .tsv.gz"
    )


def _detect_column(columns, candidates, label: str) -> str:
    """Return the first of ``candidates`` present in ``columns``, else raise."""
    for candidate in candidates:
        if candidate in columns:
            return candidate
    raise ValueError(
        f"Could not find {label} column. Available: {columns.tolist()}"
    )




# [docs]
def load_xenium_segmentation(
    filepath: str,
    cell_ids: Optional[np.ndarray] = None,
    cell_id_col: Optional[str] = None,
    vertex_x_col: Optional[str] = None,
    vertex_y_col: Optional[str] = None,
    output: str = "geodataframe",
) -> Union["gpd.GeoDataFrame", dict]:
    """
    Load cell boundaries from a Xenium ``cell_boundaries`` file.

    Discoverability alias: every argument is forwarded unchanged to
    :func:`load_segmentation`, and its result is returned as-is. Xenium
    ``cell_boundaries`` files use the column names that
    :func:`load_segmentation` auto-detects by default (``cell_id``,
    ``vertex_x``, ``vertex_y``), so the column arguments can normally be
    left as None.

    See :func:`load_segmentation` for parameter and return documentation.
    """
    # Column-name overrides, grouped so the forwarding call stays short.
    column_overrides = {
        "cell_id_col": cell_id_col,
        "vertex_x_col": vertex_x_col,
        "vertex_y_col": vertex_y_col,
    }
    return load_segmentation(
        filepath=filepath,
        cell_ids=cell_ids,
        output=output,
        **column_overrides,
    )




# [docs]
def load_segmentation_from_gdf(gdf, geometry_col: str = "geometry"):
    """
    Ensure a pre-loaded GeoDataFrame carries centroid columns.

    ``centroid_x`` and ``centroid_y`` are each added only when a column of
    that name is not already present; existing columns are left untouched.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame with polygon geometries.

    geometry_col : str, optional (default: "geometry")
        Name of the geometry column. When it differs from ``"geometry"``,
        the frame returned by ``gdf.set_geometry(geometry_col)`` is the one
        that is modified and returned; otherwise the columns are assigned
        on the object that was passed in.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with centroid columns present.

    Raises
    ------
    ImportError
        If ``geopandas`` / ``shapely`` are not installed.
    """
    # Called only for its check that the optional geo stack is importable.
    _require_geo()

    uses_default_geometry = geometry_col == "geometry"
    if not uses_default_geometry:
        gdf = gdf.set_geometry(geometry_col)

    # Decide what is missing up front; adding one column does not affect the
    # presence of the other.
    existing = gdf.columns
    needs_x = "centroid_x" not in existing
    needs_y = "centroid_y" not in existing

    if needs_x:
        gdf["centroid_x"] = gdf.geometry.centroid.x
    if needs_y:
        gdf["centroid_y"] = gdf.geometry.centroid.y

    return gdf