Source code for sccellfie.io.segmentation

"""
Cell segmentation polygon loaders.

Generic loader for vertex-table segmentation files (Xenium parquet,
CSV/TSV, optionally gzipped) plus convenience wrappers. Column names
are auto-detected when not supplied. ``geopandas`` and ``shapely`` are
imported lazily so users who only score non-spatial data don't need
them installed.
"""
from typing import Optional, Union

import numpy as np
import pandas as pd
from tqdm import tqdm


def _require_geo():
    try:
        import geopandas as gpd
        from shapely.geometry import Polygon, MultiPolygon
    except ImportError as e:
        raise ImportError(
            "Loading cell segmentations requires geopandas and shapely. "
            "Install via: pip install geopandas shapely"
        ) from e
    return gpd, Polygon, MultiPolygon


[docs] def load_segmentation( filepath: str, cell_ids: Optional[np.ndarray] = None, cell_id_col: Optional[str] = None, vertex_x_col: Optional[str] = None, vertex_y_col: Optional[str] = None, output: str = "geodataframe", ) -> Union["gpd.GeoDataFrame", dict]: """ Load cell boundary polygons from a segmentation file. Generic loader for any vertex-table format (one row per polygon vertex). Supports Xenium parquet, CSV.gz, CSV, TSV, and TSV.gz with auto-detection of column names. Parameters ---------- filepath : str Path to the cell boundaries file. Accepted extensions are ``.parquet``, ``.csv.gz``, ``.csv``, ``.tsv``, and ``.tsv.gz``. cell_ids : np.ndarray, optional (default: None) If provided, only load polygons for these cell IDs. cell_id_col : str, optional (default: None) Column name for cell identifiers. Auto-detected if None. Tries ``"cell_id"``, ``"ID"``, ``"id"``, ``"cell_ID"`` in that order. vertex_x_col : str, optional (default: None) Column name for vertex x-coordinates. Auto-detected if None. Tries ``"vertex_x"``, ``"x_location"``, ``"X"``. vertex_y_col : str, optional (default: None) Column name for vertex y-coordinates. Auto-detected if None. Tries ``"vertex_y"``, ``"y_location"``, ``"Y"``. output : {"geodataframe", "dict"}, optional (default: "geodataframe") Return format. ``"geodataframe"`` returns a GeoDataFrame indexed by cell ID with ``centroid_x`` / ``centroid_y`` columns. ``"dict"`` returns a mapping of ``cell_id -> shapely.Polygon``. Returns ------- geopandas.GeoDataFrame or dict Cell boundary polygons in the requested format. """ gpd, Polygon, MultiPolygon = _require_geo() lower = filepath.lower() if lower.endswith(".parquet"): df = pd.read_parquet(filepath) elif lower.endswith(".csv.gz") or lower.endswith(".csv"): df = pd.read_csv(filepath) elif lower.endswith(".tsv.gz") or lower.endswith(".tsv"): df = pd.read_csv(filepath, sep="\t") else: raise ValueError( f"Unsupported file format: {filepath}. " "Expected .parquet, .csv, .csv.gz, .tsv, or .tsv.gz" ) if cell_id_col is None: for candidate in ("cell_id", "ID", "id", "cell_ID"): if candidate in df.columns: cell_id_col = candidate break if cell_id_col is None: raise ValueError( f"Could not find cell ID column. Available: {df.columns.tolist()}" ) if vertex_x_col is None: for candidate in ("vertex_x", "x_location", "X"): if candidate in df.columns: vertex_x_col = candidate break if vertex_x_col is None: raise ValueError( f"Could not find x-coordinate column. Available: {df.columns.tolist()}" ) if vertex_y_col is None: for candidate in ("vertex_y", "y_location", "Y"): if candidate in df.columns: vertex_y_col = candidate break if vertex_y_col is None: raise ValueError( f"Could not find y-coordinate column. Available: {df.columns.tolist()}" ) if cell_ids is not None: df = df[df[cell_id_col].isin(cell_ids)] polygons = {} for cid, group in tqdm( df.groupby(cell_id_col), desc="Loading segmentation", leave=False ): coords = group[[vertex_x_col, vertex_y_col]].values if len(coords) < 3: continue try: poly = Polygon(coords) if not poly.is_valid: poly = poly.buffer(0) if poly.is_empty: continue if isinstance(poly, MultiPolygon): poly = max(poly.geoms, key=lambda g: g.area) polygons[cid] = poly except Exception: continue if output == "dict": return polygons gdf = gpd.GeoDataFrame( geometry=list(polygons.values()), index=pd.Index(polygons.keys(), name=cell_id_col), ) gdf["centroid_x"] = gdf.geometry.centroid.x gdf["centroid_y"] = gdf.geometry.centroid.y return gdf
[docs] def load_xenium_segmentation( filepath: str, cell_ids: Optional[np.ndarray] = None, cell_id_col: Optional[str] = None, vertex_x_col: Optional[str] = None, vertex_y_col: Optional[str] = None, output: str = "geodataframe", ) -> Union["gpd.GeoDataFrame", dict]: """ Load cell boundaries from a Xenium ``cell_boundaries`` file. Thin wrapper around :func:`load_segmentation` kept for discoverability. Xenium ``cell_boundaries`` files use the default auto-detected columns (``cell_id``, ``vertex_x``, ``vertex_y``) so this is equivalent to calling :func:`load_segmentation` directly. See :func:`load_segmentation` for parameter and return documentation. """ return load_segmentation( filepath=filepath, cell_ids=cell_ids, cell_id_col=cell_id_col, vertex_x_col=vertex_x_col, vertex_y_col=vertex_y_col, output=output, )
[docs] def load_segmentation_from_gdf(gdf, geometry_col: str = "geometry"): """ Prepare a pre-loaded GeoDataFrame for downstream plotting. Adds ``centroid_x`` and ``centroid_y`` columns if missing. Parameters ---------- gdf : geopandas.GeoDataFrame GeoDataFrame with polygon geometries. geometry_col : str, optional (default: "geometry") Name of the geometry column. Returns ------- geopandas.GeoDataFrame Input GeoDataFrame with centroid columns added. """ _require_geo() if geometry_col != "geometry": gdf = gdf.set_geometry(geometry_col) if "centroid_x" not in gdf.columns: gdf["centroid_x"] = gdf.geometry.centroid.x if "centroid_y" not in gdf.columns: gdf["centroid_y"] = gdf.geometry.centroid.y return gdf