Source code for sccellfie.expression.thresholds

import json
import os
import warnings

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm



[docs]
def get_local_percentile_threshold(adata, percentile=0.75, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the local percentile threshold for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    percentile: float or list of floats, optional (default: 0.75)
        Percentile(s) to compute the threshold.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the local percentile threshold for each gene.
    """
    if use_raw:
        X = adata.raw.X.toarray()
    else:
        X = adata.X.toarray()
    if exclude_zeros:
        X = np.where(X==0, np.nan, X)
    thresholds = np.nanquantile(X, q=percentile, axis=0, method='midpoint')
    if isinstance(percentile, list):
        columns = ['threshold-{}'.format(p) for p in percentile]
    else:
        columns = ['threshold-{}'.format(percentile)]
    thresholds = pd.DataFrame(thresholds.T, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if type(lower_bound) not in (int, float, complex):
            lb = lower_bound.copy()
            lb.columns = columns
            thresholds[thresholds < lb] = lb[thresholds < lb]
        else:
            thresholds[thresholds < lower_bound] = lower_bound

    if upper_bound is not None:
        if type(upper_bound) not in (int, float, complex):
            ub = upper_bound.copy()
            ub.columns = columns
            thresholds[thresholds > ub] = ub[thresholds > ub]
        else:
            thresholds[thresholds > upper_bound] = upper_bound
    return thresholds




[docs]
def get_global_percentile_threshold(adata, percentile=0.75, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the global percentile threshold for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    percentile: float or list of floats, optional (default: 0.75)
        Percentile(s) to compute the threshold.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the global percentile threshold for each gene.
    """
    if use_raw:
        X = adata.raw.X.toarray()
    else:
        X = adata.X.toarray()
    if exclude_zeros:
        X = np.where(X == 0, np.nan, X)
    thresholds = np.nanquantile(X, q=percentile, method='midpoint')
    if isinstance(percentile, list):
        columns = ['threshold-{}'.format(p) for p in percentile]
        thresholds = pd.DataFrame({col: [thresholds[i]]*adata.shape[1] for i, col in enumerate(columns)})
    else:
        columns = ['threshold-{}'.format(percentile)]
        thresholds = pd.DataFrame(thresholds.T, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if type(lower_bound) not in (int, float, complex):
            lb = lower_bound.copy()
            lb.columns = columns
            thresholds[thresholds < lb] = lb[thresholds < lb]
        else:
            thresholds[thresholds < lower_bound] = lower_bound

    if upper_bound is not None:
        if type(upper_bound) not in (int, float, complex):
            ub = upper_bound.copy()
            ub.columns = columns
            thresholds[thresholds > ub] = ub[thresholds > ub]
        else:
            thresholds[thresholds > upper_bound] = upper_bound
    return thresholds




[docs]
def get_local_mean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the local mean threshold for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the local mean threshold for each gene.
    """
    if use_raw:
        X = adata.raw.X.toarray()
    else:
        X = adata.X.toarray()
    if exclude_zeros:
        X = np.where(X==0, np.nan, X)
    thresholds = np.nanmean(X, axis=0)
    columns = ['threshold-mean']
    thresholds = pd.DataFrame(thresholds, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if type(lower_bound) not in (int, float, complex):
            lb = lower_bound.copy()
            lb.columns = columns
            thresholds[thresholds < lb] = lb[thresholds < lb]
        else:
            thresholds[thresholds < lower_bound] = lower_bound

    if upper_bound is not None:
        if type(upper_bound) not in (int, float, complex):
            ub = upper_bound.copy()
            ub.columns = columns
            thresholds[thresholds > ub] = ub[thresholds > ub]
        else:
            thresholds[thresholds > upper_bound] = upper_bound
    return thresholds




[docs]
def get_global_mean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the global mean threshold for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the global mean threshold for each gene.
    """
    if use_raw:
        X = adata.raw.X.toarray()
    else:
        X = adata.X.toarray()
    if exclude_zeros:
        X = np.where(X == 0, np.nan, X)
    thresholds = np.nanmean(X)
    columns = ['threshold-mean']
    thresholds = pd.DataFrame(thresholds, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if type(lower_bound) not in (int, float, complex):
            lb = lower_bound.copy()
            lb.columns = columns
            thresholds[thresholds < lb] = lb[thresholds < lb]
        else:
            thresholds[thresholds < lower_bound] = lower_bound

    if upper_bound is not None:
        if type(upper_bound) not in (int, float, complex):
            ub = upper_bound.copy()
            ub.columns = columns
            thresholds[thresholds > ub] = ub[thresholds > ub]
        else:
            thresholds[thresholds > upper_bound] = upper_bound
    return thresholds




[docs]
def get_local_trimean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the local Tukey's trimean threshold for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the local Tukey's trimean threshold for each gene.
    """
    if use_raw:
        X = adata.raw.X.toarray()
    else:
        X = adata.X.toarray()

    if exclude_zeros:
        X = np.where(X == 0, np.nan, X)
    q1 = np.nanquantile(X, q=0.25, axis=0, method='midpoint')
    median = np.nanquantile(X, q=0.5, axis=0, method='midpoint')
    q3 = np.nanquantile(X, q=0.75, axis=0, method='midpoint')
    thresholds = (q1 + 2 * median + q3) / 4

    columns = ['threshold-trimean']
    thresholds = pd.DataFrame(thresholds.T, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if type(lower_bound) not in (int, float, complex):
            lb = lower_bound.copy()
            lb.columns = columns
            thresholds[thresholds < lb] = lb[thresholds < lb]
        else:
            thresholds[thresholds < lower_bound] = lower_bound

    if upper_bound is not None:
        if type(upper_bound) not in (int, float, complex):
            ub = upper_bound.copy()
            ub.columns = columns
            thresholds[thresholds > ub] = ub[thresholds > ub]
        else:
            thresholds[thresholds > upper_bound] = upper_bound
    return thresholds




[docs]
def get_global_trimean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the global Tukey's trimean threshold for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have the same number of genes
        as the adata object.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the global Tukey's trimean threshold for each gene.
    """
    if use_raw:
        X = adata.raw.X.toarray()
    else:
        X = adata.X.toarray()

    if exclude_zeros:
        X = np.where(X == 0, np.nan, X)
    q1 = np.nanquantile(X, q=0.25, method='midpoint')
    median = np.nanquantile(X, q=0.5, method='midpoint')
    q3 = np.nanquantile(X, q=0.75, method='midpoint')
    thresholds = (q1 + 2 * median + q3) / 4

    columns = ['threshold-trimean']
    thresholds = pd.DataFrame(thresholds, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if type(lower_bound) not in (int, float, complex):
            lb = lower_bound.copy()
            lb.columns = columns
            thresholds[thresholds < lb] = lb[thresholds < lb]
        else:
            thresholds[thresholds < lower_bound] = lower_bound

    if upper_bound is not None:
        if type(upper_bound) not in (int, float, complex):
            ub = upper_bound.copy()
            ub.columns = columns
            thresholds[thresholds > ub] = ub[thresholds > ub]
        else:
            thresholds[thresholds > upper_bound] = upper_bound
    return thresholds



class _ReservoirSampler:
    """Vectorized Vitter Algorithm R — uniform sample of size `size` from an unbounded stream."""

    def __init__(self, size, rng):
        self.size = int(size)
        self.rng = rng
        self.reservoir = np.empty(self.size, dtype=np.float32)
        self.count = 0

    def update(self, values):
        values = np.asarray(values, dtype=np.float32).ravel()
        n = values.size
        if n == 0:
            return
        if self.count < self.size:
            fill = min(self.size - self.count, n)
            self.reservoir[self.count:self.count + fill] = values[:fill]
            self.count += fill
            if fill == n:
                return
            values = values[fill:]
            n = values.size
        j = np.arange(self.count + 1, self.count + n + 1, dtype=np.int64)
        r = (self.rng.random(n) * j).astype(np.int64)
        keep = r < self.size
        if keep.any():
            self.reservoir[r[keep]] = values[keep]
        self.count += n

    def sample(self):
        return self.reservoir[:min(self.count, self.size)]


_N_COUNTS_AUTO_KEYS = ('total_counts', 'n_counts', 'raw_sum', 'nCount_RNA')


def _load_gene_set(gene_set, organism):
    """Normalize the ``gene_set`` argument into a list of gene symbols."""
    if gene_set is None:
        from sccellfie.datasets.database import load_sccellfie_database
        db = load_sccellfie_database(organism=organism)
        return list(db['thresholds'].index)
    if isinstance(gene_set, str):
        if not gene_set.lower().endswith('.json'):
            raise ValueError("String `gene_set` must point to a .json file containing a list of gene symbols.")
        with open(os.path.expanduser(gene_set)) as fp:
            loaded = json.load(fp)
        if not isinstance(loaded, list):
            raise ValueError(f"Expected a JSON list in {gene_set}, got {type(loaded).__name__}.")
        return list(loaded)
    return list(gene_set)


def _resolve_cell_index(adata, cell_mask):
    """Return a sorted integer array of selected cells."""
    n_cells = adata.n_obs
    if cell_mask is None:
        return np.arange(n_cells)
    if isinstance(cell_mask, str):
        if cell_mask not in adata.obs.columns:
            raise KeyError(f"`cell_mask='{cell_mask}'` not found in adata.obs.")
        mask = np.asarray(adata.obs[cell_mask].values).astype(bool)
    elif isinstance(cell_mask, pd.Series):
        mask = cell_mask.reindex(adata.obs_names).fillna(False).astype(bool).values
    else:
        mask = np.asarray(cell_mask)
        if mask.dtype == bool:
            if mask.shape[0] != n_cells:
                raise ValueError(f"Boolean `cell_mask` length {mask.shape[0]} != adata.n_obs {n_cells}.")
        else:
            idx = np.asarray(mask, dtype=np.int64)
            mask = np.zeros(n_cells, dtype=bool)
            mask[idx] = True
    return np.where(mask)[0]


def _source_var_names(adata, use_raw):
    if use_raw:
        if adata.raw is None:
            raise ValueError("`use_raw=True` but adata.raw is None.")
        return list(adata.raw.var_names)
    return list(adata.var_names)


def _get_chunk_matrix(chunk_adata, layer, use_raw):
    if use_raw:
        return chunk_adata.raw.X
    if layer is not None:
        return chunk_adata.layers[layer]
    return chunk_adata.X



[docs]
def get_sccellfie_dataset_threshold(adata,
                                    gene_set=None,
                                    organism='human',
                                    cell_mask=None,
                                    layer=None,
                                    use_raw=False,
                                    target_sum=10_000,
                                    n_counts_key=None,
                                    chunk_size=100_000,
                                    reservoir_size=5_000_000,
                                    percentiles=(10, 25, 50, 75, 90, 95),
                                    lower_percentile=25,
                                    upper_percentile=75,
                                    random_state=None,
                                    verbose=True,
                                    return_stats=False):
    """
    Computes a dataset-wise ``sccellfie_threshold`` per metabolic gene by streaming
    the AnnData in chunks. Faithful port of the atlas-based threshold script that
    produced the default ``Thresholds.csv``, generalized to a single (possibly backed)
    AnnData.

    Pipeline per chunk:
      1. CP10k-normalize using a per-cell library size (obs column or computed from the full chunk).
      2. Subset to the corrected metabolic-gene columns (after applying ``CORRECT_GENES[organism]``).
      3. Accumulate per-gene sum, non-zero cell count, and max.
      4. Stream non-zero normalized values into a reservoir sample for global percentiles.

    The final threshold rule matches the original script (with configurable bounds):
        if max > P_lower or max == 0:  threshold = clip(nonzero_mean, P_lower, P_upper)
        else:                          threshold = nonzero_mean
    where ``P_lower`` / ``P_upper`` default to P25 / P75 (the original atlas behavior)
    and are controlled by ``lower_percentile`` / ``upper_percentile``.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. May be backed (``sc.read_h5ad(..., backed='r')``);
        chunks are materialized one at a time.

    gene_set : list, set, pandas.Index, str or None, optional (default: None)
        Metabolic gene list. ``None`` loads the default gene list from the scCellFie
        database for ``organism``. A string ending in ``.json`` is treated as the path
        to a JSON file containing a list of gene symbols.

    organism : str, optional (default: 'human')
        Used to select the ``CORRECT_GENES`` rename map and, if ``gene_set`` is None,
        the scCellFie database to load metabolic genes from. Currently ``'human'`` or ``'mouse'``.

    cell_mask : array-like, str or None, optional (default: None)
        Restricts the computation to a subset of cells. Accepts a boolean/integer array,
        a column name in ``adata.obs``, or a ``pandas.Series`` indexed by cell names.

    layer : str or None, optional (default: None)
        Read from ``adata.layers[layer]`` instead of ``adata.X``. Mutually exclusive with ``use_raw``.

    use_raw : bool, optional (default: False)
        Read from ``adata.raw.X``. Mutually exclusive with ``layer``.

    target_sum : float or None, optional (default: 10_000)
        Target library size for CP-normalization. Pass ``None`` to skip normalization
        (e.g. when the input values are already on the desired scale).

    n_counts_key : str or None, optional (default: None)
        Column in ``adata.obs`` containing per-cell totals. If None, auto-detect among
        ``('total_counts', 'n_counts', 'raw_sum', 'nCount_RNA')`` and otherwise compute
        per-cell sums from the full-matrix chunk before gene subsetting.

    chunk_size : int, optional (default: 100_000)
        Number of cells processed per chunk.

    reservoir_size : int, optional (default: 5_000_000)
        Size of the reservoir used to estimate global percentiles of non-zero normalized
        values. Memory cost is ``reservoir_size * 4B`` (float32).

    percentiles : tuple of int, optional (default: (10, 25, 50, 75, 90, 95))
        Percentiles to *report* in the returned stats. Always merged with
        ``{lower_percentile, upper_percentile}`` so the rule's bounds are also
        available for inspection.

    lower_percentile, upper_percentile : int or float, optional (default: 25 and 75)
        Percentile bounds used by the clip rule. The threshold for each gene is
        ``clip(nonzero_mean, P_lower, P_upper)`` when the gene's max value exceeds
        ``P_lower`` or is zero (the low-expression escape); otherwise the raw
        ``nonzero_mean`` is used. Must satisfy
        ``0 <= lower_percentile < upper_percentile <= 100``. Defaults reproduce the
        original atlas-derived ``sccellfie_threshold`` exactly.

    random_state : int or None, optional (default: None)
        Seed for the reservoir sampler.

    verbose : bool, optional (default: True)
        If True, print progress via tqdm.

    return_stats : bool, optional (default: False)
        If True, also return a dict with intermediate statistics.

    Returns
    -------
    thresholds : pandas.DataFrame
        A DataFrame indexed by metabolic gene symbol with a single column
        ``'sccellfie_threshold'``. Ready to pass to ``compute_gene_scores``
        (which selects the first column positionally).

    stats : dict, only if ``return_stats=True``
        Dict with keys ``percentiles``, ``sum_per_gene``, ``nnz_per_gene``,
        ``max_per_gene``, ``mean``, ``nonzero_mean``, ``n_cells``,
        ``n_values_seen``, ``reservoir_size_used``.
    """
    if use_raw and layer is not None:
        raise ValueError("`use_raw=True` and `layer` are mutually exclusive.")

    if not (0 <= lower_percentile < upper_percentile <= 100):
        raise ValueError(
            f"Require 0 <= lower_percentile < upper_percentile <= 100; "
            f"got lower_percentile={lower_percentile}, upper_percentile={upper_percentile}."
        )

    from sccellfie.preprocessing.prepare_inputs import CORRECT_GENES

    rename_map = CORRECT_GENES.get(organism, {})
    source_var = _source_var_names(adata, use_raw)
    corrected_var = np.array([rename_map.get(g, g) for g in source_var])

    gene_list = _load_gene_set(gene_set, organism)
    gene_set_set = set(gene_list)
    col_idx = np.where(np.array([g in gene_set_set for g in corrected_var]))[0]
    if col_idx.size == 0:
        raise ValueError("No overlap between `gene_set` and adata variables "
                         "(after applying CORRECT_GENES). Check `organism` and gene nomenclature.")
    final_gene_names = corrected_var[col_idx].tolist()

    cell_idx_full = _resolve_cell_index(adata, cell_mask)
    n_cells_sel = cell_idx_full.size
    if n_cells_sel == 0:
        raise ValueError("`cell_mask` selected zero cells.")

    already_normalized = (adata.uns.get('normalization', {}).get('method') == 'total_counts')
    do_normalize = (target_sum is not None) and (not already_normalized)

    resolved_key = None
    if do_normalize:
        if n_counts_key is not None:
            if n_counts_key not in adata.obs.columns:
                raise KeyError(f"`n_counts_key='{n_counts_key}'` not found in adata.obs.")
            resolved_key = n_counts_key
        else:
            for k in _N_COUNTS_AUTO_KEYS:
                if k in adata.obs.columns:
                    resolved_key = k
                    break

    n_genes = col_idx.size
    sum_per_gene = np.zeros(n_genes, dtype=np.float64)
    nnz_per_gene = np.zeros(n_genes, dtype=np.int64)
    max_per_gene = np.zeros(n_genes, dtype=np.float64)

    rng = np.random.default_rng(random_state)
    reservoir = _ReservoirSampler(reservoir_size, rng)

    chunk_starts = list(range(0, n_cells_sel, chunk_size))
    iterator = tqdm(chunk_starts, desc='Streaming chunks', disable=not verbose)

    for start in iterator:
        end = min(start + chunk_size, n_cells_sel)
        idx = cell_idx_full[start:end]

        chunk_adata = adata[idx]
        if getattr(adata, 'isbacked', False):
            chunk_adata = chunk_adata.to_memory()

        X_full = _get_chunk_matrix(chunk_adata, layer, use_raw)

        if do_normalize:
            if resolved_key is not None:
                n_counts_chunk = np.asarray(adata.obs[resolved_key].values[idx], dtype=np.float64)
            else:
                if sparse.issparse(X_full):
                    n_counts_chunk = np.asarray(X_full.sum(axis=1)).ravel().astype(np.float64)
                else:
                    n_counts_chunk = np.asarray(X_full.sum(axis=1), dtype=np.float64).ravel()
            safe = n_counts_chunk > 0
            scaling = np.zeros_like(n_counts_chunk)
            scaling[safe] = target_sum / n_counts_chunk[safe]
        else:
            scaling = None

        X_sub = X_full[:, col_idx]

        if scaling is not None:
            if sparse.issparse(X_sub):
                X_norm = sparse.diags(scaling, 0, format='csr') @ X_sub.tocsr()
            else:
                X_norm = np.asarray(X_sub) * scaling[:, None]
        else:
            X_norm = X_sub

        if sparse.issparse(X_norm):
            X_csr = X_norm.tocsr()
            sum_per_gene += np.asarray(X_csr.sum(axis=0)).ravel()
            nnz_per_gene += np.asarray((X_csr > 0).sum(axis=0)).ravel().astype(np.int64)
            col_max = np.asarray(X_csr.max(axis=0).todense()).ravel()
            np.maximum(max_per_gene, col_max, out=max_per_gene)
            data = X_csr.data
            nz_values = data[data > 0]
        else:
            X_dense = np.asarray(X_norm)
            sum_per_gene += X_dense.sum(axis=0)
            nnz_per_gene += (X_dense > 0).sum(axis=0).astype(np.int64)
            col_max = X_dense.max(axis=0) if X_dense.size else np.zeros(n_genes)
            np.maximum(max_per_gene, col_max, out=max_per_gene)
            nz_values = X_dense[X_dense > 0]

        reservoir.update(nz_values)

    required = {lower_percentile, upper_percentile}
    all_pcts = sorted(set(percentiles) | required)
    sample = reservoir.sample()
    if sample.size == 0:
        warnings.warn("No non-zero values encountered; returning zero thresholds.", UserWarning)
        pct_values = np.zeros(len(all_pcts))
    else:
        pct_values = np.percentile(sample, all_pcts)
    pct_dict = dict(zip(all_pcts, pct_values))
    p_lo = float(pct_dict[lower_percentile])
    p_hi = float(pct_dict[upper_percentile])

    nz_mean = np.zeros(n_genes, dtype=np.float64)
    has_nz = nnz_per_gene > 0
    nz_mean[has_nz] = sum_per_gene[has_nz] / nnz_per_gene[has_nz]

    clip_mask = (max_per_gene > p_lo) | (max_per_gene == 0)
    threshold = np.where(clip_mask, np.clip(nz_mean, p_lo, p_hi), nz_mean)

    thresholds_df = pd.DataFrame({'sccellfie_threshold': threshold},
                                 index=pd.Index(final_gene_names, name='symbol'))

    if not return_stats:
        return thresholds_df

    mean = np.zeros(n_genes, dtype=np.float64)
    if n_cells_sel > 0:
        mean = sum_per_gene / n_cells_sel
    def _pct_key(p):
        return int(p) if float(p).is_integer() else float(p)

    stats = {
        'percentiles': {_pct_key(p): float(v) for p, v in pct_dict.items()},
        'sum_per_gene': pd.Series(sum_per_gene, index=final_gene_names),
        'nnz_per_gene': pd.Series(nnz_per_gene, index=final_gene_names),
        'max_per_gene': pd.Series(max_per_gene, index=final_gene_names),
        'mean': pd.Series(mean, index=final_gene_names),
        'nonzero_mean': pd.Series(nz_mean, index=final_gene_names),
        'n_cells': int(n_cells_sel),
        'n_values_seen': int(reservoir.count),
        'reservoir_size_used': int(min(reservoir.count, reservoir.size)),
    }
    return thresholds_df, stats




[docs]
def set_manual_threshold(adata, threshold):
    """
    Sets a threshold manually for each gene in a AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix.

    threshold: float or list of floats
        Threshold(s) to be set for each gene. If a list is passed
        it must have the same number of elements as genes in adata, and
        in the same order.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with the manual threshold for each gene.
    """
    if isinstance(threshold, list):
        assert len(adata.var_names) == len(threshold), "The len of threshold must be the same as gene number in adata"
        thresholds = pd.DataFrame(data={'threshold-manual': threshold}, index=adata.var_names, dtype=float)
    else:
        thresholds = pd.DataFrame(data={'threshold-manual': [threshold]}, index=adata.var_names, dtype=float)
    return thresholds