Source code for sccellfie.expression.thresholds

import json
import os
import warnings

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm


def get_local_percentile_threshold(adata, percentile=0.75, lower_bound=1e-5, upper_bound=None, exclude_zeros=False,
                                   use_raw=False):
    """
    Obtains the local (per-gene) percentile threshold for each gene in an AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. ``adata.X`` (or ``adata.raw.X``) may be sparse or dense.

    percentile: float or list/tuple/array of floats, optional (default: 0.75)
        Quantile(s) in the [0, 1] range used to compute the threshold. One output
        column named ``'threshold-<q>'`` is produced per value.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. Any scalar number (including NumPy scalars) is
        applied to every gene. If a pandas.DataFrame is provided, it must have the same
        genes (index) as the adata object and as many columns as computed thresholds;
        its columns are renamed to match the output columns.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. Same conventions as ``lower_bound``.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold. Genes with no non-zero
        value get a NaN threshold, which the bounds leave untouched.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object (genes x percentiles) with the local percentile
        threshold for each gene.
    """
    X = adata.raw.X if use_raw else adata.X
    # Dense arrays have no ``toarray``; only densify containers that provide it.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    if exclude_zeros:
        # NaNs are skipped by nanquantile, so zeros do not contribute.
        X = np.where(X == 0, np.nan, X)

    # Always work with a list of quantiles so that scalars, lists, tuples and arrays share one path.
    if isinstance(percentile, (list, tuple, np.ndarray)):
        quantiles = list(percentile)
    else:
        quantiles = [percentile]
    columns = ['threshold-{}'.format(q) for q in quantiles]

    # Shape (n_quantiles, n_genes); transposed below into genes x quantiles.
    values = np.nanquantile(X, q=quantiles, axis=0, method='midpoint')
    thresholds = pd.DataFrame(values.T, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if np.isscalar(lower_bound):
            thresholds[thresholds < lower_bound] = lower_bound
        else:
            lb = lower_bound.copy()
            lb.columns = columns
            below = thresholds < lb
            thresholds[below] = lb[below]

    if upper_bound is not None:
        if np.isscalar(upper_bound):
            thresholds[thresholds > upper_bound] = upper_bound
        else:
            ub = upper_bound.copy()
            ub.columns = columns
            above = thresholds > ub
            thresholds[above] = ub[above]
    return thresholds
def get_global_percentile_threshold(adata, percentile=0.75, lower_bound=1e-5, upper_bound=None, exclude_zeros=False,
                                    use_raw=False):
    """
    Obtains the global percentile threshold (computed across all genes and cells) and
    assigns it to each gene in an AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. ``adata.X`` (or ``adata.raw.X``) may be sparse or dense.

    percentile: float or list/tuple/array of floats, optional (default: 0.75)
        Quantile(s) in the [0, 1] range used to compute the threshold. One output
        column named ``'threshold-<q>'`` is produced per value.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. Any scalar number (including NumPy scalars) is
        applied to every gene. If a pandas.DataFrame is provided, it must have the same
        genes (index) as the adata object and as many columns as computed thresholds;
        its columns are renamed to match the output columns.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. Same conventions as ``lower_bound``.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object (genes x percentiles) with the global percentile
        threshold repeated for each gene.
    """
    X = adata.raw.X if use_raw else adata.X
    # Dense arrays have no ``toarray``; only densify containers that provide it.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    if exclude_zeros:
        # NaNs are skipped by nanquantile, so zeros do not contribute.
        X = np.where(X == 0, np.nan, X)

    if isinstance(percentile, (list, tuple, np.ndarray)):
        quantiles = list(percentile)
    else:
        quantiles = [percentile]
    columns = ['threshold-{}'.format(q) for q in quantiles]

    # No axis: the quantile is taken over the flattened matrix -> one value per quantile.
    values = np.asarray(np.nanquantile(X, q=quantiles, method='midpoint'), dtype=float)

    # Repeat the global values for every gene. Building the frame directly with the gene
    # index avoids re-indexing an intermediate frame, which would produce all-NaN output
    # when several percentiles are requested.
    n_genes = len(adata.var_names)
    thresholds = pd.DataFrame(np.tile(values, (n_genes, 1)), index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if np.isscalar(lower_bound):
            thresholds[thresholds < lower_bound] = lower_bound
        else:
            lb = lower_bound.copy()
            lb.columns = columns
            below = thresholds < lb
            thresholds[below] = lb[below]

    if upper_bound is not None:
        if np.isscalar(upper_bound):
            thresholds[thresholds > upper_bound] = upper_bound
        else:
            ub = upper_bound.copy()
            ub.columns = columns
            above = thresholds > ub
            thresholds[above] = ub[above]
    return thresholds
def get_local_mean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the local (per-gene) mean threshold for each gene in an AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. ``adata.X`` (or ``adata.raw.X``) may be sparse or dense.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. Any scalar number (including NumPy scalars) is
        applied to every gene. If a pandas.DataFrame is provided, it must have the same
        genes (index) as the adata object and a single column; the column is renamed
        to match the output column.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. Same conventions as ``lower_bound``.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold. Genes with no non-zero
        value get a NaN threshold, which the bounds leave untouched.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with a single ``'threshold-mean'`` column holding the
        local mean threshold for each gene.
    """
    X = adata.raw.X if use_raw else adata.X
    # Dense arrays have no ``toarray``; only densify containers that provide it.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    if exclude_zeros:
        # NaNs are skipped by nanmean, so zeros do not contribute.
        X = np.where(X == 0, np.nan, X)

    columns = ['threshold-mean']
    gene_means = np.nanmean(X, axis=0)
    thresholds = pd.DataFrame(gene_means, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if np.isscalar(lower_bound):
            thresholds[thresholds < lower_bound] = lower_bound
        else:
            lb = lower_bound.copy()
            lb.columns = columns
            below = thresholds < lb
            thresholds[below] = lb[below]

    if upper_bound is not None:
        if np.isscalar(upper_bound):
            thresholds[thresholds > upper_bound] = upper_bound
        else:
            ub = upper_bound.copy()
            ub.columns = columns
            above = thresholds > ub
            thresholds[above] = ub[above]
    return thresholds
def get_global_mean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the global mean threshold (computed across all genes and cells) and assigns
    it to each gene in an AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. ``adata.X`` (or ``adata.raw.X``) may be sparse or dense.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. Any scalar number (including NumPy scalars) is
        applied to every gene. If a pandas.DataFrame is provided, it must have the same
        number of genes as the adata object (same index) and a single column; the
        column is renamed to match the output column.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. If a pandas.DataFrame is provided, it must have
        the same number of genes as the adata object. Same conventions as ``lower_bound``.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with a single ``'threshold-mean'`` column holding the
        global mean threshold repeated for each gene.
    """
    X = adata.raw.X if use_raw else adata.X
    # Dense arrays have no ``toarray``; only densify containers that provide it.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    if exclude_zeros:
        # NaNs are skipped by nanmean, so zeros do not contribute.
        X = np.where(X == 0, np.nan, X)

    columns = ['threshold-mean']
    # No axis: a single mean over the whole matrix, repeated for every gene.
    global_mean = float(np.nanmean(X))
    n_genes = len(adata.var_names)
    thresholds = pd.DataFrame(np.full((n_genes, 1), global_mean), index=adata.var_names, columns=columns,
                              dtype=float)

    if lower_bound is not None:
        if np.isscalar(lower_bound):
            thresholds[thresholds < lower_bound] = lower_bound
        else:
            lb = lower_bound.copy()
            lb.columns = columns
            below = thresholds < lb
            thresholds[below] = lb[below]

    if upper_bound is not None:
        if np.isscalar(upper_bound):
            thresholds[thresholds > upper_bound] = upper_bound
        else:
            ub = upper_bound.copy()
            ub.columns = columns
            above = thresholds > ub
            thresholds[above] = ub[above]
    return thresholds
def get_local_trimean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the local (per-gene) Tukey's trimean threshold for each gene in an AnnData object.

    The trimean is ``(Q1 + 2 * median + Q3) / 4``, with quartiles computed per gene.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. ``adata.X`` (or ``adata.raw.X``) may be sparse or dense.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. Any scalar number (including NumPy scalars) is
        applied to every gene. If a pandas.DataFrame is provided, it must have the same
        genes (index) as the adata object and a single column; the column is renamed
        to match the output column.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. Same conventions as ``lower_bound``.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold. Genes with no non-zero
        value get a NaN threshold, which the bounds leave untouched.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with a single ``'threshold-trimean'`` column holding
        the local Tukey's trimean threshold for each gene.
    """
    X = adata.raw.X if use_raw else adata.X
    # Dense arrays have no ``toarray``; only densify containers that provide it.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    if exclude_zeros:
        # NaNs are skipped by nanquantile, so zeros do not contribute.
        X = np.where(X == 0, np.nan, X)

    # One pass computes the three quartiles; the result has shape (3, n_genes).
    q1, median, q3 = np.nanquantile(X, q=[0.25, 0.5, 0.75], axis=0, method='midpoint')
    trimean = (q1 + 2 * median + q3) / 4

    columns = ['threshold-trimean']
    thresholds = pd.DataFrame(trimean, index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if np.isscalar(lower_bound):
            thresholds[thresholds < lower_bound] = lower_bound
        else:
            lb = lower_bound.copy()
            lb.columns = columns
            below = thresholds < lb
            thresholds[below] = lb[below]

    if upper_bound is not None:
        if np.isscalar(upper_bound):
            thresholds[thresholds > upper_bound] = upper_bound
        else:
            ub = upper_bound.copy()
            ub.columns = columns
            above = thresholds > ub
            thresholds[above] = ub[above]
    return thresholds
def get_global_trimean_threshold(adata, lower_bound=1e-5, upper_bound=None, exclude_zeros=False, use_raw=False):
    """
    Obtains the global Tukey's trimean threshold (computed across all genes and cells)
    and assigns it to each gene in an AnnData object.

    The trimean is ``(Q1 + 2 * median + Q3) / 4``, with quartiles computed over the
    flattened matrix.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. ``adata.X`` (or ``adata.raw.X``) may be sparse or dense.

    lower_bound: float or pandas.DataFrame, optional (default: 1e-5)
        Lower bound for the threshold. Any scalar number (including NumPy scalars) is
        applied to every gene. If a pandas.DataFrame is provided, it must have the same
        genes (index) as the adata object and a single column; the column is renamed
        to match the output column.

    upper_bound: float or pandas.DataFrame, optional (default: None)
        Upper bound for the threshold. Same conventions as ``lower_bound``.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when computing the threshold.

    use_raw: bool, optional (default: False)
        Whether to use the raw data stored in adata.raw.X.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with a single ``'threshold-trimean'`` column holding
        the global Tukey's trimean threshold repeated for each gene.
    """
    X = adata.raw.X if use_raw else adata.X
    # Dense arrays have no ``toarray``; only densify containers that provide it.
    X = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    if exclude_zeros:
        # NaNs are skipped by nanquantile, so zeros do not contribute.
        X = np.where(X == 0, np.nan, X)

    # No axis: quartiles of the flattened matrix, computed in a single pass.
    q1, median, q3 = np.nanquantile(X, q=[0.25, 0.5, 0.75], method='midpoint')
    trimean = float((q1 + 2 * median + q3) / 4)

    columns = ['threshold-trimean']
    n_genes = len(adata.var_names)
    thresholds = pd.DataFrame(np.full((n_genes, 1), trimean), index=adata.var_names, columns=columns, dtype=float)

    if lower_bound is not None:
        if np.isscalar(lower_bound):
            thresholds[thresholds < lower_bound] = lower_bound
        else:
            lb = lower_bound.copy()
            lb.columns = columns
            below = thresholds < lb
            thresholds[below] = lb[below]

    if upper_bound is not None:
        if np.isscalar(upper_bound):
            thresholds[thresholds > upper_bound] = upper_bound
        else:
            ub = upper_bound.copy()
            ub.columns = columns
            above = thresholds > ub
            thresholds[above] = ub[above]
    return thresholds
class _ReservoirSampler:
    """Fixed-size uniform sample over a stream of values (vectorized Vitter Algorithm R)."""

    def __init__(self, size, rng):
        self.size = int(size)
        self.rng = rng
        # float32 buffer; only the first ``min(count, size)`` entries are meaningful.
        self.reservoir = np.empty(self.size, dtype=np.float32)
        # Total number of stream values offered so far.
        self.count = 0

    def update(self, values):
        """Feed a batch of values into the sampler."""
        incoming = np.asarray(values, dtype=np.float32).ravel()
        if incoming.size == 0:
            return

        # Phase 1: while the reservoir is not full, copy values in directly.
        free_slots = self.size - self.count
        if free_slots > 0:
            n_fill = min(free_slots, incoming.size)
            self.reservoir[self.count:self.count + n_fill] = incoming[:n_fill]
            self.count += n_fill
            if n_fill == incoming.size:
                return
            incoming = incoming[n_fill:]

        # Phase 2: the value at 1-based stream position j draws a slot in [0, j)
        # and overwrites the reservoir only if that slot falls inside it.
        n_new = incoming.size
        positions = np.arange(self.count + 1, self.count + n_new + 1, dtype=np.int64)
        slots = (self.rng.random(n_new) * positions).astype(np.int64)
        replace = slots < self.size
        if replace.any():
            self.reservoir[slots[replace]] = incoming[replace]
        self.count += n_new

    def sample(self):
        """Return the filled part of the reservoir."""
        n_valid = min(self.count, self.size)
        return self.reservoir[:n_valid]


_N_COUNTS_AUTO_KEYS = ('total_counts', 'n_counts', 'raw_sum', 'nCount_RNA')


def _load_gene_set(gene_set, organism):
    """Turn the ``gene_set`` argument (None, JSON path or iterable) into a list of gene symbols."""
    if gene_set is None:
        # Default: genes listed in the scCellFie database thresholds table.
        from sccellfie.datasets.database import load_sccellfie_database
        db = load_sccellfie_database(organism=organism)
        return list(db['thresholds'].index)

    if not isinstance(gene_set, str):
        return list(gene_set)

    if not gene_set.lower().endswith('.json'):
        raise ValueError("String `gene_set` must point to a .json file containing a list of gene symbols.")
    with open(os.path.expanduser(gene_set)) as fp:
        loaded = json.load(fp)
    if not isinstance(loaded, list):
        raise ValueError(f"Expected a JSON list in {gene_set}, got {type(loaded).__name__}.")
    return list(loaded)


def _resolve_cell_index(adata, cell_mask):
    """Convert ``cell_mask`` into a sorted integer array of selected cell positions."""
    n_cells = adata.n_obs
    if cell_mask is None:
        return np.arange(n_cells)

    if isinstance(cell_mask, str):
        # Column of adata.obs interpreted as a boolean flag.
        if cell_mask not in adata.obs.columns:
            raise KeyError(f"`cell_mask='{cell_mask}'` not found in adata.obs.")
        selected = np.asarray(adata.obs[cell_mask].values).astype(bool)
    elif isinstance(cell_mask, pd.Series):
        # Align on cell names; cells missing from the Series are not selected.
        selected = cell_mask.reindex(adata.obs_names).fillna(False).astype(bool).values
    else:
        selected = np.asarray(cell_mask)

    if selected.dtype == bool:
        if selected.shape[0] != n_cells:
            raise ValueError(f"Boolean `cell_mask` length {selected.shape[0]} != adata.n_obs {n_cells}.")
    else:
        # Integer positions: scatter them into a boolean mask (this also sorts and de-duplicates).
        positions = np.asarray(selected, dtype=np.int64)
        selected = np.zeros(n_cells, dtype=bool)
        selected[positions] = True
    return np.nonzero(selected)[0]


def _source_var_names(adata, use_raw):
    """Return the variable names of the matrix that will be read (raw or main)."""
    if not use_raw:
        return list(adata.var_names)
    if adata.raw is None:
        raise ValueError("`use_raw=True` but adata.raw is None.")
    return list(adata.raw.var_names)


def _get_chunk_matrix(chunk_adata, layer, use_raw):
    """Pick the data matrix of a chunk: raw first, then the requested layer, else ``X``."""
    if use_raw:
        return chunk_adata.raw.X
    if layer is None:
        return chunk_adata.X
    return chunk_adata.layers[layer]
def get_sccellfie_dataset_threshold(adata, gene_set=None, organism='human', cell_mask=None, layer=None,
                                    use_raw=False, target_sum=10_000, n_counts_key=None,
                                    chunk_size=100_000, reservoir_size=5_000_000,
                                    percentiles=(10, 25, 50, 75, 90, 95),
                                    lower_percentile=25, upper_percentile=75,
                                    random_state=None, verbose=True, return_stats=False):
    """
    Computes a dataset-wise ``sccellfie_threshold`` per metabolic gene by streaming the
    AnnData in chunks.

    Faithful port of the atlas-based threshold script that produced the default
    ``Thresholds.csv``, generalized to a single (possibly backed) AnnData.

    Pipeline per chunk:

    1. CP10k-normalize using a per-cell library size (obs column or computed from the
       full chunk).
    2. Subset to the corrected metabolic-gene columns (after applying
       ``CORRECT_GENES[organism]``).
    3. Accumulate per-gene sum, non-zero cell count, and max.
    4. Stream non-zero normalized values into a reservoir sample for global percentiles.

    The final threshold rule matches the original script (with configurable bounds)::

        if max > P_lower or max == 0:
            threshold = clip(nonzero_mean, P_lower, P_upper)
        else:
            threshold = nonzero_mean

    where ``P_lower`` / ``P_upper`` default to P25 / P75 (the original atlas behavior)
    and are controlled by ``lower_percentile`` / ``upper_percentile``.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. May be backed (``sc.read_h5ad(..., backed='r')``);
        chunks are materialized one at a time.

    gene_set : list, set, pandas.Index, str or None, optional (default: None)
        Metabolic gene list. ``None`` loads the default gene list from the scCellFie
        database for ``organism``. A string ending in ``.json`` is treated as the path
        to a JSON file containing a list of gene symbols.

    organism : str, optional (default: 'human')
        Used to select the ``CORRECT_GENES`` rename map and, if ``gene_set`` is None,
        the scCellFie database to load metabolic genes from. Currently ``'human'`` or
        ``'mouse'``.

    cell_mask : array-like, str or None, optional (default: None)
        Restricts the computation to a subset of cells. Accepts a boolean/integer
        array, a column name in ``adata.obs``, or a ``pandas.Series`` indexed by cell
        names.

    layer : str or None, optional (default: None)
        Read from ``adata.layers[layer]`` instead of ``adata.X``. Mutually exclusive
        with ``use_raw``.

    use_raw : bool, optional (default: False)
        Read from ``adata.raw.X``. Mutually exclusive with ``layer``.

    target_sum : float or None, optional (default: 10_000)
        Target library size for CP-normalization. Pass ``None`` to skip normalization
        (e.g. when the input values are already on the desired scale). Normalization
        is also skipped when ``adata.uns['normalization']['method']`` equals
        ``'total_counts'``.

    n_counts_key : str or None, optional (default: None)
        Column in ``adata.obs`` containing per-cell totals. If None, auto-detect among
        ``('total_counts', 'n_counts', 'raw_sum', 'nCount_RNA')`` and otherwise compute
        per-cell sums from the full-matrix chunk before gene subsetting.

    chunk_size : int, optional (default: 100_000)
        Number of cells processed per chunk. Must be at least 1.

    reservoir_size : int, optional (default: 5_000_000)
        Size of the reservoir used to estimate global percentiles of non-zero
        normalized values. Memory cost is ``reservoir_size * 4B`` (float32). Must be
        at least 1.

    percentiles : tuple of int, optional (default: (10, 25, 50, 75, 90, 95))
        Percentiles to *report* in the returned stats. Always merged with
        ``{lower_percentile, upper_percentile}`` so the rule's bounds are also
        available for inspection.

    lower_percentile, upper_percentile : int or float, optional (default: 25 and 75)
        Percentile bounds used by the clip rule. The threshold for each gene is
        ``clip(nonzero_mean, P_lower, P_upper)`` when the gene's max value exceeds
        ``P_lower`` or is zero (the low-expression escape); otherwise the raw
        ``nonzero_mean`` is used. Must satisfy
        ``0 <= lower_percentile < upper_percentile <= 100``. Defaults reproduce the
        original atlas-derived ``sccellfie_threshold`` exactly.

    random_state : int or None, optional (default: None)
        Seed for the reservoir sampler.

    verbose : bool, optional (default: True)
        If True, print progress via tqdm.

    return_stats : bool, optional (default: False)
        If True, also return a dict with intermediate statistics.

    Returns
    -------
    thresholds : pandas.DataFrame
        A DataFrame indexed by metabolic gene symbol with a single column
        ``'sccellfie_threshold'``. Ready to pass to ``compute_gene_scores`` (which
        selects the first column positionally).

    stats : dict, only if ``return_stats=True``
        Dict with keys ``percentiles``, ``sum_per_gene``, ``nnz_per_gene``,
        ``max_per_gene``, ``mean``, ``nonzero_mean``, ``n_cells``, ``n_values_seen``,
        ``reservoir_size_used``.

    Raises
    ------
    ValueError
        If ``use_raw`` and ``layer`` are both given, the percentile bounds are
        invalid, ``chunk_size`` or ``reservoir_size`` is smaller than 1, no gene of
        ``gene_set`` is present in the data, or ``cell_mask`` selects no cells.

    KeyError
        If ``n_counts_key`` is given but missing from ``adata.obs``.
    """
    if use_raw and layer is not None:
        raise ValueError("`use_raw=True` and `layer` are mutually exclusive.")
    if not (0 <= lower_percentile < upper_percentile <= 100):
        raise ValueError(
            f"Require 0 <= lower_percentile < upper_percentile <= 100; "
            f"got lower_percentile={lower_percentile}, upper_percentile={upper_percentile}."
        )
    # A non-positive chunk size yields no chunks at all and a non-positive reservoir keeps
    # no values; both would silently end in all-zero thresholds, so reject them up front.
    if chunk_size < 1:
        raise ValueError(f"`chunk_size` must be >= 1; got {chunk_size}.")
    if reservoir_size < 1:
        raise ValueError(f"`reservoir_size` must be >= 1; got {reservoir_size}.")

    from sccellfie.preprocessing.prepare_inputs import CORRECT_GENES
    rename_map = CORRECT_GENES.get(organism, {})

    # Gene names are matched after renaming through CORRECT_GENES.
    source_var = _source_var_names(adata, use_raw)
    corrected_var = np.array([rename_map.get(g, g) for g in source_var])

    gene_list = _load_gene_set(gene_set, organism)
    gene_set_set = set(gene_list)
    col_idx = np.where(np.array([g in gene_set_set for g in corrected_var]))[0]
    if col_idx.size == 0:
        raise ValueError("No overlap between `gene_set` and adata variables "
                         "(after applying CORRECT_GENES). Check `organism` and gene nomenclature.")
    final_gene_names = corrected_var[col_idx].tolist()

    cell_idx_full = _resolve_cell_index(adata, cell_mask)
    n_cells_sel = cell_idx_full.size
    if n_cells_sel == 0:
        raise ValueError("`cell_mask` selected zero cells.")

    # Skip normalization when disabled or when adata.uns records a total-counts normalization.
    already_normalized = (adata.uns.get('normalization', {}).get('method') == 'total_counts')
    do_normalize = (target_sum is not None) and (not already_normalized)

    # Resolve the obs column holding per-cell totals; None means "sum each chunk instead".
    resolved_key = None
    if do_normalize:
        if n_counts_key is not None:
            if n_counts_key not in adata.obs.columns:
                raise KeyError(f"`n_counts_key='{n_counts_key}'` not found in adata.obs.")
            resolved_key = n_counts_key
        else:
            for k in _N_COUNTS_AUTO_KEYS:
                if k in adata.obs.columns:
                    resolved_key = k
                    break

    # Streaming accumulators, one entry per selected gene.
    n_genes = col_idx.size
    sum_per_gene = np.zeros(n_genes, dtype=np.float64)
    nnz_per_gene = np.zeros(n_genes, dtype=np.int64)
    max_per_gene = np.zeros(n_genes, dtype=np.float64)

    rng = np.random.default_rng(random_state)
    reservoir = _ReservoirSampler(reservoir_size, rng)

    chunk_starts = list(range(0, n_cells_sel, chunk_size))
    iterator = tqdm(chunk_starts, desc='Streaming chunks', disable=not verbose)

    for start in iterator:
        end = min(start + chunk_size, n_cells_sel)
        idx = cell_idx_full[start:end]

        chunk_adata = adata[idx]
        if getattr(adata, 'isbacked', False):
            # Backed data: load only this chunk into memory.
            chunk_adata = chunk_adata.to_memory()
        X_full = _get_chunk_matrix(chunk_adata, layer, use_raw)

        if do_normalize:
            if resolved_key is not None:
                n_counts_chunk = np.asarray(adata.obs[resolved_key].values[idx], dtype=np.float64)
            else:
                # Library size is computed on the full chunk, before gene subsetting.
                if sparse.issparse(X_full):
                    n_counts_chunk = np.asarray(X_full.sum(axis=1)).ravel().astype(np.float64)
                else:
                    n_counts_chunk = np.asarray(X_full.sum(axis=1), dtype=np.float64).ravel()
            # Cells with zero total keep a scaling factor of 0 (avoids division by zero).
            safe = n_counts_chunk > 0
            scaling = np.zeros_like(n_counts_chunk)
            scaling[safe] = target_sum / n_counts_chunk[safe]
        else:
            scaling = None

        X_sub = X_full[:, col_idx]

        if scaling is not None:
            if sparse.issparse(X_sub):
                # Row-wise scaling as a product with a diagonal matrix keeps the result sparse.
                X_norm = sparse.diags(scaling, 0, format='csr') @ X_sub.tocsr()
            else:
                X_norm = np.asarray(X_sub) * scaling[:, None]
        else:
            X_norm = X_sub

        if sparse.issparse(X_norm):
            X_csr = X_norm.tocsr()
            sum_per_gene += np.asarray(X_csr.sum(axis=0)).ravel()
            nnz_per_gene += np.asarray((X_csr > 0).sum(axis=0)).ravel().astype(np.int64)
            col_max = np.asarray(X_csr.max(axis=0).todense()).ravel()
            np.maximum(max_per_gene, col_max, out=max_per_gene)
            data = X_csr.data
            nz_values = data[data > 0]
        else:
            X_dense = np.asarray(X_norm)
            sum_per_gene += X_dense.sum(axis=0)
            nnz_per_gene += (X_dense > 0).sum(axis=0).astype(np.int64)
            col_max = X_dense.max(axis=0) if X_dense.size else np.zeros(n_genes)
            np.maximum(max_per_gene, col_max, out=max_per_gene)
            nz_values = X_dense[X_dense > 0]

        # Positive values of all selected genes feed one shared (global) reservoir.
        reservoir.update(nz_values)

    # The rule's bounds are always computed, even when not listed in `percentiles`.
    required = {lower_percentile, upper_percentile}
    all_pcts = sorted(set(percentiles) | required)
    sample = reservoir.sample()
    if sample.size == 0:
        warnings.warn("No non-zero values encountered; returning zero thresholds.", UserWarning)
        pct_values = np.zeros(len(all_pcts))
    else:
        pct_values = np.percentile(sample, all_pcts)
    pct_dict = dict(zip(all_pcts, pct_values))
    p_lo = float(pct_dict[lower_percentile])
    p_hi = float(pct_dict[upper_percentile])

    # Mean over cells with a positive value; genes never detected keep 0.
    nz_mean = np.zeros(n_genes, dtype=np.float64)
    has_nz = nnz_per_gene > 0
    nz_mean[has_nz] = sum_per_gene[has_nz] / nnz_per_gene[has_nz]

    # Clip unless the gene's max is positive but does not exceed P_lower.
    clip_mask = (max_per_gene > p_lo) | (max_per_gene == 0)
    threshold = np.where(clip_mask, np.clip(nz_mean, p_lo, p_hi), nz_mean)

    thresholds_df = pd.DataFrame({'sccellfie_threshold': threshold},
                                 index=pd.Index(final_gene_names, name='symbol'))

    if not return_stats:
        return thresholds_df

    # n_cells_sel is guaranteed positive here (zero selected cells raised above).
    mean = sum_per_gene / n_cells_sel

    def _pct_key(p):
        # Report integral percentiles with int keys (e.g. 25 rather than 25.0).
        return int(p) if float(p).is_integer() else float(p)

    stats = {
        'percentiles': {_pct_key(p): float(v) for p, v in pct_dict.items()},
        'sum_per_gene': pd.Series(sum_per_gene, index=final_gene_names),
        'nnz_per_gene': pd.Series(nnz_per_gene, index=final_gene_names),
        'max_per_gene': pd.Series(max_per_gene, index=final_gene_names),
        'mean': pd.Series(mean, index=final_gene_names),
        'nonzero_mean': pd.Series(nz_mean, index=final_gene_names),
        'n_cells': int(n_cells_sel),
        'n_values_seen': int(reservoir.count),
        'reservoir_size_used': int(min(reservoir.count, reservoir.size)),
    }
    return thresholds_df, stats
def set_manual_threshold(adata, threshold):
    """
    Sets a threshold manually for each gene in an AnnData object.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. Only ``adata.var_names`` is used.

    threshold: float or list of floats
        Threshold(s) to be set for each gene. A single number is applied to every
        gene. If a list (or tuple, numpy array, pandas.Series) is passed it must have
        the same number of elements as genes in adata, and in the same order; values
        are assigned positionally.

    Returns
    -------
    thresholds: pandas.DataFrame
        A pandas.DataFrame object with a single ``'threshold-manual'`` column holding
        the manual threshold for each gene.

    Raises
    ------
    AssertionError
        If a sequence is passed whose length differs from the number of genes.
    """
    var_names = adata.var_names
    n_genes = len(var_names)

    if np.ndim(threshold) == 0:
        # Broadcast the scalar explicitly: a one-element list would not match the
        # gene index length for more than one gene.
        values = np.full(n_genes, threshold, dtype=float)
    else:
        values = np.asarray(threshold, dtype=float)
        if len(values) != n_genes:
            # Raised explicitly (same exception type as the previous ``assert``) so the
            # check is not stripped when Python runs with -O.
            raise AssertionError("The len of threshold must be the same as gene number in adata")

    return pd.DataFrame(data={'threshold-manual': values}, index=var_names, dtype=float)