Source code for sccellfie.expression.aggregation

import numpy as np
import pandas as pd
from scipy.sparse import issparse



[docs]
def agg_expression_cells(adata, groupby, layer=None, gene_symbols=None, agg_func='mean', top_percent=10,
                         exclude_zeros=False, use_raw=False, threshold=None):
    """
    Aggregates gene expression data for specified cell groups in an `AnnData` object.

    Parameters
    ----------
    adata : AnnData
        An `AnnData` object containing the expression data to be aggregated.

    groupby : str
        The key in the `adata.obs` DataFrame to group by. This could be any
        categorical annotation of cells (e.g., cell type, condition).

    layer : str, optional (default: None)
        The name of the layer in `adata` to use for aggregation. If `None`,
        the main expression matrix `adata.X` is used.

    gene_symbols : str or list, optional (default: None)
        Gene names to include in the aggregation. If a string is provided,
        it is converted to a single-element list. If `None`, all genes are included.

    agg_func : str, optional  (default: 'mean')
        The aggregation function to apply. Options are 'mean', 'median',
        '25p' (25th percentile), '75p' (75th percentile), 'trimean' (0.5*Q2 + 0.25(Q1+Q3)),
        'topmean' (computed among the top `top_percent`% of values), and
        'fraction_above' (fraction of cells above threshold)
        The function must be one of the keys in the `AGG_FUNC` dictionary.

    top_percent : float, optional (default: 10)
        The percentage of top values to consider when `agg_func` is 'topmean'.
        Ranging from 0 to 100.

    exclude_zeros: bool, optional (default: False)
        Whether to exclude zeros when aggregating the values.

    use_raw : bool, optional  (default: False)
        Whether to use the data in adata.raw.X (True) or in adata.X (False).

    threshold : float, optional (default: None)
        Expression threshold used when agg_func is 'fraction_above'. Represents the
        minimum expression value for a cell to be considered as expressing the gene.

    Returns
    -------
    agg_expression : pandas.DataFrame
        A pandas.DataFrame where columns correspond to genes and rows correspond to the
        unique categories in `groupby`. Each cell in the DataFrame contains the
        aggregated expression value for the corresponding gene and group.

    Raises
    ------
    AssertionError
        If the provided `agg_func` is not a valid key in `AGG_FUNC`.

    Notes
    -----
    This function is used to compute summary statistics of gene expression data
    across different groups of cells. It is useful for exploring expression
    patterns in different cell types or conditions.

    The function relies on the `groupby` parameter in `adata.obs` to define the
    groups of cells for which the expression data will be aggregated.
    """
    # Check if agg_func is fraction_above and threshold is provided
    if agg_func == 'fraction_above' and threshold is None:
        raise ValueError("Must provide threshold when using 'fraction_above' aggregation")

    assert agg_func in AGG_FUNC.keys(), "Specify a valid `agg_func`."

    # Select the appropriate data layer
    if layer is not None:
        X = adata.layers[layer]
    else:
        if use_raw:
            X = adata.raw.X
        else:
            X = adata.X

    if issparse(X):
        X = X.toarray()

    if exclude_zeros:
        X = np.where(X==0, np.nan, X)

    # Filter for specific genes if provided
    if gene_symbols is not None:
        if isinstance(gene_symbols, str):
            gene_symbols = [gene_symbols]
        gene_mask = adata.var_names.isin(gene_symbols)
        X = X[:, gene_mask]
        gene_index = adata.var_names[gene_mask]
    else:
        gene_index = adata.var_names

    # Group data by the specified groupby column
    grouped = adata.obs[groupby]
    agg_expression = pd.DataFrame(index=gene_index)

    # Perform aggregation for each group
    for group in sorted(np.unique(grouped)):
        group_mask = grouped == group
        group_data = X[group_mask, :]

        if agg_func == 'topmean':
            agg_expression[group] = AGG_FUNC[agg_func](group_data, axis=0, percent=top_percent)
        elif agg_func == 'fraction_above':
            agg_expression[group] = AGG_FUNC[agg_func](group_data, axis=0, threshold=threshold)
        else:
            agg_expression[group] = AGG_FUNC[agg_func](group_data, axis=0)

    return agg_expression.transpose()




[docs]
def top_mean(x, axis, percent=10):
    """
    Computes the mean of the top x% values along the specified axis of a matrix, handling NaN values.

    Parameters
    ----------
    x : numpy.ndarray
        The input matrix containing the data to be aggregated.

    axis : int
        The axis along which to compute the mean. Use 0 for columns, 1 for rows.

    percent : float, (default: 10)
        The percentage of top values to consider, ranging from 0 to 100.
        For example, 10 would compute the mean of the top 10% of values.

    Returns
    -------
    numpy.ndarray
        An array containing the mean of the top x% values for each row or column,
        depending on the specified axis. The shape of the output array will be
        (n_rows,) if axis=1, or (n_columns,) if axis=0.
    """

    def top_nanmean(arr, p):
        # Ensure the input is a numpy array
        arr = np.asarray(arr)

        # Remove NaN values
        non_nan = arr[~np.isnan(arr)]

        # If all values are NaN, return NaN
        if len(non_nan) == 0:
            return np.nan

        # Calculate the number of elements to keep
        n = max(1, int(np.ceil(len(arr) * p / 100)))

        # Sort non-NaN values and select top n
        sorted_arr = np.sort(non_nan)
        top_values = sorted_arr[-n:]

        # Return the mean of the top values
        return np.mean(top_values)

    # Apply the function along the specified axis
    return np.apply_along_axis(top_nanmean, axis, x, percent)




[docs]
def fraction_above_threshold(x, axis, threshold=0):
    """
    Computes the fraction of values above a threshold along the specified axis.

    Parameters
    ----------
    x : numpy.ndarray
        The input matrix containing the data.

    axis : int
        The axis along which to compute the fraction. Use 0 for columns, 1 for rows.

    threshold : float, (default: 0)
        The threshold value above which to count values.

    Returns
    -------
    numpy.ndarray
        An array containing the fraction (between 0 and 1) of values above threshold.
    """
    return np.mean(x > threshold, axis=axis)



AGG_FUNC = {'mean' : np.nanmean,
            'median' : np.nanmedian,
            '25p' : lambda x, axis: np.nanpercentile(x, q=25, axis=axis),
            '75p' : lambda x, axis: np.nanpercentile(x, q=75, axis=axis),
            'trimean' : lambda x, axis: 0.5*np.nanpercentile(x, q=50, axis=axis) + 0.25*(np.nanpercentile(x, q=25, axis=axis) + np.nanpercentile(x, q=75, axis=axis)),
            'topmean' : top_mean,
            'fraction_above' : fraction_above_threshold
            }