# Source code for sccellfie.stats.markers_from_task

import warnings
import pandas as pd



# [docs] -- link marker left over from the rendered documentation page; not part of the module.
def get_task_determinant_genes(adata, metabolic_task, task_by_rxn, groupby=None, group=None, min_activity=0.0):
    """
    Finds the genes that determine the activity of all reactions in a metabolic task. Returns determinant genes
    for each reaction and their activity across specified cell groups, along with the fraction of cells in each
    group where the gene was determinant.

    Parameters
    ----------
    adata: AnnData object
        Annotated data matrix. It must expose a `metabolic_tasks` attribute and a `reactions`
        attribute whose `uns['Rxn-Max-Genes']` table has one row per cell of `adata.reactions`
        and one column per reaction.

    metabolic_task: str
        Name of the metabolic task to analyze. Must be one of the tasks (rows) in the
        `task_by_rxn` DataFrame.

    task_by_rxn: pandas.DataFrame
        A pandas.DataFrame object where rows are metabolic tasks and columns are
        reactions. Each cell contains ones or zeros, indicating whether a reaction
        is involved in a metabolic task.

    groupby: str, optional (default: None)
        The key in the `adata.obs` DataFrame to group by. This could be any
        categorical annotation of cells (e.g., cell type, cluster).

    group: str or list, optional (default: None)
        The group(s) in the `adata.obs` DataFrame to analyze. If `None`, every group found in
        `adata.obs[groupby]` is analyzed. If `group` is specified, `groupby` must be specified
        as well; otherwise `group` is ignored (with a warning). If `groupby` is `None`, all
        single cells are treated as one group named 'All-Groups'.

    min_activity: float, optional (default: 0.0)
        Minimum mean reaction activity level for a row to be reported. If zero, no filtering
        is applied and all rows are returned.

    Returns
    -------
    df: pandas.DataFrame
        A pandas.DataFrame reporting the determinant genes for each reaction in the metabolic task,
        sorted by decreasing RAL. The DataFrame has the following columns:
            - Group: The cell group.
            - Rxn: The reaction.
            - Det-Gene: The determinant gene for the reaction.
            - RAL: The mean reaction activity level across the cells of the group where this
              gene was determinant.
            - Cell_fraction: The fraction of cells in the group where this gene was determinant.
        An empty DataFrame with these columns is returned when none of the task's reactions is
        present in `adata.reactions` or when there is no group to analyze.

    Raises
    ------
    AssertionError
        If `adata` has no `metabolic_tasks` attribute.
    KeyError
        If `metabolic_task` is not a row label of `task_by_rxn`.

    Warns
    -----
    UserWarning
        If `group` is given without `groupby`, or if no reaction of the task is found in
        `adata.reactions`.

    Notes
    -----
    This function assumes that reaction activity levels have been computed using
    sccellfie.reaction_activity.compute_reaction_activity() and are stored in adata.reactions.X.

    Scores are computed as previously indicated in the CellFie paper (https://doi.org/10.1016/j.crmeth.2021.100040).
    """
    # Explicit raise instead of `assert` so the check is not stripped under `python -O`.
    # AssertionError is kept so callers observe the same exception type as before.
    if not hasattr(adata, "metabolic_tasks"):
        raise AssertionError("Please run scCellFie on your dataset before using this function.")

    # Reactions flagged (non-zero) for this task that are also present in adata.reactions
    task_row = task_by_rxn.loc[metabolic_task, :]
    rxns_in_task = sorted(rxn for rxn in task_row[task_row != 0].index if rxn in adata.reactions.var_names)

    if group is not None and groupby is None:
        warning_message = "You have specified `group` but not the column where to find the groups (`groupby`). Analysis will be performed across all groups."
        warnings.warn(warning_message, UserWarning)

    output_columns = ['Group', 'Rxn', 'Det-Gene', 'RAL', 'Cell_fraction']
    if not rxns_in_task:
        # Nothing to aggregate: pd.concat would fail on an empty list, so return an empty report.
        warnings.warn(
            f"No reaction of the metabolic task '{metabolic_task}' was found in `adata.reactions`. "
            "Returning an empty DataFrame.",
            UserWarning,
        )
        return pd.DataFrame(columns=output_columns)

    # Resolve the groups to analyze and the cell barcodes that belong to each of them
    if groupby is None:
        groups = ['All-Groups']
        barcodes = [adata.obs_names]
    else:
        labels = adata.obs[groupby]
        if group is None:
            groups = labels.unique().tolist()
        elif isinstance(group, list):
            groups = group
        else:
            groups = [group]
        # Index the barcodes directly rather than slicing the whole object just to read names
        barcodes = [adata.obs_names[(labels == _group).to_numpy()] for _group in groups]

    rxn_obs_names = adata.reactions.obs_names
    max_genes_all = adata.reactions.uns['Rxn-Max-Genes']

    dfs = []
    for _group, _barcodes in zip(groups, barcodes):
        # Denominator for the cell fraction
        total_cells = len(_barcodes)

        rxn_filter = rxn_obs_names.isin(_barcodes)
        rxn_df = adata.reactions[rxn_filter].to_df()
        # Keep the row subset in a local variable instead of writing it back into `.uns`
        max_genes = max_genes_all[rxn_filter]

        for rxn in rxns_in_task:
            # Cell-level table: one row per barcode of the group (values are aligned on the barcode)
            cell_df = pd.DataFrame(index=_barcodes)
            cell_df['Group'] = _group
            cell_df['Rxn'] = rxn
            cell_df['Det-Gene'] = max_genes[rxn]
            cell_df['RAL'] = rxn_df[rxn]

            # Per determinant gene: mean RAL and number of cells where the gene was determinant
            summary = (
                cell_df.groupby(['Group', 'Rxn', 'Det-Gene'])['RAL']
                .agg(['mean', 'size'])
                .rename(columns={'mean': 'RAL', 'size': 'Cell_count'})
            )
            summary['Cell_fraction'] = summary['Cell_count'] / total_cells

            # The raw count was only needed to derive the fraction
            dfs.append(summary.drop(columns='Cell_count'))

    if not dfs:
        # No group to analyze (e.g. an empty `group` list)
        return pd.DataFrame(columns=output_columns)

    df = pd.concat(dfs).reset_index().sort_values('RAL', ascending=False)
    if min_activity != 0.:
        df = df[df['RAL'] >= min_activity]
    return df.reset_index(drop=True)