Source code for sccellfie.stats.markers_from_task
import warnings
import pandas as pd
[docs]
def get_task_determinant_genes(adata, metabolic_task, task_by_rxn, groupby=None, group=None, min_activity=0.0):
"""
Finds the genes that determine the activity of all reactions in a metabolic task. Returns determinant genes
for each reaction and their activity across specified cell groups, along with the fraction of cells in each
group where the gene was determinant.
Parameters
----------
adata: AnnData object
Annotated data matrix.
metabolic_task: str
Name of the metabolic task to analyze. Must be one of the tasks in the `task_by_rxn` DataFrame.
It must also be present in the `adata.metabolic_tasks` attribute.
task_by_rxn: pandas.DataFrame
A pandas.DataFrame object where rows are metabolic tasks and columns are
reactions. Each cell contains ones or zeros, indicating whether a reaction
is involved in a metabolic task.
groupby: str, optional (default: None)
The key in the `adata.obs` DataFrame to group by. This could be any
categorical annotation of cells (e.g., cell type, cluster).
group: str or list, optional (default: None)
The group(s) in the `adata.obs` DataFrame to analyze. If `None`, the analysis is performed
by treating all single cells as a group. If `groups` is specified, `groupby` must be specified.
The column referred by `groupby` must contain the groups specified in `group`.
min_activity: float, optional (default: 0.0)
Minimum reaction activity level to consider a reaction as active. Only genes that are
associated with active reactions are considered. If zero, all reactions and therefore
all genes are considered.
Returns
-------
df: pandas.DataFrame
A pandas.DataFrame reporting the determinant genes for each reaction in the metabolic task.
The DataFrame has the following columns:
- Group: The cell group.
- Rxn: The reaction.
- Det-Gene: The determinant gene for the reaction.
- RAL: The reaction activity level for the reaction.
- Cell_fraction: The fraction of cells in the group where this gene was determinant.
Notes
-----
This function assumes that reaction activity levels have been computed using
sccellfie.reaction_activity.compute_reaction_activity() and are stored in adata.reactions.X.
Scores are computed as previously indicated in the CellFie paper (https://doi.org/10.1016/j.crmeth.2021.100040).
"""
assert hasattr(adata, "metabolic_tasks"), "Please run scCellFie on your dataset before using this function."
# Get list of rxns that belong to the metabolic task
rxns_in_task = task_by_rxn.loc[metabolic_task, :]
rxns_in_task = sorted([rxn for rxn in rxns_in_task[rxns_in_task != 0].index if rxn in adata.reactions.var_names])
if (group is not None) & (groupby is None):
warning_message = "You have specified `group` but not the column where to find the groups (`groupby`). Analysis will be performed across all groups."
warnings.warn(warning_message, UserWarning)
if groupby is not None:
if group is not None:
if isinstance(group, list):
groups = group
else:
groups = [group]
else:
groups = adata.obs[groupby].unique().tolist()
barcodes = [adata[adata.obs[groupby] == group].obs_names for group in groups]
else:
groups = ['All-Groups']
barcodes = [adata.obs_names]
dfs = []
for _group, _barcodes in zip(groups, barcodes):
# Get total number of cells in this group
total_cells = len(_barcodes)
rxn_filter = adata.reactions.obs_names.isin(_barcodes)
adata_rxns = adata.reactions[rxn_filter]
rxn_df = adata_rxns.to_df()
adata_rxns.uns.update({'Rxn-Max-Genes': adata_rxns.uns['Rxn-Max-Genes'][rxn_filter]})
for rxn in rxns_in_task:
# Create initial DataFrame with cell-level information
df = pd.DataFrame(index=_barcodes)
df['Group'] = _group
df['Rxn'] = rxn
df['Det-Gene'] = adata_rxns.uns['Rxn-Max-Genes'][rxn]
df['RAL'] = rxn_df[rxn]
# Group by gene to get counts and mean RAL
grouped = df.groupby(['Group', 'Rxn', 'Det-Gene']).agg({
'RAL': 'mean',
'Det-Gene': 'count' # This gives us the count of cells for each gene
}).rename(columns={'Det-Gene': 'Cell_count'})
# Calculate fraction
grouped['Cell_fraction'] = grouped['Cell_count'] / total_cells
# Drop the cell count column as it was just for intermediate calculation
grouped = grouped.drop('Cell_count', axis=1)
dfs.append(grouped)
df = pd.concat(dfs).reset_index().sort_values('RAL', ascending=False)
if min_activity != 0.:
df = df[df['RAL'] >= min_activity]
df = df.reset_index(drop=True)
return df