Source code for sccellfie.preprocessing.prepare_inputs
import cobra
import pandas as pd
from sccellfie.preprocessing.gpr_rules import find_genes_gpr
[docs]
def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, correction_organism='human',
gene_fraction_threshold=0.0, reaction_fraction_threshold=0.0, verbose=True):
"""
Preprocesses inputs for metabolic analysis.
Parameters
----------
adata : AnnData
Annotated data matrix.
gpr_info : pandas.DataFrame
DataFrame containing reaction IDs and their corresponding Gene-Protein-Reaction (GPR) rules.
task_by_gene : pandas.DataFrame
DataFrame representing the relationship between tasks and genes.
rxn_by_gene : pandas.DataFrame
DataFrame representing the relationship between reactions and genes.
task_by_rxn : pandas.DataFrame
DataFrame representing the relationship between tasks and reactions.
correction_organism : str, optional (default: 'human')
Organism of the input data. This is important to correct gene names that are present in
scCellFie's or custom database. Check options in `sccellfie.preprocessing.prepare_inputs.CORRECT_GENES.keys()`
gene_fraction_threshold : float, optional (default: 0.0)
The minimum fraction of genes in a reaction's GPR that must be present in adata to keep the reaction.
Range is 0 to 1.
1.0 means all genes must be present.
Any value > 0 and < 1 keeps reactions with at least that fraction of genes present.
0 means keep reactions with at least one gene present.
reaction_fraction_threshold : float, optional (default: 0.0)
The minimum fraction of reactions in a task that must be present after gene filtering to keep the task.
Range is 0 to 1.
1.0 means all reactions must be present.
Any value > 0 and < 1 keeps tasks with at least that fraction of reactions present.
0 means keep tasks with at least one reaction present.
verbose : bool, optional (default: True)
If True, prints information about the preprocessing results.
Returns
-------
adata2 : AnnData
Filtered annotated data matrix.
gpr_rules : dict
Dictionary of GPR rules for the filtered reactions.
task_by_gene : pandas.DataFrame
Filtered DataFrame representing the relationship between tasks and genes.
rxn_by_gene : pandas.DataFrame
Filtered DataFrame representing the relationship between reactions and genes.
task_by_rxn : pandas.DataFrame
Filtered DataFrame representing the relationship between tasks and reactions.
"""
if not 0 <= gene_fraction_threshold <= 1:
raise ValueError("gene_fraction_threshold must be between 0 and 1")
if not 0 <= reaction_fraction_threshold <= 1:
raise ValueError("reaction_fraction_threshold must be between 0 and 1")
adata_var = pd.DataFrame(index=adata.var.index)
correction_col = 'corrected'
if correction_organism in CORRECT_GENES.keys():
correction_dict = CORRECT_GENES[correction_organism]
correction_dict = {k : v for k, v in correction_dict.items() if v in rxn_by_gene.columns}
adata_var[correction_col] = [correction_dict[g] if g in correction_dict.keys() else g for g in adata_var.index]
if verbose:
print('Gene names corrected to match database: {}'.format(len(correction_dict)))
else:
adata_var[correction_col] = list(adata_var.index)
# Initialize GPRs
gpr_rules = gpr_info.set_index('Reaction')['GPR-symbol'].to_dict()
gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()}
valid_genes = set()
valid_reactions = set()
for reaction, gpr in gpr_rules.items():
if reaction in rxn_by_gene.index and reaction in task_by_rxn.columns:
genes_in_rule = find_genes_gpr(gpr.to_string())
genes_present = [gene for gene in genes_in_rule if gene in adata_var[correction_col].values]
n_genes_in_rule = len(genes_in_rule)
n_genes_present = len(genes_present)
if n_genes_in_rule > 0:
if gene_fraction_threshold == 0:
# Keep reaction if at least one gene is present
if n_genes_present > 0:
valid_genes.update(genes_present)
valid_reactions.add(reaction)
else:
# Keep reaction if the fraction of present genes meets or exceeds the threshold
fraction_present = n_genes_present / n_genes_in_rule
if fraction_present >= gene_fraction_threshold:
valid_genes.update(genes_present)
valid_reactions.add(reaction)
valid_genes = sorted(valid_genes)
valid_reactions = sorted(valid_reactions)
# Filter adata
adata2 = adata[:, adata_var[correction_col].isin(valid_genes)]
adata2.var_names = adata_var[adata_var[correction_col].isin(valid_genes)][correction_col].values.tolist()
# Filter gene tables
task_by_gene = task_by_gene.loc[:, valid_genes]
rxn_by_gene = rxn_by_gene.loc[valid_reactions, valid_genes]
# Filter tasks based on reaction presence
valid_tasks = set()
for task in task_by_rxn.index:
rxns_in_task = task_by_rxn.loc[task]
rxns_present = rxns_in_task[rxns_in_task.index.isin(valid_reactions)]
n_rxns_in_task = rxns_in_task.sum()
n_rxns_present = rxns_present.sum()
if n_rxns_in_task > 0:
if reaction_fraction_threshold == 0:
# Keep task if at least one reaction is present
if n_rxns_present > 0:
valid_tasks.add(task)
else:
# Keep task if the fraction of present reactions meets or exceeds the threshold
fraction_present = n_rxns_present / n_rxns_in_task
if fraction_present >= reaction_fraction_threshold:
valid_tasks.add(task)
valid_tasks = sorted(valid_tasks)
# Final filtering of task tables
task_by_gene = task_by_gene.loc[valid_tasks]
task_by_rxn = task_by_rxn.loc[valid_tasks, valid_reactions]
# Remove genes and reactions with no non-zero values
task_by_gene = task_by_gene.loc[:, (task_by_gene != 0).any(axis=0)]
rxn_by_gene = rxn_by_gene.loc[(rxn_by_gene != 0).any(axis=1), (rxn_by_gene != 0).any(axis=0)]
task_by_rxn = task_by_rxn.loc[:, (task_by_rxn != 0).any(axis=0)]
# Update valid genes and reactions
valid_genes = sorted(set(task_by_gene.columns))
valid_reactions = sorted(set(task_by_rxn.columns))
# Update GPR rules
gpr_rules = {k: v for k, v in gpr_rules.items() if k in valid_reactions}
# Final update
rxn_by_gene = rxn_by_gene.loc[valid_reactions, valid_genes]
task_by_gene = task_by_gene.loc[valid_tasks, valid_genes]
task_by_rxn = task_by_rxn.loc[valid_tasks, valid_reactions]
adata2 = adata2[:, valid_genes]
if verbose:
print(f'Shape of new adata object: {adata2.shape}\n'
f'Number of GPRs: {len(gpr_rules)}\n'
f'Shape of tasks by genes: {task_by_gene.shape}\n'
f'Shape of reactions by genes: {rxn_by_gene.shape}\n'
f'Shape of tasks by reactions: {task_by_rxn.shape}')
return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn
# Gene name in dataset to gene name in scCellFie's DB.
CORRECT_GENES = {'human' : {'ADSS': 'ADSS2',
'ADSSL1': 'ADSS1',
'COL4A3BP': 'CERT1',
'MT-CO1': 'COX1',
'MT-CO2': 'COX2',
'MT-CO3': 'COX3',
'MT-CYB': 'CYTB',
'ATP5S': 'DMAC2L',
'FUK': 'FCSK',
'G6PC': 'G6PC1',
'WRB': 'GET1',
'ASNA1': 'GET3',
'MARCH6': 'MARCHF6',
'MUT': 'MMUT',
'MT-ND1': 'ND1',
'MT-ND2': 'ND2',
'MT-ND3': 'ND3',
'MT-ND4': 'ND4',
'MT-ND4L': 'ND4L',
'MT-ND5': 'ND5',
'MT-ND6': 'ND6',
'ZADH2': 'PTGR3',
},
'mouse' : {'Gars1': 'Gars',
'Srpr' : 'Srpra',
'mt-Cytb' : 'Cytb',
'mt-Nd1' : 'Nd1',
'mt-Nd2' : 'Nd2',
'mt-Nd3' : 'Nd3',
'mt-Nd4' : 'Nd4',
'mt-Nd4l' : 'Nd4l',
'mt-Nd5' : 'Nd5',
'mt-Nd6' : 'Nd6',
'Sdr42e2' : 'Gm5737',
'Klk1b26' : 'Egfbp2',
'Il4i1' : 'Il4i1b',
}
}