Source code for sccellfie.preprocessing.prepare_inputs

import cobra
import pandas as pd

from sccellfie.preprocessing.gpr_rules import find_genes_gpr



[docs]
def preprocess_inputs(adata, gpr_info, task_by_gene, rxn_by_gene, task_by_rxn, correction_organism='human',
                      gene_fraction_threshold=0.0, reaction_fraction_threshold=0.0, verbose=True):
    """
    Preprocesses inputs for metabolic analysis.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.

    gpr_info : pandas.DataFrame
        DataFrame containing reaction IDs and their corresponding Gene-Protein-Reaction (GPR) rules.

    task_by_gene : pandas.DataFrame
        DataFrame representing the relationship between tasks and genes.

    rxn_by_gene : pandas.DataFrame
        DataFrame representing the relationship between reactions and genes.

    task_by_rxn : pandas.DataFrame
        DataFrame representing the relationship between tasks and reactions.

    correction_organism : str, optional (default: 'human')
        Organism of the input data. This is important to correct gene names that are present in
        scCellFie's or custom database. Check options in `sccellfie.preprocessing.prepare_inputs.CORRECT_GENES.keys()`

    gene_fraction_threshold : float, optional (default: 0.0)
        The minimum fraction of genes in a reaction's GPR that must be present in adata to keep the reaction.
        Range is 0 to 1.
        1.0 means all genes must be present.
        Any value > 0 and < 1 keeps reactions with at least that fraction of genes present.
        0 means keep reactions with at least one gene present.

    reaction_fraction_threshold : float, optional (default: 0.0)
        The minimum fraction of reactions in a task that must be present after gene filtering to keep the task.
        Range is 0 to 1.
        1.0 means all reactions must be present.
        Any value > 0 and < 1 keeps tasks with at least that fraction of reactions present.
        0 means keep tasks with at least one reaction present.

    verbose : bool, optional (default: True)
        If True, prints information about the preprocessing results.

    Returns
    -------
    adata2 : AnnData
        Filtered annotated data matrix.

    gpr_rules : dict
        Dictionary of GPR rules for the filtered reactions.

    task_by_gene : pandas.DataFrame
        Filtered DataFrame representing the relationship between tasks and genes.

    rxn_by_gene : pandas.DataFrame
        Filtered DataFrame representing the relationship between reactions and genes.

    task_by_rxn : pandas.DataFrame
        Filtered DataFrame representing the relationship between tasks and reactions.
    """
    if not 0 <= gene_fraction_threshold <= 1:
        raise ValueError("gene_fraction_threshold must be between 0 and 1")
    if not 0 <= reaction_fraction_threshold <= 1:
        raise ValueError("reaction_fraction_threshold must be between 0 and 1")

    adata_var = pd.DataFrame(index=adata.var.index)

    correction_col = 'corrected'
    if correction_organism in CORRECT_GENES.keys():
        correction_dict = CORRECT_GENES[correction_organism]
        correction_dict = {k : v for k, v in correction_dict.items() if v in rxn_by_gene.columns}
        adata_var[correction_col] = [correction_dict[g] if g in correction_dict.keys() else g for g in adata_var.index]
        if verbose:
            print('Gene names corrected to match database: {}'.format(len(correction_dict)))
    else:
        adata_var[correction_col] = list(adata_var.index)


    # Initialize GPRs
    gpr_rules = gpr_info.set_index('Reaction')['GPR-symbol'].to_dict()
    gpr_rules = {k: cobra.core.gene.GPR().from_string(gpr) for k, gpr in gpr_rules.items()}

    valid_genes = set()
    valid_reactions = set()

    for reaction, gpr in gpr_rules.items():
        if reaction in rxn_by_gene.index and reaction in task_by_rxn.columns:
            genes_in_rule = find_genes_gpr(gpr.to_string())
            genes_present = [gene for gene in genes_in_rule if gene in adata_var[correction_col].values]

            n_genes_in_rule = len(genes_in_rule)
            n_genes_present = len(genes_present)
            if n_genes_in_rule > 0:
                if gene_fraction_threshold == 0:
                    # Keep reaction if at least one gene is present
                    if n_genes_present > 0:
                        valid_genes.update(genes_present)
                        valid_reactions.add(reaction)
                else:
                    # Keep reaction if the fraction of present genes meets or exceeds the threshold
                    fraction_present = n_genes_present / n_genes_in_rule
                    if fraction_present >= gene_fraction_threshold:
                        valid_genes.update(genes_present)
                        valid_reactions.add(reaction)
    valid_genes = sorted(valid_genes)
    valid_reactions = sorted(valid_reactions)

    # Filter adata
    adata2 = adata[:, adata_var[correction_col].isin(valid_genes)]
    adata2.var_names = adata_var[adata_var[correction_col].isin(valid_genes)][correction_col].values.tolist()

    # Filter gene tables
    task_by_gene = task_by_gene.loc[:, valid_genes]
    rxn_by_gene = rxn_by_gene.loc[valid_reactions, valid_genes]

    # Filter tasks based on reaction presence
    valid_tasks = set()
    for task in task_by_rxn.index:
        rxns_in_task = task_by_rxn.loc[task]
        rxns_present = rxns_in_task[rxns_in_task.index.isin(valid_reactions)]

        n_rxns_in_task = rxns_in_task.sum()
        n_rxns_present = rxns_present.sum()
        if n_rxns_in_task > 0:
            if reaction_fraction_threshold == 0:
                # Keep task if at least one reaction is present
                if n_rxns_present > 0:
                    valid_tasks.add(task)
            else:
                # Keep task if the fraction of present reactions meets or exceeds the threshold
                fraction_present = n_rxns_present / n_rxns_in_task
                if fraction_present >= reaction_fraction_threshold:
                    valid_tasks.add(task)
    valid_tasks = sorted(valid_tasks)

    # Final filtering of task tables
    task_by_gene = task_by_gene.loc[valid_tasks]
    task_by_rxn = task_by_rxn.loc[valid_tasks, valid_reactions]

    # Remove genes and reactions with no non-zero values
    task_by_gene = task_by_gene.loc[:, (task_by_gene != 0).any(axis=0)]
    rxn_by_gene = rxn_by_gene.loc[(rxn_by_gene != 0).any(axis=1), (rxn_by_gene != 0).any(axis=0)]
    task_by_rxn = task_by_rxn.loc[:, (task_by_rxn != 0).any(axis=0)]

    # Update valid genes and reactions
    valid_genes = sorted(set(task_by_gene.columns))
    valid_reactions = sorted(set(task_by_rxn.columns))

    # Update GPR rules
    gpr_rules = {k: v for k, v in gpr_rules.items() if k in valid_reactions}

    # Final update
    rxn_by_gene = rxn_by_gene.loc[valid_reactions, valid_genes]
    task_by_gene = task_by_gene.loc[valid_tasks, valid_genes]
    task_by_rxn = task_by_rxn.loc[valid_tasks, valid_reactions]
    adata2 = adata2[:, valid_genes]

    if verbose:
        print(f'Shape of new adata object: {adata2.shape}\n'
              f'Number of GPRs: {len(gpr_rules)}\n'
              f'Shape of tasks by genes: {task_by_gene.shape}\n'
              f'Shape of reactions by genes: {rxn_by_gene.shape}\n'
              f'Shape of tasks by reactions: {task_by_rxn.shape}')

    return adata2, gpr_rules, task_by_gene, rxn_by_gene, task_by_rxn



# Gene name in dataset to gene name in scCellFie's DB.
CORRECT_GENES = {'human' : {'ADSS': 'ADSS2',
                            'ADSSL1': 'ADSS1',
                            'COL4A3BP': 'CERT1',
                            'MT-CO1': 'COX1',
                            'MT-CO2': 'COX2',
                            'MT-CO3': 'COX3',
                            'MT-CYB': 'CYTB',
                            'ATP5S': 'DMAC2L',
                            'FUK': 'FCSK',
                            'G6PC': 'G6PC1',
                            'WRB': 'GET1',
                            'ASNA1': 'GET3',
                            'MARCH6': 'MARCHF6',
                            'MUT': 'MMUT',
                            'MT-ND1': 'ND1',
                            'MT-ND2': 'ND2',
                            'MT-ND3': 'ND3',
                            'MT-ND4': 'ND4',
                            'MT-ND4L': 'ND4L',
                            'MT-ND5': 'ND5',
                            'MT-ND6': 'ND6',
                            'ZADH2': 'PTGR3',
                            },
                 'mouse' : {'Gars1': 'Gars',
                            'Srpr' : 'Srpra',
                            'mt-Cytb' : 'Cytb',
                            'mt-Nd1' : 'Nd1',
                            'mt-Nd2' : 'Nd2',
                            'mt-Nd3' : 'Nd3',
                            'mt-Nd4' : 'Nd4',
                            'mt-Nd4l' : 'Nd4l',
                            'mt-Nd5' : 'Nd5',
                            'mt-Nd6' : 'Nd6',
                            'Sdr42e2' : 'Gm5737',
                            'Klk1b26' : 'Egfbp2',
                            'Il4i1' : 'Il4i1b',
                            }
                 }