Source code for sccellfie.preprocessing.database_manipulation

import numpy as np
import pandas as pd
from sccellfie.preprocessing.gpr_rules import find_genes_gpr


def get_element_associations(df, element, axis_element=0):
    """
    Lists the labels on the opposite axis that hold a non-zero value for a given element.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the associations.

    element : str
        Task, reaction, or gene to look up. It must match exactly a label in the
        index (axis_element=0) or in the columns (axis_element=1) of the DataFrame.

    axis_element : int, optional (default: 0)
        Axis where the element is located. Can be 0 (rows) or 1 (columns).

    Returns
    -------
    associations : list of str
        Sorted labels of the entries whose value for the given element differs from zero.

    Raises
    ------
    ValueError
        If axis_element is neither 0 nor 1.
    """
    # Reject an unsupported axis before touching the DataFrame.
    if axis_element not in (0, 1):
        raise ValueError('Not a valid axis')

    # Pull out the row (axis 0) or the column (axis 1) that belongs to the element.
    values = df.loc[element, :] if axis_element == 0 else df.loc[:, element]

    # Only non-zero entries count as associations.
    nonzero = values.loc[values != 0]
    associations = list(nonzero.index)
    associations.sort()
    return associations
def add_new_task(task_by_rxn, task_by_gene, rxn_by_gene, task_info, rxn_info, task_name, task_system, task_subsystem,
                 rxn_names, gpr_hgncs, gpr_symbols):
    """
    Adds a new task and their associated reactions and genes to the database.

    The input DataFrames are copied first, so the objects passed by the caller
    are left untouched. Genes are extracted from each rule in ``gpr_symbols``
    with ``find_genes_gpr``.

    Parameters
    ----------
    task_by_rxn : pandas.DataFrame
        DataFrame representing the relationship between tasks and reactions.

    task_by_gene : pandas.DataFrame
        DataFrame representing the relationship between tasks and genes.

    rxn_by_gene : pandas.DataFrame
        DataFrame representing the relationship between reactions and genes.

    task_info : pandas.DataFrame
        DataFrame containing information about tasks, including the task name,
        system (major group of tasks), and subsystem (specific group of tasks).
        New rows are written positionally as [task name, system, subsystem].

    rxn_info : pandas.DataFrame
        DataFrame containing information about reactions, including the reaction name
        (column 'Reaction'), and the associated GPR rules in HGNC and symbol format.

    task_name : str
        Name of the task to add.

    task_system : str
        System (major group of tasks) to which the task belongs.

    task_subsystem : str
        Subsystem (specific group of tasks) to which the task belongs.

    rxn_names : list of str
        List of reaction names associated with the task.

    gpr_hgncs : list of str
        List of GPR rules in HGNC format associated with the reactions.
        Order should match the order of the reaction names.

    gpr_symbols : list of str
        List of GPR rules in symbol format associated with the reactions.
        Order should match the order of the reaction names.

    Returns
    -------
    task_by_rxn : pandas.DataFrame
        Updated DataFrame representing the relationship between tasks and reactions.

    task_by_gene : pandas.DataFrame
        Updated DataFrame representing the relationship between tasks and genes.

    rxn_by_gene : pandas.DataFrame
        Updated DataFrame representing the relationship between reactions and genes.

    task_info : pandas.DataFrame
        Updated DataFrame containing information about tasks, including the task name,
        system (major group of tasks), and subsystem (specific group of tasks).

    rxn_info : pandas.DataFrame
        Updated DataFrame containing information about reactions, including the reaction name,
        and the associated GPR rules in HGNC and symbol format.
    """
    # Work on copies so the caller's DataFrames are never modified.
    task_by_rxn = task_by_rxn.copy()
    task_by_gene = task_by_gene.copy()
    rxn_by_gene = rxn_by_gene.copy()
    task_info = task_info.copy()
    rxn_info = rxn_info.copy()

    # Add task
    if task_name not in task_by_rxn.index:
        task_by_rxn.loc[task_name] = task_by_rxn.shape[1] * [0]
    if task_name not in task_by_gene.index:
        task_by_gene.loc[task_name] = task_by_gene.shape[1] * [0]

    # Annotate task.
    # Rows are appended below under integer labels, so looking for the task name
    # in the index alone never matches and the task would be annotated again on
    # every call. The name is written to the first column, so check that as well.
    already_annotated = task_name in task_info.index
    if not already_annotated and task_info.shape[1] > 0:
        already_annotated = task_name in task_info.iloc[:, 0].values.tolist()
    if not already_annotated:
        task_info.loc[len(task_info)] = [task_name, task_system, task_subsystem]

    # Add rxns
    for rxn_name, gpr_hgnc, gpr_symbol in zip(rxn_names, gpr_hgncs, gpr_symbols):
        # Add to GPR rules
        if rxn_name not in rxn_info.Reaction.values.tolist():
            rxn_info.loc[len(rxn_info)] = [rxn_name, gpr_hgnc, gpr_symbol]

        # Add rxn to task_by_rxn
        if rxn_name not in task_by_rxn.columns:
            task_by_rxn[rxn_name] = [0] * task_by_rxn.shape[0]
        task_by_rxn.loc[task_name, rxn_name] = 1

        # Add rxn to rxn_by_gene
        if rxn_name not in rxn_by_gene.index:
            rxn_by_gene.loc[rxn_name] = [0] * rxn_by_gene.shape[1]

        # Parse the GPR rule once and register every gene in both gene tables.
        for gene in find_genes_gpr(gpr_symbol):
            if gene not in rxn_by_gene.columns:
                rxn_by_gene[gene] = [0] * rxn_by_gene.shape[0]
            rxn_by_gene.loc[rxn_name, gene] = 1

            if gene not in task_by_gene.columns:
                task_by_gene[gene] = [0] * task_by_gene.shape[0]
            task_by_gene.loc[task_name, gene] = 1

    return task_by_rxn, task_by_gene, rxn_by_gene, task_info, rxn_info
def combine_and_sort_dataframes(df1, df2, preference='max'):
    """
    Merges two DataFrames over the union of their labels and sorts rows and columns.

    Parameters
    ----------
    df1 : pandas.DataFrame
        First DataFrame to combine.

    df2 : pandas.DataFrame
        Second DataFrame to combine.

    preference : str, optional (default: 'max')
        Rule used to pick a value when a cell is present in both DataFrames.
        Options: 'max', 'min', 'df1', 'df2'.

    Returns
    -------
    combined_df : pandas.DataFrame
        DataFrame holding every row and column found in df1 or df2, with both axes
        sorted. Cells that are missing in both inputs are filled with 0.

    Raises
    ------
    ValueError
        If preference is not one of 'max', 'min', 'df1', or 'df2'.
    """
    # Labels that appear in either input, per axis.
    row_labels = df1.index.union(df2.index)
    col_labels = df1.columns.union(df2.columns)

    # Start from an all-NaN frame and lay the values of df1 on top of it.
    merged = pd.DataFrame(np.nan, index=row_labels, columns=col_labels)
    merged.update(df1)

    # Bring df2 to the same shape; cells it does not cover become NaN.
    other = df2.reindex(index=row_labels, columns=col_labels)

    if preference == 'max' or preference == 'min':
        # fmax/fmin ignore NaN, so a value present on one side only is kept.
        picker = np.fmax if preference == 'max' else np.fmin
        merged = merged.combine(other, picker)
    elif preference == 'df1' or preference == 'df2':
        # 'df1': only fill the gaps left by df1; 'df2': let df2 overwrite df1.
        merged.update(other, overwrite=(preference == 'df2'))
    else:
        raise ValueError("Invalid preference. Choose 'max', 'min', 'df1', or 'df2'.")

    # Sort rows, then columns, and replace whatever is still missing with 0.
    ordered = merged.sort_index(axis=0).sort_index(axis=1)
    combined_df = ordered.fillna(0)
    return combined_df
def handle_duplicate_indexes(df, value_column=None, operation='first'):
    """
    Handles duplicated indexes in a DataFrame by keeping the min, max, mean, first,
    or last value associated with them in a specified column.

    For 'min', 'max' and 'mean', one row per index label is kept (the first
    occurrence) and only ``value_column`` is replaced by the aggregated value;
    every other column keeps the value of that first occurrence.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with duplicated indexes.

    value_column : str, optional (default: None)
        Name of the column containing values to make a decision when handling
        duplicated indexes. This value is optional only when operation is
        'first' or 'last'.

    operation : str, optional (default: 'first')
        Operation to perform when handling duplicated indexes.
        Options: 'min', 'max', 'mean', 'first', 'last'.

    Returns
    -------
    df_result : pandas.DataFrame
        DataFrame with duplicated indexes handled according to the specified
        operation. An empty input is returned as a copy, before any argument
        validation takes place.

    Raises
    ------
    ValueError
        If operation is not one of the supported options.

    AssertionError
        If value_column is None and operation is 'min', 'max', or 'mean'.
    """
    if df.empty:
        return df.copy()

    if operation not in ('min', 'max', 'mean', 'first', 'last'):
        # The message lists every accepted option, including 'last'.
        raise ValueError("Operation must be 'min', 'max', 'mean', 'first', or 'last'")

    if operation in ('first', 'last'):
        # Positional choice: no value column is needed.
        return df[~df.index.duplicated(keep=operation)]

    # Raised explicitly (instead of using an assert statement) so the check is
    # not stripped under `python -O`, while the exception type stays the same.
    if value_column is None:
        raise AssertionError("A value column must be provided for operations other than 'first' or 'last'")

    # Group by index and apply the specified operation ('min', 'max' or 'mean').
    df_grouped = df.groupby(level=0).agg({value_column: operation})

    # Keep one row per label, then overwrite the value column with the
    # aggregated values (the assignment aligns on the index labels).
    df_result = df.loc[~df.index.duplicated(keep='first')].copy()
    df_result[value_column] = df_grouped[value_column]
    return df_result