import numpy as np
import pandas as pd
from sccellfie.preprocessing.gpr_rules import find_genes_gpr
[docs]
def get_element_associations(df, element, axis_element=0):
    """
    Lists the labels that have a non-zero association with an element.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the associations.

    element : str
        Element (task, reaction, or gene) to look up. The name must match
        exactly a label in the index or the columns of the DataFrame.

    axis_element : int, optional (default: 0)
        Axis where the element is located: 0 looks it up in the rows,
        1 looks it up in the columns.

    Returns
    -------
    associations : list of str
        Sorted labels along the opposite axis whose value for ``element``
        is different from zero.

    Raises
    ------
    ValueError
        If ``axis_element`` is neither 0 nor 1.
    """
    if axis_element not in (0, 1):
        raise ValueError('Not a valid axis')

    # Pull out the row or the column that belongs to the element.
    values = df.loc[element, :] if axis_element == 0 else df.loc[:, element]

    # Keep only the entries that are different from zero.
    linked = values.loc[values.ne(0)]

    associations = list(linked.index)
    associations.sort()
    return associations
[docs]
def add_new_task(task_by_rxn, task_by_gene, rxn_by_gene, task_info, rxn_info,
                 task_name, task_system, task_subsystem, rxn_names, gpr_hgncs, gpr_symbols):
    """
    Adds a new task and its associated reactions and genes to the database.

    All input DataFrames are copied first, so the caller's objects are
    never modified; the updated copies are returned.

    Parameters
    ----------
    task_by_rxn : pandas.DataFrame
        DataFrame representing the relationship between tasks (rows) and
        reactions (columns).

    task_by_gene : pandas.DataFrame
        DataFrame representing the relationship between tasks (rows) and
        genes (columns).

    rxn_by_gene : pandas.DataFrame
        DataFrame representing the relationship between reactions (rows) and
        genes (columns).

    task_info : pandas.DataFrame
        DataFrame containing information about tasks. New rows are written
        positionally as [task name, system, subsystem], so the first column
        is treated as the task-name column.

    rxn_info : pandas.DataFrame
        DataFrame containing information about reactions. It must have a
        ``Reaction`` column; new rows are written positionally as
        [reaction name, GPR in HGNC format, GPR in symbol format].

    task_name : str
        Name of the task to add.

    task_system : str
        System (major group of tasks) to which the task belongs.

    task_subsystem : str
        Subsystem (specific group of tasks) to which the task belongs.

    rxn_names : list of str
        List of reaction names associated with the task.

    gpr_hgncs : list of str
        List of GPR rules in HGNC format associated with the reactions. Order
        should match the order of the reaction names.

    gpr_symbols : list of str
        List of GPR rules in symbol format associated with the reactions. Order
        should match the order of the reaction names.

    Returns
    -------
    task_by_rxn : pandas.DataFrame
        Updated copy of the task-by-reaction DataFrame.

    task_by_gene : pandas.DataFrame
        Updated copy of the task-by-gene DataFrame.

    rxn_by_gene : pandas.DataFrame
        Updated copy of the reaction-by-gene DataFrame.

    task_info : pandas.DataFrame
        Updated copy of the task annotation DataFrame.

    rxn_info : pandas.DataFrame
        Updated copy of the reaction annotation DataFrame.
    """
    # Work on copies so the caller's DataFrames are left untouched.
    task_by_rxn = task_by_rxn.copy()
    task_by_gene = task_by_gene.copy()
    rxn_by_gene = rxn_by_gene.copy()
    task_info = task_info.copy()
    rxn_info = rxn_info.copy()

    # Add task as an all-zero row where it is not present yet.
    if task_name not in task_by_rxn.index:
        task_by_rxn.loc[task_name] = [0] * task_by_rxn.shape[1]
    if task_name not in task_by_gene.index:
        task_by_gene.loc[task_name] = [0] * task_by_gene.shape[1]

    # Annotate task. Rows are appended under positional labels, so the task
    # name lives in the first column rather than in the index; checking only
    # the index would append a duplicate row for an already-known task.
    # The index check is kept for frames that happen to be labelled by task.
    already_annotated = (
        task_name in task_info.index
        or task_name in task_info.iloc[:, 0].tolist()
    )
    if not already_annotated:
        task_info.loc[len(task_info)] = [task_name, task_system, task_subsystem]

    # Add rxns
    for rxn_name, gpr_hgnc, gpr_symbol in zip(rxn_names, gpr_hgncs, gpr_symbols):
        # Add to GPR rules
        if rxn_name not in rxn_info['Reaction'].tolist():
            rxn_info.loc[len(rxn_info)] = [rxn_name, gpr_hgnc, gpr_symbol]

        # Add rxn to task_by_rxn
        if rxn_name not in task_by_rxn.columns:
            task_by_rxn[rxn_name] = [0] * task_by_rxn.shape[0]
        task_by_rxn.loc[task_name, rxn_name] = 1

        # Add rxn to rxn_by_gene
        if rxn_name not in rxn_by_gene.index:
            rxn_by_gene.loc[rxn_name] = [0] * rxn_by_gene.shape[1]

        # Parse the GPR rule once; the same genes feed both gene matrices.
        genes = list(find_genes_gpr(gpr_symbol))
        for gene in genes:
            # Link the gene to the reaction
            if gene not in rxn_by_gene.columns:
                rxn_by_gene[gene] = [0] * rxn_by_gene.shape[0]
            rxn_by_gene.loc[rxn_name, gene] = 1

            # Link the gene to the task
            if gene not in task_by_gene.columns:
                task_by_gene[gene] = [0] * task_by_gene.shape[0]
            task_by_gene.loc[task_name, gene] = 1

    return task_by_rxn, task_by_gene, rxn_by_gene, task_info, rxn_info
[docs]
def combine_and_sort_dataframes(df1, df2, preference='max'):
    """
    Merges two DataFrames over the union of their labels and sorts the result.

    Parameters
    ----------
    df1 : pandas.DataFrame
        First DataFrame to combine.

    df2 : pandas.DataFrame
        Second DataFrame to combine.

    preference : str, optional
        Rule used to pick a value when a cell is present in both DataFrames.
        Options: 'max' (default), 'min', 'df1', 'df2'.

    Returns
    -------
    combined_df : pandas.DataFrame
        DataFrame holding every row and column found in df1 or df2, with rows
        and columns sorted. Cells missing from both inputs are set to 0.

    Raises
    ------
    ValueError
        If ``preference`` is not one of the supported options.
    """
    # Labels covering both inputs.
    row_labels = df1.index.union(df2.index)
    col_labels = df1.columns.union(df2.columns)

    # Start from an all-NaN frame and drop the df1 values into it.
    merged = pd.DataFrame(np.nan, index=row_labels, columns=col_labels)
    merged.update(df1)

    # Bring df2 onto the same grid; cells it lacks become NaN.
    other = df2.reindex(index=row_labels, columns=col_labels)

    if preference in ('max', 'min'):
        # fmax / fmin ignore NaN, so a value present in only one frame wins.
        pick = np.fmax if preference == 'max' else np.fmin
        merged = merged.combine(other, pick)
    elif preference in ('df1', 'df2'):
        # With overwrite=False only the gaps left by df1 are filled from df2;
        # with overwrite=True every non-NaN df2 value replaces the current one.
        merged.update(other, overwrite=(preference == 'df2'))
    else:
        raise ValueError("Invalid preference. Choose 'max', 'min', 'df1', or 'df2'.")

    # Sort rows first, then columns, and replace the remaining gaps with 0.
    merged = merged.sort_index()
    merged = merged.sort_index(axis=1)
    combined_df = merged.fillna(0)
    return combined_df
[docs]
def handle_duplicate_indexes(df, value_column=None, operation='first'):
    """
    Handles duplicated indexes in a DataFrame by keeping the min, max, mean,
    first, or last value associated with them in a specified column.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with duplicated indexes.

    value_column : str, optional (default: None)
        Name of the column containing values to make a decision
        when handling duplicated indexes. This value is optional
        only when operation is 'first' or 'last'.

    operation : str, optional (default: 'first')
        Operation to perform when handling duplicated indexes.
        Options: 'min', 'max', 'mean', 'first', 'last'.

    Returns
    -------
    df_result : pandas.DataFrame
        DataFrame with one row per index label. For 'first' and 'last' the
        corresponding complete row is kept. For 'min', 'max' and 'mean' the
        first row of each label is kept and only ``value_column`` is replaced
        by the aggregated value, so the other columns still come from that
        first row. An empty input is returned as a copy without any checks.

    Raises
    ------
    ValueError
        If ``operation`` is not one of the supported options.

    AssertionError
        If ``value_column`` is None and ``operation`` is 'min', 'max' or 'mean'.
    """
    if df.empty:
        return df.copy()

    if operation not in ('min', 'max', 'mean', 'first', 'last'):
        raise ValueError("Operation must be 'min', 'max', 'mean', 'first', or 'last'")

    if operation in ('first', 'last'):
        return df[~df.index.duplicated(keep=operation)]

    # Raised explicitly (instead of using an assert statement) so the check
    # still runs under ``python -O``; the exception type is kept unchanged
    # for callers that already catch AssertionError.
    if value_column is None:
        raise AssertionError("A value column must be provided for operations other than 'first' or 'last'")

    # 'min', 'max' and 'mean' are all valid aggregation names, so the
    # operation string can be handed to agg() directly.
    df_grouped = df.groupby(level=0).agg({value_column: operation})

    # Keep the first row of every label for the remaining columns, then
    # overwrite the value column with the aggregate (aligned on the index).
    df_result = df.loc[~df.index.duplicated(keep='first')].copy()
    df_result[value_column] = df_grouped[value_column]
    return df_result