Source code for sccellfie.preprocessing.matrix_utils

import numpy as np
import pandas as pd
from scipy.sparse import issparse


def min_max_normalization(df, axis=0):
    """
    Rescales values to the [0, 1] range along the requested axis.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Data to rescale. Anything that is not already a DataFrame is
        passed to the ``pandas.DataFrame`` constructor first.

    axis : int, optional (default: 0)
        Axis along which the minimum and maximum are computed.
        Use 0 to rescale every column with its own min/max, or 1 to
        rescale every row with its own min/max.

    Returns
    -------
    df_scaled : pandas.DataFrame
        A new DataFrame holding ``(x - min) / (max - min)`` for every
        entry. Entries that end up as NaN are replaced by 0; this includes
        slices whose values are all identical (``0 / 0``) as well as
        entries that were NaN in the input.
    """
    frame = df.copy() if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    # Per-column statistics (axis=0) must be aligned on the columns (axis=1)
    # when broadcasting, and vice versa.
    align_axis = 1 - axis

    lower = frame.min(axis=axis)
    upper = frame.max(axis=axis)
    value_range = upper - lower

    shifted = frame.sub(lower, axis=align_axis)
    rescaled = shifted.div(value_range, axis=align_axis)
    return rescaled.fillna(0)


def get_matrix_gene_expression(matrix, var_names, gene, normalize=False):
    """
    Safely extracts expression values for a gene from any matrix type.

    Parameters
    ----------
    matrix : numpy.ndarray, scipy sparse matrix or pandas.DataFrame
        The matrix containing the expression data. Rows correspond to
        cells and columns to genes. For a DataFrame, the gene is selected
        by column label (``matrix[gene]``); for the other types it is
        selected by the position of ``gene`` in ``var_names``.

    var_names : pandas.Index, list or array-like
        The gene names, in the same order as the columns of ``matrix``.

    gene : str
        The gene name to extract.

    normalize : bool, optional (default: False)
        If True, apply min-max normalization to the expression values.

    Returns
    -------
    expression : numpy.ndarray
        A 1-D float64 array containing the expression values for the
        specified gene, whether or not ``normalize`` is set.

    Raises
    ------
    KeyError
        If ``var_names`` is a pandas.Index that does not contain ``gene``.

    ValueError
        If ``var_names`` is a list that does not contain ``gene``, or if
        ``matrix`` is of an unsupported type.

    IndexError
        If ``var_names`` is another array-like that does not contain
        ``gene``.
    """
    # Find gene index
    if isinstance(var_names, pd.Index):
        gene_idx = var_names.get_loc(gene)
    elif isinstance(var_names, list):
        gene_idx = var_names.index(gene)
    else:
        matches = np.where(var_names == gene)[0]
        if len(matches) == 0:
            # Same exception type as the bare ``[0][0]`` lookup would raise,
            # but with a message that names the missing gene.
            raise IndexError(f"Gene '{gene}' not found in var_names")
        gene_idx = matches[0]

    # Handle different matrix types
    if issparse(matrix):
        expression = matrix[:, gene_idx].toarray().flatten()
    elif isinstance(matrix, np.ndarray):
        expression = matrix[:, gene_idx].flatten()
    elif isinstance(matrix, pd.DataFrame):
        expression = matrix[gene].values
    else:
        raise ValueError(f"Unsupported matrix type: {type(matrix)}")

    expression = expression.astype(np.float64)

    # Apply normalization if requested
    if normalize:
        # min_max_normalization wraps the 1-D vector in a single-column
        # DataFrame, so its values come back with shape (n, 1). Flatten to
        # keep the return shape identical to the non-normalized path.
        expression = min_max_normalization(expression).values.flatten()

    return expression


def compute_dataframes_correlation(df1, df2, col_name=None, method='spearman'):
    """
    Correlates the single column of `df1` with every column of `df2`.

    Parameters
    ----------
    df1 : pandas.DataFrame
        DataFrame holding the reference column. It must contain exactly
        one column; otherwise a ValueError is raised.

    df2 : pandas.DataFrame
        DataFrame whose columns are each correlated against the
        reference column of `df1`.

    col_name : str, optional (default: None)
        Name of the reference column in `df1`. If given, it must be
        present in `df1`. If None, the first column of `df1` is used.
        The same name labels the column of coefficients in the result.

    method : str, optional (default: 'spearman')
        The correlation method to use. Either 'pearson' or 'spearman'.

    Returns
    -------
    pandas.DataFrame
        One row per column of `df2` (index named 'Column') and a single
        column, named after `col_name`, holding the correlation
        coefficients. Rows are sorted by coefficient in descending order.

    Raises
    ------
    ValueError
        If `method` is not 'pearson' or 'spearman', if `df1` does not
        have exactly one column, or if `col_name` is not a column of `df1`.
    """
    # Guard clauses: method first, then the shape of df1, then the column name.
    if method not in ('pearson', 'spearman'):
        raise ValueError("method must be either 'pearson' or 'spearman'")

    if len(df1.columns) != 1:
        raise ValueError("First DataFrame should have exactly one column")

    if col_name is None:
        col_name = df1.columns[0]
    elif col_name not in df1.columns:
        raise ValueError(f"Column '{col_name}' not found in df1")

    reference = df1[col_name]

    # One coefficient per column of df2, keyed by that column's label.
    coefficients = {
        label: reference.corr(df2[label], method=method)
        for label in df2.columns
    }

    table = pd.DataFrame(coefficients.items(), columns=['Column', col_name])
    return table.set_index('Column').sort_values(by=col_name, ascending=False)