Source code for sccellfie.preprocessing.matrix_utils

import numpy as np
import pandas as pd
from scipy.sparse import issparse



[docs]
def min_max_normalization(df, axis=0):
    """
    Applies min-max normalization along specified axis.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        The input data to be normalized. Anything that is not already a
        DataFrame is converted with ``pandas.DataFrame``. The input object
        is never modified.

    axis : int or str, optional (default: 0)
        The axis along which to normalize. Use 0 (or 'index') to normalize
        each column or 1 (or 'columns') to normalize each row using their
        cognate min and max values.

    Returns
    -------
    df_scaled : pandas.DataFrame
        A new DataFrame containing the normalized values. Minimum and maximum
        values are calculated along the specified axis and mapped to 0 and 1,
        respectively. NaN values are filled with 0; this includes the NaNs
        produced by constant columns/rows (0 / 0).

    Raises
    ------
    ValueError
        If ``axis`` is not one of 0, 1, 'index' or 'columns'.
    """
    # Resolve the axis once so that the complementary axis (1 - axis) used for
    # broadcasting below is always well defined, also for the string aliases.
    axis_numbers = {0: 0, 1: 1, 'index': 0, 'columns': 1}
    try:
        axis_number = axis_numbers[axis]
    except (KeyError, TypeError):
        raise ValueError(f"axis must be 0, 1, 'index' or 'columns', got {axis!r}") from None

    # No defensive copy is needed: sub/div/fillna all return new objects.
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    min_vals = frame.min(axis=axis_number)
    value_range = frame.max(axis=axis_number) - min_vals

    # min_vals/value_range are indexed by the labels of the *other* axis, so
    # they have to be aligned against that axis when broadcasting.
    broadcast_axis = 1 - axis_number
    df_scaled = (
        frame.sub(min_vals, axis=broadcast_axis)
        .div(value_range, axis=broadcast_axis)
        .fillna(0)
    )
    return df_scaled




[docs]
def get_matrix_gene_expression(matrix, var_names, gene, normalize=False):
    """
    Safely extracts expression values for a gene from any matrix type.

    Parameters
    ----------
    matrix : numpy.ndarray, scipy sparse matrix or pandas.DataFrame
        The matrix containing the expression data. Rows correspond to cells and columns to genes.
        For a DataFrame, the gene is selected by column label.

    var_names : list, pandas.Index or array-like
        The index or array containing the gene names. If a gene name is
        duplicated, its first occurrence is used.

    gene : str
        The gene name to extract.

    normalize : bool, optional (default: False)
        If True, apply min-max normalization to the expression values
        (via ``min_max_normalization`` from this module).

    Returns
    -------
    expression : numpy.ndarray
        A 1-D float64 array containing the expression values for the specified gene.

    Raises
    ------
    KeyError
        If ``var_names`` is a pandas.Index that does not contain ``gene``.
    ValueError
        If ``var_names`` is a list that does not contain ``gene``, or if the
        matrix type is not supported.
    IndexError
        If ``var_names`` is any other array-like that does not contain ``gene``.
    """
    # Find gene index (always resolved, even for DataFrames, so that a gene
    # missing from var_names is reported before touching the matrix).
    if isinstance(var_names, pd.Index):
        loc = var_names.get_loc(gene)
        if isinstance(loc, (int, np.integer)):
            gene_idx = loc
        else:
            # Duplicated labels make get_loc return a slice or boolean mask;
            # keep the first hit, like the list/array branches below.
            gene_idx = np.arange(len(var_names))[loc][0]
    elif isinstance(var_names, list):
        gene_idx = var_names.index(gene)
    else:
        # np.asarray makes the comparison element-wise for tuples and other
        # sequences, where a plain `==` would collapse to a single bool.
        matches = np.flatnonzero(np.asarray(var_names) == gene)
        if matches.size == 0:
            raise IndexError(f"Gene '{gene}' not found in var_names")
        gene_idx = matches[0]

    # Handle different matrix types
    if issparse(matrix):
        expression = matrix[:, gene_idx].toarray()
    elif isinstance(matrix, np.ndarray):
        expression = matrix[:, gene_idx]
    elif isinstance(matrix, pd.DataFrame):
        column = matrix[gene]
        if isinstance(column, pd.DataFrame):
            # Duplicated column labels: keep the first one.
            column = column.iloc[:, 0]
        expression = column.values
    else:
        raise ValueError(f"Unsupported matrix type: {type(matrix)}")

    # np.asarray + ravel guarantees a 1-D result even for np.matrix input;
    # astype returns a fresh array, so the caller's data is never aliased.
    expression = np.asarray(expression).ravel().astype(np.float64)

    # Apply normalization if requested
    if normalize:
        # The helper returns a single-column DataFrame; flatten it so the
        # result has the same 1-D shape as in the non-normalized case.
        expression = min_max_normalization(expression).values.ravel()

    return expression




[docs]
def compute_dataframes_correlation(df1, df2, col_name=None, method='spearman'):
    """
    Computes correlations between one column in `df1` and all columns in another `df2`.

    Parameters
    ----------
    df1 : pandas.DataFrame
        DataFrame of which one column will be correlated against multiple columns in df2.

    df2 : pandas.DataFrame
        DataFrame containing multiple columns to correlate against the single column in df1.

    col_name : str, optional (default: None)
        The name of the column in df1 to correlate against df2. If None, df1
        must contain exactly one column, and that column is used.

    method : str, optional (default: 'spearman')
        The correlation method to use. Either 'pearson' or 'spearman'.

    Returns
    -------
    pandas.DataFrame
        DataFrame with one row per column of df2 (index named 'Column') and a
        single column, named after the df1 column used, holding the
        correlation coefficients. Rows are sorted by coefficient in
        descending order.

    Raises
    ------
    ValueError
        If ``method`` is not supported, if ``col_name`` is None and df1 does
        not have exactly one column, or if ``col_name`` is not a column of df1.
    """
    # Validate correlation method
    if method not in ('pearson', 'spearman'):
        raise ValueError("method must be either 'pearson' or 'spearman'")

    if col_name is None:
        # Without an explicit name the choice is only unambiguous when df1
        # has a single column.
        if len(df1.columns) != 1:
            raise ValueError("First DataFrame should have exactly one column")
        col_name = df1.columns[0]
    elif col_name not in df1.columns:
        raise ValueError(f"Column '{col_name}' not found in df1")

    reference = df1[col_name]

    # Compute one coefficient per df2 column.
    labels = list(df2.columns)
    coefficients = [reference.corr(df2[label], method=method) for label in labels]

    # Build the result from a Series so that no temporary 'Column' column is
    # needed; that would clash when col_name itself is 'Column'.
    correlations = pd.Series(
        coefficients,
        index=pd.Index(labels, name='Column', tupleize_cols=False),
        name=col_name,
        dtype=float,
    )

    return correlations.sort_values(ascending=False).to_frame()