Source code for sccellfie.plotting.differential_results

import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import textwrap

from matplotlib.lines import Line2D



[docs]
def create_volcano_plot(de_results, effect_threshold=0.75, padj_threshold=0.05, cell_type=None, group1=None,
                        group2=None, effect_col='cohens_d', effect_title="Cohen's d", wrapped_title_length=50,
                        save=None, dpi=300, tight_layout=True):
    """
    Creates a volcano plot for differential analysis results.

    Parameters
    ----------
    de_results : pd.DataFrame
        A DataFrame containing the results of the differential analysis.
        Required columns: 'feature', 'adj_p_value', and the column specified in effect_col.
        Optional columns: 'cell_type', 'group1', 'group2'.

    effect_threshold : float, optional (default: 0.75)
        The threshold for the effect size (e.g., log2 fold change or Cohen's d)
        to consider a variable significant.

    padj_threshold : float, optional (default: 0.05)
        The threshold for the adjusted p-value to consider a variable significant.

    cell_type : str, optional (default: None)
        The specific cell type to plot. If None and cell_type column exists,
        all cell types are plotted.

    group1 : str, optional (default: None)
        The first group in the comparison. If None, all group1 values are included.

    group2 : str, optional (default: None)
        The second group in the comparison. If None, all group2 values are included.

    effect_col : str, optional (default: 'cohens_d')
        The column in de_results that contains the effect size values.

    effect_title : str, optional (default: "Cohen's d")
        The title to use for the effect size in the plot.

    wrapped_title_length : int, optional (default: 50)
        The maximum number of characters per line in the title.

    save : str, optional (default: None)
        The file path to save the plot. If None, the plot is not saved.
        A file extension (e.g., '.png') can be provided to specify the file format.

    dpi : int, optional (default: 300)
        The resolution of the saved figure.

    tight_layout : bool, optional (default: True)
        Whether to use tight layout for the plot.

    Returns
    -------
    list
        A list of feature names that are considered significant based on the
        provided thresholds, sorted by effect size in ascending order.
        Returns an empty list if no significant features are found.

    Notes
    -----
    This function creates a volcano plot where:
    - x-axis represents the effect size (e.g., log2 fold change or Cohen's d)
    - y-axis represents the -log10(adjusted p-value)
    - Gray points indicate non-significant features
    - Red points indicate significant features that pass both thresholds
    - Dashed lines indicate the significance thresholds
    """
    # Create a copy of the results to avoid modifying the original
    df = de_results.copy()

    # Filter based on cell type if provided and if column exists
    if cell_type is not None and 'cell_type' in df.columns:
        df = df[df['cell_type'] == cell_type]

    # Filter based on groups if provided
    if group1 is not None:
        df = df[df['group1'] == group1]
    if group2 is not None:
        df = df[df['group2'] == group2]

    # Calculate -log10(padj) for y-axis
    df['neg_log_padj'] = -np.log10(df['adj_p_value'])
    # Replace infinite values with a large number for visualization
    df['neg_log_padj'] = df['neg_log_padj'].replace(np.inf, 300)

    # Create the volcano plot
    fig, ax = plt.subplots(figsize=(10, 8))

    # Define significance filter based on thresholds
    sig_filter = (df['adj_p_value'] < padj_threshold) & \
                 (df[effect_col].abs() >= effect_threshold)

    # Plot non-significant points in gray
    ax.scatter(df[~sig_filter][effect_col], df[~sig_filter]['neg_log_padj'],
               color='gray', alpha=0.5, label='Not significant')

    # Get and plot significant points in red
    significant_points = df[sig_filter].copy()  # Make a copy to avoid SettingWithCopyWarning
    if not significant_points.empty:
        ax.scatter(significant_points[effect_col], significant_points['neg_log_padj'],
                   color='red', alpha=0.8, label='Significant')

    # Add labels and title
    ax.set_xlabel(effect_title)
    ax.set_ylabel('-log10(adj. p-value)')

    # Create title based on filters
    title_parts = []
    if cell_type is not None:
        title_parts.append(f'Cell Type: {cell_type}')
    if group1 is not None and group2 is not None:
        title_parts.append(f'{group1} vs {group2}')

    # Set and wrap the title
    title = 'Volcano Plot' if not title_parts else ' - '.join(title_parts)
    wrapped_title = "\n".join(textwrap.wrap(title, width=wrapped_title_length))
    ax.set_title(wrapped_title)

    # Set the x-axis limits to center the plot around zero
    max_lfc = np.ceil(np.nanmax(df[effect_col].abs().values))
    ax.set_xlim(-max_lfc, max_lfc)

    # Add threshold lines
    ax.axhline(-np.log10(padj_threshold), color='darkgray', linestyle='--')
    ax.axvline(-effect_threshold, color='darkgray', linestyle='--')
    ax.axvline(effect_threshold, color='darkgray', linestyle='--')

    # Add legend and grid
    ax.legend()
    ax.grid(True, linestyle=':', alpha=0.5)

    # Apply tight layout if requested
    if tight_layout:
        plt.tight_layout()

    # Save the plot if a save path is provided
    if save:
        from sccellfie.plotting.plot_utils import _get_file_format, _get_file_dir

        # Get directory and base filename
        dir, basename = _get_file_dir(save)
        os.makedirs(dir, exist_ok=True)
        format = _get_file_format(save)

        # Update save name to include filters
        save_parts = []
        if cell_type:
            save_parts.append(cell_type)
        if group1 and group2:
            save_parts.append(f'{group1}_vs_{group2}')
        suffix = '_'.join(save_parts)
        suffix = f'_{suffix}' if suffix else ''

        # Construct the final filename with 'volcano_' prefix
        final_filename = f'volcano_{basename}{suffix}.{format}'
        final_path = os.path.join(dir, final_filename)

        # Save the figure
        plt.savefig(final_path, dpi=dpi, bbox_inches='tight')

    # Return sorted list of significant features
    return (significant_points.sort_values(effect_col, ascending=True)
            .feature.unique().tolist() if not significant_points.empty else [])




[docs]
def create_comparative_violin(adata, significant_features, group1, group2, condition_key,
                              celltype, cell_type_key, xlabel='Feature', ylabel='Metabolic Activity',
                              title=None, wrapped_title_length=50, figsize=(16, 7), fontsize=10, violin_cut=0,
                              palette=['coral', 'lightsteelblue'], lgd_bbox_to_anchor=(1.05, 1), lgd_loc='upper left',
                              save=None, dpi=300, tight_layout=True):
    """
    Compares features between two groups for a specific cell type in an AnnData object and creates a violin plot.

    Parameters
    ----------
    adata : AnnData
        An AnnData object containing the data.

    significant_features : list
        List of significant feature names from the volcano plot function, sorted by effect size.

    group1 : str
        The name of the first group to compare.

    group2 : str
        The name of the second group to compare.

    condition_key : str
        The column name in adata.obs containing the condition information.

    celltype : str
        The cell type to analyze.

    cell_type_key : str
        The column name in adata.obs containing the cell type information.

    xlabel : str, optional (default: 'Feature')
        The label for the x-axis.

    ylabel : str, optional (default: 'Metabolic Activity')
        The label for the y-axis.

    title : str, optional (default: None)
        The title for the plot. If None, a default title is used.

    wrapped_title_length : int, optional (default: 50)
        The maximum number of characters per line in the title.

    figsize : tuple, optional (default: (16, 7))
        The figure size.

    fontsize : int, optional (default: 10)
        The font size for the labels and legend.

    violin_cut : float, optional (default: 0)
        The cut parameter for the violin plot. Distance, in units of bandwidth,
         to extend the density past extreme datapoints. Set to 0 to limit the
         violin within the data range.

    palette : list, optional (default: ['coral', 'lightsteelblue'])
        The color palette for the plot. Each color corresponds to a group or condition.

    lgd_bbox_to_anchor : tuple, optional (default: (1.05, 1))
        The bbox_to_anchor parameter for the legend.

    lgd_loc : str, optional (default: 'upper left')
        The location of the legend.

    save : str, optional (default: None)
        The file path to save the plot. If None, the plot is not saved.

    dpi : int, optional (default: 300)
        The resolution of the saved figure.

    tight_layout : bool, optional (default: True)
        Whether to use tight layout for the plot.

    Returns
    -------
    fig, ax : matplotlib.pyplot.Figure, matplotlib.pyplot.Axes
        The matplotlib Figure and Axes objects for the plot.
    """
    # Filter cells for each group
    cells_group1 = adata.obs.loc[(adata.obs[condition_key] == group1) & (adata.obs[cell_type_key] == celltype)].index
    cells_group2 = adata.obs.loc[(adata.obs[condition_key] == group2) & (adata.obs[cell_type_key] == celltype)].index

    # Compile data for significant features
    feature_dfs = []
    for feature in significant_features:
        series1 = adata[cells_group1].to_df()[feature]
        series2 = adata[cells_group2].to_df()[feature]
        df = pd.concat([series1, series2], axis=0)
        df = df.reset_index(level=0)
        df = df.rename(columns={feature: ylabel})
        df['Feature'] = feature
        df['Group'] = pd.Series([group1] * len(series1) + [group2] * len(series2))
        feature_dfs.append(df)

    feature_df = pd.concat(feature_dfs, axis=0)

    # Create the violin plot
    fig, ax = plt.subplots(figsize=figsize)
    sns.violinplot(data=feature_df, x='Feature', y=ylabel, hue='Group', cut=violin_cut,
                   split=True, inner="quart", linewidth=0.5, density_norm="width",
                   palette=palette, order=significant_features, ax=ax)

    ax.set_xlabel(xlabel, fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90, rotation_mode='anchor', ha='right', va='center',
                       fontsize=fontsize - 4)
    ax.tick_params(axis='both', which='major', labelsize=fontsize - 4)
    plt.legend(frameon=False, fontsize=fontsize - 2, bbox_to_anchor=lgd_bbox_to_anchor, loc=lgd_loc)

    if title is None:
        title = f"Feature Comparison: {group1} vs {group2} ({celltype})"
    wrapped_title = "\n".join(textwrap.wrap(title, width=wrapped_title_length))
    plt.title(wrapped_title, fontsize=fontsize + 2)

    if tight_layout:
        plt.tight_layout()

    if save:
        from sccellfie.plotting.plot_utils import _get_file_format, _get_file_dir
        dir, basename = _get_file_dir(save)
        os.makedirs(dir, exist_ok=True)
        format = _get_file_format(save)
        plt.savefig(f'{dir}/violin_{basename}.{format}', dpi=dpi, bbox_inches='tight')

    return fig, ax




[docs]
def create_beeswarm_plot(df, x='log2FC', y='cell_type', cohen_threshold=0.5, pval_threshold=0.05, show_n_significant=True,
                         logfc_threshold=1.0, title=None, title_fontsize=20, ticks_fontsize=14, labels_fontsize=16,
                         condition1_color='#8B0000', condition2_color='#000080', ns_color='#808080',
                         strip_size=4, strip_alpha=0.6, strip_jitter=0.2, lgd_fontsize=14,
                         lgd_marker_size=12, lgd_frameon=False, lgd_loc='upper left', lgd_bbox_to_anchor=(1.1, 1),
                         sort_lambda=None, figsize=(10, 12), save=None, dpi=300, tight_layout=True):
    """
    Creates a beeswarm plot to visualize differential analysis results.
    X-axis represents the effect size (e.g., log2 fold change or Cohen's d).
    Y-axis represents the cell types or any categorical variable.

    Parameters
    ----------
    df : DataFrame
        A DataFrame containing the results of the differential analysis, using the function `pairwise_differential_analysis`.
        The DataFrame should have at lest the following columns: 'cell_type', 'feature', 'group1',
        'group2', 'log2FC', 'cohens_d', 'adj_p_value'.

    x : str, optional (default: 'log2FC')
        The column in df to use as the x-axis.

    y : str, optional (default: 'cell_type')
        The column in df to use as the y-axis.

    cohen_threshold : float, optional (default: 0.5)
        The threshold for Cohen's D to consider a feature significant.

    pval_threshold : float, optional (default: 0.05)
        The threshold for the adjusted p-value to consider a feature significant.

    show_n_significant : bool, optional (default: True)
        Whether to show the count of significant features per cell type.

    logfc_threshold : float, optional (default: 1.0)
        The threshold for the log2 fold change to consider a feature significant.

    title : str, optional (default: None)
        The title for the plot. If None, a default title is used.

    title_fontsize : int, optional (default: 20)
        The font size for the title.

    ticks_fontsize : int, optional (default: 14)
        The font size for the ticks.

    labels_fontsize : int, optional (default: 16)
        The font size for the labels.

    condition1_color : str, optional (default: '#8B0000')
        The color for the first condition.

    condition2_color : str, optional (default: '#000080')
        The color for the second condition.

    ns_color : str, optional (default: '#808080')
        The color for non-significant features.

    strip_size : int, optional (default: 4)
        The size of the strip plot points.

    strip_alpha : float, optional (default: 0.6)
        The transparency of the strip plot points.

    strip_jitter : float, optional (default: 0.2)
        The amount of jitter to apply to the strip plot points.

    lgd_fontsize : int, optional (default: 14)
        The font size for the legend.

    lgd_marker_size : int, optional (default: 12)
        The size of the legend markers.

    lgd_frameon : bool, optional (default: False)
        Whether to show the legend frame.

    lgd_loc : str, optional (default: 'upper left')
        The location of the legend.

    lgd_bbox_to_anchor : tuple, optional (default: (1.1, 1))
        The bbox_to_anchor parameter for the legend.

    sort_lambda : function, optional (default: None)
        A lambda function to sort the y-axis values. If None, the values are sorted by the y-axis column.

    figsize : tuple, optional (default: (10, 12))
        The figure size.

    save : str, optional (default: None)
        The file path to save the plot. If None, the plot is not saved.

    dpi : int, optional (default: 300)
        The resolution of the saved figure.

    tight_layout : bool, optional (default: True)
        Whether to use tight layout for the plot.

    Returns
    -------
    fig, ax : matplotlib.pyplot.Figure, matplotlib.pyplot.Axes
        The matplotlib Figure and Axes objects for the plot.

    sig_df : DataFrame
        The input DataFrame filtered to only include significant features.
        Index is set to 'cell_type' and 'feature'.
    """
    df = df.reset_index().sort_values(by=y, key=sort_lambda)
    sorted_cells = df[y].drop_duplicates().tolist()

    # Extract conditions from contrast
    first_condition = df['group1'].iloc[0]
    second_condition = df['group2'].iloc[0]

    # Create significance column
    df['significance'] = 'NS'  # NS = Not Significant
    mask = (df['cohens_d'].abs() >= cohen_threshold) & (df['adj_p_value'] < pval_threshold)

    # Mark points as significant based on direction of change
    df.loc[mask & (df['log2FC'] <= -logfc_threshold), 'significance'] = first_condition
    df.loc[mask & (df['log2FC'] >= logfc_threshold), 'significance'] = second_condition

    # Color mapping
    color_map = {
        first_condition: condition1_color,
        second_condition: condition2_color,
        'NS': ns_color
    }

    # Create the plot
    fig, ax = plt.subplots(figsize=figsize)

    # Add strip plot for all features
    sns.stripplot(data=df,
                  y=y,
                  x=x,
                  hue='significance',
                  palette=color_map,
                  size=strip_size,
                  alpha=strip_alpha,
                  jitter=strip_jitter,
                  ax=ax,
                  legend=False)  # Remove automatic legend

    # Add reference lines
    lines = {'cohens_d': cohen_threshold, 'log2FC': logfc_threshold}
    ax.axvline(x=-lines[x], color=condition1_color, linestyle='--', alpha=0.5)  # First condition threshold
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.5)  # Zero line
    ax.axvline(x=lines[x], color=condition2_color, linestyle='--', alpha=0.5)  # Second condition threshold

    # Customize the plot
    label = {'cohens_d': "Cohen's D", 'log2FC': 'log2(Fold Change)'}
    ax.set_xlabel(label[x], fontsize=labels_fontsize, fontweight='bold')
    ax.set_ylabel(y.capitalize().replace('_', ' '), fontsize=labels_fontsize, fontweight='bold')

    # Create custom legend elements
    legend_elements = [
        Line2D([0], [0], marker='o', color='w',
               markerfacecolor=color_map[first_condition],
               markersize=lgd_marker_size,
               label=f"FDR < {pval_threshold} ({first_condition})"
               # and Cohen's D $\geq$ $\pm${cohen_threshold} ({first_condition})"
               ),
        Line2D([0], [0], marker='o', color='w',
               markerfacecolor=color_map[second_condition],
               markersize=lgd_marker_size,
               label=f"FDR < {pval_threshold} ({second_condition})"
               # and Cohen's D $\geq$ $\pm${cohen_threshold} ({second_condition})"
               ),
        Line2D([0], [0], marker='o', color='w',
               markerfacecolor=color_map['NS'],
               markersize=lgd_marker_size,
               label=f"FDR $\geq$ {pval_threshold}"  # or Cohen's D < $\pm${cohen_threshold}"
               )]

    # Add custom legend
    legend = ax.legend(handles=legend_elements,
                       bbox_to_anchor=lgd_bbox_to_anchor,
                       fontsize=lgd_fontsize,
                       frameon=lgd_frameon,
                       loc=lgd_loc)

    # Show all spines
    ax.spines['right'].set_visible(True)
    ax.spines['top'].set_visible(True)

    xlim = ax.get_xlim()

    # Add count of significant features per cell type
    if show_n_significant:
        for i, cell_type in enumerate(sorted_cells):
            cell_data = df[df[y] == cell_type]
            sig_first = sum((cell_data['significance'] == first_condition))
            sig_second = sum((cell_data['significance'] == second_condition))
            if sig_first > 0 or sig_second > 0:
                ax.text(xlim[1] + 0.1, i, f'n={sig_first + sig_second}',
                        va='center', ha='left', fontsize=ticks_fontsize, fontweight='bold')

    ax.tick_params(axis='both', which='major', labelsize=ticks_fontsize)
    # Add contrast as title
    if title is None:
        title = f'{first_condition} vs {second_condition}'
    plt.title(title, pad=20, fontweight='bold', fontsize=title_fontsize)

    plt.grid(color='#E0E0E0')

    # Adjust layout
    if tight_layout:
        plt.tight_layout()

    # Output
    sig_df = df.loc[df.significance != 'NS'].drop(columns=['significance']).reset_index(drop=True).set_index(['cell_type', 'feature'])

    if save:
        from sccellfie.plotting.plot_utils import _get_file_format, _get_file_dir
        dir, basename = _get_file_dir(save)
        os.makedirs(dir, exist_ok=True)
        format = _get_file_format(save)
        plt.savefig(f'{dir}/beeswarm_{basename}.{format}', dpi=dpi, bbox_inches='tight')
    return fig, ax, sig_df