Source code for sccellfie.preprocessing.gpr_rules

import re
import numpy as np


def clean_gene_names(gpr_rule):
    """
    Removes spaces between parentheses and a single enclosed gene ID in a GPR rule.

    Only parentheses that wrap exactly one token are touched, e.g. ``( G1 )``
    becomes ``(G1)``. Groups containing several tokens, such as
    ``( G1 and G2 )``, are left as they are.

    Parameters
    ----------
    gpr_rule : str
        GPR rule to clean.

    Returns
    -------
    cleaned_gpr : str
        Cleaned GPR rule, without spaces between parentheses and gene IDs.
    """
    # A gene ID is any run of characters that is not whitespace, a parenthesis
    # or a comma (the same token definition used by find_genes_gpr). This also
    # covers IDs with punctuation such as "HGNC:5" or "1234.1", which a plain
    # \w+ would miss.
    cleaned_gpr = re.sub(r'\(\s*([^\s(),]+)\s*\)', r'(\1)', gpr_rule)
    return cleaned_gpr
def find_genes_gpr(gpr_rule):
    """
    Collects the gene IDs mentioned in a GPR rule.

    Every token delimited by whitespace, parentheses or commas is considered,
    and the logical operators ``and`` / ``or`` (in any letter case) are
    discarded. Tokens are returned in order of appearance; repeated IDs are
    kept.

    Parameters
    ----------
    gpr_rule : str
        GPR rule to search for gene IDs.

    Returns
    -------
    genes : list of str
        List of gene IDs found in the GPR rule.
    """
    logical_operators = ('and', 'or')
    genes = []
    for token_match in re.finditer(r'\b[^\s(),]+\b', gpr_rule):
        token = token_match.group(0)
        # Skip boolean operators, whatever their capitalization
        if token.lower() in logical_operators:
            continue
        genes.append(token)
    return genes
def replace_gene_ids_in_gpr(gpr_rule, gene_id_mapping):
    """
    Replaces gene IDs in a GPR rule with new IDs (different nomenclature).

    The rule is scanned once, token by token, where a token is a run of
    characters that contains no whitespace, parentheses or commas. A token is
    replaced only when it is, as a whole, a key of ``gene_id_mapping``.
    A mapped gene wrapped alone in parentheses, e.g. ``(G1)``, is replaced
    together with its parentheses. Tokens without a mapping are left
    untouched.

    Because every token is looked up exactly once against the original rule:

    - replacements never chain, so a new ID that is also a key of the mapping
      is not replaced a second time;
    - an ID is never replaced inside a longer token (key ``5`` does not
      alter ``HGNC:5``);
    - new IDs are inserted literally, so backslashes in them are not
      interpreted as regex escapes or group references.

    Parameters
    ----------
    gpr_rule : str
        GPR rule to update.

    gene_id_mapping : dict
        Dictionary mapping old gene IDs to new gene IDs.

    Returns
    -------
    updated_gpr_rule : str
        GPR rule with gene IDs replaced by new IDs.
    """
    def _translate(match):
        wrapped, bare = match.group(1), match.group(2)
        if wrapped is not None:
            # "(GENE)": drop the parentheses only when the gene is mapped,
            # otherwise keep the text exactly as it was
            if wrapped in gene_id_mapping:
                return gene_id_mapping[wrapped]
            return match.group(0)
        return gene_id_mapping.get(bare, bare)

    # First alternative: a single token wrapped in parentheses.
    # Second alternative: a bare token.
    updated_gpr_rule = re.sub(r'\(([^\s(),]+)\)|([^\s(),]+)', _translate, gpr_rule)
    return updated_gpr_rule
def convert_gpr_nomenclature(gpr_rules, id_mapping):
    """
    Translates the gene IDs of several GPR rules into another nomenclature.

    Each string rule is first cleaned with ``clean_gene_names`` and then
    passed to ``replace_gene_ids_in_gpr``. Any entry that is not a string
    is emitted as ``np.nan``, so the output has one element per input rule.

    Parameters
    ----------
    gpr_rules : list of str
        List of GPR rules to update.

    id_mapping : dict
        Dictionary mapping old gene IDs to new gene IDs.

    Returns
    -------
    converted_rules : list of str
        List of GPR rules with gene IDs replaced by new IDs.
    """
    def _convert_one(rule):
        # Non-string entries cannot be parsed; emit NaN in their place
        if not isinstance(rule, str):
            return np.nan
        return replace_gene_ids_in_gpr(clean_gene_names(rule), id_mapping)

    return [_convert_one(rule) for rule in gpr_rules]