Source code for sccellfie.datasets.database

import pandas as pd
import os


def load_sccellfie_database(organism='human', task_folder=None, rxn_info_filename=None, task_info_filename=None,
                            task_by_rxn_filename=None, task_by_gene_filename=None, rxn_by_gene_filename=None,
                            thresholds_filename=None):
    """
    Loads files of the metabolic task database from either a local folder,
    individual file paths, or predefined URLs.

    Parameters
    ----------
    organism : str, optional (default: 'human')
        The organism to retrieve data for. Choose 'human' or 'mouse'
        (case-insensitive). It selects the default file names and, when
        no ``task_folder`` is given, the default download URL.

    task_folder : str, optional (default: None)
        The local folder path containing CellFie data files. If provided,
        this takes priority: the default file names for ``organism`` are
        looked up inside this folder and the individual ``*_filename``
        arguments are ignored.

    rxn_info_filename : str, optional (default: None)
        Full path for reaction information JSON file.

    task_info_filename : str, optional (default: None)
        Full path for task information CSV file.

    task_by_rxn_filename : str, optional (default: None)
        Full path for task by reaction CSV file.

    task_by_gene_filename : str, optional (default: None)
        Full path for task by gene CSV file.

    rxn_by_gene_filename : str, optional (default: None)
        Full path for reaction by gene CSV file.

    thresholds_filename : str, optional (default: None)
        Full path for thresholds CSV file.

    Returns
    -------
    data : dict
        A dictionary containing the loaded data frames and information.
        Keys are 'rxn_info', 'task_info', 'task_by_rxn', 'task_by_gene',
        'rxn_by_gene', 'thresholds', and 'organism'. A file that could not
        be loaded is reported on stdout and its entry is None. The
        'organism' entry holds the lower-cased organism name.
        Examples of dataframes can be found at
        https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/homo_sapiens/

    Raises
    ------
    ValueError
        If ``organism`` is not 'human' or 'mouse'.
    """
    # Define default URLs for human and mouse data
    default_urls = {
        'human': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/homo_sapiens/',
        'mouse': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/mus_musculus/'
    }

    # Define default file names
    default_file_names = {
        'human': {
            'rxn_info': 'Rxn-Info-Recon2-2.json',
            'task_info': 'Task-Info.csv',
            'task_by_rxn': 'Task_by_Rxn.csv',
            'task_by_gene': 'Task_by_Gene.csv',
            'rxn_by_gene': 'Rxn_by_Gene.csv',
            'thresholds': 'Thresholds.csv'
        },
        'mouse': {
            'rxn_info': 'Rxn-Info-iMM1415.json',
            'task_info': 'Task-Info.csv',
            'task_by_rxn': 'Task_by_Rxn.csv',
            'task_by_gene': 'Task_by_Gene.csv',
            'rxn_by_gene': 'Rxn_by_Gene.csv',
            'thresholds': 'Thresholds.csv'
        }
    }

    # Column used as index for each table (None keeps the default index)
    index_cols = {
        'rxn_info': None,
        'task_info': None,
        'task_by_rxn': 'Task',
        'task_by_gene': 'Task',
        'rxn_by_gene': 'Reaction',
        'thresholds': 'symbol'
    }

    # Normalize once so the URL lookup and the file-name lookup always agree
    # (previously 'Human' passed the URL check and then failed with a KeyError).
    organism_key = organism.lower() if isinstance(organism, str) else None
    file_names = default_file_names.get(organism_key)

    # Determine the base path and file names
    if task_folder:
        if file_names is None:
            # The default file names depend on the organism even for a local folder.
            raise ValueError("Invalid organism. Choose 'human' or 'mouse'.")
        file_paths = {key: os.path.join(task_folder, name) for key, name in file_names.items()}
    else:
        if file_names is None:
            raise ValueError("Invalid organism. Choose 'human' or 'mouse', or provide a custom folder path.")
        # Default URLs already end with '/'; strip it to avoid a '//' in the final URL.
        base_url = default_urls[organism_key].rstrip('/')
        overrides = {
            'rxn_info': rxn_info_filename,
            'task_info': task_info_filename,
            'task_by_rxn': task_by_rxn_filename,
            'task_by_gene': task_by_gene_filename,
            'rxn_by_gene': rxn_by_gene_filename,
            'thresholds': thresholds_filename
        }
        file_paths = {key: overrides[key] or f"{base_url}/{name}" for key, name in file_names.items()}

    # Function to load a file
    def load_file(file_key, index_col=None):
        full_path = file_paths[file_key]
        try:
            # Accept os.PathLike objects and upper-case extensions as well.
            path_str = os.fspath(full_path)
            extension = path_str.lower()
            if extension.endswith('.json'):
                return pd.read_json(path_str)
            elif extension.endswith('.csv'):
                return pd.read_csv(path_str, index_col=index_col)
            else:
                raise ValueError(f"Unsupported file format: {full_path}")
        except Exception as e:
            # Best effort by design: report the failure and keep loading the other files.
            print(f"Error loading {full_path}: {str(e)}")
            return None

    # Load all files
    data = {key: load_file(key, index_col=index_col) for key, index_col in index_cols.items()}
    data['organism'] = organism_key
    return data