Source code for sccellfie.datasets.gene_info
import pandas as pd
[docs]
def retrieve_ensembl2symbol_data(filename=None, organism='human'):
    """
    Retrieves a dictionary mapping Ensembl IDs to gene symbols for a given organism.

    Parameters
    ----------
    filename : str, optional (default: None)
        The file path to a custom CSV file containing Ensembl IDs and gene symbols.
        The file must contain the columns 'ensembl_id' and 'symbol'. When provided,
        it takes priority over ``organism``.

    organism : str, optional (default: 'human')
        The organism to retrieve data for. Choose 'human' or 'mouse'
        (case-insensitive). Only used when ``filename`` is not provided.

    Returns
    -------
    ensembl2symbol : dict
        A dictionary mapping Ensembl IDs to gene symbols. An empty dictionary is
        returned (and a message is printed) when the CSV cannot be read, is empty,
        or lacks the 'symbol' / 'ensembl_id' columns.

    Raises
    ------
    ValueError
        If ``filename`` is not provided and ``organism`` is not 'human' or 'mouse'.
    """
    # Define default URLs for human and mouse data
    default_urls = {
        'human': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/Ensembl2symbol_human.csv',
        'mouse': 'https://github.com/earmingol/scCellFie/raw/refs/heads/main/task_data/Ensembl2symbol_mouse.csv',
    }

    # Prioritize the provided filename if there is one
    if filename:
        path = filename
    else:
        # Use the default URL based on the organism if no filename is provided.
        # A non-string organism (e.g. None) has no .lower(); treat it as an
        # invalid organism instead of failing with an AttributeError.
        organism_key = organism.lower() if isinstance(organism, str) else None
        path = default_urls.get(organism_key)
        if not path:
            raise ValueError("Invalid organism. Choose 'human' or 'mouse', or provide a custom file path.")

    # Reading is best-effort: any failure is reported and yields an empty mapping.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found: {path}")
        return {}
    except pd.errors.EmptyDataError:
        print(f"The file is empty: {path}")
        return {}
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return {}

    # Check if required columns are present. A malformed file is reported the
    # same way as any other read problem (message + empty dict) rather than
    # raising an exception that would be swallowed right away.
    if 'symbol' not in df.columns or 'ensembl_id' not in df.columns:
        print("An error occurred: CSV file must contain 'symbol' and 'ensembl_id' columns.")
        return {}

    # Create and return the dictionary
    ensembl2symbol = dict(zip(df['ensembl_id'], df['symbol']))
    return ensembl2symbol