# Source code for ggpubpy.datasets

"""
Built-in datasets for ggpubpy examples and testing.

This module provides easy access to commonly used datasets for demonstration
and testing purposes.
"""

import os
from typing import Any, Dict
from importlib import resources

import pandas as pd


def _get_data_path() -> str:
    """Return the location of the package's ``data`` directory as a string.

    The ``data`` resource of the ``ggpubpy`` package is resolved through
    ``importlib.resources``. If anything in that lookup raises, the function
    falls back to a ``data`` directory located next to this module file.

    Returns
    -------
    str
        Path to the data directory.
    """
    try:
        from importlib.resources import files

        data_ref = files("ggpubpy") / "data"
        with resources.as_file(data_ref) as data_dir:
            located = str(data_dir)
    except Exception:
        # Best-effort fallback: assume the data sits beside this module.
        module_dir = os.path.dirname(__file__)
        return os.path.join(module_dir, "data")
    # NOTE(review): the path is handed back after the ``as_file`` context has
    # exited; confirm this is acceptable for non-filesystem installs.
    return located


def load_iris() -> pd.DataFrame:
    """
    Load the famous iris dataset.

    The iris dataset contains measurements of sepal and petal dimensions
    for three species of iris flowers (setosa, versicolor, virginica).

    The file ``iris.csv`` is first looked up as a package resource of
    ``ggpubpy``; if that attempt raises, it is read from the directory
    reported by ``_get_data_path()`` instead.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: sepal_length, sepal_width, petal_length,
        petal_width, species.

    Examples
    --------
    >>> from ggpubpy.datasets import load_iris
    >>> iris = load_iris()
    >>> iris.head()
    """
    try:
        from importlib.resources import files

        csv_ref = files("ggpubpy") / "data" / "iris.csv"
        with resources.as_file(csv_ref) as csv_file:
            frame = pd.read_csv(str(csv_file))
    except Exception:
        # Resource lookup (or the read itself) failed: use the plain path.
        fallback_dir = _get_data_path()
        return pd.read_csv(os.path.join(fallback_dir, "iris.csv"))
    return frame
def load_titanic() -> pd.DataFrame:
    """
    Load the famous Titanic dataset.

    The Titanic dataset contains information about passengers aboard the
    RMS Titanic, including survival status, passenger class, age, gender,
    and other details.

    The file ``titanic.csv`` is first looked up as a package resource of
    ``ggpubpy``; if that attempt raises, it is read from the directory
    reported by ``_get_data_path()`` instead.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: PassengerId, Survived, Pclass, Name, Sex,
        Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked.

    Examples
    --------
    >>> from ggpubpy.datasets import load_titanic
    >>> titanic = load_titanic()
    >>> titanic.head()
    """
    try:
        from importlib.resources import files

        csv_ref = files("ggpubpy") / "data" / "titanic.csv"
        with resources.as_file(csv_ref) as csv_file:
            frame = pd.read_csv(str(csv_file))
    except Exception:
        # Resource lookup (or the read itself) failed: use the plain path.
        fallback_dir = _get_data_path()
        return pd.read_csv(os.path.join(fallback_dir, "titanic.csv"))
    return frame
def get_iris_palette() -> Dict[str, str]:
    """
    Get the default color palette for iris species.

    A new dictionary is built on every call.

    Returns
    -------
    dict
        Dictionary mapping species names to hex colors.

    Examples
    --------
    >>> from ggpubpy.datasets import get_iris_palette
    >>> palette = get_iris_palette()
    >>> print(palette)
    {'setosa': '#00AFBB', 'versicolor': '#E7B800', 'virginica': '#FC4E07'}
    """
    species_colors = dict(
        setosa="#00AFBB",
        versicolor="#E7B800",
        virginica="#FC4E07",
    )
    return species_colors
def get_titanic_palette() -> Dict[str, Dict[str, str]]:
    """
    Get the default color palettes for Titanic dataset categories.

    Returns
    -------
    dict
        Nested dictionary. The outer keys are the column names
        ``Survived``, ``Pclass``, ``Sex`` and ``Embarked``; each maps to a
        dictionary from that column's levels to hex colors. All level keys
        are strings (e.g. ``"0"`` and ``"1"`` for ``Survived``).

    Examples
    --------
    >>> from ggpubpy.datasets import get_titanic_palette
    >>> palette = get_titanic_palette()
    >>> sorted(palette)
    ['Embarked', 'Pclass', 'Sex', 'Survived']
    >>> palette['Sex']
    {'male': '#3498DB', 'female': '#E91E63'}
    """
    return {
        # Red for died, green for survived.
        "Survived": {
            "0": "#E74C3C",
            "1": "#2ECC71",
        },
        # Orange, blue, purple.
        "Pclass": {
            "1": "#F39C12",
            "2": "#3498DB",
            "3": "#9B59B6",
        },
        # Blue for male, pink for female.
        "Sex": {
            "male": "#3498DB",
            "female": "#E91E63",
        },
        # Red, orange, green.
        "Embarked": {
            "C": "#E74C3C",
            "Q": "#F39C12",
            "S": "#2ECC71",
        },
    }
def list_datasets() -> Dict[str, Any]:
    """
    List all available datasets with their metadata.

    Returns
    -------
    dict
        Dictionary keyed by dataset name (``"iris"``, ``"titanic"``). Each
        value is a dictionary with the entries ``"description"`` (str),
        ``"shape"`` (tuple), ``"columns"`` (list of column names) and
        ``"loader"`` (the function that loads the dataset).
    """
    iris_entry = {
        "description": "The famous iris flower dataset with sepal/petal measurements",
        "shape": (150, 5),
        "columns": [
            "sepal_length",
            "sepal_width",
            "petal_length",
            "petal_width",
            "species",
        ],
        "loader": load_iris,
    }

    titanic_entry = {
        "description": "The famous Titanic passenger dataset with survival information",
        "shape": (891, 12),
        "columns": [
            "PassengerId",
            "Survived",
            "Pclass",
            "Name",
            "Sex",
            "Age",
            "SibSp",
            "Parch",
            "Ticket",
            "Fare",
            "Cabin",
            "Embarked",
        ],
        "loader": load_titanic,
    }

    return {"iris": iris_entry, "titanic": titanic_entry}