Source code for container_collection.manifest.summarize_manifest_files

import re

import pandas as pd
from tabulate import tabulate


def summarize_manifest_files(
    manifest: pd.DataFrame, name: str, conditions: list[dict], seeds: list[int]
) -> str:
    """
    Summarize manifest files into a table.

    Summary table is formatted as:

    .. code-block:: bash

        ┍━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┯━━━━━┯━━━━━━━━━━━━━┑
        │                 │ extension.a   │ ... │ extension.n │
        ┝━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━┿━━━━━┿━━━━━━━━━━━━━┥
        │ condition_key_a │ #/# (##.## %) │ ... │ ✓           │
        │ condition_key_b │ #/# (##.## %) │ ... │ ✓           │
        │ ...             │...            │ ... │ ...         │
        │ condition_key_n │ #/# (##.## %) │ ... │ ✓           │
        ┕━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┷━━━━━┷━━━━━━━━━━━━━┙

    For file extensions that have files for each random seed, the summary table
    reports the number and percentage of random seeds. For file extensions with
    only one file per condition, a checkmark (✓) is used to indicate that the
    file exists; the cell is left empty otherwise.

    Only manifest entries whose key starts with the series name are considered.
    A key is treated as seed-specific when it contains a run of four digits;
    removing ``_<those digits>`` from the key must yield one of the condition
    keys (``{name}_{condition["key"]}``) and the digits must be one of the
    given seeds for the entry to be counted. Extensions without any matching
    entry are omitted from the table.

    Parameters
    ----------
    manifest
        Manifest of file keys, extensions, and locations. Must contain the
        columns "KEY" and "EXTENSION".
    name
        Name of the simulation series.
    conditions
        List of series condition dictionaries (must include unique condition
        "key").
    seeds
        List of series random seeds.

    Returns
    -------
    :
        Manifest summary table.
    """

    condition_keys = [f"{name}_{condition['key']}" for condition in conditions]

    # Sets give O(1) membership tests inside the loop; the denominator still
    # uses the length of the seed list as passed in.
    known_keys = set(condition_keys)
    known_seeds = set(seeds)
    total_seeds = len(seeds)

    # Escape the series name so that any regex metacharacters it contains are
    # matched literally when selecting keys that start with the name.
    manifest_keys = (
        manifest.set_index("KEY")
        .filter(regex=f"^{re.escape(name)}", axis="index")
        .reset_index()
    )
    extensions = manifest_keys["EXTENSION"].unique()

    summary = pd.DataFrame(index=condition_keys, columns=extensions)

    # Running tally per (condition key, extension). A plain dict is used rather
    # than chained DataFrame indexing (``df.loc[row][col] += 1``), which can
    # write to a temporary copy and silently lose the increment.
    counts: dict[tuple[str, str], int] = {}
    seed_pattern = re.compile(r"[0-9]{4}")

    for key, extension in zip(manifest_keys["KEY"], manifest_keys["EXTENSION"]):
        match = seed_pattern.search(key)

        if match is None:
            # No seed in the key: one file per condition.
            if key in known_keys:
                summary.loc[key, extension] = "✓"
            continue

        seed = match.group(0)
        condition_key = key.replace(f"_{seed}", "")

        if condition_key not in known_keys or int(seed) not in known_seeds:
            continue

        cell = (condition_key, extension)
        count = counts.get(cell, 0) + 1
        counts[cell] = count
        percent = count / total_seeds * 100
        summary.loc[condition_key, extension] = f"{count}/{total_seeds} ({percent:.2f} %)"

    # Drop extensions with no entries for any condition, then blank out the
    # remaining empty cells.
    summary = summary.dropna(axis=1, how="all")
    summary = summary.fillna("")

    return tabulate(summary, headers="keys", tablefmt="mixed_outline")