Source code for arcade_collection.input.merge_region_samples

from __future__ import annotations

import numpy as np
import pandas as pd


def merge_region_samples(
    samples: dict[str, pd.DataFrame], margins: tuple[int, int, int]
) -> pd.DataFrame:
    """
    Combine per-region samples into one dataframe of valid, region-labeled samples.

    The input samples are formatted as:

    .. code-block:: python

        {
            "DEFAULT": (dataframe with columns = id, x, y, z),
            "<REGION>": (dataframe with columns = id, x, y, z),
            "<REGION>": (dataframe with columns = id, x, y, z),
            ...
        }

    The DEFAULT region acts as the superset of (x, y, z) samples: region
    samples are left-joined onto the DEFAULT samples, so any sample that only
    appears in a non-DEFAULT region is ignored. For a given id, there must be at
    least one sample in each region, otherwise all samples of that id are
    removed.

    The output samples are formatted as:

    .. code-block:: markdown

        ┍━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┑
        │  id  │    x     │    y     │    z     │  region  │
        ┝━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┥
        │ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ DEFAULT  │
        │ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ <REGION> │
        │ ...  │   ...    │   ...    │   ...    │   ...    │
        │ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ <REGION> │
        ┕━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┙

    Samples that are found in the DEFAULT region, but not in any non-DEFAULT
    region, are marked as DEFAULT. Otherwise, the sample is marked with the
    corresponding region. Region samples should be mutually exclusive. If only
    the DEFAULT region is given, no region column is added to the output.

    Parameters
    ----------
    samples
        Map of region names to region samples.
    margins
        Margin in the x, y, and z directions applied to sample locations.

    Returns
    -------
    :
        Dataframe of merged samples with applied margins.
    """

    reference = samples["DEFAULT"]
    merged = transform_sample_coordinates(reference, margins)

    # Every region is shifted using the DEFAULT samples as reference so that all
    # coordinates share the same origin before they are joined.
    labeled = [
        transform_sample_coordinates(region_samples, margins, reference).assign(region=name)
        for name, region_samples in samples.items()
        if name != "DEFAULT"
    ]

    if labeled:
        merged = merged.merge(pd.concat(labeled), on=["id", "x", "y", "z"], how="left")
        # Rows without a matching region sample belong to the DEFAULT region only.
        merged["region"] = merged["region"].fillna("DEFAULT")

    return filter_valid_samples(merged)
def transform_sample_coordinates(
    samples: pd.DataFrame,
    margins: tuple[int, int, int],
    reference: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """
    Transform samples into centered coordinates.

    Each coordinate is shifted so that the minimum coordinate of the reference
    samples maps to ``margin + 1`` along the corresponding axis.

    Parameters
    ----------
    samples
        Sample cell ids and coordinates.
    margins
        Margin size in x, y, and z directions.
    reference
        Reference samples used to calculate transformation. If not given, the
        samples themselves are used as the reference.

    Returns
    -------
    :
        Transformed sample cell ids and coordinates, indexed from zero in the
        same row order as the input samples.

    Raises
    ------
    ValueError
        If the reference samples contain no rows.
    """

    if reference is None:
        reference = samples

    # Per-axis minimum of the reference in a single vectorized reduction.
    minimums = reference[["x", "y", "z"]].to_numpy().min(axis=0)
    offsets = np.subtract(margins, minimums) + 1

    coordinates = (samples[["x", "y", "z"]].to_numpy() + offsets).astype("int64")

    transformed_samples = pd.DataFrame(coordinates, columns=["x", "y", "z"])

    # Insert ids positionally. Passing the Series itself would align on index
    # labels against the new zero-based index, which scrambles or blanks the ids
    # whenever the input samples do not carry a default index.
    transformed_samples.insert(0, "id", samples["id"].to_numpy())

    return transformed_samples
def filter_valid_samples(samples: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the samples that belong to valid cell ids.

    Filter conditions include:

    - Each cell must have at least one sample assigned to each region present
      in the samples (only checked if the samples have a region column)

    Parameters
    ----------
    samples
        Sample cell ids and coordinates.

    Returns
    -------
    :
        Valid sample cell ids and coordinates, with a fresh zero-based index.
    """

    if "region" not in samples.columns:
        return samples.reset_index(drop=True)

    expected_count = len(samples["region"].unique())

    def covers_all_regions(cell: pd.DataFrame) -> bool:
        # A cell is valid only if it has samples in every region seen overall.
        return len(cell["region"].unique()) == expected_count

    valid = samples.groupby("id").filter(covers_all_regions)
    return valid.reset_index(drop=True)