Coverage for src/arcade_collection/input/merge_region_samples.py: 100%
31 statements
« prev ^ index » next coverage.py v7.1.0, created at 2024-12-09 19:07 +0000
« prev ^ index » next coverage.py v7.1.0, created at 2024-12-09 19:07 +0000
1from __future__ import annotations
3import numpy as np
4import pandas as pd
def merge_region_samples(
    samples: dict[str, pd.DataFrame], margins: tuple[int, int, int]
) -> pd.DataFrame:
    """
    Combine per-region samples into one dataframe of valid, region-tagged samples.

    The input samples are expected to look like:

    .. code-block:: python

        {
            "DEFAULT": (dataframe with columns = id, x, y, z),
            "<REGION>": (dataframe with columns = id, x, y, z),
            "<REGION>": (dataframe with columns = id, x, y, z),
            ...
        }

    The DEFAULT samples act as the superset of (x, y, z) locations: region
    samples are left-joined onto them, so any sample that appears only in a
    non-DEFAULT region is ignored. Every region is shifted using the offsets
    derived from the DEFAULT samples, so all coordinates share one frame. After
    merging, ids that do not have at least one sample in each region present in
    the merged result are dropped.

    The output samples look like:

    .. code-block:: markdown

        ┍━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┑
        │  id  │    x     │    y     │    z     │  region  │
        ┝━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┥
        │ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ DEFAULT  │
        │ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ <REGION> │
        │ ...  │   ...    │   ...    │   ...    │   ...    │
        │ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ <REGION> │
        ┕━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┙

    A sample present in DEFAULT but in no other region is tagged DEFAULT;
    otherwise it carries the name of the region it was found in. Region samples
    should be mutually exclusive. When only the DEFAULT entry is supplied, no
    ``region`` column is added to the output.

    Parameters
    ----------
    samples
        Map of region names to region samples.
    margins
        Margin in the x, y, and z directions applied to sample locations.

    Returns
    -------
    :
        Dataframe of merged samples with applied margins.
    """

    reference = samples["DEFAULT"]
    merged = transform_sample_coordinates(reference, margins)

    # Shift each non-DEFAULT region with the DEFAULT-derived offsets and tag it.
    tagged = [
        transform_sample_coordinates(region_samples, margins, reference).assign(region=name)
        for name, region_samples in samples.items()
        if name != "DEFAULT"
    ]

    if tagged:
        # Left join keeps every DEFAULT location; unmatched rows fall back to DEFAULT.
        merged = merged.merge(pd.concat(tagged), on=["id", "x", "y", "z"], how="left")
        merged["region"] = merged["region"].fillna("DEFAULT")

    return filter_valid_samples(merged)
def transform_sample_coordinates(
    samples: pd.DataFrame,
    margins: tuple[int, int, int],
    reference: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """
    Transform samples into centered coordinates.

    Each axis is shifted by ``margin - minimum + 1``, where the minimum is taken
    from the reference samples, so the smallest reference coordinate on an axis
    maps to ``margin + 1``. Shifted coordinates are cast to ``int64``.

    The result always has a fresh ``0..n-1`` index. Sample ids are copied by
    position, so the input does not need a default range index (for example, it
    may be a filtered or sliced dataframe).

    Parameters
    ----------
    samples
        Sample cell ids and coordinates.
    margins
        Margin size in x, y, and z directions.
    reference
        Reference samples used to calculate transformation. If not given, the
        samples themselves are used as the reference.

    Returns
    -------
    :
        Transformed sample cell ids and coordinates.

    Raises
    ------
    ValueError
        If the reference samples are empty, since no minimum can be computed.
    """

    if reference is None:
        reference = samples

    if reference.empty:
        message = "Reference samples must contain at least one row."
        raise ValueError(message)

    axes = ["x", "y", "z"]

    minimums = reference[axes].min().to_numpy()
    offsets = np.subtract(margins, minimums) + 1

    coordinates = (samples[axes].to_numpy() + offsets).astype("int64")

    transformed_samples = pd.DataFrame(coordinates, columns=axes)

    # Drop the original index before inserting: DataFrame.insert aligns a Series
    # on index labels, which would yield NaN ids for any non-default index.
    transformed_samples.insert(0, "id", samples["id"].reset_index(drop=True))

    return transformed_samples
def filter_valid_samples(samples: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the samples belonging to valid cell ids.

    A cell id is valid when:

    - Its samples span every distinct region value present in the dataframe

    Dataframes without a ``region`` column are passed through unfiltered. In
    both cases the returned dataframe has its index reset.

    Parameters
    ----------
    samples
        Sample cell ids and coordinates.

    Returns
    -------
    :
        Valid sample cell ids and coordinates.
    """

    if "region" not in samples.columns:
        return samples.reset_index(drop=True)

    expected_count = len(samples["region"].unique())

    def covers_all_regions(cell_samples: pd.DataFrame) -> bool:
        # A cell passes only if it has at least one sample in every region.
        return len(cell_samples["region"].unique()) == expected_count

    valid_samples = samples.groupby("id").filter(covers_all_regions)

    return valid_samples.reset_index(drop=True)