Coverage for src/container_collection/manifest/summarize_manifest_files.py: 100%

25 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2024-09-25 18:23 +0000

1import re 

2 

3import pandas as pd 

4from tabulate import tabulate 

5 

6 

def summarize_manifest_files(
    manifest: pd.DataFrame, name: str, conditions: list[dict], seeds: list[int]
) -> str:
    """
    Summarize manifest files into a table.

    Summary table is formatted as:

    .. code-block:: bash

        ┍━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━┯━━━━━┯━━━━━━━━━━━━━┑
        │                 │ extension.a   │ ... │ extension.n │
        ┝━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━┿━━━━━┿━━━━━━━━━━━━━┥
        │ condition_key_a │ #/# (##.## %) │ ... │ ✓           │
        │ condition_key_b │ #/# (##.## %) │ ... │ ✓           │
        │ ...             │ ...           │ ... │ ...         │
        │ condition_key_n │ #/# (##.## %) │ ... │ ✓           │
        ┕━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━┷━━━━━┷━━━━━━━━━━━━━┙

    For file extensions that have files for each random seed, the summary table
    reports the number and percentage of random seeds. For file extensions with
    only one file per condition, a checkmark (✓) is used to indicate if the
    file exists or not.

    A key is treated as seed-specific when it contains a run of four digits;
    that run is parsed as the seed and removed (together with its leading
    underscore) to recover the condition key.

    Parameters
    ----------
    manifest
        Manifest of file keys, extensions, and locations. Must contain the
        "KEY" and "EXTENSION" columns.
    name
        Name of the simulation series.
    conditions
        List of series condition dictionaries (must include unique condition
        "key").
    seeds
        List of series random seeds.

    Returns
    -------
    :
        Manifest summary table.
    """

    condition_keys = [f"{name}_{condition['key']}" for condition in conditions]

    # Escape the series name so regex metacharacters in it are matched
    # literally when selecting manifest keys that start with the name.
    manifest_keys = (
        manifest.set_index("KEY")
        .filter(regex=f"^{re.escape(name)}", axis="index")
        .reset_index()
    )
    extensions = manifest_keys["EXTENSION"].unique()

    summary = pd.DataFrame(index=condition_keys, columns=extensions)

    # Tally seed files in a plain dict instead of a DataFrame updated through
    # chained indexing (``df.loc[row][col] += 1``), which does not write back
    # to the frame when pandas Copy-on-Write is enabled.
    counts: dict[tuple[str, str], int] = {}

    # Sets give constant-time membership checks inside the loop.
    known_conditions = set(condition_keys)
    known_seeds = set(seeds)
    total_seeds = len(seeds)
    seed_pattern = re.compile(r"[0-9]{4}")

    for entry in manifest_keys.to_dict("records"):
        key = entry["KEY"]
        extension = entry["EXTENSION"]

        match = seed_pattern.search(key)
        if match:
            seed = match.group(0)
            key = key.replace(f"_{seed}", "")
            if key in known_conditions and int(seed) in known_seeds:
                count = counts.get((key, extension), 0) + 1
                counts[(key, extension)] = count
                percent = count / total_seeds * 100
                summary.loc[key, extension] = f"{count}/{total_seeds} ({percent:.2f} %)"
        elif key in known_conditions:
            summary.loc[key, extension] = "✓"

    # Drop extensions that never matched a condition and blank out the
    # remaining empty cells.
    summary = summary.dropna(axis=1, how="all")
    summary = summary.fillna("")

    return tabulate(summary, headers="keys", tablefmt="mixed_outline")