Coverage for arrakis_server/partition.py: 100.0%
64 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-08-12 16:39 -0700
« prev ^ index » next coverage.py v7.6.12, created at 2025-08-12 16:39 -0700
1import bisect
2import math
3import random
4import string
5from collections import Counter, defaultdict
6from collections.abc import Iterator
7from dataclasses import replace
8from typing import Any
10import numpy
12from .channel import Channel
def generate_partition_id(publisher_id: str, channel: Channel | None = None) -> str:
    """Build a random partition identifier scoped to a publisher.

    The identifier is ``{publisher}_{RAND6}`` or, when a channel is
    given, ``{publisher}_{subsystem}_{RAND6}``, where the subsystem is
    parsed from the channel name (the token between ``:`` and ``-``)
    and RAND6 is six cryptographically random uppercase/digit
    characters drawn via ``random.SystemRandom``.
    """
    pool = string.ascii_uppercase + string.digits
    rng = random.SystemRandom()
    suffix = "".join(rng.choice(pool) for _ in range(6))
    if not channel:
        return f"{publisher_id}_{suffix}"
    # assumes names like "IFO:SUBSYS-REST" — TODO confirm against Channel
    subsystem = channel.name.split(":")[1].split("-")[0]
    return f"{publisher_id}_{subsystem}_{suffix}"
def grouped(items: list[Any], n: int) -> Iterator[list[Any]]:
    """Yield consecutive chunks of *items*, each at most *n* long.

    The final chunk may be shorter than *n*; an empty list yields
    nothing.
    """
    start = 0
    total = len(items)
    while start < total:
        yield items[start : start + n]
        start += n
def partition_channels(
    channels: list[Channel],
    publisher: str,
    metadata: dict[str, Channel] | None = None,
    max_channels: int = 100,
    partition_fraction: float = 0.8,
) -> list[Channel]:
    """determine partition IDs for channels

    Channels that already carry a partition ID are left untouched.
    Unassigned channels are grouped by data type, then slotted into
    the lexicographically nearest existing partition (found via
    bisection over the sorted already-partitioned names) when it has
    room, or into freshly generated partitions otherwise.

    Parameters
    ----------
    channels : list[Channel]
        List of channels for which to determine partition IDs
    publisher: str
        A publisher ID to apply to all channels being partitioned.
        This will override any publisher already specified in the
        channel metadata returned.
    metadata: dict[str, Channel]
        An existing channel metadata dictionary, from which existing
        partition information will be taken.
    max_channels: int
        The maximum number per partition.
    partition_fraction: float
        Fraction of max channels to use in initial partition
        allocation.

    Returns the initially provided channel list updated with publisher
    and partition info.

    """
    if metadata is None:
        metadata = {}
    else:
        # trim metadata to only contain the channels with the listed publisher
        metadata = {
            name: meta for name, meta in metadata.items() if meta.publisher == publisher
        }

    # determine channels to partition (only those with no partition ID yet)
    channels_to_partition = [
        channel for channel in channels if channel.partition_id is None
    ]
    if not channels_to_partition:
        # nothing new to assign; return the input list unchanged
        return channels

    # map channels to dtypes so each data type is partitioned independently
    channels_by_dtype: dict[numpy.dtype | None, list[Channel]] = {}
    for channel in channels_to_partition:
        channels_by_dtype.setdefault(channel.data_type, []).append(channel)

    # filter channels that aren't matched to an ID
    # handle each data type separately
    updated = {}
    for subblock in channels_by_dtype.values():
        # filter channels that aren't matched to an ID
        subblock_group = {channel.name for channel in subblock}
        # name -> partition ID for channels of this dtype already partitioned
        subpartitions = {
            name: meta.partition_id
            for name, meta in metadata.items()
            if name in subblock_group
        }
        unmatched = [
            channel for channel in subblock if channel.name not in subpartitions
        ]
        # current occupancy of each existing partition
        part_count = Counter(subpartitions.values())
        ordered = sorted(list(subpartitions.keys()))

        # determine where channel would go in sorted order; channels that
        # land at the same insertion index are handled as one batch
        insert_pt = defaultdict(list)
        for channel in unmatched:
            idx = bisect.bisect_left(ordered, channel.name)
            insert_pt[idx].append(channel)

        # assign unmatched into existing or new partitions; new partitions
        # are only filled to a fraction of max_channels to leave headroom
        max_partition_size = math.floor(partition_fraction * max_channels)
        for idx, adjacent in insert_pt.items():
            # clamp: an insertion point past the end maps to the last
            # existing partition; -1 means there are no partitions at all
            insert_idx = min(idx, len(ordered) - 1)

            if insert_idx == -1:
                # no initial partitions
                partition_id = generate_partition_id(publisher, adjacent[0])
            else:
                # adopt the partition of the lexicographically nearest
                # already-partitioned channel
                id_ = metadata[ordered[insert_idx]].partition_id
                assert isinstance(id_, str)
                partition_id = id_

            if part_count[partition_id] + len(adjacent) > max_channels:
                # target partition would overflow: assign to new partition(s),
                # chunked to the initial-allocation size
                for group in grouped(adjacent, max_partition_size):
                    partition_id = generate_partition_id(publisher, group[0])
                    for channel in group:
                        updated[channel.name] = replace(
                            channel,
                            publisher=publisher,
                            partition_id=partition_id,
                        )
                    part_count[partition_id] += len(group)
            else:
                # assign to existing partition
                for channel in adjacent:
                    updated[channel.name] = replace(
                        channel,
                        publisher=publisher,
                        partition_id=partition_id,
                    )
                part_count[partition_id] += len(adjacent)

    # fill in any channels that were not newly partitioned; these must
    # already have partition info in the (publisher-trimmed) metadata
    for channel in channels:
        if channel.name in updated:
            continue
        assert metadata[channel.name].partition_id
        updated[channel.name] = replace(
            channel,
            publisher=publisher,
            partition_id=metadata[channel.name].partition_id,
        )

    # return same channel list order as passed in
    return [updated[channel.name] for channel in channels]