Coverage for arrakis_server/partition.py: 100.0%

64 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-08-12 16:39 -0700

1import bisect 

2import math 

3import random 

4import string 

5from collections import Counter, defaultdict 

6from collections.abc import Iterator 

7from dataclasses import replace 

8from typing import Any 

9 

10import numpy 

11 

12from .channel import Channel 

13 

14 

def generate_partition_id(publisher_id: str, channel: Channel | None = None) -> str:
    """Build a partition ID of the form ``<publisher>[_<subsystem>]_<RANDOM>``.

    The random suffix is six characters drawn from uppercase ASCII letters
    and digits using the OS-backed ``SystemRandom`` generator.  When a
    *channel* is supplied, a subsystem token is extracted from its name
    (the text between the first ``:`` and the following ``-``) and embedded
    in the ID.
    """
    pool = string.ascii_uppercase + string.digits
    rng = random.SystemRandom()
    suffix = "".join(rng.choice(pool) for _ in range(6))
    if not channel:
        return f"{publisher_id}_{suffix}"
    # channel names look like "PREFIX:SUBSYSTEM-REST"; pull out the subsystem
    subsystem = channel.name.split(":")[1].split("-")[0]
    return f"{publisher_id}_{subsystem}_{suffix}"

23 

24 

def grouped(items: list[Any], n: int) -> Iterator[list[Any]]:
    """Yield successive chunks of *items*, each at most *n* elements long.

    The final chunk is shorter when ``len(items)`` is not a multiple of
    *n*.  An empty input yields nothing.
    """
    size = len(items)
    for offset in range(0, size, n):
        chunk = items[offset : offset + n]
        yield chunk

28 

29 

def partition_channels(
    channels: list[Channel],
    publisher: str,
    metadata: dict[str, Channel] | None = None,
    max_channels: int = 100,
    partition_fraction: float = 0.8,
) -> list[Channel]:
    """determine partition IDs for channels

    Channels whose ``partition_id`` is already set are passed through
    unchanged (aside from the publisher override).  Unassigned channels
    are grouped by data type and placed, by lexicographic adjacency, into
    the existing partition of their nearest already-partitioned neighbor;
    when that partition would overflow ``max_channels``, new partitions
    are created and filled to ``partition_fraction * max_channels``.

    Parameters
    ----------
    channels : list[Channel]
        List of channels for which to determine partition IDs
    publisher: str
        A publisher ID to apply to all channels being partitioned.
        This will override any publisher already specified in the
        channel metadata returned.
    metadata: dict[str, Channel]
        An existing channel metadata dictionary, from which existing
        partition information will be taken.  Only entries whose
        publisher matches *publisher* are consulted.
    max_channels: int
        The maximum number per partition.
    partition_fraction: float
        Fraction of max channels to use in initial partition
        allocation.

    Returns the initially provided channel list updated with publisher
    and partition info.

    """
    if metadata is None:
        metadata = {}
    else:
        # trim metadata to only contain the channels with the listed publisher
        metadata = {
            name: meta for name, meta in metadata.items() if meta.publisher == publisher
        }

    # determine channels to partition; only channels without an assigned
    # partition need placement
    channels_to_partition = [
        channel for channel in channels if channel.partition_id is None
    ]
    if not channels_to_partition:
        return channels

    # map channels to dtypes so each data type is partitioned independently
    channels_by_dtype: dict[numpy.dtype | None, list[Channel]] = {}
    for channel in channels_to_partition:
        channels_by_dtype.setdefault(channel.data_type, []).append(channel)

    # handle each data type separately
    updated = {}
    for subblock in channels_by_dtype.values():
        # filter channels that aren't matched to an ID: subpartitions maps
        # already-partitioned channel names (of this dtype) to their IDs
        subblock_group = {channel.name for channel in subblock}
        subpartitions = {
            name: meta.partition_id
            for name, meta in metadata.items()
            if name in subblock_group
        }
        unmatched = [
            channel for channel in subblock if channel.name not in subpartitions
        ]
        # occupancy per partition ID, used to detect overflow below
        part_count = Counter(subpartitions.values())
        ordered = sorted(list(subpartitions.keys()))

        # determine where channel would go in sorted order; channels sharing
        # an insertion point are lexicographic neighbors and are placed together
        insert_pt = defaultdict(list)
        for channel in unmatched:
            idx = bisect.bisect_left(ordered, channel.name)
            insert_pt[idx].append(channel)

        # assign unmatched into existing or new partitions
        max_partition_size = math.floor(partition_fraction * max_channels)
        for idx, adjacent in insert_pt.items():
            # clamp to the last existing name; -1 signals "no partitions yet"
            insert_idx = min(idx, len(ordered) - 1)

            if insert_idx == -1:
                # no initial partitions: mint a fresh ID seeded from the
                # first channel in the group
                partition_id = generate_partition_id(publisher, adjacent[0])
            else:
                # adopt the neighbor's partition ID
                id_ = metadata[ordered[insert_idx]].partition_id
                assert isinstance(id_, str)
                partition_id = id_

            if part_count[partition_id] + len(adjacent) > max_channels:
                # assign to new partition(s), filled only to the headroom
                # fraction so later insertions have room
                for group in grouped(adjacent, max_partition_size):
                    partition_id = generate_partition_id(publisher, group[0])
                    for channel in group:
                        updated[channel.name] = replace(
                            channel,
                            publisher=publisher,
                            partition_id=partition_id,
                        )
                    part_count[partition_id] += len(group)
            else:
                # assign to existing partition
                for channel in adjacent:
                    updated[channel.name] = replace(
                        channel,
                        publisher=publisher,
                        partition_id=partition_id,
                    )
                part_count[partition_id] += len(adjacent)

    # fill in any channels that were not newly partitioned, carrying their
    # partition ID over from the existing metadata
    # NOTE(review): assert-based validation is stripped under ``python -O``;
    # a missing metadata entry here would raise KeyError instead
    for channel in channels:
        if channel.name in updated:
            continue
        assert metadata[channel.name].partition_id
        updated[channel.name] = replace(
            channel,
            publisher=publisher,
            partition_id=metadata[channel.name].partition_id,
        )

    # return same channel list order as passed in
    return [updated[channel.name] for channel in channels]