Coverage for .tox/p313/lib/python3.13/site-packages/scicom/utilities/statistics.py: 89%

109 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-10 16:35 +1200

1"""Prune a network.""" 

2import igraph as ig 

3import numpy as np 

4import pandas as pd 

5 

6 

class PruneNetwork:
    """Create statistics for communication networks by deletion.

    For a given dataset with sender and receiver information,
    create a weighted network with igraph. For a given number
    of iterations, deletion amounts, and deletion types, the
    algorithm then generates network statistics for randomly
    sampled subnetworks.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Initialize pruning.

        The dataframe is kept as the untouched baseline that all
        pruned subsamples are drawn from.
        """
        self.inputDF = dataframe

20 

21 def makeNet(self, dataframe:pd.DataFrame) -> ig.Graph: 

22 """Create network from dataframe. 

23 

24 Assumes the existence of sender, receiver and step 

25 column names. 

26 """ 

27 networkdata = dataframe.groupby(["sender", "receiver"]).agg({"step": lambda x: x.to_list()}).reset_index() 

28 counts = networkdata.step.apply(lambda x : len(x)) 

29 networkdata.insert(3, "weight", counts) 

30 graph = ig.Graph.TupleList( 

31 networkdata.itertuples(index=False), directed=True, edge_attrs=["step", "weight"], 

32 ) 

33 for node in graph.vs: 

34 agent = node["name"] 

35 edgSend = self.inputDF.query("sender == @agent") 

36 maxSend = edgSend.step.max() 

37 edgRec = self.inputDF.query("receiver == @agent") 

38 maxRec = edgRec.step.max() 

39 if maxSend > maxRec or np.isnan(maxRec): 

40 lastLoc = edgSend.query("step == @maxSend")["sender_location"].iloc[0] 

41 elif maxSend < maxRec or maxSend == maxRec or np.isnan(maxSend): 

42 lastLoc = edgRec.query("step == @maxRec")["receiver_location"].iloc[0] 

43 else: 

44 text = f"No location for agent {agent}, got max send {maxSend} and max rec {maxRec}." 

45 raise ValueError(text) 

46 node["location"] = lastLoc 

47 return graph 

48 

49 def setSurvivalProb(self, graph:ig.Graph, *, method:str = "agents", ranked:bool = True) -> pd.DataFrame: 

50 """Generate probabilities for different survival modes.""" 

51 if method == "agents": 

52 tempData = pd.DataFrame( 

53 {"id": graph.vs["name"], "degree": graph.indegree()}, 

54 ) 

55 tempData = tempData.sort_values("degree", ascending=False) if ranked else tempData.sample(frac=1) 

56 elif method == "regions": 

57 tempData = pd.DataFrame( 

58 pd.concat( 

59 [self.inputDF.sender_location, self.inputDF.receiver_location], 

60 ).unique(), columns = ["location"], 

61 ) 

62 locations = pd.DataFrame({"id":graph.vs["name"], "location":graph.vs["location"]}) 

63 locations = locations.groupby("location")["id"].nunique().reset_index(name = "count") 

64 tempData = tempData.merge(locations, how="left").fillna(0) 

65 tempData = tempData.sort_values("count", ascending = False) if ranked else tempData.sample(frac=1) 

66 elif method == "time": 

67 tempData = pd.DataFrame({"step": range(self.inputDF.step.max() + 1)}) 

68 tempData = tempData.sort_values("step", ascending = False) if ranked else tempData.sample(frac=1) 

69 rng = np.random.default_rng() 

70 probabilities = pd.DataFrame( 

71 { 

72 "unif": -np.sort(-rng.uniform(0, 1, len(tempData))), 

73 "log_normal1": -np.sort(-rng.lognormal(0, 1/2, len(tempData))), 

74 "log_normal2": -np.sort(-rng.lognormal(0, 1, len(tempData))), 

75 "log_normal3": -np.sort(-rng.lognormal(0, 2, len(tempData))), 

76 "exp": -np.sort(-rng.exponential(10, len(tempData))), 

77 "beta": -np.sort(-rng.beta(a=4, b=5, size=len(tempData))), 

78 }, 

79 ) 

80 return pd.concat([tempData, probabilities], axis = 1) 

81 

82 def scaleSurvivalProb(self, probabilities:pd.DataFrame, *, method:str = "agents") -> pd.DataFrame: 

83 """Scale survival for methods agents and regions.""" 

84 colsType = ["unif", "beta", "exp", "log_normal1", "log_normal2", "log_normal3"] 

85 if method == "time": 

86 return probabilities 

87 if method == "agents": 

88 cols = ["sender", "receiver"] 

89 cols.extend(colsType) 

90 tempData = self.inputDF[["sender", "receiver"]].drop_duplicates().merge( 

91 probabilities, left_on="sender", right_on="id", 

92 ) 

93 tempData = tempData.merge(probabilities, left_on="receiver", right_on="id") 

94 if method == "regions": 

95 cols = ["sender_location", "receiver_location"] 

96 cols.extend(colsType) 

97 tempData = self.inputDF[["sender_location", "receiver_location"]].drop_duplicates().merge( 

98 probabilities, left_on="sender_location", right_on="location", 

99 ) 

100 tempData = tempData.merge(probabilities, left_on="receiver_location", right_on="location") 

101 for i in colsType: 

102 tempData[i] = tempData[i + "_x"] * tempData[i + "_y"] / np.dot(tempData[i + "_x"], tempData[i + "_y"]) 

103 return tempData[cols] 

104 

105 def basicNetStats(self, graph:ig.Graph) -> pd.DataFrame: 

106 """Generate base statistics of network.""" 

107 #Find the degree centrality 

108 tempData = pd.DataFrame({"Degree":graph.degree()}) 

109 

110 #Find the ranking 

111 tempData["Rank"] = tempData["Degree"].rank(method = "min", ascending = False) 

112 

113 #Adding other types of centrality 

114 tempData["Betweenness"] = graph.betweenness() 

115 tempData["Closeness"] = graph.closeness() 

116 tempData["Eigenvector"] = graph.eigenvector_centrality() 

117 tempData["Page_Rank"] = graph.pagerank() 

118 

119 return tempData 

120 

121 def netStats(self, G:ig.Graph) -> pd.DataFrame: 

122 """Generate network statistics.""" 

123 #Number of components: 

124 no_components = len(G.components()) 

125 #Number of maximal cliques: 

126 # TODO(Malte): Consider if these are necessary. Performance! 

127 # no_cliques = len(G.maximal_cliques()) 

128 #Size of the largest clique: 

129 # size_clique = G.omega() 

130 #Average path length: 

131 avg_path = G.average_path_length() 

132 #Diameter: 

133 diameter = G.diameter() 

134 #Modularity: 

135 modularity = G.modularity(G.components()) 

136 #Transitivity: 

137 transitivity = G.transitivity_undirected() 

138 #Cohesion 

139 cohesion = G.cohesion() 

140 #Degree assortativity: 

141 assortativity = G.assortativity_degree() 

142 #Find the in-degree centrality for each node: 

143 indegrees = G.indegree() 

144 #Average relative degree: 

145 N = len(G.vs) 

146 avg_rel_degree = np.mean([x/N for x in indegrees]) 

147 #Tail estimator (Hill): 

148 try: 

149 hill = ig.statistics.power_law_fit( 

150 indegrees, 

151 xmin=1, 

152 method = "hill", 

153 ).alpha 

154 except: 

155 # TODO: power law estimation fails for small samples 

156 # This is especially the case in the tests. 

157 hill = 1 

158 #Centralization: 

159 max_indegree = max(indegrees) 

160 centralization = float(N*max_indegree - sum(indegrees))/(N-1)**2 

161 

162 return pd.DataFrame([{ 

163 "no_components":no_components, 

164 # "no_cliques":no_cliques, 

165 # "size_clique":size_clique, 

166 "diameter":diameter, 

167 "avg_path":avg_path, 

168 "modularity":modularity, 

169 "transitivity":transitivity, 

170 "cohesion":cohesion, 

171 "assortativity":assortativity, 

172 "avg_degree":avg_rel_degree, 

173 "centralization":centralization, 

174 "hill":hill, 

175 }]) 

176 

177 def deleteFromNetwork( 

178 self, 

179 iterations: int = 10, 

180 delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9), 

181 delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"), 

182 delMethod: tuple = ("agents", "regions", "time"), 

183 rankedVals: tuple = (True, False), 

184 ) -> pd.DataFrame: 

185 """Run the deletion by sampling.""" 

186 results = [] 

187 fullNet = self.makeNet( 

188 self.inputDF, 

189 ) 

190 fullStats = self.netStats(fullNet) 

191 fullStats = fullStats.assign( 

192 delVal=0, delType="NA", delIteration=0, delMethod="NA", rankedVal="NA", 

193 ) 

194 results.append(fullStats) 

195 for idx in range(1, iterations + 1): 

196 for method in delMethod: 

197 for ranked in rankedVals: 

198 probVals = self.setSurvivalProb( 

199 fullNet, method=method, ranked=ranked, 

200 ) 

201 prunVals = self.scaleSurvivalProb( 

202 probVals, method=method, 

203 ) 

204 tempDF = self.inputDF.merge( 

205 prunVals, 

206 ) 

207 for val in list(delAmounts): 

208 for deltype in list(delTypes): 

209 delDF = tempDF.sample( 

210 frac = (1 - val), 

211 weights=deltype, 

212 ) 

213 delNet = self.makeNet(delDF) 

214 delStats = self.netStats(delNet) 

215 delStats = delStats.assign( 

216 delVal=val, delType=deltype, delIteration=idx, delMethod=method, rankedVal=ranked, 

217 ) 

218 results.append(delStats) 

219 return pd.concat(results) 

220 

221 

222 

def prune(
    modelparameters: dict,
    network: tuple,
    columns: list,
    iterations: int = 10,
    delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
    delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
    delMethod: tuple = ("agents", "regions", "time"),
    rankedVals: tuple = (True, False)) -> pd.DataFrame:
    """Generate pruned networks from input.

    Assumes existence of columns "sender", "receiver",
    "sender_location", "receiver_location" and "step".
    """
    # Build the dataframe, run the full pruning sweep, then stamp the
    # model parameters onto every result row.
    pruning = PruneNetwork(pd.DataFrame(network, columns=columns))
    pruned = pruning.deleteFromNetwork(
        iterations=iterations,
        delAmounts=delAmounts,
        delTypes=delTypes,
        delMethod=delMethod,
        rankedVals=rankedVals,
    )
    return pruned.assign(**modelparameters)