Coverage for .tox/p313/lib/python3.13/site-packages/scicom/utilities/statistics.py: 92%
109 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-10 17:58 +1200
1"""Prune a network."""
2import igraph as ig
3import numpy as np
4import pandas as pd
class PruneNetwork:
    """Create statistics for communication networks by deletion.

    For a given dataset with sender and receiver information,
    create a weighted network with igraph. For a given number
    of iterations, deletion amounts, and deletion types, the
    algorithm then generates network statistics for randomly
    sampled subnetworks.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Initialize pruning.

        Args:
            dataframe: Full communication data. Assumed to contain the
                columns "sender", "receiver", "sender_location",
                "receiver_location" and "step".
        """
        self.inputDF = dataframe

    def makeNet(self, dataframe: pd.DataFrame) -> "ig.Graph":
        """Create a directed, weighted network from a dataframe.

        Assumes the existence of sender, receiver and step
        column names. Edges carry the list of steps on which the pair
        communicated and a "weight" equal to the number of such steps.
        Each node gets a "location" attribute: the location attached to
        its most recent communication event (ties favour the receiving
        side, matching the original branch order).
        """
        networkdata = dataframe.groupby(["sender", "receiver"]).agg({"step": lambda x: x.to_list()}).reset_index()
        counts = networkdata.step.apply(len)
        networkdata.insert(3, "weight", counts)
        graph = ig.Graph.TupleList(
            networkdata.itertuples(index=False), directed=True, edge_attrs=["step", "weight"],
        )
        for node in graph.vs:
            agent = node["name"]
            edgSend = self.inputDF.query("sender == @agent")
            maxSend = edgSend.step.max()
            edgRec = self.inputDF.query("receiver == @agent")
            maxRec = edgRec.step.max()
            # max() over an empty selection yields NaN, so the isnan checks
            # handle agents that only ever sent (or only ever received).
            if maxSend > maxRec or np.isnan(maxRec):
                lastLoc = edgSend.query("step == @maxSend")["sender_location"].iloc[0]
            elif maxSend < maxRec or maxSend == maxRec or np.isnan(maxSend):
                lastLoc = edgRec.query("step == @maxRec")["receiver_location"].iloc[0]
            else:
                # Defensive: unreachable with real float comparisons, kept as
                # a guard against unexpected data.
                text = f"No location for agent {agent}, got max send {maxSend} and max rec {maxRec}."
                raise ValueError(text)
            node["location"] = lastLoc
        return graph

    def setSurvivalProb(self, graph: "ig.Graph", *, method: str = "agents", ranked: bool = True) -> pd.DataFrame:
        """Generate probabilities for different survival modes.

        Args:
            graph: Network built by makeNet. Only consulted for the
                "agents" and "regions" methods.
            method: One of "agents", "regions" or "time" — the unit the
                survival probabilities attach to.
            ranked: If True, units are ordered by importance (degree,
                region size, recency) so the sorted probabilities favour
                important units; otherwise units are shuffled.

        Returns:
            The per-unit dataframe with one probability column per
            distribution ("unif", "log_normal1..3", "exp", "beta"),
            each sorted in descending order.

        Raises:
            ValueError: If method is not one of the three known modes.
        """
        if method == "agents":
            tempData = pd.DataFrame(
                {"id": graph.vs["name"], "degree": graph.indegree()},
            )
            tempData = tempData.sort_values("degree", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "regions":
            tempData = pd.DataFrame(
                pd.concat(
                    [self.inputDF.sender_location, self.inputDF.receiver_location],
                ).unique(), columns=["location"],
            )
            locations = pd.DataFrame({"id": graph.vs["name"], "location": graph.vs["location"]})
            locations = locations.groupby("location")["id"].nunique().reset_index(name="count")
            tempData = tempData.merge(locations, how="left").fillna(0)
            tempData = tempData.sort_values("count", ascending=False) if ranked else tempData.sample(frac=1)
        elif method == "time":
            tempData = pd.DataFrame({"step": range(self.inputDF.step.max() + 1)})
            tempData = tempData.sort_values("step", ascending=False) if ranked else tempData.sample(frac=1)
        else:
            # Previously an unknown method fell through to an accidental
            # NameError; fail explicitly instead.
            text = f"Unknown method {method}, expected 'agents', 'regions' or 'time'."
            raise ValueError(text)
        rng = np.random.default_rng()
        # -np.sort(-x) sorts each sample in descending order, so the
        # first (highest-ranked) unit gets the largest probability.
        probabilities = pd.DataFrame(
            {
                "unif": -np.sort(-rng.uniform(0, 1, len(tempData))),
                "log_normal1": -np.sort(-rng.lognormal(0, 1/2, len(tempData))),
                "log_normal2": -np.sort(-rng.lognormal(0, 1, len(tempData))),
                "log_normal3": -np.sort(-rng.lognormal(0, 2, len(tempData))),
                "exp": -np.sort(-rng.exponential(10, len(tempData))),
                "beta": -np.sort(-rng.beta(a=4, b=5, size=len(tempData))),
            },
        )
        return pd.concat([tempData, probabilities], axis=1)

    def scaleSurvivalProb(self, probabilities: pd.DataFrame, *, method: str = "agents") -> pd.DataFrame:
        """Scale survival for methods agents and regions.

        Joint survival of an edge is the product of its endpoints'
        probabilities, normalized by the dot product so the weights
        sum sensibly for sampling. The "time" method needs no scaling
        and is returned unchanged.

        Raises:
            ValueError: If method is not one of the three known modes.
        """
        colsType = ["unif", "beta", "exp", "log_normal1", "log_normal2", "log_normal3"]
        if method == "time":
            return probabilities
        if method == "agents":
            cols = ["sender", "receiver"]
            cols.extend(colsType)
            tempData = self.inputDF[["sender", "receiver"]].drop_duplicates().merge(
                probabilities, left_on="sender", right_on="id",
            )
            tempData = tempData.merge(probabilities, left_on="receiver", right_on="id")
        elif method == "regions":
            cols = ["sender_location", "receiver_location"]
            cols.extend(colsType)
            tempData = self.inputDF[["sender_location", "receiver_location"]].drop_duplicates().merge(
                probabilities, left_on="sender_location", right_on="location",
            )
            tempData = tempData.merge(probabilities, left_on="receiver_location", right_on="location")
        else:
            # Previously an unknown method fell through to an accidental
            # NameError; fail explicitly instead.
            text = f"Unknown method {method}, expected 'agents', 'regions' or 'time'."
            raise ValueError(text)
        # The double merge created _x/_y suffixed columns for the sender
        # and receiver side; combine them into a single edge probability.
        for i in colsType:
            tempData[i] = tempData[i + "_x"] * tempData[i + "_y"] / np.dot(tempData[i + "_x"], tempData[i + "_y"])
        return tempData[cols]

    def basicNetStats(self, graph: "ig.Graph") -> pd.DataFrame:
        """Generate base statistics of network.

        Returns one row per node with degree, degree rank, and the
        common centrality measures.
        """
        # Find the degree centrality
        tempData = pd.DataFrame({"Degree": graph.degree()})
        # Find the ranking
        tempData["Rank"] = tempData["Degree"].rank(method="min", ascending=False)
        # Adding other types of centrality
        tempData["Betweenness"] = graph.betweenness()
        tempData["Closeness"] = graph.closeness()
        tempData["Eigenvector"] = graph.eigenvector_centrality()
        tempData["Page_Rank"] = graph.pagerank()
        return tempData

    def netStats(self, G: "ig.Graph") -> pd.DataFrame:
        """Generate network statistics.

        Returns a single-row dataframe of whole-network measures.
        """
        # Number of components:
        no_components = len(G.components())
        # Number of maximal cliques:
        # TODO(Malte): Consider if these are necessary. Performance!
        # no_cliques = len(G.maximal_cliques())
        # Size of the largest clique:
        # size_clique = G.omega()
        # Average path length:
        avg_path = G.average_path_length()
        # Diameter:
        diameter = G.diameter()
        # Modularity:
        modularity = G.modularity(G.components())
        # Transitivity:
        transitivity = G.transitivity_undirected()
        # Cohesion
        cohesion = G.cohesion()
        # Degree assortativity:
        assortativity = G.assortativity_degree()
        # Find the in-degree centrality for each node:
        indegrees = G.indegree()
        # Average relative degree:
        N = len(G.vs)
        avg_rel_degree = np.mean([x / N for x in indegrees])
        # Tail estimator (Hill):
        try:
            hill = ig.statistics.power_law_fit(
                indegrees,
                xmin=1,
                method="hill",
            ).alpha
        except Exception:
            # Power-law estimation fails for small samples.
            # This is especially the case in the tests.
            hill = 1
        # Centralization (guard the N == 1 case, which would otherwise
        # divide by zero):
        max_indegree = max(indegrees)
        centralization = 0.0 if N <= 1 else float(N * max_indegree - sum(indegrees)) / (N - 1) ** 2
        return pd.DataFrame([{
            "no_components": no_components,
            # "no_cliques":no_cliques,
            # "size_clique":size_clique,
            "diameter": diameter,
            "avg_path": avg_path,
            "modularity": modularity,
            "transitivity": transitivity,
            "cohesion": cohesion,
            "assortativity": assortativity,
            "avg_degree": avg_rel_degree,
            "centralization": centralization,
            "hill": hill,
        }])

    def deleteFromNetwork(
        self,
        iterations: int = 10,
        delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
        delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
        delMethod: tuple = ("agents", "regions", "time"),
        rankedVals: tuple = (True, False),
    ) -> pd.DataFrame:
        """Run the deletion by sampling.

        First records the statistics of the full network (tagged with
        delIteration=0), then for every iteration x method x ranking x
        amount x distribution combination samples a surviving subset of
        the data, rebuilds the network and records its statistics.

        Returns:
            Concatenated statistics rows for the full network and all
            pruned variants.
        """
        results = []
        fullNet = self.makeNet(
            self.inputDF,
        )
        fullStats = self.netStats(fullNet)
        fullStats = fullStats.assign(
            delVal=0, delType="NA", delIteration=0, delMethod="NA", rankedVal="NA",
        )
        results.append(fullStats)
        for idx in range(1, iterations + 1):
            for method in delMethod:
                for ranked in rankedVals:
                    probVals = self.setSurvivalProb(
                        fullNet, method=method, ranked=ranked,
                    )
                    prunVals = self.scaleSurvivalProb(
                        probVals, method=method,
                    )
                    tempDF = self.inputDF.merge(
                        prunVals,
                    )
                    for val in delAmounts:
                        for deltype in delTypes:
                            # Keep a (1 - val) fraction of events, weighted
                            # by the chosen survival distribution.
                            delDF = tempDF.sample(
                                frac=(1 - val),
                                weights=deltype,
                            )
                            delNet = self.makeNet(delDF)
                            delStats = self.netStats(delNet)
                            delStats = delStats.assign(
                                delVal=val, delType=deltype, delIteration=idx, delMethod=method, rankedVal=ranked,
                            )
                            results.append(delStats)
        return pd.concat(results)
def prune(
    modelparameters: dict,
    network: tuple,
    columns: list,
    iterations: int = 10,
    delAmounts: tuple = (0.1, 0.25, 0.5, 0.75, 0.9),
    delTypes: tuple = ("unif", "log_normal1", "exp", "beta", "log_normal2", "log_normal3"),
    delMethod: tuple = ("agents", "regions", "time"),
    rankedVals: tuple = (True, False)) -> pd.DataFrame:
    """Generate pruned networks from input.

    Assumes existence of columns "sender", "receiver",
    "sender_location", "receiver_location" and "step".

    Builds a PruneNetwork from the raw network tuples, runs the
    deletion sweep, and tags every result row with the supplied
    model parameters.
    """
    pruner = PruneNetwork(pd.DataFrame(network, columns=columns))
    stats = pruner.deleteFromNetwork(
        iterations=iterations,
        delAmounts=delAmounts,
        delTypes=delTypes,
        delMethod=delMethod,
        rankedVals=rankedVals,
    )
    # Attach the model parameters as constant columns on every row.
    return stats.assign(**modelparameters)