Coverage for src/dataknobs_fsm/execution/async_stream.py: 22%

116 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:46 -0600

1"""Asynchronous stream executor for real-time processing.""" 

2 

3import asyncio 

4import time 

5from dataclasses import dataclass, field 

6from typing import Any, AsyncIterator, Callable, List, Tuple, Union 

7 

8from dataknobs_fsm.core.fsm import FSM 

9from dataknobs_fsm.core.modes import ProcessingMode, TransactionMode 

10from dataknobs_fsm.execution.context import ExecutionContext 

11from dataknobs_fsm.execution.engine import ExecutionEngine 

12from dataknobs_fsm.execution.stream import StreamProgress 

13from dataknobs_fsm.streaming.core import StreamConfig 

14 

15 

@dataclass
class AsyncStreamResult:
    """Summary statistics from one async stream run.

    Produced by ``AsyncStreamExecutor.execute_stream``: record counts,
    wall-clock duration, derived throughput, and a sample of the errors
    that occurred while processing.
    """

    total_processed: int  # total records pulled from the source
    successful: int       # records that completed without error
    failed: int           # records that raised or reported failure
    duration: float       # wall-clock seconds for the whole run
    throughput: float     # records per second (0 when duration is 0)
    # Sampled error details (the executor keeps only the first few).
    error_details: List[Any] = field(default_factory=list)

25 

26 

class AsyncStreamExecutor:
    """Asynchronous executor for stream processing.

    This executor handles:
    - True async stream processing
    - Async iterators and generators
    - Backpressure management
    - Real-time progress reporting
    - Memory-efficient chunk processing
    """

    def __init__(
        self,
        fsm: FSM,
        stream_config: StreamConfig | None = None,
        enable_backpressure: bool = True,
        progress_callback: Union[Callable, None] = None
    ):
        """Initialize async stream executor.

        Args:
            fsm: FSM to execute.
            stream_config: Stream configuration.
            enable_backpressure: Enable backpressure handling.
            progress_callback: Callback for progress updates (sync or async).
        """
        self.fsm = fsm
        self.stream_config = stream_config or StreamConfig()
        self.enable_backpressure = enable_backpressure
        self.progress_callback = progress_callback

        # Engine that actually drives the FSM for each item.
        self.engine = ExecutionEngine(fsm)

        # Backpressure bookkeeping: number of chunks currently in flight
        # and the threshold above which the producer loop is throttled.
        self._pending_chunks = 0
        self._backpressure_threshold = self.stream_config.backpressure_threshold
        # Bounds how many items run concurrently in _process_item.
        self._semaphore = asyncio.Semaphore(self.stream_config.parallelism)

    async def execute_stream(
        self,
        source: Union[AsyncIterator[Any], List[Any]],
        sink: Union[Callable, None] = None,
        chunk_size: int = 100,
        max_transitions: int = 1000
    ) -> AsyncStreamResult:
        """Execute stream processing asynchronously.

        Args:
            source: Async iterator or (sync) iterable of items. Items that
                are themselves lists are treated as pre-built chunks.
            sink: Optional sink function (sync or async) called with each
                chunk's successful results.
            chunk_size: Number of individual items accumulated per chunk.
            max_transitions: Maximum transitions per item.

        Returns:
            Stream processing result.

        Raises:
            ValueError: If ``source`` is neither iterable nor async-iterable.
        """
        progress = StreamProgress()
        start_time = time.time()

        # Template context cloned per item. Use SINGLE mode since items are
        # processed individually; no transaction wrapping.
        context_template = ExecutionContext(
            data_mode=ProcessingMode.SINGLE,
            transaction_mode=TransactionMode.NONE
        )

        stream = None
        try:
            # Normalize the source into an async iterator.
            if hasattr(source, '__aiter__'):
                stream = source
            elif hasattr(source, '__iter__'):
                # Convert sync iterator to async.
                stream = self._sync_to_async_iter(source)
            else:
                raise ValueError("Source must be an iterator or async iterator")

            chunk: List[Any] = []
            chunk_num = 0

            async for item in stream:
                # Handle both individual items and pre-chunked lists.
                if isinstance(item, list):
                    # Already chunked (e.g. from a streaming file reader):
                    # process as-is without re-accumulating.
                    await self._process_chunk(
                        item, chunk_num, context_template,
                        max_transitions, progress, sink
                    )
                    chunk_num += 1
                else:
                    # Individual item - accumulate until a chunk is full.
                    chunk.append(item)
                    if len(chunk) >= chunk_size:
                        await self._process_chunk(
                            chunk, chunk_num, context_template,
                            max_transitions, progress, sink
                        )
                        chunk = []
                        chunk_num += 1

                # Throttle the producer while too many chunks are in flight.
                # Fixed: loop until back under the threshold instead of
                # sleeping a single fixed interval regardless of state.
                while (self.enable_backpressure
                       and self._pending_chunks >= self._backpressure_threshold):
                    await asyncio.sleep(0.1)

            # Flush any trailing partial chunk.
            if chunk:
                await self._process_chunk(
                    chunk, chunk_num, context_template,
                    max_transitions, progress, sink
                )

        finally:
            # Close the caller's async source if it supports it; also close
            # the wrapper generator we may have created from a sync iterable
            # (fixed: previously the wrapper was never closed).
            if hasattr(source, 'aclose'):
                await source.aclose()
            elif (stream is not None and stream is not source
                  and hasattr(stream, 'aclose')):
                await stream.aclose()

        # Calculate final statistics.
        duration = time.time() - start_time
        return AsyncStreamResult(
            total_processed=progress.records_processed,
            successful=progress.records_processed - len(progress.errors),
            failed=len(progress.errors),
            duration=duration,
            throughput=progress.records_processed / duration if duration > 0 else 0,
            error_details=progress.errors[:10]  # First 10 errors
        )

    async def _process_chunk(
        self,
        items: List[Any],
        chunk_num: int,
        context_template: ExecutionContext,
        max_transitions: int,
        progress: StreamProgress,
        sink: Union[Callable, None]
    ):
        """Process a chunk of items concurrently.

        Args:
            items: Items to process.
            chunk_num: Chunk number.
            context_template: Template context.
            max_transitions: Maximum transitions.
            progress: Progress tracker.
            sink: Optional sink function.
        """
        self._pending_chunks += 1

        try:
            # Fan out one task per item; the semaphore inside _process_item
            # bounds the actual concurrency.
            base_index = progress.records_processed
            tasks = [
                asyncio.create_task(
                    self._process_item(
                        item, base_index + i, context_template, max_transitions
                    )
                )
                for i, item in enumerate(items)
            ]

            # return_exceptions=True so one failing item cannot cancel the rest.
            results = await asyncio.gather(*tasks, return_exceptions=True)

            successful_results = []
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    progress.errors.append((base_index + i, result))
                else:
                    # _process_item returns a (success, value) tuple.
                    success, value = result  # type: ignore
                    if success:
                        successful_results.append(value)
                    else:
                        progress.errors.append((base_index + i, Exception(value)))

            # Deliver successes to the sink: async sinks awaited directly,
            # sync sinks run in the default thread pool.
            if sink and successful_results:
                if asyncio.iscoroutinefunction(sink):
                    await sink(successful_results)
                else:
                    # Fixed: get_event_loop() is deprecated inside coroutines.
                    loop = asyncio.get_running_loop()
                    await loop.run_in_executor(None, sink, successful_results)

            # Update progress.
            progress.chunks_processed += 1
            progress.records_processed += len(items)
            progress.last_chunk_time = time.time()

            # Fire progress callback.
            if self.progress_callback:
                await self._fire_progress_callback(progress)

        finally:
            self._pending_chunks -= 1

    async def _process_item(
        self,
        item: Any,
        index: int,
        context_template: ExecutionContext,
        max_transitions: int
    ) -> Tuple[bool, Any]:
        """Process a single item through the FSM.

        Args:
            item: Item to process.
            index: Item index (kept for interface compatibility).
            context_template: Template context.
            max_transitions: Maximum transitions.

        Returns:
            Tuple of (success, result) from the execution engine.
        """
        async with self._semaphore:  # Control parallelism
            # Fresh context per item so items never share mutable state.
            context = context_template.clone()
            context.data = item

            # Every item starts from the FSM's initial state.
            initial_state = self._find_initial_state()
            if initial_state:
                context.set_state(initial_state)

            # The engine is synchronous; run it in the default thread pool
            # so the event loop stays responsive.
            # Fixed: get_event_loop() is deprecated inside coroutines.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None,
                self.engine.execute,
                context,
                item,
                max_transitions
            )

    async def _sync_to_async_iter(self, sync_iter):
        """Convert sync iterator to async iterator.

        Args:
            sync_iter: Synchronous iterator.

        Yields:
            Items from the iterator.
        """
        for item in sync_iter:
            yield item
            await asyncio.sleep(0)  # Allow other tasks to run

    def _find_initial_state(self) -> str | None:
        """Find initial state in FSM.

        Handles ``main_network`` being either a network name (string) or a
        network object; falls back to scanning every network.

        Returns:
            Initial state name or None.
        """
        main_network = getattr(self.fsm, 'main_network', None)
        if isinstance(main_network, str):
            if main_network in self.fsm.networks:
                network = self.fsm.networks[main_network]
                if hasattr(network, 'initial_states') and network.initial_states:
                    return next(iter(network.initial_states))
        elif main_network and hasattr(main_network, 'initial_states'):
            if main_network.initial_states:
                return next(iter(main_network.initial_states))

        # Fallback: first network that declares any initial state.
        for network in self.fsm.networks.values():
            if hasattr(network, 'initial_states') and network.initial_states:
                return next(iter(network.initial_states))

        return None

    async def _fire_progress_callback(self, progress: StreamProgress):
        """Fire progress callback (async directly, sync via thread pool).

        Args:
            progress: Progress information.
        """
        if asyncio.iscoroutinefunction(self.progress_callback):
            await self.progress_callback(progress)
        else:
            # Fixed: get_event_loop() is deprecated inside coroutines.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self.progress_callback, progress)  # type: ignore