Coverage for src/dataknobs_fsm/execution/stream.py: 78%

141 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:51 -0600

1"""Stream executor for chunk-based processing.""" 

2 

3import time 

4from dataclasses import dataclass, field 

5from typing import Any, Callable, Dict, List, Tuple, Union 

6 

7from dataknobs_fsm.core.fsm import FSM 

8from dataknobs_fsm.core.modes import ProcessingMode, TransactionMode 

9from dataknobs_fsm.execution.context import ExecutionContext 

10from dataknobs_fsm.execution.engine import ExecutionEngine 

11from dataknobs_fsm.streaming.core import ( 

12 IStreamSink, 

13 IStreamSource, 

14 StreamChunk, 

15 StreamConfig, 

16 StreamContext, 

17) 

18 

19 

@dataclass
class StreamPipeline:
    """Configuration describing one stream-processing pipeline.

    A pipeline is consumed by ``StreamExecutor.execute_stream``: chunks are
    read from ``source``, passed through ``chunk_processors``, records are
    run through ``transformations``, and results are optionally written to
    ``sink``.
    """
    # Required origin of the stream; each read yields a StreamChunk or None.
    source: IStreamSource
    # Optional destination for processed chunks.
    sink: IStreamSink | None = None
    # Per-record callables; returning None drops the record.
    transformations: List[Callable] = field(default_factory=list)
    # Whole-chunk callables; returning None drops the chunk.
    chunk_processors: List[Callable] = field(default_factory=list)

27 

28 

@dataclass
class StreamProgress:
    """Mutable counters tracking a stream-processing run."""
    chunks_processed: int = 0
    records_processed: int = 0
    bytes_processed: int = 0
    # (record_index, exception) pairs collected for failed records.
    errors: List[Tuple[int, Exception]] = field(default_factory=list)
    # Wall-clock timestamps; captured when the tracker is created.
    start_time: float = field(default_factory=time.time)
    last_chunk_time: float = field(default_factory=time.time)

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since this tracker was created."""
        return time.time() - self.start_time

    @property
    def chunks_per_second(self) -> float:
        """Average chunk throughput (0.0 before any time has elapsed)."""
        seconds = self.elapsed_time
        return self.chunks_processed / seconds if seconds else 0.0

    @property
    def records_per_second(self) -> float:
        """Average record throughput (0.0 before any time has elapsed)."""
        seconds = self.elapsed_time
        return self.records_processed / seconds if seconds else 0.0

59 

60 

class StreamExecutor:
    """Executor for stream-based processing.

    This executor handles:
    - Chunk-based processing with backpressure
    - Pipeline coordination
    - Memory management
    - Progress reporting
    - Stream transformations
    """

    def __init__(
        self,
        fsm: FSM,
        stream_config: StreamConfig | None = None,
        enable_backpressure: bool = True,
        progress_callback: Union[Callable, None] = None
    ):
        """Initialize stream executor.

        Args:
            fsm: FSM to execute.
            stream_config: Stream configuration (defaults to StreamConfig()).
            enable_backpressure: Enable backpressure handling.
            progress_callback: Callback invoked with a StreamProgress after
                each processed chunk.
        """
        self.fsm = fsm
        self.stream_config = stream_config or StreamConfig()
        self.enable_backpressure = enable_backpressure
        self.progress_callback = progress_callback

        # Create execution engine
        self.engine = ExecutionEngine(fsm)

        # Memory management.
        # NOTE(review): _memory_usage is never updated inside this class, so
        # the memory-limit branch of _should_apply_backpressure() only trips
        # if something external mutates it -- confirm intended usage.
        self._memory_usage = 0
        self._memory_limit = self.stream_config.memory_limit_mb * 1024 * 1024

        # Backpressure management
        self._pending_chunks = 0
        self._backpressure_threshold = self.stream_config.backpressure_threshold

    def execute_stream(
        self,
        pipeline: StreamPipeline,
        context_template: ExecutionContext | None = None,
        max_transitions: int = 1000
    ) -> Dict[str, Any]:
        """Execute stream processing pipeline.

        Args:
            pipeline: Stream pipeline configuration.
            context_template: Template context cloned per record.
            max_transitions: Maximum transitions per record.

        Returns:
            Stream processing statistics (see _generate_stats).
        """
        # Create progress tracker
        progress = StreamProgress()

        # Create base context
        if context_template is None:
            # Use SINGLE mode since we process items individually
            context_template = ExecutionContext(
                data_mode=ProcessingMode.SINGLE,
                transaction_mode=TransactionMode.NONE
            )

        # Create stream context and attach it to the execution context
        stream_context = StreamContext(config=self.stream_config)
        context_template.stream_context = stream_context

        # Process stream
        try:
            while True:
                # Back off while downstream is congested
                if self._should_apply_backpressure():
                    time.sleep(0.1)
                    continue

                # Read next chunk from source; None signals end of stream
                chunk = pipeline.source.read_chunk()
                if chunk is None:
                    break

                # Apply chunk processors; any may drop the chunk by
                # returning None
                for processor in pipeline.chunk_processors:
                    chunk = processor(chunk)
                    if chunk is None:
                        break

                if chunk is None:
                    # Chunk was filtered out entirely
                    continue

                # Process chunk
                chunk_results = self._process_chunk(
                    chunk,
                    context_template,
                    pipeline.transformations,
                    max_transitions,
                    progress
                )

                # Write results to sink if provided
                if pipeline.sink and chunk_results:
                    result_chunk = StreamChunk(
                        data=chunk_results,
                        sequence_number=chunk.sequence_number,
                        metadata=chunk.metadata,
                        is_last=chunk.is_last
                    )
                    pipeline.sink.write_chunk(result_chunk)

                # Update progress
                progress.chunks_processed += 1
                progress.records_processed += len(chunk.data)
                progress.last_chunk_time = time.time()

                # Fire progress callback
                if self.progress_callback:
                    self.progress_callback(progress)

                # Check if last chunk
                if chunk.is_last:
                    break

        finally:
            # BUG FIX: the previous check was
            # `hasattr(source, 'aclose') or hasattr(source, 'close')`
            # followed by an unconditional source.close(), which raised
            # AttributeError for async-only sources exposing just aclose().
            # Only call close() when it actually exists.
            if hasattr(pipeline.source, 'close'):
                pipeline.source.close()

            if pipeline.sink:
                # Ensure close() runs even if flush() raises.
                try:
                    pipeline.sink.flush()
                finally:
                    pipeline.sink.close()

        return self._generate_stats(progress)

    def _process_chunk(
        self,
        chunk: StreamChunk,
        context_template: ExecutionContext,
        transformations: List[Callable],
        max_transitions: int,
        progress: StreamProgress
    ) -> List[Any]:
        """Process a single chunk.

        Each record is transformed, then run through the FSM in a cloned
        context.  Records whose FSM run fails (or raises) are passed through
        unchanged and the failure is recorded in ``progress.errors``.

        Args:
            chunk: Chunk to process.
            context_template: Template context cloned per record.
            transformations: Per-record transformations; returning None
                drops the record.
            max_transitions: Maximum transitions per record.
            progress: Progress tracker for error accounting.

        Returns:
            List of processed results.
        """
        results = []
        self._pending_chunks += 1

        try:
            for i, record in enumerate(chunk.data):
                # Apply transformations
                transformed = record
                for transform in transformations:
                    transformed = transform(transformed)
                    if transformed is None:
                        break

                if transformed is None:
                    # Record was filtered out by a transformation
                    continue

                # Create context for this record
                context = context_template.clone()
                context.data = transformed
                context.set_stream_chunk(chunk)

                # Reset to initial state
                initial_state = self._find_initial_state()
                if initial_state:
                    context.set_state(initial_state)

                    # Execute FSM
                    try:
                        success, result = self.engine.execute(
                            context,
                            transformed,
                            max_transitions
                        )

                        if success:
                            results.append(result)
                        else:
                            # FSM failed, but still pass the data through.
                            # Error index is relative to records processed
                            # before this chunk plus the in-chunk offset.
                            results.append(transformed)
                            progress.errors.append((
                                progress.records_processed + i,
                                Exception(result)
                            ))
                    except Exception as e:
                        # On error, pass the data through
                        results.append(transformed)
                        progress.errors.append((
                            progress.records_processed + i,
                            e
                        ))
                else:
                    # No initial state found, just pass data through
                    results.append(transformed)

        finally:
            self._pending_chunks -= 1

        return results

    def _should_apply_backpressure(self) -> bool:
        """Check if backpressure should be applied.

        Returns:
            True if backpressure needed (too many pending chunks or the
            memory limit is reached), False otherwise or when backpressure
            is disabled.
        """
        if not self.enable_backpressure:
            return False

        # Check pending chunks
        if self._pending_chunks >= self._backpressure_threshold:
            return True

        # Check memory usage
        if self._memory_usage >= self._memory_limit:
            return True

        return False

    def _find_initial_state(self) -> str | None:
        """Find initial state in FSM.

        Returns:
            An initial state name from the FSM's main network (the network
            sharing the FSM's name), or None if absent.
        """
        # Get main network
        if self.fsm.name in self.fsm.networks:
            network = self.fsm.networks[self.fsm.name]
            if network.initial_states:
                return next(iter(network.initial_states))
        return None

    def _generate_stats(self, progress: StreamProgress) -> Dict[str, Any]:
        """Generate stream processing statistics.

        Args:
            progress: Progress tracker.

        Returns:
            Processing statistics.
        """
        return {
            'total_processed': progress.records_processed,
            'successful': progress.records_processed - len(progress.errors),
            'failed': len(progress.errors),
            'duration': progress.elapsed_time,
            'throughput': progress.records_per_second,
            # Additional details
            'chunks_processed': progress.chunks_processed,
            # NOTE(review): bytes_processed is never incremented by this
            # executor, so it is always 0 here -- confirm whether a source
            # or processor is expected to maintain it.
            'bytes_processed': progress.bytes_processed,
            'error_details': progress.errors[:10]  # First 10 errors
        }

    def create_multi_stage_pipeline(
        self,
        stages: List[Dict[str, Any]]
    ) -> StreamPipeline:
        """Create a multi-stage processing pipeline.

        Args:
            stages: List of stage configurations.  The first stage supplies
                the 'source', the last supplies the 'sink'; stages of type
                'transform' / 'chunk_processor' contribute their 'function'.

        Returns:
            Configured pipeline.

        Raises:
            ValueError: If ``stages`` is empty (previously a bare
                IndexError).
        """
        if not stages:
            raise ValueError("create_multi_stage_pipeline requires at least one stage")

        # Build pipeline from stages
        transformations = []
        chunk_processors = []

        for stage in stages:
            stage_type = stage.get('type')

            if stage_type == 'transform':
                transformations.append(stage['function'])
            elif stage_type == 'chunk_processor':
                chunk_processors.append(stage['function'])

        return StreamPipeline(
            source=stages[0].get('source'),
            sink=stages[-1].get('sink'),
            transformations=transformations,
            chunk_processors=chunk_processors
        )