Coverage for src/dataknobs_fsm/execution/stream.py: 78%
141 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:51 -0600
1"""Stream executor for chunk-based processing."""
3import time
4from dataclasses import dataclass, field
5from typing import Any, Callable, Dict, List, Tuple, Union
7from dataknobs_fsm.core.fsm import FSM
8from dataknobs_fsm.core.modes import ProcessingMode, TransactionMode
9from dataknobs_fsm.execution.context import ExecutionContext
10from dataknobs_fsm.execution.engine import ExecutionEngine
11from dataknobs_fsm.streaming.core import (
12 IStreamSink,
13 IStreamSource,
14 StreamChunk,
15 StreamConfig,
16 StreamContext,
17)
@dataclass
class StreamPipeline:
    """Pipeline configuration for stream processing."""
    # Source the executor pulls chunks from (required).
    source: IStreamSource
    # Optional destination; when set, processed chunk results are written out.
    sink: IStreamSink | None = None
    # Per-record transformations applied in order; a transform returning None
    # drops that record from further processing.
    transformations: List[Callable] = field(default_factory=list)
    # Whole-chunk processors applied in order; a processor returning None
    # causes the entire chunk to be skipped.
    chunk_processors: List[Callable] = field(default_factory=list)
@dataclass
class StreamProgress:
    """Progress tracking for stream processing."""
    chunks_processed: int = 0
    records_processed: int = 0
    bytes_processed: int = 0
    errors: List[Tuple[int, Exception]] = field(default_factory=list)
    start_time: float = field(default_factory=time.time)
    last_chunk_time: float = field(default_factory=time.time)

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since ``start_time``."""
        return time.time() - self.start_time

    @property
    def chunks_per_second(self) -> float:
        """Chunk throughput; 0.0 when no time has elapsed yet."""
        seconds = self.elapsed_time
        return self.chunks_processed / seconds if seconds else 0.0

    @property
    def records_per_second(self) -> float:
        """Record throughput; 0.0 when no time has elapsed yet."""
        seconds = self.elapsed_time
        return self.records_processed / seconds if seconds else 0.0
class StreamExecutor:
    """Executor for stream-based processing.

    This executor handles:
    - Chunk-based processing with backpressure
    - Pipeline coordination
    - Memory management
    - Progress reporting
    - Stream transformations
    """

    def __init__(
        self,
        fsm: FSM,
        stream_config: StreamConfig | None = None,
        enable_backpressure: bool = True,
        progress_callback: Union[Callable, None] = None
    ):
        """Initialize stream executor.

        Args:
            fsm: FSM to execute.
            stream_config: Stream configuration.
            enable_backpressure: Enable backpressure handling.
            progress_callback: Callback for progress updates; called with the
                StreamProgress tracker after every processed chunk.
        """
        self.fsm = fsm
        self.stream_config = stream_config or StreamConfig()
        self.enable_backpressure = enable_backpressure
        self.progress_callback = progress_callback

        # Create execution engine
        self.engine = ExecutionEngine(fsm)

        # Memory management.
        # NOTE(review): _memory_usage is never updated anywhere in this class,
        # so the memory-based backpressure check currently can never trigger.
        self._memory_usage = 0
        self._memory_limit = self.stream_config.memory_limit_mb * 1024 * 1024

        # Backpressure management
        self._pending_chunks = 0
        self._backpressure_threshold = self.stream_config.backpressure_threshold

    def execute_stream(
        self,
        pipeline: StreamPipeline,
        context_template: ExecutionContext | None = None,
        max_transitions: int = 1000
    ) -> Dict[str, Any]:
        """Execute stream processing pipeline.

        Args:
            pipeline: Stream pipeline configuration.
            context_template: Template context cloned per record; a default
                SINGLE-mode context is created when None.
            max_transitions: Maximum transitions per record.

        Returns:
            Stream processing statistics (see _generate_stats).
        """
        # Create progress tracker
        progress = StreamProgress()

        # Create base context
        if context_template is None:
            # Use SINGLE mode since we process items individually
            context_template = ExecutionContext(
                data_mode=ProcessingMode.SINGLE,
                transaction_mode=TransactionMode.NONE
            )

        # Attach a stream context so downstream code can see stream config.
        stream_context = StreamContext(config=self.stream_config)
        context_template.stream_context = stream_context

        # Process stream
        try:
            while True:
                # Backpressure: wait before pulling more data.
                # NOTE(review): this is a synchronous busy-wait; since
                # _process_chunk runs inline below, _pending_chunks is always
                # back to 0 by the time we get here — confirm intent.
                if self._should_apply_backpressure():
                    time.sleep(0.1)
                    continue

                # Read next chunk from source; None signals end of stream.
                chunk = pipeline.source.read_chunk()
                if chunk is None:
                    break

                # Apply chunk processors; a processor may drop the chunk
                # entirely by returning None.
                for processor in pipeline.chunk_processors:
                    chunk = processor(chunk)
                    if chunk is None:
                        break

                if chunk is None:
                    continue

                # Run every record of the chunk through the FSM.
                chunk_results = self._process_chunk(
                    chunk,
                    context_template,
                    pipeline.transformations,
                    max_transitions,
                    progress
                )

                # Write results to sink if provided
                if pipeline.sink and chunk_results:
                    result_chunk = StreamChunk(
                        data=chunk_results,
                        sequence_number=chunk.sequence_number,
                        metadata=chunk.metadata,
                        is_last=chunk.is_last
                    )
                    pipeline.sink.write_chunk(result_chunk)

                # Update progress.
                # NOTE(review): bytes_processed is never updated — it always
                # reports 0 in the final stats.
                progress.chunks_processed += 1
                progress.records_processed += len(chunk.data)
                progress.last_chunk_time = time.time()

                # Fire progress callback
                if self.progress_callback:
                    self.progress_callback(progress)

                # Check if last chunk
                if chunk.is_last:
                    break

        finally:
            # Clean up. Fix: the previous check (hasattr 'aclose' OR 'close')
            # also matched async-only sources and then crashed calling the
            # missing synchronous close(); only call close() when it exists.
            if hasattr(pipeline.source, 'close'):
                pipeline.source.close()

            if pipeline.sink:
                pipeline.sink.flush()
                pipeline.sink.close()

        return self._generate_stats(progress)

    def _process_chunk(
        self,
        chunk: StreamChunk,
        context_template: ExecutionContext,
        transformations: List[Callable],
        max_transitions: int,
        progress: StreamProgress
    ) -> List[Any]:
        """Process a single chunk.

        Args:
            chunk: Chunk to process.
            context_template: Template context, cloned for each record.
            transformations: Per-record transformations; returning None drops
                the record.
            max_transitions: Maximum transitions per record.
            progress: Progress tracker; failures are appended to its errors
                list as (global record index, exception) pairs.

        Returns:
            List of processed results. Records whose FSM execution fails are
            passed through unmodified (best-effort semantics).
        """
        results = []
        self._pending_chunks += 1

        try:
            for i, record in enumerate(chunk.data):
                # Apply transformations
                transformed = record
                for transform in transformations:
                    transformed = transform(transformed)
                    if transformed is None:
                        break

                if transformed is None:
                    continue

                # Create an isolated context for this record.
                context = context_template.clone()
                context.data = transformed
                context.set_stream_chunk(chunk)

                # Reset to initial state
                initial_state = self._find_initial_state()
                if initial_state:
                    context.set_state(initial_state)

                    # Execute FSM
                    try:
                        success, result = self.engine.execute(
                            context,
                            transformed,
                            max_transitions
                        )

                        if success:
                            results.append(result)
                        else:
                            # FSM failed, but still pass the data through
                            results.append(transformed)
                            progress.errors.append((
                                progress.records_processed + i,
                                Exception(result)
                            ))
                    except Exception as e:
                        # On error, pass the data through
                        results.append(transformed)
                        progress.errors.append((
                            progress.records_processed + i,
                            e
                        ))
                else:
                    # No FSM configured, just pass data through
                    results.append(transformed)

        finally:
            self._pending_chunks -= 1

        return results

    def _should_apply_backpressure(self) -> bool:
        """Check if backpressure should be applied.

        Returns:
            True if backpressure needed (pending-chunk count or tracked
            memory usage has reached its threshold).
        """
        if not self.enable_backpressure:
            return False

        # Check pending chunks
        if self._pending_chunks >= self._backpressure_threshold:
            return True

        # Check memory usage
        if self._memory_usage >= self._memory_limit:
            return True

        return False

    def _find_initial_state(self) -> str | None:
        """Find initial state in FSM.

        Returns:
            An initial state name from the FSM's main network (the network
            sharing the FSM's name), or None if there is none.
        """
        if self.fsm.name in self.fsm.networks:
            network = self.fsm.networks[self.fsm.name]
            if network.initial_states:
                return next(iter(network.initial_states))
        return None

    def _generate_stats(self, progress: StreamProgress) -> Dict[str, Any]:
        """Generate stream processing statistics.

        Args:
            progress: Progress tracker.

        Returns:
            Processing statistics dict with totals, rates and error details.
        """
        return {
            'total_processed': progress.records_processed,
            'successful': progress.records_processed - len(progress.errors),
            'failed': len(progress.errors),
            'duration': progress.elapsed_time,
            'throughput': progress.records_per_second,
            # Additional details
            'chunks_processed': progress.chunks_processed,
            'bytes_processed': progress.bytes_processed,
            'error_details': progress.errors[:10]  # First 10 errors
        }

    def create_multi_stage_pipeline(
        self,
        stages: List[Dict[str, Any]]
    ) -> StreamPipeline:
        """Create a multi-stage processing pipeline.

        Args:
            stages: List of stage configurations. The first stage's 'source'
                and the last stage's 'sink' become the pipeline endpoints;
                'transform' and 'chunk_processor' stages contribute their
                'function' entries in order.

        Returns:
            Configured pipeline.

        Raises:
            ValueError: If stages is empty (previously a bare IndexError).
        """
        if not stages:
            raise ValueError("at least one stage is required")

        # Build pipeline from stages
        transformations = []
        chunk_processors = []

        for stage in stages:
            stage_type = stage.get('type')

            if stage_type == 'transform':
                transformations.append(stage['function'])
            elif stage_type == 'chunk_processor':
                chunk_processors.append(stage['function'])

        return StreamPipeline(
            source=stages[0].get('source'),
            sink=stages[-1].get('sink'),
            transformations=transformations,
            chunk_processors=chunk_processors
        )