Coverage for src/dataknobs_fsm/execution/async_stream.py: 22%
116 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:46 -0600
1"""Asynchronous stream executor for real-time processing."""
3import asyncio
4import time
5from dataclasses import dataclass, field
6from typing import Any, AsyncIterator, Callable, List, Tuple, Union
8from dataknobs_fsm.core.fsm import FSM
9from dataknobs_fsm.core.modes import ProcessingMode, TransactionMode
10from dataknobs_fsm.execution.context import ExecutionContext
11from dataknobs_fsm.execution.engine import ExecutionEngine
12from dataknobs_fsm.execution.stream import StreamProgress
13from dataknobs_fsm.streaming.core import StreamConfig
@dataclass
class AsyncStreamResult:
    """Summary of an asynchronous stream run.

    Captures the item counts, wall-clock timing, and derived throughput
    for one call to ``AsyncStreamExecutor.execute_stream``.
    """

    total_processed: int          # total number of items consumed from the stream
    successful: int               # items that completed without error
    failed: int                   # items that raised or returned a failure
    duration: float               # elapsed wall-clock time in seconds
    throughput: float             # items per second (0 when duration is 0)
    # Sample of recorded errors (the producer truncates to the first few).
    error_details: List[Any] = field(default_factory=list)
class AsyncStreamExecutor:
    """Asynchronous executor for stream processing.

    This executor handles:
    - True async stream processing
    - Async iterators and generators
    - Backpressure management
    - Real-time progress reporting
    - Memory-efficient chunk processing
    """

    def __init__(
        self,
        fsm: FSM,
        stream_config: StreamConfig | None = None,
        enable_backpressure: bool = True,
        progress_callback: Union[Callable, None] = None
    ):
        """Initialize async stream executor.

        Args:
            fsm: FSM to execute.
            stream_config: Stream configuration; a default ``StreamConfig``
                is used when omitted.
            enable_backpressure: Enable backpressure handling.
            progress_callback: Sync or async callable invoked with the
                current ``StreamProgress`` after each processed chunk.
        """
        self.fsm = fsm
        self.stream_config = stream_config or StreamConfig()
        self.enable_backpressure = enable_backpressure
        self.progress_callback = progress_callback

        # Create execution engine
        self.engine = ExecutionEngine(fsm)

        # Backpressure management
        self._pending_chunks = 0
        self._backpressure_threshold = self.stream_config.backpressure_threshold
        # Caps how many items run concurrently inside a chunk (see _process_item).
        self._semaphore = asyncio.Semaphore(self.stream_config.parallelism)

    async def execute_stream(
        self,
        source: Union[AsyncIterator[Any], List[Any]],
        sink: Union[Callable, None] = None,
        chunk_size: int = 100,
        max_transitions: int = 1000
    ) -> AsyncStreamResult:
        """Execute stream processing asynchronously.

        Args:
            source: Async iterator, sync iterable, or list of items. Items
                that are themselves lists are treated as pre-built chunks.
            sink: Optional sink callable (sync or async) receiving the list
                of successful results for each chunk.
            chunk_size: Size of processing chunks for individual items.
            max_transitions: Maximum transitions per item.

        Returns:
            Stream processing result with counts, timing, and throughput.

        Raises:
            ValueError: If ``source`` is neither an iterator nor an async
                iterator.
        """
        progress = StreamProgress()
        start_time = time.time()

        # Create base context
        # Use SINGLE mode since we process items individually
        context_template = ExecutionContext(
            data_mode=ProcessingMode.SINGLE,
            transaction_mode=TransactionMode.NONE
        )

        stream = None
        # Process stream
        try:
            # Convert source to async iterator if needed
            if hasattr(source, '__aiter__'):
                stream = source
            elif hasattr(source, '__iter__'):
                # Convert sync iterator to async
                stream = self._sync_to_async_iter(source)
            else:
                raise ValueError("Source must be an iterator or async iterator")

            # Process in chunks
            chunk = []
            chunk_num = 0

            async for item in stream:
                # Handle both individual items and pre-chunked lists
                if isinstance(item, list):
                    # Already chunked (e.g., from streaming file reader)
                    await self._process_chunk(
                        item,
                        chunk_num,
                        context_template,
                        max_transitions,
                        progress,
                        sink
                    )
                    chunk_num += 1
                else:
                    # Individual item - accumulate into chunks
                    chunk.append(item)

                    if len(chunk) >= chunk_size:
                        # Process chunk
                        await self._process_chunk(
                            chunk,
                            chunk_num,
                            context_template,
                            max_transitions,
                            progress,
                            sink
                        )
                        chunk = []
                        chunk_num += 1

                        # Apply backpressure if needed.
                        # NOTE(review): _process_chunk is awaited serially
                        # above, so _pending_chunks is normally back to 0 by
                        # this check; the throttle only engages if chunk
                        # processing is made to overlap - confirm intent.
                        if (
                            self.enable_backpressure
                            and self._pending_chunks >= self._backpressure_threshold
                        ):
                            await asyncio.sleep(0.1)

            # Process remaining items
            if chunk:
                await self._process_chunk(
                    chunk,
                    chunk_num,
                    context_template,
                    max_transitions,
                    progress,
                    sink
                )

        finally:
            # Clean up: close the original source and, when a sync iterable
            # was wrapped, the wrapping async generator as well (otherwise
            # the generator is left suspended and never finalized).
            if hasattr(source, 'aclose'):
                await source.aclose()
            if stream is not None and stream is not source and hasattr(stream, 'aclose'):
                await stream.aclose()

        # Calculate final statistics
        duration = time.time() - start_time
        return AsyncStreamResult(
            total_processed=progress.records_processed,
            successful=progress.records_processed - len(progress.errors),
            failed=len(progress.errors),
            duration=duration,
            throughput=progress.records_processed / duration if duration > 0 else 0,
            error_details=progress.errors[:10]  # First 10 errors
        )

    async def _process_chunk(
        self,
        items: List[Any],
        chunk_num: int,
        context_template: ExecutionContext,
        max_transitions: int,
        progress: StreamProgress,
        sink: Union[Callable, None]
    ):
        """Process a chunk of items concurrently and record outcomes.

        Args:
            items: Items to process.
            chunk_num: Chunk number (informational).
            context_template: Template context cloned per item.
            max_transitions: Maximum transitions per item.
            progress: Progress tracker, updated in place.
            sink: Optional sink function for successful results.
        """
        self._pending_chunks += 1

        try:
            # Create tasks for parallel processing; item index is offset by
            # records already processed so error indices are stream-global.
            tasks = [
                asyncio.create_task(
                    self._process_item(
                        item,
                        progress.records_processed + i,
                        context_template,
                        max_transitions
                    )
                )
                for i, item in enumerate(items)
            ]

            # Wait for all tasks; exceptions are returned, not raised.
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results
            successful_results = []
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    progress.errors.append((progress.records_processed + i, result))
                else:
                    # Result is a tuple[bool, Any] at this point
                    success, value = result  # type: ignore
                    if success:
                        successful_results.append(value)
                    else:
                        progress.errors.append(
                            (progress.records_processed + i, Exception(value))
                        )

            # Send to sink if provided
            if sink and successful_results:
                if asyncio.iscoroutinefunction(sink):
                    await sink(successful_results)
                else:
                    # Run sync sink in executor so it cannot block the loop.
                    loop = asyncio.get_running_loop()
                    await loop.run_in_executor(None, sink, successful_results)

            # Update progress
            progress.chunks_processed += 1
            progress.records_processed += len(items)
            progress.last_chunk_time = time.time()

            # Fire progress callback
            if self.progress_callback:
                await self._fire_progress_callback(progress)

        finally:
            self._pending_chunks -= 1

    async def _process_item(
        self,
        item: Any,
        index: int,
        context_template: ExecutionContext,
        max_transitions: int
    ) -> Tuple[bool, Any]:
        """Process a single item through the FSM.

        Args:
            item: Item to process.
            index: Stream-global item index (informational).
            context_template: Template context; cloned so items do not
                share mutable state.
            max_transitions: Maximum transitions.

        Returns:
            Tuple of (success, result) as produced by the engine.
        """
        async with self._semaphore:  # Control parallelism
            # Create context
            context = context_template.clone()
            context.data = item

            # Reset to initial state
            initial_state = self._find_initial_state()
            if initial_state:
                context.set_state(initial_state)

            # Run the (synchronous) engine in the default thread pool so it
            # does not block the event loop.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None,
                self.engine.execute,
                context,
                item,
                max_transitions
            )

    async def _sync_to_async_iter(self, sync_iter):
        """Convert sync iterator to async iterator.

        Args:
            sync_iter: Synchronous iterator.

        Yields:
            Items from the iterator.
        """
        for item in sync_iter:
            yield item
            await asyncio.sleep(0)  # Allow other tasks to run

    def _find_initial_state(self) -> str | None:
        """Find an initial state in the FSM.

        Checks the FSM's ``main_network`` first (by name or object), then
        falls back to the first network with any initial states.

        Returns:
            Initial state name or None.
        """
        # Get main network
        main_network = getattr(self.fsm, 'main_network', None)
        if isinstance(main_network, str):
            if main_network in self.fsm.networks:
                network = self.fsm.networks[main_network]
                if hasattr(network, 'initial_states') and network.initial_states:
                    return next(iter(network.initial_states))
        elif main_network and hasattr(main_network, 'initial_states'):
            if main_network.initial_states:
                return next(iter(main_network.initial_states))

        # Fallback: check all networks
        for network in self.fsm.networks.values():
            if hasattr(network, 'initial_states') and network.initial_states:
                return next(iter(network.initial_states))

        return None

    async def _fire_progress_callback(self, progress: StreamProgress):
        """Fire the configured progress callback (sync or async).

        Args:
            progress: Progress information.
        """
        if asyncio.iscoroutinefunction(self.progress_callback):
            await self.progress_callback(progress)
        else:
            # Run sync callback in executor so it cannot block the loop.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self.progress_callback, progress)  # type: ignore