Coverage for src/dataknobs_fsm/streaming/core.py: 27%
366 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:51 -0600
1"""Core streaming interfaces and implementations for FSM data processing."""
3import asyncio
4import queue
5import threading
6import time
7from contextlib import contextmanager
8from dataclasses import dataclass, field
9from enum import Enum
10from typing import Any, AsyncIterator, Callable, Dict, Iterator, List, Protocol, Union, runtime_checkable
11from uuid import uuid4
class StreamStatus(Enum):
    """Lifecycle status of a stream processing context."""
    IDLE = "idle"            # created but processing has not started
    ACTIVE = "active"        # actively reading/processing/writing chunks
    PAUSED = "paused"        # temporarily halted (set when backpressure engages)
    COMPLETED = "completed"  # processing finished
    ERROR = "error"          # processing terminated after an unhandled error
@dataclass
class StreamConfig:
    """Configuration for stream processing.

    Attributes:
        chunk_size: Number of items per chunk.
        buffer_size: Maximum items to buffer in memory (bounds the
            internal input/output queues).
        parallelism: Number of parallel workers for processing.
        memory_limit_mb: Maximum memory usage in MB.
        backpressure_threshold: Queue size that triggers backpressure.
        timeout_seconds: Maximum time for stream processing; None means
            wait indefinitely.
        enable_metrics: Whether to collect metrics.
        retry_on_error: Whether to retry failed chunks.
        max_retries: Maximum retry attempts for failed chunks.
    """
    chunk_size: int = 1000
    buffer_size: int = 10000
    parallelism: int = 1
    memory_limit_mb: int = 512
    backpressure_threshold: int = 5000
    timeout_seconds: float | None = None
    enable_metrics: bool = True
    retry_on_error: bool = True
    max_retries: int = 3
@dataclass
class StreamChunk:
    """A chunk of data in a stream.

    Attributes:
        data: The chunk data.
        chunk_id: Unique chunk identifier (defaults to a random UUID4 string).
        sequence_number: Position in the stream.
        metadata: Additional chunk metadata.
        timestamp: Creation timestamp (seconds since the epoch, time.time()).
        is_last: Whether this is the last chunk.
    """
    data: Any
    chunk_id: str = field(default_factory=lambda: str(uuid4()))
    sequence_number: int = 0
    # default_factory gives each chunk its own dict (no shared mutable default)
    metadata: Dict[str, Any] = field(default_factory=dict)
    timestamp: float = field(default_factory=time.time)
    is_last: bool = False
69@dataclass
70class StreamMetrics:
71 """Metrics for stream processing.
73 Attributes:
74 chunks_processed: Number of chunks processed.
75 bytes_processed: Total bytes processed.
76 items_processed: Total items processed.
77 errors_count: Number of errors encountered.
78 retries_count: Number of retries performed.
79 start_time: Processing start timestamp.
80 end_time: Processing end timestamp.
81 peak_memory_mb: Peak memory usage in MB.
82 """
83 chunks_processed: int = 0
84 bytes_processed: int = 0
85 items_processed: int = 0
86 errors_count: int = 0
87 retries_count: int = 0
88 start_time: float | None = None
89 end_time: float | None = None
90 peak_memory_mb: float = 0.0
92 def duration_seconds(self) -> float | None:
93 """Get processing duration in seconds."""
94 if self.start_time is None:
95 return None
96 end = self.end_time or time.time()
97 return end - self.start_time
99 def throughput_items_per_second(self) -> float:
100 """Calculate throughput in items per second."""
101 duration = self.duration_seconds()
102 if duration and duration > 0:
103 return self.items_processed / duration
104 return 0.0
106 def throughput_mb_per_second(self) -> float:
107 """Calculate throughput in MB per second."""
108 duration = self.duration_seconds()
109 if duration and duration > 0:
110 return (self.bytes_processed / (1024 * 1024)) / duration
111 return 0.0
@runtime_checkable
class IStreamSource(Protocol):
    """Interface for stream data sources.

    Note: with @runtime_checkable, isinstance() checks only verify that
    the methods exist, not their signatures or behavior.
    """

    def read_chunk(self) -> StreamChunk | None:
        """Read the next chunk from the source.

        Returns:
            StreamChunk if available, None if exhausted.
        """
        ...

    def __iter__(self) -> Iterator[StreamChunk]:
        """Iterate over chunks until the source is exhausted."""
        ...

    def close(self) -> None:
        """Close the stream source and release any underlying resources."""
        ...
@runtime_checkable
class IStreamSink(Protocol):
    """Interface for stream data sinks.

    Note: with @runtime_checkable, isinstance() checks only verify that
    the methods exist, not their signatures or behavior.
    """

    def write_chunk(self, chunk: StreamChunk) -> bool:
        """Write a chunk to the sink.

        Args:
            chunk: The chunk to write.

        Returns:
            True if successful, False otherwise.
        """
        ...

    def flush(self) -> None:
        """Flush any buffered data."""
        ...

    def close(self) -> None:
        """Close the stream sink and release any underlying resources."""
        ...
class StreamContext:
    """Context for managing stream processing.

    This class coordinates stream sources, sinks, and processing
    with support for backpressure, thread-based parallelism, and metrics.
    """

    def __init__(self, config: StreamConfig | None = None):
        """Initialize stream context.

        Args:
            config: Stream configuration; defaults to StreamConfig().
        """
        self.config = config or StreamConfig()
        self.status = StreamStatus.IDLE
        self.metrics = StreamMetrics()

        # Internal queues for processing, bounded to cap memory usage.
        self._input_queue: queue.Queue[StreamChunk | None] = queue.Queue(
            maxsize=self.config.buffer_size
        )
        self._output_queue: queue.Queue[StreamChunk | None] = queue.Queue(
            maxsize=self.config.buffer_size
        )

        # Threading support
        self._lock = threading.RLock()
        self._stop_event = threading.Event()
        self._workers: list[threading.Thread] = []

        # Registered processors, applied to each chunk in registration order
        self._processors: list[Callable[[StreamChunk], StreamChunk | None]] = []

        # Backpressure management
        self._backpressure_active = False
        self._last_backpressure_check = time.time()

    def add_processor(
        self,
        processor: Callable[[StreamChunk], StreamChunk | None]
    ) -> None:
        """Add a chunk processor function.

        Args:
            processor: Function applied to each chunk; returning None
                filters the chunk out of the stream.
        """
        self._processors.append(processor)

    def _check_backpressure(self) -> None:
        """Check queue depths and toggle the backpressure/PAUSED state."""
        current_time = time.time()
        if current_time - self._last_backpressure_check < 0.1:
            return  # Throttle checks to at most ~10 per second

        self._last_backpressure_check = current_time

        # Check queue sizes (qsize() is approximate under concurrency)
        input_size = self._input_queue.qsize()
        output_size = self._output_queue.qsize()

        if (input_size > self.config.backpressure_threshold or
            output_size > self.config.backpressure_threshold):
            if not self._backpressure_active:
                self._backpressure_active = True
                self.status = StreamStatus.PAUSED
            # Could implement more sophisticated backpressure handling
            time.sleep(0.01)  # Brief pause to let consumers drain the queues
        else:
            if self._backpressure_active:
                self._backpressure_active = False
                self.status = StreamStatus.ACTIVE

    def get_next_chunk(self) -> StreamChunk | None:
        """Get the next chunk from the input queue without blocking long.

        Returns:
            Next chunk, or None if no chunk is currently available.
        """
        try:
            # Very short timeout keeps callers responsive on an empty queue
            return self._input_queue.get(timeout=0.001)
        except queue.Empty:
            return None

    def add_chunk(self, chunk: StreamChunk) -> bool:
        """Add a chunk to the input queue for processing.

        Args:
            chunk: The chunk to add.

        Returns:
            True if added successfully, False if the queue is full.
        """
        try:
            self._input_queue.put(chunk, timeout=0.001)
            return True
        except queue.Full:
            return False

    def add_data(self, data: Any, chunk_id: str | None = None, is_last: bool = False) -> bool:
        """Add data as a chunk to the stream.

        Args:
            data: The data to add (wrapped in a single-item list unless it
                is already a list).
            chunk_id: Optional chunk ID; a random UUID4 is used when omitted.
            is_last: Whether this is the last chunk.

        Returns:
            True if added successfully, False if queue is full.
        """
        # uuid4 is imported at module level; the previous local
        # `import uuid` was redundant.
        chunk = StreamChunk(
            data=data if isinstance(data, list) else [data],
            chunk_id=chunk_id or str(uuid4()),
            is_last=is_last
        )
        return self.add_chunk(chunk)

    def _process_chunk(self, chunk: StreamChunk) -> StreamChunk | None:
        """Process a chunk through all registered processors.

        Args:
            chunk: The chunk to process.

        Returns:
            Processed chunk, or None if filtered out or permanently failed.
        """
        result: StreamChunk | None = chunk
        for processor in self._processors:
            if result is None:
                break
            try:
                result = processor(result)
            except Exception:
                self.metrics.errors_count += 1
                if not self.config.retry_on_error:
                    return None
                # Retry with the same input the processor failed on (the
                # output of the previous stage). Retrying with the raw
                # `chunk` would silently discard earlier processors' work.
                retry_input = result
                for attempt in range(self.config.max_retries):
                    try:
                        self.metrics.retries_count += 1
                        result = processor(retry_input)
                        break
                    except Exception:
                        if attempt == self.config.max_retries - 1:
                            return None

        return result

    def _worker_thread(self) -> None:
        """Worker loop: process input-queue chunks until poisoned or stopped."""
        while not self._stop_event.is_set():
            try:
                # Timeout so the stop event is polled regularly
                chunk = self._input_queue.get(timeout=0.1)
                if chunk is None:
                    # Poison pill - propagate to output so the writer can
                    # count finished workers, then exit this worker.
                    self._output_queue.put(None)
                    break

                # Process chunk through the registered processors
                processed = self._process_chunk(chunk)

                if processed is not None:
                    # Put in output queue
                    self._output_queue.put(processed)

                    # Update metrics under the lock (shared across workers)
                    with self._lock:
                        self.metrics.chunks_processed += 1
                        if hasattr(processed.data, '__len__'):
                            self.metrics.items_processed += len(processed.data)

                # Check backpressure
                self._check_backpressure()

            except queue.Empty:
                continue
            except Exception:
                with self._lock:
                    self.metrics.errors_count += 1
                    self.status = StreamStatus.ERROR

    def stream(
        self,
        source: IStreamSource,
        sink: IStreamSink,
        transform: Callable[[Any], Any] | None = None
    ) -> StreamMetrics:
        """Stream data from source to sink with optional transformation.

        Args:
            source: Data source.
            sink: Data sink.
            transform: Optional transformation applied to each chunk's data.

        Returns:
            Stream processing metrics.
        """
        if transform:
            # Wrap the data-level transform as a chunk-level processor,
            # preserving all other chunk attributes.
            self.add_processor(lambda c: StreamChunk(
                data=transform(c.data),
                chunk_id=c.chunk_id,
                sequence_number=c.sequence_number,
                metadata=c.metadata,
                timestamp=c.timestamp,
                is_last=c.is_last
            ))

        # Start metrics
        self.metrics.start_time = time.time()
        self.status = StreamStatus.ACTIVE

        # Start worker threads
        for i in range(self.config.parallelism):
            worker = threading.Thread(
                target=self._worker_thread,
                name=f"stream-worker-{i}"
            )
            worker.daemon = True
            worker.start()
            self._workers.append(worker)

        # Reader thread: pull chunks from the source into the input queue.
        def read_thread():
            try:
                for chunk in source:
                    if self._stop_event.is_set():
                        break
                    self._input_queue.put(chunk)
                    if chunk.is_last:
                        break
            finally:
                # Send one poison pill per worker so each can shut down,
                # then close the source even on error.
                for _ in range(self.config.parallelism):
                    self._input_queue.put(None)
                source.close()

        reader = threading.Thread(target=read_thread, name="stream-reader")
        reader.daemon = True
        reader.start()

        # Writer thread: drain the output queue into the sink until every
        # worker has reported completion via its poison pill.
        def write_thread():
            poison_pills = 0
            try:
                while poison_pills < self.config.parallelism:
                    try:
                        chunk = self._output_queue.get(timeout=0.1)
                    except queue.Empty:
                        # A transient gap in output is normal while workers
                        # are busy; previously this fell into the generic
                        # handler below, flagging a spurious ERROR and
                        # closing the sink prematurely.
                        continue
                    if chunk is None:
                        poison_pills += 1
                        continue

                    success = sink.write_chunk(chunk)
                    if not success:
                        with self._lock:
                            self.metrics.errors_count += 1

                    if chunk.is_last:
                        break
            except Exception:
                with self._lock:
                    self.metrics.errors_count += 1
                    self.status = StreamStatus.ERROR
            finally:
                sink.flush()
                sink.close()

        writer = threading.Thread(target=write_thread, name="stream-writer")
        writer.daemon = True
        writer.start()

        # Wait for completion with optional timeout (None waits indefinitely)
        try:
            reader.join(timeout=self.config.timeout_seconds)
            for worker in self._workers:
                worker.join(timeout=1)
            writer.join(timeout=self.config.timeout_seconds)
        except Exception:
            self._stop_event.set()
            self.status = StreamStatus.ERROR
        finally:
            # Update final metrics
            self.metrics.end_time = time.time()
            if self.status != StreamStatus.ERROR:
                self.status = StreamStatus.COMPLETED

        return self.metrics

    @contextmanager
    def streaming_context(self):
        """Context manager that closes this context on exit.

        Yields:
            This StreamContext instance.
        """
        try:
            yield self
        finally:
            self.close()

    def close(self) -> None:
        """Close the stream context and clean up resources."""
        self._stop_event.set()

        # Wait briefly for worker threads to finish
        for worker in self._workers:
            worker.join(timeout=0.5)

        # Preserve an ERROR status so callers can still detect failure;
        # only a non-errored context transitions to COMPLETED.
        if self.status != StreamStatus.ERROR:
            self.status = StreamStatus.COMPLETED
        self.metrics.end_time = self.metrics.end_time or time.time()
class AsyncStreamContext:
    """Async version of StreamContext for async/await support.

    Uses a read task, N processing tasks (N = config.parallelism), and a
    write task wired together by two bounded asyncio queues, with None as
    a poison-pill sentinel to signal shutdown downstream.
    """

    def __init__(self, config: StreamConfig | None = None):
        """Initialize async stream context.

        Args:
            config: Stream configuration; defaults to StreamConfig().
        """
        self.config = config or StreamConfig()
        self.status = StreamStatus.IDLE
        self.metrics = StreamMetrics()

        # Async queues
        # NOTE(review): creating asyncio.Queue/Event in __init__ assumes this
        # object is used with a single event loop -- confirm callers never
        # share one AsyncStreamContext across loops.
        self._input_queue: asyncio.Queue[StreamChunk | None] = asyncio.Queue(
            maxsize=self.config.buffer_size
        )
        self._output_queue: asyncio.Queue[StreamChunk | None] = asyncio.Queue(
            maxsize=self.config.buffer_size
        )

        # Processors applied to each chunk in registration order
        self._processors: list[Callable[[StreamChunk], StreamChunk | None]] = []
        self._stop_event = asyncio.Event()

    async def stream_async(
        self,
        source: AsyncIterator[StreamChunk],
        sink: Callable[[StreamChunk], bool],
        transform: Callable[[Any], Any] | None = None
    ) -> StreamMetrics:
        """Async streaming from source to sink.

        Args:
            source: Async data source iterator.
            sink: Sink function; returns True on successful write.
            transform: Optional transformation applied to each chunk's data.

        Returns:
            Stream processing metrics.
        """
        if transform:
            # Wrap the data-level transform as a chunk-level processor,
            # preserving all other chunk attributes.
            self._processors.append(lambda c: StreamChunk(
                data=transform(c.data),
                chunk_id=c.chunk_id,
                sequence_number=c.sequence_number,
                metadata=c.metadata,
                timestamp=c.timestamp,
                is_last=c.is_last
            ))

        self.metrics.start_time = time.time()
        self.status = StreamStatus.ACTIVE

        # Create async tasks for reading, processing, and writing
        async def read_task():
            # Pull chunks from the async source into the input queue until
            # exhaustion, the last chunk, or an external stop request.
            try:
                async for chunk in source:
                    if self._stop_event.is_set():
                        break
                    await self._input_queue.put(chunk)
                    if chunk.is_last:
                        break
            finally:
                # Send poison pills -- one per processing task -- even on error
                for _ in range(self.config.parallelism):
                    await self._input_queue.put(None)

        async def process_task():
            # Apply every processor to each chunk; a None pill is forwarded
            # to the writer and ends this task.
            # NOTE(review): unlike the threaded StreamContext, there is no
            # retry/error handling here -- a processor exception propagates
            # out through gather() and aborts the whole stream.
            while not self._stop_event.is_set():
                chunk = await self._input_queue.get()
                if chunk is None:
                    await self._output_queue.put(None)
                    break

                # Process through all processors (a falsy result is dropped)
                result = chunk
                for processor in self._processors:
                    if result:
                        result = processor(result)

                if result:
                    await self._output_queue.put(result)
                    self.metrics.chunks_processed += 1

        async def write_task():
            # Drain processed chunks into the sink until every processing
            # task has sent its poison pill, or the last chunk is written.
            poison_pills = 0
            while poison_pills < self.config.parallelism:
                chunk = await self._output_queue.get()
                if chunk is None:
                    poison_pills += 1
                    continue

                if not sink(chunk):
                    self.metrics.errors_count += 1

                if chunk.is_last:
                    break

        # Run all tasks concurrently
        tasks = [
            asyncio.create_task(read_task()),
            *[asyncio.create_task(process_task()) for _ in range(self.config.parallelism)],
            asyncio.create_task(write_task())
        ]

        try:
            await asyncio.gather(*tasks)
            self.status = StreamStatus.COMPLETED
        except Exception:
            # On any task failure, stop the pipeline and cancel the rest
            # so no task is leaked.
            self._stop_event.set()
            self.status = StreamStatus.ERROR
            for task in tasks:
                task.cancel()
        finally:
            self.metrics.end_time = time.time()

        return self.metrics
class BasicStreamProcessor:
    """Basic stream processor: copies chunks from a source to a sink,
    optionally transforming each chunk, and reports simple statistics."""

    def __init__(
        self,
        source: IStreamSource,
        sink: IStreamSink,
        transform_func: Union[Callable, None] = None,
        buffer_size: int = 1000
    ):
        """Initialize stream processor.

        Args:
            source: Stream source.
            sink: Stream sink.
            transform_func: Optional chunk transformation function; a falsy
                return value leaves the original chunk unchanged.
            buffer_size: Buffer size for processing.
        """
        self.source = source
        self.sink = sink
        self.transform_func = transform_func
        self.buffer_size = buffer_size
        self.processed_chunks = 0   # chunks successfully written to the sink
        self.processed_records = 0  # records contained in those chunks
        self.errors = []            # human-readable error messages

    def process(self) -> Dict[str, Any]:
        """Process the entire stream synchronously.

        Returns:
            Processing statistics with keys: processed_chunks,
            processed_records, duration, errors, success.
        """
        start_time = time.time()

        try:
            # Process all chunks
            for chunk in self.source:
                try:
                    # Apply transformation if provided; fall back to the
                    # original chunk when the transform returns a falsy value
                    chunk_to_write = chunk
                    if self.transform_func:
                        transformed_chunk = self.transform_func(chunk)
                        if transformed_chunk:
                            chunk_to_write = transformed_chunk

                    # Write to sink
                    success = self.sink.write_chunk(chunk_to_write)
                    if success:
                        self.processed_chunks += 1
                        # NOTE(review): counts records from the ORIGINAL
                        # chunk, not the transformed one -- confirm intended
                        self.processed_records += len(chunk.data) if hasattr(chunk.data, '__len__') else 1
                    else:
                        self.errors.append(f"Failed to write chunk {self.processed_chunks}")

                except Exception as e:
                    # Keep going on per-chunk failures; record and continue
                    self.errors.append(f"Error processing chunk {self.processed_chunks}: {e!s}")
                    continue

            # Flush sink
            self.sink.flush()

        except Exception as e:
            self.errors.append(f"Stream processing error: {e!s}")
        finally:
            # Clean up both endpoints even on error
            self.source.close()
            self.sink.close()

        end_time = time.time()

        return {
            'processed_chunks': self.processed_chunks,
            'processed_records': self.processed_records,
            'duration': end_time - start_time,
            'errors': self.errors,
            'success': len(self.errors) == 0
        }

    async def process_async(self) -> Dict[str, Any]:
        """Process the stream asynchronously.

        Runs the synchronous process() in a worker thread so the event
        loop is not blocked.

        Returns:
            Processing statistics (same shape as process()).
        """
        # asyncio.get_event_loop() is deprecated inside coroutines;
        # asyncio.to_thread is the modern equivalent and uses the running
        # loop's default executor. asyncio is already imported at module top.
        return await asyncio.to_thread(self.process)
class MemoryStreamSource:
    """Simple in-memory stream source for testing."""

    def __init__(self, data: List[Any], chunk_size: int = 100):
        """Initialize with data.

        Args:
            data: List of data items to serve.
            chunk_size: Number of items per chunk.
        """
        self.data = data
        self.chunk_size = chunk_size
        self.current_index = 0

    def read_chunk(self) -> StreamChunk | None:
        """Return the next chunk of data, or None once exhausted."""
        start = self.current_index
        total = len(self.data)
        if start >= total:
            return None

        stop = min(start + self.chunk_size, total)
        self.current_index = stop
        return StreamChunk(
            data=self.data[start:stop],
            chunk_id=f"chunk_{start // self.chunk_size}",
            timestamp=time.time(),
            is_last=stop >= total
        )

    def __iter__(self) -> Iterator[StreamChunk]:
        """Yield chunks until the underlying data is exhausted."""
        while (chunk := self.read_chunk()) is not None:
            yield chunk

    def close(self) -> None:
        """Close source (no-op for in-memory data)."""
class MemoryStreamSink:
    """Simple in-memory stream sink for testing."""

    def __init__(self):
        """Initialize sink with empty chunk and record buffers."""
        self.chunks = []
        self.records = []

    def write_chunk(self, chunk: StreamChunk) -> bool:
        """Store a chunk, and its individual records, in memory.

        Iterable chunk data is flattened into `records`; scalar data is
        appended as a single record.

        Returns:
            True on success, False if storing raised.
        """
        try:
            self.chunks.append(chunk)
            payload = chunk.data
            if hasattr(payload, '__iter__'):
                self.records.extend(payload)
            else:
                self.records.append(payload)
        except Exception:
            return False
        return True

    def flush(self) -> None:
        """Flush (no-op for memory)."""

    def close(self) -> None:
        """Close sink (no-op for memory)."""