Coverage for src/dataknobs_fsm/streaming/core.py: 27%

366 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:51 -0600

1"""Core streaming interfaces and implementations for FSM data processing.""" 

2 

3import asyncio 

4import queue 

5import threading 

6import time 

7from contextlib import contextmanager 

8from dataclasses import dataclass, field 

9from enum import Enum 

10from typing import Any, AsyncIterator, Callable, Dict, Iterator, List, Protocol, Union, runtime_checkable 

11from uuid import uuid4 

12 

13 

class StreamStatus(Enum):
    """Stream processing status."""
    IDLE = "idle"            # created but not started
    ACTIVE = "active"        # currently processing chunks
    PAUSED = "paused"        # temporarily halted (set when backpressure triggers)
    COMPLETED = "completed"  # finished without a fatal error
    ERROR = "error"          # a worker or writer hit an unhandled exception

21 

22 

@dataclass
class StreamConfig:
    """Configuration for stream processing.

    Attributes:
        chunk_size: Number of items per chunk.
        buffer_size: Maximum items to buffer in memory (queue maxsize).
        parallelism: Number of parallel workers for processing.
        memory_limit_mb: Maximum memory usage in MB.
        backpressure_threshold: Queue size that triggers backpressure.
        timeout_seconds: Maximum time for stream processing; None means
            wait indefinitely (passed straight to ``Thread.join``).
        enable_metrics: Whether to collect metrics.
        retry_on_error: Whether to retry failed chunks.
        max_retries: Maximum retry attempts for failed chunks.
    """
    chunk_size: int = 1000
    buffer_size: int = 10000
    parallelism: int = 1
    memory_limit_mb: int = 512
    backpressure_threshold: int = 5000
    timeout_seconds: float | None = None
    enable_metrics: bool = True
    retry_on_error: bool = True
    max_retries: int = 3

47 

48 

@dataclass
class StreamChunk:
    """A chunk of data in a stream.

    Attributes:
        data: The chunk data (typically a list of items).
        chunk_id: Unique chunk identifier (random UUID by default).
        sequence_number: Position in the stream.
        metadata: Additional chunk metadata.
        timestamp: Creation timestamp (``time.time``).
        is_last: Whether this is the last chunk; used by readers/writers
            to terminate their loops.
    """
    data: Any
    chunk_id: str = field(default_factory=lambda: str(uuid4()))
    sequence_number: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    timestamp: float = field(default_factory=time.time)
    is_last: bool = False

67 

68 

69@dataclass 

70class StreamMetrics: 

71 """Metrics for stream processing. 

72  

73 Attributes: 

74 chunks_processed: Number of chunks processed. 

75 bytes_processed: Total bytes processed. 

76 items_processed: Total items processed. 

77 errors_count: Number of errors encountered. 

78 retries_count: Number of retries performed. 

79 start_time: Processing start timestamp. 

80 end_time: Processing end timestamp. 

81 peak_memory_mb: Peak memory usage in MB. 

82 """ 

83 chunks_processed: int = 0 

84 bytes_processed: int = 0 

85 items_processed: int = 0 

86 errors_count: int = 0 

87 retries_count: int = 0 

88 start_time: float | None = None 

89 end_time: float | None = None 

90 peak_memory_mb: float = 0.0 

91 

92 def duration_seconds(self) -> float | None: 

93 """Get processing duration in seconds.""" 

94 if self.start_time is None: 

95 return None 

96 end = self.end_time or time.time() 

97 return end - self.start_time 

98 

99 def throughput_items_per_second(self) -> float: 

100 """Calculate throughput in items per second.""" 

101 duration = self.duration_seconds() 

102 if duration and duration > 0: 

103 return self.items_processed / duration 

104 return 0.0 

105 

106 def throughput_mb_per_second(self) -> float: 

107 """Calculate throughput in MB per second.""" 

108 duration = self.duration_seconds() 

109 if duration and duration > 0: 

110 return (self.bytes_processed / (1024 * 1024)) / duration 

111 return 0.0 

112 

113 

@runtime_checkable
class IStreamSource(Protocol):
    """Structural interface for stream data sources.

    ``runtime_checkable`` allows ``isinstance`` checks, which verify only
    that the methods exist, not their signatures.
    """

    def read_chunk(self) -> StreamChunk | None:
        """Read the next chunk from the source.

        Returns:
            StreamChunk if available, None if exhausted.
        """
        ...

    def __iter__(self) -> Iterator[StreamChunk]:
        """Iterate over chunks until the source is exhausted."""
        ...

    def close(self) -> None:
        """Close the stream source and release its resources."""
        ...

133 

134 

@runtime_checkable
class IStreamSink(Protocol):
    """Structural interface for stream data sinks.

    ``runtime_checkable`` allows ``isinstance`` checks, which verify only
    that the methods exist, not their signatures.
    """

    def write_chunk(self, chunk: StreamChunk) -> bool:
        """Write a chunk to the sink.

        Args:
            chunk: The chunk to write.

        Returns:
            True if successful, False otherwise.
        """
        ...

    def flush(self) -> None:
        """Flush any buffered data."""
        ...

    def close(self) -> None:
        """Close the stream sink and release its resources."""
        ...

157 

158 

class StreamContext:
    """Context for managing stream processing.

    This class coordinates stream sources, sinks, and processing
    with support for backpressure, parallelism, and metrics.
    Internally it runs a reader thread, N worker threads, and a writer
    thread connected by two bounded queues; ``None`` queue entries act as
    poison pills that signal shutdown.
    """

    def __init__(self, config: StreamConfig | None = None):
        """Initialize stream context.

        Args:
            config: Stream configuration; a default StreamConfig is used
                when omitted.
        """
        self.config = config or StreamConfig()
        self.status = StreamStatus.IDLE
        self.metrics = StreamMetrics()

        # Internal bounded queues for processing. buffer_size caps memory
        # usage; a full queue blocks producers.
        self._input_queue: queue.Queue[StreamChunk | None] = queue.Queue(
            maxsize=self.config.buffer_size
        )
        self._output_queue: queue.Queue[StreamChunk | None] = queue.Queue(
            maxsize=self.config.buffer_size
        )

        # Threading support: the lock guards metrics updates from workers.
        self._lock = threading.RLock()
        self._stop_event = threading.Event()
        self._workers: list[threading.Thread] = []

        # Registered processors, applied to each chunk in registration order.
        self._processors: list[Callable[[StreamChunk], StreamChunk | None]] = []

        # Backpressure management
        self._backpressure_active = False
        self._last_backpressure_check = time.time()

    def add_processor(
        self,
        processor: Callable[[StreamChunk], StreamChunk | None]
    ) -> None:
        """Add a chunk processor function.

        Args:
            processor: Function to process chunks; returning None filters
                the chunk out of the stream.
        """
        self._processors.append(processor)

    def _check_backpressure(self) -> None:
        """Check and handle backpressure."""
        current_time = time.time()
        if current_time - self._last_backpressure_check < 0.1:
            return  # Throttle checks to at most ~10 per second

        self._last_backpressure_check = current_time

        # Check queue sizes (qsize is approximate with concurrent access)
        input_size = self._input_queue.qsize()
        output_size = self._output_queue.qsize()

        if (input_size > self.config.backpressure_threshold or
                output_size > self.config.backpressure_threshold):
            if not self._backpressure_active:
                self._backpressure_active = True
                self.status = StreamStatus.PAUSED
            # Could implement more sophisticated backpressure handling
            time.sleep(0.01)  # Brief pause to let consumers drain the queues
        else:
            if self._backpressure_active:
                self._backpressure_active = False
                self.status = StreamStatus.ACTIVE

    def get_next_chunk(self) -> StreamChunk | None:
        """Get the next chunk from the stream.

        Non-blocking in practice: uses a 1ms timeout and returns None if
        nothing is available in that window.

        Returns:
            Next chunk or None if no more chunks.
        """
        try:
            # Try to get from input queue with a short timeout
            chunk = self._input_queue.get(timeout=0.001)
            return chunk
        except queue.Empty:
            return None

    def add_chunk(self, chunk: StreamChunk) -> bool:
        """Add a chunk to the input queue for processing.

        Args:
            chunk: The chunk to add.

        Returns:
            True if added successfully, False if queue is full
            (after a 1ms wait).
        """
        try:
            self._input_queue.put(chunk, timeout=0.001)
            return True
        except queue.Full:
            return False

    def add_data(self, data: Any, chunk_id: str | None = None, is_last: bool = False) -> bool:
        """Add data as a chunk to the stream.

        Args:
            data: The data to add (will be wrapped in a StreamChunk;
                non-list data is wrapped in a single-item list).
            chunk_id: Optional chunk ID (random UUID when omitted).
            is_last: Whether this is the last chunk.

        Returns:
            True if added successfully, False if queue is full.
        """
        # NOTE(review): the module already imports uuid4 at top level; this
        # local import is redundant.
        import uuid
        chunk = StreamChunk(
            data=data if isinstance(data, list) else [data],
            chunk_id=chunk_id or str(uuid.uuid4()),
            is_last=is_last
        )
        return self.add_chunk(chunk)

    def _process_chunk(self, chunk: StreamChunk) -> StreamChunk | None:
        """Process a chunk through all processors.

        Args:
            chunk: The chunk to process.

        Returns:
            Processed chunk, or None if a processor filtered it out or
            processing failed after all retries.
        """
        result = chunk
        for processor in self._processors:
            if result is None:
                break  # type: ignore[unreachable]
            try:
                result = processor(result)
            except Exception:
                self.metrics.errors_count += 1
                if self.config.retry_on_error:
                    # Simple retry logic
                    # NOTE(review): retries re-run the processor on the
                    # ORIGINAL chunk, not on the intermediate `result` that
                    # just failed — confirm this is intended when multiple
                    # processors are registered.
                    for retry in range(self.config.max_retries):
                        try:
                            self.metrics.retries_count += 1
                            result = processor(chunk)
                            break
                        except Exception:
                            if retry == self.config.max_retries - 1:
                                return None
                else:
                    return None

        return result

    def _worker_thread(self) -> None:
        """Worker thread for processing chunks.

        Pulls from the input queue, runs the processor pipeline, and
        pushes results to the output queue until stopped or a poison
        pill (None) arrives.
        """
        while not self._stop_event.is_set():
            try:
                # Get chunk with timeout so the stop event is re-checked
                chunk = self._input_queue.get(timeout=0.1)
                if chunk is None:
                    # Poison pill - propagate to output so the writer can
                    # count completed workers
                    self._output_queue.put(None)
                    break

                # Process chunk
                processed = self._process_chunk(chunk)

                if processed is not None:
                    # Put in output queue
                    self._output_queue.put(processed)

                    # Update metrics under the lock (shared across workers)
                    with self._lock:
                        self.metrics.chunks_processed += 1
                        if hasattr(processed.data, '__len__'):
                            self.metrics.items_processed += len(processed.data)

                # Check backpressure
                self._check_backpressure()

            except queue.Empty:
                continue
            except Exception:
                with self._lock:
                    self.metrics.errors_count += 1
                    self.status = StreamStatus.ERROR

    def stream(
        self,
        source: IStreamSource,
        sink: IStreamSink,
        transform: Callable[[Any], Any] | None = None
    ) -> StreamMetrics:
        """Stream data from source to sink with optional transformation.

        Spawns reader/worker/writer threads, waits for them (subject to
        ``config.timeout_seconds``), and returns the collected metrics.

        Args:
            source: Data source.
            sink: Data sink (flushed and closed when writing finishes).
            transform: Optional transformation function applied to each
                chunk's data.

        Returns:
            Stream processing metrics.
        """
        if transform:
            # Wrap the data-level transform as a chunk-level processor that
            # preserves all chunk metadata.
            self.add_processor(lambda c: StreamChunk(
                data=transform(c.data),
                chunk_id=c.chunk_id,
                sequence_number=c.sequence_number,
                metadata=c.metadata,
                timestamp=c.timestamp,
                is_last=c.is_last
            ))

        # Start metrics
        self.metrics.start_time = time.time()
        self.status = StreamStatus.ACTIVE

        # Start worker threads
        for i in range(self.config.parallelism):
            worker = threading.Thread(
                target=self._worker_thread,
                name=f"stream-worker-{i}"
            )
            worker.daemon = True
            worker.start()
            self._workers.append(worker)

        # Reader thread: feed source chunks into the input queue
        def read_thread():
            try:
                for chunk in source:
                    if self._stop_event.is_set():
                        break
                    self._input_queue.put(chunk)
                    if chunk.is_last:
                        break
            finally:
                # Send poison pills to workers (one each) and close source
                for _ in range(self.config.parallelism):
                    self._input_queue.put(None)
                source.close()

        reader = threading.Thread(target=read_thread, name="stream-reader")
        reader.daemon = True
        reader.start()

        # Writer thread: drain processed chunks into the sink
        def write_thread():
            poison_pills = 0
            try:
                while poison_pills < self.config.parallelism:
                    chunk = self._output_queue.get(timeout=0.1)
                    if chunk is None:
                        poison_pills += 1
                        continue

                    success = sink.write_chunk(chunk)
                    if not success:
                        with self._lock:
                            self.metrics.errors_count += 1

                    # NOTE(review): with parallelism > 1, chunks may reach
                    # the writer out of order, so is_last can arrive before
                    # earlier chunks — confirm ordering assumptions.
                    if chunk.is_last:
                        break
            except Exception:
                with self._lock:
                    self.metrics.errors_count += 1
                    self.status = StreamStatus.ERROR
            finally:
                sink.flush()
                sink.close()

        writer = threading.Thread(target=write_thread, name="stream-writer")
        writer.daemon = True
        writer.start()

        # Wait for completion with optional timeout
        # NOTE(review): Thread.join(timeout=...) returns (rather than
        # raising) on timeout, so the except branch below is effectively
        # unreachable and a timed-out join currently goes unnoticed —
        # confirm intended.
        try:
            reader.join(timeout=self.config.timeout_seconds)
            for worker in self._workers:
                worker.join(timeout=1)
            writer.join(timeout=self.config.timeout_seconds)
        except Exception:
            self._stop_event.set()
            self.status = StreamStatus.ERROR
        finally:
            # Update final metrics
            self.metrics.end_time = time.time()
            if self.status != StreamStatus.ERROR:
                self.status = StreamStatus.COMPLETED

        return self.metrics

    @contextmanager
    def streaming_context(self):
        """Context manager for streaming operations.

        Ensures ``close`` runs even if the body raises.

        Yields:
            This StreamContext instance.
        """
        try:
            yield self
        finally:
            self.close()

    def close(self) -> None:
        """Close the stream context and clean up resources."""
        self._stop_event.set()

        # Wait briefly for threads to finish; daemon threads are abandoned
        # if they do not stop within the timeout
        for worker in self._workers:
            worker.join(timeout=0.5)

        self.status = StreamStatus.COMPLETED
        self.metrics.end_time = self.metrics.end_time or time.time()

471 

472 

class AsyncStreamContext:
    """Async version of StreamContext for async/await support.

    Runs a reader task, N processor tasks, and a writer task connected by
    two bounded asyncio queues; ``None`` entries act as poison pills.
    """

    def __init__(self, config: StreamConfig | None = None):
        """Initialize async stream context.

        Args:
            config: Stream configuration; a default StreamConfig is used
                when omitted.
        """
        self.config = config or StreamConfig()
        self.status = StreamStatus.IDLE
        self.metrics = StreamMetrics()

        # Async bounded queues; a full queue suspends the producer task.
        self._input_queue: asyncio.Queue[StreamChunk | None] = asyncio.Queue(
            maxsize=self.config.buffer_size
        )
        self._output_queue: asyncio.Queue[StreamChunk | None] = asyncio.Queue(
            maxsize=self.config.buffer_size
        )

        # Processors applied to each chunk in registration order.
        self._processors: list[Callable[[StreamChunk], StreamChunk | None]] = []
        self._stop_event = asyncio.Event()

    async def stream_async(
        self,
        source: AsyncIterator[StreamChunk],
        sink: Callable[[StreamChunk], bool],
        transform: Callable[[Any], Any] | None = None
    ) -> StreamMetrics:
        """Async streaming from source to sink.

        Args:
            source: Async data source iterator.
            sink: Sink function; a False return counts as a write error.
            transform: Optional transformation applied to each chunk's data.

        Returns:
            Stream processing metrics.
        """
        if transform:
            # Wrap the data-level transform as a chunk-level processor that
            # preserves all chunk metadata.
            self._processors.append(lambda c: StreamChunk(
                data=transform(c.data),
                chunk_id=c.chunk_id,
                sequence_number=c.sequence_number,
                metadata=c.metadata,
                timestamp=c.timestamp,
                is_last=c.is_last
            ))

        self.metrics.start_time = time.time()
        self.status = StreamStatus.ACTIVE

        # Reader: pull chunks from the async source into the input queue
        async def read_task():
            try:
                async for chunk in source:
                    if self._stop_event.is_set():
                        break
                    await self._input_queue.put(chunk)
                    if chunk.is_last:
                        break
            finally:
                # Send one poison pill per processor task so each can exit
                for _ in range(self.config.parallelism):
                    await self._input_queue.put(None)

        # Processor: run every registered processor over each chunk
        async def process_task():
            while not self._stop_event.is_set():
                chunk = await self._input_queue.get()
                if chunk is None:
                    # Forward the poison pill to the writer and exit
                    await self._output_queue.put(None)
                    break

                # Processors run synchronously inside this coroutine; a
                # falsy result filters the chunk out of the stream.
                result = chunk
                for processor in self._processors:
                    if result:
                        result = processor(result)

                if result:
                    await self._output_queue.put(result)
                    self.metrics.chunks_processed += 1

        # Writer: drain processed chunks into the sink until every
        # processor task has signalled completion (one poison pill each)
        async def write_task():
            poison_pills = 0
            while poison_pills < self.config.parallelism:
                chunk = await self._output_queue.get()
                if chunk is None:
                    poison_pills += 1
                    continue

                if not sink(chunk):
                    self.metrics.errors_count += 1

                # NOTE(review): with parallelism > 1 the is_last chunk may
                # arrive before earlier chunks — confirm ordering
                # assumptions.
                if chunk.is_last:
                    break

        # Run reader, N processors, and the writer concurrently
        tasks = [
            asyncio.create_task(read_task()),
            *[asyncio.create_task(process_task()) for _ in range(self.config.parallelism)],
            asyncio.create_task(write_task())
        ]

        try:
            await asyncio.gather(*tasks)
            self.status = StreamStatus.COMPLETED
        except Exception:
            # Any task failure stops the pipeline and cancels its siblings
            self._stop_event.set()
            self.status = StreamStatus.ERROR
            for task in tasks:
                task.cancel()
        finally:
            self.metrics.end_time = time.time()

        return self.metrics

590 

591 

class BasicStreamProcessor:
    """Basic stream processor implementation.

    Pulls every chunk from ``source``, optionally transforms it, writes it
    to ``sink``, and accumulates simple statistics plus per-chunk error
    messages.
    """

    def __init__(
        self,
        source: IStreamSource,
        sink: IStreamSink,
        transform_func: Union[Callable, None] = None,
        buffer_size: int = 1000
    ):
        """Initialize stream processor.

        Args:
            source: Stream source.
            sink: Stream sink.
            transform_func: Optional transformation function. Called with
                each chunk; a truthy return value replaces the chunk to be
                written, a falsy return keeps the original chunk.
            buffer_size: Buffer size for processing.
        """
        self.source = source
        self.sink = sink
        self.transform_func = transform_func
        self.buffer_size = buffer_size
        self.processed_chunks = 0   # chunks successfully written to the sink
        self.processed_records = 0  # items contained in those written chunks
        self.errors: List[str] = []  # human-readable error descriptions

    def process(self) -> Dict[str, Any]:
        """Process the entire stream synchronously.

        Per-chunk failures are recorded in ``errors`` and processing
        continues; source and sink are always closed.

        Returns:
            Processing statistics: ``processed_chunks``,
            ``processed_records``, ``duration`` (seconds), ``errors``
            (list of messages) and ``success`` (True iff no errors).
        """
        start_time = time.time()

        try:
            # Process all chunks
            for chunk in self.source:
                try:
                    # Apply transformation if provided; a falsy result keeps
                    # the original chunk (the transform acts as a filter-free
                    # pass-through in that case).
                    chunk_to_write = chunk
                    if self.transform_func:
                        transformed_chunk = self.transform_func(chunk)
                        if transformed_chunk:
                            chunk_to_write = transformed_chunk

                    # Write to sink
                    success = self.sink.write_chunk(chunk_to_write)
                    if success:
                        self.processed_chunks += 1
                        # Count the records actually written: the transformed
                        # data may differ in length from the source chunk.
                        written = chunk_to_write.data
                        self.processed_records += len(written) if hasattr(written, '__len__') else 1
                    else:
                        self.errors.append(f"Failed to write chunk {self.processed_chunks}")

                except Exception as e:
                    # Record the failure and keep going with the next chunk.
                    self.errors.append(f"Error processing chunk {self.processed_chunks}: {e!s}")
                    continue

            # Flush sink so buffered data is persisted before closing
            self.sink.flush()

        except Exception as e:
            self.errors.append(f"Stream processing error: {e!s}")
        finally:
            # Always release both endpoints, even on failure
            self.source.close()
            self.sink.close()

        end_time = time.time()

        return {
            'processed_chunks': self.processed_chunks,
            'processed_records': self.processed_records,
            'duration': end_time - start_time,
            'errors': self.errors,
            'success': len(self.errors) == 0
        }

    async def process_async(self) -> Dict[str, Any]:
        """Process the stream asynchronously.

        The synchronous ``process`` is offloaded to the default executor so
        the event loop is not blocked by potentially slow synchronous I/O.

        Returns:
            Processing statistics (same shape as ``process``).
        """
        # get_running_loop() is the supported way to reach the loop from
        # inside a coroutine; get_event_loop() is deprecated in this
        # context. asyncio is already imported at module level.
        return await asyncio.get_running_loop().run_in_executor(None, self.process)

679 

680 

class MemoryStreamSource:
    """In-memory stream source for tests: slices a list into StreamChunks."""

    def __init__(self, data: List[Any], chunk_size: int = 100):
        """Initialize with data.

        Args:
            data: List of data items.
            chunk_size: Size of each chunk.
        """
        self.data = data
        self.chunk_size = chunk_size
        self.current_index = 0

    def read_chunk(self) -> StreamChunk | None:
        """Return the next chunk of up to ``chunk_size`` items, or None
        once all data has been consumed."""
        start = self.current_index
        total = len(self.data)
        if start >= total:
            return None

        stop = min(start + self.chunk_size, total)
        self.current_index = stop

        # The final chunk is flagged so downstream consumers can stop.
        return StreamChunk(
            data=self.data[start:stop],
            chunk_id=f"chunk_{start // self.chunk_size}",
            timestamp=time.time(),
            is_last=stop >= total,
        )

    def __iter__(self) -> Iterator[StreamChunk]:
        """Yield chunks until the backing list is exhausted."""
        while (chunk := self.read_chunk()) is not None:
            yield chunk

    def close(self) -> None:
        """Close source (nothing to release for in-memory data)."""
        pass

724 

725 

class MemoryStreamSink:
    """In-memory stream sink for tests: collects chunks and their items."""

    def __init__(self):
        """Initialize sink with empty chunk and record collections."""
        self.chunks = []   # every chunk written, in arrival order
        self.records = []  # flattened items from all written chunks

    def write_chunk(self, chunk: StreamChunk) -> bool:
        """Store the chunk and flatten its data into ``records``.

        Returns:
            True on success, False if storing the chunk raised.
        """
        try:
            self.chunks.append(chunk)
            payload = chunk.data
            # Iterable payloads are flattened; scalars are appended as-is.
            if hasattr(payload, '__iter__'):
                self.records.extend(payload)
            else:
                self.records.append(payload)
        except Exception:
            return False
        return True

    def flush(self) -> None:
        """Flush (no-op for memory)."""
        pass

    def close(self) -> None:
        """Close sink (no-op for memory)."""
        pass