Coverage for src/dataknobs_fsm/api/async_simple.py: 24%
126 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:46 -0600
1"""Async-first API for FSM operations.
This module provides an async-first interface for FSM operations,
designed to work natively in async contexts without any asyncio.run() calls.
"""
import asyncio
import contextlib
from collections.abc import AsyncIterator, Callable
from pathlib import Path
from typing import Any

from dataknobs_data import Record

from ..config.builder import FSMBuilder
from ..config.loader import ConfigLoader
from ..core.context_factory import ContextFactory
from ..core.data_modes import DataHandlingMode
from ..core.result_formatter import ResultFormatter
from ..execution.async_batch import AsyncBatchExecutor
from ..execution.async_engine import AsyncExecutionEngine
from ..execution.async_stream import AsyncStreamExecutor
from ..resources.manager import ResourceManager
from ..streaming.core import StreamConfig as CoreStreamConfig
class AsyncSimpleFSM:
    """Async-first FSM interface for processing data.

    This class provides a fully asynchronous API for FSM operations,
    designed to work natively in async contexts (no ``asyncio.run()``
    calls are made internally).
    """

    def __init__(
        self,
        config: str | Path | dict[str, Any],
        data_mode: DataHandlingMode = DataHandlingMode.COPY,
        resources: dict[str, Any] | None = None,
        custom_functions: dict[str, Callable] | None = None
    ):
        """Initialize AsyncSimpleFSM from configuration.

        Args:
            config: Path to config file or config dictionary.
            data_mode: Default data mode for processing.
            resources: Optional resource configurations.
            custom_functions: Optional custom functions to register.
        """
        self.data_mode = data_mode
        self._resources = resources or {}
        self._custom_functions = custom_functions or {}

        # Create the loader and tell it about the registered function names
        # so config validation accepts references to our custom functions.
        loader = ConfigLoader()
        for name in self._custom_functions:
            loader.add_registered_function(name)

        # Load configuration from a file path or an in-memory dict.
        if isinstance(config, (str, Path)):
            self._config = loader.load_from_file(Path(config))
        else:
            self._config = loader.load_from_dict(config)

        # Build the FSM, registering custom functions with the builder first.
        builder = FSMBuilder()
        for name, func in self._custom_functions.items():
            builder.register_function(name, func)
        self._fsm = builder.build(self._config)

        # Initialize the resource manager and register configured resources.
        self._resource_manager = ResourceManager()
        self._setup_resources()

        # Create the async execution engine used by process().
        self._async_engine = AsyncExecutionEngine(self._fsm)

    def _setup_resources(self) -> None:
        """Set up resources from configuration (best-effort).

        Resource-creation failures are deliberately suppressed: this is the
        simplified API, so a resource that cannot be built is skipped rather
        than aborting FSM construction.
        """
        # Register resources declared in the loaded configuration.
        if hasattr(self._config, 'resources'):
            for resource_config in self._config.resources:
                with contextlib.suppress(Exception):
                    resource = self._create_resource_provider(resource_config)
                    self._resource_manager.register_provider(
                        resource_config.name, resource
                    )

        # Register additional resources passed to the constructor.
        for name, resource_config in self._resources.items():
            with contextlib.suppress(Exception):
                # Use the ResourceManager factory method.
                self._resource_manager.register_from_dict(name, resource_config)

    def _create_resource_provider(self, resource_config):
        """Create a resource provider from a ResourceConfig.

        Delegates to FSMBuilder (imported at module level) so the simplified
        API builds resources with exactly the same logic as the full builder.
        """
        builder = FSMBuilder()
        # NOTE(review): reaches into FSMBuilder's private _create_resource;
        # a public factory method would be preferable if one exists.
        return builder._create_resource(resource_config)

    @staticmethod
    def _to_record(data: dict[str, Any] | Record) -> Record:
        """Coerce a plain dict to a Record; pass Record instances through."""
        return Record(data) if isinstance(data, dict) else data

    async def process(self, data: dict[str, Any] | Record) -> dict[str, Any]:
        """Process a single record through the FSM asynchronously.

        Args:
            data: Input data to process.

        Returns:
            Dict containing the processed result, or a formatted error
            result if execution raised.
        """
        record = self._to_record(data)

        # Local import kept inside the method (matches the original
        # placement — presumably avoids an import cycle; confirm).
        from ..core.modes import ProcessingMode
        context = ContextFactory.create_context(
            fsm=self._fsm,
            data=record,
            data_mode=ProcessingMode.SINGLE,
            resource_manager=self._resource_manager
        )

        try:
            # Execute the FSM asynchronously and format the outcome.
            success, result = await self._async_engine.execute(context)
            return ResultFormatter.format_single_result(
                context=context,
                success=success,
                result=result
            )
        except Exception as e:
            # Surface execution failures as a formatted error result rather
            # than propagating, matching the simplified-API contract.
            return ResultFormatter.format_error_result(context=context, error=e)

    async def process_batch(
        self,
        data: list[dict[str, Any] | Record],
        batch_size: int = 10,
        max_workers: int = 4,
        on_progress: Callable | None = None
    ) -> list[dict[str, Any]]:
        """Process multiple records in parallel batches asynchronously.

        Args:
            data: List of input records to process.
            batch_size: Number of records per batch.
            max_workers: Maximum parallel workers.
            on_progress: Optional callback for progress updates.

        Returns:
            List of result dicts, one per input record, each with keys
            'final_state', 'data', 'path', 'success' and 'error'.
        """
        batch_executor = AsyncBatchExecutor(
            fsm=self._fsm,
            parallelism=max_workers,
            batch_size=batch_size,
            progress_callback=on_progress
        )

        records = [self._to_record(item) for item in data]
        results = await batch_executor.execute_batch(items=records)
        return [self._format_batch_result(result) for result in results]

    @staticmethod
    def _format_batch_result(result) -> dict[str, Any]:
        """Convert one batch-executor result into the simplified result dict."""
        if result.success:
            return {
                'final_state': result.metadata.get('final_state', 'unknown'),
                'data': result.result,
                'path': result.metadata.get('path', []),
                'success': True,
                'error': None
            }
        return {
            'final_state': result.metadata.get('final_state', None),
            # Fall back to an empty dict for falsy (e.g. None) results.
            'data': result.result if result.result else {},
            'path': result.metadata.get('path', []),
            'success': False,
            'error': str(result.error) if result.error else str(result.result)
        }

    async def process_stream(
        self,
        source: str | AsyncIterator[dict[str, Any]],
        sink: str | None = None,
        chunk_size: int = 100,
        on_progress: Callable | None = None,
        input_format: str = 'auto',
        text_field_name: str = 'text',
        csv_delimiter: str = ',',
        csv_has_header: bool = True,
        skip_empty_lines: bool = True,
        use_streaming: bool = False
    ) -> dict[str, Any]:
        """Process a stream of data through the FSM asynchronously.

        Args:
            source: Data source (file path or async iterator).
            sink: Optional output destination.
            chunk_size: Size of processing chunks.
            on_progress: Optional progress callback.
            input_format: Input file format ('auto', 'jsonl', 'json', 'csv', 'text').
            text_field_name: Field name for text lines when converting to dict.
            csv_delimiter: CSV delimiter character.
            csv_has_header: Whether CSV file has header row.
            skip_empty_lines: Skip empty lines in text files.
            use_streaming: Use memory-efficient streaming for large files.

        Returns:
            Dict containing stream processing statistics.
        """
        # Configure streaming.
        stream_config = CoreStreamConfig(
            chunk_size=chunk_size,
            parallelism=4,
            memory_limit_mb=1024
        )

        # Create the async stream executor.
        stream_executor = AsyncStreamExecutor(
            fsm=self._fsm,
            stream_config=stream_config,
            progress_callback=on_progress
        )

        # Sink callable and its cleanup hook; both stay None when no sink
        # was requested.
        sink_func = None
        cleanup_func = None

        if use_streaming and isinstance(source, str):
            # Memory-efficient streaming for large files.
            from ..utils.streaming_file_utils import (
                create_streaming_file_reader,
                create_streaming_file_writer,
            )

            stream_source = create_streaming_file_reader(
                file_path=source,
                config=stream_config,
                input_format=input_format,
                text_field_name=text_field_name,
                csv_delimiter=csv_delimiter,
                csv_has_header=csv_has_header,
                skip_empty_lines=skip_empty_lines
            )

            if sink:
                sink_func, cleanup_func = await create_streaming_file_writer(
                    file_path=sink,
                    config=stream_config
                )
        else:
            # Regular mode (loads full chunks into memory).
            from ..utils.file_utils import create_file_reader, create_file_writer

            if isinstance(source, str):
                stream_source = create_file_reader(
                    file_path=source,
                    input_format=input_format,
                    text_field_name=text_field_name,
                    csv_delimiter=csv_delimiter,
                    csv_has_header=csv_has_header,
                    skip_empty_lines=skip_empty_lines
                )
            else:
                # Already an async iterator.
                stream_source = source

            if sink:
                sink_func, cleanup_func = create_file_writer(sink)

        try:
            # Execute the stream using the async executor.
            result = await stream_executor.execute_stream(
                source=stream_source,
                sink=sink_func,
                chunk_size=chunk_size
            )

            return {
                'total_processed': result.total_processed,
                'successful': result.successful,
                'failed': result.failed,
                'duration': result.duration,
                'throughput': result.throughput
            }
        finally:
            # Clean up any resources (e.g. close files); the cleanup hook may
            # be sync or async depending on which writer produced it.
            if cleanup_func:
                if asyncio.iscoroutinefunction(cleanup_func):
                    await cleanup_func()
                else:
                    cleanup_func()

    async def validate(self, data: dict[str, Any] | Record) -> dict[str, Any]:
        """Validate data against the FSM's start-state schema asynchronously.

        Args:
            data: Data to validate.

        Returns:
            Dict with 'valid' (bool) and 'errors' (list). A start state
            without a schema validates trivially.
        """
        record = self._to_record(data)

        start_state = self._fsm.get_start_state()

        if start_state.schema:
            validation_result = start_state.schema.validate(record)
            return {
                'valid': validation_result.valid,
                'errors': validation_result.errors if not validation_result.valid else []
            }
        # No schema on the start state: everything is considered valid.
        return {'valid': True, 'errors': []}

    def get_states(self) -> list[str]:
        """Get list of all state names across every network in the FSM."""
        return [
            state.name
            for network in self._fsm.networks.values()
            for state in network.states.values()
        ]

    def get_resources(self) -> list[str]:
        """Get list of registered resource names."""
        # NOTE(review): reads ResourceManager's private _resources mapping;
        # prefer a public accessor if the manager grows one.
        return list(self._resource_manager._resources.keys())

    @property
    def config(self) -> Any:
        """Get the FSM configuration object."""
        return self._config

    async def close(self) -> None:
        """Clean up resources and close connections asynchronously."""
        await self._resource_manager.cleanup()

    # Alias for consistency with other async libraries.
    aclose = close
382# Factory function for AsyncSimpleFSM
async def create_async_fsm(
    config: str | Path | dict[str, Any],
    custom_functions: dict[str, Callable] | None = None,
    **kwargs
) -> AsyncSimpleFSM:
    """Create and return a configured AsyncSimpleFSM instance.

    Args:
        config: Configuration file path or dictionary.
        custom_functions: Optional custom functions to register.
        **kwargs: Additional keyword arguments forwarded to AsyncSimpleFSM.

    Returns:
        Configured AsyncSimpleFSM instance.
    """
    fsm = AsyncSimpleFSM(config, custom_functions=custom_functions, **kwargs)
    return fsm