# Coverage report artifact (coverage.py v7.10.6, created 2025-09-20 16:46 -0600):
# src/dataknobs_fsm/api/simple.py — 21% of 125 statements covered.
1"""Simple API for common FSM operations.
3This module provides a simplified synchronous interface for common FSM use cases,
4wrapping the async-first AsyncSimpleFSM implementation.
5"""
import asyncio
import threading
from collections.abc import Callable
from pathlib import Path
from typing import Any

from dataknobs_data import Record

from ..core.data_modes import DataHandlingMode
from .async_simple import AsyncSimpleFSM
class SimpleFSM:
    """Synchronous FSM interface wrapping AsyncSimpleFSM.

    This class provides a purely synchronous API for FSM operations,
    internally delegating to AsyncSimpleFSM via a dedicated event loop
    that runs in a background daemon thread.
    """

    def __init__(
        self,
        config: str | Path | dict[str, Any],
        data_mode: DataHandlingMode = DataHandlingMode.COPY,
        resources: dict[str, Any] | None = None,
        custom_functions: dict[str, Callable] | None = None
    ):
        """Initialize SimpleFSM from configuration.

        Args:
            config: Path to config file or config dictionary
            data_mode: Default data mode for processing
            resources: Optional resource configurations
            custom_functions: Optional custom functions to register
        """
        # Store data_mode for compatibility
        self.data_mode = data_mode

        # Create the async FSM that performs the actual work
        self._async_fsm = AsyncSimpleFSM(
            config=config,
            data_mode=data_mode,
            resources=resources,
            custom_functions=custom_functions
        )

        # Expose internal attributes for compatibility
        self._fsm = self._async_fsm._fsm
        self._resource_manager = self._async_fsm._resource_manager
        self._async_engine = self._async_fsm._async_engine

        # Create synchronous engine for compatibility
        from ..execution.engine import ExecutionEngine
        self._engine = ExecutionEngine(self._fsm)

        # Create a dedicated event loop for sync operations
        self._loop: asyncio.AbstractEventLoop | None = None
        self._loop_thread: threading.Thread | None = None
        self._setup_event_loop()

    def _setup_event_loop(self) -> None:
        """Set up a dedicated event loop in a separate daemon thread."""
        self._loop = asyncio.new_event_loop()

        def run_loop() -> None:
            asyncio.set_event_loop(self._loop)
            self._loop.run_forever()

        # Daemon thread so an un-closed FSM does not block interpreter exit.
        self._loop_thread = threading.Thread(target=run_loop, daemon=True)
        self._loop_thread.start()

    def _run_async(self, coro: Any, timeout: float | None = None) -> Any:
        """Run an async operation in the dedicated event loop.

        Args:
            coro: Coroutine to run
            timeout: Optional maximum seconds to wait for the result.
                When it elapses, the underlying asyncio task is cancelled
                so the coroutine does not keep running unobserved.

        Returns:
            The result of the coroutine

        Raises:
            TimeoutError: If `timeout` elapses before the coroutine finishes.
            RuntimeError: If the event loop could not be set up.
        """
        import concurrent.futures

        # Re-create the loop if it was never started or was stopped
        # (e.g. by a previous close()).
        if not self._loop or not self._loop.is_running():
            self._setup_event_loop()

        if self._loop is None:
            raise RuntimeError("Failed to setup event loop")

        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            # Cancelling the concurrent.futures future cancels the
            # corresponding asyncio task inside the loop.
            future.cancel()
            raise TimeoutError(
                f"FSM execution exceeded timeout of {timeout} seconds"
            ) from None

    def process(
        self,
        data: dict[str, Any] | Record,
        initial_state: str | None = None,
        timeout: float | None = None
    ) -> dict[str, Any]:
        """Process a single record through the FSM synchronously.

        Args:
            data: Input data to process
            initial_state: Optional starting state (defaults to FSM start state)
            timeout: Optional timeout in seconds

        Returns:
            Dict containing the processed result with fields:
            - final_state: Name of the final state reached
            - data: The transformed data
            - path: List of states traversed
            - success: Whether processing succeeded
            - error: Any error message (None if successful)
        """
        # Create the coroutine with the async process method
        async def _process() -> dict[str, Any]:
            # Import here to avoid circular dependency
            from ..core.context_factory import ContextFactory
            from ..core.modes import ProcessingMode
            from ..core.result_formatter import ResultFormatter

            # Convert plain dicts to Record; Record inputs pass through.
            record = Record(data) if isinstance(data, dict) else data

            # Create context
            context = ContextFactory.create_context(
                fsm=self._fsm,
                data=record,
                initial_state=initial_state,
                data_mode=ProcessingMode.SINGLE,
                resource_manager=self._resource_manager
            )

            try:
                # Execute FSM asynchronously
                success, result = await self._async_engine.execute(context)

                # Format result
                return ResultFormatter.format_single_result(
                    context=context,
                    success=success,
                    result=result
                )
            except Exception as e:
                # Return an error result instead of raising
                return ResultFormatter.format_error_result(
                    context=context,
                    error=e
                )

        try:
            # Enforce the timeout inside _run_async, which cancels the task
            # on expiry.  The previous implementation submitted to a
            # throwaway thread pool and called future.cancel(), which is a
            # no-op on an already-running future, so the FSM kept executing
            # after the "timeout".  (`or None` keeps timeout=0 meaning
            # "no timeout", matching the old `if timeout:` truthiness.)
            return self._run_async(_process(), timeout=timeout or None)
        except TimeoutError:
            # Return an error result instead of raising
            return {
                'success': False,
                'error': f"FSM execution exceeded timeout of {timeout} seconds",
                'final_state': None,
                'data': data if isinstance(data, dict) else data.data,
                'path': []
            }

    def process_batch(
        self,
        data: list[dict[str, Any] | Record],
        batch_size: int = 10,
        max_workers: int = 4,
        on_progress: Callable | None = None
    ) -> list[dict[str, Any]]:
        """Process multiple records in parallel batches synchronously.

        Args:
            data: List of input records to process
            batch_size: Number of records per batch
            max_workers: Maximum parallel workers
            on_progress: Optional callback for progress updates

        Returns:
            List of results for each input record
        """
        return self._run_async(
            self._async_fsm.process_batch(
                data=data,
                batch_size=batch_size,
                max_workers=max_workers,
                on_progress=on_progress
            )
        )

    def process_stream(
        self,
        source: str | Any,
        sink: str | None = None,
        chunk_size: int = 100,
        on_progress: Callable | None = None,
        input_format: str = 'auto',
        text_field_name: str = 'text',
        csv_delimiter: str = ',',
        csv_has_header: bool = True,
        skip_empty_lines: bool = True,
        use_streaming: bool = False
    ) -> dict[str, Any]:
        """Process a stream of data through the FSM synchronously.

        Args:
            source: Data source file path or async iterator
            sink: Optional output destination
            chunk_size: Size of processing chunks
            on_progress: Optional progress callback
            input_format: Input file format ('auto', 'jsonl', 'json', 'csv', 'text')
            text_field_name: Field name for text lines when converting to dict
            csv_delimiter: CSV delimiter character
            csv_has_header: Whether CSV file has header row
            skip_empty_lines: Skip empty lines in text files
            use_streaming: Use memory-efficient streaming for large files

        Returns:
            Dict containing stream processing statistics
        """
        # Both file-path and async-iterator sources are handled by the async
        # implementation; the previous code had two branches that built the
        # exact same coroutine, so they are unified here.
        return self._run_async(
            self._async_fsm.process_stream(
                source=source,
                sink=sink,
                chunk_size=chunk_size,
                on_progress=on_progress,
                input_format=input_format,
                text_field_name=text_field_name,
                csv_delimiter=csv_delimiter,
                csv_has_header=csv_has_header,
                skip_empty_lines=skip_empty_lines,
                use_streaming=use_streaming
            )
        )

    def validate(self, data: dict[str, Any] | Record) -> dict[str, Any]:
        """Validate data against FSM's start state schema synchronously.

        Args:
            data: Data to validate

        Returns:
            Dict containing validation results
        """
        return self._run_async(self._async_fsm.validate(data))

    def get_states(self) -> list[str]:
        """Get list of all state names in the FSM."""
        return self._async_fsm.get_states()

    def get_resources(self) -> list[str]:
        """Get list of registered resource names."""
        return self._async_fsm.get_resources()

    @property
    def config(self) -> Any:
        """Get the FSM configuration object."""
        return self._async_fsm._config

    def close(self) -> None:
        """Clean up resources and close connections synchronously."""
        self._run_async(self._async_fsm.close())

        # Shut down the event loop
        if self._loop and self._loop.is_running():
            self._loop.call_soon_threadsafe(self._loop.stop)
        if self._loop_thread and self._loop_thread.is_alive():
            self._loop_thread.join(timeout=1.0)

        # Release the loop's resources once it has actually stopped
        # (the original left the stopped loop un-closed, leaking it).
        # _run_async recreates the loop on demand if reused after close().
        if self._loop and not self._loop.is_running():
            self._loop.close()
            self._loop = None

    async def aclose(self) -> None:
        """Async version of close for use in async contexts."""
        await self._async_fsm.close()
def create_fsm(
    config: str | Path | dict[str, Any],
    custom_functions: dict[str, Callable] | None = None,
    **kwargs
) -> SimpleFSM:
    """Build and return a ready-to-use SimpleFSM.

    Args:
        config: FSM configuration as a file path or a dictionary
        custom_functions: Optional mapping of custom functions to register
        **kwargs: Extra keyword arguments forwarded to the SimpleFSM
            constructor (e.g. ``data_mode``, ``resources``)

    Returns:
        A configured SimpleFSM instance
    """
    # Thin convenience wrapper around the SimpleFSM constructor.
    return SimpleFSM(config, custom_functions=custom_functions, **kwargs)
328# Convenience functions for common operations
def process_file(
    fsm_config: str | Path | dict[str, Any],
    input_file: str,
    output_file: str | None = None,
    input_format: str = 'auto',
    chunk_size: int = 1000,
    timeout: float | None = None,
    text_field_name: str = 'text',
    csv_delimiter: str = ',',
    csv_has_header: bool = True,
    skip_empty_lines: bool = True,
    use_streaming: bool = False
) -> dict[str, Any]:
    """Process a file through an FSM with automatic format detection.

    Args:
        fsm_config: FSM configuration
        input_file: Path to input file
        output_file: Optional output file path (format auto-detected from extension)
        input_format: Input format ('auto', 'jsonl', 'json', 'csv', 'text')
        chunk_size: Processing chunk size
        timeout: Optional timeout in seconds for processing
        text_field_name: Field name for text lines when converting to dict
        csv_delimiter: CSV delimiter character
        csv_has_header: Whether CSV file has header row
        skip_empty_lines: Skip empty lines in text files
        use_streaming: Use memory-efficient streaming for large files

    Returns:
        Processing statistics

    Raises:
        TimeoutError: If `timeout` elapses before processing completes.

    Examples:
        # Process plain text file
        results = process_file('config.yaml', 'input.txt', 'output.jsonl')

        # Process large CSV file with streaming
        results = process_file('config.yaml', 'large_data.csv', 'results.json', use_streaming=True)

        # Process with custom text field name
        results = process_file('config.yaml', 'input.txt', text_field_name='content')
    """
    fsm = create_fsm(fsm_config)

    try:
        if timeout:
            # Use threading timeout
            import concurrent.futures

            # NOTE: the previous `with ThreadPoolExecutor(...)` form was
            # broken for timeouts: __exit__ calls shutdown(wait=True),
            # blocking until the worker finishes, so the TimeoutError was
            # only raised *after* processing completed anyway.  Shut down
            # without waiting instead so the timeout is enforced for real.
            executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(
                    fsm.process_stream,
                    source=input_file,
                    sink=output_file,
                    chunk_size=chunk_size,
                    input_format=input_format,
                    text_field_name=text_field_name,
                    csv_delimiter=csv_delimiter,
                    csv_has_header=csv_has_header,
                    skip_empty_lines=skip_empty_lines,
                    use_streaming=use_streaming
                )
                result = future.result(timeout=timeout)
            except concurrent.futures.TimeoutError as e:
                raise TimeoutError(f"File processing exceeded timeout of {timeout} seconds") from e
            finally:
                # Do not block on the worker; drop it if still pending.
                executor.shutdown(wait=False, cancel_futures=True)
        else:
            result = fsm.process_stream(
                source=input_file,
                sink=output_file,
                chunk_size=chunk_size,
                input_format=input_format,
                text_field_name=text_field_name,
                csv_delimiter=csv_delimiter,
                csv_has_header=csv_has_header,
                skip_empty_lines=skip_empty_lines,
                use_streaming=use_streaming
            )
        return result
    finally:
        fsm.close()
def validate_data(
    fsm_config: str | Path | dict[str, Any],
    data: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Validate multiple data records against FSM schema.

    Args:
        fsm_config: FSM configuration
        data: List of data records to validate

    Returns:
        List of validation results, one per input record (same order)
    """
    fsm = create_fsm(fsm_config)
    try:
        # Validate each record in turn; order of results mirrors the input.
        return [fsm.validate(record) for record in data]
    finally:
        # Always release the FSM's event loop and resources.
        fsm.close()
def batch_process(
    fsm_config: str | Path | dict[str, Any],
    data: list[dict[str, Any] | Record],
    batch_size: int = 10,
    max_workers: int = 4,
    timeout: float | None = None
) -> list[dict[str, Any]]:
    """Process multiple records in parallel.

    Args:
        fsm_config: FSM configuration
        data: List of input records
        batch_size: Batch size for processing
        max_workers: Maximum parallel workers
        timeout: Optional timeout in seconds for entire batch processing

    Returns:
        List of processing results

    Raises:
        TimeoutError: If `timeout` elapses before the batch completes.
    """
    fsm = create_fsm(fsm_config)

    try:
        if timeout:
            # Use threading timeout for batch processing
            import concurrent.futures

            # NOTE: the previous `with ThreadPoolExecutor(...)` form was
            # broken for timeouts: __exit__ calls shutdown(wait=True),
            # blocking until the worker finishes, so the TimeoutError was
            # only raised *after* the batch completed anyway.  Shut down
            # without waiting instead so the timeout is enforced for real.
            executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(
                    fsm.process_batch,
                    data=data,
                    batch_size=batch_size,
                    max_workers=max_workers
                )
                return future.result(timeout=timeout)
            except concurrent.futures.TimeoutError as e:
                raise TimeoutError(f"Batch processing exceeded timeout of {timeout} seconds") from e
            finally:
                # Do not block on the worker; drop it if still pending.
                executor.shutdown(wait=False, cancel_futures=True)
        else:
            return fsm.process_batch(
                data=data,
                batch_size=batch_size,
                max_workers=max_workers
            )
    finally:
        fsm.close()