Coverage for src/dataknobs_fsm/api/simple.py: 21%

125 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:46 -0600

1"""Simple API for common FSM operations. 

2 

3This module provides a simplified synchronous interface for common FSM use cases, 

4wrapping the async-first AsyncSimpleFSM implementation. 

5""" 

6 

7import asyncio 

8import threading 

9from collections.abc import Callable 

10from pathlib import Path 

11from typing import Any 

12 

13from dataknobs_data import Record 

14 

15from ..core.data_modes import DataHandlingMode 

16from .async_simple import AsyncSimpleFSM 

17 

18 

class SimpleFSM:
    """Synchronous FSM interface wrapping AsyncSimpleFSM.

    This class provides a purely synchronous API for FSM operations,
    internally using AsyncSimpleFSM with a dedicated event loop. The loop
    runs ``run_forever`` in a daemon background thread; each synchronous
    entry point submits a coroutine to that loop via
    ``asyncio.run_coroutine_threadsafe`` and blocks until it completes.
    """

    def __init__(
        self,
        config: str | Path | dict[str, Any],
        data_mode: DataHandlingMode = DataHandlingMode.COPY,
        resources: dict[str, Any] | None = None,
        custom_functions: dict[str, Callable] | None = None
    ):
        """Initialize SimpleFSM from configuration.

        Args:
            config: Path to config file or config dictionary
            data_mode: Default data mode for processing
            resources: Optional resource configurations
            custom_functions: Optional custom functions to register
        """
        # Store data_mode for compatibility
        self.data_mode = data_mode

        # Create the async FSM that performs all actual processing.
        self._async_fsm = AsyncSimpleFSM(
            config=config,
            data_mode=data_mode,
            resources=resources,
            custom_functions=custom_functions
        )

        # Expose internal attributes for compatibility.
        # NOTE(review): this reaches into AsyncSimpleFSM's private
        # attributes, coupling the wrapper to its internals — any rename
        # there breaks this class.
        self._fsm = self._async_fsm._fsm
        self._resource_manager = self._async_fsm._resource_manager
        self._async_engine = self._async_fsm._async_engine

        # Create synchronous engine for compatibility.
        # Imported here (not at module top) — presumably to avoid a
        # circular import; verify before hoisting.
        from ..execution.engine import ExecutionEngine
        self._engine = ExecutionEngine(self._fsm)

        # Create a dedicated event loop for sync operations; it lives in
        # its own thread so sync callers can block without deadlocking it.
        self._loop: asyncio.AbstractEventLoop | None = None
        self._loop_thread: threading.Thread | None = None
        self._setup_event_loop()

    def _setup_event_loop(self) -> None:
        """Set up a dedicated event loop in a separate thread.

        The thread is a daemon so it never blocks interpreter shutdown;
        it runs the loop forever until close() stops it.
        """
        self._loop = asyncio.new_event_loop()

        def run_loop() -> None:
            asyncio.set_event_loop(self._loop)
            self._loop.run_forever()

        self._loop_thread = threading.Thread(target=run_loop, daemon=True)
        self._loop_thread.start()

    def _run_async(self, coro: Any) -> Any:
        """Run an async operation in the dedicated event loop.

        Args:
            coro: Coroutine to run

        Returns:
            The result of the coroutine

        Raises:
            RuntimeError: If the event loop could not be (re)created.
        """
        # Lazily recreate the loop if it was stopped (e.g. after close()).
        # NOTE(review): is_running() may still be False for an instant
        # after _setup_event_loop() returns (the thread has not yet
        # entered run_forever); run_coroutine_threadsafe tolerates this
        # by scheduling the coroutine for when the loop starts.
        if not self._loop or not self._loop.is_running():
            self._setup_event_loop()

        if self._loop is None:
            raise RuntimeError("Failed to setup event loop")

        # Submit to the loop thread and block this (caller's) thread
        # until the coroutine finishes.
        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        return future.result()

    def process(
        self,
        data: dict[str, Any] | Record,
        initial_state: str | None = None,
        timeout: float | None = None
    ) -> dict[str, Any]:
        """Process a single record through the FSM synchronously.

        Args:
            data: Input data to process
            initial_state: Optional starting state (defaults to FSM start state)
            timeout: Optional timeout in seconds

        Returns:
            Dict containing the processed result with fields:
            - final_state: Name of the final state reached
            - data: The transformed data
            - path: List of states traversed
            - success: Whether processing succeeded
            - error: Any error message (None if successful)
        """
        # Create the coroutine with the async process method
        async def _process() -> dict[str, Any]:
            # Import here to avoid circular dependency
            from ..core.context_factory import ContextFactory
            from ..core.modes import ProcessingMode
            from ..core.result_formatter import ResultFormatter

            # Convert to Record if needed
            if isinstance(data, dict):
                from dataknobs_data import Record
                record = Record(data)
            else:
                record = data

            # Create context
            context = ContextFactory.create_context(
                fsm=self._fsm,
                data=record,
                initial_state=initial_state,
                data_mode=ProcessingMode.SINGLE,
                resource_manager=self._resource_manager
            )

            try:
                # Execute FSM asynchronously
                success, result = await self._async_engine.execute(context)

                # Format result
                return ResultFormatter.format_single_result(
                    context=context,
                    success=success,
                    result=result
                )
            except asyncio.TimeoutError:
                # Return error result instead of raising.
                # NOTE(review): no asyncio timeout is applied around the
                # execute() await here, so this branch only fires if the
                # engine itself raises asyncio.TimeoutError — confirm.
                return ResultFormatter.format_error_result(
                    context=context,
                    error=TimeoutError(f"FSM execution exceeded timeout of {timeout} seconds")
                )
            except Exception as e:
                # Any other failure is folded into an error-result dict
                # rather than propagated to the caller.
                return ResultFormatter.format_error_result(
                    context=context,
                    error=e
                )

        if timeout:
            # Use threading for timeout support: the coroutine is driven
            # by _run_async from a worker thread, and this thread waits
            # on the worker with a deadline.
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(self._run_async, _process())
                try:
                    return future.result(timeout=timeout)
                except concurrent.futures.TimeoutError:
                    # NOTE(review): cancel() cannot stop a future whose
                    # callable is already running, and exiting the `with`
                    # block calls shutdown(wait=True) — so the timed-out
                    # work keeps running (and is waited on) before this
                    # error dict is returned. Confirm whether prompt
                    # return is required here.
                    future.cancel()
                    # Return an error result instead of raising
                    return {
                        'success': False,
                        'error': f"FSM execution exceeded timeout of {timeout} seconds",
                        'final_state': None,
                        'data': data if isinstance(data, dict) else data.data,
                        'path': []
                    }
        else:
            return self._run_async(_process())

    def process_batch(
        self,
        data: list[dict[str, Any] | Record],
        batch_size: int = 10,
        max_workers: int = 4,
        on_progress: Callable | None = None
    ) -> list[dict[str, Any]]:
        """Process multiple records in parallel batches synchronously.

        Thin blocking wrapper over AsyncSimpleFSM.process_batch.

        Args:
            data: List of input records to process
            batch_size: Number of records per batch
            max_workers: Maximum parallel workers
            on_progress: Optional callback for progress updates

        Returns:
            List of results for each input record
        """
        return self._run_async(
            self._async_fsm.process_batch(
                data=data,
                batch_size=batch_size,
                max_workers=max_workers,
                on_progress=on_progress
            )
        )

    def process_stream(
        self,
        source: str | Any,
        sink: str | None = None,
        chunk_size: int = 100,
        on_progress: Callable | None = None,
        input_format: str = 'auto',
        text_field_name: str = 'text',
        csv_delimiter: str = ',',
        csv_has_header: bool = True,
        skip_empty_lines: bool = True,
        use_streaming: bool = False
    ) -> dict[str, Any]:
        """Process a stream of data through the FSM synchronously.

        Args:
            source: Data source file path or async iterator
            sink: Optional output destination
            chunk_size: Size of processing chunks
            on_progress: Optional progress callback
            input_format: Input file format ('auto', 'jsonl', 'json', 'csv', 'text')
            text_field_name: Field name for text lines when converting to dict
            csv_delimiter: CSV delimiter character
            csv_has_header: Whether CSV file has header row
            skip_empty_lines: Skip empty lines in text files
            use_streaming: Use memory-efficient streaming for large files

        Returns:
            Dict containing stream processing statistics
        """
        # If source is a string (file path), use the async version directly
        if isinstance(source, str):
            return self._run_async(
                self._async_fsm.process_stream(
                    source=source,
                    sink=sink,
                    chunk_size=chunk_size,
                    on_progress=on_progress,
                    input_format=input_format,
                    text_field_name=text_field_name,
                    csv_delimiter=csv_delimiter,
                    csv_has_header=csv_has_header,
                    skip_empty_lines=skip_empty_lines,
                    use_streaming=use_streaming
                )
            )
        else:
            # Source is an async iterator; wrap the call in a coroutine
            # so the iterator is consumed inside the dedicated loop.
            # NOTE(review): both branches ultimately run the same call on
            # the same loop — the split looks redundant; confirm whether
            # the wrapper changes behavior for async iterators.
            async def _process() -> dict[str, Any]:
                return await self._async_fsm.process_stream(
                    source=source,
                    sink=sink,
                    chunk_size=chunk_size,
                    on_progress=on_progress,
                    input_format=input_format,
                    text_field_name=text_field_name,
                    csv_delimiter=csv_delimiter,
                    csv_has_header=csv_has_header,
                    skip_empty_lines=skip_empty_lines,
                    use_streaming=use_streaming
                )
            return self._run_async(_process())

    def validate(self, data: dict[str, Any] | Record) -> dict[str, Any]:
        """Validate data against FSM's start state schema synchronously.

        Args:
            data: Data to validate

        Returns:
            Dict containing validation results
        """
        return self._run_async(self._async_fsm.validate(data))

    def get_states(self) -> list[str]:
        """Get list of all state names in the FSM."""
        return self._async_fsm.get_states()

    def get_resources(self) -> list[str]:
        """Get list of registered resource names."""
        return self._async_fsm.get_resources()

    @property
    def config(self) -> Any:
        """Get the FSM configuration object."""
        return self._async_fsm._config

    def close(self) -> None:
        """Clean up resources and close connections synchronously.

        Closes the wrapped async FSM on the dedicated loop, then stops
        the loop and joins its thread (with a short timeout so close()
        cannot hang indefinitely).
        """
        self._run_async(self._async_fsm.close())

        # Shut down the event loop
        if self._loop and self._loop.is_running():
            self._loop.call_soon_threadsafe(self._loop.stop)
        if self._loop_thread and self._loop_thread.is_alive():
            # Daemon thread, so a missed join will not block shutdown.
            self._loop_thread.join(timeout=1.0)

    async def aclose(self) -> None:
        """Async version of close for use in async contexts.

        NOTE(review): unlike close(), this does not stop the dedicated
        loop thread — confirm that is intentional.
        """
        await self._async_fsm.close()

308 

309 

def create_fsm(
    config: str | Path | dict[str, Any],
    custom_functions: dict[str, Callable] | None = None,
    **kwargs
) -> SimpleFSM:
    """Build a ready-to-use SimpleFSM from a configuration.

    Convenience factory: equivalent to calling ``SimpleFSM`` directly,
    with ``custom_functions`` promoted to a named parameter.

    Args:
        config: Configuration file path or dictionary.
        custom_functions: Optional custom functions to register.
        **kwargs: Any further keyword arguments accepted by SimpleFSM
            (e.g. ``data_mode``, ``resources``).

    Returns:
        A configured SimpleFSM instance.
    """
    return SimpleFSM(config, custom_functions=custom_functions, **kwargs)

326 

327 

328# Convenience functions for common operations 

329 

def process_file(
    fsm_config: str | Path | dict[str, Any],
    input_file: str,
    output_file: str | None = None,
    input_format: str = 'auto',
    chunk_size: int = 1000,
    timeout: float | None = None,
    text_field_name: str = 'text',
    csv_delimiter: str = ',',
    csv_has_header: bool = True,
    skip_empty_lines: bool = True,
    use_streaming: bool = False
) -> dict[str, Any]:
    """Process a file through an FSM with automatic format detection.

    Args:
        fsm_config: FSM configuration
        input_file: Path to input file
        output_file: Optional output file path (format auto-detected from extension)
        input_format: Input format ('auto', 'jsonl', 'json', 'csv', 'text')
        chunk_size: Processing chunk size
        timeout: Optional timeout in seconds for processing
        text_field_name: Field name for text lines when converting to dict
        csv_delimiter: CSV delimiter character
        csv_has_header: Whether CSV file has header row
        skip_empty_lines: Skip empty lines in text files
        use_streaming: Use memory-efficient streaming for large files

    Returns:
        Processing statistics

    Raises:
        TimeoutError: If `timeout` is given and processing exceeds it.

    Examples:
        # Process plain text file
        results = process_file('config.yaml', 'input.txt', 'output.jsonl')

        # Process large CSV file with streaming
        results = process_file('config.yaml', 'large_data.csv', 'results.json', use_streaming=True)

        # Process with custom text field name
        results = process_file('config.yaml', 'input.txt', text_field_name='content')
    """
    fsm = create_fsm(fsm_config)

    # Single source of truth for the stream options; previously these
    # nine keyword arguments were duplicated verbatim in both the
    # timeout and no-timeout branches.
    stream_kwargs: dict[str, Any] = {
        'source': input_file,
        'sink': output_file,
        'chunk_size': chunk_size,
        'input_format': input_format,
        'text_field_name': text_field_name,
        'csv_delimiter': csv_delimiter,
        'csv_has_header': csv_has_header,
        'skip_empty_lines': skip_empty_lines,
        'use_streaming': use_streaming,
    }

    try:
        if timeout:
            # Use a worker thread so this thread can wait with a deadline.
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(fsm.process_stream, **stream_kwargs)
                try:
                    return future.result(timeout=timeout)
                except concurrent.futures.TimeoutError as e:
                    # cancel() cannot stop an already-running call, and
                    # exiting the `with` block calls shutdown(wait=True),
                    # so the TimeoutError is only raised to the caller
                    # once the worker finishes — TODO confirm whether a
                    # prompt (non-waiting) failure is required here.
                    future.cancel()
                    raise TimeoutError(f"File processing exceeded timeout of {timeout} seconds") from e
        return fsm.process_stream(**stream_kwargs)
    finally:
        # Always release the FSM's loop thread and resources.
        fsm.close()

411 

412 

def validate_data(
    fsm_config: str | Path | dict[str, Any],
    data: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """Validate multiple data records against FSM schema.

    Args:
        fsm_config: FSM configuration
        data: List of data records to validate

    Returns:
        List of validation results, one per input record (same order).
    """
    fsm = create_fsm(fsm_config)

    try:
        # Comprehension instead of the former append loop (same order,
        # same per-record validate() calls).
        return [fsm.validate(record) for record in data]
    finally:
        # Always release the FSM's loop thread and resources.
        fsm.close()

435 

436 

def batch_process(
    fsm_config: str | Path | dict[str, Any],
    data: list[dict[str, Any] | Record],
    batch_size: int = 10,
    max_workers: int = 4,
    timeout: float | None = None
) -> list[dict[str, Any]]:
    """Process multiple records in parallel.

    Args:
        fsm_config: FSM configuration
        data: List of input records
        batch_size: Batch size for processing
        max_workers: Maximum parallel workers
        timeout: Optional timeout in seconds for entire batch processing

    Returns:
        List of processing results

    Raises:
        TimeoutError: If `timeout` is given and the batch exceeds it.
    """
    fsm = create_fsm(fsm_config)

    batch_kwargs = {
        'data': data,
        'batch_size': batch_size,
        'max_workers': max_workers,
    }

    try:
        # Fast path: no deadline requested, run directly on this thread.
        if not timeout:
            return fsm.process_batch(**batch_kwargs)

        # Deadline requested: run the batch on a worker thread and wait
        # on it with a timeout.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(fsm.process_batch, **batch_kwargs)
            try:
                return future.result(timeout=timeout)
            except concurrent.futures.TimeoutError as e:
                future.cancel()
                raise TimeoutError(f"Batch processing exceeded timeout of {timeout} seconds") from e
    finally:
        # Always release the FSM's loop thread and resources.
        fsm.close()