Coverage for src/dataknobs_fsm/api/async_simple.py: 24%

126 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:46 -0600

1"""Async-first API for FSM operations. 

2 

3This module provides an async-first interface for FSM operations, 

4designed to work natively in async contexts without any asyncio.run() calls. 

5""" 

6 

7import asyncio 

8from collections.abc import AsyncIterator, Callable 

9from pathlib import Path 

10from typing import Any 

11 

12from dataknobs_data import Record 

13 

14from ..config.builder import FSMBuilder 

15from ..config.loader import ConfigLoader 

16from ..core.context_factory import ContextFactory 

17from ..core.data_modes import DataHandlingMode 

18from ..core.result_formatter import ResultFormatter 

19from ..execution.async_batch import AsyncBatchExecutor 

20from ..execution.async_engine import AsyncExecutionEngine 

21from ..execution.async_stream import AsyncStreamExecutor 

22from ..resources.manager import ResourceManager 

23from ..streaming.core import StreamConfig as CoreStreamConfig 

24 

25 

class AsyncSimpleFSM:
    """Async-first FSM interface for processing data.

    This class provides a fully asynchronous API for FSM operations,
    designed to work natively in async contexts without any
    ``asyncio.run()`` calls.
    """

    def __init__(
        self,
        config: str | Path | dict[str, Any],
        data_mode: DataHandlingMode = DataHandlingMode.COPY,
        resources: dict[str, Any] | None = None,
        custom_functions: dict[str, Callable] | None = None
    ):
        """Initialize AsyncSimpleFSM from configuration.

        Args:
            config: Path to config file or config dictionary
            data_mode: Default data mode for processing
            resources: Optional resource configurations
            custom_functions: Optional custom functions to register
        """
        self.data_mode = data_mode
        self._resources = resources or {}
        self._custom_functions = custom_functions or {}

        # Create loader with knowledge of custom functions so that
        # references to them in the configuration are recognized.
        loader = ConfigLoader()
        for name in self._custom_functions:
            loader.add_registered_function(name)

        # Load configuration from a file path or an in-memory dict.
        if isinstance(config, (str, Path)):
            self._config = loader.load_from_file(Path(config))
        else:
            self._config = loader.load_from_dict(config)

        # Build the FSM, registering custom functions with the builder
        # before building so the config can reference them.
        builder = FSMBuilder()
        for name, func in self._custom_functions.items():
            builder.register_function(name, func)
        self._fsm = builder.build(self._config)

        # Initialize resource manager and register configured resources.
        self._resource_manager = ResourceManager()
        self._setup_resources()

        # Create the async execution engine used by process().
        self._async_engine = AsyncExecutionEngine(self._fsm)

    def _setup_resources(self) -> None:
        """Set up resources from configuration.

        Resource-creation failures are deliberately swallowed: this is a
        simplified API, and a resource that fails to build should not
        prevent the FSM itself from being constructed.
        """
        # Register resources declared in the loaded configuration.
        if hasattr(self._config, 'resources'):
            for resource_config in self._config.resources:
                try:
                    provider = self._create_resource_provider(resource_config)
                    self._resource_manager.register_provider(resource_config.name, provider)
                except Exception:
                    # Best-effort: skip resources that fail to build.
                    pass

        # Register additional resources passed to the constructor.
        for name, resource_config in self._resources.items():
            try:
                # Use ResourceManager factory method.
                self._resource_manager.register_from_dict(name, resource_config)
            except Exception:
                # Best-effort: skip resources that fail to build.
                pass

    def _create_resource_provider(self, resource_config):
        """Create a resource provider from a ResourceConfig.

        Delegates to FSMBuilder (imported at module level) so resource
        construction matches the behavior of a fully built FSM.
        """
        builder = FSMBuilder()
        # NOTE(review): relies on FSMBuilder's private _create_resource;
        # consider exposing a public factory on FSMBuilder instead.
        return builder._create_resource(resource_config)

    @staticmethod
    def _to_record(data: dict[str, Any] | Record) -> Record:
        """Return *data* as a Record, wrapping plain dicts."""
        return Record(data) if isinstance(data, dict) else data

    async def process(self, data: dict[str, Any] | Record) -> dict[str, Any]:
        """Process a single record through the FSM asynchronously.

        Args:
            data: Input data to process

        Returns:
            Dict containing the processed result, or a formatted error
            result if execution raises.
        """
        record = self._to_record(data)

        # Local import, presumably to avoid a circular dependency at
        # module load time — confirm before hoisting to the top.
        from ..core.modes import ProcessingMode
        context = ContextFactory.create_context(
            fsm=self._fsm,
            data=record,
            data_mode=ProcessingMode.SINGLE,
            resource_manager=self._resource_manager
        )

        try:
            # Execute FSM asynchronously.
            success, result = await self._async_engine.execute(context)
            return ResultFormatter.format_single_result(
                context=context,
                success=success,
                result=result
            )
        except Exception as e:
            # Surface execution failures as a formatted error result
            # rather than propagating the exception to the caller.
            return ResultFormatter.format_error_result(
                context=context,
                error=e
            )

    @staticmethod
    def _format_batch_result(result) -> dict[str, Any]:
        """Convert one batch-executor result into the public dict shape."""
        if result.success:
            return {
                'final_state': result.metadata.get('final_state', 'unknown'),
                'data': result.result,
                'path': result.metadata.get('path', []),
                'success': True,
                'error': None
            }
        return {
            'final_state': result.metadata.get('final_state', None),
            'data': result.result if result.result else {},
            'path': result.metadata.get('path', []),
            'success': False,
            'error': str(result.error) if result.error else str(result.result)
        }

    async def process_batch(
        self,
        data: list[dict[str, Any] | Record],
        batch_size: int = 10,
        max_workers: int = 4,
        on_progress: Callable | None = None
    ) -> list[dict[str, Any]]:
        """Process multiple records in parallel batches asynchronously.

        Args:
            data: List of input records to process
            batch_size: Number of records per batch
            max_workers: Maximum parallel workers
            on_progress: Optional callback for progress updates

        Returns:
            List of results for each input record
        """
        batch_executor = AsyncBatchExecutor(
            fsm=self._fsm,
            parallelism=max_workers,
            batch_size=batch_size,
            progress_callback=on_progress
        )

        # Normalize inputs to Records, execute, and format each result.
        records = [self._to_record(item) for item in data]
        results = await batch_executor.execute_batch(items=records)
        return [self._format_batch_result(result) for result in results]

    async def process_stream(
        self,
        source: str | AsyncIterator[dict[str, Any]],
        sink: str | None = None,
        chunk_size: int = 100,
        on_progress: Callable | None = None,
        input_format: str = 'auto',
        text_field_name: str = 'text',
        csv_delimiter: str = ',',
        csv_has_header: bool = True,
        skip_empty_lines: bool = True,
        use_streaming: bool = False
    ) -> dict[str, Any]:
        """Process a stream of data through the FSM asynchronously.

        Args:
            source: Data source (file path or async iterator)
            sink: Optional output destination
            chunk_size: Size of processing chunks
            on_progress: Optional progress callback
            input_format: Input file format ('auto', 'jsonl', 'json', 'csv', 'text')
            text_field_name: Field name for text lines when converting to dict
            csv_delimiter: CSV delimiter character
            csv_has_header: Whether CSV file has header row
            skip_empty_lines: Skip empty lines in text files
            use_streaming: Use memory-efficient streaming for large files

        Returns:
            Dict containing stream processing statistics
        """
        # Configure streaming. NOTE(review): parallelism=4 and the
        # 1024 MB memory limit are hard-coded here — confirm these
        # defaults suit callers of this simplified API.
        stream_config = CoreStreamConfig(
            chunk_size=chunk_size,
            parallelism=4,
            memory_limit_mb=1024
        )

        # Create async stream executor.
        stream_executor = AsyncStreamExecutor(
            fsm=self._fsm,
            stream_config=stream_config,
            progress_callback=on_progress
        )

        # Choose between memory-efficient streaming and regular mode.
        if use_streaming and isinstance(source, str):
            # Streaming path: reads/writes files without loading whole
            # chunks into memory. Only available for file-path sources.
            from ..utils.streaming_file_utils import (
                create_streaming_file_reader,
                create_streaming_file_writer,
            )

            stream_source = create_streaming_file_reader(
                file_path=source,
                config=stream_config,
                input_format=input_format,
                text_field_name=text_field_name,
                csv_delimiter=csv_delimiter,
                csv_has_header=csv_has_header,
                skip_empty_lines=skip_empty_lines
            )

            # Handle sink for streaming mode.
            sink_func = None
            cleanup_func = None
            if sink:
                sink_func, cleanup_func = await create_streaming_file_writer(
                    file_path=sink,
                    config=stream_config
                )
        else:
            # Regular mode (loads full chunks into memory).
            from ..utils.file_utils import create_file_reader, create_file_writer

            if isinstance(source, str):
                stream_source = create_file_reader(
                    file_path=source,
                    input_format=input_format,
                    text_field_name=text_field_name,
                    csv_delimiter=csv_delimiter,
                    csv_has_header=csv_has_header,
                    skip_empty_lines=skip_empty_lines
                )
            else:
                # Source is already an async iterator.
                stream_source = source

            # Handle sink for regular mode.
            sink_func = None
            cleanup_func = None
            if sink:
                sink_func, cleanup_func = create_file_writer(sink)

        try:
            # Execute stream using the async executor.
            result = await stream_executor.execute_stream(
                source=stream_source,
                sink=sink_func,
                chunk_size=chunk_size
            )

            return {
                'total_processed': result.total_processed,
                'successful': result.successful,
                'failed': result.failed,
                'duration': result.duration,
                'throughput': result.throughput
            }
        finally:
            # Clean up writer resources (e.g. close files); the cleanup
            # callable may be either sync or async.
            if cleanup_func:
                if asyncio.iscoroutinefunction(cleanup_func):
                    await cleanup_func()
                else:
                    cleanup_func()

    async def validate(self, data: dict[str, Any] | Record) -> dict[str, Any]:
        """Validate data against the FSM's start state schema asynchronously.

        Args:
            data: Data to validate

        Returns:
            Dict with 'valid' (bool) and 'errors' (list) keys. A start
            state without a schema validates trivially.
        """
        record = self._to_record(data)

        start_state = self._fsm.get_start_state()

        if start_state.schema:
            validation_result = start_state.schema.validate(record)
            return {
                'valid': validation_result.valid,
                'errors': validation_result.errors if not validation_result.valid else []
            }
        return {
            'valid': True,
            'errors': []
        }

    def get_states(self) -> list[str]:
        """Get list of all state names in the FSM.

        The FSM holds networks, and each network holds states; names are
        collected across all networks.
        """
        return [
            state.name
            for network in self._fsm.networks.values()
            for state in network.states.values()
        ]

    def get_resources(self) -> list[str]:
        """Get list of registered resource names."""
        # NOTE(review): reaches into ResourceManager's private
        # _resources dict — consider a public accessor on the manager.
        return list(self._resource_manager._resources.keys())

    @property
    def config(self) -> Any:
        """Get the FSM configuration object."""
        return self._config

    async def close(self) -> None:
        """Clean up resources and close connections asynchronously."""
        await self._resource_manager.cleanup()

    # Alias for consistency with other async libraries
    aclose = close

380 

381 

# Factory function for AsyncSimpleFSM
async def create_async_fsm(
    config: str | Path | dict[str, Any],
    custom_functions: dict[str, Callable] | None = None,
    **kwargs
) -> AsyncSimpleFSM:
    """Build and return an AsyncSimpleFSM.

    Args:
        config: Configuration file path or dictionary
        custom_functions: Optional custom functions to register
        **kwargs: Additional arguments passed to AsyncSimpleFSM

    Returns:
        Configured AsyncSimpleFSM instance
    """
    fsm = AsyncSimpleFSM(config, custom_functions=custom_functions, **kwargs)
    return fsm