Coverage for src/dataknobs_fsm/storage/database.py: 14%

177 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:46 -0600

1"""Database storage backend for execution history using dataknobs_data. 

2 

3This module provides a unified storage backend that works with ANY dataknobs_data 

4database backend (SQLite, PostgreSQL, MongoDB, Elasticsearch, S3, etc.) through 

5the common AsyncDatabase interface. 

6""" 

7 

8import time 

9import uuid 

10from typing import Any, Dict, List, TYPE_CHECKING 

11 

12from dataknobs_data.records import Record 

13from dataknobs_data.query import Query 

14from dataknobs_data.schema import DatabaseSchema, FieldSchema 

15 

16if TYPE_CHECKING: 

17 from dataknobs_data.database import AsyncDatabase 

18 

19from dataknobs_fsm.core.data_modes import DataHandlingMode 

20from dataknobs_fsm.execution.history import ExecutionHistory, ExecutionStep, ExecutionStatus 

21from dataknobs_fsm.storage.base import BaseHistoryStorage, StorageBackend, StorageConfig, StorageFactory 

22 

23 

class UnifiedDatabaseStorage(BaseHistoryStorage):
    """Unified database storage that works with any dataknobs_data backend.

    This single implementation works with:
    - Memory (AsyncMemoryDatabase)
    - SQLite (AsyncSQLiteDatabase)
    - PostgreSQL (AsyncPostgresDatabase)
    - MongoDB (AsyncMongoDatabase)
    - Elasticsearch (AsyncElasticsearchDatabase)
    - S3 (AsyncS3Database)
    - File (AsyncFileDatabase)

    All through the same AsyncDatabase interface from dataknobs_data.
    """

38 

    def __init__(self, config: StorageConfig):
        """Initialize database storage.

        Args:
            config: Storage configuration with backend type in connection_params.
        """
        super().__init__(config)
        # Primary database handle; created lazily by _setup_backend().
        self._db: AsyncDatabase | None = None
        # Separate DB for steps if needed (currently aliased to _db in
        # _setup_backend, so both point at the same backend instance).
        self._steps_db: AsyncDatabase | None = None

48 

    async def _setup_backend(self) -> None:
        """Set up the database backend using dataknobs_data factory.

        Reads the backend type from ``config.connection_params['type']``
        (defaulting to ``'memory'``), creates an AsyncDatabase through
        AsyncDatabaseFactory with the history schema attached, connects it
        when the backend supports it, and reuses the same instance for steps.
        """
        # Extract backend type from config.
        backend_type = self.config.connection_params.get('type', 'memory')

        # Prepare dataknobs_data configuration; the history schema is attached
        # so the backend can create its table/collection accordingly.
        db_config = {
            **self.config.connection_params,
            'schema': self._create_history_schema()
        }

        # Remove 'type' as it's not needed by dataknobs_data.
        db_config.pop('type', None)

        # Local import — presumably deferred to avoid an import cycle or a
        # hard dependency at module load; TODO confirm.
        from dataknobs_data.factory import AsyncDatabaseFactory
        factory = AsyncDatabaseFactory()

        # The factory expects 'backend' not 'type'.
        db_config['backend'] = backend_type

        self._db = factory.create(**db_config)

        # Connect to the database if it has a connect method (not every
        # backend exposes one).
        if hasattr(self._db, 'connect'):
            await self._db.connect()

        # For steps, use the same database instance.
        # Different backends handle collections/tables differently.
        # NOTE(review): _create_steps_schema() is defined but never applied
        # here — step records share the history-configured backend. Confirm
        # whether a separate steps collection/schema was intended.
        self._steps_db = self._db

79 

80 def _create_history_schema(self) -> DatabaseSchema: 

81 """Create schema for history records.""" 

82 from dataknobs_data.fields import FieldType 

83 

84 schema = DatabaseSchema() 

85 

86 # Core fields 

87 schema.add_field(FieldSchema( 

88 name='id', 

89 type=FieldType.TEXT, 

90 metadata={'primary_key': True} 

91 )) 

92 schema.add_field(FieldSchema( 

93 name='execution_id', 

94 type=FieldType.TEXT, 

95 metadata={'indexed': True, 'unique': True} 

96 )) 

97 schema.add_field(FieldSchema( 

98 name='fsm_name', 

99 type=FieldType.TEXT, 

100 metadata={'indexed': True} 

101 )) 

102 schema.add_field(FieldSchema( 

103 name='data_mode', 

104 type=FieldType.TEXT, 

105 metadata={'indexed': True} 

106 )) 

107 schema.add_field(FieldSchema( 

108 name='status', 

109 type=FieldType.TEXT, 

110 metadata={'indexed': True} 

111 )) 

112 

113 # Timing fields 

114 schema.add_field(FieldSchema( 

115 name='start_time', 

116 type=FieldType.FLOAT, 

117 metadata={'indexed': True} 

118 )) 

119 schema.add_field(FieldSchema( 

120 name='end_time', 

121 type=FieldType.FLOAT, 

122 required=False 

123 )) 

124 

125 # Metrics 

126 schema.add_field(FieldSchema( 

127 name='total_steps', 

128 type=FieldType.INTEGER, 

129 default=0 

130 )) 

131 schema.add_field(FieldSchema( 

132 name='failed_steps', 

133 type=FieldType.INTEGER, 

134 default=0 

135 )) 

136 schema.add_field(FieldSchema( 

137 name='skipped_steps', 

138 type=FieldType.INTEGER, 

139 default=0 

140 )) 

141 

142 # JSON data fields 

143 schema.add_field(FieldSchema( 

144 name='history_data', 

145 type=FieldType.JSON 

146 )) 

147 schema.add_field(FieldSchema( 

148 name='metadata', 

149 type=FieldType.JSON 

150 )) 

151 

152 # Timestamps 

153 schema.add_field(FieldSchema( 

154 name='created_at', 

155 type=FieldType.FLOAT 

156 )) 

157 schema.add_field(FieldSchema( 

158 name='updated_at', 

159 type=FieldType.FLOAT 

160 )) 

161 

162 return schema 

163 

164 def _create_steps_schema(self) -> DatabaseSchema: 

165 """Create schema for step records.""" 

166 from dataknobs_data.fields import FieldType 

167 

168 schema = DatabaseSchema() 

169 

170 schema.add_field(FieldSchema( 

171 name='id', 

172 type=FieldType.TEXT, 

173 metadata={'primary_key': True} 

174 )) 

175 schema.add_field(FieldSchema( 

176 name='execution_id', 

177 type=FieldType.TEXT, 

178 metadata={'indexed': True} 

179 )) 

180 schema.add_field(FieldSchema( 

181 name='step_id', 

182 type=FieldType.TEXT, 

183 metadata={'indexed': True} 

184 )) 

185 schema.add_field(FieldSchema( 

186 name='parent_id', 

187 type=FieldType.TEXT, 

188 required=False 

189 )) 

190 schema.add_field(FieldSchema( 

191 name='state_name', 

192 type=FieldType.TEXT, 

193 metadata={'indexed': True} 

194 )) 

195 schema.add_field(FieldSchema( 

196 name='network_name', 

197 type=FieldType.TEXT 

198 )) 

199 schema.add_field(FieldSchema( 

200 name='status', 

201 type=FieldType.TEXT, 

202 metadata={'indexed': True} 

203 )) 

204 schema.add_field(FieldSchema( 

205 name='timestamp', 

206 type=FieldType.FLOAT, 

207 metadata={'indexed': True} 

208 )) 

209 schema.add_field(FieldSchema( 

210 name='step_data', 

211 type=FieldType.JSON 

212 )) 

213 

214 return schema 

215 

216 async def save_history( 

217 self, 

218 history: ExecutionHistory, 

219 metadata: Dict[str, Any] | None = None 

220 ) -> str: 

221 """Save execution history to database.""" 

222 if not self._db: 

223 await self.initialize() 

224 

225 history_id = history.execution_id 

226 

227 # Serialize history based on data mode 

228 history_data = self._serialize_history(history) 

229 

230 # Create record using dataknobs_data Record 

231 record = Record({ 

232 'id': str(uuid.uuid4()), 

233 'execution_id': history_id, 

234 'fsm_name': history.fsm_name, 

235 'data_mode': history.data_mode.value, 

236 'status': 'completed' if history.end_time else 'in_progress', 

237 'start_time': history.start_time, 

238 'end_time': history.end_time, 

239 'total_steps': history.total_steps, 

240 'failed_steps': history.failed_steps, 

241 'skipped_steps': history.skipped_steps, 

242 'history_data': history_data, 

243 'metadata': metadata or {}, 

244 'created_at': time.time(), 

245 'updated_at': time.time() 

246 }) 

247 

248 # Save using dataknobs_data interface - just pass the record 

249 await self._db.upsert(record) 

250 

251 return history_id 

252 

253 async def load_history(self, history_id: str) -> ExecutionHistory | None: 

254 """Load execution history from database.""" 

255 if not self._db: 

256 await self.initialize() 

257 

258 # Query using dataknobs_data Query builder 

259 query = Query().filter('execution_id', '=', history_id) 

260 

261 # Find record 

262 results = await self._db.search(query) 

263 record = results[0] if results else None 

264 

265 if not record: 

266 return None 

267 

268 # Deserialize history 

269 history = self._deserialize_history( 

270 record['history_data'], 

271 record['fsm_name'], 

272 history_id 

273 ) 

274 

275 return history 

276 

277 async def save_step( 

278 self, 

279 execution_id: str, 

280 step: ExecutionStep, 

281 parent_id: str | None = None 

282 ) -> str: 

283 """Save a single execution step.""" 

284 if not self._steps_db: 

285 await self.initialize() 

286 

287 # Create step record 

288 record = Record({ 

289 'id': str(uuid.uuid4()), 

290 'execution_id': execution_id, 

291 'step_id': step.step_id, 

292 'parent_id': parent_id, 

293 'state_name': step.state_name, 

294 'network_name': step.network_name, 

295 'status': step.status.value, 

296 'timestamp': step.timestamp, 

297 'step_data': step.to_dict() 

298 }) 

299 

300 await self._steps_db.upsert(record) 

301 return step.step_id 

302 

303 async def load_steps( 

304 self, 

305 execution_id: str, 

306 filters: Dict[str, Any] | None = None 

307 ) -> List[ExecutionStep]: 

308 """Load execution steps from database.""" 

309 if not self._steps_db: 

310 await self.initialize() 

311 

312 # Build query 

313 query = Query().filter('execution_id', '=', execution_id) 

314 

315 if filters: 

316 for key, value in filters.items(): 

317 query = query.filter(key, '=', value) 

318 

319 # Load and reconstruct steps 

320 steps = [] 

321 results = await self._steps_db.search(query) 

322 for record in results: 

323 step_data = record['step_data'] 

324 

325 step = ExecutionStep( 

326 step_id=step_data['step_id'], 

327 state_name=step_data['state_name'], 

328 network_name=step_data['network_name'], 

329 timestamp=step_data['timestamp'], 

330 data_mode=DataHandlingMode(step_data['data_mode']), 

331 status=ExecutionStatus(step_data['status']) 

332 ) 

333 

334 # Restore other properties 

335 for attr in ['start_time', 'end_time', 'arc_taken', 'metrics', 

336 'resource_usage', 'stream_progress', 'chunks_processed', 

337 'records_processed']: 

338 if attr in step_data: 

339 setattr(step, attr, step_data[attr]) 

340 

341 if step_data.get('error'): 

342 step.error = Exception(step_data['error']) 

343 

344 steps.append(step) 

345 

346 return steps 

347 

348 async def query_histories( 

349 self, 

350 filters: Dict[str, Any], 

351 limit: int = 100, 

352 offset: int = 0 

353 ) -> List[Dict[str, Any]]: 

354 """Query execution histories.""" 

355 if not self._db: 

356 await self.initialize() 

357 

358 # Build query using dataknobs_data Query 

359 query = Query() 

360 

361 # Map filter keys to database fields 

362 for key, value in filters.items(): 

363 if key in ['fsm_name', 'data_mode', 'status']: 

364 query = query.filter(key, '=', value) 

365 elif key == 'start_time_after': 

366 query = query.filter('start_time', '>=', value) 

367 elif key == 'start_time_before': 

368 query = query.filter('start_time', '<=', value) 

369 elif key == 'failed': 

370 if value: 

371 query = query.filter('failed_steps', '>', 0) 

372 else: 

373 query = query.filter('failed_steps', '=', 0) 

374 

375 # Apply pagination 

376 query = query.sort_by('start_time', 'desc').limit(limit).offset(offset) 

377 

378 # Execute and return results 

379 results = [] 

380 search_results = await self._db.search(query) 

381 for record in search_results: 

382 results.append({ 

383 'id': record['execution_id'], 

384 'fsm_name': record['fsm_name'], 

385 'data_mode': record['data_mode'], 

386 'status': record['status'], 

387 'start_time': record['start_time'], 

388 'end_time': record.get_value('end_time'), 

389 'total_steps': record['total_steps'], 

390 'failed_steps': record['failed_steps'], 

391 'metadata': record.get_value('metadata', {}) 

392 }) 

393 

394 return results 

395 

396 async def delete_history(self, history_id: str) -> bool: 

397 """Delete execution history.""" 

398 if not self._db: 

399 await self.initialize() 

400 

401 # Find and delete history records 

402 query = Query().filter('execution_id', '=', history_id) 

403 records = await self._db.search(query) 

404 

405 deleted_count = 0 

406 for record in records: 

407 # Get the storage ID from the record 

408 record_id = record.storage_id or record.get_value('id') 

409 if record_id and await self._db.delete(record_id): 

410 deleted_count += 1 

411 

412 # Delete associated steps 

413 if self._steps_db: 

414 step_query = Query().filter('execution_id', '=', history_id) 

415 step_records = await self._steps_db.search(step_query) 

416 for step_record in step_records: 

417 step_id = step_record.storage_id or step_record.get_value('id') 

418 if step_id: 

419 await self._steps_db.delete(step_id) 

420 

421 return deleted_count > 0 

422 

423 async def get_statistics( 

424 self, 

425 execution_id: str | None = None 

426 ) -> Dict[str, Any]: 

427 """Get storage statistics.""" 

428 if not self._db: 

429 await self.initialize() 

430 

431 if execution_id: 

432 # Specific execution stats 

433 query = Query().filter('execution_id', '=', execution_id) 

434 

435 search_results = await self._db.search(query) 

436 for record in search_results: 

437 return { 

438 'execution_id': execution_id, 

439 'fsm_name': record['fsm_name'], 

440 'data_mode': record['data_mode'], 

441 'status': record['status'], 

442 'total_steps': record['total_steps'], 

443 'failed_steps': record['failed_steps'], 

444 'start_time': record['start_time'], 

445 'end_time': record.get_value('end_time') 

446 } 

447 return {} 

448 else: 

449 # Overall stats 

450 stats = { 

451 'total_histories': 0, 

452 'mode_distribution': {}, 

453 'status_distribution': {}, 

454 'backend_type': self.config.connection_params.get('type', 'unknown') 

455 } 

456 

457 all_records = await self._db.search(Query()) 

458 for record in all_records: 

459 stats['total_histories'] += 1 

460 

461 mode = record['data_mode'] 

462 stats['mode_distribution'][mode] = stats['mode_distribution'].get(mode, 0) + 1 

463 

464 status = record['status'] 

465 stats['status_distribution'][status] = stats['status_distribution'].get(status, 0) + 1 

466 

467 return stats 

468 

469 async def cleanup( 

470 self, 

471 before_timestamp: float | None = None, 

472 keep_failed: bool = True 

473 ) -> int: 

474 """Clean up old histories.""" 

475 if not self._db: 

476 await self.initialize() 

477 

478 if before_timestamp is None: 

479 before_timestamp = time.time() - (7 * 86400) # 7 days 

480 

481 # Build query 

482 query = Query().filter('start_time', '<', before_timestamp) 

483 

484 if keep_failed: 

485 query = query.filter('failed_steps', '=', 0) 

486 

487 # Get histories to delete 

488 to_delete = [] 

489 search_results = await self._db.search(query) 

490 for record in search_results: 

491 to_delete.append(record['execution_id']) 

492 

493 # Delete each 

494 deleted = 0 

495 for history_id in to_delete: 

496 if await self.delete_history(history_id): 

497 deleted += 1 

498 

499 # Close database connection if supported 

500 if hasattr(self._db, 'close'): 

501 await self._db.close() 

502 

503 return deleted 

504 

505 

# Register all backends with the same implementation.
# NOTE(review): the class docstring also advertises a File backend
# (AsyncFileDatabase), but no StorageBackend.FILE is registered here —
# confirm whether it exists and should be included.
for backend in [StorageBackend.MEMORY, StorageBackend.SQLITE,
                StorageBackend.POSTGRES, StorageBackend.MONGODB,
                StorageBackend.ELASTICSEARCH, StorageBackend.S3]:
    StorageFactory.register(backend, UnifiedDatabaseStorage)