Coverage for src/dataknobs_fsm/streaming/file_stream.py: 15%

281 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 16:46 -0600

1"""File streaming implementation for FSM.""" 

2 

3import csv 

4import gzip 

5import io 

6import json 

7import logging 

8import os 

9from pathlib import Path 

10from typing import Any, Dict, Iterator, List, Union, TextIO, BinaryIO 

11 

12from dataknobs_fsm.streaming.core import ( 

13 IStreamSink, 

14 IStreamSource, 

15 StreamChunk, 

16) 

17 

18logger = logging.getLogger(__name__) 

19 

20 

class FileFormat:
    """Supported file formats."""
    JSON = "json"
    JSONL = "jsonl"
    CSV = "csv"
    TEXT = "text"
    BINARY = "binary"

    @staticmethod
    def detect(file_path: Path) -> str:
        """Detect format from file extension.

        Compression suffixes (``.gz``/``.gzip``) are looked past, so that
        e.g. ``data.jsonl.gz`` is detected as JSONL rather than BINARY
        (the source layer decompresses transparently).

        Args:
            file_path: Path to file.

        Returns:
            Detected format string.
        """
        suffix = file_path.suffix.lower()
        # Peel off a compression extension to find the real format
        # suffix, e.g. 'data.csv.gz' -> '.csv'.
        if suffix in ('.gz', '.gzip'):
            inner = Path(file_path.stem).suffix.lower()
            if inner:
                suffix = inner
        if suffix == '.json':
            return FileFormat.JSON
        elif suffix in ('.jsonl', '.ndjson'):
            return FileFormat.JSONL
        elif suffix in ('.csv', '.tsv'):
            return FileFormat.CSV
        elif suffix in ('.txt', '.text', '.log'):
            return FileFormat.TEXT
        else:
            return FileFormat.BINARY

50 

51 

class CompressionFormat:
    """Supported compression formats."""
    NONE = "none"
    GZIP = "gzip"

    @staticmethod
    def detect(file_path: Path) -> str:
        """Detect compression from file extension.

        Args:
            file_path: Path to file.

        Returns:
            Detected compression format.
        """
        is_gzip = file_path.suffix.lower() in ('.gz', '.gzip')
        return CompressionFormat.GZIP if is_gzip else CompressionFormat.NONE

70 

71 

class FileStreamSource(IStreamSource):
    """File-based stream source with format detection and decompression.

    This source supports reading files in chunks with automatic
    format detection, decompression, and progress tracking.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        format: str | None = None,
        compression: str | None = None,
        chunk_size: int = 1000,
        encoding: str = 'utf-8'
    ):
        """Initialize file stream source.

        Args:
            file_path: Path to source file.
            format: File format (auto-detected if None).
            compression: Compression format (auto-detected if None).
            chunk_size: Number of items per chunk.
            encoding: Text encoding for text formats.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        self.file_path = Path(file_path)
        self.format = format or FileFormat.detect(self.file_path)
        self.compression = compression or CompressionFormat.detect(self.file_path)
        self.chunk_size = chunk_size
        self.encoding = encoding

        self._file_handle: Union[TextIO, BinaryIO, Any, None] = None
        self._reader: Any | None = None
        self._chunk_count = 0
        self._item_count = 0
        self._bytes_read = 0
        # NOTE(review): for gzip input this is the *compressed* size while
        # tell() on a gzip text handle reports the decompressed offset, so
        # the reported progress is approximate (and may exceed 1.0).
        self._file_size = self.file_path.stat().st_size if self.file_path.exists() else 0

        self._open_file()

    def _open_file(self) -> None:
        """Open the file with appropriate decompression and build the reader."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        # Open with decompression if needed.
        if self.compression == CompressionFormat.GZIP:
            self._file_handle = gzip.open(self.file_path, 'rt', encoding=self.encoding)
        elif self.format == FileFormat.BINARY:
            self._file_handle = open(self.file_path, 'rb')
        else:
            self._file_handle = open(self.file_path, encoding=self.encoding)

        # Set up format-specific reader.
        if self.format == FileFormat.CSV:
            # Use a tab delimiter for .tsv files; previously they were
            # parsed with the default comma and came back as one column.
            delimiter = '\t' if '.tsv' in [s.lower() for s in self.file_path.suffixes] else ','
            self._reader = csv.DictReader(self._file_handle, delimiter=delimiter)  # type: ignore
        elif self.format == FileFormat.JSON:
            # Load the entire JSON file: a top-level array streams element
            # by element, any other top-level value yields a single item.
            content = self._file_handle.read()  # type: ignore
            data = json.loads(content)
            if isinstance(data, list):
                self._reader = iter(data)
            else:
                self._reader = iter([data])
        else:
            # JSONL and TEXT iterate lines; BINARY reads raw chunks.
            self._reader = self._file_handle

    def read_chunk(self) -> StreamChunk | None:
        """Read next chunk from file.

        Returns:
            StreamChunk with file data, or None if exhausted. On error a
            final empty chunk is returned with the error in its metadata.
        """
        if self._reader is None:
            return None

        chunk_data = []

        try:
            # Read up to chunk_size items.
            for _ in range(self.chunk_size):
                item = self._read_next_item()
                if item is None:
                    break
                chunk_data.append(item)
                self._item_count += 1

            if not chunk_data:
                return None

            # Calculate progress as a byte-offset fraction when the
            # handle supports tell().
            progress = 0.0
            if self._file_size > 0 and self._file_handle:
                if hasattr(self._file_handle, 'tell'):
                    try:
                        current_pos = self._file_handle.tell()
                        progress = current_pos / self._file_size
                    except (OSError, io.UnsupportedOperation):
                        pass

            chunk = StreamChunk(
                data=chunk_data,
                sequence_number=self._chunk_count,
                metadata={
                    'file_path': str(self.file_path),
                    'format': self.format,
                    'progress': progress,
                    'item_count': len(chunk_data)
                },
                # A short chunk means the underlying reader ran dry.
                is_last=len(chunk_data) < self.chunk_size
            )

            self._chunk_count += 1
            return chunk

        except Exception as e:
            # Surface the failure to the consumer as a terminal chunk,
            # and log it so it is not invisible.
            logger.error(f"Error reading chunk from {self.file_path}: {e}")
            return StreamChunk(
                data=[],
                sequence_number=self._chunk_count,
                metadata={'error': str(e)},
                is_last=True
            )

    def _read_next_item(self) -> Any | None:
        """Read next item based on format.

        Returns:
            Next item, or None if exhausted (or unreadable).
        """
        if self._reader is None:
            return None
        try:
            if self.format == FileFormat.JSONL:
                # Skip blank lines: previously a blank line made
                # json.loads('') raise, which the blanket except swallowed,
                # silently truncating the stream.
                for line in self._reader:
                    stripped = line.strip()
                    if stripped:
                        return json.loads(stripped)
                return None
            elif self.format == FileFormat.BINARY:
                # Read in 4KB chunks for binary.
                data = self._reader.read(4096)
                return data if data else None
            else:
                # CSV rows, text lines and JSON-array elements all come
                # from an iterator.
                return next(self._reader, None)
        except StopIteration:
            return None
        except Exception as e:
            # Best-effort: treat unreadable data as end of stream, but log
            # instead of silently dropping the error.
            logger.warning(f"Error reading item from {self.file_path}: {e}")
            return None

    def __iter__(self) -> Iterator[StreamChunk]:
        """Iterate over all chunks until the source is exhausted."""
        while True:
            chunk = self.read_chunk()
            if chunk is None:
                break
            yield chunk

    def close(self) -> None:
        """Close the file handle and release the reader."""
        if self._file_handle:
            self._file_handle.close()
            self._file_handle = None
            self._reader = None

245 

246 

class FileStreamSink(IStreamSink):
    """File-based stream sink with format serialization and compression.

    This sink supports writing data chunks to files with automatic
    format serialization, compression, and atomic writes.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        format: str | None = None,
        compression: str | None = None,
        encoding: str = 'utf-8',
        atomic: bool = True,
        append: bool = False
    ):
        """Initialize file stream sink.

        Args:
            file_path: Path to target file.
            format: File format (auto-detected if None).
            compression: Compression format (auto-detected if None).
            encoding: Text encoding for text formats.
            atomic: Use atomic writes (write to temp then rename).
            append: Append to existing file instead of overwriting.
        """
        self.file_path = Path(file_path)
        self.format = format or FileFormat.detect(self.file_path)
        self.compression = compression or CompressionFormat.detect(self.file_path)
        self.encoding = encoding
        self.atomic = atomic
        self.append = append

        self._file_handle: Any | None = None
        self._writer: Any | None = None
        self._temp_path: Path | None = None
        self._chunk_count = 0
        self._item_count = 0
        self._bytes_written = 0
        # JSON items are buffered here and serialized once at flush/close.
        self._buffer: List[Any] = []

        self._open_file()

    def _open_file(self) -> None:
        """Open the target (or temp) file and set up the writer."""
        # Create parent directories.
        self.file_path.parent.mkdir(parents=True, exist_ok=True)

        # Use a temp file for atomic writes (append cannot be atomic).
        if self.atomic and not self.append:
            self._temp_path = self.file_path.with_suffix(
                self.file_path.suffix + '.tmp'
            )
            target_path = self._temp_path
        else:
            target_path = self.file_path

        # Base mode only; a 'b'/'t' qualifier is appended exactly once
        # below. The previous code pre-baked 'ab' for binary append,
        # which then became the invalid open mode 'abb'.
        mode = 'a' if self.append else 'w'

        if self.compression == CompressionFormat.GZIP:
            if self.format == FileFormat.BINARY:
                self._file_handle = gzip.open(str(target_path), mode + 'b')
            else:
                self._file_handle = gzip.open(
                    str(target_path),
                    mode + 't',
                    encoding=self.encoding
                )
        elif self.format == FileFormat.BINARY:
            self._file_handle = open(str(target_path), mode + 'b')
        else:
            self._file_handle = open(str(target_path), mode, encoding=self.encoding)

        # Set up format-specific writer.
        if self.format == FileFormat.CSV:
            # CSV writer is initialized lazily, once fieldnames are known.
            self._writer = None
        elif self.format == FileFormat.JSON:
            # JSON buffers everything for a single final dump.
            self._buffer = []
        else:
            self._writer = self._file_handle

    def write_chunk(self, chunk: StreamChunk) -> bool:
        """Write chunk to file.

        Args:
            chunk: Chunk to write.

        Returns:
            True if successful (an empty chunk counts as success),
            False on error or if the sink is closed.
        """
        if self._file_handle is None:
            return False

        try:
            if not chunk.data:
                return True

            if self.format == FileFormat.CSV:
                self._write_csv_chunk(chunk.data)
            elif self.format == FileFormat.JSON:
                # Buffer for the final single-document write.
                if isinstance(chunk.data, list):
                    self._buffer.extend(chunk.data)
                else:
                    self._buffer.append(chunk.data)
            elif self.format == FileFormat.JSONL:
                for item in chunk.data:
                    self._file_handle.write(json.dumps(item) + '\n')
            elif self.format == FileFormat.TEXT:
                for item in chunk.data:
                    if item is not None:
                        self._file_handle.write(str(item))
                        # Keep one item per line.
                        if not str(item).endswith('\n'):
                            self._file_handle.write('\n')
            elif self.format == FileFormat.BINARY:
                for item in chunk.data:
                    if isinstance(item, bytes):
                        self._file_handle.write(item)
                    else:
                        self._file_handle.write(str(item).encode(self.encoding))
            else:
                # Default: one stringified item per line.
                for item in chunk.data:
                    self._file_handle.write(str(item) + '\n')

            self._chunk_count += 1
            self._item_count += len(chunk.data) if isinstance(chunk.data, list) else 1

            return True

        except Exception as e:
            logger.error(f"Error writing chunk: {e}")
            return False

    def _write_csv_chunk(self, data: List[Dict[str, Any]]) -> None:
        """Write CSV rows, deriving fieldnames from the first row seen.

        Args:
            data: List of dictionaries to write.
        """
        if not data:
            return

        # Initialize CSV writer (and header) on first write.
        if self._writer is None:
            fieldnames = list(data[0].keys())
            self._writer = csv.DictWriter(
                self._file_handle,  # type: ignore
                fieldnames=fieldnames
            )
            # NOTE(review): when appending, this condition is still true on
            # the first chunk (_chunk_count == 0), so a header is written
            # even into a file that may already have one — confirm intended.
            if not self.append or self._chunk_count == 0:
                self._writer.writeheader()

        for row in data:
            self._writer.writerow(row)

    def flush(self) -> None:
        """Flush buffered data to disk.

        NOTE(review): for the JSON format this serializes and clears the
        buffer; flushing mid-stream and then writing more chunks would
        produce two concatenated JSON documents. Intended to run once,
        at close time.
        """
        if self._file_handle is None:
            return

        try:
            # Write the buffered JSON document if needed.
            if self.format == FileFormat.JSON and self._buffer:
                json.dump(self._buffer, self._file_handle, indent=2)
                self._buffer = []

            # Flush Python-level buffers, then ask the OS to sync.
            self._file_handle.flush()

            if hasattr(self._file_handle, 'fileno'):
                os.fsync(self._file_handle.fileno())
        except Exception as e:
            # Best-effort flush: keep the sink usable, but don't hide it.
            logger.warning(f"Error flushing {self.file_path}: {e}")

    def close(self) -> None:
        """Close file and finalize atomic write."""
        if self._file_handle is None:
            return

        try:
            # Flush any remaining buffered data.
            self.flush()

            self._file_handle.close()

            # Atomic rename if using a temp file.
            if self.atomic and self._temp_path and self._temp_path.exists():
                self._temp_path.replace(self.file_path)
                self._temp_path = None

        except Exception as e:
            # Best-effort close; log so a failed rename is not invisible.
            logger.warning(f"Error closing {self.file_path}: {e}")
        finally:
            self._file_handle = None
            self._writer = None

449 

450 

class DirectoryStreamSource(IStreamSource):
    """Stream source that reads from multiple files in a directory."""

    def __init__(
        self,
        directory: Union[str, Path],
        pattern: str = "*",
        recursive: bool = False,
        format: str | None = None,
        chunk_size: int = 1000
    ):
        """Initialize directory stream source.

        Args:
            directory: Directory path.
            pattern: File pattern to match.
            recursive: Search recursively.
            format: File format for all files (auto-detected per file
                when None).
            chunk_size: Chunk size for reading.
        """
        self.directory = Path(directory)
        self.pattern = pattern
        self.recursive = recursive
        self.format = format
        self.chunk_size = chunk_size

        # Find all matching files, sorted for deterministic ordering.
        globber = self.directory.rglob if recursive else self.directory.glob
        self.files = sorted(f for f in globber(self.pattern) if f.is_file())

        self._current_file_index = 0
        self._current_source: FileStreamSource | None = None
        self._total_chunks = 0

    def read_chunk(self) -> StreamChunk | None:
        """Read next chunk from directory files.

        Files that cannot be opened are skipped (with a log message).

        Returns:
            Next chunk or None if exhausted.
        """
        while self._current_file_index < len(self.files):
            # Open next file if needed.
            if self._current_source is None:
                file_path = self.files[self._current_file_index]
                try:
                    self._current_source = FileStreamSource(
                        file_path,
                        format=self.format,
                        chunk_size=self.chunk_size
                    )
                except Exception as e:
                    # Skip unreadable files, but not silently.
                    logger.warning(f"Skipping unreadable file {file_path}: {e}")
                    self._current_file_index += 1
                    continue

            # Read chunk from current file.
            chunk = self._current_source.read_chunk()

            if chunk is None:
                # Current file exhausted, move to next.
                self._current_source.close()
                self._current_source = None
                self._current_file_index += 1
                continue

            # Add file info to metadata.
            chunk.metadata['source_file'] = str(
                self.files[self._current_file_index]
            )
            chunk.metadata['file_index'] = self._current_file_index
            chunk.metadata['total_files'] = len(self.files)

            # Only the last chunk of the last file is terminal.
            chunk.is_last = (
                self._current_file_index == len(self.files) - 1 and
                chunk.is_last
            )

            # Renumber chunks globally, starting at 0 for consistency with
            # FileStreamSource (the old code incremented first, so global
            # numbering started at 1).
            chunk.sequence_number = self._total_chunks
            self._total_chunks += 1

            return chunk

        return None

    def __iter__(self) -> Iterator[StreamChunk]:
        """Iterate over all chunks across all matched files."""
        while True:
            chunk = self.read_chunk()
            if chunk is None:
                break
            yield chunk

    def close(self) -> None:
        """Close the currently open file source, if any."""
        if self._current_source:
            self._current_source.close()
            self._current_source = None