Coverage for src/dataknobs_fsm/utils/streaming_file_utils.py: 42%

261 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-16 20:48 -0600

1"""Streaming file utilities for processing large files efficiently. 

2 

3This module provides memory-efficient streaming utilities for reading and writing 

4large files that may not fit in memory. 

5""" 

6 

7import asyncio 

8import csv 

9import json 

10from collections import deque 

11from io import StringIO 

12from pathlib import Path 

13from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Tuple, Union 

14 

15from dataknobs_fsm.streaming.core import StreamChunk, StreamConfig, StreamMetrics 

16from dataknobs_fsm.utils.file_utils import detect_format, get_csv_delimiter 

17 

18 

class StreamingFileReader:
    """Memory-efficient streaming file reader with chunking support.

    Reads 'jsonl', 'json', 'csv', and 'text' files as a sequence of
    StreamChunk objects so large files can be processed without being
    loaded fully into memory.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        chunk_size: int = 1000,
        input_format: str = 'auto',
        text_field_name: str = 'text',
        csv_delimiter: str = ',',
        csv_has_header: bool = True,
        skip_empty_lines: bool = True,
        max_memory_mb: int = 100
    ):
        """Initialize streaming file reader.

        Args:
            file_path: Path to input file
            chunk_size: Number of records per chunk
            input_format: File format ('auto', 'jsonl', 'json', 'csv', 'text')
            text_field_name: Field name for text lines
            csv_delimiter: CSV delimiter character
            csv_has_header: Whether CSV has header row
            skip_empty_lines: Skip empty lines in text files
            max_memory_mb: Maximum memory usage in MB
        """
        self.file_path = Path(file_path)
        self.chunk_size = chunk_size
        self.text_field_name = text_field_name
        self.csv_delimiter = csv_delimiter
        self.csv_has_header = csv_has_header
        self.skip_empty_lines = skip_empty_lines
        self.max_memory_mb = max_memory_mb

        # Auto-detect format if needed
        if input_format == 'auto':
            self.format = detect_format(self.file_path)
            # .tsv files detect as 'csv'; switch the delimiter to tab
            if self.format == 'csv' and self.file_path.suffix.lower() == '.tsv':
                self.csv_delimiter = '\t'
        else:
            self.format = input_format

        self.metrics = StreamMetrics()
        self._chunk_count = 0

    async def read_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read file in chunks, yielding StreamChunk objects.

        Yields:
            StreamChunk objects containing batches of records

        Raises:
            ValueError: If ``self.format`` is not one of the supported formats.
        """
        self.metrics.start_time = asyncio.get_event_loop().time()

        try:
            if self.format == 'jsonl':
                async for chunk in self._read_jsonl_chunks():
                    yield chunk
            elif self.format == 'json':
                async for chunk in self._read_json_chunks():
                    yield chunk
            elif self.format == 'csv':
                async for chunk in self._read_csv_chunks():
                    yield chunk
            elif self.format == 'text':
                async for chunk in self._read_text_chunks():
                    yield chunk
            else:
                raise ValueError(f"Unsupported format: {self.format}")
        finally:
            # Record end time even if the consumer abandons the generator
            self.metrics.end_time = asyncio.get_event_loop().time()

    async def _read_jsonl_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read JSONL file in chunks; malformed lines are counted and skipped."""
        chunk_data = []

        with open(self.file_path, 'r') as f:
            for line_num, line in enumerate(f, 1):
                if line.strip():
                    try:
                        record = json.loads(line)
                        chunk_data.append(record)
                        self.metrics.items_processed += 1

                        if len(chunk_data) >= self.chunk_size:
                            yield self._create_chunk(chunk_data)
                            chunk_data = []

                            # Allow other tasks to run
                            await asyncio.sleep(0)
                    except json.JSONDecodeError:
                        # Bad line: count the error and keep streaming
                        self.metrics.errors_count += 1
                        continue

        # Yield remaining data
        if chunk_data:
            yield self._create_chunk(chunk_data, is_last=True)

    async def _read_json_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read JSON file in chunks (streams arrays; falls back for objects)."""
        import ijson  # local import: only needed for the 'json' format

        with open(self.file_path, 'rb') as f:
            # First try to parse as an array with streaming
            try:
                parser = ijson.items(f, 'item')
                chunk_data = []
                item_count = 0

                for item in parser:
                    chunk_data.append(item)
                    item_count += 1
                    self.metrics.items_processed += 1

                    if len(chunk_data) >= self.chunk_size:
                        yield self._create_chunk(chunk_data)
                        chunk_data = []
                        await asyncio.sleep(0)

                if chunk_data:
                    yield self._create_chunk(chunk_data, is_last=True)
                elif item_count == 0:
                    # No items found, might be a single object; trigger fallback
                    raise ValueError("No array items found")

            except (ijson.JSONError, ValueError):
                # Not an array (or empty): re-read the whole file as plain JSON
                with open(self.file_path, 'r') as text_f:
                    data = json.load(text_f)

                if isinstance(data, list):
                    # It's an array, process in chunks
                    for i in range(0, len(data), self.chunk_size):
                        chunk = data[i:i + self.chunk_size]
                        is_last = (i + self.chunk_size) >= len(data)
                        self.metrics.items_processed += len(chunk)
                        yield self._create_chunk(chunk, is_last=is_last)
                        await asyncio.sleep(0)
                else:
                    # Single object becomes a one-record final chunk
                    self.metrics.items_processed += 1
                    yield self._create_chunk([data], is_last=True)

    async def _read_csv_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read CSV file in chunks without materializing the whole file.

        A one-chunk lookahead is kept so the final chunk can be flagged
        ``is_last=True`` while still streaming row by row (the previous
        implementation loaded the entire file into memory just to know
        which chunk was last).
        """
        with open(self.file_path, 'r', newline='') as f:
            if self.csv_has_header:
                reader = csv.DictReader(f, delimiter=self.csv_delimiter)
            else:
                # For headerless CSV, synthesize field names from the
                # column count of the first line.
                first_line = f.readline()
                f.seek(0)
                num_fields = len(first_line.split(self.csv_delimiter))
                fieldnames = [f'col_{i}' for i in range(num_fields)]
                reader = csv.DictReader(f, fieldnames=fieldnames, delimiter=self.csv_delimiter)

            pending: Optional[List[Dict[str, Any]]] = None  # full chunk held back
            chunk_data: List[Dict[str, Any]] = []

            for row in reader:
                chunk_data.append(dict(row))  # plain dict for downstream consumers
                self.metrics.items_processed += 1

                if len(chunk_data) >= self.chunk_size:
                    # Hold the just-completed chunk back until we know whether
                    # more rows follow; any previously held chunk is now safe
                    # to emit as a non-final chunk.
                    if pending is not None:
                        yield self._create_chunk(pending)
                        await asyncio.sleep(0)
                    pending = chunk_data
                    chunk_data = []

            if pending is not None:
                # The held chunk is last only if no partial chunk remains
                yield self._create_chunk(pending, is_last=not chunk_data)
            if chunk_data:
                yield self._create_chunk(chunk_data, is_last=True)

    async def _read_text_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read text file in chunks, one record per line."""
        chunk_data = []

        with open(self.file_path, 'r') as f:
            for line in f:
                line = line.rstrip('\n\r')
                if line or not self.skip_empty_lines:
                    chunk_data.append({self.text_field_name: line})
                    self.metrics.items_processed += 1

                    if len(chunk_data) >= self.chunk_size:
                        yield self._create_chunk(chunk_data)
                        chunk_data = []
                        await asyncio.sleep(0)

        if chunk_data:
            yield self._create_chunk(chunk_data, is_last=True)

    def _create_chunk(self, data: List[Dict[str, Any]], is_last: bool = False) -> StreamChunk:
        """Create a StreamChunk from data, updating sequence and metrics."""
        chunk = StreamChunk(
            data=data,
            sequence_number=self._chunk_count,
            metadata={
                'file': str(self.file_path),
                'format': self.format,
                'chunk_size': len(data)
            },
            is_last=is_last
        )
        self._chunk_count += 1
        self.metrics.chunks_processed += 1
        return chunk

230 

231 

class StreamingFileWriter:
    """Memory-efficient streaming file writer with buffering.

    Records are accumulated in an in-memory deque and flushed to disk
    when the buffer fills, a final chunk arrives, or the flush interval
    elapses.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        output_format: Optional[str] = None,
        buffer_size: int = 1000,
        flush_interval: float = 1.0
    ):
        """Initialize streaming file writer.

        Args:
            file_path: Path to output file
            output_format: Output format (auto-detected if None)
            buffer_size: Number of records to buffer before writing
            flush_interval: Time interval (seconds) to flush buffer
        """
        self.file_path = Path(file_path)
        self.buffer_size = buffer_size
        self.flush_interval = flush_interval

        # Auto-detect format
        self.format = output_format or detect_format(self.file_path, for_output=True)

        self._buffer: deque = deque()
        self._file_handle: Optional[Any] = None
        self._csv_writer: Optional[csv.DictWriter] = None
        self._last_flush_time = asyncio.get_event_loop().time()
        self._is_first_write = True  # tracks JSON-array comma placement
        self.metrics = StreamMetrics()

    async def __aenter__(self):
        """Async context manager entry.

        Only opens the file if it is not already open, so entering the
        context does not truncate a file opened via an explicit open().
        """
        if self._file_handle is None:
            self.open()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()

    def open(self):
        """Open the output file and write any format preamble."""
        if self.format == 'csv':
            # newline='' is required so the csv module controls line endings
            self._file_handle = open(self.file_path, 'w', newline='')
        else:
            self._file_handle = open(self.file_path, 'w')
            if self.format == 'json':
                self._file_handle.write('[')  # Start JSON array

        self.metrics.start_time = asyncio.get_event_loop().time()

    async def write_chunk(self, chunk: StreamChunk) -> None:
        """Write a chunk of data to the file.

        Args:
            chunk: StreamChunk to write
        """
        if not self._file_handle:
            self.open()

        # Add chunk data to buffer
        if isinstance(chunk.data, list):
            self._buffer.extend(chunk.data)
        else:
            self._buffer.append(chunk.data)

        # Flush when the buffer is full, the stream is ending, or the
        # flush interval has elapsed
        current_time = asyncio.get_event_loop().time()
        should_flush = (
            len(self._buffer) >= self.buffer_size or
            chunk.is_last or
            (current_time - self._last_flush_time) > self.flush_interval
        )

        if should_flush:
            await self._flush_buffer()
            self._last_flush_time = current_time

        self.metrics.chunks_processed += 1

    async def _flush_buffer(self) -> None:
        """Flush the buffer to file, draining it for every format."""
        if not self._buffer or not self._file_handle:
            return

        if self.format == 'jsonl':
            # Write each record as a JSON line
            while self._buffer:
                record = self._buffer.popleft()
                json.dump(record, self._file_handle)
                self._file_handle.write('\n')
                self.metrics.items_processed += 1

        elif self.format == 'csv':
            # Initialize CSV writer lazily from the first record's keys
            if self._csv_writer is None and self._buffer:
                first_record = self._buffer[0]
                fieldnames = list(first_record.keys())
                delimiter = get_csv_delimiter(self.file_path)
                self._csv_writer = csv.DictWriter(
                    self._file_handle,
                    fieldnames=fieldnames,
                    delimiter=delimiter
                )
                self._csv_writer.writeheader()

            # Write records
            while self._buffer:
                record = self._buffer.popleft()
                self._csv_writer.writerow(record)
                self.metrics.items_processed += 1

        elif self.format == 'json':
            # Write as JSON array elements, comma-separated after the first
            while self._buffer:
                record = self._buffer.popleft()
                if not self._is_first_write:
                    self._file_handle.write(',')
                json.dump(record, self._file_handle)
                self._is_first_write = False
                self.metrics.items_processed += 1

        elif self.format == 'text':
            # Write text lines
            while self._buffer:
                record = self._buffer.popleft()
                # Extract text from dict if needed.
                # NOTE(review): the 'text' key is hardcoded here while the
                # reader's text field name is configurable — confirm callers
                # always use the default field name.
                if isinstance(record, dict):
                    text = record.get('text', str(record))
                else:
                    text = str(record)
                self._file_handle.write(text + '\n')
                self.metrics.items_processed += 1

        else:
            # Unknown format: fall back to one str() line per record so the
            # buffer is always drained (previously no branch matched and the
            # buffer grew without bound while the file stayed empty).
            while self._buffer:
                record = self._buffer.popleft()
                self._file_handle.write(str(record) + '\n')
                self.metrics.items_processed += 1

        # Flush to disk
        self._file_handle.flush()

        # Allow other tasks to run
        await asyncio.sleep(0)

    async def close(self) -> None:
        """Close the file and flush remaining buffer."""
        if self._buffer:
            await self._flush_buffer()

        if self._file_handle:
            if self.format == 'json':
                self._file_handle.write(']')  # Close JSON array

            self._file_handle.close()
            self._file_handle = None

        self.metrics.end_time = asyncio.get_event_loop().time()

392 

class StreamingFileProcessor:
    """High-level streaming file processor combining reader and writer."""

    def __init__(
        self,
        input_path: Union[str, Path],
        output_path: Union[str, Path],
        transform_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
        chunk_size: int = 1000,
        input_format: str = 'auto',
        output_format: Optional[str] = None
    ):
        """Initialize streaming file processor.

        Args:
            input_path: Input file path
            output_path: Output file path
            transform_fn: Optional transformation function for each record
            chunk_size: Records per chunk
            input_format: Input file format
            output_format: Output file format (auto-detected if None)
        """
        self.reader = StreamingFileReader(
            input_path,
            chunk_size=chunk_size,
            input_format=input_format,
        )
        self.writer = StreamingFileWriter(
            output_path,
            output_format=output_format,
            buffer_size=chunk_size,
        )
        # Identity transform when none is supplied
        self.transform_fn = transform_fn or (lambda x: x)

    async def process(self, progress_callback: Optional[Callable[[int, int], None]] = None) -> StreamMetrics:
        """Process the file with streaming.

        Args:
            progress_callback: Optional callback for progress updates
                (items_processed, total_chunks)

        Returns:
            Combined metrics from processing
        """
        processed_total = 0

        async with self.writer:
            async for source_chunk in self.reader.read_chunks():
                # Transform each record, counting (and skipping) failures
                kept: List[Dict[str, Any]] = []
                for item in source_chunk.data:
                    try:
                        result = self.transform_fn(item)
                    except Exception:
                        self.reader.metrics.errors_count += 1
                        continue
                    if result is not None:
                        kept.append(result)

                # Forward the transformed records, preserving chunk identity
                if kept:
                    await self.writer.write_chunk(
                        StreamChunk(
                            data=kept,
                            sequence_number=source_chunk.sequence_number,
                            metadata=source_chunk.metadata,
                            is_last=source_chunk.is_last,
                        )
                    )

                processed_total += len(source_chunk.data)

                # Report progress after each chunk
                if progress_callback:
                    progress_callback(processed_total, self.reader._chunk_count)

        # Combine reader counters with the writer's completion time
        return StreamMetrics(
            chunks_processed=self.reader.metrics.chunks_processed,
            items_processed=self.reader.metrics.items_processed,
            errors_count=self.reader.metrics.errors_count,
            start_time=self.reader.metrics.start_time,
            end_time=self.writer.metrics.end_time,
        )

477 

478 

479# Convenience functions for SimpleFSM integration 

480 

async def create_streaming_file_reader(
    file_path: Union[str, Path],
    config: StreamConfig,
    **kwargs
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Create a streaming file reader compatible with SimpleFSM.

    Args:
        file_path: Input file path
        config: Stream configuration
        **kwargs: Additional reader parameters

    Yields:
        Lists of records (chunks)
    """
    stream_reader = StreamingFileReader(
        file_path,
        chunk_size=config.chunk_size,
        **kwargs,
    )

    # Unwrap each StreamChunk to its raw record list for SimpleFSM
    async for piece in stream_reader.read_chunks():
        yield piece.data

504 

505 

async def create_streaming_file_writer(
    file_path: Union[str, Path],
    config: StreamConfig,
    **kwargs
) -> Tuple[Callable, Callable]:
    """Create a streaming file writer compatible with SimpleFSM.

    Args:
        file_path: Output file path
        config: Stream configuration
        **kwargs: Additional writer parameters

    Returns:
        Tuple of (write_fn, cleanup_fn)
    """
    stream_writer = StreamingFileWriter(
        file_path,
        buffer_size=config.buffer_size,
        **kwargs,
    )
    # Open eagerly so the first write_fn call doesn't pay the cost
    stream_writer.open()

    async def write_fn(results: List[Dict[str, Any]]) -> None:
        """Write results to file."""
        await stream_writer.write_chunk(StreamChunk(data=results))

    async def cleanup_fn() -> None:
        """Close and cleanup."""
        await stream_writer.close()

    return write_fn, cleanup_fn