Coverage for src/dataknobs_fsm/streaming/file_stream.py: 15%
281 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:46 -0600
1"""File streaming implementation for FSM."""
3import csv
4import gzip
5import io
6import json
7import logging
8import os
9from pathlib import Path
10from typing import Any, Dict, Iterator, List, Union, TextIO, BinaryIO
12from dataknobs_fsm.streaming.core import (
13 IStreamSink,
14 IStreamSource,
15 StreamChunk,
16)
18logger = logging.getLogger(__name__)
class FileFormat:
    """Supported file formats."""
    JSON = "json"
    JSONL = "jsonl"
    CSV = "csv"
    TEXT = "text"
    BINARY = "binary"

    @staticmethod
    def detect(file_path: Path) -> str:
        """Detect format from file extension.

        Compression suffixes (``.gz``, ``.gzip``) are skipped so that e.g.
        ``data.jsonl.gz`` is detected as JSONL rather than BINARY
        (previously only the final suffix was inspected, so any gzipped
        file fell through to BINARY even though CompressionFormat.detect
        recognized the compression layer).

        Args:
            file_path: Path to file.

        Returns:
            Detected format string.
        """
        suffixes = [s.lower() for s in file_path.suffixes]
        # Drop trailing compression extensions before detecting format.
        while suffixes and suffixes[-1] in ('.gz', '.gzip'):
            suffixes.pop()
        suffix = suffixes[-1] if suffixes else ''
        if suffix == '.json':
            return FileFormat.JSON
        elif suffix in ('.jsonl', '.ndjson'):
            return FileFormat.JSONL
        elif suffix in ('.csv', '.tsv'):
            return FileFormat.CSV
        elif suffix in ('.txt', '.text', '.log'):
            return FileFormat.TEXT
        else:
            return FileFormat.BINARY
class CompressionFormat:
    """Supported compression formats."""
    NONE = "none"
    GZIP = "gzip"

    @staticmethod
    def detect(file_path: Path) -> str:
        """Detect compression from file extension.

        Args:
            file_path: Path to file.

        Returns:
            Detected compression format.
        """
        ext = file_path.suffix.lower()
        return CompressionFormat.GZIP if ext in ('.gz', '.gzip') else CompressionFormat.NONE
class FileStreamSource(IStreamSource):
    """File-based stream source with format detection and decompression.

    This source supports reading files in chunks with automatic
    format detection, decompression, and progress tracking.  It may
    also be used as a context manager, which closes the file on exit.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        format: str | None = None,
        compression: str | None = None,
        chunk_size: int = 1000,
        encoding: str = 'utf-8'
    ):
        """Initialize file stream source.

        Args:
            file_path: Path to source file.
            format: File format (auto-detected if None).
            compression: Compression format (auto-detected if None).
            chunk_size: Number of items per chunk.
            encoding: Text encoding for text formats.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        self.file_path = Path(file_path)
        self.format = format or FileFormat.detect(self.file_path)
        self.compression = compression or CompressionFormat.detect(self.file_path)
        self.chunk_size = chunk_size
        self.encoding = encoding

        # Runtime state.
        self._file_handle: Union[TextIO, BinaryIO, Any, None] = None
        self._reader: Any | None = None
        self._chunk_count = 0  # chunks emitted so far
        self._item_count = 0   # items emitted so far
        self._bytes_read = 0
        self._file_size = self.file_path.stat().st_size if self.file_path.exists() else 0

        self._open_file()

    def _open_file(self) -> None:
        """Open the file with appropriate decompression and build a reader."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        # Open with decompression if needed.  Binary formats stay in bytes
        # mode even when gzipped, so raw bytes are never decoded as text
        # (previously gzip always opened in 'rt', which corrupts or crashes
        # on non-UTF-8 binary payloads).
        if self.compression == CompressionFormat.GZIP:
            if self.format == FileFormat.BINARY:
                self._file_handle = gzip.open(self.file_path, 'rb')
            else:
                self._file_handle = gzip.open(self.file_path, 'rt', encoding=self.encoding)
        elif self.format == FileFormat.BINARY:
            self._file_handle = open(self.file_path, 'rb')
        else:
            self._file_handle = open(self.file_path, encoding=self.encoding)

        # Set up format-specific reader.
        if self.format == FileFormat.CSV:
            self._reader = csv.DictReader(self._file_handle)  # type: ignore
        elif self.format == FileFormat.JSON:
            # JSON cannot be parsed incrementally: load the whole document
            # and iterate a top-level array element by element (a non-array
            # document becomes a one-item stream).
            data = json.loads(self._file_handle.read())  # type: ignore
            self._reader = iter(data) if isinstance(data, list) else iter([data])
        else:
            # JSONL, TEXT and BINARY read directly from the file handle.
            self._reader = self._file_handle

    def read_chunk(self) -> StreamChunk | None:
        """Read next chunk from file.

        Returns:
            StreamChunk with file data, or None if the stream is exhausted.
        """
        if self._reader is None:
            return None

        chunk_data = []

        try:
            # Accumulate up to chunk_size items.
            for _ in range(self.chunk_size):
                item = self._read_next_item()
                if item is None:
                    break
                chunk_data.append(item)
                self._item_count += 1

            if not chunk_data:
                return None

            # Best-effort progress from the file position.  NOTE: for gzip
            # input tell() reports decompressed offsets while _file_size is
            # the compressed size, so progress is approximate there.
            progress = 0.0
            if self._file_size > 0 and self._file_handle and hasattr(self._file_handle, 'tell'):
                try:
                    progress = self._file_handle.tell() / self._file_size
                except (OSError, io.UnsupportedOperation):
                    pass

            chunk = StreamChunk(
                data=chunk_data,
                sequence_number=self._chunk_count,
                metadata={
                    'file_path': str(self.file_path),
                    'format': self.format,
                    'progress': progress,
                    'item_count': len(chunk_data)
                },
                # A short chunk means the underlying reader ran dry.
                is_last=len(chunk_data) < self.chunk_size
            )

            self._chunk_count += 1
            return chunk

        except Exception as e:
            # Surface the failure as a terminal error chunk rather than
            # raising into the consumer's iteration loop; log so the
            # failure is not silent.
            logger.error(f"Error reading chunk from {self.file_path}: {e}")
            return StreamChunk(
                data=[],
                sequence_number=self._chunk_count,
                metadata={'error': str(e)},
                is_last=True
            )

    def _read_next_item(self) -> Any | None:
        """Read the next item according to the configured format.

        Returns:
            Next item, or None when the stream is exhausted.
        """
        if self._reader is None:
            return None
        try:
            if self.format == FileFormat.JSONL:
                # Skip blank lines instead of treating them as end-of-stream
                # (json.loads('') raises, and the broad except below used to
                # truncate the stream at the first empty line).
                for line in self._reader:
                    line = line.strip()
                    if line:
                        return json.loads(line)
                return None
            elif self.format == FileFormat.BINARY:
                # Read binary data in fixed 4KB blocks.
                data = self._reader.read(4096)
                return data if data else None
            else:
                # CSV rows, text lines, and JSON-array elements all come
                # from a plain iterator.
                return next(self._reader, None)
        except StopIteration:
            return None
        except Exception as e:
            # Malformed input ends the stream; log it so the truncation is
            # visible instead of silently swallowed.
            logger.warning(f"Error reading item from {self.file_path}: {e}")
            return None

    def __iter__(self) -> Iterator[StreamChunk]:
        """Iterate over all chunks."""
        while True:
            chunk = self.read_chunk()
            if chunk is None:
                break
            yield chunk

    def __enter__(self) -> 'FileStreamSource':
        """Support use as a context manager."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the file on context exit."""
        self.close()

    def close(self) -> None:
        """Close the file handle and release the reader."""
        if self._file_handle:
            self._file_handle.close()
            self._file_handle = None
        self._reader = None
class FileStreamSink(IStreamSink):
    """File-based stream sink with format serialization and compression.

    This sink supports writing data chunks to files with automatic
    format serialization, compression, and atomic writes.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        format: str | None = None,
        compression: str | None = None,
        encoding: str = 'utf-8',
        atomic: bool = True,
        append: bool = False
    ):
        """Initialize file stream sink.

        Args:
            file_path: Path to target file.
            format: File format (auto-detected if None).
            compression: Compression format (auto-detected if None).
            encoding: Text encoding for text formats.
            atomic: Use atomic writes (write to temp then rename).
            append: Append to existing file instead of overwriting.
        """
        self.file_path = Path(file_path)
        self.format = format or FileFormat.detect(self.file_path)
        self.compression = compression or CompressionFormat.detect(self.file_path)
        self.encoding = encoding
        self.atomic = atomic
        self.append = append

        self._file_handle: Any | None = None
        self._writer: Any | None = None
        self._temp_path: Path | None = None
        self._chunk_count = 0
        self._item_count = 0
        self._bytes_written = 0
        self._buffer: List[Any] = []      # JSON items held until flush/close
        self._appending_to_data = False   # True when appending to a non-empty file

        self._open_file()

    def _open_file(self) -> None:
        """Open the target (or temp) file for writing."""
        # Create parent directories.
        self.file_path.parent.mkdir(parents=True, exist_ok=True)

        # Use temp file for atomic writes; append mode must write in place.
        if self.atomic and not self.append:
            self._temp_path = self.file_path.with_suffix(
                self.file_path.suffix + '.tmp'
            )
            target_path = self._temp_path
        else:
            target_path = self.file_path

        # Remember whether we are extending existing content; used to
        # suppress a duplicate CSV header in _write_csv_chunk.
        self._appending_to_data = (
            self.append and target_path.exists() and target_path.stat().st_size > 0
        )

        # Base mode is append or truncate; the binary/text qualifier is
        # added exactly once below.  (The previous code pre-built 'ab' for
        # binary appends and then appended another 'b', producing the
        # invalid mode 'abb' and a ValueError on open.)
        base_mode = 'a' if self.append else 'w'

        if self.compression == CompressionFormat.GZIP:
            if self.format == FileFormat.BINARY:
                self._file_handle = gzip.open(str(target_path), base_mode + 'b')
            else:
                self._file_handle = gzip.open(
                    str(target_path),
                    base_mode + 't',
                    encoding=self.encoding
                )
        elif self.format == FileFormat.BINARY:
            self._file_handle = open(str(target_path), base_mode + 'b')
        else:
            self._file_handle = open(str(target_path), base_mode, encoding=self.encoding)

        # Set up format-specific writer.
        if self.format == FileFormat.CSV:
            # CSV writer is created lazily, once the first row's keys are known.
            self._writer = None
        elif self.format == FileFormat.JSON:
            # JSON must be written as one document: buffer until flush/close.
            self._buffer = []
        else:
            self._writer = self._file_handle

    def write_chunk(self, chunk: StreamChunk) -> bool:
        """Write chunk to file.

        Args:
            chunk: Chunk to write.

        Returns:
            True if successful (an empty chunk is a successful no-op).
        """
        if self._file_handle is None:
            return False

        try:
            if not chunk.data:
                return True

            if self.format == FileFormat.CSV:
                self._write_csv_chunk(chunk.data)
            elif self.format == FileFormat.JSON:
                # Buffer for a single document write at flush/close time.
                if isinstance(chunk.data, list):
                    self._buffer.extend(chunk.data)
                else:
                    self._buffer.append(chunk.data)
            elif self.format == FileFormat.JSONL:
                # One JSON document per line.
                for item in chunk.data:
                    self._file_handle.write(json.dumps(item) + '\n')
            elif self.format == FileFormat.TEXT:
                # Write each item, ensuring a trailing newline.
                for item in chunk.data:
                    if item is not None:
                        self._file_handle.write(str(item))
                        if not str(item).endswith('\n'):
                            self._file_handle.write('\n')
            elif self.format == FileFormat.BINARY:
                for item in chunk.data:
                    if isinstance(item, bytes):
                        self._file_handle.write(item)
                    else:
                        self._file_handle.write(str(item).encode(self.encoding))
            else:
                # Default: one item per line as text.
                for item in chunk.data:
                    self._file_handle.write(str(item) + '\n')

            self._chunk_count += 1
            self._item_count += len(chunk.data) if isinstance(chunk.data, list) else 1

            return True

        except Exception as e:
            logger.error(f"Error writing chunk: {e}")
            return False

    def _write_csv_chunk(self, data: List[Dict[str, Any]]) -> None:
        """Write a list of dict rows as CSV.

        Args:
            data: List of dictionaries to write.  Column order is taken
                from the first row's keys.
        """
        if not data:
            return

        # Initialize CSV writer on first write.
        if self._writer is None:
            fieldnames = list(data[0].keys())
            self._writer = csv.DictWriter(
                self._file_handle,  # type: ignore
                fieldnames=fieldnames
            )
            # Write the header unless appending to a file that already has
            # content.  (The old check `not self.append or chunk_count == 0`
            # was always true on the first write, so appending duplicated
            # the header.)
            if not self._appending_to_data:
                self._writer.writeheader()

        for row in data:
            self._writer.writerow(row)

    def flush(self) -> None:
        """Flush buffered data to disk (best-effort; errors are logged)."""
        if self._file_handle is None:
            return

        try:
            # Write JSON buffer if needed.  NOTE: the buffer is dumped once
            # per flush; interleaving writes with multiple flushes would
            # emit multiple concatenated JSON documents.
            if self.format == FileFormat.JSON and self._buffer:
                json.dump(self._buffer, self._file_handle, indent=2)
                self._buffer = []

            # Flush the file handle and force it to disk when possible.
            self._file_handle.flush()

            if hasattr(self._file_handle, 'fileno'):
                os.fsync(self._file_handle.fileno())
        except Exception as e:
            # Flushing is best-effort, but do not hide the failure entirely.
            logger.warning(f"Error flushing {self.file_path}: {e}")

    def close(self) -> None:
        """Close file and finalize atomic write."""
        if self._file_handle is None:
            return

        try:
            # Flush any remaining buffered data first.
            self.flush()

            self._file_handle.close()

            # Atomic rename: publish the temp file over the target.
            if self.atomic and self._temp_path and self._temp_path.exists():
                self._temp_path.replace(self.file_path)
                self._temp_path = None

        except Exception as e:
            # Closing is best-effort, but log so failures are visible.
            logger.error(f"Error closing {self.file_path}: {e}")
        finally:
            self._file_handle = None
            self._writer = None
class DirectoryStreamSource(IStreamSource):
    """Stream source that reads from multiple files in a directory."""

    def __init__(
        self,
        directory: Union[str, Path],
        pattern: str = "*",
        recursive: bool = False,
        format: str | None = None,
        chunk_size: int = 1000
    ):
        """Initialize directory stream source.

        Args:
            directory: Directory path.
            pattern: Glob pattern files must match.
            recursive: Search recursively.
            format: File format for all files (auto-detected per file if None).
            chunk_size: Chunk size for reading each file.
        """
        self.directory = Path(directory)
        self.pattern = pattern
        self.recursive = recursive
        self.format = format
        self.chunk_size = chunk_size

        # Find all matching regular files, sorted for deterministic order.
        glob = self.directory.rglob if recursive else self.directory.glob
        self.files = sorted(f for f in glob(self.pattern) if f.is_file())

        self._current_file_index = 0
        self._current_source: FileStreamSource | None = None
        self._total_chunks = 0

    def read_chunk(self) -> StreamChunk | None:
        """Read next chunk from directory files.

        Returns:
            Next chunk or None if all files are exhausted.
        """
        while self._current_file_index < len(self.files):
            # Open the next file if none is active.
            if self._current_source is None:
                file_path = self.files[self._current_file_index]
                try:
                    self._current_source = FileStreamSource(
                        file_path,
                        format=self.format,
                        chunk_size=self.chunk_size
                    )
                except Exception as e:
                    # Skip unreadable files, but record the problem instead
                    # of silently dropping them.
                    logger.warning(f"Skipping unreadable file {file_path}: {e}")
                    self._current_file_index += 1
                    continue

            # Read a chunk from the current file.
            chunk = self._current_source.read_chunk()

            if chunk is None:
                # Current file exhausted: move to the next one.
                self._current_source.close()
                self._current_source = None
                self._current_file_index += 1
                continue

            # Annotate the chunk with its origin.
            chunk.metadata['source_file'] = str(
                self.files[self._current_file_index]
            )
            chunk.metadata['file_index'] = self._current_file_index
            chunk.metadata['total_files'] = len(self.files)

            # Only the last chunk of the last file is terminal.
            chunk.is_last = (
                self._current_file_index == len(self.files) - 1 and
                chunk.is_last
            )

            # Renumber chunks globally, zero-based for consistency with
            # FileStreamSource (the old code incremented first, so
            # numbering started at 1).
            chunk.sequence_number = self._total_chunks
            self._total_chunks += 1

            return chunk

        return None

    def __iter__(self) -> Iterator[StreamChunk]:
        """Iterate over all chunks across all files."""
        while True:
            chunk = self.read_chunk()
            if chunk is None:
                break
            yield chunk

    def close(self) -> None:
        """Close the currently open file source, if any."""
        if self._current_source:
            self._current_source.close()
            self._current_source = None