Coverage for src/dataknobs_fsm/utils/streaming_file_utils.py: 42%
261 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 20:48 -0600
1"""Streaming file utilities for processing large files efficiently.
3This module provides memory-efficient streaming utilities for reading and writing
4large files that may not fit in memory.
5"""
7import asyncio
8import csv
9import json
10from collections import deque
11from io import StringIO
12from pathlib import Path
13from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Tuple, Union
15from dataknobs_fsm.streaming.core import StreamChunk, StreamConfig, StreamMetrics
16from dataknobs_fsm.utils.file_utils import detect_format, get_csv_delimiter
class StreamingFileReader:
    """Memory-efficient streaming file reader with chunking support.

    Reads JSONL, JSON, CSV, and plain-text files as an asynchronous stream of
    ``StreamChunk`` batches, yielding control to the event loop between chunks
    so other tasks can run.  No reader path materializes the whole file in
    memory except the fallback for non-array JSON documents.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        chunk_size: int = 1000,
        input_format: str = 'auto',
        text_field_name: str = 'text',
        csv_delimiter: str = ',',
        csv_has_header: bool = True,
        skip_empty_lines: bool = True,
        max_memory_mb: int = 100
    ):
        """Initialize streaming file reader.

        Args:
            file_path: Path to input file.
            chunk_size: Number of records per chunk.
            input_format: File format ('auto', 'jsonl', 'json', 'csv', 'text').
            text_field_name: Field name for text lines.
            csv_delimiter: CSV delimiter character.
            csv_has_header: Whether CSV has header row.
            skip_empty_lines: Skip empty lines in text files.
            max_memory_mb: Maximum memory usage in MB.
                NOTE(review): stored but not enforced anywhere in this class;
                confirm intended use.
        """
        self.file_path = Path(file_path)
        self.chunk_size = chunk_size
        self.text_field_name = text_field_name
        self.csv_delimiter = csv_delimiter
        self.csv_has_header = csv_has_header
        self.skip_empty_lines = skip_empty_lines
        self.max_memory_mb = max_memory_mb

        # Auto-detect format from the file name if requested.
        if input_format == 'auto':
            self.format = detect_format(self.file_path)
            # A .tsv file is CSV with a tab delimiter.
            if self.format == 'csv' and self.file_path.suffix.lower() == '.tsv':
                self.csv_delimiter = '\t'
        else:
            self.format = input_format

        self.metrics = StreamMetrics()
        self._chunk_count = 0  # Monotonic sequence number for emitted chunks.

    async def read_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read file in chunks, yielding StreamChunk objects.

        Yields:
            StreamChunk objects containing batches of records.

        Raises:
            ValueError: If ``self.format`` is not a supported format.
        """
        self.metrics.start_time = asyncio.get_event_loop().time()

        readers = {
            'jsonl': self._read_jsonl_chunks,
            'json': self._read_json_chunks,
            'csv': self._read_csv_chunks,
            'text': self._read_text_chunks,
        }
        try:
            reader = readers.get(self.format)
            if reader is None:
                raise ValueError(f"Unsupported format: {self.format}")
            async for chunk in reader():
                yield chunk
        finally:
            # Record the end time even when iteration stops early or fails.
            self.metrics.end_time = asyncio.get_event_loop().time()

    async def _read_jsonl_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read a JSONL file in chunks; malformed lines are counted and skipped."""
        chunk_data: List[Dict[str, Any]] = []

        with open(self.file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed lines but keep a tally for the caller.
                    self.metrics.errors_count += 1
                    continue
                chunk_data.append(record)
                self.metrics.items_processed += 1

                if len(chunk_data) >= self.chunk_size:
                    yield self._create_chunk(chunk_data)
                    chunk_data = []
                    # Allow other tasks to run.
                    await asyncio.sleep(0)

        # Yield remaining data.
        if chunk_data:
            yield self._create_chunk(chunk_data, is_last=True)

    async def _read_json_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read a JSON file in chunks.

        Attempts incremental parsing of a top-level array first (via ijson) so
        large arrays never need to fit in memory; falls back to loading the
        whole document for non-array JSON.
        """
        import ijson  # Local import: only needed for the 'json' format.

        with open(self.file_path, 'rb') as f:
            try:
                chunk_data: List[Any] = []
                item_count = 0
                for item in ijson.items(f, 'item'):
                    chunk_data.append(item)
                    item_count += 1
                    self.metrics.items_processed += 1

                    if len(chunk_data) >= self.chunk_size:
                        yield self._create_chunk(chunk_data)
                        chunk_data = []
                        await asyncio.sleep(0)

                if chunk_data:
                    yield self._create_chunk(chunk_data, is_last=True)
                elif item_count == 0:
                    # Nothing matched 'item': probably a single JSON object.
                    raise ValueError("No array items found")
            except (ijson.JSONError, ValueError):
                # Not a top-level array (or empty): load the whole document.
                # If the document is genuinely malformed, json.load raises and
                # the error propagates to the caller.
                with open(self.file_path, 'r', encoding='utf-8') as text_f:
                    data = json.load(text_f)

                if isinstance(data, list):
                    # It's an array after all; process it in chunks.
                    for i in range(0, len(data), self.chunk_size):
                        chunk = data[i:i + self.chunk_size]
                        is_last = (i + self.chunk_size) >= len(data)
                        self.metrics.items_processed += len(chunk)
                        yield self._create_chunk(chunk, is_last=is_last)
                        await asyncio.sleep(0)
                else:
                    # A single object becomes a one-record chunk.
                    self.metrics.items_processed += 1
                    yield self._create_chunk([data], is_last=True)

    async def _read_csv_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read a CSV file in chunks.

        Uses a one-row lookahead to detect the final chunk so the file is
        never fully materialized (the previous implementation listed the whole
        reader just to count rows, which defeated streaming).
        """
        chunk_data: List[Dict[str, Any]] = []

        with open(self.file_path, 'r', newline='', encoding='utf-8') as f:
            if self.csv_has_header:
                reader = csv.DictReader(f, delimiter=self.csv_delimiter)
            else:
                # For headerless CSV, synthesize field names from the width of
                # the first line.
                first_line = f.readline()
                f.seek(0)
                num_fields = len(first_line.split(self.csv_delimiter))
                fieldnames = [f'col_{i}' for i in range(num_fields)]
                reader = csv.DictReader(f, fieldnames=fieldnames, delimiter=self.csv_delimiter)

            pending = None  # One-row lookahead: held back until we know more rows exist.
            for row in reader:
                if pending is not None:
                    chunk_data.append(dict(pending))  # Plain dict, not csv's row type.
                    self.metrics.items_processed += 1

                    # Safe to emit without is_last: at least one more row (this
                    # one) is still coming.
                    if len(chunk_data) >= self.chunk_size:
                        yield self._create_chunk(chunk_data)
                        chunk_data = []
                        await asyncio.sleep(0)
                pending = row

            # The held-back row is by definition the last one.
            if pending is not None:
                chunk_data.append(dict(pending))
                self.metrics.items_processed += 1

            if chunk_data:
                yield self._create_chunk(chunk_data, is_last=True)

    async def _read_text_chunks(self) -> AsyncIterator[StreamChunk]:
        """Read a text file in chunks, one record per line."""
        chunk_data: List[Dict[str, Any]] = []

        with open(self.file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.rstrip('\n\r')
                if line or not self.skip_empty_lines:
                    chunk_data.append({self.text_field_name: line})
                    self.metrics.items_processed += 1

                    if len(chunk_data) >= self.chunk_size:
                        yield self._create_chunk(chunk_data)
                        chunk_data = []
                        await asyncio.sleep(0)

        if chunk_data:
            yield self._create_chunk(chunk_data, is_last=True)

    def _create_chunk(self, data: List[Dict[str, Any]], is_last: bool = False) -> StreamChunk:
        """Create a StreamChunk from data, updating sequence and metrics."""
        chunk = StreamChunk(
            data=data,
            sequence_number=self._chunk_count,
            metadata={
                'file': str(self.file_path),
                'format': self.format,
                'chunk_size': len(data)
            },
            is_last=is_last
        )
        self._chunk_count += 1
        self.metrics.chunks_processed += 1
        return chunk
class StreamingFileWriter:
    """Memory-efficient streaming file writer with buffering.

    Buffers records and flushes them to disk when the buffer fills, when the
    flush interval elapses, or when the final chunk arrives.  Supports the
    'jsonl', 'csv', 'json', and 'text' formats.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        output_format: Optional[str] = None,
        buffer_size: int = 1000,
        flush_interval: float = 1.0
    ):
        """Initialize streaming file writer.

        Args:
            file_path: Path to output file.
            output_format: Output format (auto-detected if None).
            buffer_size: Number of records to buffer before writing.
            flush_interval: Time interval (seconds) to flush buffer.
        """
        self.file_path = Path(file_path)
        self.buffer_size = buffer_size
        self.flush_interval = flush_interval

        # Auto-detect format from the output file name if not given.
        self.format = output_format or detect_format(self.file_path, for_output=True)

        self._buffer: deque = deque()
        self._file_handle: Optional[Any] = None
        self._csv_writer: Optional[csv.DictWriter] = None
        # Set for real in open(): calling asyncio.get_event_loop() here would
        # warn (and on newer Pythons fail) when the writer is constructed from
        # synchronous code with no running event loop.
        self._last_flush_time: float = 0.0
        self._is_first_write = True  # Tracks comma placement for JSON arrays.
        self.metrics = StreamMetrics()

    async def __aenter__(self):
        """Async context manager entry: opens the output file."""
        self.open()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: flushes and closes the file."""
        await self.close()

    def open(self):
        """Open the output file (and write the opening '[' for JSON arrays)."""
        if self.format == 'csv':
            # newline='' is required by the csv module for correct line endings.
            self._file_handle = open(self.file_path, 'w', newline='', encoding='utf-8')
        else:
            self._file_handle = open(self.file_path, 'w', encoding='utf-8')
            if self.format == 'json':
                self._file_handle.write('[')  # Start JSON array.

        self.metrics.start_time = asyncio.get_event_loop().time()
        self._last_flush_time = self.metrics.start_time

    async def write_chunk(self, chunk: StreamChunk) -> None:
        """Write a chunk of data to the file.

        Args:
            chunk: StreamChunk to write.
        """
        if not self._file_handle:
            self.open()

        # Add chunk data to the buffer.
        if isinstance(chunk.data, list):
            self._buffer.extend(chunk.data)
        else:
            self._buffer.append(chunk.data)

        # Flush when the buffer is full, the stream is ending, or the flush
        # interval has elapsed.
        now = asyncio.get_event_loop().time()
        should_flush = (
            len(self._buffer) >= self.buffer_size or
            chunk.is_last or
            (now - self._last_flush_time) > self.flush_interval
        )
        if should_flush:
            await self._flush_buffer()
            self._last_flush_time = now

        self.metrics.chunks_processed += 1

    async def _flush_buffer(self) -> None:
        """Flush all buffered records to the file in the configured format."""
        if not self._buffer or not self._file_handle:
            return

        if self.format == 'jsonl':
            self._flush_jsonl()
        elif self.format == 'csv':
            self._flush_csv()
        elif self.format == 'json':
            self._flush_json()
        elif self.format == 'text':
            self._flush_text()

        # Push buffered bytes to the OS.
        self._file_handle.flush()

        # Allow other tasks to run.
        await asyncio.sleep(0)

    def _flush_jsonl(self) -> None:
        """Write each buffered record as one JSON line."""
        while self._buffer:
            json.dump(self._buffer.popleft(), self._file_handle)
            self._file_handle.write('\n')
            self.metrics.items_processed += 1

    def _flush_csv(self) -> None:
        """Write buffered records as CSV rows, emitting a header first."""
        if self._csv_writer is None:
            # Field names come from the first record; later records are
            # assumed to share the same keys.
            fieldnames = list(self._buffer[0].keys())
            self._csv_writer = csv.DictWriter(
                self._file_handle,
                fieldnames=fieldnames,
                delimiter=get_csv_delimiter(self.file_path)
            )
            self._csv_writer.writeheader()

        while self._buffer:
            self._csv_writer.writerow(self._buffer.popleft())
            self.metrics.items_processed += 1

    def _flush_json(self) -> None:
        """Write buffered records as comma-separated JSON array elements."""
        while self._buffer:
            record = self._buffer.popleft()
            if not self._is_first_write:
                self._file_handle.write(',')
            json.dump(record, self._file_handle)
            self._is_first_write = False
            self.metrics.items_processed += 1

    def _flush_text(self) -> None:
        """Write buffered records as plain text lines."""
        while self._buffer:
            record = self._buffer.popleft()
            # NOTE(review): the key is hard-coded as 'text' here while the
            # reader's field name is configurable -- confirm this matches.
            if isinstance(record, dict):
                text = record.get('text', str(record))
            else:
                text = str(record)
            self._file_handle.write(text + '\n')
            self.metrics.items_processed += 1

    async def close(self) -> None:
        """Flush any remaining buffer and close the file."""
        if self._buffer:
            await self._flush_buffer()

        if self._file_handle:
            if self.format == 'json':
                self._file_handle.write(']')  # Close JSON array.
            self._file_handle.close()
            self._file_handle = None

        self.metrics.end_time = asyncio.get_event_loop().time()
class StreamingFileProcessor:
    """High-level streaming file processor combining reader and writer."""

    def __init__(
        self,
        input_path: Union[str, Path],
        output_path: Union[str, Path],
        transform_fn: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
        chunk_size: int = 1000,
        input_format: str = 'auto',
        output_format: Optional[str] = None
    ):
        """Initialize streaming file processor.

        Args:
            input_path: Input file path.
            output_path: Output file path.
            transform_fn: Optional transformation function for each record.
            chunk_size: Records per chunk.
            input_format: Input file format.
            output_format: Output file format (auto-detected if None).
        """
        self.reader = StreamingFileReader(
            input_path,
            chunk_size=chunk_size,
            input_format=input_format
        )
        self.writer = StreamingFileWriter(
            output_path,
            output_format=output_format,
            buffer_size=chunk_size
        )
        # Default to the identity transform when none is supplied.
        self.transform_fn = transform_fn or (lambda x: x)

    def _transform_records(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply the transform to each record, dropping failures and Nones.

        Failures are tallied in the reader's error count.
        """
        kept = []
        for record in records:
            try:
                result = self.transform_fn(record)
            except Exception:
                self.reader.metrics.errors_count += 1
                continue
            if result is not None:
                kept.append(result)
        return kept

    async def process(self, progress_callback: Optional[Callable[[int, int], None]] = None) -> StreamMetrics:
        """Process the file with streaming.

        Args:
            progress_callback: Optional callback for progress updates
                (items_processed, total_chunks).

        Returns:
            Combined metrics from processing.
        """
        async with self.writer:
            seen_items = 0

            async for incoming in self.reader.read_chunks():
                outgoing = self._transform_records(incoming.data)

                # Only forward chunks that still contain records.
                if outgoing:
                    await self.writer.write_chunk(StreamChunk(
                        data=outgoing,
                        sequence_number=incoming.sequence_number,
                        metadata=incoming.metadata,
                        is_last=incoming.is_last
                    ))

                seen_items += len(incoming.data)
                if progress_callback:
                    progress_callback(seen_items, self.reader._chunk_count)

        # Merge reader-side counters with the writer's end time.
        return StreamMetrics(
            chunks_processed=self.reader.metrics.chunks_processed,
            items_processed=self.reader.metrics.items_processed,
            errors_count=self.reader.metrics.errors_count,
            start_time=self.reader.metrics.start_time,
            end_time=self.writer.metrics.end_time
        )
479# Convenience functions for SimpleFSM integration
async def create_streaming_file_reader(
    file_path: Union[str, Path],
    config: StreamConfig,
    **kwargs
) -> AsyncIterator[List[Dict[str, Any]]]:
    """Create a streaming file reader compatible with SimpleFSM.

    Args:
        file_path: Input file path.
        config: Stream configuration.
        **kwargs: Additional reader parameters.

    Yields:
        Lists of records (chunks).
    """
    chunked_reader = StreamingFileReader(
        file_path,
        chunk_size=config.chunk_size,
        **kwargs
    )
    # Unwrap each StreamChunk down to its raw record list.
    async for piece in chunked_reader.read_chunks():
        yield piece.data
async def create_streaming_file_writer(
    file_path: Union[str, Path],
    config: StreamConfig,
    **kwargs
) -> Tuple[Callable, Callable]:
    """Create a streaming file writer compatible with SimpleFSM.

    Args:
        file_path: Output file path.
        config: Stream configuration.
        **kwargs: Additional writer parameters.

    Returns:
        Tuple of (write_fn, cleanup_fn).
    """
    sink = StreamingFileWriter(
        file_path,
        buffer_size=config.buffer_size,
        **kwargs
    )
    # Open eagerly so the first write_fn call does not pay the setup cost.
    sink.open()

    async def write_fn(results: List[Dict[str, Any]]) -> None:
        """Write one batch of results to the file."""
        await sink.write_chunk(StreamChunk(data=results))

    async def cleanup_fn() -> None:
        """Flush any remaining buffer and close the file."""
        await sink.close()

    return write_fn, cleanup_fn