Coverage for src/dataknobs_fsm/utils/file_utils.py: 38%
111 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-16 20:47 -0600
"""File processing utilities for FSM.

This module provides utilities for reading and writing various file formats
in the context of FSM stream processing.
"""
7import csv
8import json
9from pathlib import Path
10from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
def detect_format(file_path: Union[str, Path], for_output: bool = False) -> str:
    """Infer a file format name from a path's extension.

    Args:
        file_path: Path to the file
        for_output: If True, detect output format (defaults to jsonl for unknown)

    Returns:
        One of 'jsonl', 'json', 'csv', or 'text'
    """
    extension = Path(file_path).suffix.lower()
    known_formats = {
        '.jsonl': 'jsonl',
        '.ndjson': 'jsonl',
        '.json': 'json',
        '.csv': 'csv',
        '.tsv': 'csv',
        '.txt': 'text',
        '.text': 'text',
        '.log': 'text',
    }
    # Unknown extensions default to jsonl for output, text for input.
    fallback = 'jsonl' if for_output else 'text'
    return known_formats.get(extension, fallback)
def get_csv_delimiter(file_path: Union[str, Path]) -> str:
    """Choose the delimiter implied by a CSV-family file extension.

    Args:
        file_path: Path to the file

    Returns:
        A tab for '.tsv' files, otherwise a comma
    """
    if Path(file_path).suffix.lower() == '.tsv':
        return '\t'
    return ','
async def create_file_reader(
    file_path: Union[str, Path],
    input_format: str = 'auto',
    text_field_name: str = 'text',
    csv_delimiter: str = ',',
    csv_has_header: bool = True,
    skip_empty_lines: bool = True
) -> AsyncIterator[Dict[str, Any]]:
    """Create an async iterator for reading files in various formats.

    Args:
        file_path: Path to the input file
        input_format: File format ('auto', 'jsonl', 'json', 'csv', 'text')
        text_field_name: Field name for text lines
        csv_delimiter: Delimiter for CSV files
        csv_has_header: Whether CSV has header row
        skip_empty_lines: Skip empty lines in text files

    Yields:
        Dictionaries representing each record from the file

    Raises:
        ValueError: If input format is not supported
    """
    path = Path(file_path)

    if input_format == 'auto':
        input_format = detect_format(path)
        # A .tsv file detected as CSV implies a tab delimiter.
        if input_format == 'csv' and path.suffix.lower() == '.tsv':
            csv_delimiter = '\t'

    # Select the format-specific reader, then stream its records.
    if input_format == 'jsonl':
        source = read_jsonl_file(path)
    elif input_format == 'json':
        source = read_json_file(path)
    elif input_format == 'csv':
        source = read_csv_file(path, csv_delimiter, csv_has_header)
    elif input_format == 'text':
        source = read_text_file(path, text_field_name, skip_empty_lines)
    else:
        raise ValueError(f"Unsupported input format: {input_format}")

    async for record in source:
        yield record
async def read_jsonl_file(file_path: Path) -> AsyncIterator[Dict[str, Any]]:
    """Read a JSONL (JSON Lines) file.

    Malformed lines are skipped silently so one bad record does not abort
    the stream.

    Args:
        file_path: Path to the JSONL file

    Yields:
        The parsed object from each valid, non-empty JSON line
    """
    # JSON text is UTF-8 by specification; an explicit encoding avoids
    # platform-dependent defaults (e.g. cp1252 on Windows).
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            if line.strip():
                try:
                    yield json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed JSON lines
                    continue
async def read_json_file(file_path: Path) -> AsyncIterator[Dict[str, Any]]:
    """Read a JSON file (single object or array).

    Args:
        file_path: Path to the JSON file

    Yields:
        Each element when the top-level value is a list, otherwise the
        value itself as a single record
    """
    # JSON text is UTF-8 by specification; don't rely on the locale default.
    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
    # The file is fully parsed above, so it is closed before yielding
    # rather than held open while the consumer iterates.
    if isinstance(data, list):
        for item in data:
            yield item
    else:
        yield data
async def read_csv_file(
    file_path: Path,
    delimiter: str = ',',
    has_header: bool = True
) -> AsyncIterator[Dict[str, Any]]:
    """Read a CSV file.

    Args:
        file_path: Path to the CSV file
        delimiter: CSV delimiter character
        has_header: Whether the CSV has a header row; when False, column
            names are synthesized as 'col_0', 'col_1', ...

    Yields:
        Dictionaries representing each row
    """
    # newline='' is required by the csv module; explicit UTF-8 avoids
    # platform-dependent default encodings.
    with open(file_path, newline='', encoding='utf-8') as f:
        if has_header:
            for row in csv.DictReader(f, delimiter=delimiter):
                yield row
        else:
            for values in csv.reader(f, delimiter=delimiter):
                yield {f'col_{i}': val for i, val in enumerate(values)}
async def read_text_file(
    file_path: Path,
    field_name: str = 'text',
    skip_empty: bool = True
) -> AsyncIterator[Dict[str, Any]]:
    """Read a plain text file line by line.

    Args:
        file_path: Path to the text file
        field_name: Field name to use for each line
        skip_empty: Skip lines that are empty once the line terminator
            is removed

    Yields:
        Dictionaries with each line as a field
    """
    # Explicit UTF-8 keeps behavior consistent across platforms.
    with open(file_path, encoding='utf-8') as f:
        for raw_line in f:
            # Strip only the line terminator; keep interior whitespace.
            line = raw_line.rstrip('\n\r')
            if line or not skip_empty:
                yield {field_name: line}
def create_file_writer(
    file_path: Union[str, Path],
    output_format: Optional[str] = None
) -> tuple[Callable[[List[Dict[str, Any]]], None], Optional[Callable[[], None]]]:
    """Create a file writer function for the specified format.

    Args:
        file_path: Path to the output file
        output_format: Output format (auto-detected if None)

    Returns:
        Tuple of (writer_function, cleanup_function)
        The cleanup_function is None for formats that don't need cleanup
    """
    path = Path(file_path)

    if output_format is None:
        # Auto-detect format from the extension.
        output_format = detect_format(path, for_output=True)

    if output_format == 'csv':
        return create_csv_writer(path, get_csv_delimiter(path))
    if output_format == 'json':
        return create_json_writer(path)
    # 'jsonl' and any unrecognized format fall back to JSONL appending,
    # which needs no cleanup step.
    return create_jsonl_writer(path), None
def create_jsonl_writer(file_path: Path) -> Callable[[List[Dict[str, Any]]], None]:
    """Create a JSONL writer function.

    Args:
        file_path: Path to the output file

    Returns:
        Writer function that appends one JSON object per line to the file
    """
    def write_jsonl(results: List[Dict[str, Any]]) -> None:
        # Project encoder, imported at call time (kept from the original
        # design — presumably to defer the dependency; confirm before moving).
        from dataknobs_fsm.utils.json_encoder import dumps
        # Append mode so successive batches accumulate; explicit UTF-8
        # avoids platform-dependent default encodings.
        with open(file_path, 'a', encoding='utf-8') as f:
            for result in results:
                f.write(dumps(result) + '\n')

    return write_jsonl
def create_csv_writer(
    file_path: Path,
    delimiter: str = ','
) -> tuple[Callable[[List[Dict[str, Any]]], None], Callable[[], None]]:
    """Create a CSV writer function with state management.

    The header row is derived from the keys of the first record written;
    later records are expected to use the same keys.

    Args:
        file_path: Path to the output file
        delimiter: CSV delimiter character

    Returns:
        Tuple of (writer_function, cleanup_function); cleanup_function
        must be called to close the underlying file handle
    """
    csv_writer: Optional[csv.DictWriter] = None
    csv_file: Optional[Any] = None

    def write_csv(results: List[Dict[str, Any]]) -> None:
        nonlocal csv_writer, csv_file

        if csv_file is None:
            # Opened lazily on the first batch; explicit UTF-8 avoids
            # platform-dependent defaults. Closed by cleanup().
            csv_file = open(file_path, 'w', newline='', encoding='utf-8')

        for result in results:
            if csv_writer is None:
                # Initialize CSV writer with fields from first result
                csv_writer = csv.DictWriter(
                    csv_file,
                    fieldnames=list(result.keys()),
                    delimiter=delimiter
                )
                csv_writer.writeheader()
            csv_writer.writerow(result)

    def cleanup() -> None:
        if csv_file is not None:
            csv_file.close()

    return write_csv, cleanup
def create_json_writer(
    file_path: Path
) -> tuple[Callable[[List[Dict[str, Any]]], None], Callable[[], None]]:
    """Create a JSON writer function that accumulates results.

    Records are buffered in memory and written as a single JSON array
    when the cleanup function is called.

    Args:
        file_path: Path to the output file

    Returns:
        Tuple of (writer_function, cleanup_function); cleanup_function
        must be called to actually write the file
    """
    all_results: List[Dict[str, Any]] = []

    def write_json(results: List[Dict[str, Any]]) -> None:
        # extend() mutates the buffer in place, so no nonlocal is needed.
        all_results.extend(results)

    def cleanup() -> None:
        # Write all results at once; explicit UTF-8 avoids
        # platform-dependent default encodings.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=2)

    return write_json, cleanup