Coverage for src/dataknobs_fsm/utils/file_utils.py: 38%

111 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-16 20:47 -0600

1"""File processing utilities for FSM. 

2 

3This module provides utilities for reading and writing various file formats 

4in the context of FSM stream processing. 

5""" 

6 

7import csv 

8import json 

9from pathlib import Path 

10from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union 

11 

12 

def detect_format(file_path: Union[str, Path], for_output: bool = False) -> str:
    """Detect file format from extension.

    Args:
        file_path: Path to the file
        for_output: If True, detect output format (defaults to jsonl for unknown)

    Returns:
        Detected format string
    """
    suffix = Path(file_path).suffix.lower()

    # Map of known extensions to their canonical format names.
    format_by_suffix = {
        '.jsonl': 'jsonl',
        '.ndjson': 'jsonl',
        '.json': 'json',
        '.csv': 'csv',
        '.tsv': 'csv',
        '.txt': 'text',
        '.text': 'text',
        '.log': 'text',
    }
    known = format_by_suffix.get(suffix)
    if known is not None:
        return known
    # Unknown extension: default to jsonl for output, text for input.
    return 'jsonl' if for_output else 'text'

37 

38 

def get_csv_delimiter(file_path: Union[str, Path]) -> str:
    """Get CSV delimiter based on file extension.

    Args:
        file_path: Path to the file

    Returns:
        Delimiter character
    """
    # Tab-separated files get a tab; everything else defaults to comma.
    if Path(file_path).suffix.lower() == '.tsv':
        return '\t'
    return ','

50 

51 

async def create_file_reader(
    file_path: Union[str, Path],
    input_format: str = 'auto',
    text_field_name: str = 'text',
    csv_delimiter: str = ',',
    csv_has_header: bool = True,
    skip_empty_lines: bool = True
) -> AsyncIterator[Dict[str, Any]]:
    """Create an async iterator for reading files in various formats.

    Args:
        file_path: Path to the input file
        input_format: File format ('auto', 'jsonl', 'json', 'csv', 'text')
        text_field_name: Field name for text lines
        csv_delimiter: Delimiter for CSV files
        csv_has_header: Whether CSV has a header row
        skip_empty_lines: Skip empty lines in text files

    Yields:
        Dictionaries representing each record from the file

    Raises:
        ValueError: If input format is not supported
    """
    path = Path(file_path)

    # Resolve 'auto' to a concrete format from the file extension.
    if input_format == 'auto':
        input_format = detect_format(path)
        # A .tsv extension implies a tab delimiter when auto-detecting.
        if input_format == 'csv' and path.suffix.lower() == '.tsv':
            csv_delimiter = '\t'

    # Pick the concrete reader, then delegate to it with a single loop.
    if input_format == 'jsonl':
        source = read_jsonl_file(path)
    elif input_format == 'json':
        source = read_json_file(path)
    elif input_format == 'csv':
        source = read_csv_file(path, csv_delimiter, csv_has_header)
    elif input_format == 'text':
        source = read_text_file(path, text_field_name, skip_empty_lines)
    else:
        raise ValueError(f"Unsupported input format: {input_format}")

    async for record in source:
        yield record

102 

103 

async def read_jsonl_file(file_path: Path) -> AsyncIterator[Dict[str, Any]]:
    """Read a JSONL (JSON Lines) file.

    Malformed JSON lines are skipped silently instead of aborting the
    stream, so a few corrupt records do not lose the rest of the file.

    Args:
        file_path: Path to the JSONL file

    Yields:
        Dictionaries from each valid JSON line
    """
    # Explicit UTF-8 avoids platform-dependent default encodings (PEP 597).
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed JSON lines
                continue

121 

122 

async def read_json_file(file_path: Path) -> AsyncIterator[Dict[str, Any]]:
    """Read a JSON file (single object or array).

    A top-level array is yielded item by item; any other top-level value
    is yielded once as a single record.

    Args:
        file_path: Path to the JSON file

    Yields:
        Dictionary or dictionaries from the JSON file
    """
    # Explicit UTF-8 avoids platform-dependent default encodings (PEP 597).
    # Load fully before yielding so the file handle is not held open
    # while the consumer is suspended between items.
    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
    if isinstance(data, list):
        for item in data:
            yield item
    else:
        yield data

139 

140 

async def read_csv_file(
    file_path: Path,
    delimiter: str = ',',
    has_header: bool = True
) -> AsyncIterator[Dict[str, Any]]:
    """Read a CSV file.

    Args:
        file_path: Path to the CSV file
        delimiter: CSV delimiter character
        has_header: Whether the CSV has a header row

    Yields:
        Dictionaries representing each row; without a header, columns
        are named 'col_0', 'col_1', ...
    """
    # newline='' is required for correct csv parsing; explicit UTF-8
    # avoids platform-dependent default encodings (PEP 597).
    with open(file_path, newline='', encoding='utf-8') as f:
        if has_header:
            for row in csv.DictReader(f, delimiter=delimiter):
                yield row
        else:
            for values in csv.reader(f, delimiter=delimiter):
                yield {f'col_{i}': val for i, val in enumerate(values)}

165 

166 

async def read_text_file(
    file_path: Path,
    field_name: str = 'text',
    skip_empty: bool = True
) -> AsyncIterator[Dict[str, Any]]:
    """Read a plain text file line by line.

    Args:
        file_path: Path to the text file
        field_name: Field name to use for each line
        skip_empty: Skip empty lines

    Yields:
        Dictionaries with each line as a field
    """
    # Explicit UTF-8 avoids platform-dependent default encodings (PEP 597).
    with open(file_path, encoding='utf-8') as f:
        for raw_line in f:
            # Strip only trailing newline characters, preserving any
            # other leading/trailing whitespace in the line.
            line = raw_line.rstrip('\n\r')
            if line or not skip_empty:
                yield {field_name: line}

187 

188 

def create_file_writer(
    file_path: Union[str, Path],
    output_format: Optional[str] = None
) -> tuple[Callable[[List[Dict[str, Any]]], None], Optional[Callable[[], None]]]:
    """Create a file writer function for the specified format.

    Args:
        file_path: Path to the output file
        output_format: Output format (auto-detected if None)

    Returns:
        Tuple of (writer_function, cleanup_function)
        The cleanup_function is None for formats that don't need cleanup
    """
    path = Path(file_path)

    # Auto-detect format from the extension if not specified.
    if output_format is None:
        output_format = detect_format(path, for_output=True)

    if output_format == 'csv':
        return create_csv_writer(path, get_csv_delimiter(path))

    if output_format == 'json':
        return create_json_writer(path)

    # 'jsonl' and any unrecognized format fall back to JSONL, which
    # appends incrementally and therefore needs no cleanup step.
    return create_jsonl_writer(path), None

222 

223 

def create_jsonl_writer(file_path: Path) -> Callable[[List[Dict[str, Any]]], None]:
    """Create a JSONL writer function.

    Args:
        file_path: Path to the output file

    Returns:
        Writer function that appends to JSONL file
    """
    def write_jsonl(results: List[Dict[str, Any]]) -> None:
        # Imported locally; presumably to defer a project-internal
        # dependency until first write — confirm before moving it.
        from dataknobs_fsm.utils.json_encoder import dumps
        # Append mode lets successive batches accumulate; explicit UTF-8
        # keeps output consistent across platforms (PEP 597).
        with open(file_path, 'a', encoding='utf-8') as f:
            for result in results:
                f.write(dumps(result) + '\n')

    return write_jsonl

240 

241 

def create_csv_writer(
    file_path: Path,
    delimiter: str = ','
) -> tuple[Callable[[List[Dict[str, Any]]], None], Callable[[], None]]:
    """Create a CSV writer function with state management.

    The output file and header row are created lazily on the first
    write; the header fields come from the keys of the first result.

    Args:
        file_path: Path to the output file
        delimiter: CSV delimiter character

    Returns:
        Tuple of (writer_function, cleanup_function)
    """
    csv_writer: Optional[csv.DictWriter] = None
    csv_file: Optional[Any] = None

    def write_csv(results: List[Dict[str, Any]]) -> None:
        nonlocal csv_writer, csv_file

        if csv_file is None:
            # newline='' is required by the csv module; explicit UTF-8
            # avoids platform-dependent default encodings (PEP 597).
            csv_file = open(file_path, 'w', newline='', encoding='utf-8')

        for result in results:
            if csv_writer is None:
                # Initialize CSV writer with fields from the first result
                csv_writer = csv.DictWriter(
                    csv_file,
                    fieldnames=list(result.keys()),
                    delimiter=delimiter
                )
                csv_writer.writeheader()
            csv_writer.writerow(result)

    def cleanup() -> None:
        # Flush and close the file if anything was ever written.
        if csv_file is not None:
            csv_file.close()

    return write_csv, cleanup

281 

282 

def create_json_writer(
    file_path: Path
) -> tuple[Callable[[List[Dict[str, Any]]], None], Callable[[], None]]:
    """Create a JSON writer function that accumulates results.

    Results are buffered in memory and written as a single JSON array
    when the cleanup function runs, so nothing reaches disk until then.

    Args:
        file_path: Path to the output file

    Returns:
        Tuple of (writer_function, cleanup_function)
    """
    all_results: List[Dict[str, Any]] = []

    def write_json(results: List[Dict[str, Any]]) -> None:
        # extend() mutates the list in place, so no `nonlocal` is needed.
        all_results.extend(results)

    def cleanup() -> None:
        # Write all results at once; explicit UTF-8 avoids
        # platform-dependent default encodings (PEP 597).
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=2)

    return write_json, cleanup