Coverage for src/dataknobs_fsm/patterns/file_processing.py: 0%

306 statements  

coverage.py v7.10.6 report, created at 2025-09-20 16:46 -0600

1"""File processing pattern implementation. 

2 

3This module provides pre-configured FSM patterns for processing files, 

4including CSV, JSON, XML, and other formats with streaming support. 

5""" 

6 

7from typing import Any, Dict, List, Callable, AsyncIterator 

8from dataclasses import dataclass, field 

9from enum import Enum 

10from pathlib import Path 

11import re 

12from dataknobs_data import Record 

13 

14from ..api.simple import SimpleFSM 

15from dataknobs_fsm.core.data_modes import DataHandlingMode 

16from ..streaming.file_stream import FileStreamSource, FileStreamSink 

17from ..functions.library.validators import SchemaValidator 

18 

19 

class FileFormat(Enum):
    """Closed set of file formats this module knows how to handle."""

    JSON = "json"        # JSON documents and JSON-lines
    CSV = "csv"          # comma/tab separated values
    XML = "xml"          # XML documents
    PARQUET = "parquet"  # columnar parquet files
    TEXT = "text"        # plain line-oriented text
    BINARY = "binary"    # fallback: opaque, unparsed content

28 

29 

class ProcessingMode(Enum):
    """Strategies for consuming the input file."""

    STREAM = "stream"  # record-at-a-time streaming
    BATCH = "batch"    # fixed-size batches of records
    WHOLE = "whole"    # load the entire file into memory

35 

36 

@dataclass
class FileProcessingConfig:
    """Declarative settings that drive a FileProcessor.

    Field order is part of the positional-construction interface and
    must not be changed.
    """

    # --- input / output locations and sizing ---------------------------
    input_path: str
    output_path: str | None = None
    format: FileFormat | None = None  # auto-detected from extension when None
    mode: ProcessingMode = ProcessingMode.STREAM
    chunk_size: int = 1000
    parallel_chunks: int = 4
    encoding: str = "utf-8"

    # --- per-record processing hooks ------------------------------------
    validation_schema: Dict[str, Any] | None = None
    transformations: List[Callable] | None = None
    filters: List[Callable] | None = None
    aggregations: Dict[str, Callable] | None = None

    # --- output shaping --------------------------------------------------
    output_format: FileFormat | None = None  # defaults to the input format
    compression: str | None = None           # e.g. "gzip", "bz2"
    partition_by: str | None = None          # field used to partition output

    # --- format-specific option bags --------------------------------------
    json_config: Dict[str, Any] = field(default_factory=dict)
    log_config: Dict[str, Any] = field(default_factory=dict)

62 

63 

class FileProcessor:
    """File processor that drives a SimpleFSM through classic ETL stages.

    The configuration is compiled into an FSM whose states mirror
    read -> parse -> validate -> filter -> transform -> aggregate -> write,
    with unconfigured stages bypassed.  The file is then processed in
    streaming, batch, or whole-file mode according to ``config.mode``.
    """

    def __init__(self, config: FileProcessingConfig):
        """Initialize file processor.

        Args:
            config: File processing configuration.
        """
        self.config = config
        self._detect_format()  # may fill in config.format / output_format
        self._fsm = self._build_fsm()
        # Running counters returned by process().
        # NOTE(review): 'skipped' is declared but never incremented in
        # this class — confirm whether a caller relies on it.
        self._metrics = {
            'lines_read': 0,
            'records_processed': 0,
            'records_written': 0,
            'errors': 0,
            'skipped': 0
        }

    def _detect_format(self) -> None:
        """Auto-detect file format from the input path's extension.

        Only runs when ``config.format`` is unset; unknown extensions
        fall back to ``FileFormat.BINARY``.  Also defaults
        ``output_format`` to the (possibly detected) input format.
        """
        if not self.config.format:
            path = Path(self.config.input_path)
            ext = path.suffix.lower()

            format_map = {
                '.json': FileFormat.JSON,
                '.jsonl': FileFormat.JSON,
                '.csv': FileFormat.CSV,
                '.tsv': FileFormat.CSV,
                '.xml': FileFormat.XML,
                '.parquet': FileFormat.PARQUET,
                '.txt': FileFormat.TEXT
            }

            self.config.format = format_map.get(ext, FileFormat.BINARY)

        # Mirror the input format on output unless explicitly overridden.
        if not self.config.output_format:
            self.config.output_format = self.config.format

    def _build_fsm(self) -> SimpleFSM:
        """Build the FSM that implements the processing pipeline."""
        # Pick the FSM data-handling mode that matches the processing mode.
        if self.config.mode == ProcessingMode.STREAM:
            data_mode = DataHandlingMode.REFERENCE  # avoid copies while streaming
        elif self.config.mode == ProcessingMode.BATCH:
            data_mode = DataHandlingMode.COPY       # isolate records across workers
        else:
            data_mode = DataHandlingMode.DIRECT     # whole-file: operate in place

        # Declarative FSM description consumed by SimpleFSM.
        fsm_config = {
            'name': 'File_Processor',
            'data_mode': data_mode.value,
            'states': [
                {
                    'name': 'read',
                    'is_start': True
                },
                {
                    'name': 'parse',
                },
                {
                    'name': 'validate',
                },
                {
                    'name': 'filter',
                },
                {
                    'name': 'transform',
                },
                {
                    'name': 'aggregate',
                },
                {
                    'name': 'write',
                },
                {
                    'name': 'complete',
                    'is_end': True
                },
                {
                    'name': 'error',
                    'is_end': True
                }
            ],
            'arcs': self._build_arcs(),
            'functions': self._build_functions()
        }

        return SimpleFSM(fsm_config, data_mode=data_mode)

    def _build_arcs(self) -> List[Dict[str, Any]]:
        """Build FSM arcs based on the configured pipeline stages.

        Optional stages are wired in only when the matching hooks are
        configured; earlier arcs point straight at the next configured
        stage otherwise.

        NOTE(review): 'parse' targets 'filter' whenever no validation
        schema is set, even if no filters are configured — in that case
        'filter' (and similarly 'transform'/'aggregate') has no outgoing
        arc.  Confirm SimpleFSM tolerates states without outgoing arcs.
        """
        arcs = [
            {
                'from': 'read',
                'to': 'parse',
                'name': 'read_line'
            },
            {
                'from': 'parse',
                'to': 'validate' if self.config.validation_schema else 'filter',
                'name': 'parsed',
                'transform': {'type': 'inline', 'code': self._get_parser_code()}
            }
        ]

        # Validation: valid records continue; invalid ones end in 'error'.
        if self.config.validation_schema:
            arcs.extend([
                {
                    'from': 'validate',
                    'to': 'filter' if self.config.filters else 'transform',
                    'name': 'valid',
                    'condition': {'type': 'inline', 'code': self._get_validator_code()}
                },
                {
                    'from': 'validate',
                    'to': 'error',
                    'name': 'invalid'
                }
            ])

        # Filtering: records failing the predicates short-circuit to 'complete'.
        if self.config.filters:
            arcs.extend([
                {
                    'from': 'filter',
                    'to': 'transform' if self.config.transformations else 'aggregate',
                    'name': 'passed',
                    'condition': {'type': 'inline', 'code': self._get_filter_code()}
                },
                {
                    'from': 'filter',
                    'to': 'complete',
                    'name': 'filtered_out'
                }
            ])

        # Transformations feed either the aggregator or the writer.
        if self.config.transformations:
            next_state = 'aggregate' if self.config.aggregations else 'write'
            arcs.append({
                'from': 'transform',
                'to': next_state,
                'name': 'transformed',
                'transform': {'type': 'inline', 'code': self._get_transformer_code()}
            })

        # Aggregation always hands off to the writer.
        if self.config.aggregations:
            arcs.append({
                'from': 'aggregate',
                'to': 'write',
                'name': 'aggregated',
                'transform': {'type': 'inline', 'code': self._get_aggregator_code()}
            })

        # Writing always completes the run.
        arcs.extend([
            {
                'from': 'write',
                'to': 'complete',
                'name': 'written'
            }
        ])

        return arcs  # type: ignore

    def _build_functions(self) -> Dict[str, Any]:
        """Build the function registry referenced by the inline arc code.

        Filters register as ``filter_<i>``, transformations as
        ``transform_<i>``, aggregations as ``agg_<name>`` — the same
        names the ``_get_*_code`` generators emit.
        """
        functions = {}

        if self.config.filters:
            for i, filter_func in enumerate(self.config.filters):
                functions[f'filter_{i}'] = filter_func

        if self.config.transformations:
            for i, transform_func in enumerate(self.config.transformations):
                functions[f'transform_{i}'] = transform_func

        if self.config.aggregations:
            for agg_name, agg_func in self.config.aggregations.items():
                functions[f'agg_{agg_name}'] = agg_func

        # Format-specific parser used by the 'parse' stage.
        functions['parser'] = self._create_parser()

        return functions

    def _get_parser_code(self) -> str:
        """Return inline parser code (as a string) for the configured format.

        The string is evaluated by the FSM's inline-code executor with a
        ``data`` variable in scope.
        """
        if self.config.format == FileFormat.JSON:
            return "import json; json.loads(data) if isinstance(data, str) else data"
        elif self.config.format == FileFormat.CSV:
            return """
import csv
from io import StringIO
if isinstance(data, str):
    reader = csv.DictReader(StringIO(data))
    rows = list(reader)
    data = rows[0] if rows else {}
data
"""
        elif self.config.format == FileFormat.XML:
            return """
import xml.etree.ElementTree as ET
if isinstance(data, str):
    root = ET.fromstring(data)
    data = {child.tag: child.text for child in root}
data
"""
        else:
            # TEXT/PARQUET/BINARY: pass the record through unchanged.
            return "data"

    def _get_validator_code(self) -> str:
        """Generate an inline boolean expression from the validation schema.

        Supports per-field constraints: required, type, min, max, pattern;
        a bare ``True`` constraint means "required".
        NOTE(review): the 'pattern' branch emits a call to ``re.match`` —
        confirm the inline-execution context has ``re`` in scope.
        """
        if not self.config.validation_schema:
            return "True"

        validations = []
        for field_name, constraints in self.config.validation_schema.items():
            if isinstance(constraints, dict):
                if constraints.get('required'):
                    validations.append(f"'{field_name}' in data")
                if 'type' in constraints:
                    python_type = constraints['type']
                    validations.append(f"isinstance(data.get('{field_name}'), {python_type})")
                if 'min' in constraints:
                    validations.append(f"data.get('{field_name}', 0) >= {constraints['min']}")
                if 'max' in constraints:
                    validations.append(f"data.get('{field_name}', 0) <= {constraints['max']}")
                if 'pattern' in constraints:
                    validations.append(f"re.match(r'{constraints['pattern']}', str(data.get('{field_name}', '')))")
            elif constraints is True:  # shorthand: required field
                validations.append(f"'{field_name}' in data")

        return " and ".join(validations) if validations else "True"

    def _get_filter_code(self) -> str:
        """Generate an inline expression ANDing all registered filters."""
        if not self.config.filters:
            return "True"

        # A record must pass every filter_<i> registered in _build_functions.
        filter_conditions = []
        for i, _filter_func in enumerate(self.config.filters):
            filter_conditions.append(f"filter_{i}(data)")

        return " and ".join(filter_conditions) if filter_conditions else "True"

    def _get_transformer_code(self) -> str:
        """Generate inline code chaining the registered transformations.

        Produces ``transform_n(...transform_0(data)...)`` so each
        transformation receives the previous one's result.
        """
        if not self.config.transformations:
            return "data"

        result_code = "data"
        for i, _transform_func in enumerate(self.config.transformations):
            result_code = f"transform_{i}({result_code})"

        return result_code

    def _get_aggregator_code(self) -> str:
        """Generate inline code building a dict of aggregation results."""
        if not self.config.aggregations:
            return "data"

        aggregation_results = []
        for agg_name in self.config.aggregations.keys():
            aggregation_results.append(f"'{agg_name}': agg_{agg_name}(data)")

        if aggregation_results:
            return "{" + ", ".join(aggregation_results) + "}"
        else:
            return "data"

    def _create_parser(self) -> Callable:
        """Create a parser callable for the configured file format.

        Each parser accepts either a raw string (parsed) or an already
        structured object (passed through unchanged).
        """
        if self.config.format == FileFormat.JSON:
            import json
            return lambda data: json.loads(data) if isinstance(data, str) else data
        elif self.config.format == FileFormat.CSV:
            import csv
            from io import StringIO

            def parse_csv(data):
                if isinstance(data, str):
                    reader = csv.DictReader(StringIO(data))
                    # BUGFIX: a DictReader is always truthy, so the old
                    # ``next(iter(reader)) if reader else {}`` raised
                    # StopIteration on header-only/empty input.  Use
                    # next()'s default instead.
                    return next(iter(reader), {})
                return data
            return parse_csv
        elif self.config.format == FileFormat.XML:
            import xml.etree.ElementTree as ET

            def parse_xml(data):
                if isinstance(data, str):
                    root = ET.fromstring(data)
                    # Flatten one level: child tag -> text.
                    return {child.tag: child.text for child in root}
                return data
            return parse_xml
        else:
            return lambda data: data

    def _create_validator(self) -> Callable | None:
        """Create a schema-backed validator, or None when no schema is set.

        NOTE(review): this helper (and _create_filter/_create_transformer/
        _create_aggregator below) is never registered in _build_functions —
        confirm whether external callers use them.
        """
        if not self.config.validation_schema:
            return None

        validator = SchemaValidator(self.config.validation_schema)
        return lambda state: validator.validate(Record(state.data))  # type: ignore

    def _create_filter(self) -> Callable | None:
        """Create a combined filter callable, or None when no filters set."""
        if not self.config.filters:
            return None

        def apply_filters(state):
            # A record passes only if every configured filter accepts it.
            return all(filter_func(state.data) for filter_func in self.config.filters)

        return apply_filters

    def _create_transformer(self) -> Callable | None:
        """Create an async transformation chain, or None when none are set."""
        if not self.config.transformations:
            return None

        async def transform(data: Dict[str, Any]) -> Dict[str, Any]:
            result = data
            for transformer in self.config.transformations:
                # Objects exposing .transform are awaited; plain callables
                # are invoked synchronously.
                if hasattr(transformer, 'transform'):
                    result = await transformer.transform(result)
                elif callable(transformer):
                    result = transformer(result)
            return result

        return transform

    def _create_aggregator(self) -> Callable | None:
        """Create a stateful aggregator, or None when no aggregations set.

        The returned callable accumulates field values across calls and
        re-computes every aggregate on each invocation.
        """
        if not self.config.aggregations:
            return None

        # Per-key accumulation buffers, closed over by aggregate().
        agg_state = {key: [] for key in self.config.aggregations}

        def aggregate(data: Dict[str, Any]) -> Dict[str, Any]:
            # Accumulate values present in this record.
            for key in self.config.aggregations:
                if key in data:
                    agg_state[key].append(data[key])

            # Return aggregates over everything seen so far.
            return {
                key: self.config.aggregations[key](values)  # type: ignore
                for key, values in agg_state.items()
            }

        return aggregate

    async def process(self) -> Dict[str, Any]:
        """Process the file using the configured mode.

        Returns:
            Processing metrics (lines_read, records_processed,
            records_written, errors, skipped).
        """
        if self.config.mode == ProcessingMode.STREAM:
            return await self._process_stream()
        elif self.config.mode == ProcessingMode.BATCH:
            return await self._process_batch()
        else:
            return await self._process_whole()

    async def _process_stream(self) -> Dict[str, Any]:
        """Process the file record-by-record through stream source/sink."""
        source = FileStreamSource(
            self.config.input_path,
            chunk_size=self.config.chunk_size,
            encoding=self.config.encoding
        )

        # Only create a sink when an output destination was configured.
        sink = None
        if self.config.output_path:
            sink = FileStreamSink(
                self.config.output_path,
                encoding=self.config.encoding,
                compression=self.config.compression
            )

        # NOTE(review): process_stream is called synchronously inside an
        # async method — confirm it is not a coroutine function.
        result = self._fsm.process_stream(
            source=source,
            sink=sink,
            chunk_size=self.config.chunk_size,
            on_progress=self._update_progress
        )

        self._metrics.update(result)
        return self._metrics

    async def _process_batch(self) -> Dict[str, Any]:
        """Process the file in fixed-size batches of parsed records."""
        # Materialize all batches first, then dispatch them to the FSM.
        batches = []
        async for batch in self._read_batches():
            batches.append(batch)

        for batch in batches:
            results = self._fsm.process_batch(
                data=batch,  # type: ignore
                batch_size=self.config.chunk_size,
                max_workers=self.config.parallel_chunks
            )

            # Tally successes/failures; only records that reached the
            # 'complete' state count as written.
            for result in results:
                if result['success']:
                    self._metrics['records_processed'] += 1
                    if result['final_state'] == 'complete':
                        self._metrics['records_written'] += 1
                else:
                    self._metrics['errors'] += 1

        return self._metrics

    async def _process_whole(self) -> Dict[str, Any]:
        """Load the entire file into memory and process it in one pass."""
        with open(self.config.input_path, encoding=self.config.encoding) as f:
            content = f.read()

        # Parse the whole payload according to the configured format.
        if self.config.format == FileFormat.JSON:
            import json
            data = json.loads(content)
        elif self.config.format == FileFormat.CSV:
            import csv
            from io import StringIO
            reader = csv.DictReader(StringIO(content))
            data = list(reader)
        else:
            # Unparsed formats are wrapped so the FSM still sees a dict.
            data = {'content': content}

        # Lists are processed record-wise; single documents in one shot.
        if isinstance(data, list):
            results = self._fsm.process_batch(data)
        else:
            results = [self._fsm.process(data)]

        if self.config.output_path and results:
            await self._write_output(results)

        self._metrics['records_processed'] = len(results)
        self._metrics['records_written'] = sum(
            1 for r in results if r['success']
        )

        return self._metrics

    async def _read_batches(self) -> AsyncIterator[List[Dict[str, Any]]]:
        """Yield parsed records from the input file in chunk_size batches.

        JSON input is treated as JSON-lines; malformed lines increment the
        error counter and are skipped.  All other formats wrap each line
        as ``{'line': <stripped text>}``.
        """
        import json  # hoisted: previously imported inside the per-line loop

        batch = []
        with open(self.config.input_path, encoding=self.config.encoding) as f:
            for line in f:
                self._metrics['lines_read'] += 1

                if self.config.format == FileFormat.JSON:
                    try:
                        record = json.loads(line)
                        batch.append(record)
                    except json.JSONDecodeError:
                        self._metrics['errors'] += 1
                        continue
                else:
                    batch.append({'line': line.strip()})

                if len(batch) >= self.config.chunk_size:
                    yield batch
                    batch = []

        # Flush the trailing partial batch.
        if batch:
            yield batch

    async def _write_output(self, results: List[Dict[str, Any]]) -> None:
        """Write the data of all successful results to the output file.

        The caller guarantees ``config.output_path`` is set.
        """
        output_data = [r['data'] for r in results if r['success']]

        with open(self.config.output_path, 'w', encoding=self.config.encoding) as f:  # type: ignore
            if self.config.output_format == FileFormat.JSON:
                import json
                json.dump(output_data, f, indent=2)
            elif self.config.output_format == FileFormat.CSV:
                import csv
                if output_data:
                    # Column order comes from the first record's keys.
                    writer = csv.DictWriter(f, fieldnames=output_data[0].keys())
                    writer.writeheader()
                    writer.writerows(output_data)
            else:
                # Fallback: one stringified record per line.
                for item in output_data:
                    f.write(str(item) + '\n')

    def _update_progress(self, progress: Dict[str, Any]) -> None:
        """Merge a progress snapshot from the FSM into the metrics dict."""
        self._metrics.update(progress)

584 

585 

586# Factory functions for common file processing patterns 

587 

def create_csv_processor(
    input_file: str,
    output_file: str | None = None,
    transformations: List[Callable] | None = None,
    filters: List[Callable] | None = None
) -> FileProcessor:
    """Build a streaming processor for CSV input.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Where to write results; omit to skip writing.
        transformations: Per-record transformation callables.
        filters: Per-record predicate callables.

    Returns:
        A FileProcessor wired for streaming CSV processing.
    """
    return FileProcessor(
        FileProcessingConfig(
            input_path=input_file,
            output_path=output_file,
            format=FileFormat.CSV,
            mode=ProcessingMode.STREAM,
            transformations=transformations,
            filters=filters,
        )
    )

615 

616 

def create_json_stream_processor(
    input_file: str,
    output_file: str | None = None,
    validation_schema: Dict[str, Any] | None = None,
    chunk_size: int = 1000
) -> FileProcessor:
    """Build a streaming processor for JSON-lines input.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Where to write results; omit to skip writing.
        validation_schema: Optional per-field validation schema.
        chunk_size: Number of records processed per chunk.

    Returns:
        A FileProcessor wired for streaming JSONL processing.
    """
    return FileProcessor(
        FileProcessingConfig(
            input_path=input_file,
            output_path=output_file,
            format=FileFormat.JSON,
            mode=ProcessingMode.STREAM,
            chunk_size=chunk_size,
            validation_schema=validation_schema,
        )
    )

644 

645 

def create_log_analyzer(
    log_file: str,
    output_file: str | None = None,
    patterns: List[str] | None = None,
    aggregations: Dict[str, Callable] | None = None
) -> FileProcessor:
    """Build a streaming analyzer for plain-text log files.

    Args:
        log_file: Path of the log file to analyze.
        output_file: Optional path for the analysis output.
        patterns: Regex patterns whose named groups are merged into
            each record extracted from a log line.
        aggregations: Named aggregation callables.

    Returns:
        A FileProcessor configured for log analysis.
    """
    transformations: List[Callable] = []
    if patterns:
        def extract_patterns(record):
            # Enrich a copy of the record with named groups from every
            # matching pattern; the input record is left untouched.
            enriched = record.copy()
            line = record.get('line', '')
            for pat in patterns:
                hit = re.search(pat, line)
                if hit:
                    enriched.update(hit.groupdict())
            return enriched

        transformations.append(extract_patterns)

    config = FileProcessingConfig(
        input_path=log_file,
        output_path=output_file,
        format=FileFormat.TEXT,
        mode=ProcessingMode.STREAM,
        transformations=transformations,
        aggregations=aggregations,
    )
    return FileProcessor(config)

685 

686 

def create_file_processor(
    input_path: str,
    output_path: str,
    pattern: str = "*",
    mode: ProcessingMode = ProcessingMode.WHOLE,
    transformations: List[Callable] | None = None
) -> FileProcessor:
    """Build a generic text-file processor.

    Args:
        input_path: Input directory or file.
        output_path: Output directory or file.
        pattern: File pattern to match (accepted for interface
            compatibility; FileProcessingConfig has no matching field).
        mode: Processing mode.
        transformations: Data transformation callables.

    Returns:
        Configured FileProcessor.
    """
    return FileProcessor(
        FileProcessingConfig(
            input_path=input_path,
            output_path=output_path,
            format=FileFormat.TEXT,
            mode=mode,
            transformations=transformations or [],
        )
    )

716 

717 

def create_json_processor(
    input_path: str,
    output_path: str,
    pretty_print: bool = False,
    array_processing: bool = False
) -> FileProcessor:
    """Build a whole-file JSON processor.

    Args:
        input_path: Input directory.
        output_path: Output directory.
        pretty_print: Whether to pretty-print JSON output.
        array_processing: Whether to process input as JSON arrays.

    Returns:
        Configured FileProcessor.
    """
    # The two flags are carried in the json_config option bag.
    options = {
        'pretty_print': pretty_print,
        'array_processing': array_processing,
    }
    return FileProcessor(
        FileProcessingConfig(
            input_path=input_path,
            output_path=output_path,
            format=FileFormat.JSON,
            mode=ProcessingMode.WHOLE,
            json_config=options,
        )
    )

747 

748 

def create_log_processor(
    input_path: str,
    output_path: str,
    parse_timestamps: bool = False,
    extract_errors: bool = False
) -> FileProcessor:
    """Create log file processor.

    The two flags are stored in the config's ``log_config`` option bag;
    this factory does not interpret them itself.

    Args:
        input_path: Input directory
        output_path: Output directory
        parse_timestamps: Whether to parse timestamps
        extract_errors: Whether to extract error entries

    Returns:
        Configured FileProcessor
    """
    config = FileProcessingConfig(
        input_path=input_path,
        output_path=output_path,
        format=FileFormat.TEXT,
        mode=ProcessingMode.STREAM,
        log_config={
            'parse_timestamps': parse_timestamps,
            'extract_errors': extract_errors
        }
    )

    return FileProcessor(config)

779 

780 

def create_batch_file_processor(
    input_paths: List[str],
    output_path: str,
    patterns: List[str],
    batch_size: int = 10
) -> FileProcessor:
    """Create batch file processor.

    Args:
        input_paths: List of input directories (only the first is used;
            multi-path handling is not implemented yet).
        output_path: Output directory.
        patterns: File patterns to match (currently unused — see note).
        batch_size: Batch processing size, mapped to ``chunk_size``.

    Returns:
        Configured FileProcessor.
    """
    # BUGFIX: FileProcessingConfig defines neither ``pattern`` nor
    # ``batch_size``, so the previous keyword arguments raised TypeError
    # on every call (hidden by a ``# type: ignore``).  ``batch_size``
    # maps onto ``chunk_size``; ``patterns`` has no config counterpart
    # yet and is intentionally not forwarded.
    config = FileProcessingConfig(
        input_path=input_paths[0] if input_paths else "",
        output_path=output_path,
        format=FileFormat.TEXT,
        mode=ProcessingMode.BATCH,
        chunk_size=batch_size
    )

    return FileProcessor(config)