Coverage for src/dataknobs_fsm/patterns/file_processing.py: 0%
306 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:46 -0600
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 16:46 -0600
1"""File processing pattern implementation.
3This module provides pre-configured FSM patterns for processing files,
4including CSV, JSON, XML, and other formats with streaming support.
5"""
7from typing import Any, Dict, List, Callable, AsyncIterator
8from dataclasses import dataclass, field
9from enum import Enum
10from pathlib import Path
11import re
12from dataknobs_data import Record
14from ..api.simple import SimpleFSM
15from dataknobs_fsm.core.data_modes import DataHandlingMode
16from ..streaming.file_stream import FileStreamSource, FileStreamSink
17from ..functions.library.validators import SchemaValidator
class FileFormat(Enum):
    """Supported file formats.

    Values are used both for input parsing and (via ``output_format``)
    for serializing processed results.
    """
    JSON = "json"        # JSON documents or JSON-lines (.json / .jsonl)
    CSV = "csv"          # delimited text; .tsv also maps here
    XML = "xml"          # XML documents (parsed with ElementTree)
    PARQUET = "parquet"  # columnar parquet files
    TEXT = "text"        # plain text, one record per line
    BINARY = "binary"    # fallback for unrecognized extensions
class ProcessingMode(Enum):
    """File processing modes.

    Selects both the read strategy and the FSM data-handling mode
    (see ``FileProcessor._build_fsm``).
    """
    STREAM = "stream"  # Process file as stream (record at a time)
    BATCH = "batch"    # Process in batches of ``chunk_size`` records
    WHOLE = "whole"    # Load entire file into memory, then process
@dataclass
class FileProcessingConfig:
    """Configuration for file processing.

    Only ``input_path`` is required; everything else has a sensible
    default.  ``format`` / ``output_format`` are auto-detected from the
    input file extension when left as ``None``.
    """
    input_path: str                       # path of the file to process
    output_path: str | None = None        # where to write results; None = no output
    format: FileFormat | None = None  # Auto-detect if not specified
    mode: ProcessingMode = ProcessingMode.STREAM  # stream / batch / whole-file
    chunk_size: int = 1000                # records per batch / stream chunk
    parallel_chunks: int = 4              # max workers for batch processing

    encoding: str = "utf-8"               # text encoding for reads and writes

    # Processing options
    validation_schema: Dict[str, Any] | None = None  # field -> constraints (required/type/min/max/pattern)
    transformations: List[Callable] | None = None    # applied in order, each fed the previous result
    filters: List[Callable] | None = None            # record passes only if ALL filters return truthy
    aggregations: Dict[str, Callable] | None = None  # name -> reducer over accumulated values

    # Output options
    output_format: FileFormat | None = None  # defaults to the (detected) input format
    compression: str | None = None  # gzip, bz2, etc.
    partition_by: str | None = None  # Field to partition output
    # NOTE(review): partition_by is not consumed anywhere in this module — confirm usage.

    # Format-specific configs
    # NOTE(review): json_config/log_config are populated by factory helpers
    # but not read by FileProcessor in this module — confirm downstream use.
    json_config: Dict[str, Any] = field(default_factory=dict)
    log_config: Dict[str, Any] = field(default_factory=dict)
class FileProcessor:
    """File processor driven by an FSM pipeline.

    Builds a ``read -> parse -> validate -> filter -> transform ->
    aggregate -> write`` state machine from a :class:`FileProcessingConfig`
    (optional stages are wired in only when configured) and runs it in
    streaming, batch, or whole-file mode.
    """

    def __init__(self, config: FileProcessingConfig):
        """Initialize file processor.

        Args:
            config: File processing configuration.
        """
        self.config = config
        self._detect_format()
        self._fsm = self._build_fsm()
        # Running counters accumulated while processing; returned by process().
        self._metrics = {
            'lines_read': 0,
            'records_processed': 0,
            'records_written': 0,
            'errors': 0,
            'skipped': 0
        }

    def _detect_format(self) -> None:
        """Auto-detect file format if not specified.

        Uses the input path's extension; unknown extensions fall back to
        ``FileFormat.BINARY``.  Also mirrors the (possibly detected) input
        format onto ``output_format`` when that was not set explicitly.
        """
        if not self.config.format:
            path = Path(self.config.input_path)
            ext = path.suffix.lower()

            format_map = {
                '.json': FileFormat.JSON,
                '.jsonl': FileFormat.JSON,
                '.csv': FileFormat.CSV,
                # NOTE(review): .tsv maps to CSV but downstream parsing uses
                # csv's default comma dialect — confirm tab handling.
                '.tsv': FileFormat.CSV,
                '.xml': FileFormat.XML,
                '.parquet': FileFormat.PARQUET,
                '.txt': FileFormat.TEXT
            }

            self.config.format = format_map.get(ext, FileFormat.BINARY)

        # Set output format if not specified
        if not self.config.output_format:
            self.config.output_format = self.config.format

    def _build_fsm(self) -> SimpleFSM:
        """Build FSM for file processing.

        Returns:
            A :class:`SimpleFSM` whose data-handling mode matches the
            processing mode (REFERENCE for streaming, COPY for batch
            isolation, DIRECT for whole-file).
        """
        # Determine data mode based on processing mode
        if self.config.mode == ProcessingMode.STREAM:
            data_mode = DataHandlingMode.REFERENCE  # Use reference for streaming
        elif self.config.mode == ProcessingMode.BATCH:
            data_mode = DataHandlingMode.COPY  # Use copy for batch isolation
        else:
            data_mode = DataHandlingMode.DIRECT  # Use direct for whole file

        # Create FSM configuration
        fsm_config = {
            'name': 'File_Processor',
            'data_mode': data_mode.value,
            'states': [
                {
                    'name': 'read',
                    'is_start': True
                },
                {
                    'name': 'parse',
                },
                {
                    'name': 'validate',
                },
                {
                    'name': 'filter',
                },
                {
                    'name': 'transform',
                },
                {
                    'name': 'aggregate',
                },
                {
                    'name': 'write',
                },
                {
                    'name': 'complete',
                    'is_end': True
                },
                {
                    'name': 'error',
                    'is_end': True
                }
            ],
            'arcs': self._build_arcs(),
            'functions': self._build_functions()
        }

        return SimpleFSM(fsm_config, data_mode=data_mode)

    def _build_arcs(self) -> List[Dict[str, Any]]:
        """Build FSM arcs based on configuration.

        Optional stages (validate/filter/transform/aggregate) are skipped
        in the arc graph when their configuration is absent, so the
        pipeline jumps straight to the next configured stage.

        Returns:
            List of arc definitions for the FSM config.
        """
        arcs = [
            {
                'from': 'read',
                'to': 'parse',
                'name': 'read_line'
            },
            {
                'from': 'parse',
                'to': 'validate' if self.config.validation_schema else 'filter',
                'name': 'parsed',
                'transform': {'type': 'inline', 'code': self._get_parser_code()}
            }
        ]

        # Add validation arc if schema provided
        if self.config.validation_schema:
            arcs.extend([
                {
                    'from': 'validate',
                    'to': 'filter' if self.config.filters else 'transform',
                    'name': 'valid',
                    'condition': {'type': 'inline', 'code': self._get_validator_code()}
                },
                {
                    'from': 'validate',
                    'to': 'error',
                    'name': 'invalid'
                }
            ])

        # Add filter arc if filters provided
        if self.config.filters:
            arcs.extend([
                {
                    'from': 'filter',
                    'to': 'transform' if self.config.transformations else 'aggregate',
                    'name': 'passed',
                    'condition': {'type': 'inline', 'code': self._get_filter_code()}
                },
                {
                    # Filtered-out records end in 'complete' (not 'error'),
                    # i.e. they are dropped without counting as failures.
                    'from': 'filter',
                    'to': 'complete',
                    'name': 'filtered_out'
                }
            ])

        # Add transformation arc if transformations provided
        if self.config.transformations:
            next_state = 'aggregate' if self.config.aggregations else 'write'
            arcs.append({
                'from': 'transform',
                'to': next_state,
                'name': 'transformed',
                'transform': {'type': 'inline', 'code': self._get_transformer_code()}
            })

        # Add aggregation arc if aggregations provided
        if self.config.aggregations:
            arcs.append({
                'from': 'aggregate',
                'to': 'write',
                'name': 'aggregated',
                'transform': {'type': 'inline', 'code': self._get_aggregator_code()}
            })

        # Add write arc
        arcs.extend([
            {
                'from': 'write',
                'to': 'complete',
                'name': 'written'
            }
        ])

        return arcs  # type: ignore

    def _build_functions(self) -> Dict[str, Any]:
        """Build functions registry for FSM.

        Registers user-supplied callables under the synthetic names
        (``filter_i``, ``transform_i``, ``agg_<name>``) referenced by the
        inline code generated in ``_get_*_code``.

        Returns:
            Mapping of function name to callable.
        """
        functions = {}

        # Register filter functions
        if self.config.filters:
            for i, filter_func in enumerate(self.config.filters):
                functions[f'filter_{i}'] = filter_func

        # Register transformation functions
        if self.config.transformations:
            for i, transform_func in enumerate(self.config.transformations):
                functions[f'transform_{i}'] = transform_func

        # Register aggregation functions
        if self.config.aggregations:
            for agg_name, agg_func in self.config.aggregations.items():
                functions[f'agg_{agg_name}'] = agg_func

        # Register parser function
        functions['parser'] = self._create_parser()

        return functions

    def _get_parser_code(self) -> str:
        """Get inline parser code for the configured file format.

        Returns:
            Source text executed by the FSM's inline-code engine; the
            parsed record is left in ``data``.
        """
        if self.config.format == FileFormat.JSON:
            # NOTE(review): this is a statement sequence whose value depends
            # on how the inline-code engine evaluates it — confirm the
            # engine exposes the last expression's value.
            return "import json; json.loads(data) if isinstance(data, str) else data"
        elif self.config.format == FileFormat.CSV:
            return """
import csv
from io import StringIO
if isinstance(data, str):
    reader = csv.DictReader(StringIO(data))
    rows = list(reader)
    data = rows[0] if rows else {}
data
"""
        elif self.config.format == FileFormat.XML:
            return """
import xml.etree.ElementTree as ET
if isinstance(data, str):
    root = ET.fromstring(data)
    data = {child.tag: child.text for child in root}
data
"""
        else:
            return "data"

    def _get_validator_code(self) -> str:
        """Get inline validator code generated from the validation schema.

        Supported per-field constraints: ``required``, ``type``, ``min``,
        ``max``, ``pattern``; a bare ``True`` marks the field required.

        Returns:
            A boolean expression over ``data`` ("True" when no schema).
        """
        if not self.config.validation_schema:
            return "True"

        # Generate validation code based on schema
        validations = []
        for field_name, constraints in self.config.validation_schema.items():
            if isinstance(constraints, dict):
                if constraints.get('required'):
                    validations.append(f"'{field_name}' in data")
                if 'type' in constraints:
                    python_type = constraints['type']
                    # NOTE(review): the type name (and `re` below) must be
                    # resolvable in the inline-execution scope — confirm.
                    validations.append(f"isinstance(data.get('{field_name}'), {python_type})")
                if 'min' in constraints:
                    validations.append(f"data.get('{field_name}', 0) >= {constraints['min']}")
                if 'max' in constraints:
                    validations.append(f"data.get('{field_name}', 0) <= {constraints['max']}")
                if 'pattern' in constraints:
                    validations.append(f"re.match(r'{constraints['pattern']}', str(data.get('{field_name}', '')))")
            elif constraints is True:  # Required field
                validations.append(f"'{field_name}' in data")

        return " and ".join(validations) if validations else "True"

    def _get_filter_code(self) -> str:
        """Get inline filter code.

        Returns:
            A conjunction of ``filter_i(data)`` calls; a record must pass
            every registered filter ("True" when none are configured).
        """
        if not self.config.filters:
            return "True"

        # Apply all filters in sequence - data must pass all filters
        filter_conditions = []
        for i, _filter_func in enumerate(self.config.filters):
            # Use registered function name
            filter_conditions.append(f"filter_{i}(data)")

        return " and ".join(filter_conditions) if filter_conditions else "True"

    def _get_transformer_code(self) -> str:
        """Get inline transformer code.

        Returns:
            Nested calls ``transform_n(...transform_0(data)...)`` so each
            transformation receives the previous result ("data" when none).
        """
        if not self.config.transformations:
            return "data"

        # Apply transformations in sequence
        result_code = "data"
        for i, _transform_func in enumerate(self.config.transformations):
            # Each transformation receives the result of the previous one
            result_code = f"transform_{i}({result_code})"

        return result_code

    def _get_aggregator_code(self) -> str:
        """Get inline aggregator code.

        Returns:
            A dict-literal expression mapping each aggregation name to
            ``agg_<name>(data)`` ("data" when no aggregations).
        """
        if not self.config.aggregations:
            return "data"

        # Apply aggregations to create summary data
        aggregation_results = []
        for agg_name in self.config.aggregations.keys():
            aggregation_results.append(f"'{agg_name}': agg_{agg_name}(data)")

        if aggregation_results:
            return "{" + ", ".join(aggregation_results) + "}"
        else:
            return "data"

    def _create_parser(self) -> Callable:
        """Create a parser callable for the configured file format.

        Returns:
            A callable that converts a raw string into a record dict;
            non-string input is passed through unchanged.
        """
        if self.config.format == FileFormat.JSON:
            import json
            return lambda data: json.loads(data) if isinstance(data, str) else data
        elif self.config.format == FileFormat.CSV:
            import csv
            from io import StringIO

            def parse_csv(data):
                if isinstance(data, str):
                    reader = csv.DictReader(StringIO(data))
                    # BUG FIX: `next(iter(reader)) if reader else {}` raised
                    # StopIteration on header-only/empty input because a
                    # DictReader instance is always truthy.  Use next()'s
                    # default so the empty case yields {} — matching the
                    # inline CSV parser in _get_parser_code.
                    return next(iter(reader), {})
                return data
            return parse_csv
        elif self.config.format == FileFormat.XML:
            import xml.etree.ElementTree as ET

            def parse_xml(data):
                if isinstance(data, str):
                    root = ET.fromstring(data)
                    # Flatten one level: child tag -> text.
                    return {child.tag: child.text for child in root}
                return data
            return parse_xml
        else:
            return lambda data: data

    def _create_validator(self) -> Callable | None:
        """Create validator function from the schema, or None if unset."""
        if not self.config.validation_schema:
            return None

        validator = SchemaValidator(self.config.validation_schema)
        return lambda state: validator.validate(Record(state.data))  # type: ignore

    def _create_filter(self) -> Callable | None:
        """Create a combined filter function, or None when no filters.

        The returned callable ANDs every configured filter over state.data.
        """
        if not self.config.filters:
            return None

        def apply_filters(state):
            return all(filter_func(state.data) for filter_func in self.config.filters)

        return apply_filters

    def _create_transformer(self) -> Callable | None:
        """Create a transformation pipeline coroutine, or None when unset.

        Supports both objects exposing an async ``transform`` method and
        plain callables, applied in configuration order.
        """
        if not self.config.transformations:
            return None

        async def transform(data: Dict[str, Any]) -> Dict[str, Any]:
            result = data
            for transformer in self.config.transformations:
                if hasattr(transformer, 'transform'):
                    result = await transformer.transform(result)
                elif callable(transformer):
                    result = transformer(result)
            return result

        return transform

    def _create_aggregator(self) -> Callable | None:
        """Create an accumulating aggregation function, or None when unset.

        The closure keeps per-key value lists across calls, so each call
        returns aggregates over everything seen so far.
        """
        if not self.config.aggregations:
            return None

        # Store aggregation state
        agg_state = {key: [] for key in self.config.aggregations}

        def aggregate(data: Dict[str, Any]) -> Dict[str, Any]:
            # Accumulate values
            for key in self.config.aggregations:
                if key in data:
                    agg_state[key].append(data[key])

            # Return aggregated results
            return {
                key: self.config.aggregations[key](values)  # type: ignore
                for key, values in agg_state.items()
            }

        return aggregate

    async def process(self) -> Dict[str, Any]:
        """Process the file according to the configured mode.

        Returns:
            Processing metrics (lines_read, records_processed,
            records_written, errors, skipped).
        """
        if self.config.mode == ProcessingMode.STREAM:
            return await self._process_stream()
        elif self.config.mode == ProcessingMode.BATCH:
            return await self._process_batch()
        else:
            return await self._process_whole()

    async def _process_stream(self) -> Dict[str, Any]:
        """Process file as stream.

        Returns:
            Processing metrics merged with the stream result.
        """
        # Create stream source
        source = FileStreamSource(
            self.config.input_path,
            chunk_size=self.config.chunk_size,
            encoding=self.config.encoding
        )

        # Create stream sink if output specified
        sink = None
        if self.config.output_path:
            sink = FileStreamSink(
                self.config.output_path,
                encoding=self.config.encoding,
                compression=self.config.compression
            )

        # Process stream
        # NOTE(review): process_stream is called synchronously inside an
        # async method — confirm it is not a coroutine that needs awaiting.
        result = self._fsm.process_stream(
            source=source,
            sink=sink,
            chunk_size=self.config.chunk_size,
            on_progress=self._update_progress
        )

        self._metrics.update(result)
        return self._metrics

    async def _process_batch(self) -> Dict[str, Any]:
        """Process file in batches.

        Returns:
            Processing metrics.
        """
        # Process each batch as soon as it is read, instead of
        # materializing every batch in memory first (behavior-identical,
        # but bounds memory to one batch).
        async for batch in self._read_batches():
            results = self._fsm.process_batch(
                data=batch,  # type: ignore
                batch_size=self.config.chunk_size,
                max_workers=self.config.parallel_chunks
            )

            # Update metrics
            for result in results:
                if result['success']:
                    self._metrics['records_processed'] += 1
                    if result['final_state'] == 'complete':
                        self._metrics['records_written'] += 1
                else:
                    self._metrics['errors'] += 1

        return self._metrics

    async def _process_whole(self) -> Dict[str, Any]:
        """Process entire file at once.

        Returns:
            Processing metrics.
        """
        # Read entire file
        with open(self.config.input_path, encoding=self.config.encoding) as f:
            content = f.read()

        # Parse content
        if self.config.format == FileFormat.JSON:
            import json
            data = json.loads(content)
        elif self.config.format == FileFormat.CSV:
            import csv
            from io import StringIO
            reader = csv.DictReader(StringIO(content))
            data = list(reader)
        else:
            # Unstructured formats are wrapped in a single record.
            data = {'content': content}

        # Process data
        if isinstance(data, list):
            results = self._fsm.process_batch(data)
        else:
            results = [self._fsm.process(data)]

        # Write output if specified
        if self.config.output_path and results:
            await self._write_output(results)

        self._metrics['records_processed'] = len(results)
        self._metrics['records_written'] = sum(
            1 for r in results if r['success']
        )

        return self._metrics

    async def _read_batches(self) -> AsyncIterator[List[Dict[str, Any]]]:
        """Read file in batches of ``chunk_size`` records.

        JSON input is parsed line-by-line (JSONL); malformed lines bump
        the ``errors`` metric and are skipped.  Other formats yield
        ``{'line': <stripped text>}`` records.

        Yields:
            Lists of record dicts, at most ``chunk_size`` long.
        """
        # Hoisted out of the per-line loop (loop-invariant).
        import json

        batch = []
        with open(self.config.input_path, encoding=self.config.encoding) as f:
            for line in f:
                self._metrics['lines_read'] += 1

                # Parse line based on format
                if self.config.format == FileFormat.JSON:
                    try:
                        record = json.loads(line)
                        batch.append(record)
                    except json.JSONDecodeError:
                        self._metrics['errors'] += 1
                        continue
                else:
                    batch.append({'line': line.strip()})

                if len(batch) >= self.config.chunk_size:
                    yield batch
                    batch = []

        # Flush the final partial batch.
        if batch:
            yield batch

    async def _write_output(self, results: List[Dict[str, Any]]) -> None:
        """Write successful results to the configured output file.

        Args:
            results: Per-record FSM results; only entries with
                ``success`` truthy are written.
        """
        output_data = [r['data'] for r in results if r['success']]

        with open(self.config.output_path, 'w', encoding=self.config.encoding) as f:  # type: ignore
            if self.config.output_format == FileFormat.JSON:
                import json
                json.dump(output_data, f, indent=2)
            elif self.config.output_format == FileFormat.CSV:
                import csv
                if output_data:
                    # Header is taken from the first record's keys.
                    writer = csv.DictWriter(f, fieldnames=output_data[0].keys())
                    writer.writeheader()
                    writer.writerows(output_data)
            else:
                for item in output_data:
                    f.write(str(item) + '\n')

    def _update_progress(self, progress: Dict[str, Any]) -> None:
        """Merge streaming progress info into the metrics dict."""
        self._metrics.update(progress)
586# Factory functions for common file processing patterns
def create_csv_processor(
    input_file: str,
    output_file: str | None = None,
    transformations: List[Callable] | None = None,
    filters: List[Callable] | None = None
) -> FileProcessor:
    """Build a streaming FileProcessor pre-configured for CSV input.

    Args:
        input_file: Input CSV file path.
        output_file: Optional output file path.
        transformations: Data transformations applied per record.
        filters: Row filters; a row must pass all of them.

    Returns:
        Configured FileProcessor.
    """
    return FileProcessor(
        FileProcessingConfig(
            input_path=input_file,
            output_path=output_file,
            format=FileFormat.CSV,
            mode=ProcessingMode.STREAM,
            transformations=transformations,
            filters=filters,
        )
    )
def create_json_stream_processor(
    input_file: str,
    output_file: str | None = None,
    validation_schema: Dict[str, Any] | None = None,
    chunk_size: int = 1000
) -> FileProcessor:
    """Build a streaming FileProcessor for JSON-lines input.

    Args:
        input_file: Input JSONL file path.
        output_file: Optional output file path.
        validation_schema: Per-field constraints used for validation.
        chunk_size: Number of records per processing chunk.

    Returns:
        Configured FileProcessor.
    """
    return FileProcessor(
        FileProcessingConfig(
            input_path=input_file,
            output_path=output_file,
            format=FileFormat.JSON,
            mode=ProcessingMode.STREAM,
            chunk_size=chunk_size,
            validation_schema=validation_schema,
        )
    )
def create_log_analyzer(
    log_file: str,
    output_file: str | None = None,
    patterns: List[str] | None = None,
    aggregations: Dict[str, Callable] | None = None
) -> FileProcessor:
    """Build a streaming FileProcessor that analyzes a text log file.

    When regex ``patterns`` are given, a transformation is installed that
    runs each pattern against the record's ``line`` field and merges any
    named groups into the record.

    Args:
        log_file: Log file path.
        output_file: Optional analysis output path.
        patterns: Regex patterns with named groups to extract.
        aggregations: Aggregation functions keyed by result name.

    Returns:
        Configured FileProcessor.
    """
    transformations = []
    if patterns:
        def extract_patterns(data):
            # Copy so the incoming record is never mutated in place.
            enriched = dict(data)
            for pat in patterns:
                hit = re.search(pat, data.get('line', ''))
                if hit:
                    enriched.update(hit.groupdict())
            return enriched

        transformations.append(extract_patterns)

    cfg = FileProcessingConfig(
        input_path=log_file,
        output_path=output_file,
        format=FileFormat.TEXT,
        mode=ProcessingMode.STREAM,
        transformations=transformations,
        aggregations=aggregations,
    )
    return FileProcessor(cfg)
def create_file_processor(
    input_path: str,
    output_path: str,
    pattern: str = "*",
    mode: ProcessingMode = ProcessingMode.WHOLE,
    transformations: List[Callable] | None = None
) -> FileProcessor:
    """Build a generic text FileProcessor.

    Args:
        input_path: Input directory or file.
        output_path: Output directory or file.
        pattern: File pattern to match (accepted for API compatibility,
            currently unused — FileProcessingConfig has no matching field).
        mode: Processing mode.
        transformations: Data transformation functions.

    Returns:
        Configured FileProcessor.
    """
    cfg = FileProcessingConfig(
        input_path=input_path,
        output_path=output_path,
        format=FileFormat.TEXT,
        mode=mode,
        transformations=transformations or [],
    )
    return FileProcessor(cfg)
def create_json_processor(
    input_path: str,
    output_path: str,
    pretty_print: bool = False,
    array_processing: bool = False
) -> FileProcessor:
    """Build a whole-file JSON FileProcessor.

    Args:
        input_path: Input directory.
        output_path: Output directory.
        pretty_print: Whether to pretty print JSON output.
        array_processing: Process input as JSON arrays.

    Returns:
        Configured FileProcessor.
    """
    # The JSON options are stashed in json_config for downstream use.
    options = {
        'pretty_print': pretty_print,
        'array_processing': array_processing,
    }
    cfg = FileProcessingConfig(
        input_path=input_path,
        output_path=output_path,
        format=FileFormat.JSON,
        mode=ProcessingMode.WHOLE,
        json_config=options,
    )
    return FileProcessor(cfg)
def create_log_processor(
    input_path: str,
    output_path: str,
    parse_timestamps: bool = False,
    extract_errors: bool = False
) -> FileProcessor:
    """Create log file processor.

    Args:
        input_path: Input directory.
        output_path: Output directory.
        parse_timestamps: Whether to parse timestamps.
        extract_errors: Whether to extract error entries.

    Returns:
        Configured FileProcessor.
    """
    # DOC FIX: the previous docstring documented a `pattern` parameter
    # that this function does not accept.
    config = FileProcessingConfig(
        input_path=input_path,
        output_path=output_path,
        format=FileFormat.TEXT,
        mode=ProcessingMode.STREAM,
        # Options are stashed in log_config for downstream consumers.
        log_config={
            'parse_timestamps': parse_timestamps,
            'extract_errors': extract_errors
        }
    )

    return FileProcessor(config)
def create_batch_file_processor(
    input_paths: List[str],
    output_path: str,
    patterns: List[str],
    batch_size: int = 10
) -> FileProcessor:
    """Create batch file processor.

    Args:
        input_paths: List of input directories (only the first is used;
            FileProcessingConfig holds a single input path).
        output_path: Output directory.
        patterns: File patterns to match (currently unused —
            FileProcessingConfig has no pattern field).
        batch_size: Batch processing size, mapped to ``chunk_size``.

    Returns:
        Configured FileProcessor.
    """
    # BUG FIX: the previous version passed `pattern=` and `batch_size=`
    # keyword arguments that FileProcessingConfig does not declare, which
    # raised TypeError at runtime (masked by `# type: ignore`).  The batch
    # size maps onto chunk_size; patterns have no config field yet.
    config = FileProcessingConfig(
        input_path=input_paths[0] if input_paths else "",
        output_path=output_path,
        format=FileFormat.TEXT,
        mode=ProcessingMode.BATCH,
        chunk_size=batch_size
    )

    return FileProcessor(config)