Coverage for src/prosemark/adapters/markdown_binder_parser.py: 100%
91 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-09-24 18:08 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-09-24 18:08 +0000
1# Copyright (c) 2024 Prosemark Contributors
2# This software is licensed under the MIT License
4"""Markdown binder parser for converting between binder structures and markdown text."""
6import re
7from typing import NoReturn
9from prosemark.domain.models import Binder, BinderItem, NodeId
10from prosemark.exceptions import BinderFormatError
class MarkdownBinderParser:
    """Parser for converting between Binder objects and markdown list format.

    This adapter handles bidirectional conversion between:
    - Binder domain objects with tree structure
    - Markdown unordered list representation with links

    Supported markdown format:
    ```
    - [Title](file.md)
      - [Nested Item](nested.md)
    - [Another Root](another.md)
    ```

    The parser maintains:
    - Hierarchical structure through indentation (a more deeply indented item
      becomes a child of the closest preceding, less indented item)
    - NodeId extraction from filenames (assumes {id}.md pattern)
    - Placeholder support for items without links
    - Proper tree parent-child relationships
    """

    # Pattern to match markdown list items with optional links.
    # Groups: (indentation, title, link target).
    # The indentation group is limited to spaces and tabs on purpose: with
    # re.MULTILINE a leading ``\s*`` would also swallow preceding blank lines,
    # inflating the measured indent and attaching the item to the wrong parent.
    LIST_ITEM_PATTERN = re.compile(r'^([ \t]*)- \[(.*?)\](?:\(([^)]*)\))?(?:\s*)$', re.MULTILINE)

    # Pattern to extract NodeId from markdown links (assuming {id}.md format, possibly with path)
    NODE_ID_PATTERN = re.compile(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(?:\.md)?$')

    def parse_to_binder(self, markdown_content: str) -> Binder:
        """Parse markdown content into a Binder object.

        Args:
            markdown_content: Markdown text with unordered list structure

        Returns:
            Binder object with parsed hierarchy; an empty Binder when the
            content contains no text at all

        Raises:
            BinderFormatError: If markdown format is invalid or malformed

        """
        try:
            # Reject lines with obviously broken bracket syntax up front
            MarkdownBinderParser._validate_markdown_format(markdown_content)

            # Find all list items with their indentation
            matches = self.LIST_ITEM_PATTERN.findall(markdown_content)
            if not matches:
                # Raises if there is any non-blank content; otherwise falls through
                MarkdownBinderParser._handle_no_matches(markdown_content)
                return Binder(roots=[])

            # Build tree structure
            return self._build_binder_tree(matches)

        except BinderFormatError:
            raise
        except Exception as exc:  # noqa: BLE001
            # Any unexpected failure is reported as a format error, chained to its cause
            MarkdownBinderParser._raise_parse_error(exc)

    def render_from_binder(self, binder: Binder) -> str:
        """Render Binder object as markdown list content.

        Args:
            binder: Binder object to render

        Returns:
            Markdown text with unordered list structure, one item per line,
            joined with newlines (no trailing newline)

        """
        lines: list[str] = []
        for root in binder.roots:
            self._render_item(root, 0, lines)
        return '\n'.join(lines)

    @staticmethod
    def _validate_markdown_format(markdown_content: str) -> None:
        """Validate markdown format and raise errors for malformed patterns.

        Raises:
            BinderFormatError: If any non-blank line has malformed brackets

        """
        for line in markdown_content.strip().splitlines():
            stripped_line = line.strip()
            if stripped_line:  # Skip empty lines
                MarkdownBinderParser._check_bracket_patterns(stripped_line)

    @staticmethod
    def _check_bracket_patterns(line: str) -> None:
        """Check for malformed bracket patterns in a line.

        Only lines that contain a list-item opener (``- [``) are inspected.

        Raises:
            BinderFormatError: If brackets are unbalanced, or the line neither
                ends with ``]`` nor contains a ``)``

        """
        if '- [' not in line:
            return
        if line.count('[') != line.count(']'):
            MarkdownBinderParser._raise_malformed_error('unmatched brackets')
        if not line.endswith(']') and ')' not in line:
            MarkdownBinderParser._raise_malformed_error('unclosed bracket')

    @staticmethod
    def _handle_no_matches(markdown_content: str) -> None:
        """Handle case where no list items were matched.

        Returns normally only when the content is blank.

        Raises:
            BinderFormatError: If the content has any non-blank line

        """
        stripped_lines = [line.strip() for line in markdown_content.strip().splitlines()]
        for stripped_line in stripped_lines:
            # Looks like a list item, but did not match the expected item syntax
            if stripped_line and ('- ' in stripped_line or '* ' in stripped_line):
                MarkdownBinderParser._raise_malformed_error('invalid list item format')
        # If there's any non-empty content but no valid list items, it might be malformed
        if any(stripped_lines):
            MarkdownBinderParser._raise_malformed_error('content found but no valid list items')

    def _build_binder_tree(self, matches: list[tuple[str, str, str]]) -> Binder:
        """Build the binder tree structure from matched list items.

        Args:
            matches: (indentation, title, link) tuples in document order, as
                produced by ``LIST_ITEM_PATTERN.findall``

        Returns:
            Constructed Binder with hierarchical structure

        """
        root_items: list[BinderItem] = []
        # Chain of currently open ancestors as (indent_level, item); indent
        # levels are strictly increasing from bottom to top of the stack.
        item_stack: list[tuple[int, BinderItem]] = []

        for indent_str, title, link in matches:
            # Indentation is measured in characters (a tab counts as one)
            indent_level = len(indent_str)

            # Extract NodeId from link if present
            node_id = self._extract_node_id(link) if link else None

            # Create binder item
            item = BinderItem(display_title=title.strip(), node_id=node_id, children=[])

            # Find parent based on indentation
            parent = MarkdownBinderParser._find_parent(item_stack, indent_level)

            if parent is None:
                # Root level item
                root_items.append(item)
            else:
                # Child item
                parent.children.append(item)

            # Update stack - remove items at same or deeper levels, then add current
            while item_stack and item_stack[-1][0] >= indent_level:
                item_stack.pop()
            item_stack.append((indent_level, item))

        return Binder(roots=root_items)

    @staticmethod
    def _raise_malformed_error(issue: str) -> NoReturn:
        """Raise a BinderFormatError with malformed markdown message.

        Args:
            issue: Short description appended to the error message

        Raises:
            BinderFormatError: Always raised with issue-specific message

        """
        msg = f'Malformed markdown: {issue}'
        raise BinderFormatError(msg)

    @staticmethod
    def _raise_parse_error(exc: Exception) -> NoReturn:
        """Raise a BinderFormatError for parse failures.

        Args:
            exc: Original exception, attached as the cause of the new error

        Raises:
            BinderFormatError: Always raised with exception context

        """
        msg = 'Failed to parse markdown binder content'
        raise BinderFormatError(msg) from exc

    def _render_item(self, item: BinderItem, depth: int, lines: list[str]) -> None:
        """Render a single binder item and its children to lines.

        Args:
            item: Item to render
            depth: Nesting depth of the item (0 for roots)
            lines: Output list that rendered lines are appended to

        """
        # Two spaces per level: a child of a "- " item must be indented at
        # least two columns to be a nested list in standard markdown.
        indent = '  ' * depth
        if item.node_id:
            # Item with link
            lines.append(f'{indent}- [{item.display_title}]({item.node_id}.md)')
        else:
            # Placeholder item
            lines.append(f'{indent}- [{item.display_title}]()')

        # Render children
        for child in item.children:
            self._render_item(child, depth + 1, lines)

    def _extract_node_id(self, link: str) -> NodeId | None:
        """Extract NodeId from markdown link if valid UUID format.

        Args:
            link: Link target taken from a markdown list item

        Returns:
            NodeId if link contains valid UUID, None otherwise

        """
        if not link:
            return None

        match = self.NODE_ID_PATTERN.search(link)
        if match is None:
            return None
        try:
            return NodeId(match.group(1))
        except ValueError:  # pragma: no cover
            # Invalid UUID format
            return None

    @staticmethod
    def _find_parent(item_stack: list[tuple[int, BinderItem]], indent_level: int) -> BinderItem | None:
        """Find the appropriate parent item based on indentation level.

        Args:
            item_stack: Open ancestors as (indent_level, item), oldest first
            indent_level: Indentation of the item that needs a parent

        Returns:
            Parent BinderItem or None if no appropriate parent found

        """
        # Scanning from the most recent entry, take the first item that is
        # indented less than the current one.
        return next((item for level, item in reversed(item_stack) if level < indent_level), None)