Coverage for src/prosemark/adapters/markdown_binder_parser.py: 100%

91 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-09-24 18:08 +0000

1# Copyright (c) 2024 Prosemark Contributors 

2# This software is licensed under the MIT License 

3 

4"""Markdown binder parser for converting between binder structures and markdown text.""" 

5 

6import re 

7from typing import NoReturn 

8 

9from prosemark.domain.models import Binder, BinderItem, NodeId 

10from prosemark.exceptions import BinderFormatError 

11 

12 

class MarkdownBinderParser:
    """Parser for converting between Binder objects and markdown list format.

    This adapter handles bidirectional conversion between:
    - Binder domain objects with tree structure
    - Markdown unordered list representation with links

    Supported markdown format:
    ```
    - [Title](file.md)
      - [Nested Item](nested.md)
    - [Another Root](another.md)
    ```

    The parser maintains:
    - Hierarchical structure through indentation
    - NodeId extraction from filenames (assumes {id}.md pattern)
    - Placeholder support for items without links
    - Proper tree parent-child relationships
    """

    # Pattern to match markdown list items with optional links.
    # Groups: (indentation, title, link target).
    # The indentation group only accepts spaces and tabs. Under re.MULTILINE a
    # leading ``\s*`` would also swallow the newline of a preceding blank line,
    # which inflated the measured indent and mis-nested the item that follows.
    LIST_ITEM_PATTERN = re.compile(r'^([ \t]*)- \[(.*?)\](?:\(([^)]*)\))?(?:\s*)$', re.MULTILINE)

    # Pattern to extract NodeId from markdown links (assuming {id}.md format, possibly with path)
    NODE_ID_PATTERN = re.compile(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(?:\.md)?$')

    def parse_to_binder(self, markdown_content: str) -> Binder:
        """Parse markdown content into a Binder object.

        Args:
            markdown_content: Markdown text with unordered list structure

        Returns:
            Binder object with parsed hierarchy; an empty Binder when the
            content holds no text at all

        Raises:
            BinderFormatError: If markdown format is invalid or malformed

        """
        try:
            # Reject lines with obviously broken bracket structure up front
            MarkdownBinderParser._validate_markdown_format(markdown_content)

            # Each match is an (indentation, title, link) tuple
            matches = self.LIST_ITEM_PATTERN.findall(markdown_content)
            if not matches:
                # Raises when there is content that is not a valid list;
                # returns normally only for blank content
                MarkdownBinderParser._handle_no_matches(markdown_content)
                return Binder(roots=[])

            return self._build_binder_tree(matches)

        except BinderFormatError:
            # Already the domain error type; let it propagate untouched
            raise
        except Exception as exc:  # noqa: BLE001
            # Boundary: convert any unexpected failure into the domain error
            MarkdownBinderParser._raise_parse_error(exc)

    def render_from_binder(self, binder: Binder) -> str:
        """Render Binder object as markdown list content.

        Args:
            binder: Binder object to render

        Returns:
            Markdown text with unordered list structure, one line per item,
            joined with newlines (empty string when there are no roots)

        """
        lines: list[str] = []
        for root in binder.roots:
            self._render_item(root, 0, lines)
        return '\n'.join(lines)

    @staticmethod
    def _validate_markdown_format(markdown_content: str) -> None:
        """Validate markdown format and raise errors for malformed patterns.

        Raises:
            BinderFormatError: If any non-empty line has malformed brackets

        """
        for line in markdown_content.strip().split('\n'):
            stripped_line = line.strip()
            if stripped_line:  # Skip empty lines
                MarkdownBinderParser._check_bracket_patterns(stripped_line)

    @staticmethod
    def _check_bracket_patterns(line: str) -> None:
        """Check for malformed bracket patterns in a line.

        Only lines that contain a list-item opener (``- [``) are inspected.

        Raises:
            BinderFormatError: If brackets are unbalanced, or the line neither
                ends with ``]`` nor contains a ``)``

        """
        if '- [' not in line:
            return
        if line.count('[') != line.count(']'):
            MarkdownBinderParser._raise_malformed_error('unmatched brackets')
        if not line.endswith(']') and ')' not in line:
            MarkdownBinderParser._raise_malformed_error('unclosed bracket')

    @staticmethod
    def _handle_no_matches(markdown_content: str) -> None:
        """Handle case where no list items were matched.

        Returns normally only when the content is blank.

        Raises:
            BinderFormatError: If any line looks like a list item, or any
                non-empty content exists at all

        """
        lines = markdown_content.strip().split('\n')
        for line in lines:
            stripped_line = line.strip()
            # Something that looks like a bullet but did not match the pattern
            if '- ' in stripped_line or '* ' in stripped_line:
                MarkdownBinderParser._raise_malformed_error('invalid list item format')
        # If there's any non-empty content but no valid list items, it might be malformed
        if any(line.strip() for line in lines):
            MarkdownBinderParser._raise_malformed_error('content found but no valid list items')

    def _build_binder_tree(self, matches: list[tuple[str, str, str]]) -> Binder:
        """Build the binder tree structure from matched list items.

        Args:
            matches: (indentation, title, link) tuples in document order, as
                produced by ``LIST_ITEM_PATTERN.findall``

        Returns:
            Constructed Binder with hierarchical structure

        """
        root_items: list[BinderItem] = []
        # Chain of ancestors of the most recent item as (indent_level, item);
        # indent levels are strictly increasing from bottom to top.
        item_stack: list[tuple[int, BinderItem]] = []

        for indent_str, title, link in matches:
            indent_level = len(indent_str)

            # Extract NodeId from link if present
            node_id = self._extract_node_id(link) if link else None

            item = BinderItem(display_title=title.strip(), node_id=node_id, children=[])

            # Nearest preceding item with a smaller indent is the parent
            parent = MarkdownBinderParser._find_parent(item_stack, indent_level)
            if parent is None:
                root_items.append(item)
            else:
                parent.children.append(item)

            # Drop entries at the same or deeper level, then push the current
            # item. Because the stack is strictly increasing, popping from the
            # top is equivalent to filtering the whole stack.
            while item_stack and item_stack[-1][0] >= indent_level:
                item_stack.pop()
            item_stack.append((indent_level, item))

        return Binder(roots=root_items)

    @staticmethod
    def _raise_malformed_error(issue: str) -> NoReturn:
        """Raise a BinderFormatError with malformed markdown message.

        Args:
            issue: Short description appended to the error message

        Raises:
            BinderFormatError: Always raised with issue-specific message

        """
        msg = f'Malformed markdown: {issue}'
        raise BinderFormatError(msg)

    @staticmethod
    def _raise_parse_error(exc: Exception) -> NoReturn:
        """Raise a BinderFormatError for parse failures.

        Args:
            exc: Original exception, chained as the cause

        Raises:
            BinderFormatError: Always raised with exception context

        """
        msg = 'Failed to parse markdown binder content'
        raise BinderFormatError(msg) from exc

    def _render_item(self, item: BinderItem, depth: int, lines: list[str]) -> None:
        """Render a single binder item and its children to lines.

        Args:
            item: Item to render
            depth: Nesting depth of the item (0 for roots); one indent unit
                is emitted per level
            lines: Output list that rendered lines are appended to

        """
        indent = ' ' * depth
        if item.node_id:
            # Item with link
            lines.append(f'{indent}- [{item.display_title}]({item.node_id}.md)')
        else:
            # Placeholder item
            lines.append(f'{indent}- [{item.display_title}]()')

        # Children follow their parent, one level deeper
        for child in item.children:
            self._render_item(child, depth + 1, lines)

    def _extract_node_id(self, link: str) -> NodeId | None:
        """Extract NodeId from markdown link if valid UUID format.

        Args:
            link: Link target taken from the markdown item

        Returns:
            NodeId if link contains valid UUID, None otherwise

        """
        if not link:
            return None

        match = self.NODE_ID_PATTERN.search(link)
        if match:
            try:
                return NodeId(match.group(1))
            except ValueError:  # pragma: no cover
                # Invalid UUID format
                return None
        return None

    @staticmethod
    def _find_parent(item_stack: list[tuple[int, BinderItem]], indent_level: int) -> BinderItem | None:
        """Find the appropriate parent item based on indentation level.

        Args:
            item_stack: (indent_level, item) pairs, most recent last
            indent_level: Indentation width of the item looking for a parent

        Returns:
            Parent BinderItem or None if no appropriate parent found

        """
        # Walk from the most recent entry back; the first one indented less
        # than the current item is its parent
        for level, item in reversed(item_stack):
            if level < indent_level:
                return item
        return None

220 return parent