Coverage for src/alprina_cli/guardrails/output_guardrails.py: 26%

186 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2025-11-14 11:27 +0100

1""" 

2Output Guardrails 

3 

4Sanitize sensitive information from tool outputs. 

5Prevent leaking: PII, credentials, internal IPs, file paths, etc. 

6""" 

7 

8from abc import ABC, abstractmethod 

9from typing import Any, Dict, Optional, List 

10from pydantic import BaseModel 

11from loguru import logger 

12import re 

13 

14 

class SanitizationResult(BaseModel):
    """Outcome of running a value through one or more output guardrails."""

    # The value after redaction (the input itself when nothing was redacted).
    sanitized_value: Any
    # Number of individual matches that were replaced.
    redactions_made: int = 0
    # Labels of the redaction categories that fired, e.g. "email" or "private_ip".
    redaction_types: List[str] = []

20 

21 

class OutputGuardrail(ABC):
    """
    Abstract interface that every output guardrail implements.

    Context Engineering:
    - Fast sanitization (< 10ms per check)
    - Preserve data utility while removing sensitive info
    - Track what was redacted for audit logs
    """

    # Human-readable identifier; subclasses override it with their own name.
    name: str = "OutputGuardrail"

    @abstractmethod
    def sanitize(self, value: Any) -> SanitizationResult:
        """
        Return a sanitized version of ``value``.

        Args:
            value: Output value to sanitize

        Returns:
            SanitizationResult carrying the sanitized value, the number of
            redactions made and the redaction type labels
        """
        raise NotImplementedError

46 

47 

class PIIScrubber(OutputGuardrail):
    """
    Scrub Personally Identifiable Information from string outputs.

    Patterns detected (each can be switched off in the constructor):
    - Email addresses
    - Phone numbers (10-digit US style, optional +1 / 1 country prefix)
    - Social Security Numbers (ddd-dd-dddd)
    - Credit card numbers (16 digits, optionally grouped in fours)

    Non-string values are returned untouched with zero redactions.
    """

    name: str = "PIIScrubber"

    # PII patterns
    # The TLD class is [A-Za-z]: inside a character class "|" is a literal
    # pipe, not alternation, so it must not appear there.
    EMAIL_PATTERN = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    # Phone pattern to match various formats: 555-123-4567, (555) 123-4567, 5551234567, +1-555-123-4567
    # A lookbehind is used instead of a leading \b so that a match may begin
    # on "(" or "+" (non-word characters) and the whole number is replaced.
    PHONE_PATTERN = r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})\b'
    SSN_PATTERN = r'\b\d{3}-\d{2}-\d{4}\b'
    CREDIT_CARD_PATTERN = r'\b(?:\d{4}[-\s]?){3}\d{4}\b'

    def __init__(self, scrub_emails: bool = True, scrub_phones: bool = True,
                 scrub_ssn: bool = True, scrub_credit_cards: bool = True):
        """Choose which PII categories this scrubber redacts."""
        self.scrub_emails = scrub_emails
        self.scrub_phones = scrub_phones
        self.scrub_ssn = scrub_ssn
        self.scrub_credit_cards = scrub_credit_cards

    def sanitize(self, value: Any) -> SanitizationResult:
        """
        Scrub PII from value.

        Args:
            value: Output value to sanitize; only ``str`` values are scanned

        Returns:
            SanitizationResult with the scrubbed text, the number of
            replacements and the labels of the categories that matched
        """
        if not isinstance(value, str):
            return SanitizationResult(sanitized_value=value, redactions_made=0)

        # (enabled, pattern, placeholder, type label, noun used in the debug log).
        # Rules run in this order; each one sees the previous rule's output.
        rules = (
            (self.scrub_emails, self.EMAIL_PATTERN,
             '[EMAIL_REDACTED]', "email", "email(s)"),
            (self.scrub_phones, self.PHONE_PATTERN,
             '[PHONE_REDACTED]', "phone", "phone number(s)"),
            (self.scrub_ssn, self.SSN_PATTERN,
             '[SSN_REDACTED]', "ssn", "SSN(s)"),
            (self.scrub_credit_cards, self.CREDIT_CARD_PATTERN,
             '[CREDIT_CARD_REDACTED]', "credit_card", "credit card(s)"),
        )

        sanitized = value
        redactions = 0
        redaction_types = []

        for enabled, pattern, placeholder, label, noun in rules:
            if not enabled:
                continue
            # subn replaces and counts in a single pass over the text.
            sanitized, count = re.subn(pattern, placeholder, sanitized)
            if count:
                redactions += count
                redaction_types.append(label)
                logger.debug(f"Redacted {count} {noun}")

        return SanitizationResult(
            sanitized_value=sanitized,
            redactions_made=redactions,
            redaction_types=redaction_types
        )

126 

127 

class CredentialFilter(OutputGuardrail):
    """
    Filter credentials and secrets from string outputs.

    Patterns detected (all matched case-insensitively):
    - API keys (common formats)
    - AWS credentials
    - JWT tokens
    - Password patterns
    - Private keys (PEM / OpenSSH blocks)
    - OAuth tokens
    - GitHub, Stripe, Google and Twilio key formats

    Non-string values are returned untouched with zero redactions.
    """

    name: str = "CredentialFilter"

    # Credential patterns as (regex, type label). They are applied in order and
    # each one sees the previous pattern's output, so a specific pattern must
    # come before any generic pattern that would match the same text.
    PATTERNS = [
        (r'api[_-]?key[_-]?[=:]\s*["\']?([a-zA-Z0-9_\-]{20,})["\']?', 'api_key'),
        (r'AKIA[0-9A-Z]{16}', 'aws_access_key'),
        (r'aws[_-]?secret[_-]?[=:]\s*["\']?([a-zA-Z0-9/+=]{40})["\']?', 'aws_secret'),
        (r'eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*', 'jwt_token'),
        (r'password[_-]?[=:]\s*["\']?([^\s"\']{8,})["\']?', 'password'),
        (r'passwd[_-]?[=:]\s*["\']?([^\s"\']{8,})["\']?', 'password'),
        # Must precede the generic "token" pattern: "oauth_token=..." contains
        # "token=...", so the generic pattern would otherwise always win.
        (r'oauth[_-]?token[_-]?[=:]\s*["\']?([a-zA-Z0-9_\-]{20,})["\']?', 'oauth_token'),
        (r'token[_-]?[=:]\s*["\']?([a-zA-Z0-9_\-]{20,})["\']?', 'token'),
        # Redact the whole PEM block up to its END marker when one is present;
        # matching only the BEGIN header would leave the key material behind.
        # A block with no END marker still has its header redacted.
        (r'-----BEGIN (?:RSA |DSA |EC )?PRIVATE KEY-----'
         r'(?:[\s\S]*?-----END (?:RSA |DSA |EC )?PRIVATE KEY-----)?', 'private_key'),
        (r'-----BEGIN OPENSSH PRIVATE KEY-----'
         r'(?:[\s\S]*?-----END OPENSSH PRIVATE KEY-----)?', 'ssh_key'),
        (r'gh[pousr]_[A-Za-z0-9_]{36,}', 'github_token'),
        (r'sk_live_[a-zA-Z0-9]{24,}', 'stripe_key'),
        (r'AIza[0-9A-Za-z_\-]{35}', 'google_api_key'),
        (r'SK[a-zA-Z0-9]{32}', 'twilio_key'),
    ]

    def sanitize(self, value: Any) -> SanitizationResult:
        """
        Filter credentials from value.

        Each match is replaced by ``[<TYPE>_REDACTED]`` where ``<TYPE>`` is the
        upper-cased type label of the pattern that matched.

        Args:
            value: Output value to sanitize; only ``str`` values are scanned

        Returns:
            SanitizationResult with the filtered text, the number of
            replacements and the distinct credential types that matched
        """
        if not isinstance(value, str):
            return SanitizationResult(sanitized_value=value, redactions_made=0)

        sanitized = value
        redactions = 0
        redaction_types = []

        # Check each credential pattern
        for pattern, cred_type in self.PATTERNS:
            # subn replaces and counts in one pass; the count is per match
            # regardless of how many capture groups the pattern has.
            sanitized, count = re.subn(
                pattern, f'[{cred_type.upper()}_REDACTED]', sanitized,
                flags=re.IGNORECASE
            )
            if not count:
                continue
            redactions += count
            # Several patterns share a label (e.g. "password"); report it once.
            if cred_type not in redaction_types:
                redaction_types.append(cred_type)
            logger.warning(f"Redacted {cred_type} from output")

        return SanitizationResult(
            sanitized_value=sanitized,
            redactions_made=redactions,
            redaction_types=redaction_types
        )

185 

186 

class IPRedactor(OutputGuardrail):
    """
    Redact internal network addresses from string outputs.

    Patterns redacted:
    - Private and loopback IPv4 ranges (10.x, 172.16-31.x, 192.168.x, 127.x)
    - IPv6 addresses starting with the link-local prefix fe80:
    - MAC addresses (optional, off by default)

    Non-string values are returned untouched with zero redactions.
    """

    name: str = "IPRedactor"

    # IP patterns
    PRIVATE_IP_PATTERNS = [
        r'\b10\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',  # 10.x.x.x
        r'\b172\.(1[6-9]|2[0-9]|3[0-1])\.\d{1,3}\.\d{1,3}\b',  # 172.16-31.x.x
        r'\b192\.168\.\d{1,3}\.\d{1,3}\b',  # 192.168.x.x
        r'\b127\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',  # 127.x.x.x (loopback)
    ]

    IPV6_PRIVATE_PATTERN = r'\bfe80:[0-9a-fA-F:]+\b'  # IPv6 link-local
    MAC_ADDRESS_PATTERN = r'\b([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})\b'

    def __init__(self, redact_private_ips: bool = True, redact_ipv6: bool = True,
                 redact_mac: bool = False):
        """Choose which address families this redactor handles."""
        self.redact_private_ips = redact_private_ips
        self.redact_ipv6 = redact_ipv6
        self.redact_mac = redact_mac

    def sanitize(self, value: Any) -> SanitizationResult:
        """
        Redact IPs from value.

        Args:
            value: Output value to sanitize; only ``str`` values are scanned

        Returns:
            SanitizationResult with the redacted text, the number of
            replacements and the labels of the address kinds that matched
        """
        if not isinstance(value, str):
            return SanitizationResult(sanitized_value=value, redactions_made=0)

        text = value
        total = 0
        kinds = []

        # Private / loopback IPv4: all ranges share the single "private_ip" label.
        if self.redact_private_ips:
            for ip_regex in self.PRIVATE_IP_PATTERNS:
                text, hits = re.subn(ip_regex, '[IP_REDACTED]', text)
                total += hits
                if hits and "private_ip" not in kinds:
                    kinds.append("private_ip")

        # Link-local IPv6
        if self.redact_ipv6:
            text, hits = re.subn(self.IPV6_PRIVATE_PATTERN, '[IPV6_REDACTED]', text)
            if hits:
                total += hits
                kinds.append("ipv6")

        # MAC addresses
        if self.redact_mac:
            text, hits = re.subn(self.MAC_ADDRESS_PATTERN, '[MAC_REDACTED]', text)
            if hits:
                total += hits
                kinds.append("mac_address")

        if total > 0:
            logger.debug(f"Redacted {total} IP/MAC address(es)")

        return SanitizationResult(
            sanitized_value=text,
            redactions_made=total,
            redaction_types=kinds
        )

260 

261 

class PathSanitizer(OutputGuardrail):
    """
    Sanitize user-identifying file paths in string outputs.

    The path component that follows each of these prefixes is replaced by
    the placeholder ``[USER]``:
    - /home/ and /Users/
    - C:\\Users\\
    - /tmp/ and /var/tmp/ (whatever the component is, username or not)

    Non-string values are returned untouched with zero redactions.
    """

    name: str = "PathSanitizer"

    # Path patterns as (regex, replacement template). Group 1 is the component
    # that gets replaced.
    PATTERNS = [
        (r'/home/([^/\s]+)', '/home/[USER]'),
        (r'/Users/([^/\s]+)', '/Users/[USER]'),
        # Spaces are allowed in the Windows component, but line breaks are
        # excluded so a match cannot swallow the following lines of output.
        (r'C:\\Users\\([^\\\r\n]+)', r'C:\\Users\\[USER]'),
        (r'/tmp/([^/\s]+)', '/tmp/[USER]'),
        (r'/var/tmp/([^/\s]+)', '/var/tmp/[USER]'),
    ]

    def __init__(self, sanitize_user_paths: bool = True):
        """Enable or disable path sanitization for this instance."""
        self.sanitize_user_paths = sanitize_user_paths

    def sanitize(self, value: Any) -> SanitizationResult:
        """
        Sanitize paths from value.

        Args:
            value: Output value to sanitize; only ``str`` values are scanned

        Returns:
            SanitizationResult with the sanitized text, the number of path
            components replaced and ``["user_path"]`` when any were replaced
        """
        if not isinstance(value, str) or not self.sanitize_user_paths:
            return SanitizationResult(sanitized_value=value, redactions_made=0)

        sanitized = value
        redactions = 0

        # Sanitize each path pattern
        for pattern, replacement in self.PATTERNS:
            # Count only components that are not already the placeholder.
            # The unanchored "/tmp/" pattern also rewrites "/var/tmp/<x>", so
            # the later "/var/tmp/" pattern (and any second sanitize pass)
            # re-matches "[USER]"; counting those would inflate the total.
            fresh = sum(
                1 for match in re.finditer(pattern, sanitized)
                if match.group(1) != '[USER]'
            )
            if fresh:
                # Re-substituting an existing placeholder yields the same text.
                sanitized = re.sub(pattern, replacement, sanitized)
                redactions += fresh

        redaction_types = ["user_path"] if redactions else []

        if redactions > 0:
            logger.debug(f"Sanitized {redactions} user path(s)")

        return SanitizationResult(
            sanitized_value=sanitized,
            redactions_made=redactions,
            redaction_types=redaction_types
        )

316 

317 

# Default output guardrails chain, used by sanitize_output() when the caller
# passes no guardrails. The guardrails run in list order, each receiving the
# previous one's output.
DEFAULT_OUTPUT_GUARDRAILS = [
    PIIScrubber(),
    CredentialFilter(),
    # By default only private IPv4 ranges are redacted; IPv6 and MAC are off.
    IPRedactor(redact_private_ips=True, redact_ipv6=False, redact_mac=False),
    PathSanitizer(),
]

325 

326 

def sanitize_output(
    value: Any,
    guardrails: Optional[List[OutputGuardrail]] = None
) -> SanitizationResult:
    """
    Sanitize output through guardrail chain.

    Each guardrail receives the value produced by the previous one, so the
    order of ``guardrails`` matters.

    Args:
        value: Output value to sanitize
        guardrails: List of guardrails to apply (defaults to DEFAULT_OUTPUT_GUARDRAILS)

    Returns:
        SanitizationResult with the final sanitized value, the summed
        redaction count and the distinct redaction types in the order in
        which they were first reported
    """
    if guardrails is None:
        guardrails = DEFAULT_OUTPUT_GUARDRAILS

    sanitized = value
    total_redactions = 0
    all_redaction_types = []

    # Apply each guardrail in sequence
    for guardrail in guardrails:
        result = guardrail.sanitize(sanitized)
        sanitized = result.sanitized_value
        total_redactions += result.redactions_made
        all_redaction_types.extend(result.redaction_types)

    # Remove duplicates while keeping first-seen order. A set would give an
    # arbitrary order that can differ between runs, which makes audit logs
    # and test assertions unstable.
    unique_redaction_types = list(dict.fromkeys(all_redaction_types))

    return SanitizationResult(
        sanitized_value=sanitized,
        redactions_made=total_redactions,
        redaction_types=unique_redaction_types
    )

363 

364 

def sanitize_dict(
    data: Dict[str, Any],
    guardrails: Optional[List[OutputGuardrail]] = None
) -> tuple[Dict[str, Any], int]:
    """
    Build a sanitized copy of a dictionary, descending into nested containers.

    String values go through sanitize_output(); nested dicts and lists are
    processed recursively; values of any other type are copied as they are.
    Keys are not modified.

    Args:
        data: Dictionary to sanitize
        guardrails: List of guardrails to apply

    Returns:
        Tuple of (sanitized_dict, total_redactions)
    """
    cleaned = {}
    redaction_count = 0

    for key, entry in data.items():
        # Work out the replacement value and how many redactions it cost.
        if isinstance(entry, str):
            outcome = sanitize_output(entry, guardrails)
            new_entry, delta = outcome.sanitized_value, outcome.redactions_made
        elif isinstance(entry, dict):
            new_entry, delta = sanitize_dict(entry, guardrails)
        elif isinstance(entry, list):
            new_entry, delta = sanitize_list(entry, guardrails)
        else:
            new_entry, delta = entry, 0

        cleaned[key] = new_entry
        redaction_count += delta

    return cleaned, redaction_count

397 

398 

def sanitize_list(
    data: List[Any],
    guardrails: Optional[List[OutputGuardrail]] = None
) -> tuple[List[Any], int]:
    """
    Build a sanitized copy of a list, descending into nested containers.

    String items go through sanitize_output(); nested dicts and lists are
    processed recursively; items of any other type are copied as they are.

    Args:
        data: List to sanitize
        guardrails: List of guardrails to apply

    Returns:
        Tuple of (sanitized_list, total_redactions)
    """
    cleaned = []
    redaction_count = 0

    for element in data:
        # Work out the replacement item and how many redactions it cost.
        if isinstance(element, str):
            outcome = sanitize_output(element, guardrails)
            new_element, delta = outcome.sanitized_value, outcome.redactions_made
        elif isinstance(element, dict):
            new_element, delta = sanitize_dict(element, guardrails)
        elif isinstance(element, list):
            new_element, delta = sanitize_list(element, guardrails)
        else:
            new_element, delta = element, 0

        cleaned.append(new_element)
        redaction_count += delta

    return cleaned, redaction_count

431 

432 return sanitized, total_redactions