Coverage for src/alprina_cli/agents/llm_security.py: 0%

121 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2025-11-14 11:27 +0100

1""" 

2LLM Security Agent - Protect AI/LLM applications from security threats. 

3 

4Inspired by Noma Security's AI security platform. 

5Detects: Prompt injection, jailbreaking, model poisoning, data leakage. 

6""" 

7 

8from typing import Dict, List, Any, Optional 

9from loguru import logger 

10import re 

11import json 

12 

13 

class LLMSecurityAgent:
    """
    Security agent for Large Language Model (LLM) applications.

    Capabilities:
    - Prompt injection detection
    - Jailbreak attempt detection
    - PII/sensitive data leakage detection
    - Output validation
    - Token usage monitoring
    - Model behavior analysis
    """

    def __init__(self):
        """Initialize LLM Security Agent and its detection pattern tables."""
        self.name = "LLM Security Agent"
        self.agent_type = "llm-security"
        logger.debug(f"{self.name} initialized")

        # Known prompt injection patterns (case-insensitive via inline (?i);
        # applied with re.findall in _detect_prompt_injection).
        self.injection_patterns = [
            # System prompt override attempts
            r"(?i)ignore (all )?previous (instructions?|prompts?|commands?)",
            r"(?i)disregard (all )?(previous|above|prior) (instructions?|prompts?)",
            r"(?i)forget (all )?(previous|above) (instructions?|prompts?)",
            r"(?i)new (instructions?|prompts?|system)",

            # Jailbreak attempts
            r"(?i)(you are|act as|pretend to be|roleplay as) (dan|a? ?jailbreak|evil|malicious)",
            r"(?i)developer mode",
            r"(?i)sudo mode",
            r"(?i)god mode",
            r"(?i)unrestricted mode",

            # Prompt leakage attempts
            r"(?i)show (me )?(your|the) (system|initial|original) prompt",
            r"(?i)print (your|the) (instructions?|prompts?)",
            r"(?i)reveal (your|the) (system|hidden) (prompt|instructions?)",
            r"(?i)what (are|were) you told",

            # Command injection
            r"(?i)execute|eval|system|subprocess|os\.|shell",
            r"(?i)__import__|importlib",

            # Multi-turn attacks
            r"(?i)(step|part|phase) (\d+|one|two|three).*?(final|last) step",
            r"(?i)first.*?then.*?finally",
        ]

        # PII patterns keyed by PII type.
        self.pii_patterns = {
            # FIX: TLD class was [A-Z|a-z], which also matched a literal '|'.
            "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
            "credit_card": r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b",
            "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
            "api_key": r"(api[_-]?key|apikey|api[_-]?token)[\s:=]+['\"]?([a-zA-Z0-9_\-]{20,})['\"]?",
            "password": r"(password|passwd|pwd)[\s:=]+['\"]?([^\s\"']{8,})['\"]?",
        }

        # Toxic/harmful content patterns
        self.harmful_patterns = [
            r"(?i)(how to|guide to|instructions for) (hack|crack|break into|bypass)",
            r"(?i)(build|create|make) (a )?(bomb|weapon|malware|virus)",
            r"(?i)(illegal|illicit) (activities?|drugs|substances)",
        ]

    def scan_prompt(self, prompt: str, context: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Scan a user prompt for security issues before sending to LLM.

        Args:
            prompt: User's input prompt
            context: Optional context (conversation history, etc.)

        Returns:
            Security analysis results: dict with "safe", "risk_level",
            "risk_score" (capped at 100), "findings", "prompt", "timestamp".
        """
        logger.debug(f"Scanning prompt: {prompt[:100]}...")

        findings = []
        risk_score = 0

        # Check for prompt injection
        injection_results = self._detect_prompt_injection(prompt)
        if injection_results["detected"]:
            findings.append({
                "type": "prompt_injection",
                "severity": "HIGH",
                "title": "Prompt Injection Attempt Detected",
                "description": "User prompt contains patterns attempting to override system instructions",
                "details": injection_results["matches"],
                "recommendation": "Block this prompt and log the attempt"
            })
            risk_score += 80

        # Check for jailbreak attempts
        jailbreak_results = self._detect_jailbreak(prompt)
        if jailbreak_results["detected"]:
            findings.append({
                "type": "jailbreak_attempt",
                "severity": "HIGH",
                "title": "Jailbreak Attempt Detected",
                "description": "User trying to bypass model safety constraints",
                "details": jailbreak_results["matches"],
                "recommendation": "Block this prompt and monitor user"
            })
            risk_score += 85

        # Check for PII in prompt
        pii_results = self._detect_pii(prompt)
        if pii_results["detected"]:
            findings.append({
                "type": "pii_detected",
                "severity": "MEDIUM",
                "title": "PII Detected in Prompt",
                "description": "Prompt contains personally identifiable information",
                "details": pii_results["pii_types"],
                "recommendation": "Redact PII before sending to LLM"
            })
            risk_score += 40

        # Check for harmful content requests
        harmful_results = self._detect_harmful_content(prompt)
        if harmful_results["detected"]:
            findings.append({
                "type": "harmful_request",
                "severity": "HIGH",
                "title": "Harmful Content Request",
                "description": "User requesting harmful or illegal information",
                "details": harmful_results["matches"],
                "recommendation": "Block this prompt and log for review"
            })
            risk_score += 90

        return {
            "safe": len(findings) == 0,
            "risk_level": self._risk_level(risk_score),
            "risk_score": min(risk_score, 100),
            "findings": findings,
            "prompt": prompt,
            "timestamp": self._get_timestamp()
        }

    def scan_output(self, output: str, context: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Scan LLM output for security issues before returning to user.

        Args:
            output: LLM's generated output
            context: Optional context

        Returns:
            Security analysis results: dict with "safe", "risk_level",
            "risk_score" (capped at 100), "findings", "output", "timestamp".
        """
        logger.debug(f"Scanning LLM output: {output[:100]}...")

        findings = []
        risk_score = 0

        # Check for leaked PII
        pii_results = self._detect_pii(output)
        if pii_results["detected"]:
            findings.append({
                "type": "pii_leakage",
                "severity": "HIGH",
                "title": "PII Leaked in Output",
                "description": "LLM output contains personally identifiable information",
                "details": pii_results["pii_types"],
                "recommendation": "Redact PII before showing to user"
            })
            risk_score += 70

        # Check for sensitive data exposure
        sensitive_results = self._detect_sensitive_data(output)
        if sensitive_results["detected"]:
            findings.append({
                "type": "sensitive_data_exposure",
                "severity": "MEDIUM",
                "title": "Sensitive Data in Output",
                "description": "Output may contain sensitive information",
                "details": sensitive_results["types"],
                "recommendation": "Review and redact sensitive data"
            })
            risk_score += 50

        # Check for hallucination indicators
        hallucination_results = self._detect_hallucination_indicators(output)
        if hallucination_results["likely"]:
            findings.append({
                "type": "potential_hallucination",
                "severity": "LOW",
                "title": "Potential Hallucination Detected",
                "description": "Output may contain fabricated information",
                "details": hallucination_results["indicators"],
                "recommendation": "Verify information before trusting"
            })
            risk_score += 20

        return {
            "safe": len(findings) == 0,
            # FIX: scan_output previously omitted "risk_level" while
            # scan_prompt included it; both now share _risk_level().
            "risk_level": self._risk_level(risk_score),
            "risk_score": min(risk_score, 100),
            "findings": findings,
            "output": output,
            "timestamp": self._get_timestamp()
        }

    @staticmethod
    def _risk_level(risk_score: int) -> str:
        """Map a numeric risk score to a categorical risk level."""
        if risk_score >= 70:
            return "CRITICAL"
        if risk_score >= 40:
            return "HIGH"
        if risk_score >= 20:
            return "MEDIUM"
        return "LOW"

    def _detect_prompt_injection(self, text: str) -> Dict[str, Any]:
        """Detect prompt injection patterns; returns matched patterns."""
        matches = []
        for pattern in self.injection_patterns:
            found = re.findall(pattern, text)
            if found:
                matches.append({
                    "pattern": pattern,
                    "matches": found
                })

        return {
            "detected": len(matches) > 0,
            "matches": matches
        }

    def _detect_jailbreak(self, text: str) -> Dict[str, Any]:
        """Detect jailbreak attempt keywords (whole-word, case-insensitive)."""
        jailbreak_keywords = [
            "dan", "jailbreak", "developer mode", "god mode",
            "unrestricted", "bypass", "ignore rules", "evil mode"
        ]

        matches = []
        text_lower = text.lower()
        for keyword in jailbreak_keywords:
            # FIX: word-boundary match instead of raw substring search, so
            # short keywords like "dan" no longer fire on e.g. "abundant".
            if re.search(r"\b" + re.escape(keyword) + r"\b", text_lower):
                matches.append(keyword)

        return {
            "detected": len(matches) > 0,
            "matches": matches
        }

    def _detect_pii(self, text: str) -> Dict[str, Any]:
        """Detect personally identifiable information; values are redacted."""
        pii_found = {}

        for pii_type, pattern in self.pii_patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                # Redact matched PII (never store raw values in results)
                pii_found[pii_type] = [self._redact(m) for m in matches]

        return {
            "detected": len(pii_found) > 0,
            "pii_types": pii_found
        }

    def _detect_harmful_content(self, text: str) -> Dict[str, Any]:
        """Detect requests for harmful or illegal content."""
        matches = []
        for pattern in self.harmful_patterns:
            found = re.findall(pattern, text)
            if found:
                matches.append({
                    "pattern": pattern,
                    "matches": found
                })

        return {
            "detected": len(matches) > 0,
            "matches": matches
        }

    def _detect_sensitive_data(self, text: str) -> Dict[str, Any]:
        """Detect sensitive data (credentials, internal IPs, paths) in output."""
        sensitive_types = []

        # Check for API keys, tokens, secrets
        if re.search(r"(api[_-]?key|token|secret|password)[\s:=]", text, re.IGNORECASE):
            sensitive_types.append("credentials")

        # Check for internal URLs/IPs (RFC 1918 private ranges)
        if re.search(r"(192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[0-1])\.)", text):
            sensitive_types.append("internal_ip")

        # Check for file paths
        if re.search(r"(/etc/|/home/|/root/|C:\\|/var/)", text):
            sensitive_types.append("file_paths")

        return {
            "detected": len(sensitive_types) > 0,
            "types": sensitive_types
        }

    def _detect_hallucination_indicators(self, text: str) -> Dict[str, Any]:
        """Detect heuristic hallucination indicators in output (weak signal)."""
        indicators = []

        # Check for uncertainty markers
        uncertainty_phrases = [
            "i'm not sure", "i think", "maybe", "possibly",
            "i don't know", "unclear", "uncertain"
        ]

        text_lower = text.lower()
        for phrase in uncertainty_phrases:
            if phrase in text_lower:
                indicators.append(f"Uncertainty: '{phrase}'")

        # Check for inconsistencies
        if "however" in text_lower and "but" in text_lower:
            indicators.append("Contradictory statements")

        return {
            "likely": len(indicators) > 0,
            "indicators": indicators
        }

    def _redact(self, text) -> str:
        """Redact a matched PII value, preserving only its character count."""
        if isinstance(text, tuple):
            # re.findall returns tuples when the pattern has groups.
            # FIX: take the last non-empty group (the captured secret),
            # not the first (which for api_key/password is the label).
            text = next((part for part in reversed(text) if part), "")
        return f"***REDACTED ({len(str(text))} chars)***"

    def _get_timestamp(self) -> str:
        """Return the current UTC timestamp as an ISO-8601 string."""
        # FIX: datetime.utcnow() is deprecated (Python 3.12+) and naive;
        # use a timezone-aware UTC timestamp instead.
        from datetime import datetime, timezone
        return datetime.now(timezone.utc).isoformat()

    def get_recommendations(self) -> List[str]:
        """Get general security recommendations for LLM applications."""
        return [
            "Always validate and sanitize user inputs before sending to LLM",
            "Implement rate limiting to prevent abuse",
            "Monitor for unusual patterns in user prompts",
            "Use separate system prompts that users cannot override",
            "Implement output filtering to prevent PII leakage",
            "Log all suspicious prompts for security review",
            "Use prompt templates with fixed structure",
            "Implement content moderation on both input and output",
            "Monitor token usage for anomalies",
            "Keep model and framework dependencies updated"
        ]

365 

366 

# CLI entry point
def run_llm_security_scan(target: str, options: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run LLM security scan.

    Args:
        target: Text to scan (prompt or output)
        options: Scan options

    Returns:
        Scan results
    """
    agent = LLMSecurityAgent()

    scan_type = options.get("type", "prompt")  # "prompt" or "output"

    # Dispatch to the input-side or output-side scanner.
    scanner = agent.scan_prompt if scan_type == "prompt" else agent.scan_output
    results = scanner(target)

    return {
        "agent": "LLM Security Agent",
        "scan_type": scan_type,
        "results": results,
        # Recommendations only accompany unsafe results.
        "recommendations": [] if results["safe"] else agent.get_recommendations(),
    }