Coverage for src/alprina_cli/agents/llm_security.py: 0%
121 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-14 11:27 +0100
1"""
2LLM Security Agent - Protect AI/LLM applications from security threats.
4Inspired by Noma Security's AI security platform.
5Detects: Prompt injection, jailbreaking, model poisoning, data leakage.
6"""
8from typing import Dict, List, Any, Optional
9from loguru import logger
10import re
11import json
class LLMSecurityAgent:
    """
    Security agent for Large Language Model (LLM) applications.

    Capabilities:
    - Prompt injection detection
    - Jailbreak attempt detection
    - PII/sensitive data leakage detection
    - Output validation
    - Token usage monitoring
    - Model behavior analysis
    """

    def __init__(self):
        """Initialize LLM Security Agent and its detection pattern tables."""
        self.name = "LLM Security Agent"
        self.agent_type = "llm-security"
        logger.debug(f"{self.name} initialized")

        # Known prompt injection patterns. NOTE: several patterns contain
        # capture groups, so re.findall() returns tuples for them -- _redact()
        # accounts for that.
        self.injection_patterns = [
            # System prompt override attempts
            r"(?i)ignore (all )?previous (instructions?|prompts?|commands?)",
            r"(?i)disregard (all )?(previous|above|prior) (instructions?|prompts?)",
            r"(?i)forget (all )?(previous|above) (instructions?|prompts?)",
            r"(?i)new (instructions?|prompts?|system)",

            # Jailbreak attempts
            r"(?i)(you are|act as|pretend to be|roleplay as) (dan|a? ?jailbreak|evil|malicious)",
            r"(?i)developer mode",
            r"(?i)sudo mode",
            r"(?i)god mode",
            r"(?i)unrestricted mode",

            # Prompt leakage attempts
            r"(?i)show (me )?(your|the) (system|initial|original) prompt",
            r"(?i)print (your|the) (instructions?|prompts?)",
            r"(?i)reveal (your|the) (system|hidden) (prompt|instructions?)",
            r"(?i)what (are|were) you told",

            # Command injection
            r"(?i)execute|eval|system|subprocess|os\.|shell",
            r"(?i)__import__|importlib",

            # Multi-turn attacks
            r"(?i)(step|part|phase) (\d+|one|two|three).*?(final|last) step",
            r"(?i)first.*?then.*?finally",
        ]

        # PII patterns, keyed by PII type.
        self.pii_patterns = {
            # Fixed: character class was [A-Z|a-z], which wrongly included
            # a literal '|' in the TLD match.
            "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
            "credit_card": r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b",
            "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
            "api_key": r"(api[_-]?key|apikey|api[_-]?token)[\s:=]+['\"]?([a-zA-Z0-9_\-]{20,})['\"]?",
            "password": r"(password|passwd|pwd)[\s:=]+['\"]?([^\s\"']{8,})['\"]?",
        }

        # Toxic/harmful content patterns
        self.harmful_patterns = [
            r"(?i)(how to|guide to|instructions for) (hack|crack|break into|bypass)",
            r"(?i)(build|create|make) (a )?(bomb|weapon|malware|virus)",
            r"(?i)(illegal|illicit) (activities?|drugs|substances)",
        ]

    def scan_prompt(self, prompt: str, context: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Scan a user prompt for security issues before sending to LLM.

        Args:
            prompt: User's input prompt
            context: Optional context (conversation history, etc.) -- currently unused

        Returns:
            Security analysis results with keys: safe, risk_level, risk_score,
            findings, prompt, timestamp
        """
        logger.debug(f"Scanning prompt: {prompt[:100]}...")

        findings = []
        risk_score = 0

        # Check for prompt injection
        injection_results = self._detect_prompt_injection(prompt)
        if injection_results["detected"]:
            findings.append({
                "type": "prompt_injection",
                "severity": "HIGH",
                "title": "Prompt Injection Attempt Detected",
                "description": "User prompt contains patterns attempting to override system instructions",
                "details": injection_results["matches"],
                "recommendation": "Block this prompt and log the attempt"
            })
            risk_score += 80

        # Check for jailbreak attempts
        jailbreak_results = self._detect_jailbreak(prompt)
        if jailbreak_results["detected"]:
            findings.append({
                "type": "jailbreak_attempt",
                "severity": "HIGH",
                "title": "Jailbreak Attempt Detected",
                "description": "User trying to bypass model safety constraints",
                "details": jailbreak_results["matches"],
                "recommendation": "Block this prompt and monitor user"
            })
            risk_score += 85

        # Check for PII in prompt
        pii_results = self._detect_pii(prompt)
        if pii_results["detected"]:
            findings.append({
                "type": "pii_detected",
                "severity": "MEDIUM",
                "title": "PII Detected in Prompt",
                "description": "Prompt contains personally identifiable information",
                "details": pii_results["pii_types"],
                "recommendation": "Redact PII before sending to LLM"
            })
            risk_score += 40

        # Check for harmful content requests
        harmful_results = self._detect_harmful_content(prompt)
        if harmful_results["detected"]:
            findings.append({
                "type": "harmful_request",
                "severity": "HIGH",
                "title": "Harmful Content Request",
                "description": "User requesting harmful or illegal information",
                "details": harmful_results["matches"],
                "recommendation": "Block this prompt and log for review"
            })
            risk_score += 90

        return {
            "safe": len(findings) == 0,
            "risk_level": self._risk_level(risk_score),
            "risk_score": min(risk_score, 100),
            "findings": findings,
            "prompt": prompt,
            "timestamp": self._get_timestamp()
        }

    def scan_output(self, output: str, context: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Scan LLM output for security issues before returning to user.

        Args:
            output: LLM's generated output
            context: Optional context -- currently unused

        Returns:
            Security analysis results with keys: safe, risk_level, risk_score,
            findings, output, timestamp
        """
        logger.debug(f"Scanning LLM output: {output[:100]}...")

        findings = []
        risk_score = 0

        # Check for leaked PII
        pii_results = self._detect_pii(output)
        if pii_results["detected"]:
            findings.append({
                "type": "pii_leakage",
                "severity": "HIGH",
                "title": "PII Leaked in Output",
                "description": "LLM output contains personally identifiable information",
                "details": pii_results["pii_types"],
                "recommendation": "Redact PII before showing to user"
            })
            risk_score += 70

        # Check for sensitive data exposure
        sensitive_results = self._detect_sensitive_data(output)
        if sensitive_results["detected"]:
            findings.append({
                "type": "sensitive_data_exposure",
                "severity": "MEDIUM",
                "title": "Sensitive Data in Output",
                "description": "Output may contain sensitive information",
                "details": sensitive_results["types"],
                "recommendation": "Review and redact sensitive data"
            })
            risk_score += 50

        # Check for hallucination indicators
        hallucination_results = self._detect_hallucination_indicators(output)
        if hallucination_results["likely"]:
            findings.append({
                "type": "potential_hallucination",
                "severity": "LOW",
                "title": "Potential Hallucination Detected",
                "description": "Output may contain fabricated information",
                "details": hallucination_results["indicators"],
                "recommendation": "Verify information before trusting"
            })
            risk_score += 20

        return {
            "safe": len(findings) == 0,
            # Added for consistency with scan_prompt(), which exposes the
            # same score-to-level mapping.
            "risk_level": self._risk_level(risk_score),
            "risk_score": min(risk_score, 100),
            "findings": findings,
            "output": output,
            "timestamp": self._get_timestamp()
        }

    @staticmethod
    def _risk_level(risk_score: int) -> str:
        """Map an accumulated risk score to a coarse risk level label."""
        if risk_score >= 70:
            return "CRITICAL"
        if risk_score >= 40:
            return "HIGH"
        if risk_score >= 20:
            return "MEDIUM"
        return "LOW"

    def _detect_prompt_injection(self, text: str) -> Dict[str, Any]:
        """Detect prompt injection patterns in *text*."""
        matches = []
        for pattern in self.injection_patterns:
            found = re.findall(pattern, text)
            if found:
                matches.append({
                    "pattern": pattern,
                    "matches": found
                })

        return {
            "detected": len(matches) > 0,
            "matches": matches
        }

    def _detect_jailbreak(self, text: str) -> Dict[str, Any]:
        """Detect jailbreak attempt keywords (whole-word/phrase matches)."""
        jailbreak_keywords = [
            "dan", "jailbreak", "developer mode", "god mode",
            "unrestricted", "bypass", "ignore rules", "evil mode"
        ]

        matches = []
        text_lower = text.lower()
        for keyword in jailbreak_keywords:
            # Word-boundary match so short keywords like "dan" do not fire on
            # substrings of ordinary words ("dancing", "abundant", ...).
            if re.search(rf"\b{re.escape(keyword)}\b", text_lower):
                matches.append(keyword)

        return {
            "detected": len(matches) > 0,
            "matches": matches
        }

    def _detect_pii(self, text: str) -> Dict[str, Any]:
        """Detect personally identifiable information; values are redacted."""
        pii_found = {}

        for pii_type, pattern in self.pii_patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                # Redact matched PII -- never propagate raw values.
                pii_found[pii_type] = [self._redact(m) for m in matches]

        return {
            "detected": len(pii_found) > 0,
            "pii_types": pii_found
        }

    def _detect_harmful_content(self, text: str) -> Dict[str, Any]:
        """Detect requests for harmful or illegal content."""
        matches = []
        for pattern in self.harmful_patterns:
            found = re.findall(pattern, text)
            if found:
                matches.append({
                    "pattern": pattern,
                    "matches": found
                })

        return {
            "detected": len(matches) > 0,
            "matches": matches
        }

    def _detect_sensitive_data(self, text: str) -> Dict[str, Any]:
        """Detect sensitive data (credentials, internal IPs, paths) in output."""
        sensitive_types = []

        # Check for API keys, tokens, secrets
        if re.search(r"(api[_-]?key|token|secret|password)[\s:=]", text, re.IGNORECASE):
            sensitive_types.append("credentials")

        # Check for internal URLs/IPs (RFC 1918 private ranges)
        if re.search(r"(192\.168\.|10\.|172\.(1[6-9]|2[0-9]|3[0-1])\.)", text):
            sensitive_types.append("internal_ip")

        # Check for file paths
        if re.search(r"(/etc/|/home/|/root/|C:\\|/var/)", text):
            sensitive_types.append("file_paths")

        return {
            "detected": len(sensitive_types) > 0,
            "types": sensitive_types
        }

    def _detect_hallucination_indicators(self, text: str) -> Dict[str, Any]:
        """Detect heuristic hallucination indicators in output."""
        indicators = []

        # Check for uncertainty markers
        uncertainty_phrases = [
            "i'm not sure", "i think", "maybe", "possibly",
            "i don't know", "unclear", "uncertain"
        ]

        text_lower = text.lower()
        for phrase in uncertainty_phrases:
            if phrase in text_lower:
                indicators.append(f"Uncertainty: '{phrase}'")

        # Check for inconsistencies (very rough heuristic)
        if "however" in text_lower and "but" in text_lower:
            indicators.append("Contradictory statements")

        return {
            "likely": len(indicators) > 0,
            "indicators": indicators
        }

    def _redact(self, text: str) -> str:
        """Redact a matched PII value, keeping only its length.

        re.findall() yields tuples for patterns with capture groups; in that
        case the first group stands in for the match.
        """
        if isinstance(text, tuple):
            text = text[0] if text else ""
        return f"***REDACTED ({len(str(text))} chars)***"

    def _get_timestamp(self) -> str:
        """Get current UTC timestamp as a naive ISO-8601 string."""
        from datetime import datetime, timezone
        # datetime.utcnow() is deprecated (Python 3.12+); produce the identical
        # naive-UTC string from an aware "now" with the tzinfo stripped.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def get_recommendations(self) -> List[str]:
        """Get general security recommendations for LLM applications."""
        return [
            "Always validate and sanitize user inputs before sending to LLM",
            "Implement rate limiting to prevent abuse",
            "Monitor for unusual patterns in user prompts",
            "Use separate system prompts that users cannot override",
            "Implement output filtering to prevent PII leakage",
            "Log all suspicious prompts for security review",
            "Use prompt templates with fixed structure",
            "Implement content moderation on both input and output",
            "Monitor token usage for anomalies",
            "Keep model and framework dependencies updated"
        ]
367# CLI entry point
# CLI entry point
def run_llm_security_scan(target: str, options: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run an LLM security scan over a prompt or an LLM output.

    Args:
        target: Text to scan (prompt or output)
        options: Scan options; key "type" selects "prompt" (default) or "output"

    Returns:
        Scan results
    """
    agent = LLMSecurityAgent()
    scan_type = options.get("type", "prompt")  # "prompt" or "output"

    # Dispatch to the matching scanner method.
    scan = agent.scan_prompt if scan_type == "prompt" else agent.scan_output
    results = scan(target)

    # General hardening advice is only attached when something was flagged.
    recommendations = [] if results["safe"] else agent.get_recommendations()

    return {
        "agent": "LLM Security Agent",
        "scan_type": scan_type,
        "results": results,
        "recommendations": recommendations,
    }