Coverage for aipyapp/plugins/p_web_tools.py: 0%
72 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-11 12:02 +0200
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
4import requests
5from typing import Dict, Any, Optional
6from urllib.parse import urlparse
8from aipyapp import TaskPlugin
class WebToolsPlugin(TaskPlugin):
    """Web tools plugin: webpage fetching, URL analysis and generic HTTP requests."""

    name = "web_tools"
    version = "1.0.0"
    description = "提供网页抓取、URL分析、HTTP请求等网络工具功能"
    author = "AiPy Team"

    def init(self):
        """Initialize network settings from the plugin config.

        Config keys: ``timeout`` (seconds, default 30), ``user_agent``,
        ``max_content_length`` (bytes, default 1 MiB), ``default_headers``
        (merged over the User-Agent, so it may override it).
        """
        self.timeout = self.config.get('timeout', 30)
        self.user_agent = self.config.get('user_agent', 'AiPy WebTools/1.0')
        self.max_content_length = self.config.get('max_content_length', 1024 * 1024)  # 1MB
        # default_headers is spread last on purpose: user config wins.
        self.headers = {
            'User-Agent': self.user_agent,
            **self.config.get('default_headers', {})
        }

        self.logger.info(f"初始化网络工具,超时: {self.timeout}s")

    def fn_fetch_webpage(self, url: str, extract_text: bool = True) -> Dict[str, Any]:
        """
        Fetch a web page.

        Args:
            url: Target URL.
            extract_text: If True, extract plain text (and title) from HTML
                instead of returning the raw body.

        Returns:
            Dict describing the page; ``success`` tells whether the fetch
            worked, ``error`` carries the failure message otherwise.

        Examples:
            >>> fn_fetch_webpage("https://www.baidu.com")
            {'success': True, 'url': 'https://www.baidu.com', 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content_type': 'text/html; charset=utf-8', 'encoding': 'utf-8', 'text': '百度一下,你就知道', 'title': '百度一下,你就知道'}
            >>> fn_fetch_webpage("https://www.baidu.com", extract_text=False)
            {'success': True, 'url': 'https://www.baidu.com', 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content_type': 'text/html; charset=utf-8', 'encoding': 'utf-8', 'content': '<!DOCTYPE html>...'}
        """
        return self._fetch_webpage(url, extract_text)

    def fn_http_request(self, url: str, method: str = "GET", headers: Optional[Dict[str, str]] = None, params: Optional[Dict[str, Any]] = None, json_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Send an HTTP request.

        Args:
            url: Request URL.
            method: HTTP method (GET/POST/PUT/DELETE, ...).
            headers: Extra request headers (merged over the plugin defaults).
            params: URL query parameters.
            json_data: JSON request body (an empty dict is still sent).

        Returns:
            Dict with ``success``, ``status_code``, ``headers``, ``content``
            (response text) and ``elapsed``. Note: a non-2xx status still
            yields ``success: True`` — inspect ``status_code``; ``success``
            is False only for transport-level errors (DNS, timeout, ...).

        Examples:
            >>> fn_http_request("https://www.baidu.com")
            {'success': True, 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content': '百度一下,你就知道', 'elapsed': '0.000000s'}
            >>> fn_http_request("https://www.baidu.com", method="POST", json_data={"name": "John", "age": 30})
            {'success': True, 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content': '百度一下,你就知道', 'elapsed': '0.000000s'}
            >>> fn_http_request("https://unreachable.example")
            {'success': False, 'error': 'Failed to establish a new connection'}
        """
        return self._http_request(url, method, headers, params, json_data)

    def fn_analyze_url(self, url: str) -> Dict[str, str]:
        """
        Break a URL into its components.

        Args:
            url: URL to analyze.

        Returns:
            Dict of URL components (scheme, netloc, hostname, port, path,
            params, query, fragment, username, password-masked), or
            ``{'error': ...}`` on failure.
        """
        return self._analyze_url(url)

    def fn_check_url_status(self, url: str) -> Dict[str, Any]:
        """
        Check whether a URL is reachable (HEAD request).

        Args:
            url: Target URL.

        Returns:
            Dict with ``accessible`` plus status/header details, or
            ``accessible: False`` and ``error`` on failure.
        """
        return self._check_url_status(url)

    def _fetch_webpage(self, url: str, extract_text: bool) -> Dict[str, Any]:
        """Fetch *url* and build the result dict; never raises."""
        try:
            # Context manager guarantees the connection is released even on
            # the early "too large" return (stream=True defers the body read,
            # so an unclosed response would leak the pooled connection).
            with requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                stream=True
            ) as response:
                # Reject oversized bodies before downloading them. Only
                # effective when the server sends Content-Length.
                content_length = response.headers.get('content-length')
                if content_length and int(content_length) > self.max_content_length:
                    return {
                        "success": False,
                        "error": f"内容太大 ({content_length} bytes),超过限制 ({self.max_content_length} bytes)"
                    }

                response.raise_for_status()

                result = {
                    "success": True,
                    "url": url,
                    "status_code": response.status_code,
                    "headers": dict(response.headers),
                    "content_type": response.headers.get('content-type', ''),
                    "encoding": response.encoding
                }

                if extract_text and 'text/html' in response.headers.get('content-type', ''):
                    try:
                        from bs4 import BeautifulSoup
                        soup = BeautifulSoup(response.content, 'html.parser')
                        # Drop script/style so get_text() returns visible text only.
                        for script in soup(["script", "style"]):
                            script.decompose()
                        result["text"] = soup.get_text(separator=' ', strip=True)
                        result["title"] = soup.title.string if soup.title else ""
                    except ImportError:
                        # bs4 unavailable: fall back to raw text, truncated preview.
                        result["text"] = response.text
                        result["raw_html"] = response.text[:2000] + "..." if len(response.text) > 2000 else response.text
                else:
                    result["content"] = response.text[:2000] + "..." if len(response.text) > 2000 else response.text

                return result

        except Exception as e:
            self.logger.error(f"抓取网页失败 {url}: {e}")
            return {
                "success": False,
                "url": url,
                "error": str(e)
            }

    def _analyze_url(self, url: str) -> Dict[str, str]:
        """Split *url* into components; all values are strings (never None)."""
        try:
            parsed = urlparse(url)
            return {
                "scheme": parsed.scheme,
                "netloc": parsed.netloc,
                # hostname is None for URLs without a netloc; normalize to ""
                # to honor the declared Dict[str, str] contract.
                "hostname": parsed.hostname or "",
                "port": str(parsed.port) if parsed.port else "",
                "path": parsed.path,
                "params": parsed.params,
                "query": parsed.query,
                "fragment": parsed.fragment,
                "username": parsed.username or "",
                # Never expose credentials in results.
                "password": "***" if parsed.password else ""
            }
        except Exception as e:
            return {"error": str(e)}

    def _http_request(self, url: str, method: str, headers: Optional[Dict], params: Optional[Dict], json_data: Optional[Dict]) -> Dict[str, Any]:
        """Perform the HTTP request; transport errors become an error dict."""
        try:
            request_headers = self.headers.copy()
            if headers:
                request_headers.update(headers)

            kwargs = {
                'headers': request_headers,
                'timeout': self.timeout,
                'params': params
            }

            # `is not None` so an explicit empty JSON body ({}) is still sent;
            # a plain truthiness test would silently drop it.
            if json_data is not None:
                kwargs['json'] = json_data

            response = requests.request(method.upper(), url, **kwargs)

            # No raise_for_status() here on purpose: callers get the status
            # code and body even for 4xx/5xx responses.
            return {
                "success": True,
                "status_code": response.status_code,
                "headers": dict(response.headers),
                "content": response.text,
                "elapsed": str(response.elapsed)
            }

        except Exception as e:
            self.logger.error(f"HTTP请求失败 {method} {url}: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    def _check_url_status(self, url: str) -> Dict[str, Any]:
        """Probe *url* with a HEAD request and report reachability."""
        try:
            # NOTE(review): requests.head() does not follow redirects by
            # default, so a 301/302 is reported as-is — confirm that is the
            # intended semantics before enabling allow_redirects.
            response = requests.head(url, headers=self.headers, timeout=self.timeout)
            return {
                "accessible": True,
                "status_code": response.status_code,
                "content_type": response.headers.get('content-type', ''),
                "content_length": response.headers.get('content-length', ''),
                "last_modified": response.headers.get('last-modified', ''),
                "server": response.headers.get('server', '')
            }
        except Exception as e:
            return {
                "accessible": False,
                "error": str(e)
            }