Coverage for aipyapp/plugins/p_web_tools.py: 0%

72 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-11 12:02 +0200

1#!/usr/bin/env python 

2# -*- coding: utf-8 -*- 

3 

4import requests 

5from typing import Dict, Any, Optional 

6from urllib.parse import urlparse 

7 

8from aipyapp import TaskPlugin 

9 

class WebToolsPlugin(TaskPlugin):
    """Web tools plugin - exposes webpage fetching, URL analysis, generic
    HTTP requests and URL status checks as ``fn_*`` task functions."""

    name = "web_tools"
    version = "1.0.0"
    description = "提供网页抓取、URL分析、HTTP请求等网络工具功能"
    author = "AiPy Team"

    def init(self):
        """Read network settings from the plugin config.

        Recognized config keys: ``timeout`` (seconds, default 30),
        ``user_agent``, ``max_content_length`` (bytes, default 1 MiB) and
        ``default_headers`` (merged over the User-Agent, so an entry there
        can override it).
        """
        self.timeout = self.config.get('timeout', 30)
        self.user_agent = self.config.get('user_agent', 'AiPy WebTools/1.0')
        self.max_content_length = self.config.get('max_content_length', 1024 * 1024)  # 1MB
        self.headers = {
            'User-Agent': self.user_agent,
            **self.config.get('default_headers', {})
        }

        self.logger.info(f"初始化网络工具,超时: {self.timeout}s")

    def fn_fetch_webpage(self, url: str, extract_text: bool = True) -> Dict[str, Any]:
        """
        Fetch the content of a web page.

        Args:
            url: Target URL.
            extract_text: If True, extract only the visible text content.

        Returns:
            Dict describing the page. On success: ``success``, ``url``,
            ``status_code``, ``headers``, ``content_type``, ``encoding``
            plus either ``text``/``title`` (when extract_text) or
            ``content``. On failure: ``success`` is False and ``error``
            holds the reason.

        Examples:
            >>> fn_fetch_webpage("https://www.baidu.com")
            {'success': True, 'url': 'https://www.baidu.com', 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content_type': 'text/html; charset=utf-8', 'encoding': 'utf-8', 'text': '百度一下,你就知道', 'title': '百度一下,你就知道'}
            >>> fn_fetch_webpage("https://www.baidu.com", extract_text=False)
            {'success': True, 'url': 'https://www.baidu.com', 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content_type': 'text/html; charset=utf-8', 'encoding': 'utf-8', 'content': '<!DOCTYPE html>...'}
        """
        return self._fetch_webpage(url, extract_text)

    def fn_http_request(self, url: str, method: str = "GET", headers: Optional[Dict[str, str]] = None, params: Optional[Dict[str, Any]] = None, json_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Send an HTTP request.

        Args:
            url: Request URL.
            method: HTTP method (GET/POST/PUT/DELETE, ...).
            headers: Extra request headers, merged over the plugin defaults.
            params: URL query parameters.
            json_data: JSON request body.

        Returns:
            Dict with ``success``, ``status_code``, ``headers``, ``content``
            and ``elapsed``. Note: HTTP error statuses (4xx/5xx) still return
            ``success: True`` with the status code; only transport failures
            (DNS, timeout, connection refused) return ``success: False``
            with an ``error`` message.

        Examples:
            >>> fn_http_request("https://www.baidu.com")
            {'success': True, 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content': '百度一下,你就知道', 'elapsed': '0.000000s'}
            >>> fn_http_request("https://www.baidu.com", method="POST", json_data={"name": "John", "age": 30})
            {'success': True, 'status_code': 200, 'headers': {'Content-Type': 'text/html; charset=utf-8'}, 'content': '百度一下,你就知道', 'elapsed': '0.000000s'}
            >>> fn_http_request("https://unreachable.invalid")
            {'success': False, 'error': '...connection error...'}
        """
        return self._http_request(url, method, headers, params, json_data)

    def fn_analyze_url(self, url: str) -> Dict[str, str]:
        """
        Split a URL into its component parts.

        Args:
            url: URL to analyze.

        Returns:
            Dict with scheme, netloc, hostname, port, path, params, query,
            fragment, username and a masked password; ``{"error": ...}``
            on parse failure.
        """
        return self._analyze_url(url)

    def fn_check_url_status(self, url: str) -> Dict[str, Any]:
        """
        Check the status of a URL via a HEAD request.

        Args:
            url: Target URL.

        Returns:
            Dict with ``accessible`` plus status/header info on success, or
            ``accessible: False`` and ``error`` on failure.
        """
        return self._check_url_status(url)

    def _fetch_webpage(self, url: str, extract_text: bool) -> Dict[str, Any]:
        """Fetch a page, enforce the size limit and optionally extract text."""
        try:
            # `with` closes the streamed connection even on the early
            # oversized-content return (the original leaked it there).
            with requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                stream=True
            ) as response:
                # Size guard - only effective when the server sends a
                # Content-Length header; chunked responses are not limited.
                content_length = response.headers.get('content-length')
                if content_length and int(content_length) > self.max_content_length:
                    return {
                        "success": False,
                        "error": f"内容太大 ({content_length} bytes),超过限制 ({self.max_content_length} bytes)"
                    }

                response.raise_for_status()

                result = {
                    "success": True,
                    "url": url,
                    "status_code": response.status_code,
                    "headers": dict(response.headers),
                    "content_type": response.headers.get('content-type', ''),
                    "encoding": response.encoding
                }

                if extract_text and 'text/html' in response.headers.get('content-type', ''):
                    try:
                        from bs4 import BeautifulSoup
                        soup = BeautifulSoup(response.content, 'html.parser')

                        # Drop non-visible markup before extracting text.
                        for script in soup(["script", "style"]):
                            script.decompose()

                        result["text"] = soup.get_text(separator=' ', strip=True)
                        result["title"] = soup.title.string if soup.title else ""

                    except ImportError:
                        # bs4 not installed: fall back to the raw body.
                        result["text"] = response.text
                        result["raw_html"] = response.text[:2000] + "..." if len(response.text) > 2000 else response.text
                else:
                    result["content"] = response.text[:2000] + "..." if len(response.text) > 2000 else response.text

                return result

        except Exception as e:
            self.logger.error(f"抓取网页失败 {url}: {e}")
            return {
                "success": False,
                "url": url,
                "error": str(e)
            }

    def _analyze_url(self, url: str) -> Dict[str, str]:
        """Break a URL into its parsed components."""
        try:
            parsed = urlparse(url)
            return {
                "scheme": parsed.scheme,
                "netloc": parsed.netloc,
                # Fix: hostname is None when absent; normalize to "" so the
                # declared Dict[str, str] return type actually holds.
                "hostname": parsed.hostname or "",
                "port": str(parsed.port) if parsed.port else "",
                "path": parsed.path,
                "params": parsed.params,
                "query": parsed.query,
                "fragment": parsed.fragment,
                "username": parsed.username or "",
                "password": "***" if parsed.password else ""  # never leak credentials
            }
        except Exception as e:
            return {"error": str(e)}

    def _http_request(self, url: str, method: str, headers: Optional[Dict], params: Optional[Dict], json_data: Optional[Dict]) -> Dict[str, Any]:
        """Send an HTTP request with the plugin's default headers merged in."""
        try:
            request_headers = self.headers.copy()
            if headers:
                request_headers.update(headers)

            kwargs = {
                'headers': request_headers,
                'timeout': self.timeout,
                'params': params
            }

            # Fix: use `is not None` so an explicit empty JSON body ({}) is
            # still sent; the original truthiness test silently dropped it.
            if json_data is not None:
                kwargs['json'] = json_data

            response = requests.request(method.upper(), url, **kwargs)

            # No raise_for_status() here: HTTP 4xx/5xx are reported as a
            # successful request with the corresponding status_code.
            return {
                "success": True,
                "status_code": response.status_code,
                "headers": dict(response.headers),
                "content": response.text,
                "elapsed": str(response.elapsed)
            }

        except Exception as e:
            self.logger.error(f"HTTP请求失败 {method} {url}: {e}")
            return {
                "success": False,
                "error": str(e)
            }

    def _check_url_status(self, url: str) -> Dict[str, Any]:
        """Probe a URL with a HEAD request and report reachability."""
        try:
            # NOTE(review): requests.head does not follow redirects by
            # default, so a 301/302 is reported as-is - confirm intended.
            response = requests.head(url, headers=self.headers, timeout=self.timeout)
            return {
                "accessible": True,
                "status_code": response.status_code,
                "content_type": response.headers.get('content-type', ''),
                "content_length": response.headers.get('content-length', ''),
                "last_modified": response.headers.get('last-modified', ''),
                "server": response.headers.get('server', '')
            }
        except Exception as e:
            return {
                "accessible": False,
                "error": str(e)
            }