Coverage for aipyapp/aipy/multimodal.py: 24%
116 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-11 12:02 +0200
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-11 12:02 +0200
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
4from base64 import b64encode
5import re
6from pathlib import Path
7from typing import Union, List, Dict, Any
8import mimetypes
10from loguru import logger
11from charset_normalizer import from_bytes
# A list of messages, each a dict with string keys and arbitrary values.
MessageList = List[Dict[str, Any]]
# What MMContent.content returns: either a plain string or a MessageList.
LLMContext = Union[str, MessageList]
class MMContentError(Exception):
    """Base exception for errors raised while handling multimodal content."""
class FileReadError(MMContentError):
    """Raised when a file cannot be read.

    Attributes:
        file_path: Path of the file whose read failed.
        original_error: The underlying exception that caused the failure.
    """

    def __init__(self, file_path: str, original_error: Exception):
        message = f"无法读取文件 {file_path}: {original_error}"
        super().__init__(message)
        self.file_path = file_path
        self.original_error = original_error
def is_text_file(path, blocksize=4096):
    """Heuristically decide whether the file at *path* contains text.

    Reads at most *blocksize* bytes from the start of the file and runs
    charset detection on that sample. The file counts as text when the best
    match has an encoding and a chaos score below 0.1.

    This is best-effort: any exception (including failure to open the file)
    is logged and reported as ``False``.
    """
    try:
        with open(path, 'rb') as stream:
            sample = stream.read(blocksize)
        matches = from_bytes(sample)
        top = matches.best() if matches else None
        if top is None:
            return False
        # An encoding was detected and the chaos score is very low: text.
        return bool(top.encoding) and top.chaos < 0.1
    except Exception:
        logger.exception('Failed to check if file is text')
        return False
class MMContent:
    """Multimodal content with unified handling of text, images and files.

    The input string is split into plain-text fragments and ``@path``
    references. A reference to an existing file, or to a network image URL,
    becomes an ``image``, ``document`` (text file) or ``file`` (binary) item;
    anything else stays plain text.
    """

    # Splits on ``@reference`` tokens; the capture group keeps the tokens in
    # the split result. A reference is a double-quoted string, a
    # single-quoted string, or a run of non-whitespace characters.
    _REF_SPLIT_RE = re.compile(r'(@(?:"[^"]*"|\'[^\']*\'|[^\s]+))')
    # File extensions (lower-case) that are treated as images.
    _IMAGE_EXTS = frozenset({'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'})

    def __init__(self, string: str, base_path: Union[Path, None] = None):
        """Parse *string* into items.

        Args:
            string: Raw user input, possibly containing ``@path`` references.
            base_path: Directory against which relative references are
                resolved; when falsy, references are used as given.
        """
        self.log = logger.bind(type='multimodal')
        self.string = string
        self.items = self._from_string(string, base_path)

    def _from_string(self, text: str, base_path: Union[Path, None] = None) -> list:
        """Parse *text* into a list of item dicts.

        Supports references such as ``@file.pdf`` or ``@image.jpg`` and quoted
        paths such as ``@"path with spaces.txt"``.

        Returns:
            A list of dicts, each either ``{'type': 'text', 'text': ...}`` or
            ``{'type': 'image' | 'document' | 'file', 'path': ...}``.
        """
        items = []
        for part in self._REF_SPLIT_RE.split(text):
            part = part.strip()
            if not part:
                continue
            if not part.startswith('@'):
                items.append({'type': 'text', 'text': part})
                continue

            file_path = part[1:]
            # Strip one pair of matching surrounding quotes.
            if (len(file_path) >= 2 and file_path[0] == file_path[-1]
                    and file_path[0] in ('"', "'")):
                file_path = file_path[1:-1]

            # An empty reference (a bare "@" or '@""') is not a path.
            # Path('') would resolve to the current directory and "exist".
            if not file_path:
                items.append({'type': 'text', 'text': part})
                continue

            ext = Path(file_path).suffix.lower()
            file_type = 'image' if ext in self._IMAGE_EXTS else None

            # Network images are referenced by URL. They must skip base_path
            # resolution and the filesystem existence check, otherwise the
            # URL branch of _process_image_item can never be reached.
            if file_type == 'image' and self._is_network_url(file_path):
                items.append({'type': 'image', 'path': file_path})
                continue

            if base_path:
                candidate = Path(file_path)
                if not candidate.is_absolute():
                    file_path = str(base_path / candidate)

            # A reference to a missing file is kept as ordinary text.
            if not Path(file_path).exists():
                items.append({'type': 'text', 'text': part})
                continue

            if not file_type:
                # Distinguish text files from binary files.
                file_type = 'document' if is_text_file(file_path) else 'file'
            items.append({'type': file_type, 'path': file_path})
        return items

    @property
    def is_multimodal(self) -> bool:
        """Whether any item is an image, a file or a document."""
        return any(item['type'] in ('image', 'file', 'document') for item in self.items)

    def _is_network_url(self, url: str) -> bool:
        """Return True if *url* starts with ``http://``, ``https://`` or ``data:``."""
        return url.startswith(('http://', 'https://', 'data:'))

    def _get_mime_type(self, file_path: str, default_mime: str) -> str:
        """Guess the MIME type of *file_path*, falling back to *default_mime*."""
        mime, _ = mimetypes.guess_type(file_path)
        return mime or default_mime

    def _read_file(self, file_path: str, base64: bool = False) -> str:
        """Read a file and return its content as a string.

        Args:
            file_path: Path of the file to read.
            base64: When True, return the bytes base64-encoded; otherwise
                return the decoded text.

        Raises:
            FileReadError: If the file cannot be read or decoded.
        """
        try:
            with open(file_path, 'rb') as f:
                data = f.read()
            if base64:
                return b64encode(data).decode('ascii')
            try:
                return data.decode('utf-8')
            except UnicodeDecodeError:
                # is_text_file accepts any detected encoding, so a file
                # classified as text is not necessarily UTF-8. Fall back to
                # the same detector instead of failing outright.
                best = from_bytes(data).best()
                if best is None:
                    raise
                return str(best)
        except Exception as e:
            raise FileReadError(file_path, e) from e

    def _process_image_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert an image item into an ``image_url`` part."""
        url = item['path']

        # Network URLs are used as they are.
        if self._is_network_url(url):
            return {"type": "image_url", "image_url": {"url": url}}

        # Local images are inlined as a base64 data URL.
        mime = self._get_mime_type(url, 'image/jpeg')
        b64_data = self._read_file(url, base64=True)
        data_url = f"data:{mime};base64,{b64_data}"
        return {"type": "image_url", "image_url": {"url": data_url}}

    def _process_file_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a binary file item into a text part naming its path."""
        return {"type": "text", "text": f"file: {item['path']}"}

    def _process_document_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a text-file item into a text part embedding its content."""
        path = str(item['path'])
        content = self._read_file(path, base64=False)
        text = f"<attachment filename=\"{path}\">{content}</attachment>"
        return {"type": "text", "text": text}

    def _process_text_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a plain-text item into a text part."""
        return {"type": "text", "text": item['text']}

    @property
    def content(self) -> "LLMContext":
        """Return the items in a form suitable for an LLM API.

        - With no image item, returns one string: the text of every part
          joined with newlines.
        - Otherwise returns a list of parts, one per item:
          - image: an ``image_url`` part; local images are inlined as a
            base64 data URL, network URLs are passed through.
          - document (text file): a ``text`` part whose content is wrapped in
            an ``<attachment filename="...">`` tag.
          - file (binary): a ``text`` part of the form ``file: <path>``.
          - text: a ``text`` part.

        Raises:
            FileReadError: If a referenced local file cannot be read.
        """
        results = []
        has_image = False
        for item in self.items:
            kind = item['type']
            if kind == 'text':
                result = self._process_text_item(item)
            elif kind == 'image':
                has_image = True
                result = self._process_image_item(item)
            elif kind == 'document':
                result = self._process_document_item(item)
            else:
                # TODO: handle other item types.
                result = self._process_file_item(item)
            results.append(result)

        if not has_image:
            texts = [r['text'] for r in results if r['type'] == 'text']
            return '\n'.join(texts)
        return results