# 多模态输入
Trae IDE 支持多模态输入,允许用户通过文本、图像、音频和文件等多种方式与 AI 助手进行交互。本文档详细介绍多模态输入的功能、使用方法和最佳实践。
## 概述
多模态输入是现代 AI 交互的重要特性,它使得用户可以:
- 文本输入:传统的文字描述和代码
- 图像输入:截图、设计稿、图表、手绘草图
- 音频输入:语音指令和音频文件
- 文件输入:代码文件、文档、数据文件
- 混合输入:同时使用多种输入方式
## 支持的输入类型

### 文本输入
文本输入是最基础的交互方式,支持:
- 自然语言描述:用日常语言描述需求
- 代码片段:直接粘贴代码进行分析
- 技术规范:API 文档、需求文档等
- Markdown 格式:支持格式化文本
````markdown
# 示例文本输入
请帮我创建一个 React 组件,要求:
1. 显示用户列表
2. 支持搜索功能
3. 使用 TypeScript
4. 包含分页

```javascript
// 现有的用户数据结构
interface User {
  id: string;
  name: string;
  email: string;
  avatar?: string;
}
```
````

### 图像输入
#### 支持的图像格式
- **PNG**:无损压缩,适合截图和图表
- **JPEG/JPG**:有损压缩,适合照片
- **GIF**:支持动画,适合演示流程
- **SVG**:矢量图形,适合图标和简单图形
- **WebP**:现代格式,平衡质量和大小
#### 图像输入场景
**1. 界面设计**
```typescript
// 上传设计稿后的处理
interface DesignAnalysis {
components: ComponentInfo[];
layout: LayoutInfo;
colors: ColorPalette;
typography: TypographyInfo;
interactions: InteractionInfo[];
}
class DesignToCodeConverter {
async analyzeDesign(imageData: ImageData): Promise<DesignAnalysis> {
const analysis = await this.visionModel.analyze(imageData);
return {
components: this.extractComponents(analysis),
layout: this.analyzeLayout(analysis),
colors: this.extractColors(analysis),
typography: this.analyzeTypography(analysis),
interactions: this.detectInteractions(analysis)
};
}
async generateCode(analysis: DesignAnalysis): Promise<string> {
const componentCode = await this.generateComponents(analysis.components);
const styleCode = await this.generateStyles(analysis.colors, analysis.typography);
const layoutCode = await this.generateLayout(analysis.layout);
return this.combineCode(componentCode, styleCode, layoutCode);
}
}
```

**2. 错误截图分析**

```typescript
// 错误截图分析
interface ErrorAnalysis {
errorType: string;
errorMessage: string;
stackTrace?: string;
suggestedFixes: string[];
relatedFiles: string[];
}
class ErrorScreenshotAnalyzer {
async analyzeError(screenshot: ImageData): Promise<ErrorAnalysis> {
// 使用 OCR 提取错误信息
const ocrResult = await this.ocrService.extractText(screenshot);
// 分析错误类型
const errorType = this.classifyError(ocrResult.text);
// 提取错误消息
const errorMessage = this.extractErrorMessage(ocrResult.text);
// 生成修复建议
const suggestedFixes = await this.generateFixes(errorType, errorMessage);
return {
errorType,
errorMessage,
stackTrace: this.extractStackTrace(ocrResult.text),
suggestedFixes,
relatedFiles: this.findRelatedFiles(errorMessage)
};
}
}
```

**3. 手绘草图识别**

```typescript
// 手绘草图转代码
interface SketchAnalysis {
shapes: Shape[];
text: TextElement[];
connections: Connection[];
layout: LayoutStructure;
}
class SketchToCodeConverter {
async analyzeSketch(sketchImage: ImageData): Promise<SketchAnalysis> {
// 识别基本形状
const shapes = await this.shapeDetector.detect(sketchImage);
// 识别文字
const text = await this.textRecognizer.recognize(sketchImage);
// 分析连接关系
const connections = this.analyzeConnections(shapes, text);
// 推断布局结构
const layout = this.inferLayout(shapes, text, connections);
return { shapes, text, connections, layout };
}
async generateFromSketch(analysis: SketchAnalysis): Promise<string> {
// 根据草图生成对应的代码结构
if (this.isUISketch(analysis)) {
return this.generateUICode(analysis);
} else if (this.isFlowchartSketch(analysis)) {
return this.generateFlowchartCode(analysis);
} else if (this.isDatabaseSketch(analysis)) {
return this.generateDatabaseSchema(analysis);
}
return this.generateGenericCode(analysis);
}
}
```

### 音频输入

#### 支持的音频格式
- MP3:通用格式,兼容性好
- WAV:无损格式,质量高
- M4A:Apple 格式,质量好
- OGG:开源格式,压缩效率高
- FLAC:无损压缩,适合高质量音频
#### 语音识别功能

```typescript
// 语音转文本服务
class SpeechToTextService {
private recognizer: SpeechRecognizer;
async transcribe(audioData: AudioData): Promise<TranscriptionResult> {
const result = await this.recognizer.recognize(audioData);
return {
text: result.text,
confidence: result.confidence,
language: result.detectedLanguage,
timestamps: result.wordTimestamps,
alternatives: result.alternatives
};
}
async transcribeRealtime(audioStream: AudioStream): Promise<AsyncIterable<string>> {
return this.recognizer.recognizeStream(audioStream);
}
}
// 语音命令处理
class VoiceCommandProcessor {
private commandPatterns = [
{
pattern: /创建.*组件/,
action: 'create-component',
extractor: (text: string) => this.extractComponentInfo(text)
},
{
pattern: /修复.*错误/,
action: 'fix-error',
extractor: (text: string) => this.extractErrorInfo(text)
},
{
pattern: /优化.*性能/,
action: 'optimize-performance',
extractor: (text: string) => this.extractOptimizationTarget(text)
}
];
async processCommand(transcription: string): Promise<CommandResult> {
for (const pattern of this.commandPatterns) {
if (pattern.pattern.test(transcription)) {
const params = pattern.extractor(transcription);
return this.executeCommand(pattern.action, params);
}
}
// 如果没有匹配的命令模式,作为普通对话处理
return this.processAsConversation(transcription);
}
}
```

#### 音频分析功能

```typescript
// 音频内容分析
class AudioContentAnalyzer {
async analyzeAudio(audioData: AudioData): Promise<AudioAnalysis> {
const transcription = await this.speechToText.transcribe(audioData);
const sentiment = await this.sentimentAnalyzer.analyze(transcription.text);
const intent = await this.intentClassifier.classify(transcription.text);
const entities = await this.entityExtractor.extract(transcription.text);
return {
transcription,
sentiment,
intent,
entities,
audioFeatures: await this.extractAudioFeatures(audioData)
};
}
private async extractAudioFeatures(audioData: AudioData): Promise<AudioFeatures> {
return {
duration: audioData.duration,
sampleRate: audioData.sampleRate,
channels: audioData.channels,
volume: this.calculateAverageVolume(audioData),
pitch: this.analyzePitch(audioData),
tempo: this.detectTempo(audioData)
};
}
}
```

### 文件输入

#### 支持的文件类型
**代码文件:**
- JavaScript/TypeScript (`.js`, `.ts`, `.jsx`, `.tsx`)
- Python (`.py`, `.pyw`)
- Java (`.java`)
- C/C++ (`.c`, `.cpp`, `.h`, `.hpp`)
- Go (`.go`)
- Rust (`.rs`)
- PHP (`.php`)
- Ruby (`.rb`)
- Swift (`.swift`)
- Kotlin (`.kt`)
**配置文件:**
- JSON (`.json`)
- YAML (`.yml`, `.yaml`)
- XML (`.xml`)
- TOML (`.toml`)
- INI (`.ini`)
- 环境变量 (`.env`)
**文档文件:**
- Markdown (`.md`)
- 纯文本 (`.txt`)
- CSV (`.csv`)
- PDF (`.pdf`)
- Word (`.docx`)
#### 文件处理流程

```typescript
// 文件上传处理器
class FileUploadProcessor {
private processors = new Map<string, FileProcessor>();
constructor() {
this.registerProcessors();
}
async processFile(file: File): Promise<ProcessedFile> {
const fileType = this.detectFileType(file);
const processor = this.processors.get(fileType);
if (!processor) {
throw new Error(`不支持的文件类型: ${fileType}`);
}
const content = await this.readFileContent(file);
const analysis = await processor.analyze(content);
return {
name: file.name,
type: fileType,
size: file.size,
content,
analysis,
metadata: await this.extractMetadata(file)
};
}
private registerProcessors(): void {
this.processors.set('javascript', new JavaScriptProcessor());
this.processors.set('typescript', new TypeScriptProcessor());
this.processors.set('python', new PythonProcessor());
this.processors.set('json', new JSONProcessor());
this.processors.set('markdown', new MarkdownProcessor());
// ... 其他处理器
}
}
// JavaScript 文件处理器
class JavaScriptProcessor implements FileProcessor {
async analyze(content: string): Promise<FileAnalysis> {
const ast = this.parseAST(content);
const dependencies = this.extractDependencies(ast);
const exports = this.extractExports(ast);
const functions = this.extractFunctions(ast);
const classes = this.extractClasses(ast);
const complexity = this.calculateComplexity(ast);
return {
language: 'javascript',
dependencies,
exports,
functions,
classes,
complexity,
issues: await this.detectIssues(ast),
suggestions: await this.generateSuggestions(ast)
};
}
private parseAST(content: string): AST {
try {
return parse(content, {
sourceType: 'module',
plugins: ['jsx', 'typescript']
});
} catch (error) {
throw new Error(`解析 JavaScript 代码失败: ${error.message}`);
}
}
}
```

## 多模态组合使用

### 图像 + 文本

```typescript
// 图像和文本组合分析
class MultimodalAnalyzer {
async analyzeImageWithText(
image: ImageData,
text: string
): Promise<MultimodalAnalysis> {
// 并行分析图像和文本
const [imageAnalysis, textAnalysis] = await Promise.all([
this.imageAnalyzer.analyze(image),
this.textAnalyzer.analyze(text)
]);
// 关联分析结果
const correlations = this.findCorrelations(imageAnalysis, textAnalysis);
// 生成综合理解
const understanding = await this.generateUnderstanding(
imageAnalysis,
textAnalysis,
correlations
);
return {
imageAnalysis,
textAnalysis,
correlations,
understanding,
confidence: this.calculateConfidence(correlations)
};
}
private findCorrelations(
imageAnalysis: ImageAnalysis,
textAnalysis: TextAnalysis
): Correlation[] {
const correlations: Correlation[] = [];
// 查找图像中提到的元素
for (const entity of textAnalysis.entities) {
const imageElements = this.findInImage(entity, imageAnalysis);
if (imageElements.length > 0) {
correlations.push({
type: 'entity-match',
textEntity: entity,
imageElements,
confidence: this.calculateMatchConfidence(entity, imageElements)
});
}
}
return correlations;
}
}
```

### 音频 + 文件

```typescript
// 音频说明 + 代码文件分析
class AudioCodeAnalyzer {
async analyzeAudioWithCode(
audio: AudioData,
codeFiles: File[]
): Promise<AudioCodeAnalysis> {
// 转录音频
const transcription = await this.speechToText.transcribe(audio);
// 分析代码文件
const codeAnalyses = await Promise.all(
codeFiles.map(file => this.fileProcessor.processFile(file))
);
// 理解音频指令
const instructions = await this.instructionParser.parse(transcription.text);
// 将指令映射到代码
const mappings = this.mapInstructionsToCode(instructions, codeAnalyses);
// 生成执行计划
const executionPlan = await this.generateExecutionPlan(mappings);
return {
transcription,
codeAnalyses,
instructions,
mappings,
executionPlan
};
}
private mapInstructionsToCode(
instructions: Instruction[],
codeAnalyses: ProcessedFile[]
): InstructionMapping[] {
return instructions.map(instruction => {
const relevantFiles = this.findRelevantFiles(instruction, codeAnalyses);
const targetFunctions = this.findTargetFunctions(instruction, relevantFiles);
return {
instruction,
relevantFiles,
targetFunctions,
confidence: this.calculateMappingConfidence(instruction, relevantFiles)
};
});
}
}
```

## 实时多模态交互

### 流式处理

```typescript
// 实时多模态流处理
class RealtimeMultimodalProcessor {
private streams = new Map<string, MediaStream>();
private processors = new Map<string, StreamProcessor>();
async startRealtimeSession(): Promise<string> {
const sessionId = this.generateSessionId();
// 启动各种流处理器
this.processors.set(`${sessionId}-audio`, new AudioStreamProcessor());
this.processors.set(`${sessionId}-video`, new VideoStreamProcessor());
this.processors.set(`${sessionId}-text`, new TextStreamProcessor());
return sessionId;
}
async processRealtimeInput(
sessionId: string,
inputType: 'audio' | 'video' | 'text',
data: any
): Promise<void> {
const processor = this.processors.get(`${sessionId}-${inputType}`);
if (!processor) {
throw new Error(`未找到 ${inputType} 处理器`);
}
// 处理输入数据
const result = await processor.process(data);
// 触发实时响应
this.emitRealtimeResponse(sessionId, inputType, result);
// 更新会话上下文
await this.updateSessionContext(sessionId, inputType, result);
}
private emitRealtimeResponse(
sessionId: string,
inputType: string,
result: ProcessingResult
): void {
this.eventEmitter.emit('realtime-response', {
sessionId,
inputType,
result,
timestamp: Date.now()
});
}
}
```

### WebRTC 集成

```typescript
// WebRTC 多媒体流处理
class WebRTCMultimodalHandler {
private peerConnection: RTCPeerConnection;
private mediaRecorder: MediaRecorder;
async setupMediaStreams(): Promise<void> {
// 获取用户媒体流
const stream = await navigator.mediaDevices.getUserMedia({
video: true,
audio: true
});
// 设置媒体录制器
this.mediaRecorder = new MediaRecorder(stream);
this.mediaRecorder.ondataavailable = this.handleMediaData.bind(this);
// 设置 WebRTC 连接
this.peerConnection = new RTCPeerConnection({
iceServers: [{ urls: 'stun:stun.l.google.com:19302' }]
});
// 添加流到连接
stream.getTracks().forEach(track => {
this.peerConnection.addTrack(track, stream);
});
}
private async handleMediaData(event: BlobEvent): Promise<void> {
const blob = event.data;
const arrayBuffer = await blob.arrayBuffer();
// 根据 MIME 类型处理不同的媒体数据
if (blob.type.startsWith('audio/')) {
await this.processAudioData(arrayBuffer);
} else if (blob.type.startsWith('video/')) {
await this.processVideoData(arrayBuffer);
}
}
private async processAudioData(audioData: ArrayBuffer): Promise<void> {
// 实时语音识别
const transcription = await this.speechToText.transcribeRealtime(audioData);
// 处理语音命令
if (transcription.isFinal) {
const command = await this.voiceCommandProcessor.process(transcription.text);
this.executeCommand(command);
}
}
private async processVideoData(videoData: ArrayBuffer): Promise<void> {
// 提取关键帧
const frames = await this.videoProcessor.extractFrames(videoData);
// 分析最新帧
const latestFrame = frames[frames.length - 1];
const analysis = await this.imageAnalyzer.analyze(latestFrame);
// 检测手势或其他视觉指令
const gestures = await this.gestureDetector.detect(analysis);
if (gestures.length > 0) {
this.handleGestures(gestures);
}
}
}
```

## 用户界面集成

### 拖拽上传

```typescript
// 拖拽上传组件
class DragDropUploader {
private dropZone: HTMLElement;
private fileHandler: FileUploadProcessor;
constructor(dropZoneId: string) {
this.dropZone = document.getElementById(dropZoneId)!;
this.fileHandler = new FileUploadProcessor();
this.setupEventListeners();
}
private setupEventListeners(): void {
this.dropZone.addEventListener('dragover', this.handleDragOver.bind(this));
this.dropZone.addEventListener('drop', this.handleDrop.bind(this));
this.dropZone.addEventListener('paste', this.handlePaste.bind(this));
}
private handleDragOver(event: DragEvent): void {
event.preventDefault();
this.dropZone.classList.add('drag-over');
}
private async handleDrop(event: DragEvent): Promise<void> {
event.preventDefault();
this.dropZone.classList.remove('drag-over');
const files = Array.from(event.dataTransfer?.files || []);
const images = this.extractImages(event.dataTransfer);
// 处理文件
for (const file of files) {
await this.processFile(file);
}
// 处理图像
for (const image of images) {
await this.processImage(image);
}
}
private async handlePaste(event: ClipboardEvent): Promise<void> {
const items = Array.from(event.clipboardData?.items || []);
for (const item of items) {
if (item.type.startsWith('image/')) {
const file = item.getAsFile();
if (file) {
await this.processImage(file);
}
} else if (item.type === 'text/plain') {
const text = await new Promise<string>(resolve => {
item.getAsString(resolve);
});
await this.processText(text);
}
}
}
}
```

### 语音输入界面

```typescript
// 语音输入组件
class VoiceInputComponent {
private isRecording = false;
private mediaRecorder: MediaRecorder;
private audioChunks: Blob[] = [];
async startRecording(): Promise<void> {
if (this.isRecording) return;
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
this.mediaRecorder = new MediaRecorder(stream);
this.mediaRecorder.ondataavailable = (event) => {
this.audioChunks.push(event.data);
};
this.mediaRecorder.onstop = () => {
this.processRecording();
};
this.mediaRecorder.start();
this.isRecording = true;
this.updateUI('recording');
} catch (error) {
console.error('无法启动录音:', error);
this.showError('无法访问麦克风,请检查权限设置');
}
}
stopRecording(): void {
if (!this.isRecording) return;
this.mediaRecorder.stop();
this.isRecording = false;
this.updateUI('processing');
}
private async processRecording(): Promise<void> {
const audioBlob = new Blob(this.audioChunks, { type: 'audio/wav' });
this.audioChunks = [];
try {
// 转换为 ArrayBuffer
const arrayBuffer = await audioBlob.arrayBuffer();
// 语音识别
const transcription = await this.speechToText.transcribe(arrayBuffer);
// 显示转录结果
this.displayTranscription(transcription);
// 处理语音命令
const result = await this.voiceCommandProcessor.process(transcription.text);
this.handleCommandResult(result);
} catch (error) {
console.error('处理录音失败:', error);
this.showError('语音识别失败,请重试');
} finally {
this.updateUI('idle');
}
}
private updateUI(state: 'idle' | 'recording' | 'processing'): void {
const button = document.getElementById('voice-button');
const indicator = document.getElementById('recording-indicator');
switch (state) {
case 'recording':
button?.classList.add('recording');
indicator?.classList.add('active');
break;
case 'processing':
button?.classList.add('processing');
indicator?.classList.remove('active');
break;
case 'idle':
button?.classList.remove('recording', 'processing');
indicator?.classList.remove('active');
break;
}
}
}
```

## 最佳实践

### 性能优化

- 文件大小限制:

```typescript
const FILE_SIZE_LIMITS = {
image: 10 * 1024 * 1024, // 10MB
audio: 50 * 1024 * 1024, // 50MB
video: 100 * 1024 * 1024, // 100MB
document: 5 * 1024 * 1024 // 5MB
};
function validateFileSize(file: File): boolean {
const limit = FILE_SIZE_LIMITS[getFileCategory(file.type)];
return file.size <= limit;
}
```

- 压缩和优化:

```typescript
class MediaOptimizer {
async optimizeImage(imageData: ImageData): Promise<ImageData> {
// 调整尺寸
const resized = await this.resizeImage(imageData, { maxWidth: 1920, maxHeight: 1080 });
// 压缩质量
const compressed = await this.compressImage(resized, { quality: 0.8 });
return compressed;
}
async optimizeAudio(audioData: AudioData): Promise<AudioData> {
// 降低采样率
const resampled = await this.resampleAudio(audioData, 16000);
// 压缩编码
const compressed = await this.compressAudio(resampled, 'mp3', { bitrate: 128 });
return compressed;
}
}
```

- 缓存策略:

```typescript
class MultimodalCache {
private cache = new Map<string, CachedResult>();
async getOrProcess<T>(
key: string,
processor: () => Promise<T>,
ttl: number = 3600000
): Promise<T> {
const cached = this.cache.get(key);
if (cached && Date.now() < cached.expiry) {
return cached.result as T;
}
const result = await processor();
this.cache.set(key, {
result,
expiry: Date.now() + ttl
});
return result;
}
generateKey(data: any): string {
// 为不同类型的数据生成唯一键
if (data instanceof ArrayBuffer) {
return this.hashArrayBuffer(data);
} else if (typeof data === 'string') {
return this.hashString(data);
} else {
return this.hashObject(data);
}
}
}
```

### 错误处理

```typescript
class MultimodalErrorHandler {
async handleProcessingError(
error: Error,
inputType: string,
inputData: any
): Promise<ErrorResponse> {
const errorInfo = {
type: error.constructor.name,
message: error.message,
inputType,
timestamp: new Date().toISOString()
};
// 记录错误
await this.logError(errorInfo);
// 生成用户友好的错误消息
const userMessage = this.generateUserMessage(error, inputType);
// 提供恢复建议
const suggestions = this.generateRecoverySuggestions(error, inputType);
return {
success: false,
error: userMessage,
suggestions,
canRetry: this.canRetry(error)
};
}
private generateUserMessage(error: Error, inputType: string): string {
const messages = {
'NetworkError': '网络连接失败,请检查网络设置',
'QuotaExceededError': '文件太大,请选择较小的文件',
'NotSupportedError': `不支持的${inputType}格式`,
'SecurityError': '权限不足,请检查浏览器权限设置'
};
return messages[error.constructor.name] || '处理失败,请重试';
}
}
```

### 隐私保护

```typescript
class PrivacyProtector {
async sanitizeInput(input: any, inputType: string): Promise<any> {
switch (inputType) {
case 'image':
return this.sanitizeImage(input);
case 'audio':
return this.sanitizeAudio(input);
case 'text':
return this.sanitizeText(input);
case 'file':
return this.sanitizeFile(input);
default:
return input;
}
}
private async sanitizeImage(imageData: ImageData): Promise<ImageData> {
// 移除 EXIF 数据
const sanitized = await this.removeExifData(imageData);
// 检测和模糊敏感信息
const withBlurredSensitiveInfo = await this.blurSensitiveInfo(sanitized);
return withBlurredSensitiveInfo;
}
private sanitizeText(text: string): string {
// 移除或替换敏感信息
const patterns = [
{ pattern: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g, replacement: '[CARD_NUMBER]' },
{ pattern: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g, replacement: '[EMAIL]' },
{ pattern: /\b\d{3}[\s-]?\d{3}[\s-]?\d{4}\b/g, replacement: '[PHONE]' }
];
let sanitized = text;
patterns.forEach(({ pattern, replacement }) => {
sanitized = sanitized.replace(pattern, replacement);
});
return sanitized;
}
}
```

## 总结
多模态输入为 Trae IDE 提供了丰富的交互方式,使得用户可以通过最自然和直观的方式与 AI 助手进行交互。关键要点包括:
- 多样化输入:支持文本、图像、音频、文件等多种输入方式
- 智能分析:对不同类型的输入进行深度分析和理解
- 组合处理:支持多种输入方式的组合使用
- 实时交互:提供流式处理和实时响应能力
- 性能优化:通过压缩、缓存等技术提升处理效率
- 隐私保护:确保用户数据的安全和隐私
通过合理使用多模态输入功能,开发者可以获得更加智能、高效和自然的编程体验。