优化了文本解析的输出字段

This commit is contained in:
jinye_huang 2025-07-15 18:20:36 +08:00
parent c66273b393
commit f6cff4e5c0
8 changed files with 48 additions and 402 deletions

View File

@ -71,15 +71,22 @@ async def integrate_content(request: ContentIntegrationRequest) -> ContentIntegr
document_info=result["document_info"],
xhs_info=result["xhs_info"],
integrated_content=result["integrated_content"],
search_config=result["search_config"]
search_config=result["search_config"],
error_message=None # 成功时无错误信息
)
logger.info(f"内容整合成功,处理时间:{result['processing_time']}")
else:
from datetime import datetime
response = ContentIntegrationResponse(
success=False,
timestamp=result["timestamp"],
processing_time=result["processing_time"],
error_message=result["error_message"]
timestamp=result.get("timestamp", datetime.now().strftime("%Y%m%d_%H%M%S")),
processing_time=result.get("processing_time", "0秒"),
input_summary=result.get("input_summary"),
document_info=result.get("document_info"),
xhs_info=result.get("xhs_info"),
integrated_content=result.get("integrated_content"),
search_config=result.get("search_config"),
error_message=result.get("error_message")
)
logger.error(f"内容整合失败:{result['error_message']}")

View File

@ -8,13 +8,8 @@ Core Module
提供内容整合的核心服务,通过适配器模式与xhs_spider和document模块交互
"""
# 导入主要服务
from .content_integration_service import (
ContentIntegrationService,
ProcessingConfig,
ProcessingResult,
ProcessingStats
)
# 注意ContentIntegrationService已移至api.services模块
# 这里不再导入以避免与API服务冲突
# 导入适配器
from .xhs_adapter import XHSAdapter, XHSNote, XHSSearchResult
@ -30,14 +25,6 @@ __author__ = "TravelContentCreator Team"
# 导出所有公共接口
__all__ = [
# 主要服务
'ContentIntegrationService',
# 数据模型
'ProcessingConfig',
'ProcessingResult',
'ProcessingStats',
# 适配器
'XHSAdapter',
'DocumentAdapter',
@ -57,26 +44,4 @@ __all__ = [
'__author__'
]
# 便捷函数
def create_integration_service(cookie_config_path: str = "cookies.json",
                               media_storage_path: str = "media",
                               enable_logging: bool = True) -> ContentIntegrationService:
    """Build and return a content-integration service instance.

    Args:
        cookie_config_path: Path to the cookie configuration file.
        media_storage_path: Directory used for downloaded media files.
        enable_logging: Whether logging is enabled.

    Returns:
        ContentIntegrationService: A freshly constructed service instance.
    """
    options = {
        "cookie_config_path": cookie_config_path,
        "media_storage_path": media_storage_path,
        "enable_logging": enable_logging,
    }
    return ContentIntegrationService(**options)
# 添加到导出列表
__all__.append('create_integration_service')
# 注意ContentIntegrationService及相关模型已移至api.services模块

View File

@ -1,349 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Content Integration Service
内容整合服务
core模块的主要服务通过适配器与xhs_spider和document模块交互
"""
import asyncio
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from datetime import datetime
import logging
import uuid
from .xhs_adapter import XHSAdapter, XHSSearchResult
from .document_adapter import DocumentAdapter, IntegratedContent
from .cookie_manager import CookieManager
from .media_manager import ImageStorageManager
logger = logging.getLogger(__name__)
@dataclass
class ProcessingConfig:
    """Configuration for a single content-integration task."""
    keyword: str  # search keyword for the XHS (Xiaohongshu) lookup; empty skips search
    document_paths: Optional[List[str]] = None  # local document files to merge, if any
    max_notes: int = 20  # cap on the number of XHS notes fetched
    output_format: str = "summary"  # "summary" / "blog_post" / "travel_guide"; others pass through unchanged
    download_media: bool = True  # whether note images/videos are downloaded after integration
    custom_prompt: Optional[str] = None  # optional custom prompt forwarded to the pipeline
@dataclass
class ProcessingResult:
    """Outcome of one integration task, stored on the service keyed by task_id."""
    task_id: str  # UUID4 string assigned when the task starts
    config: ProcessingConfig  # the configuration this task ran with
    success: bool  # True only if every step completed without raising
    xhs_result: Optional[XHSSearchResult] = None  # raw XHS search result, when a keyword was given
    document_result: Optional[IntegratedContent] = None  # merged document content, when paths were given
    final_content: str = ""  # integrated and formatted output text
    error_message: str = ""  # populated from the exception message on failure
    processing_time: float = 0.0  # wall-clock seconds measured around the whole task
    created_time: datetime = field(default_factory=datetime.now)  # local time at result creation
@dataclass
class ProcessingStats:
    """Aggregate counters, refreshed after every finished task."""
    total_tasks: int = 0  # all tasks, successful or not
    successful_tasks: int = 0
    failed_tasks: int = 0
    total_processing_time: float = 0.0  # seconds summed over all tasks
    average_processing_time: float = 0.0  # total_processing_time / total_tasks
    total_notes_processed: int = 0  # XHS notes counted for successful tasks only
    total_documents_processed: int = 0  # documents counted for successful tasks only
    start_time: datetime = field(default_factory=datetime.now)  # time the stats object was created
    last_updated: datetime = field(default_factory=datetime.now)  # refreshed on every update
class ContentIntegrationService:
    """Content-integration service.

    Orchestrates XHS (Xiaohongshu) note search and local document merging
    through adapters, formats the combined text, optionally downloads media,
    and keeps per-task results plus aggregate statistics in memory.
    """

    def __init__(self,
                 cookie_config_path: str = "cookies.json",
                 media_storage_path: str = "media",
                 enable_logging: bool = True):
        """
        Initialize the service.

        Args:
            cookie_config_path: Path to the cookie configuration file.
            media_storage_path: Directory for downloaded media files.
            enable_logging: Whether logging is enabled (stored but not read
                elsewhere in this class; consumers may use it).
        """
        self.cookie_config_path = cookie_config_path
        self.media_storage_path = media_storage_path
        self.enable_logging = enable_logging
        # Adapters wrapping the external xhs_spider / document modules
        self.xhs_adapter = XHSAdapter(cookie_config_path)
        self.document_adapter = DocumentAdapter()
        # Managers for cookies and downloaded media
        self.cookie_manager = CookieManager(cookie_config_path)
        self.media_manager = ImageStorageManager(media_storage_path)
        # In-memory result store (task_id -> result) and aggregate statistics
        self.results: Dict[str, ProcessingResult] = {}
        self.stats = ProcessingStats()
        logger.info("内容整合服务初始化完成")

    async def process_content(self,
                              keyword: str,
                              document_paths: Optional[List[str]] = None,
                              output_format: str = "summary",
                              max_notes: int = 20,
                              custom_prompt: Optional[str] = None) -> ProcessingResult:
        """
        Run one content-integration task end to end.

        Args:
            keyword: Search keyword; an empty string skips the XHS search step.
            document_paths: Optional list of document paths to merge.
            output_format: Target output format (e.g. "summary").
            max_notes: Maximum number of XHS notes to fetch.
            custom_prompt: Optional custom prompt for the integration step.

        Returns:
            ProcessingResult: The task result (also stored under its task_id);
            on any exception, success is False and error_message carries str(e).
        """
        task_id = str(uuid.uuid4())
        start_time = datetime.now()
        config = ProcessingConfig(
            keyword=keyword,
            document_paths=document_paths,
            max_notes=max_notes,
            output_format=output_format,
            custom_prompt=custom_prompt
        )
        result = ProcessingResult(
            task_id=task_id,
            config=config,
            success=False
        )
        try:
            # 1. Search XHS content (only when a keyword is given)
            xhs_result = None
            if keyword:
                logger.info(f"开始搜索小红书内容: {keyword}")
                # NOTE(review): search_notes is called synchronously inside an
                # async method -- presumably it blocks the event loop; confirm.
                xhs_result = self.xhs_adapter.search_notes(
                    keyword=keyword,
                    max_notes=max_notes,
                )
                result.xhs_result = xhs_result
                logger.info(f"小红书搜索完成,找到 {len(xhs_result.notes)} 条笔记")
            # 2. Process documents (only when paths are given)
            document_result = None
            if document_paths:
                logger.info(f"开始处理文档: {len(document_paths)} 个文件")
                document_result = self.document_adapter.integrate_documents(document_paths)
                result.document_result = document_result
                logger.info(f"文档处理完成,总长度: {document_result.total_length}")
            # 3. Merge and format whatever was collected
            final_content = self._integrate_content(
                xhs_result=xhs_result,
                document_result=document_result,
                output_format=output_format,
                custom_prompt=custom_prompt
            )
            result.final_content = final_content
            # 4. Download media files (download_media defaults to True)
            if config.download_media and xhs_result:
                await self._download_media(xhs_result)
            result.success = True
            logger.info(f"任务 {task_id} 处理完成")
        except Exception as e:
            result.error_message = str(e)
            logger.error(f"任务 {task_id} 处理失败: {e}")
        # Elapsed time is recorded for both success and failure paths
        result.processing_time = (datetime.now() - start_time).total_seconds()
        # Store the result and refresh aggregate statistics
        self.results[task_id] = result
        self._update_statistics(result)
        return result

    async def batch_process(self,
                            tasks: List[Dict[str, Any]],
                            output_format: str = "summary") -> List[ProcessingResult]:
        """
        Process several tasks concurrently.

        Args:
            tasks: Task dicts; each may carry "keyword", "document_paths",
                "max_notes" and "custom_prompt".
            output_format: Output format applied to every task.

        Returns:
            List[ProcessingResult]: One result per input task, in order.
            Tasks that raised are converted into failed ProcessingResult
            entries instead of propagating.
        """
        results = []

        # Wrap one task dict into a process_content call.
        async def process_task(task_config):
            return await self.process_content(
                keyword=task_config.get("keyword", ""),
                document_paths=task_config.get("document_paths"),
                output_format=output_format,
                max_notes=task_config.get("max_notes", 20),
                custom_prompt=task_config.get("custom_prompt")
            )

        # Run all tasks concurrently; exceptions are returned, not raised.
        tasks_coroutines = [process_task(task) for task in tasks]
        results = await asyncio.gather(*tasks_coroutines, return_exceptions=True)
        # Convert raised exceptions into failed result objects.
        processed_results = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                error_result = ProcessingResult(
                    task_id=str(uuid.uuid4()),
                    config=ProcessingConfig(keyword=tasks[i].get("keyword", "")),
                    success=False,
                    error_message=str(result)
                )
                processed_results.append(error_result)
            else:
                processed_results.append(result)
        return processed_results

    def _integrate_content(self,
                           xhs_result: Optional[XHSSearchResult],
                           document_result: Optional[IntegratedContent],
                           output_format: str,
                           custom_prompt: Optional[str]) -> str:
        """
        Merge XHS and document content into one formatted string.

        Args:
            xhs_result: XHS search result (may be None).
            document_result: Merged document content (may be None).
            output_format: Target output format.
            custom_prompt: Accepted but not used inside this method.

        Returns:
            str: Formatted content, or a fixed "nothing found" message when
            both sources are empty.
        """
        content_parts = []
        # XHS section: only the first 5 notes are included
        if xhs_result and xhs_result.notes:
            xhs_content = "\n".join([
                f"标题: {note.title}\n内容: {note.content}\n作者: {note.author}\n"
                for note in xhs_result.notes[:5]  # only the first 5 notes
            ])
            content_parts.append(f"=== 小红书相关内容 ===\n{xhs_content}")
        # Document section
        if document_result and document_result.integrated_text:
            content_parts.append(f"=== 文档相关内容 ===\n{document_result.integrated_text}")
        # Nothing collected from either source
        if not content_parts:
            return "没有找到相关内容"
        combined_content = "\n\n".join(content_parts)
        # Prefer the document adapter's transformer; fall back to the simple
        # formatter when the adapter is unavailable or the transform fails.
        if self.document_adapter.is_available():
            try:
                formatted_content = self.document_adapter.transform_content(
                    combined_content,
                    output_format
                )
                return formatted_content
            except Exception as e:
                logger.error(f"内容格式转换失败: {e}")
        # Simple built-in formatting as the fallback path
        return self._simple_format(combined_content, output_format)

    def _simple_format(self, content: str, output_format: str) -> str:
        """Minimal fallback formatting; unknown formats return content unchanged."""
        if output_format == "summary":
            # Truncates to the first 500 characters
            return f"摘要:\n{content[:500]}..."
        elif output_format == "blog_post":
            return f"# 博客文章\n\n{content}"
        elif output_format == "travel_guide":
            return f"# 旅游攻略\n\n{content}"
        else:
            return content

    async def _download_media(self, xhs_result: XHSSearchResult):
        """Download all images and videos for every note; errors are logged, not raised."""
        try:
            for note in xhs_result.notes:
                # Download images
                for image_url in note.images:
                    await self.media_manager.download_image(image_url, note.note_id)
                # Download videos
                for video_url in note.videos:
                    await self.media_manager.download_video(video_url, note.note_id)
        except Exception as e:
            # Best-effort: a failed download aborts the remaining queue but
            # does not fail the task
            logger.error(f"媒体下载失败: {e}")

    def _update_statistics(self, result: ProcessingResult):
        """Fold one finished task into the aggregate statistics."""
        self.stats.total_tasks += 1
        self.stats.total_processing_time += result.processing_time
        if result.success:
            self.stats.successful_tasks += 1
            # Note/document counters are only accumulated for successful tasks
            if result.xhs_result:
                self.stats.total_notes_processed += len(result.xhs_result.notes)
            if result.document_result:
                self.stats.total_documents_processed += len(result.document_result.documents)
        else:
            self.stats.failed_tasks += 1
        # Recompute the running average processing time
        if self.stats.total_tasks > 0:
            self.stats.average_processing_time = (
                self.stats.total_processing_time / self.stats.total_tasks
            )
        self.stats.last_updated = datetime.now()

    def get_statistics(self) -> ProcessingStats:
        """Return the (live, mutable) aggregate statistics object."""
        return self.stats

    def get_result(self, task_id: str) -> Optional[ProcessingResult]:
        """Return the stored result for task_id, or None if unknown."""
        return self.results.get(task_id)

    def get_all_results(self) -> Dict[str, ProcessingResult]:
        """Return a shallow copy of the task_id -> result mapping."""
        return self.results.copy()

    def clear_results(self):
        """Drop all stored results (statistics are left untouched)."""
        self.results.clear()

    def get_status(self) -> Dict[str, Any]:
        """Snapshot of adapter/manager status plus statistics and result count."""
        return {
            "xhs_adapter": self.xhs_adapter.get_status(),
            "document_adapter": self.document_adapter.get_status(),
            # Managers may not expose get_status; fall back to an empty dict
            "cookie_manager": self.cookie_manager.get_status() if hasattr(self.cookie_manager, 'get_status') else {},
            "media_manager": self.media_manager.get_status() if hasattr(self.media_manager, 'get_status') else {},
            "statistics": self.stats,
            "total_results": len(self.results)
        }

View File

@ -21,24 +21,47 @@
{
"attractions": [
{
"name": "景区/酒店名称",
"name": "景区名称",
"address": "景区详细地址",
"trafficInfo": "交通指南详细信息",
"description": "景区空泛描述信息",
"advantage": "景区最大优势",
"highlight": "景区亮点描述",
"products": [
{
"product_name": "产品套餐名称",
"usage_rules": "产品使用规则详细说明",
"product_name": "产品名称",
"originPrice": "原始价格",
"realPrice": "实际价格",
"packageInfo": "产品详情完整描述",
"salesPeriod": "销售周期信息",
"stock": "库存数量",
"usage_rules": {
"rules": "使用规则详细说明",
"surcharge": "加价说明",
"reservation": "预约规则",
"refund": "退改政策",
"discounts": "优惠内容"
},
"transportation": {
"guide": "交通指南",
"guide": "交通指南详细信息",
"address": "详细地址"
},
"price": "产品价格信息",
"key_advantages": "产品最突出优势",
"key_advantages": "产品最大优势描述",
"highlights": "产品亮点详细描述",
"detailed_description": [
"1. 详细描述项目1",
"2. 详细描述项目2",
"..."
"1. 游泳池尺寸长xx米宽xx米深xx米",
"2. 泳池特色:如无边泳池、恒温泳池等",
"3. 开放时间xx:xx-xx:xx",
"4. 适合人群:儿童/成人等",
"5. 安全设施:救生员配置等",
"6. 周边景点xxx景点距离xx米",
"7. 风景描述:如海景、山景等",
"8. 小贴士:需要携带的物品等",
"9. 游玩路线建议",
"10. 推荐游玩时长"
]
}
]
}
]
}
}

View File

@ -342,7 +342,7 @@ class ContentPromptBuilder(BasePromptBuilder):
style_filename = topic.get("style", "")
style_content = self._find_and_read_file(style_filename, self.resource_config.style.paths) or "通用风格"
demand_filename = topic.get("target_audience", "")
demand_filename = topic.get("targetAudience", "")
demand_content = self._find_and_read_file(demand_filename, self.resource_config.demand.paths) or "通用用户画像"
object_name = topic.get("object", "")