Compare commits
5 Commits
d04e406972 ... 12aaf88aa2
| Author | SHA1 | Date |
|---|---|---|
|  | 12aaf88aa2 |  |
|  | a0f66a4a49 |  |
|  | cc13f352f6 |  |
|  | 8f4a7c502d |  |
|  | 28ff0979c1 |  |
Binary file not shown.
@@ -6,6 +6,8 @@ API dependency injection module
 """
 
 from typing import Optional
+from datetime import datetime
+import uuid
 from fastapi import Depends
 from core.config import get_config_manager, ConfigManager
 from core.ai import AIAgent
@@ -14,21 +16,15 @@ from utils.file_io import OutputManager
 # Global dependencies
 config_manager: Optional[ConfigManager] = None
 ai_agent: Optional[AIAgent] = None
-output_manager: Optional[OutputManager] = None
 
 def initialize_dependencies():
     """Initialize global dependencies"""
-    global config_manager, ai_agent, output_manager
+    global config_manager, ai_agent
 
     # Initialize configuration - use server mode
     config_manager = get_config_manager()
     config_manager.load_from_directory("config", server_mode=True)
 
-    # Initialize the output manager
-    from datetime import datetime
-    run_id = f"api_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-    output_manager = OutputManager("result", run_id)
-
     # Initialize the AI agent
     from core.config import AIModelConfig
     ai_config = config_manager.get_config('ai_model', AIModelConfig)
@@ -46,13 +42,30 @@ def get_ai_agent() -> AIAgent:
         raise RuntimeError("AI agent not initialized")
     return ai_agent
 
+def create_output_manager() -> OutputManager:
+    """Create a new output manager for each request"""
+    # Generate a unique run_id per request
+    run_id = f"api_request-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+    return OutputManager("result", run_id)
+
 def get_output_manager() -> OutputManager:
-    """Get the output manager"""
-    if output_manager is None:
-        raise RuntimeError("Output manager not initialized")
-    return output_manager
+    """Get an output manager (creates a new instance on every call)"""
+    return create_output_manager()
 
 def get_tweet_service():
     """Get the text content service"""
     from api.services.tweet import TweetService
     return TweetService(get_ai_agent(), get_config(), get_output_manager())
+
+def get_poster_service():
+    """Get the poster service"""
+    from api.services.poster import PosterService
+    return PosterService(get_ai_agent(), get_config(), get_output_manager())
+
+def get_prompt_builder():
+    """Get the prompt builder service"""
+    from api.services.prompt_builder import PromptBuilderService
+    from api.services.prompt_service import PromptService
+
+    prompt_service = PromptService(get_config())
+    return PromptBuilderService(get_config(), prompt_service)
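The switch from a module-level `output_manager` singleton to `create_output_manager` means every request gets its own run directory. A minimal sketch of how the per-request dependency would be consumed in a FastAPI route; the route path and payload are illustrative assumptions, not part of this diff:

```python
from fastapi import FastAPI, Depends

from api.deps import get_tweet_service  # dependency factory from this diff

app = FastAPI()

@app.post("/topics")  # hypothetical route, for illustration only
async def generate_topics_endpoint(service=Depends(get_tweet_service)):
    # Each call builds a TweetService over a fresh OutputManager, so
    # concurrent requests never share a run_id or output directory.
    request_id, topics = await service.generate_topics(
        dates=["2025-07-15"],  # sample arguments; signature taken from the services diff below
        num_topics=2, styles=None, audiences=None,
        scenic_spots=None, products=None,
    )
    return {"request_id": request_id, "topics": topics}
```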
Binary file not shown.
Binary file not shown.
@@ -39,7 +39,7 @@ class PosterResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "poster_20230715_123456",
+                "request_id": "poster-20240715-123456-a1b2c3d4",
                 "topic_index": "1",
                 "poster_path": "/result/run_20230715_123456/topic_1/poster_vibrant.png",
                 "template_name": "vibrant"
@@ -92,7 +92,7 @@ class PosterTextResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "text_20230715_123456",
+                "request_id": "text-20240715-123456-a1b2c3d4",
                 "text_content": {
                     "title": "Secrets of the Forbidden City",
                     "subtitle": "600 years of history, waiting to be explored",

@@ -39,7 +39,7 @@ class TopicResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "topic_20230715_123456",
+                "request_id": "topic-20240715-123456-a1b2c3d4",
                 "topics": [
                     {
                         "index": "1",
@@ -97,7 +97,7 @@ class ContentResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "content_20230715_123456",
+                "request_id": "content-20240715-123456-a1b2c3d4",
                 "topic_index": "1",
                 "content": {
                     "title": "[Beijing Forbidden City] A secret route that dodges the crowds - 90% of visitors don't know it!",
@@ -153,7 +153,7 @@ class JudgeResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "judge_20230715_123456",
+                "request_id": "judge-20240715-123456-a1b2c3d4",
                 "topic_index": "1",
                 "content": {
                     "title": "[Beijing Forbidden City] A secret route that dodges the crowds - 90% of visitors don't know it!",
@@ -201,7 +201,7 @@ class PipelineResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "pipeline_20230715_123456",
+                "request_id": "pipeline-20240715-123456-a1b2c3d4",
                 "topics": [
                     {
                         "index": "1",

Binary file not shown.

@@ -110,7 +110,7 @@ class GenerateContentResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "content_20230715_123456",
+                "request_id": "content-20240715-123456-a1b2c3d4",
                 "topic_index": "1",
                 "content": {
                     "title": "[Beijing Forbidden City] A secret route that dodges the crowds - 90% of visitors don't know it!",
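Across all of these response models, the example `request_id` moves from `prefix_YYYYMMDD_HHMMSS` to `prefix-YYYYMMDD-HHMMSS-<8 uuid chars>`. A hedged sketch of a validator for the new shape; the regex and helper name are illustrative assumptions, not shown in the diff:

```python
import re

# Assumed shape of the new IDs: "<prefix>-YYYYMMDD-HHMMSS-<8 hex uuid chars>"
# ([a-z_]+ also admits the "api_request" prefix used in deps.py)
REQUEST_ID_RE = re.compile(r"^[a-z_]+-\d{8}-\d{6}-[0-9a-f]{8}$")

def looks_like_request_id(value: str) -> bool:
    """Return True if value matches the hyphenated request-id format."""
    return REQUEST_ID_RE.match(value) is not None

assert looks_like_request_id("poster-20240715-123456-a1b2c3d4")
assert not looks_like_request_id("poster_20230715_123456")  # old underscore format
```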
Binary file not shown.
Binary file not shown.
@@ -63,7 +63,7 @@ class PosterService:
         template_name = self.poster_generator._select_template()
 
         # Generate the request ID
-        request_id = f"poster_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"poster-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         logger.info(f"Poster generation finished, request ID: {request_id}, topic index: {topic_index}, template: {template_name}")
         return request_id, topic_index, poster_path, template_name
@@ -117,7 +117,7 @@ class PosterService:
         )
 
         # Generate the request ID
-        request_id = f"text_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"text-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         logger.info(f"Poster copy generation finished, request ID: {request_id}")
         return request_id, text_content

@@ -92,7 +92,7 @@ class TweetService:
             return str(uuid.uuid4()), []
 
         # Generate the request ID
-        request_id = f"topic_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"topic-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         logger.info(f"Topic generation finished, request ID: {request_id}, count: {len(topics)}")
         return request_id, topics
@@ -165,7 +165,7 @@ class TweetService:
             content['judge_success'] = False
 
         # Generate the request ID
-        request_id = f"content_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"content-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         logger.info(f"Content generation finished, request ID: {request_id}, topic index: {topic_index}")
         return request_id, topic_index, content
@@ -189,7 +189,7 @@ class TweetService:
         content = await self.content_generator.generate_content_with_prompt(topic, system_prompt, user_prompt)
 
         # Generate the request ID
-        request_id = f"content_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"content-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         logger.info(f"Content generation finished, request ID: {request_id}, topic index: {topic_index}")
         return request_id, topic_index, content
@@ -243,7 +243,7 @@ class TweetService:
         judge_success = judged_data.get('judge_success', False)
 
         # Generate the request ID
-        request_id = f"judge_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"judge-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         logger.info(f"Content review finished, request ID: {request_id}, topic index: {topic_index}, review result: {judge_success}")
         return request_id, topic_index, judged_data, judge_success
@@ -274,7 +274,7 @@ class TweetService:
         logger.info(f"Starting the full pipeline, dates: {dates}, count: {num_topics}, inline review: {auto_judge}")
 
         # Generate the request ID
-        request_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
+        request_id = f"pipeline-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
 
         # Step 1: generate topics
         _, topics = await self.generate_topics(dates, num_topics, styles, audiences, scenic_spots, products)
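The same underscore-to-hyphen change is repeated at seven call sites. A sketch of how these could collapse into one helper; the function name and placement are hypothetical, not part of the diff:

```python
import uuid
from datetime import datetime

def make_request_id(prefix: str) -> str:
    """Build IDs like 'poster-20240715-123456-a1b2c3d4' (hypothetical helper)."""
    stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    return f"{prefix}-{stamp}-{str(uuid.uuid4())[:8]}"

# Each call site would then read: request_id = make_request_id("poster")
```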
20  document/__init__.py  Normal file
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Document processing module
Provides document text extraction, content integration, web search and content transformation
"""

from .text_extractor import TextExtractor, ExtractedDocument
from .content_integrator import ContentIntegrator, IntegratedContent
from .content_transformer import ContentTransformer, TransformedContent

__all__ = [
    'TextExtractor',
    'ExtractedDocument',
    'ContentIntegrator',
    'IntegratedContent',
    'ContentTransformer',
    'TransformedContent'
]
BIN  document/__pycache__/__init__.cpython-312.pyc  Normal file
Binary file not shown.
BIN  document/__pycache__/content_integrator.cpython-312.pyc  Normal file
Binary file not shown.
BIN  document/__pycache__/content_transformer.cpython-312.pyc  Normal file
Binary file not shown.
BIN  document/__pycache__/text_extractor.cpython-312.pyc  Normal file
Binary file not shown.
BIN  document/__pycache__/web_search.cpython-312.pyc  Normal file
Binary file not shown.
130  document/content_integrator.py  Normal file
@@ -0,0 +1,130 @@
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from datetime import datetime
from .text_extractor import ExtractedDocument
import re

logger = logging.getLogger(__name__)


@dataclass
class IntegratedContent:
    """Integrated content"""
    documents: List[ExtractedDocument]
    document_count: int
    total_content_length: int
    document_types: Dict[str, int]
    combined_content: str
    content_summary: str
    key_topics: List[str]

    def __post_init__(self):
        """Post-initialization processing"""
        if not self.document_types:
            self.document_types = {}
            for doc in self.documents:
                ext = doc.file_type.lower()
                self.document_types[ext] = self.document_types.get(ext, 0) + 1


class ContentIntegrator:
    """Content integrator - merges information from multiple documents"""

    def __init__(self):
        pass

    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:
        """Integrate multiple documents

        Args:
            documents: list of extracted documents

        Returns:
            IntegratedContent: the integrated content
        """
        if not documents:
            return IntegratedContent(
                documents=[],
                document_count=0,
                total_content_length=0,
                document_types={},
                combined_content="",
                content_summary="No document content provided",
                key_topics=[]
            )

        # Count document types
        document_types = {}
        for doc in documents:
            ext = doc.file_type.lower()
            document_types[ext] = document_types.get(ext, 0) + 1

        # Combine content
        combined_content = self._combine_content(documents)
        total_length = len(combined_content)

        # Generate a summary
        content_summary = self._generate_summary(documents)

        # Extract key topics
        key_topics = self._extract_key_topics(combined_content)

        return IntegratedContent(
            documents=documents,
            document_count=len(documents),
            total_content_length=total_length,
            document_types=document_types,
            combined_content=combined_content,
            content_summary=content_summary,
            key_topics=key_topics
        )

    def _combine_content(self, documents: List[ExtractedDocument]) -> str:
        """Combine document contents"""
        combined = []

        for i, doc in enumerate(documents, 1):
            combined.append(f"=== Document {i}: {doc.filename} ===")
            combined.append(f"File type: {doc.file_type}")
            combined.append(f"File size: {doc.file_size} bytes")
            combined.append(f"Extracted at: {doc.extracted_at}")
            combined.append("")
            combined.append("Content:")
            combined.append(doc.content)
            combined.append("")
            combined.append("=" * 50)
            combined.append("")

        return "\n".join(combined)

    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:
        """Generate a content summary"""
        if not documents:
            return "No document content"

        summary_parts = []
        summary_parts.append(f"Processed {len(documents)} documents in total:")

        for i, doc in enumerate(documents, 1):
            content_preview = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content
            summary_parts.append(f"{i}. {doc.filename} ({doc.file_type}): {content_preview}")

        return "\n".join(summary_parts)

    def _extract_key_topics(self, content: str) -> List[str]:
        """Extract key topics (simple keyword extraction)"""
        if not content:
            return []

        # Simple Chinese keyword extraction
        # A more sophisticated NLP method could be plugged in here if needed
        words = re.findall(r'[\u4e00-\u9fff]+', content)

        # Count word frequencies
        word_count = {}
        for word in words:
            if len(word) >= 2:  # only consider words of length >= 2
                word_count[word] = word_count.get(word, 0) + 1

        # Return the 10 most frequent words
        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
        return [word for word, count in sorted_words[:10] if count > 1]
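A minimal usage sketch of the integrator with a hand-built `ExtractedDocument`; the sample values are made up for illustration:

```python
from datetime import datetime

from document import ContentIntegrator, ExtractedDocument

doc = ExtractedDocument(
    filename="demo.txt", file_type=".txt",
    content="泰宁古城 古城 泰宁古城 夜游 夜游 夜游",  # toy Chinese text for the keyword counter
    metadata={}, extracted_at=datetime.now(), file_size=64,
)

integrated = ContentIntegrator().integrate_documents([doc])
print(integrated.document_count)   # 1
print(integrated.document_types)   # {'.txt': 1}
print(integrated.key_topics)       # words seen more than once, most frequent first
```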
236  document/content_transformer.py  Normal file
@@ -0,0 +1,236 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Content transformer module
Uses an LLM to convert parsed document content into standardized scenic-spot and product material formats
"""

import logging
from typing import Dict, Any, Optional, List
from dataclasses import dataclass
from datetime import datetime
import uuid

from .content_integrator import IntegratedContent
from core.ai.ai_agent import AIAgent
from core.config.manager import ConfigManager
from utils.file_io import OutputManager

logger = logging.getLogger(__name__)


@dataclass
class TransformedContent:
    """Transformed content"""
    original_content: IntegratedContent
    transformed_text: str
    format_type: str
    transformation_metadata: Dict[str, Any]
    transformed_at: datetime


class ContentTransformer:
    """Content transformer - converts integrated content into a given format"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.supported_formats = {
            'attraction_standard': self._transform_to_attraction_standard,
            'product_sales': self._transform_to_product_sales,
            'travel_guide': self._transform_to_travel_guide,
            'blog_post': self._transform_to_blog_post,
            'summary': self._transform_to_summary
        }

    def transform_content(self,
                          integrated_content: IntegratedContent,
                          format_type: str = 'summary',
                          custom_prompt: Optional[str] = None) -> TransformedContent:
        """Transform content

        Args:
            integrated_content: the integrated content
            format_type: target format type
            custom_prompt: custom prompt

        Returns:
            TransformedContent: the transformed content
        """
        if format_type not in self.supported_formats:
            raise ValueError(f"Unsupported format type: {format_type}")

        logger.info(f"Starting content transformation, format: {format_type}")

        # Run the transformation
        transform_func = self.supported_formats[format_type]
        transformed_text = transform_func(integrated_content, custom_prompt)

        # Build the transformation metadata
        transformation_metadata = {
            'format_type': format_type,
            'source_document_count': integrated_content.document_count,
            'source_content_length': integrated_content.total_content_length,
            'transformed_content_length': len(transformed_text),
            'key_topics_used': integrated_content.key_topics,
            'custom_prompt_used': custom_prompt is not None
        }

        return TransformedContent(
            original_content=integrated_content,
            transformed_text=transformed_text,
            format_type=format_type,
            transformation_metadata=transformation_metadata,
            transformed_at=datetime.now()
        )

    def _transform_to_attraction_standard(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
        """Transform to the standard attraction format"""
        template = """
# Attraction Information

## Basic information
- Document sources: {document_count} documents
- Main topics: {key_topics}

## Details
{combined_content}

## Content summary
{content_summary}

---
*Compiled from the provided documents; consult the originals for more information*
"""

        return template.format(
            document_count=content.document_count,
            key_topics=", ".join(content.key_topics[:5]),
            combined_content=content.combined_content,
            content_summary=content.content_summary
        )

    def _transform_to_product_sales(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
        """Transform to the product sales format"""
        template = """
# Product Sales Material

## Product highlights
Compiled from {document_count} documents:

{content_summary}

## Detailed introduction
{combined_content}

## Key selling points
{key_topics}

---
*Compiled from the provided document material*
"""

        key_points = "\n".join([f"• {topic}" for topic in content.key_topics[:8]])

        return template.format(
            document_count=content.document_count,
            content_summary=content.content_summary,
            combined_content=content.combined_content,
            key_topics=key_points
        )

    def _transform_to_travel_guide(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
        """Transform to the travel guide format"""
        template = """
# Travel Guide

## Overview
{content_summary}

## Details
{combined_content}

## Important notes
- Information sources: {document_count} documents
- Key topics: {key_topics}

---
*This guide is compiled from the provided documents; verify the latest information before traveling*
"""

        return template.format(
            content_summary=content.content_summary,
            combined_content=content.combined_content,
            document_count=content.document_count,
            key_topics=", ".join(content.key_topics[:5])
        )

    def _transform_to_blog_post(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
        """Transform to the blog post format"""
        template = """
# Blog Post

## Introduction
This post is compiled from {document_count} documents.

## Main content

{combined_content}

## Summary
{content_summary}

## Related topics
{key_topics}

---
*This post is compiled from multiple document sources*
"""

        topics_list = "\n".join([f"- {topic}" for topic in content.key_topics[:10]])

        return template.format(
            document_count=content.document_count,
            combined_content=content.combined_content,
            content_summary=content.content_summary,
            key_topics=topics_list
        )

    def _transform_to_summary(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
        """Transform to the summary format"""
        template = """
# Document Content Summary

## Document statistics
- Document count: {document_count}
- Document types: {document_types}
- Content length: {content_length} characters

## Content summary
{content_summary}

## Key topics
{key_topics}

## Full content
{combined_content}
"""

        doc_types = ", ".join([f"{k} ({v})" for k, v in content.document_types.items()])
        topics_list = "\n".join([f"• {topic}" for topic in content.key_topics])

        return template.format(
            document_count=content.document_count,
            document_types=doc_types,
            content_length=content.total_content_length,
            content_summary=content.content_summary,
            key_topics=topics_list,
            combined_content=content.combined_content
        )

    def get_supported_formats(self) -> List[str]:
        """Get the list of supported formats"""
        return list(self.supported_formats.keys())

    def add_custom_format(self, format_name: str, transform_func):
        """Add a custom format"""
        self.supported_formats[format_name] = transform_func
        logger.info(f"Added custom format: {format_name}")
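The `supported_formats` dict doubles as a plugin registry via `add_custom_format`. A hedged sketch of registering a custom format; the format name, function, and file paths are invented for illustration:

```python
from document import ContentIntegrator, ContentTransformer, TextExtractor

def to_bullet_digest(content, custom_prompt=None):
    # Custom transform: one bullet per line of the integrator's summary.
    return "\n".join(f"- {line}" for line in content.content_summary.splitlines())

transformer = ContentTransformer()
transformer.add_custom_format('bullet_digest', to_bullet_digest)

docs = TextExtractor().extract_batch(["a.txt", "b.md"])  # placeholder paths
integrated = ContentIntegrator().integrate_documents(docs)
result = transformer.transform_content(integrated, format_type='bullet_digest')
print(result.transformed_text)
```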
356  document/text_extractor.py  Normal file
@@ -0,0 +1,356 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Text extractor module
Supports extracting text content from PDF, Word, TXT and other document formats
"""

import os
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path
from dataclasses import dataclass
from datetime import datetime

# Import optional dependencies
try:
    import PyPDF2
    import pdfplumber
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False

try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

try:
    import openpyxl
    from openpyxl import load_workbook
    EXCEL_AVAILABLE = True
except ImportError:
    EXCEL_AVAILABLE = False

logger = logging.getLogger(__name__)


@dataclass
class ExtractedDocument:
    """Extracted document data"""
    filename: str
    file_type: str
    content: str  # plain-text content
    metadata: Dict[str, Any]  # document metadata
    extracted_at: datetime
    file_size: int
    page_count: Optional[int] = None

    def __post_init__(self):
        # Make sure content is a string
        if not isinstance(self.content, str):
            self.content = str(self.content)


class TextExtractor:
    """Text extractor - plain text extraction only, keeping all original content"""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.supported_formats = {
            '.pdf': self._extract_pdf,
            '.docx': self._extract_docx,
            '.doc': self._extract_doc,
            '.txt': self._extract_txt,
            '.md': self._extract_txt,
            '.xlsx': self._extract_xlsx,
            '.xls': self._extract_xls,
            '.csv': self._extract_csv
        }

    def extract(self, file_path: str) -> ExtractedDocument:
        """Extract the text content of a single file"""
        path_obj = Path(file_path)

        if not path_obj.exists():
            raise FileNotFoundError(f"File does not exist: {file_path}")

        file_ext = path_obj.suffix.lower()
        if file_ext not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_ext}")

        try:
            # Gather file info
            file_size = path_obj.stat().st_size

            # Extract the text content
            extractor = self.supported_formats[file_ext]
            content, metadata = extractor(path_obj)

            return ExtractedDocument(
                filename=path_obj.name,
                file_type=file_ext,
                content=content,
                metadata=metadata,
                extracted_at=datetime.now(),
                file_size=file_size,
                page_count=metadata.get('page_count')
            )

        except Exception as e:
            logger.error(f"Error extracting file {file_path}: {str(e)}")
            raise

    def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:
        """Extract the text content of multiple files"""
        results = []

        for file_path in file_paths:
            try:
                result = self.extract(file_path)
                results.append(result)
                logger.info(f"Successfully extracted file: {file_path}")
            except Exception as e:
                logger.error(f"Failed to extract file {file_path}: {str(e)}")
                # Create an error record
                error_doc = ExtractedDocument(
                    filename=Path(file_path).name,
                    file_type=Path(file_path).suffix.lower(),
                    content=f"Extraction failed: {str(e)}",
                    metadata={"error": str(e)},
                    extracted_at=datetime.now(),
                    file_size=0
                )
                results.append(error_doc)

        return results

    def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from a PDF file"""
        if not PDF_AVAILABLE:
            raise ImportError("PyPDF2 and pdfplumber are required: pip install PyPDF2 pdfplumber")

        content_parts = []
        metadata = {}

        try:
            # Use pdfplumber (better text extraction)
            with pdfplumber.open(file_path) as pdf:
                metadata['page_count'] = len(pdf.pages)

                for page_num, page in enumerate(pdf.pages, 1):
                    page_text = page.extract_text()
                    if page_text:
                        content_parts.append(f"=== Page {page_num} ===\n{page_text}\n")

                # Read document metadata
                if pdf.metadata:
                    metadata.update({
                        'title': pdf.metadata.get('Title', ''),
                        'author': pdf.metadata.get('Author', ''),
                        'subject': pdf.metadata.get('Subject', ''),
                        'creator': pdf.metadata.get('Creator', ''),
                        'producer': pdf.metadata.get('Producer', ''),
                        'creation_date': pdf.metadata.get('CreationDate', ''),
                        'modification_date': pdf.metadata.get('ModDate', '')
                    })

        except Exception as e:
            logger.warning(f"pdfplumber extraction failed, falling back to PyPDF2: {str(e)}")

            # Fallback: use PyPDF2
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                metadata['page_count'] = len(pdf_reader.pages)

                for page_num, page in enumerate(pdf_reader.pages, 1):
                    page_text = page.extract_text()
                    if page_text:
                        content_parts.append(f"=== Page {page_num} ===\n{page_text}\n")

                # Read document metadata
                if pdf_reader.metadata:
                    metadata.update({
                        'title': pdf_reader.metadata.get('/Title', ''),
                        'author': pdf_reader.metadata.get('/Author', ''),
                        'subject': pdf_reader.metadata.get('/Subject', ''),
                        'creator': pdf_reader.metadata.get('/Creator', ''),
                        'producer': pdf_reader.metadata.get('/Producer', ''),
                        'creation_date': pdf_reader.metadata.get('/CreationDate', ''),
                        'modification_date': pdf_reader.metadata.get('/ModDate', '')
                    })

        content = '\n'.join(content_parts) if content_parts else ""
        return content, metadata

    def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from a DOCX file"""
        if not DOCX_AVAILABLE:
            raise ImportError("python-docx is required: pip install python-docx")

        doc = Document(str(file_path))
        content_parts = []
        metadata = {}

        # Extract all paragraph text
        for paragraph in doc.paragraphs:
            if paragraph.text.strip():
                content_parts.append(paragraph.text)

        # Extract table content
        for table in doc.tables:
            table_content = []
            for row in table.rows:
                row_content = []
                for cell in row.cells:
                    row_content.append(cell.text.strip())
                table_content.append('\t'.join(row_content))
            if table_content:
                content_parts.append('\n=== Table ===\n' + '\n'.join(table_content) + '\n')

        # Read document properties
        core_props = doc.core_properties
        metadata.update({
            'title': core_props.title or '',
            'author': core_props.author or '',
            'subject': core_props.subject or '',
            'keywords': core_props.keywords or '',
            'comments': core_props.comments or '',
            'created': str(core_props.created) if core_props.created else '',
            'modified': str(core_props.modified) if core_props.modified else '',
            'last_modified_by': core_props.last_modified_by or '',
            'paragraph_count': len(doc.paragraphs),
            'table_count': len(doc.tables)
        })

        content = '\n'.join(content_parts)
        return content, metadata

    def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from a DOC file"""
        # The DOC format is complex; converting to DOCX or using a dedicated library is recommended
        logger.warning("DOC support is limited; converting to DOCX is recommended")

        # Try reading it as a text file
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
        except:
            with open(file_path, 'r', encoding='gbk', errors='ignore') as file:
                content = file.read()

        metadata = {'format': 'doc', 'encoding_note': 'encoding issues are possible'}
        return content, metadata

    def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from a TXT/MD file"""
        encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']
        content = ""
        used_encoding = ""

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                    used_encoding = encoding
                    break
            except UnicodeDecodeError:
                continue

        if not content:
            # As a last resort, ignore decoding errors
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            used_encoding = 'utf-8 (with errors ignored)'

        metadata = {
            'encoding': used_encoding,
            'line_count': len(content.splitlines()),
            'char_count': len(content)
        }

        return content, metadata

    def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from an XLSX file"""
        if not EXCEL_AVAILABLE:
            raise ImportError("openpyxl is required: pip install openpyxl")

        workbook = load_workbook(file_path, read_only=True)
        content_parts = []
        metadata = {
            'sheet_count': len(workbook.sheetnames),
            'sheet_names': workbook.sheetnames
        }

        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]
            content_parts.append(f"\n=== Sheet: {sheet_name} ===\n")

            for row in sheet.iter_rows(values_only=True):
                row_content = []
                for cell in row:
                    if cell is not None:
                        row_content.append(str(cell))
                    else:
                        row_content.append("")
                if any(cell.strip() for cell in row_content):  # skip empty rows
                    content_parts.append('\t'.join(row_content))

        content = '\n'.join(content_parts)
        return content, metadata

    def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from an XLS file"""
        logger.warning("XLS support is limited; converting to XLSX is recommended")

        # Naive text extraction
        try:
            with open(file_path, 'rb') as file:
                content = file.read().decode('utf-8', errors='ignore')
        except:
            content = f"Unable to read XLS file: {file_path}"

        metadata = {'format': 'xls', 'note': 'format issues are possible'}
        return content, metadata

    def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract plain text from a CSV file"""
        encodings = ['utf-8', 'gbk', 'gb2312']
        content = ""
        used_encoding = ""

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                    used_encoding = encoding
                    break
            except UnicodeDecodeError:
                continue

        if not content:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            used_encoding = 'utf-8 (with errors ignored)'

        # Count rows and columns
        lines = content.splitlines()
        row_count = len(lines)
        col_count = len(lines[0].split(',')) if lines else 0

        metadata = {
            'encoding': used_encoding,
            'row_count': row_count,
            'estimated_col_count': col_count
        }

        return content, metadata

    def get_supported_formats(self) -> List[str]:
        """Get the list of supported file formats"""
        return list(self.supported_formats.keys())

    def is_supported(self, file_path: str) -> bool:
        """Check whether a file format is supported"""
        return Path(file_path).suffix.lower() in self.supported_formats
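A short sketch of the extractor's failure-tolerant batch path: `extract_batch` never raises; failed files come back as placeholder `ExtractedDocument` records. The file names below are placeholders:

```python
from document import TextExtractor

extractor = TextExtractor()
docs = extractor.extract_batch(["report.pdf", "missing.docx", "notes.md"])

for doc in docs:
    if "error" in doc.metadata:
        # Failures become records with file_size == 0 and an "error" key
        print(f"skipped {doc.filename}: {doc.metadata['error']}")
    else:
        print(f"{doc.filename}: {len(doc.content)} chars, pages={doc.page_count}")
```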
@@ -4,7 +4,7 @@
 2. When planning topics, select flexibly from the product information I give you, based on product characteristics, and plan topics that can attract users.
 3. You must complete all topics according to the requested topic count; none may be omitted. The topic count must not be lower than the number required in the prompt. If the user gives fewer dates than topics, you may place several topics on one date so that the number of topics matches the user's requirement.
 4. Topic styles should be chosen logically according to the selected user needs.
-5. **Key rule**: You must choose the copy style and target audience profile from the material given to you, and copy the **complete file name** exactly. Audience profiles are in the Demand folder, named `XXX文旅需求.txt`; copy styles are in the Style folder, named `XXX风文案提示词.txt`. **You are absolutely not allowed to invent file names or drop the `.txt` suffix**.
+5. **Key rule**: You must choose the copy style and target audience profile from the material given to you, and copy the **complete file name** exactly.
 
 Output requirements:
 1. **You must output strictly in JSON format.** The output should be a JSON array in which each element represents one topic and is a JSON object with the following fields.
@@ -16,9 +16,9 @@
 - `object`: the selected object (e.g. "泰宁古城"; it may only be chosen from the object list mentioned in the user prompt; if the user mentions none, output "None"; do not invent one or pick a scenic spot outside the object list)
 - `product`: the selected product content ("None" if no separate product is provided)
 - `product_logic`: the reasoning for the selected product (descriptive text)
-- `style`: the file name of the topic style. **Must be a complete file name chosen from the Style folder, e.g. "攻略风文案提示词.txt".**
+- `style`: the file name of the topic style. **Must be a complete file name chosen from the Style folder, e.g. "攻略风文案".**
 - `style_logic`: the reasoning for the topic style (descriptive text)
-- `target_audience`: the file name of the topic's target audience. **Must be a complete file name chosen from the Demand folder, e.g. "亲子家庭文旅需求.txt".**
+- `target_audience`: the file name of the topic's target audience. **Must be a complete file name chosen from the Demand folder, e.g. "亲子向".**
 - `target_audience_logic`: the reasoning for the topic's target audience (descriptive text)
 4. Make sure the generated JSON array contains exactly the number of topic objects the user requested.
 5. **Do not fabricate scenic-spot information, event information, or prices.** All event information must follow the explicit material provided by the user.
@@ -35,9 +35,9 @@
         "object": "泰宁古城",
         "product": "...",
         "product_logic": "Combines lodging and guided tours into a convenient family-trip package",
-        "style": "攻略风文案提示词.txt",
+        "style": "攻略风",
         "style_logic": "Emphasizes shared family time and cultural experience",
-        "target_audience": "亲子向文旅需求.txt",
+        "target_audience": "亲子向",
         "target_audience_logic": "Meets parents' need for an outing with kids that is both fun and educational"
     },
     {
221  tests/document_processing_example_simple.py  Normal file
@@ -0,0 +1,221 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Document content extraction example
Focuses on document content extraction; no web search involved
"""

import os
import sys
from pathlib import Path

# Add the project root to the Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from document import TextExtractor, ContentIntegrator, ContentTransformer
from api.services.document_service import DocumentService


def test_single_document_extraction():
    """Test single-document extraction"""
    print("=== Single-document extraction test ===")

    # Create the document service
    service = DocumentService()

    # Test file path (replace with a real file path)
    test_file = "test_document.pdf"  # replace with a real file path

    if not os.path.exists(test_file):
        print(f"Test file does not exist: {test_file}")
        print("Put a test document in the current directory, or change the test_file variable")
        return

    # Extract text only
    print(f"Extracting document: {test_file}")
    result = service.extract_text_only(test_file)

    if result['success']:
        doc = result['document']
        print(f"Filename: {doc['filename']}")
        print(f"File type: {doc['file_type']}")
        print(f"File size: {doc['file_size']} bytes")
        print(f"Content length: {doc['content_length']} characters")
        print(f"Page count: {doc['page_count']}")
        print(f"Extracted at: {doc['extracted_at']}")
        print("\nContent preview:")
        print(doc['content'][:500] + "..." if len(doc['content']) > 500 else doc['content'])
    else:
        print(f"Extraction failed: {result['error']}")


def test_multiple_documents_processing(save_path):
    """Test multi-document processing"""
    print("\n=== Multi-document processing test ===")

    # Create the document service
    service = DocumentService()

    # Test file list (replace with real file paths)
    test_files = [
        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
        "document/sample_documents/ai营销括客话术和QA整理.docx",
        "document/sample_documents/附件1-服务器租赁发票20250702.pdf"
    ]

    # Keep only the files that exist
    existing_files = [f for f in test_files if os.path.exists(f)]

    if not existing_files:
        print("No test files found")
        print("Put test documents in the current directory, or change the test_files variable")
        return

    print(f"Processing documents: {existing_files}")

    # Process multiple documents
    result = service.process_multiple_documents(existing_files, output_format='summary')

    if result['success']:
        print(f"Processing summary:")
        summary = result['processing_summary']
        print(f"  Total files: {summary['total_files']}")
        print(f"  Successful extractions: {summary['successful_extractions']}")
        print(f"  Failed extractions: {summary['failed_extractions']}")

        print(f"\nDocument list:")
        for i, doc in enumerate(result['documents'], 1):
            print(f"  {i}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} characters")

        print(f"\nIntegrated content:")
        integrated = result['integrated_content']
        print(f"  Document count: {integrated['document_count']}")
        print(f"  Total content length: {integrated['total_content_length']}")
        print(f"  Document types: {integrated['document_types']}")
        print(f"  Key topics: {integrated['key_topics']}")

        print(f"\nContent summary:")
        print(integrated['content_summary'])

        print(f"\nTransformed content:")
        transformed = result['transformed_content']
        print(f"  Format type: {transformed['format_type']}")
        print(f"  Transformed at: {transformed['transformed_at']}")
        print("\nTransformed content preview:")
        content = transformed['content']
        # print(content[:1000] + "..." if len(content) > 1000 else content)
        if save_path:
            with open(save_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Transformed content saved to: {save_path}")
    else:
        print(f"Processing failed: {result['error']}")


def test_component_usage():
    """Test the components on their own"""
    print("\n=== Standalone component test ===")

    # Test file
    test_file = "test_document.txt"

    # Create the test file (intentionally Chinese, to exercise the Chinese keyword extraction)
    if not os.path.exists(test_file):
        with open(test_file, 'w', encoding='utf-8') as f:
            f.write("""
这是一个测试文档。

主要内容包括:
1. 文档提取功能测试
2. 内容整合功能测试
3. 内容转换功能测试

测试文档包含中文内容,用于验证文本提取和处理功能。
""")
        print(f"Created test file: {test_file}")

    # 1. Text extractor test
    print("\n1. Text extractor test")
    extractor = TextExtractor()
    extracted_doc = extractor.extract(test_file)

    print(f"Extraction result:")
    print(f"  Filename: {extracted_doc.filename}")
    print(f"  File type: {extracted_doc.file_type}")
    print(f"  Content length: {len(extracted_doc.content)}")
    print(f"  Content: {extracted_doc.content}")

    # 2. Content integrator test
    print("\n2. Content integrator test")
    integrator = ContentIntegrator()
    integrated_content = integrator.integrate_documents([extracted_doc])

    print(f"Integration result:")
    print(f"  Document count: {integrated_content.document_count}")
    print(f"  Total content length: {integrated_content.total_content_length}")
    print(f"  Document types: {integrated_content.document_types}")
    print(f"  Key topics: {integrated_content.key_topics}")
    print(f"  Content summary: {integrated_content.content_summary}")

    # 3. Content transformer test
    print("\n3. Content transformer test")
    transformer = ContentTransformer()

    # Try each format
    formats = ['summary', 'attraction_standard', 'product_sales', 'travel_guide', 'blog_post']

    for format_type in formats:
        print(f"\n--- {format_type} format ---")
        transformed_content = transformer.transform_content(integrated_content, format_type=format_type)
        print(f"Transformation result preview:")
        content = transformed_content.transformed_text
        print(content[:300] + "..." if len(content) > 300 else content)


def test_service_info():
    """Test service info"""
    print("\n=== Service info test ===")

    service = DocumentService()

    print("Supported file types:")
    file_types = service.get_supported_file_types()
    print(f"  {file_types}")

    print("\nSupported output formats:")
    output_formats = service.get_supported_formats()
    print(f"  {output_formats}")

    print("\nService status:")
    status = service.get_service_status()
    print(f"  Service name: {status['service_name']}")
    print(f"  Status: {status['status']}")
    print(f"  Components: {status['components']}")
    print(f"  Timestamp: {status['timestamp']}")


def main():
    """Main entry point"""
    print("Document content extraction example program")
    print("=" * 50)

    try:
        # Test service info
        test_service_info()

        # Test the components on their own
        test_component_usage()

        # Test single-document extraction
        test_single_document_extraction()

        # Test multi-document processing
        test_multiple_documents_processing("document/sample_documents/test.txt")

        print("\n" + "=" * 50)
        print("Tests finished!")

    except Exception as e:
        print(f"Error during testing: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
Binary file not shown.
Binary file not shown.
@@ -146,3 +146,4 @@ class TopicGenerator:
         return topics
 
 
+
@@ -47,6 +47,7 @@ class TopicParser:
         # Validate that each topic contains all required keys
         valid_topics = []
         required_keys = {"index", "date", "logic", "object", "product", "style", "target_audience"}
+        optional_keys = {"product_logic", "style_logic", "target_audience_logic"}
 
         for i, item in enumerate(parsed_json):
             if isinstance(item, dict) and required_keys.issubset(item.keys()):
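The new `optional_keys` set suggests the parser now tolerates topics that omit the `_logic` fields. A hedged sketch of how such validation typically backfills optional keys; the default value is an assumption, not shown in the diff:

```python
required_keys = {"index", "date", "logic", "object", "product", "style", "target_audience"}
optional_keys = {"product_logic", "style_logic", "target_audience_logic"}

def normalize_topic(item: dict) -> dict | None:
    """Accept a topic if required keys are present; backfill optional ones."""
    if not required_keys.issubset(item.keys()):
        return None  # reject topics missing required keys
    for key in optional_keys:
        item.setdefault(key, "")  # assumed default; the diff does not show one
    return item
```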
Binary file not shown.
@@ -202,14 +202,12 @@ def process_llm_json_text(text: Any) -> Optional[Dict[str, Any]]:
     for candidate in json_candidates:
         # Try to parse directly first
         try:
-            import json
             return json.loads(candidate)
         except json.JSONDecodeError:
             pass
 
         # Try to repair with json_repair
         try:
-            import json_repair
             return json_repair.loads(candidate)
         except Exception:
             continue
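For context, `json_repair` (the `json-repair` package on PyPI) exposes a `loads` that fixes common LLM JSON mistakes before parsing. A small sketch of the fallback behavior this function relies on:

```python
import json
import json_repair

raw = '{"index": "1", "style": "攻略风",}'  # trailing comma: invalid strict JSON

try:
    data = json.loads(raw)          # strict parse fails here
except json.JSONDecodeError:
    data = json_repair.loads(raw)   # repaired parse succeeds

print(data["style"])
```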