Compare commits
No commits in common. "12aaf88aa2d7d4f87e37fc5e90dfa85c6ad228dd" and "d04e40697223424c463541e2fdcf28aebd6bfa16" have entirely different histories.
12aaf88aa2...d04e406972
Binary file not shown.
@@ -6,8 +6,6 @@ API dependency injection module
 """
 
 from typing import Optional
-from datetime import datetime
-import uuid
 from fastapi import Depends
 from core.config import get_config_manager, ConfigManager
 from core.ai import AIAgent
@@ -16,15 +14,21 @@ from utils.file_io import OutputManager
 # Global dependencies
 config_manager: Optional[ConfigManager] = None
 ai_agent: Optional[AIAgent] = None
+output_manager: Optional[OutputManager] = None
 
 def initialize_dependencies():
     """Initialize the global dependencies"""
-    global config_manager, ai_agent
+    global config_manager, ai_agent, output_manager
 
     # Initialize configuration - use server mode
     config_manager = get_config_manager()
     config_manager.load_from_directory("config", server_mode=True)
 
+    # Initialize the output manager
+    from datetime import datetime
+    run_id = f"api_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+    output_manager = OutputManager("result", run_id)
+
     # Initialize the AI agent
     from core.config import AIModelConfig
     ai_config = config_manager.get_config('ai_model', AIModelConfig)
@@ -42,30 +46,13 @@ def get_ai_agent() -> AIAgent:
         raise RuntimeError("AI代理未初始化")
     return ai_agent
 
-def create_output_manager() -> OutputManager:
-    """Create a new output manager for each request"""
-    # Generate a unique run_id per request
-    run_id = f"api_request-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
-    return OutputManager("result", run_id)
-
 def get_output_manager() -> OutputManager:
-    """Get the output manager (a new instance per call)"""
-    return create_output_manager()
+    """Get the output manager"""
+    if output_manager is None:
+        raise RuntimeError("输出管理器未初始化")
+    return output_manager
 
 def get_tweet_service():
     """Get the text content service"""
     from api.services.tweet import TweetService
     return TweetService(get_ai_agent(), get_config(), get_output_manager())
-
-def get_poster_service():
-    """Get the poster service"""
-    from api.services.poster import PosterService
-    return PosterService(get_ai_agent(), get_config(), get_output_manager())
-
-def get_prompt_builder():
-    """Get the prompt builder service"""
-    from api.services.prompt_builder import PromptBuilderService
-    from api.services.prompt_service import PromptService
-
-    prompt_service = PromptService(get_config())
-    return PromptBuilderService(get_config(), prompt_service)
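A note on how these providers are consumed: after this change, initialize_dependencies() must run once at application startup, before the first request, because get_output_manager() now returns the shared module-level instance instead of building a fresh OutputManager per call. A minimal wiring sketch, assuming a FastAPI app object and a hypothetical /topics route (the module path api.dependencies, the route, and the handler are illustrative assumptions, not taken from this diff):

# Illustrative wiring only - initialize_dependencies() and get_tweet_service()
# come from the diff above; the endpoint itself is a hypothetical example.
from fastapi import Depends, FastAPI
from api.dependencies import initialize_dependencies, get_tweet_service

app = FastAPI()

@app.on_event("startup")
def startup() -> None:
    # Populate the module-level singletons before serving traffic;
    # otherwise get_output_manager() raises RuntimeError.
    initialize_dependencies()

@app.get("/topics")
async def list_topics(service=Depends(get_tweet_service)):
    # Hypothetical handler: every request now shares one OutputManager,
    # so all results land under the single api_<timestamp> run directory.
    request_id, topics = await service.generate_topics(
        ["2024-07-15"], 1, None, None, None, None,
    )
    return {"request_id": request_id, "topics": topics}

One consequence of the design worth noting: the old per-request create_output_manager() isolated each request's files, while the shared instance groups everything from one server run under a single run_id.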
Binary file not shown.
Binary file not shown.
@@ -39,7 +39,7 @@ class PosterResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "poster-20240715-123456-a1b2c3d4",
+                "request_id": "poster_20230715_123456",
                 "topic_index": "1",
                 "poster_path": "/result/run_20230715_123456/topic_1/poster_vibrant.png",
                 "template_name": "vibrant"
@@ -92,7 +92,7 @@ class PosterTextResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "text-20240715-123456-a1b2c3d4",
+                "request_id": "text_20230715_123456",
                 "text_content": {
                     "title": "紫禁城的秘密",
                     "subtitle": "600年历史,等你探索",
@@ -39,7 +39,7 @@ class TopicResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "topic-20240715-123456-a1b2c3d4",
+                "request_id": "topic_20230715_123456",
                 "topics": [
                     {
                         "index": "1",
@@ -97,7 +97,7 @@ class ContentResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "content-20240715-123456-a1b2c3d4",
+                "request_id": "content_20230715_123456",
                 "topic_index": "1",
                 "content": {
                     "title": "【北京故宫】避开人潮的秘密路线,90%的人都不知道!",
@@ -153,7 +153,7 @@ class JudgeResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "judge-20240715-123456-a1b2c3d4",
+                "request_id": "judge_20230715_123456",
                 "topic_index": "1",
                 "content": {
                     "title": "【北京故宫】避开人潮的秘密路线,90%的人都不知道!",
@@ -201,7 +201,7 @@ class PipelineResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "pipeline-20240715-123456-a1b2c3d4",
+                "request_id": "pipeline_20230715_123456",
                 "topics": [
                     {
                         "index": "1",
Binary file not shown.
@@ -110,7 +110,7 @@ class GenerateContentResponse(BaseModel):
     class Config:
         schema_extra = {
             "example": {
-                "request_id": "content-20240715-123456-a1b2c3d4",
+                "request_id": "content_20230715_123456",
                 "topic_index": "1",
                 "content": {
                     "title": "【北京故宫】避开人潮的秘密路线,90%的人都不知道!",
Binary file not shown.
Binary file not shown.
@@ -63,7 +63,7 @@ class PosterService:
         template_name = self.poster_generator._select_template()
 
         # Generate the request ID
-        request_id = f"poster-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"poster_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         logger.info(f"海报生成完成,请求ID: {request_id}, 主题索引: {topic_index}, 模板: {template_name}")
         return request_id, topic_index, poster_path, template_name
@@ -117,7 +117,7 @@ class PosterService:
         )
 
         # Generate the request ID
-        request_id = f"text-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"text_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         logger.info(f"海报文案生成完成,请求ID: {request_id}")
         return request_id, text_content
@@ -92,7 +92,7 @@ class TweetService:
             return str(uuid.uuid4()), []
 
         # Generate the request ID
-        request_id = f"topic-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"topic_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         logger.info(f"选题生成完成,请求ID: {request_id}, 数量: {len(topics)}")
         return request_id, topics
@@ -165,7 +165,7 @@ class TweetService:
             content['judge_success'] = False
 
         # Generate the request ID
-        request_id = f"content-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"content_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         logger.info(f"内容生成完成,请求ID: {request_id}, 选题索引: {topic_index}")
         return request_id, topic_index, content
@@ -189,7 +189,7 @@ class TweetService:
         content = await self.content_generator.generate_content_with_prompt(topic, system_prompt, user_prompt)
 
         # Generate the request ID
-        request_id = f"content-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"content_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         logger.info(f"内容生成完成,请求ID: {request_id}, 选题索引: {topic_index}")
         return request_id, topic_index, content
@@ -243,7 +243,7 @@ class TweetService:
         judge_success = judged_data.get('judge_success', False)
 
         # Generate the request ID
-        request_id = f"judge-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"judge_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         logger.info(f"内容审核完成,请求ID: {request_id}, 选题索引: {topic_index}, 审核结果: {judge_success}")
         return request_id, topic_index, judged_data, judge_success
@@ -274,7 +274,7 @@ class TweetService:
         logger.info(f"开始运行完整流水线,日期: {dates}, 数量: {num_topics}, 内嵌审核: {auto_judge}")
 
         # Generate the request ID
-        request_id = f"pipeline-{datetime.now().strftime('%Y%m%d-%H%M%S')}-{str(uuid.uuid4())[:8]}"
+        request_id = f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
 
         # Step 1: generate topics
         _, topics = await self.generate_topics(dates, num_topics, styles, audiences, scenic_spots, products)
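All seven service hunks above make the same one-line change: the request-ID separator switches from hyphens to underscores, matching the run_YYYYmmdd_HHMMSS naming the result directories already use (see the poster_path example earlier). A tiny illustration of the resulting shape (the timestamp and UUID fragment below are made-up values):

# old: poster-20240715-123456-a1b2c3d4
# new: poster_20240715_123456_a1b2c3d4
from datetime import datetime
import uuid

for kind in ("poster", "text", "topic", "content", "judge", "pipeline"):
    request_id = f"{kind}_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{str(uuid.uuid4())[:8]}"
    print(request_id)  # e.g. poster_20240715_123456_a1b2c3d4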
@@ -1,20 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-Document processing module
-Provides document text extraction, content integration, web search, and content transformation
-"""
-
-from .text_extractor import TextExtractor, ExtractedDocument
-from .content_integrator import ContentIntegrator, IntegratedContent
-from .content_transformer import ContentTransformer, TransformedContent
-
-__all__ = [
-    'TextExtractor',
-    'ExtractedDocument',
-    'ContentIntegrator',
-    'IntegratedContent',
-    'ContentTransformer',
-    'TransformedContent'
-]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,130 +0,0 @@
-import logging
-from typing import List, Dict, Any, Optional
-from dataclasses import dataclass
-from datetime import datetime
-from .text_extractor import ExtractedDocument
-import re
-
-logger = logging.getLogger(__name__)
-
-@dataclass
-class IntegratedContent:
-    """Integrated content"""
-    documents: List[ExtractedDocument]
-    document_count: int
-    total_content_length: int
-    document_types: Dict[str, int]
-    combined_content: str
-    content_summary: str
-    key_topics: List[str]
-
-    def __post_init__(self):
-        """Post-init processing"""
-        if not self.document_types:
-            self.document_types = {}
-            for doc in self.documents:
-                ext = doc.file_type.lower()
-                self.document_types[ext] = self.document_types.get(ext, 0) + 1
-
-class ContentIntegrator:
-    """Content integrator - merges information from multiple documents"""
-
-    def __init__(self):
-        pass
-
-    def integrate_documents(self, documents: List[ExtractedDocument]) -> IntegratedContent:
-        """Integrate multiple documents
-
-        Args:
-            documents: list of extracted documents
-
-        Returns:
-            IntegratedContent: the integrated content
-        """
-        if not documents:
-            return IntegratedContent(
-                documents=[],
-                document_count=0,
-                total_content_length=0,
-                document_types={},
-                combined_content="",
-                content_summary="没有提供文档内容",
-                key_topics=[]
-            )
-
-        # Count document types
-        document_types = {}
-        for doc in documents:
-            ext = doc.file_type.lower()
-            document_types[ext] = document_types.get(ext, 0) + 1
-
-        # Combine content
-        combined_content = self._combine_content(documents)
-        total_length = len(combined_content)
-
-        # Generate a summary
-        content_summary = self._generate_summary(documents)
-
-        # Extract key topics
-        key_topics = self._extract_key_topics(combined_content)
-
-        return IntegratedContent(
-            documents=documents,
-            document_count=len(documents),
-            total_content_length=total_length,
-            document_types=document_types,
-            combined_content=combined_content,
-            content_summary=content_summary,
-            key_topics=key_topics
-        )
-
-    def _combine_content(self, documents: List[ExtractedDocument]) -> str:
-        """Combine document contents"""
-        combined = []
-
-        for i, doc in enumerate(documents, 1):
-            combined.append(f"=== 文档 {i}: {doc.filename} ===")
-            combined.append(f"文件类型: {doc.file_type}")
-            combined.append(f"文件大小: {doc.file_size} 字节")
-            combined.append(f"提取时间: {doc.extracted_at}")
-            combined.append("")
-            combined.append("内容:")
-            combined.append(doc.content)
-            combined.append("")
-            combined.append("=" * 50)
-            combined.append("")
-
-        return "\n".join(combined)
-
-    def _generate_summary(self, documents: List[ExtractedDocument]) -> str:
-        """Generate a content summary"""
-        if not documents:
-            return "没有文档内容"
-
-        summary_parts = []
-        summary_parts.append(f"共处理了 {len(documents)} 个文档:")
-
-        for i, doc in enumerate(documents, 1):
-            content_preview = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content
-            summary_parts.append(f"{i}. {doc.filename} ({doc.file_type}): {content_preview}")
-
-        return "\n".join(summary_parts)
-
-    def _extract_key_topics(self, content: str) -> List[str]:
-        """Extract key topics (simple keyword extraction)"""
-        if not content:
-            return []
-
-        # Simple Chinese keyword extraction;
-        # a more sophisticated NLP method could be used here if needed
-        words = re.findall(r'[\u4e00-\u9fff]+', content)
-
-        # Count word frequencies
-        word_count = {}
-        for word in words:
-            if len(word) >= 2:  # only consider words of length >= 2
-                word_count[word] = word_count.get(word, 0) + 1
-
-        # Return the 10 most frequent words
-        sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
-        return [word for word, count in sorted_words[:10] if count > 1]
@@ -1,236 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-Content transformer module
-Uses an LLM to convert parsed document content into standardized scenic-spot and product material formats
-"""
-
-import logging
-from typing import Dict, Any, Optional, List
-from dataclasses import dataclass
-from datetime import datetime
-import uuid
-
-from .content_integrator import IntegratedContent
-from core.ai.ai_agent import AIAgent
-from core.config.manager import ConfigManager
-from utils.file_io import OutputManager
-
-logger = logging.getLogger(__name__)
-
-@dataclass
-class TransformedContent:
-    """Transformed content"""
-    original_content: IntegratedContent
-    transformed_text: str
-    format_type: str
-    transformation_metadata: Dict[str, Any]
-    transformed_at: datetime
-
-class ContentTransformer:
-    """Content transformer - converts integrated content into a target format"""
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self.config = config or {}
-        self.supported_formats = {
-            'attraction_standard': self._transform_to_attraction_standard,
-            'product_sales': self._transform_to_product_sales,
-            'travel_guide': self._transform_to_travel_guide,
-            'blog_post': self._transform_to_blog_post,
-            'summary': self._transform_to_summary
-        }
-
-    def transform_content(self,
-                          integrated_content: IntegratedContent,
-                          format_type: str = 'summary',
-                          custom_prompt: Optional[str] = None) -> TransformedContent:
-        """Transform content
-
-        Args:
-            integrated_content: the integrated content
-            format_type: target format type
-            custom_prompt: custom prompt
-
-        Returns:
-            TransformedContent: the transformed content
-        """
-        if format_type not in self.supported_formats:
-            raise ValueError(f"不支持的格式类型: {format_type}")
-
-        logger.info(f"开始转换内容,格式: {format_type}")
-
-        # Perform the transformation
-        transform_func = self.supported_formats[format_type]
-        transformed_text = transform_func(integrated_content, custom_prompt)
-
-        # Build transformation metadata
-        transformation_metadata = {
-            'format_type': format_type,
-            'source_document_count': integrated_content.document_count,
-            'source_content_length': integrated_content.total_content_length,
-            'transformed_content_length': len(transformed_text),
-            'key_topics_used': integrated_content.key_topics,
-            'custom_prompt_used': custom_prompt is not None
-        }
-
-        return TransformedContent(
-            original_content=integrated_content,
-            transformed_text=transformed_text,
-            format_type=format_type,
-            transformation_metadata=transformation_metadata,
-            transformed_at=datetime.now()
-        )
-
-    def _transform_to_attraction_standard(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
-        """Transform into the standard attraction format"""
-        template = """
-# 景点信息整理
-
-## 基本信息
-- 文档来源: {document_count}个文档
-- 主要主题: {key_topics}
-
-## 详细内容
-{combined_content}
-
-## 内容摘要
-{content_summary}
-
----
-*基于提供的文档整理,如需更多信息请参考原始文档*
-"""
-
-        return template.format(
-            document_count=content.document_count,
-            key_topics=", ".join(content.key_topics[:5]),
-            combined_content=content.combined_content,
-            content_summary=content.content_summary
-        )
-
-    def _transform_to_product_sales(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
-        """Transform into the product sales format"""
-        template = """
-# 产品销售资料
-
-## 产品特色
-基于{document_count}个文档的信息整理:
-
-{content_summary}
-
-## 详细介绍
-{combined_content}
-
-## 关键卖点
-{key_topics}
-
----
-*内容整理自提供的文档资料*
-"""
-
-        key_points = "\n".join([f"• {topic}" for topic in content.key_topics[:8]])
-
-        return template.format(
-            document_count=content.document_count,
-            content_summary=content.content_summary,
-            combined_content=content.combined_content,
-            key_topics=key_points
-        )
-
-    def _transform_to_travel_guide(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
-        """Transform into the travel guide format"""
-        template = """
-# 旅游指南
-
-## 概述
-{content_summary}
-
-## 详细信息
-{combined_content}
-
-## 重要提示
-- 信息来源: {document_count}个文档
-- 关键主题: {key_topics}
-
----
-*本指南基于提供的文档整理,出行前请核实最新信息*
-"""
-
-        return template.format(
-            content_summary=content.content_summary,
-            combined_content=content.combined_content,
-            document_count=content.document_count,
-            key_topics=", ".join(content.key_topics[:5])
-        )
-
-    def _transform_to_blog_post(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
-        """Transform into the blog post format"""
-        template = """
-# 博客文章
-
-## 前言
-本文基于{document_count}个文档资料整理而成。
-
-## 主要内容
-
-{combined_content}
-
-## 总结
-{content_summary}
-
-## 相关主题
-{key_topics}
-
----
-*本文内容整理自多个文档资料*
-"""
-
-        topics_list = "\n".join([f"- {topic}" for topic in content.key_topics[:10]])
-
-        return template.format(
-            document_count=content.document_count,
-            combined_content=content.combined_content,
-            content_summary=content.content_summary,
-            key_topics=topics_list
-        )
-
-    def _transform_to_summary(self, content: IntegratedContent, custom_prompt: Optional[str] = None) -> str:
-        """Transform into the summary format"""
-        template = """
-# 文档内容摘要
-
-## 文档统计
-- 文档数量: {document_count}
-- 文档类型: {document_types}
-- 内容长度: {content_length}字符
-
-## 内容摘要
-{content_summary}
-
-## 关键主题
-{key_topics}
-
-## 完整内容
-{combined_content}
-"""
-
-        doc_types = ", ".join([f"{k}({v}个)" for k, v in content.document_types.items()])
-        topics_list = "\n".join([f"• {topic}" for topic in content.key_topics])
-
-        return template.format(
-            document_count=content.document_count,
-            document_types=doc_types,
-            content_length=content.total_content_length,
-            content_summary=content.content_summary,
-            key_topics=topics_list,
-            combined_content=content.combined_content
-        )
-
-    def get_supported_formats(self) -> List[str]:
-        """Get the list of supported formats"""
-        return list(self.supported_formats.keys())
-
-    def add_custom_format(self, format_name: str, transform_func):
-        """Add a custom format"""
-        self.supported_formats[format_name] = transform_func
-        logger.info(f"添加自定义格式: {format_name}")
@@ -1,356 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-Text extractor module
-Supports extracting text content from PDF, Word, TXT and other document formats
-"""
-
-import os
-import logging
-from typing import List, Dict, Any, Optional
-from pathlib import Path
-from dataclasses import dataclass
-from datetime import datetime
-
-# Import optional dependencies
-try:
-    import PyPDF2
-    import pdfplumber
-    PDF_AVAILABLE = True
-except ImportError:
-    PDF_AVAILABLE = False
-
-try:
-    from docx import Document
-    DOCX_AVAILABLE = True
-except ImportError:
-    DOCX_AVAILABLE = False
-
-try:
-    import openpyxl
-    from openpyxl import load_workbook
-    EXCEL_AVAILABLE = True
-except ImportError:
-    EXCEL_AVAILABLE = False
-
-logger = logging.getLogger(__name__)
-
-@dataclass
-class ExtractedDocument:
-    """Extracted document data"""
-    filename: str
-    file_type: str
-    content: str  # plain text content
-    metadata: Dict[str, Any]  # document metadata
-    extracted_at: datetime
-    file_size: int
-    page_count: Optional[int] = None
-
-    def __post_init__(self):
-        # Make sure content is a string
-        if not isinstance(self.content, str):
-            self.content = str(self.content)
-
-class TextExtractor:
-    """Text extractor - plain text extraction only, preserving all original content"""
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self.config = config or {}
-        self.supported_formats = {
-            '.pdf': self._extract_pdf,
-            '.docx': self._extract_docx,
-            '.doc': self._extract_doc,
-            '.txt': self._extract_txt,
-            '.md': self._extract_txt,
-            '.xlsx': self._extract_xlsx,
-            '.xls': self._extract_xls,
-            '.csv': self._extract_csv
-        }
-
-    def extract(self, file_path: str) -> ExtractedDocument:
-        """Extract the text content of a single file"""
-        path_obj = Path(file_path)
-
-        if not path_obj.exists():
-            raise FileNotFoundError(f"文件不存在: {file_path}")
-
-        file_ext = path_obj.suffix.lower()
-        if file_ext not in self.supported_formats:
-            raise ValueError(f"不支持的文件格式: {file_ext}")
-
-        try:
-            # Gather file info
-            file_size = path_obj.stat().st_size
-
-            # Extract text content
-            extractor = self.supported_formats[file_ext]
-            content, metadata = extractor(path_obj)
-
-            return ExtractedDocument(
-                filename=path_obj.name,
-                file_type=file_ext,
-                content=content,
-                metadata=metadata,
-                extracted_at=datetime.now(),
-                file_size=file_size,
-                page_count=metadata.get('page_count')
-            )
-
-        except Exception as e:
-            logger.error(f"提取文件 {file_path} 时出错: {str(e)}")
-            raise
-
-    def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:
-        """Extract text content from multiple files in batch"""
-        results = []
-
-        for file_path in file_paths:
-            try:
-                result = self.extract(file_path)
-                results.append(result)
-                logger.info(f"成功提取文件: {file_path}")
-            except Exception as e:
-                logger.error(f"提取文件 {file_path} 失败: {str(e)}")
-                # Create an error record
-                error_doc = ExtractedDocument(
-                    filename=Path(file_path).name,
-                    file_type=Path(file_path).suffix.lower(),
-                    content=f"提取失败: {str(e)}",
-                    metadata={"error": str(e)},
-                    extracted_at=datetime.now(),
-                    file_size=0
-                )
-                results.append(error_doc)
-
-        return results
-
-    def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from a PDF file"""
-        if not PDF_AVAILABLE:
-            raise ImportError("需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber")
-
-        content_parts = []
-        metadata = {}
-
-        try:
-            # Use pdfplumber for extraction (better text quality)
-            with pdfplumber.open(file_path) as pdf:
-                metadata['page_count'] = len(pdf.pages)
-
-                for page_num, page in enumerate(pdf.pages, 1):
-                    page_text = page.extract_text()
-                    if page_text:
-                        content_parts.append(f"=== 第 {page_num} 页 ===\n{page_text}\n")
-
-                # Read document metadata
-                if pdf.metadata:
-                    metadata.update({
-                        'title': pdf.metadata.get('Title', ''),
-                        'author': pdf.metadata.get('Author', ''),
-                        'subject': pdf.metadata.get('Subject', ''),
-                        'creator': pdf.metadata.get('Creator', ''),
-                        'producer': pdf.metadata.get('Producer', ''),
-                        'creation_date': pdf.metadata.get('CreationDate', ''),
-                        'modification_date': pdf.metadata.get('ModDate', '')
-                    })
-
-        except Exception as e:
-            logger.warning(f"pdfplumber提取失败,尝试使用PyPDF2: {str(e)}")
-
-            # Fallback: use PyPDF2
-            with open(file_path, 'rb') as file:
-                pdf_reader = PyPDF2.PdfReader(file)
-                metadata['page_count'] = len(pdf_reader.pages)
-
-                for page_num, page in enumerate(pdf_reader.pages, 1):
-                    page_text = page.extract_text()
-                    if page_text:
-                        content_parts.append(f"=== 第 {page_num} 页 ===\n{page_text}\n")
-
-                # Read document metadata
-                if pdf_reader.metadata:
-                    metadata.update({
-                        'title': pdf_reader.metadata.get('/Title', ''),
-                        'author': pdf_reader.metadata.get('/Author', ''),
-                        'subject': pdf_reader.metadata.get('/Subject', ''),
-                        'creator': pdf_reader.metadata.get('/Creator', ''),
-                        'producer': pdf_reader.metadata.get('/Producer', ''),
-                        'creation_date': pdf_reader.metadata.get('/CreationDate', ''),
-                        'modification_date': pdf_reader.metadata.get('/ModDate', '')
-                    })
-
-        content = '\n'.join(content_parts) if content_parts else ""
-        return content, metadata
-
-    def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from a DOCX file"""
-        if not DOCX_AVAILABLE:
-            raise ImportError("需要安装 python-docx: pip install python-docx")
-
-        doc = Document(str(file_path))
-        content_parts = []
-        metadata = {}
-
-        # Extract all paragraph text
-        for paragraph in doc.paragraphs:
-            if paragraph.text.strip():
-                content_parts.append(paragraph.text)
-
-        # Extract table content
-        for table in doc.tables:
-            table_content = []
-            for row in table.rows:
-                row_content = []
-                for cell in row.cells:
-                    row_content.append(cell.text.strip())
-                table_content.append('\t'.join(row_content))
-            if table_content:
-                content_parts.append('\n=== 表格 ===\n' + '\n'.join(table_content) + '\n')
-
-        # Read document properties
-        core_props = doc.core_properties
-        metadata.update({
-            'title': core_props.title or '',
-            'author': core_props.author or '',
-            'subject': core_props.subject or '',
-            'keywords': core_props.keywords or '',
-            'comments': core_props.comments or '',
-            'created': str(core_props.created) if core_props.created else '',
-            'modified': str(core_props.modified) if core_props.modified else '',
-            'last_modified_by': core_props.last_modified_by or '',
-            'paragraph_count': len(doc.paragraphs),
-            'table_count': len(doc.tables)
-        })
-
-        content = '\n'.join(content_parts)
-        return content, metadata
-
-    def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from a DOC file"""
-        # The DOC format is complex; converting to DOCX or using a dedicated library is recommended
-        logger.warning("DOC格式支持有限,建议转换为DOCX格式")
-
-        # Try reading it as a text file
-        try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
-                content = file.read()
-        except:
-            with open(file_path, 'r', encoding='gbk', errors='ignore') as file:
-                content = file.read()
-
-        metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}
-        return content, metadata
-
-    def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from a TXT/MD file"""
-        encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']
-        content = ""
-        used_encoding = ""
-
-        for encoding in encodings:
-            try:
-                with open(file_path, 'r', encoding=encoding) as file:
-                    content = file.read()
-                used_encoding = encoding
-                break
-            except UnicodeDecodeError:
-                continue
-
-        if not content:
-            # Last resort: ignore decoding errors
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
-                content = file.read()
-            used_encoding = 'utf-8 (with errors ignored)'
-
-        metadata = {
-            'encoding': used_encoding,
-            'line_count': len(content.splitlines()),
-            'char_count': len(content)
-        }
-
-        return content, metadata
-
-    def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from an XLSX file"""
-        if not EXCEL_AVAILABLE:
-            raise ImportError("需要安装 openpyxl: pip install openpyxl")
-
-        workbook = load_workbook(file_path, read_only=True)
-        content_parts = []
-        metadata = {
-            'sheet_count': len(workbook.sheetnames),
-            'sheet_names': workbook.sheetnames
-        }
-
-        for sheet_name in workbook.sheetnames:
-            sheet = workbook[sheet_name]
-            content_parts.append(f"\n=== 工作表: {sheet_name} ===\n")
-
-            for row in sheet.iter_rows(values_only=True):
-                row_content = []
-                for cell in row:
-                    if cell is not None:
-                        row_content.append(str(cell))
-                    else:
-                        row_content.append("")
-                if any(cell.strip() for cell in row_content):  # skip empty rows
-                    content_parts.append('\t'.join(row_content))
-
-        content = '\n'.join(content_parts)
-        return content, metadata
-
-    def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from an XLS file"""
-        logger.warning("XLS格式支持有限,建议转换为XLSX格式")
-
-        # Naive text extraction
-        try:
-            with open(file_path, 'rb') as file:
-                content = file.read().decode('utf-8', errors='ignore')
-        except:
-            content = f"无法读取XLS文件: {file_path}"
-
-        metadata = {'format': 'xls', 'note': '可能存在格式问题'}
-        return content, metadata
-
-    def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
-        """Extract plain text from a CSV file"""
-        encodings = ['utf-8', 'gbk', 'gb2312']
-        content = ""
-        used_encoding = ""
-
-        for encoding in encodings:
-            try:
-                with open(file_path, 'r', encoding=encoding) as file:
-                    content = file.read()
-                used_encoding = encoding
-                break
-            except UnicodeDecodeError:
-                continue
-
-        if not content:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
-                content = file.read()
-            used_encoding = 'utf-8 (with errors ignored)'
-
-        # Compute row and column counts
-        lines = content.splitlines()
-        row_count = len(lines)
-        col_count = len(lines[0].split(',')) if lines else 0
-
-        metadata = {
-            'encoding': used_encoding,
-            'row_count': row_count,
-            'estimated_col_count': col_count
-        }
-
-        return content, metadata
-
-    def get_supported_formats(self) -> List[str]:
-        """Get the list of supported file formats"""
-        return list(self.supported_formats.keys())
-
-    def is_supported(self, file_path: str) -> bool:
-        """Check whether the file format is supported"""
-        return Path(file_path).suffix.lower() in self.supported_formats
@@ -4,7 +4,7 @@
 2. 你策划选题的时候,请根据产品特性,灵活选取我给你的产品信息,进行能吸引用户的内容选题策划。
 3. 你需要按照选题数量完成所有的选题,不能省略。选题数量不能少于提示词要求数量。如果用户给出的日期数小于选题数,你可以在一个日期内给出多个选题,以保证得到的选题数和用户提出的需求相符。
 4. 选题风格要根据所选用户需求,有逻辑的选择不同的风格即可。
-5. **关键规则**:你必须从给你的资料中选择文案风格和面向人群画像,并严谨摘取**完整的文件名**。
+5. **关键规则**:你必须从给你的资料中选择文案风格和面向人群画像,并严谨摘取**完整的文件名**。用户画像在 Demand 文件夹, 命名为 `XXX文旅需求.txt`; 文案风格在 Style 文件夹, 命名为 `XXX风文案提示词.txt`。**绝对不允许自己创造文件名或省略 `.txt` 后缀**。
 
 输出要求:
 1. **你必须严格按照 JSON 格式输出。** 输出内容应该是一个 JSON 数组,数组中的每个元素代表一个选题,是一个包含以下字段的 JSON 对象。
@@ -16,9 +16,9 @@
 - `object`: 选定对象 (例如 "泰宁古城",只能从用户提示词中提到的object列表中选择, 如果用户没有提到,则选择"None", 不能自己创造, 不能选择object列表之外的景区)
 - `product`: 选定产品内容 (如果没有提供单独产品,则为"None")
 - `product_logic`: 选定产品的逻辑内容 (描述性文本)
-- `style`: 选题风格的文件名。**必须是从 Style 文件夹中选择的完整文件名,例如 "攻略风文案"。**
+- `style`: 选题风格的文件名。**必须是从 Style 文件夹中选择的完整文件名,例如 "攻略风文案提示词.txt"。**
 - `style_logic`: 选题风格的逻辑内容 (描述性文本)
-- `target_audience`: 选题面向人群的文件名。**必须是从 Demand 文件夹中选择的完整文件名,例如 "亲子向"。**
+- `target_audience`: 选题面向人群的文件名。**必须是从 Demand 文件夹中选择的完整文件名,例如 "亲子家庭文旅需求.txt"。**
 - `target_audience_logic`: 选题面向人群的逻辑内容 (描述性文本)
 4. 请确保生成的 JSON 数组包含用户要求的准确数量的选题对象。
 5. **不要虚构景区信息、活动信息或价格**。任何活动信息以用户所给明确资料为准。
@@ -35,9 +35,9 @@
         "object": "泰宁古城",
         "product": "...",
         "product_logic": "结合住宿和导览,提供便捷的家庭游解决方案",
-        "style": "攻略风",
+        "style": "攻略风文案提示词.txt",
         "style_logic": "强调家庭共享时光和文化体验",
-        "target_audience": "亲子向",
+        "target_audience": "亲子向文旅需求.txt",
         "target_audience_logic": "满足家长带娃出游,寓教于乐的需求"
     },
     {
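The strengthened rule 5 above, together with the .txt-suffixed examples, makes the model's style and target_audience fields directly usable as file names under the Style and Demand folders. For context, a hedged sketch of the kind of check a consumer of this JSON could apply (the function name and error handling are illustrative assumptions, not code from the repo; the Style/Demand folder names come from the prompt itself):

from pathlib import Path

def resolve_prompt_files(topic: dict) -> tuple[Path, Path]:
    # Fails loudly when the model invented a file name or dropped the .txt suffix.
    style = Path("Style") / topic["style"]
    audience = Path("Demand") / topic["target_audience"]
    for p in (style, audience):
        if p.suffix != ".txt" or not p.exists():
            raise ValueError(f"LLM returned an unknown prompt file: {p}")
    return style, audience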
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-Document content extraction example
-Focuses on document content extraction; no web search involved
-"""
-
-import os
-import sys
-from pathlib import Path
-
-# Add the project root to the Python path
-project_root = Path(__file__).parent
-sys.path.insert(0, str(project_root))
-
-from document import TextExtractor, ContentIntegrator, ContentTransformer
-from api.services.document_service import DocumentService
-
-def test_single_document_extraction():
-    """Test single-document extraction"""
-    print("=== 单个文档提取测试 ===")
-
-    # Create the document service
-    service = DocumentService()
-
-    # Test file path (replace with an actual file path)
-    test_file = "test_document.pdf"  # replace with an actual file path
-
-    if not os.path.exists(test_file):
-        print(f"测试文件不存在: {test_file}")
-        print("请将一个测试文档放在当前目录下,或修改test_file变量")
-        return
-
-    # Text extraction only
-    print(f"提取文档: {test_file}")
-    result = service.extract_text_only(test_file)
-
-    if result['success']:
-        doc = result['document']
-        print(f"文件名: {doc['filename']}")
-        print(f"文件类型: {doc['file_type']}")
-        print(f"文件大小: {doc['file_size']} 字节")
-        print(f"内容长度: {doc['content_length']} 字符")
-        print(f"页数: {doc['page_count']}")
-        print(f"提取时间: {doc['extracted_at']}")
-        print("\n内容预览:")
-        print(doc['content'][:500] + "..." if len(doc['content']) > 500 else doc['content'])
-    else:
-        print(f"提取失败: {result['error']}")
-
-def test_multiple_documents_processing(save_path):
-    """Test multi-document processing"""
-    print("\n=== 多个文档处理测试 ===")
-
-    # Create the document service
-    service = DocumentService()
-
-    # Test file list (replace with actual file paths)
-    test_files = [
-        "document/sample_documents/Ai服务商家资料收集202506.xlsx",
-        "document/sample_documents/ai营销括客话术和QA整理.docx",
-        "document/sample_documents/附件1-服务器租赁发票20250702.pdf"
-    ]
-
-    # Keep only the files that exist
-    existing_files = [f for f in test_files if os.path.exists(f)]
-
-    if not existing_files:
-        print("没有找到测试文件")
-        print("请将测试文档放在当前目录下,或修改test_files变量")
-        return
-
-    print(f"处理文档: {existing_files}")
-
-    # Process multiple documents
-    result = service.process_multiple_documents(existing_files, output_format='summary')
-
-    if result['success']:
-        print(f"处理摘要:")
-        summary = result['processing_summary']
-        print(f"  总文件数: {summary['total_files']}")
-        print(f"  成功提取: {summary['successful_extractions']}")
-        print(f"  失败提取: {summary['failed_extractions']}")
-
-        print(f"\n文档列表:")
-        for i, doc in enumerate(result['documents'], 1):
-            print(f"  {i}. {doc['filename']} ({doc['file_type']}) - {doc['content_length']} 字符")
-
-        print(f"\n整合内容:")
-        integrated = result['integrated_content']
-        print(f"  文档数量: {integrated['document_count']}")
-        print(f"  总内容长度: {integrated['total_content_length']}")
-        print(f"  文档类型: {integrated['document_types']}")
-        print(f"  关键主题: {integrated['key_topics']}")
-
-        print(f"\n内容摘要:")
-        print(integrated['content_summary'])
-
-        print(f"\n转换后的内容:")
-        transformed = result['transformed_content']
-        print(f"  格式类型: {transformed['format_type']}")
-        print(f"  转换时间: {transformed['transformed_at']}")
-        print("\n转换内容预览:")
-        content = transformed['content']
-        # print(content[:1000] + "..." if len(content) > 1000 else content)
-        if save_path:
-            with open(save_path, 'w', encoding='utf-8') as f:
-                f.write(content)
-            print(f"转换后的内容已保存到: {save_path}")
-    else:
-        print(f"处理失败: {result['error']}")
-
-def test_component_usage():
-    """Test using the components individually"""
-    print("\n=== 组件单独使用测试 ===")
-
-    # Test file
-    test_file = "test_document.txt"
-
-    # Create the test file
-    if not os.path.exists(test_file):
-        with open(test_file, 'w', encoding='utf-8') as f:
-            f.write("""
-这是一个测试文档。
-
-主要内容包括:
-1. 文档提取功能测试
-2. 内容整合功能测试
-3. 内容转换功能测试
-
-测试文档包含中文内容,用于验证文本提取和处理功能。
-""")
-        print(f"创建测试文件: {test_file}")
-
-    # 1. Text extractor test
-    print("\n1. 文本提取器测试")
-    extractor = TextExtractor()
-    extracted_doc = extractor.extract(test_file)
-
-    print(f"提取结果:")
-    print(f"  文件名: {extracted_doc.filename}")
-    print(f"  文件类型: {extracted_doc.file_type}")
-    print(f"  内容长度: {len(extracted_doc.content)}")
-    print(f"  内容: {extracted_doc.content}")
-
-    # 2. Content integrator test
-    print("\n2. 内容整合器测试")
-    integrator = ContentIntegrator()
-    integrated_content = integrator.integrate_documents([extracted_doc])
-
-    print(f"整合结果:")
-    print(f"  文档数量: {integrated_content.document_count}")
-    print(f"  总内容长度: {integrated_content.total_content_length}")
-    print(f"  文档类型: {integrated_content.document_types}")
-    print(f"  关键主题: {integrated_content.key_topics}")
-    print(f"  内容摘要: {integrated_content.content_summary}")
-
-    # 3. Content transformer test
-    print("\n3. 内容转换器测试")
-    transformer = ContentTransformer()
-
-    # Try the different formats
-    formats = ['summary', 'attraction_standard', 'product_sales', 'travel_guide', 'blog_post']
-
-    for format_type in formats:
-        print(f"\n--- {format_type} 格式 ---")
-        transformed_content = transformer.transform_content(integrated_content, format_type=format_type)
-        print(f"转换结果预览:")
-        content = transformed_content.transformed_text
-        print(content[:300] + "..." if len(content) > 300 else content)
-
-def test_service_info():
-    """Test service info"""
-    print("\n=== 服务信息测试 ===")
-
-    service = DocumentService()
-
-    print("支持的文件类型:")
-    file_types = service.get_supported_file_types()
-    print(f"  {file_types}")
-
-    print("\n支持的输出格式:")
-    output_formats = service.get_supported_formats()
-    print(f"  {output_formats}")
-
-    print("\n服务状态:")
-    status = service.get_service_status()
-    print(f"  服务名: {status['service_name']}")
-    print(f"  状态: {status['status']}")
-    print(f"  组件: {status['components']}")
-    print(f"  时间戳: {status['timestamp']}")
-
-def main():
-    """Main entry point"""
-    print("文档内容提取示例程序")
-    print("=" * 50)
-
-    try:
-        # Test service info
-        test_service_info()
-
-        # Test components individually
-        test_component_usage()
-
-        # Test single-document extraction
-        test_single_document_extraction()
-
-        # Test multi-document processing
-        test_multiple_documents_processing("document/sample_documents/test.txt")
-
-        print("\n" + "=" * 50)
-        print("测试完成!")
-
-    except Exception as e:
-        print(f"测试过程中发生错误: {str(e)}")
-        import traceback
-        traceback.print_exc()
-
-if __name__ == "__main__":
-    main()
Binary file not shown.
Binary file not shown.
@@ -146,4 +146,3 @@ class TopicGenerator:
         return topics
 
 
-
@@ -47,7 +47,6 @@ class TopicParser:
         # Validate that each topic contains all required keys
         valid_topics = []
         required_keys = {"index", "date", "logic", "object", "product", "style", "target_audience"}
-        optional_keys = {"product_logic", "style_logic", "target_audience_logic"}
 
         for i, item in enumerate(parsed_json):
             if isinstance(item, dict) and required_keys.issubset(item.keys()):
Binary file not shown.
@@ -202,12 +202,14 @@ def process_llm_json_text(text: Any) -> Optional[Dict[str, Any]]:
     for candidate in json_candidates:
         # Try to parse directly
         try:
+            import json
             return json.loads(candidate)
         except json.JSONDecodeError:
             pass
 
         # Try to repair with json_repair
         try:
+            import json_repair
             return json_repair.loads(candidate)
         except Exception:
             continue
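For context, the pattern in this hunk — strict json.loads first, json_repair.loads as the fallback — is a common way to cope with almost-JSON from an LLM. A self-contained sketch under that assumption (the helper name parse_candidate is illustrative, not the project's function; json_repair is the pip-installable json-repair package the diff already uses):

import json

def parse_candidate(candidate: str):
    # Strict parse first: cheap, and well-formed JSON passes through unmodified.
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass
    # Fall back to the tolerant parser for trailing commas, single quotes, etc.
    try:
        import json_repair
        return json_repair.loads(candidate)
    except Exception:
        return None

print(parse_candidate('{"index": "1",}'))  # parsed despite the trailing comma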