TravelContentCreator/core/topic_parser.py

132 lines
6.1 KiB
Python
Raw Normal View History

import re
import json
import os
import traceback # Import traceback for better error logging
class TopicParser:
    """Topic parser: parses, saves and reloads the AI-generated topic list.

    Every method is static; the class is used purely as a namespace.
    """

    # Keys that every topic object must provide to be accepted by parse_topics.
    _REQUIRED_KEYS = frozenset({
        "index", "date", "logic", "object", "product", "product_logic",
        "style", "style_logic", "target_audience", "target_audience_logic",
    })

    @staticmethod
    def _decode_json_payload(cleaned_text):
        """Decode *cleaned_text* as JSON, tolerating prose around a JSON array.

        The text is decoded as-is first. Only if that fails is the span from
        the first ``[`` to the last ``]`` decoded instead, which recovers
        replies where extra text (or an unrecognised code-fence label)
        surrounds the array.

        Args:
            cleaned_text: Text with Markdown fences already stripped.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: The error from decoding the full text, when
                neither attempt succeeds.
        """
        try:
            return json.loads(cleaned_text)
        except json.JSONDecodeError as first_error:
            start = cleaned_text.find('[')
            end = cleaned_text.rfind(']')
            if start == -1 or end < start:
                raise
            try:
                return json.loads(cleaned_text[start:end + 1])
            except json.JSONDecodeError:
                # Surface the error for the complete text, since that is the
                # text the caller logs alongside the message.
                raise first_error from None

    @staticmethod
    def _reassign_indices(result_list):
        """Make every item's 'index' a unique integer string, in place.

        Pass one keeps each index that parses as an integer and has not been
        seen before. Pass two gives every remaining item (unparsable or
        duplicate index) the smallest unused integer, counting up from 1.

        Args:
            result_list: Topic dicts whose 'index' values are strings.
        """
        used_indices = set()
        for item in result_list:
            raw_index = item.get('index', '')
            try:
                index_value = int(raw_index.strip())
            except (ValueError, TypeError):
                item['index'] = None  # Not a number: reassign in pass two.
                continue
            if index_value in used_indices:
                item['index'] = None  # Duplicate: reassign in pass two.
            else:
                item['index'] = str(index_value)
                used_indices.add(index_value)

        next_available_index = 1
        for item in result_list:
            if item.get('index') is not None:
                continue
            while next_available_index in used_indices:
                next_available_index += 1
            item['index'] = str(next_available_index)
            used_indices.add(next_available_index)
            next_available_index += 1

    @staticmethod
    def parse_topics(result):
        """Parse the JSON-formatted topic list returned by the AI.

        Markdown code-fence markers are stripped, the text is decoded as
        JSON, and every element that is a dict containing all required keys
        is kept. Keys and values of kept elements are converted to strings,
        an ``'error': False`` entry is added, and indices are normalised so
        that each one is a unique integer string.

        Args:
            result: Raw text returned by the AI.

        Returns:
            A list of topic dicts. The list is empty when the text cannot be
            decoded, is not a JSON array, contains no valid topic, or any
            unexpected error occurs.
        """
        print("\n开始解析 JSON 格式的选题结果...")
        result_list = []
        try:
            # Strip Markdown code-fence markers (```json ... ```) if present.
            cleaned_result = re.sub(r'^```json\s*|\s*```$', '', result.strip(), flags=re.MULTILINE)
            parsed_json = TopicParser._decode_json_payload(cleaned_result)
            if isinstance(parsed_json, list):
                print(f"成功解析 JSON包含 {len(parsed_json)} 个潜在选题。")
                for i, item in enumerate(parsed_json):
                    if isinstance(item, dict) and TopicParser._REQUIRED_KEYS.issubset(item.keys()):
                        # Force everything to str in case the AI returned
                        # numbers, nulls or nested values.
                        valid_item = {str(k): str(v) for k, v in item.items()}
                        # 'error' flag kept for compatibility with the older interface.
                        valid_item['error'] = False
                        result_list.append(valid_item)
                    else:
                        print(f"警告: 第 {i+1} 个元素不是有效选题对象或缺少键: {item}")
            else:
                print(f"错误: 解析结果不是一个 JSON 数组 (List)。实际类型: {type(parsed_json)}")
        except json.JSONDecodeError as e:
            print(f"错误: 解析 JSON 失败 - {e}")
            print("------ 无法解析的原始文本 ------")
            print(result)
            print("-------------------------------")
            return []
        except Exception as e:  # e.g. a non-string `result`
            print(f"解析选题时发生意外错误: {e}")
            traceback.print_exc()
            return []

        print(f"最终成功解析选题数量:{len(result_list)}")
        # The AI may emit invalid or duplicate indices; normalise them.
        if result_list:
            print("重新分配和验证选题索引...")
            TopicParser._reassign_indices(result_list)
            print("选题索引已重新分配完毕。")
        return result_list

    @staticmethod
    def save_topics(result_list, output_dir, run_id, result=None):
        """Save the parsed topics to ``tweet_topic_<run_id>.json`` in *output_dir*.

        The output directory is created if needed. Any failure, including a
        failure to create the directory, is reported and turned into a
        ``(False, None)`` return value instead of propagating. When saving
        fails and *result* is truthy, a best-effort attempt is made to dump
        it to ``error_log_<run_id>.txt`` in the same directory.

        Args:
            result_list: JSON-serialisable topic list to write.
            output_dir: Directory that receives the file.
            run_id: Identifier embedded in the file names.
            result: Optional raw AI output, logged only if saving fails.

        Returns:
            ``(True, json_path)`` on success, ``(False, None)`` on failure.
        """
        try:
            # Path building and directory creation are guarded too, so a bad
            # output_dir yields (False, None) like any other save failure.
            json_path = os.path.join(output_dir, f"tweet_topic_{run_id}.json")
            os.makedirs(output_dir, exist_ok=True)
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(result_list, f, ensure_ascii=False, indent=4)
            print(f"选题结果已保存到: {json_path}")
            return True, json_path
        except Exception as e:
            print(f"错误: 保存选题 JSON 文件失败 - {e}")
            traceback.print_exc()
            if result:
                try:
                    error_log_file = os.path.join(output_dir, f"error_log_{run_id}.txt")
                    with open(error_log_file, "w", encoding="utf-8") as f:
                        f.write("无法解析或保存选题,原始内容如下:\n\n")
                        f.write(result)
                    print(f"原始 AI 输出已记录到: {error_log_file}")
                except Exception as log_e:
                    print(f"错误: 记录原始输出失败 - {log_e}")
            return False, None

    @staticmethod
    def load_topics_from_json(json_path):
        """Load a topic list from a JSON file.

        Args:
            json_path: Path of the JSON file to read (decoded as UTF-8).

        Returns:
            The decoded list, or ``None`` when the file is missing, is not
            valid JSON, does not contain a JSON array, or any other error
            occurs while reading it.
        """
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                topics_list = json.load(f)
        except FileNotFoundError:
            print(f"错误: 找不到选题文件 {json_path}")
            return None
        except json.JSONDecodeError as e:
            print(f"错误: 解析选题文件 {json_path} 失败 - {e}")
            return None
        except Exception as e:
            print(f"加载选题文件时发生意外错误: {e}")
            traceback.print_exc()
            return None

        # Basic validation: only a top-level array counts as a topic list.
        if not isinstance(topics_list, list):
            print(f"错误: {json_path} 中的内容不是一个有效的 JSON 数组。")
            return None
        print(f"{json_path} 加载了 {len(topics_list)} 个选题。")
        return topics_list