更改了思考结果中 JSON 的读取方式，修复了部分 JSON 文件解析失败的 BUG。修改了默认海报配置内容：如果海报文案没有正常生成，会给出内容为空（标题和文案均为空字符串）的配置文件。

2025-04-27 13:08:04 +08:00 · 2025-04-27 13:08:04 +08:00 · 078ff22725
commit 078ff22725
parent 32562f62e7
1 changed files with 136 additions and 51 deletions
--- a/utils/content_generator.py
+++ b/utils/content_generator.py
@ -70,44 +70,87 @@ class ContentGenerator:
            分割后的json内容
        """
        try:
+            # 记录原始内容的前200个字符（用于调试）
+            self.logger.debug(f"解析内容，原始内容前200字符: {content[:200]}")
+            
            # 首先尝试直接解析整个内容，以防已经是干净的 JSON
            try:
-                return json.loads(content)
+                parsed_data = json.loads(content)
+                
+                # 验证解析后的数据格式
+                if isinstance(parsed_data, list):
+                    # 如果是列表，验证每个元素是否符合预期结构
+                    for item in parsed_data:
+                        if isinstance(item, dict) and ('main_title' in item or 'texts' in item):
+                            # 至少有一个元素符合海报配置结构
+                            self.logger.info("成功直接解析为JSON格式列表，符合预期结构")
+                            return parsed_data
+                    
+                    # 如果到这里，说明列表内没有符合结构的元素
+                    if len(parsed_data) > 0 and isinstance(parsed_data[0], str):
+                        self.logger.warning(f"解析到JSON列表，但内容是字符串列表: {parsed_data}")
+                        # 将字符串列表返回供后续修复
+                        return parsed_data
+                    
+                    self.logger.warning("解析到JSON列表，但结构不符合预期")
+                    
+                elif isinstance(parsed_data, dict) and ('main_title' in parsed_data or 'texts' in parsed_data):
+                    # 单个字典结构符合预期
+                    self.logger.info("成功直接解析为JSON字典，符合预期结构")
+                    return parsed_data
+                
+                # 如果结构不符合预期，记录但仍返回解析结果，交给后续函数修复
+                self.logger.warning(f"解析到JSON，但结构不完全符合预期: {parsed_data}")
+                return parsed_data
+                
            except json.JSONDecodeError:
-                pass  # 不是干净的 JSON，继续处理
+                # 不是完整有效的JSON，继续尝试提取
+                self.logger.debug("直接JSON解析失败，尝试提取结构化内容")
            
            # 常规模式：查找 ```json 和 ``` 之间的内容
            if "```json" in content:
                json_str = content.split("```json")[1].split("```")[0].strip()
                try:
-                    return json.loads(json_str)
+                    parsed_json = json.loads(json_str)
+                    self.logger.info("成功从```json```代码块提取JSON")
+                    return parsed_json
                except json.JSONDecodeError as e:
-                    self.logger.warning(f"常规格式解析失败: {e}, 尝试其他方法")
+                    self.logger.warning(f"从```json```提取的内容解析失败: {e}, 尝试其他方法")
            
            # 备用模式1：查找连续的 [ 开头和 ] 结尾的部分
            import re
-            json_pattern = r'(\[.*?\])'
+            json_pattern = r'(\[(?:\s*\{.*?\}\s*,?)+\s*\])'  # 更严格的模式，要求[]内至少有一个{}对象
            json_matches = re.findall(json_pattern, content, re.DOTALL)
-            if json_matches:
-                for match in json_matches:
-                    try:
-                        result = json.loads(match)
-                        if isinstance(result, list) and len(result) > 0:
-                            return result
-                    except:
-                        continue
+            
+            for match in json_matches:
+                try:
+                    result = json.loads(match)
+                    if isinstance(result, list) and len(result) > 0:
+                        # 验证结构
+                        for item in result:
+                            if isinstance(item, dict) and ('main_title' in item or 'texts' in item):
+                                self.logger.info("成功从正则表达式提取JSON数组")
+                                return result
+                        self.logger.warning("从正则表达式提取的JSON数组不符合预期结构")
+                except Exception as e:
+                    self.logger.warning(f"解析正则匹配的内容失败: {e}")
+                    continue
            
            # 备用模式2：查找 [ 开头 和 ] 结尾，并尝试解析
            content = content.strip()
            square_bracket_start = content.find('[')
            square_bracket_end = content.rfind(']')
            
-            if square_bracket_start != -1 and square_bracket_end != -1:
+            if square_bracket_start != -1 and square_bracket_end != -1 and square_bracket_end > square_bracket_start:
                potential_json = content[square_bracket_start:square_bracket_end + 1]
                try:
-                    return json.loads(potential_json)
-                except:
-                    self.logger.warning("尝试提取方括号内容失败")
+                    result = json.loads(potential_json)
+                    if isinstance(result, list):
+                        # 检查列表内容
+                        self.logger.info(f"成功从方括号内容提取列表: {result}")
+                        return result
+                except Exception as e:
+                    self.logger.warning(f"尝试提取方括号内容失败: {e}")
            
            # 最后一种尝试：查找所有可能的 JSON 结构并尝试解析
            json_structures = re.findall(r'({.*?})', content, re.DOTALL)
@ -117,14 +160,22 @@ class ContentGenerator:
                    try:
                        item = json.loads(struct)
                        # 验证结构包含预期字段
-                        if 'main_title' in item and ('texts' in item or 'index' in item):
+                        if isinstance(item, dict) and ('main_title' in item or 'texts' in item):
                            items.append(item)
-                    except:
+                    except Exception as e:
+                        self.logger.warning(f"解析可能的JSON结构 {i+1} 失败: {e}")
                        continue
                
                if items:
+                    self.logger.info(f"成功从文本中提取 {len(items)} 个JSON对象")
                    return items
            
+            # 如果以上所有方法都失败，尝试简单字符串处理
+            if "|" in content or "必打卡" in content or "性价比" in content:
+                # 这可能是一个简单的标签字符串
+                self.logger.warning(f"无法提取标准JSON，但发现可能的标签字符串: {content}")
+                return content.strip()
+            
            # 都失败了，打印错误并引发异常
            self.logger.error(f"无法解析内容，返回原始文本: {content[:200]}...")
            raise ValueError("无法从响应中提取有效的 JSON 格式")
@ -132,7 +183,7 @@ class ContentGenerator:
        except Exception as e:
            self.logger.error(f"解析内容时出错: {e}")
            self.logger.debug(f"原始内容: {content[:200]}...")  # 仅显示前200个字符
-            raise e
+            return content.strip()  # 返回原始内容，让后续验证函数处理
    
    def generate_posters(self, 
                        poster_num, 
@ -296,8 +347,8 @@ class ContentGenerator:
        for i in range(poster_num):
            default_configs.append({
                "index": i + 1,
-                "main_title": f" ",
-                "texts": [" ", " "]
+                "main_title": "",
+                "texts": ["", ""]
            })
        return json.dumps(default_configs, ensure_ascii=False)

@ -321,8 +372,7 @@ class ContentGenerator:
            parsed_data = self.split_content(full_response)
            
            # 验证内容格式并修复
-            # validated_data = self._validate_and_fix_data(parsed_data)
-            validated_data = parsed_data
+            validated_data = self._validate_and_fix_data(parsed_data)
            # 创建结果文件路径
            result_path = os.path.join(output_dir, f"{date_time}.json")
            os.makedirs(os.path.dirname(result_path), exist_ok=True)
@ -337,7 +387,7 @@ class ContentGenerator:
        except Exception as e:
            self.logger.error(f"保存结果到文件时出错: {e}")
            # 尝试创建一个简单的备用配置
-            fallback_data = [{"main_title": " ", "texts": [" ", " "], "index": 1}]
+            fallback_data = [{"main_title": "", "texts": ["", ""], "index": 1}]
            
            # 保存备用数据
            result_path = os.path.join(output_dir, f"{date_time}_fallback.json")
@ -359,9 +409,19 @@ class ContentGenerator:
        返回:
            修复后的数据
        """
-        return data
        fixed_data = []
        
+        # 记录原始数据格式信息
+        self.logger.info(f"验证和修复数据，原始数据类型: {type(data)}")
+        if isinstance(data, list):
+            self.logger.info(f"原始数据是列表，长度: {len(data)}")
+            if len(data) > 0:
+                self.logger.info(f"第一个元素类型: {type(data[0])}")
+        elif isinstance(data, str):
+            self.logger.info(f"原始数据是字符串: {data[:100]}")
+        else:
+            self.logger.info(f"原始数据是其他类型: {data}")
+        
        # 如果数据是列表
        if isinstance(data, list):
            for i, item in enumerate(data):
@ -370,41 +430,49 @@ class ContentGenerator:
                    # 确保必需字段存在
                    fixed_item = {
                        "index": item.get("index", i + 1),
-                        "main_title": item.get("main_title", f""),
-                        "texts": item.get("texts", [" ", " "])
+                        "main_title": item.get("main_title", ""),
+                        "texts": item.get("texts", ["", ""])
                    }
                    
                    # 确保texts是列表格式
                    if not isinstance(fixed_item["texts"], list):
                        if isinstance(fixed_item["texts"], str):
-                            fixed_item["texts"] = [fixed_item["texts"], " "]
+                            fixed_item["texts"] = [fixed_item["texts"], ""]
                        else:
-                            fixed_item["texts"] = [" ", " "]
+                            fixed_item["texts"] = ["", ""]
                            
                    # 限制texts最多包含两个元素
                    if len(fixed_item["texts"]) > 2:
                        fixed_item["texts"] = fixed_item["texts"][:2]
                    elif len(fixed_item["texts"]) < 2:
                        while len(fixed_item["texts"]) < 2:
-                            fixed_item["texts"].append(" ")
+                            fixed_item["texts"].append("")
                    
                    fixed_data.append(fixed_item)
                
                # 如果项目是字符串（可能是错误格式的texts值）
                elif isinstance(item, str):
-                    self.logger.warning(f"配置项 {i+1} 是字符串格式，将转换为标准格式")
+                    self.logger.warning(f"配置项 {i+1} 是字符串格式: '{item}'，将转换为标准格式")
+                    
+                    # 尝试解析字符串格式，例如"性价比|必打卡"
+                    texts = []
+                    if "|" in item:
+                        texts = item.split("|")
+                    else:
+                        texts = [item, ""]
+                    
                    fixed_item = {
                        "index": i + 1,
-                        "main_title": f"",
-                        "texts": [item, " "]
+                        "main_title": "",
+                        "texts": texts
                    }
                    fixed_data.append(fixed_item)
                else:
                    self.logger.warning(f"配置项 {i+1} 格式不支持: {type(item)}，将使用默认值")
                    fixed_data.append({
                        "index": i + 1,
-                        "main_title": f"",
-                        "texts": [" ", " "]
+                        "main_title": "",
+                        "texts": ["", ""]
                    })
        
        # 如果数据是字典
@ -412,42 +480,60 @@ class ContentGenerator:
            fixed_item = {
                "index": data.get("index", 1),
                "main_title": data.get("main_title", ""),
-                "texts": data.get("texts", [" ", " "])
+                "texts": data.get("texts", ["", ""])
            }
            
            # 确保texts是列表格式
            if not isinstance(fixed_item["texts"], list):
                if isinstance(fixed_item["texts"], str):
-                    fixed_item["texts"] = [fixed_item["texts"], " "]
+                    fixed_item["texts"] = [fixed_item["texts"], ""]
                else:
-                    fixed_item["texts"] = [" ", " "]
+                    fixed_item["texts"] = ["", ""]
                    
            # 限制texts最多包含两个元素
            if len(fixed_item["texts"]) > 2:
                fixed_item["texts"] = fixed_item["texts"][:2]
            elif len(fixed_item["texts"]) < 2:
                while len(fixed_item["texts"]) < 2:
-                    fixed_item["texts"].append(" ")
+                    fixed_item["texts"].append("")
            
            fixed_data.append(fixed_item)
        
-        # 如果数据是字符串或其他格式
+        # 如果数据是字符串
+        elif isinstance(data, str):
+            self.logger.warning(f"数据是字符串格式: '{data}'，尝试转换为标准格式")
+            
+            # 尝试解析字符串格式，例如"性价比|必打卡"
+            texts = []
+            if "|" in data:
+                texts = data.split("|")
+            else:
+                texts = [data, ""]
+                
+            fixed_data.append({
+                "index": 1,
+                "main_title": "",
+                "texts": texts
+            })
+        
+        # 如果数据是其他格式
        else:
            self.logger.warning(f"数据格式不支持: {type(data)}，将使用默认值")
            fixed_data.append({
                "index": 1,
-                "main_title": " ",
-                "texts": [" ", " "]
+                "main_title": "",
+                "texts": ["", ""]
            })
        
        # 确保至少有一个配置项
        if not fixed_data:
            fixed_data.append({
                "index": 1,
-                "main_title": " ",
-                "texts": [" ", " "]
+                "main_title": "",
+                "texts": ["", ""]
            })
            
+        self.logger.info(f"修复后的数据: {fixed_data}")
        return fixed_data

    def run(self, info_directory, poster_num, content_data, system_prompt=None,
@ -492,10 +578,9 @@ class ContentGenerator:
            result_data = self.split_content(full_response)
            
            # 验证并修复数据
-            # fixed_data = self._validate_and_fix_data(result_data)
-            fixed_data = result_data
+            fixed_data = self._validate_and_fix_data(result_data)
            
-            self.logger.info(f"成功生成并修复海报配置数据，包含 {len(fixed_data)} 个项目")
+            self.logger.info(f"成功生成并修复海报配置数据，包含 {len(fixed_data) if isinstance(fixed_data, list) else 1} 个项目")
            return fixed_data
                
        except Exception as e:
@ -508,8 +593,8 @@ class ContentGenerator:
            for i in range(poster_num):
                default_configs.append({
                    "index": i + 1,
-                    "main_title": f" ",
-                    "texts": [" ", " "]
+                    "main_title": "",
+                    "texts": ["", ""]
                })
            return default_configs