diff --git a/utils/__pycache__/content_judger.cpython-312.pyc b/utils/__pycache__/content_judger.cpython-312.pyc index 4e7640e..c99420d 100644 Binary files a/utils/__pycache__/content_judger.cpython-312.pyc and b/utils/__pycache__/content_judger.cpython-312.pyc differ diff --git a/utils/__pycache__/output_handler.cpython-312.pyc b/utils/__pycache__/output_handler.cpython-312.pyc index 03ae793..293bda0 100644 Binary files a/utils/__pycache__/output_handler.cpython-312.pyc and b/utils/__pycache__/output_handler.cpython-312.pyc differ diff --git a/utils/__pycache__/tweet_generator.cpython-312.pyc b/utils/__pycache__/tweet_generator.cpython-312.pyc index 6d6effd..7004369 100644 Binary files a/utils/__pycache__/tweet_generator.cpython-312.pyc and b/utils/__pycache__/tweet_generator.cpython-312.pyc differ diff --git a/utils/content_judger.py b/utils/content_judger.py index 813c6d7..a4e688b 100644 --- a/utils/content_judger.py +++ b/utils/content_judger.py @@ -68,7 +68,7 @@ class ContentJudger: 7. 面向人群保留:请尽量保留文案原本的面向人群和风格,这是同一产品面向多种人群营销的策略。例如产品资料中写明亲子游时,文案写"为情侣定制的山水秘境"是可以接受的。 8. 案例如下,请参考案例评判真假信息的尺度,逐行逐句仔细分析不符点和修改思路,并按照分析思路落实对每一处不符的修改措施,严格审查每一篇文案: [ - "产品资料": +"产品资料": "周末不加收【南沙越秀喜来登】1088元/套,豪华客房1间1晚+双人自助早餐+自助晚餐+2大1小水鸟世界门票,免费儿童乐园,户外泳池+健身房~ 不想待在家,又想带娃出去玩?更不想开长途车、人挤人?为你推荐路程短、不塞车、景点多、坐地铁就能直达的溜娃地! 南沙越秀喜来登是广州南沙区首家国际品牌酒店,坐拥广州南大门,拥有得天独厚的中心位置,可俯瞰蕉门河美景,车程短,不出广州也能玩! @@ -98,7 +98,7 @@ class ContentJudger: 酒店地址:广东省广州市南沙区海熙大街79-80号 导航关键词:广州南沙越秀喜来登酒店" - + "生成文案": "title": "五一遛娃👶必囤!南沙喜来登1088元住景观房+双早+门票", "content": "五一不想挤人潮?南沙这家酒店直接承包遛娃+度假双重快乐‼️\n地铁直达!2大1小1088元住景观房,含双早+自助晚餐+水鸟世界门票,儿童乐园/泳池/健身房全开放!\n🌟【遛娃刚需全配齐】\n✅ 儿童乐园:10:00-20:00全程开放,滑梯/积木/绘本一应俱全\n✅ 户外泳池:9:00-18:00恒温开放(五一期间每日消毒3次)\n✅ 健身房:8:00-22:00配备亲子瑜伽课程(需提前预约)\n\n📍【1小时玩转南沙】\n① 南沙天后宫(车程20分钟):穿汉服拍大片,听妈祖传说涨知识\n② 南沙湿地公园(40分钟):5月芦苇摇曳,带娃认鸟类+乘船探秘\n③ 十九涌海鲜街(45分钟):现捞现煮生猛海鲜,人均50元吃到撑\n\n🍽️【家长友好细节】\n• 自助晚餐隐藏彩蛋:儿童餐区设独立洗手台+热食保温柜\n• 房内配置:加厚床垫/卡通洗漱杯/尿布台(无需额外购买)\n• 安全保障:全区域监控+24小时安保巡逻\n\n🎁【五一专属加码】\n5月1-5日期间入住,凭房卡可免费领取儿童防晒冰袖+湿巾礼包\n\n📌Tips:\n1. 周一至周四仅限双床房型,周五起可选大床房\n2. 水鸟世界门票需提前1小时至前台领取纸质票\n3. 地铁四号线金洲站下车,打车15分钟直达酒店\n\n这个五一,南沙喜来登让你躺着遛娃!不用长途跋涉,家门口就能玩出仪式感~" @@ -508,7 +508,7 @@ class ContentJudger: "content_base64": base64.b64encode(content_text.encode('utf-8')).decode('utf-8'), "analysis": f"内容审核失败: {error_msg}", "analysis_base64": base64.b64encode(f"内容审核失败: {error_msg}".encode('utf-8')).decode('utf-8') - } + } def _build_user_prompt(self, product_info, content_gen): """ diff --git a/utils/output_handler.py b/utils/output_handler.py index 784d5e1..4f1f25a 100644 --- a/utils/output_handler.py +++ b/utils/output_handler.py @@ -145,173 +145,155 @@ class FileSystemOutputHandler(OutputHandler): import copy input_data = copy.deepcopy(content_data) - # 统一使用tags字段,避免tag和tags重复 - if "tag" in input_data and "tags" not in input_data: - # 只有tag字段存在,复制到tags - input_data["tags"] = input_data["tag"] - elif "tag" in input_data and "tags" in input_data: - # 两个字段都存在,保留tags - pass - - # 确保即使在未启用审核的情况下,字段也保持一致 - if not input_data.get("judged", False): - input_data["judged"] = False - # 添加original字段(临时),值为当前值 - if "title" in input_data and "original_title" not in input_data: - input_data["original_title"] = input_data["title"] - if "content" in input_data and "original_content" not in input_data: - input_data["original_content"] = input_data["content"] - if "tags" in input_data and "original_tags" not in input_data: - input_data["original_tags"] = input_data["tags"] - - # 统一审核分析字段,优先使用judge_analysis,其次使用不良内容分析 - if "judge_analysis" not in input_data and "不良内容分析" in input_data: - input_data["judge_analysis"] = input_data["不良内容分析"] - elif "不良内容分析" not in input_data and "judge_analysis" in input_data: - input_data["不良内容分析"] = input_data["judge_analysis"] - - # 保存原始值用于txt文件生成和调试 - original_title = input_data.get("original_title", input_data.get("title", "")) - original_content = input_data.get("original_content", input_data.get("content", "")) - original_tags = input_data.get("original_tags", input_data.get("tags", "")) - judge_title = input_data.get("title", "") - judge_content = input_data.get("content", "") - judge_tags = input_data.get("tags", "") - original_judge_analysis = input_data.get("judge_analysis", "") - - # 创建一个只包含元数据和base64编码的输出数据对象 + # 简化输出数据结构,只保留必要的元数据和base64编码内容 output_data = { - # 保留元数据字段 "judged": input_data.get("judged", False), - "judge_success": input_data.get("judge_success", False) + "judge_success": input_data.get("judge_success", False), + "error": input_data.get("error", False) } - # 为所有内容字段创建base64编码版本 + # 检查并处理内容字段,确保全部以base64编码保存 try: - # 1. 标题和内容 - if "title" in input_data and input_data["title"]: - output_data["title_base64"] = base64.b64encode(input_data["title"].encode('utf-8')).decode('ascii') + # 检查是否已经是base64编码的字段 + def is_base64(s): + if not isinstance(s, str): + return False + try: + # 尝试解码看是否成功 + base64.b64decode(s).decode('utf-8') + # 如果能成功解码,而且是标准base64长度(4的倍数),则可能是base64 + return len(s) % 4 == 0 and all(c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=' for c in s) + except: + return False - if "content" in input_data and input_data["content"]: - output_data["content_base64"] = base64.b64encode(input_data["content"].encode('utf-8')).decode('ascii') + # 1. 处理标题和内容 + if "title" in input_data: + if is_base64(input_data["title"]): + # 已经是base64编码,直接使用 + output_data["title_base64"] = input_data["title"] + # 尝试解码用于txt文件 + try: + title_text = base64.b64decode(input_data["title"]).decode('utf-8') + except: + title_text = input_data["title"] + else: + # 未编码,进行base64编码 + title_text = input_data["title"] + output_data["title_base64"] = base64.b64encode(title_text.encode('utf-8')).decode('ascii') - # 2. 标签 - if "tags" in input_data and input_data["tags"]: - output_data["tags_base64"] = base64.b64encode(input_data["tags"].encode('utf-8')).decode('ascii') + if "content" in input_data: + if is_base64(input_data["content"]): + # 已经是base64编码,直接使用 + output_data["content_base64"] = input_data["content"] + # 尝试解码用于txt文件 + try: + content_text = base64.b64decode(input_data["content"]).decode('utf-8') + except: + content_text = input_data["content"] + else: + # 未编码,进行base64编码 + content_text = input_data["content"] + output_data["content_base64"] = base64.b64encode(content_text.encode('utf-8')).decode('ascii') - # 3. 原始内容 - if "original_title" in input_data and input_data["original_title"]: - output_data["original_title_base64"] = base64.b64encode(input_data["original_title"].encode('utf-8')).decode('ascii') + # 2. 处理标签 + tags_text = input_data.get("tags", input_data.get("tag", "")) + if tags_text: + if is_base64(tags_text): + output_data["tags_base64"] = tags_text + try: + tags_text = base64.b64decode(tags_text).decode('utf-8') + except: + pass + else: + output_data["tags_base64"] = base64.b64encode(tags_text.encode('utf-8')).decode('ascii') - if "original_content" in input_data and input_data["original_content"]: - output_data["original_content_base64"] = base64.b64encode(input_data["original_content"].encode('utf-8')).decode('ascii') - - # 4. 原始标签 - if "original_tags" in input_data and input_data["original_tags"]: - output_data["original_tags_base64"] = base64.b64encode(input_data["original_tags"].encode('utf-8')).decode('ascii') - - # 5. 审核分析 - 检查judge_analysis和不良内容分析两个字段 - judge_analysis = input_data.get("judge_analysis", input_data.get("不良内容分析", "")) - if judge_analysis: - output_data["judge_analysis_base64"] = base64.b64encode(judge_analysis.encode('utf-8')).decode('ascii') - - logging.info("成功添加Base64编码内容") + # 3. 处理分析 + analysis_text = input_data.get("analysis", input_data.get("judge_analysis", "")) + if analysis_text: + if is_base64(analysis_text): + output_data["analysis_base64"] = analysis_text + try: + analysis_text = base64.b64decode(analysis_text).decode('utf-8') + except: + pass + else: + output_data["analysis_base64"] = base64.b64encode(analysis_text.encode('utf-8')).decode('ascii') + + logging.info("成功处理内容并添加Base64编码") except Exception as e: - logging.error(f"Base64编码内容时出错: {e}") - - # 保存可能有用的额外字段 - if "error" in input_data: - output_data["error"] = input_data["error"] + logging.error(f"处理内容或Base64编码时出错: {e}") - # 保存统一格式的article.json (只包含base64编码和元数据) + # 保存处理后的article.json content_path = os.path.join(variant_dir, "article.json") try: with open(content_path, "w", encoding="utf-8") as f: - # 使用标准json - json.dump(output_data, f, ensure_ascii=False, indent=4, ignore_nan=True) + # 使用标准json并确保正确处理中文和特殊字符 + json.dump(output_data, f, ensure_ascii=False, indent=4) logging.info(f"Content JSON saved to: {content_path}") except Exception as e: logging.exception(f"Failed to save content JSON to {content_path}: {e}") - # 创建一份article.txt文件以便直接查看 + # 创建一份article.txt文件,使用解码后的文本 txt_path = os.path.join(variant_dir, "article.txt") try: - # 重新组织内容显示,明确区分原始内容和审核后内容 - with open(txt_path, "w", encoding="utf-8") as f: - # 根据审核状态决定显示哪些内容 - is_judged = input_data.get("judged", False) - is_judge_success = input_data.get("judge_success", False) - - if is_judged and is_judge_success: - # 显示审核后的内容 - f.write(f"{judge_title}\n\n") - if judge_content: - f.write(judge_content) - if judge_tags: - f.write(f"\n\n{judge_tags}") - - # 在最后添加原始内容作为参考 - if original_title != judge_title or original_content != judge_content: - f.write("\n\n=== 原始内容 ===\n") - f.write(f"{original_title}\n\n") - if original_content: - f.write(original_content) - if original_tags and original_tags != judge_tags: - f.write(f"\n\n{original_tags}") - elif is_judged and not is_judge_success: - # 审核失败,显示审核失败信息和原始内容 - f.write("审核失败\n\n") - f.write(f"{original_title}\n\n") - if original_content: - f.write(original_content) - if original_tags: - f.write(f"\n\n{original_tags}") - else: - # 未审核,直接显示原始内容 - f.write(f"{original_title}\n\n") - if original_content: - f.write(original_content) - if original_tags: - f.write(f"\n\n{original_tags}") - - # 添加审核分析信息(如果有) - if original_judge_analysis: - f.write(f"\n\n=== 审核分析 ===\n{original_judge_analysis}") + is_judged = output_data.get("judged", False) + is_judge_success = output_data.get("judge_success", False) + # 确保我们有可用的文本版本 + title_text = title_text if 'title_text' in locals() else "未找到标题" + content_text = content_text if 'content_text' in locals() else "未找到内容" + tags_text = tags_text if 'tags_text' in locals() else "" + + with open(txt_path, "w", encoding="utf-8") as f: + # 根据审核状态决定显示内容 + if is_judged and is_judge_success: + f.write(f"{title_text}\n\n") + f.write(content_text) + if tags_text: + f.write(f"\n\n{tags_text}") + else: + # 未审核或审核未通过 + if not is_judged: + f.write(f"{title_text}\n\n") + else: + # 审核失败 + f.write(f"审核失败\n\n{title_text}\n\n") + f.write(content_text) + if tags_text: + f.write(f"\n\n{tags_text}") + + # 添加审核分析 + if 'analysis_text' in locals() and analysis_text: + f.write(f"\n\n=== 审核分析 ===\n{analysis_text}") + logging.info(f"Article text saved to: {txt_path}") except Exception as e: logging.error(f"Failed to save article.txt: {e}") - # 记录调试信息,无论是否成功 (包含原始数据的完整副本以便调试) + # 保存调试信息 debug_path = os.path.join(variant_dir, "debug_content.txt") try: with open(debug_path, "w", encoding="utf-8") as f: - f.write(f"原始标题: {original_title}\n\n") - f.write(f"原始内容: {original_content}\n\n") - if original_tags: - f.write(f"原始标签: {original_tags}\n\n") - - if is_judged: - f.write(f"审核状态: {'成功' if is_judge_success else '失败'}\n") - if is_judge_success: - f.write(f"审核后标题: {judge_title}\n\n") - f.write(f"审核后内容: {judge_content}\n\n") - - if original_judge_analysis: - f.write(f"审核分析: {original_judge_analysis}\n\n") - - f.write("---处理后---\n\n") + f.write(f"处理前内容信息:\n") + f.write(f"标题: {input_data.get('title', '未提供')[:200]}...\n\n") + f.write(f"内容: {input_data.get('content', '未提供')[:200]}...\n\n") + f.write(f"标签: {input_data.get('tags', input_data.get('tag', '未提供'))}\n\n") + f.write(f"审核状态: judged={input_data.get('judged', False)}, judge_success={input_data.get('judge_success', False)}\n\n") + + f.write("处理后JSON输出字段:\n") for key, value in output_data.items(): - if isinstance(value, str): - f.write(f"{key}: (length: {len(value)})\n") - f.write(f"{repr(value[:200])}...\n\n") - else: - f.write(f"{key}: {type(value)}\n") + value_preview = str(value)[:100] + "..." if isinstance(value, str) and len(str(value)) > 100 else value + f.write(f"{key}: {value_preview}\n") + + f.write("\n解码后文本内容:\n") + f.write(f"标题: {title_text if 'title_text' in locals() else '未解码'}\n\n") + f.write(f"内容: {content_text[:200] if 'content_text' in locals() else '未解码'}...\n") + logging.info(f"调试内容已保存到: {debug_path}") except Exception as debug_err: logging.error(f"保存调试内容失败: {debug_err}") - # Save content prompt + # 保存提示词 prompt_path = os.path.join(variant_dir, "tweet_prompt.txt") try: with open(prompt_path, "w", encoding="utf-8") as f: diff --git a/utils/tweet_generator.py b/utils/tweet_generator.py index e03f417..f35290f 100644 --- a/utils/tweet_generator.py +++ b/utils/tweet_generator.py @@ -11,6 +11,7 @@ import sys import traceback import logging # Add logging import re +import base64 # sys.path.append('/root/autodl-tmp') # No longer needed if running as a module or if path is set correctly # 从本地模块导入 # from TravelContentCreator.core.ai_agent import AI_Agent # Remove project name prefix @@ -546,26 +547,21 @@ def generate_content_for_topic(ai_agent: AI_Agent, if judged_result and isinstance(judged_result, dict): if "title" in judged_result and "content" in judged_result: # 保存原始标题和内容 - content_json["original_title"] = content_json.get("title", "") - content_json["original_content"] = content_json.get("content", "") + # content_json["original_title"] = content_json.get("title", "") + # content_json["original_content"] = content_json.get("content", "") # 保存原始标签(优先使用tags,如果没有则使用tag) - original_tags = content_json.get("tags", content_json.get("tag", "")) - content_json["original_tags"] = original_tags + tags = content_json.get("tags", content_json.get("tag", "")) + content_json["tags"] = base64.b64encode(tags.encode('utf-8')).decode('utf-8') # 更新为审核后的内容 - content_json["title"] = judged_result["title"] - content_json["content"] = judged_result["content"] - # 保留原始标签,避免重复 - content_json["tags"] = original_tags - # 删除可能存在的重复tag字段 - if "tag" in content_json: - del content_json["tag"] + content_json["title"] = judged_result["title_base64"] + content_json["content"] = judged_result["content_base64"] # 添加审核标记 content_json["judged"] = True # 添加judge_success状态 content_json["judge_success"] = judged_result.get("judge_success", False) # 处理分析结果,优先使用"analysis"字段,兼容"不良内容分析"字段 if "analysis" in judged_result: - content_json["judge_analysis"] = judged_result["analysis"] + content_json["analysis"] = judged_result["analysis_base64"] else: logging.warning(f" 审核结果缺少title或content字段,保留原内容") content_json["judge_success"] = False