"""Copy the ``tag`` field from ``article.json`` into ``article_detect.json``.

The script walks a directory tree laid out as::

    <base_dir>/四季梦幻<N>/<date folder>/<N>_<anything>/article.json
                                                     /article_detect.json

For every article folder that holds both JSON files, the ``tag`` value of
``article.json`` is written into ``article_detect.json`` (other keys of the
detect file are left untouched).  Progress is reported on stdout.
"""

import glob
import json
import os
import re

# Name prefix shared by every top-level folder this script processes.
_FOLDER_PREFIX = "四季梦幻"

# Captures the numeric suffix of a top-level folder, e.g. "四季梦幻1" -> "1".
# Used with ``match`` so the number must directly follow the prefix.
_FOLDER_NUMBER_RE = re.compile(r"四季梦幻(\d+)")


def _write_json_atomic(path, data):
    """Serialize *data* as JSON to *path* without ever leaving it truncated.

    The payload is written to a sibling temporary file first and then moved
    over *path* with ``os.replace``, so a failure while writing keeps the
    previous content of *path* intact.

    Raises:
        OSError: if the temporary file cannot be written or moved.
        TypeError, ValueError: if *data* is not JSON-serializable.
    """
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftovers of a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _copy_tag(sub_dir):
    """Copy ``tag`` from ``article.json`` to ``article_detect.json`` in *sub_dir*.

    Problems with a single folder (missing files, unreadable or malformed
    JSON, no ``tag`` key) are reported on stdout and do not raise, so one bad
    folder never aborts the whole batch.

    Returns:
        True if ``article_detect.json`` was updated, False otherwise.
    """
    article_path = os.path.join(sub_dir, "article.json")
    article_detect_path = os.path.join(sub_dir, "article_detect.json")

    if not os.path.exists(article_path) or not os.path.exists(article_detect_path):
        print(f"跳过 {sub_dir}:文件不存在")
        return False

    try:
        with open(article_path, "r", encoding="utf-8") as f:
            article_data = json.load(f)
        with open(article_detect_path, "r", encoding="utf-8") as f:
            article_detect_data = json.load(f)

        if "tag" not in article_data:
            print(f"跳过 {sub_dir}:没有tag字段")
            return False

        article_detect_data["tag"] = article_data["tag"]
        _write_json_atomic(article_detect_path, article_detect_data)
    except (OSError, ValueError, TypeError) as e:
        # OSError: I/O failure; ValueError: invalid JSON or bad encoding;
        # TypeError: a file whose top-level JSON value is not an object.
        print(f"处理 {sub_dir} 时出错: {e}")
        return False
    return True


def _subfolder_prefix(dream_folder):
    """Return the glob prefix of article folders that belong to *dream_folder*.

    "四季梦幻1" yields "1_".  When no number follows the folder prefix the
    wildcard prefix "*_" is returned instead.
    """
    number = _FOLDER_NUMBER_RE.match(dream_folder)
    if number:
        return number.group(1) + "_"
    return "*_"


def copy_tags_for_all_folders(base_dir="Content_detector"):
    """Propagate article tags into the detection results under *base_dir*.

    Args:
        base_dir: Root directory containing the "四季梦幻*" folders.  Defaults
            to ``"Content_detector"`` relative to the current working
            directory.

    Returns:
        None.  A summary of the processed files is printed to stdout.
    """
    if not os.path.isdir(base_dir):
        # Nothing to do; report it instead of crashing inside os.listdir.
        print(f"基础目录不存在: {base_dir}")
        return

    # Sorted so that the processing order (and the log) is reproducible.
    dream_folders = sorted(
        name
        for name in os.listdir(base_dir)
        if name.startswith(_FOLDER_PREFIX)
        and os.path.isdir(os.path.join(base_dir, name))
    )
    print(f"找到以下'四季梦幻'文件夹:{dream_folders}")

    total_processed = 0

    for dream_folder in dream_folders:
        dream_path = os.path.join(base_dir, dream_folder)
        subfolder_prefix = _subfolder_prefix(dream_folder)
        print(f"处理 {dream_folder},子文件夹前缀: {subfolder_prefix}")

        # glob.escape keeps directory names containing "[", "]", "*" or "?"
        # from being interpreted as glob patterns.
        date_dirs = sorted(glob.glob(os.path.join(glob.escape(dream_path), "*")))

        for date_dir in date_dirs:
            if not os.path.isdir(date_dir):
                continue

            sub_dirs = sorted(
                glob.glob(
                    os.path.join(glob.escape(date_dir), f"{subfolder_prefix}*")
                )
            )
            # bool is an int subclass: True counts as 1, False as 0.
            folder_processed = sum(_copy_tag(sub_dir) for sub_dir in sub_dirs)
            total_processed += folder_processed

            print(f"在文件夹 {date_dir} 中处理了 {folder_processed} 个文件")

    print(f"任务完成!总共处理了 {total_processed} 个文件。")


if __name__ == "__main__":
    copy_tags_for_all_folders()