正确修改了下采样方式

2025-07-09 14:53:24 +08:00 · 2025-07-09 14:53:24 +08:00 · 44c57349f5
commit 44c57349f5
parent d4d23068e5
2 changed files with 50 additions and 16 deletions
--- a/utils/pycache/prompts.cpython-312.pyc
+++ b/utils/pycache/prompts.cpython-312.pyc
--- a/utils/prompts.py
+++ b/utils/prompts.py
@@ -140,8 +140,30 @@ class BasePromptBuilder(PromptTemplate):
                    data = json.load(f)
                
                if "examples" in data and isinstance(data["examples"], list):
-                    formatted_examples = [f"- {item.get('content', '')}" for item in data["examples"]]
-                    return f"参考标题列表:\n" + "\n".join(formatted_examples)
+                    examples = data["examples"]
+                    return f"参考标题列表:\n" + "\n".join([f"- {item.get('content', '')}" for item in examples])
+                else:
+                    return json.dumps(data, ensure_ascii=False, indent=2)
+            except Exception as e:
+                logger.error(f"解析或格式化JSON文件 '{path}' 失败: {e}")
+                return f"加载文件 '{path.name}' 失败。"
+        else:
+            return path.read_text('utf-8')
+            
+    def _load_and_format_content_with_sampling(self, path: Path, sampling_rate: float) -> str:
+        """根据文件类型加载和格式化内容，并应用采样率"""
+        if path.suffix == '.json':
+            try:
+                with path.open('r', encoding='utf-8') as f:
+                    data = json.load(f)
+                
+                if "examples" in data and isinstance(data["examples"], list):
+                    examples = data["examples"]
+                    # 应用采样率
+                    sample_size = max(1, int(len(examples) * sampling_rate))
+                    sampled_examples = random.sample(examples, sample_size)
+                    logger.info(f"文件 '{path.name}' 中的examples采样: {sample_size}/{len(examples)} (采样率: {sampling_rate:.2f})")
+                    return f"参考标题列表:\n" + "\n".join([f"- {item.get('content', '')}" for item in sampled_examples])
                else:
                    return json.dumps(data, ensure_ascii=False, indent=2)
            except Exception as e:
@@ -187,25 +209,37 @@ class BasePromptBuilder(PromptTemplate):

                full_path = self._get_full_path(path_str)
                
-                files_to_read = []
+                # 简化逻辑：对于单个文件，直接应用采样率决定是否加载
                if full_path.is_file():
-                    if random.random() < sampling_rate:
-                        files_to_read.append(full_path)
-                        logger.info(f"文件 '{path_str}' 采样成功 (采样率: {sampling_rate})")
+                    # 对于JSON文件，对内容进行采样
+                    if full_path.suffix == '.json':
+                        file_content = self._load_and_format_content_with_sampling(full_path, sampling_rate)
+                        content_parts.append(f"--- {full_path.name} ---\n{file_content}")
+                        logger.info(f"加载JSON文件 '{path_str}' 并应用内部采样")
+                    # 对于其他文件，根据采样率决定是否完全加载
+                    elif random.random() < sampling_rate:
+                        file_content = self._load_and_format_content(full_path)
+                        content_parts.append(f"--- {full_path.name} ---\n{file_content}")
+                        logger.info(f"文件 '{path_str}' 采样成功 (采样率: {sampling_rate:.2f})")
                    else:
-                        logger.info(f"文件 '{path_str}' 采样失败 (采样率: {sampling_rate})")
+                        logger.info(f"文件 '{path_str}' 采样失败 (采样率: {sampling_rate:.2f})")
+                # 对于目录，直接选择指定比例的文件
                elif full_path.is_dir():
                    all_files = sorted(p for p in full_path.iterdir() if p.is_file())
-                    if sampling_rate < 1.0:
-                        num_to_sample = max(1, int(len(all_files) * sampling_rate))
-                        files_to_read = random.sample(all_files, num_to_sample)
-                        logger.info(f"对目录 '{path_str}' 进行采样 (采样率: {sampling_rate})，选取 {len(files_to_read)}/{len(all_files)} 个文件。")
+                    if all_files:
+                        if sampling_rate < 1.0:
+                            sample_size = max(1, int(len(all_files) * sampling_rate))
+                            files_to_read = random.sample(all_files, sample_size)
+                            logger.info(f"目录 '{path_str}' 采样: {sample_size}/{len(all_files)} 个文件 (采样率: {sampling_rate:.2f})")
+                        else:
+                            files_to_read = all_files
+                            logger.info(f"目录 '{path_str}' 全部加载: {len(all_files)} 个文件")
+                        
+                        for f_path in files_to_read:
+                            file_content = self._load_and_format_content(f_path)
+                            content_parts.append(f"--- {f_path.name} ---\n{file_content}")
                    else:
-                        files_to_read = all_files
-                
-                for f_path in files_to_read:
-                    file_content = self._load_and_format_content(f_path)
-                    content_parts.append(f"--- {f_path.name} ---\n{file_content}")
+                        logger.warning(f"目录 '{path_str}' 中没有文件")

            except Exception as e:
                logger.error(f"加载Refer资源 '{ref_item}' 失败: {e}", exc_info=True)