"""Check whether a JSONL file meets the batch API's format requirements."""

import json
import os

# Size limits enforced by the checks below.
_BYTES_PER_MB = 1024 * 1024
_MAX_FILE_BYTES = 500 * _BYTES_PER_MB
_MAX_LINE_BYTES = 6 * _BYTES_PER_MB

# Keys every request object must carry at its top level.
_REQUIRED_FIELDS = ("custom_id", "method", "url", "body")


def _check_body(body) -> bool:
    """Report on the ``body`` value of one request; return True if it passes."""
    # Without this guard ``"model" in body`` raises TypeError for null/numbers
    # and silently degrades to a substring test for strings.
    if not isinstance(body, dict):
        print(" - [ERROR] body必须是JSON对象")
        return False

    valid = True

    if "model" in body:
        print(f" - model: {body['model']}")
    else:
        print(" - [ERROR] body中缺少model字段")
        valid = False

    if "messages" not in body:
        print(" - [ERROR] body中缺少messages字段")
        valid = False
    elif isinstance(body["messages"], list):
        print(f" - messages: {len(body['messages'])} 条消息")
    else:
        # len() would crash on a number and count characters of a string.
        print(" - [ERROR] body中messages字段必须是数组")
        valid = False

    return valid


def _check_record(line: str) -> bool:
    """Parse one non-blank line as JSON and report on its required fields."""
    try:
        obj = json.loads(line)
    except json.JSONDecodeError as exc:
        print(f" - [ERROR] JSON解析失败: {exc}")
        return False
    print(" - JSON格式: [OK]")

    # Scalars such as ``5`` or ``null`` are valid JSON but not request objects;
    # membership tests on them would raise TypeError.
    if not isinstance(obj, dict):
        print(" - [ERROR] JSON顶层必须是对象")
        return False

    valid = True
    missing_fields = [field for field in _REQUIRED_FIELDS if field not in obj]
    if missing_fields:
        print(f" - [ERROR] 缺少必需字段: {missing_fields}")
        valid = False
    else:
        print(" - 必需字段: [OK]")
        print(f" - custom_id: {obj['custom_id']}")
        print(f" - method: {obj['method']}")
        print(f" - url: {obj['url']}")

    # The body is inspected even when other required fields are missing, so a
    # single run reports as many problems as possible.
    if "body" in obj and not _check_body(obj["body"]):
        valid = False

    return valid


def _check_line(line_no: int, line: str) -> bool:
    """Report on a single line of the file; return True if it is valid."""
    line_size = len(line.encode("utf-8"))
    line_size_mb = line_size / _BYTES_PER_MB
    print(f"第 {line_no} 行:")
    print(f" - 长度: {line_size} 字节 ({line_size_mb:.2f} MB)")

    if not line.strip():
        print(" - [ERROR] 空白行!API不允许空白行")
        return False

    if line_size > _MAX_LINE_BYTES:
        print(" - [ERROR] 单行超过6MB限制!")
        return False

    valid = _check_record(line)
    print()
    return valid


def check_jsonl_file(file_path: str) -> bool:
    """Validate a JSONL request file and print a line-by-line report.

    The file must exist, be at most 500 MB, and decode as UTF-8. Every line
    must be non-blank, at most 6 MB when encoded as UTF-8, and hold a JSON
    object with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    ``body`` must itself be an object containing ``model`` and a ``messages``
    list.

    Args:
        file_path: Path of the JSONL file to check.

    Returns:
        True when every check passes, False otherwise. Problems are printed
        to stdout rather than raised.
    """
    print("=" * 60)
    print("检查 JSONL 文件格式")
    print("=" * 60)
    print(f"文件: {file_path}")
    print()

    if not os.path.exists(file_path):
        print("[ERROR] 文件不存在!")
        return False

    file_size = os.path.getsize(file_path)
    file_size_mb = file_size / _BYTES_PER_MB
    print(f"文件大小: {file_size} 字节 ({file_size_mb:.2f} MB)")

    if file_size > _MAX_FILE_BYTES:
        print("[ERROR] 文件大小超过500MB限制!")
        return False

    print()
    print("-" * 60)
    print("逐行检查:")
    print("-" * 60)

    # The whole file is read up front because the total line count is printed
    # before the per-line report; the size cap above bounds the memory used.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError as exc:
        print(f"[ERROR] 文件不是有效的UTF-8编码: {exc}")
        return False

    print(f"总行数: {len(lines)}")
    print()

    all_valid = True
    for line_no, line in enumerate(lines, 1):
        # Keep checking after a failure so every bad line gets reported.
        if not _check_line(line_no, line):
            all_valid = False

    print("=" * 60)
    if all_valid:
        print("[SUCCESS] 文件格式检查通过!")
    else:
        print("[FAILED] 文件格式存在问题,请修复后再试")
    print("=" * 60)

    return all_valid


if __name__ == "__main__":
    check_jsonl_file("demo/test_model.jsonl")