CategorizeLabel/demo/check_jsonl.py
2025-10-15 17:19:26 +08:00

110 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Check whether a JSONL file meets the API's format requirements."""
import json
import os
_MAX_FILE_BYTES = 500 * 1024 * 1024  # whole-file size limit checked below
_MAX_LINE_BYTES = 6 * 1024 * 1024  # per-line size limit checked below
_REQUIRED_FIELDS = ("custom_id", "method", "url", "body")


def _check_request_object(obj):
    """Print the field-level report for one parsed JSONL record.

    Args:
        obj: The value produced by ``json.loads`` for a single line.

    Returns:
        True if ``obj`` is a JSON object containing every required field and
        its ``body`` is an object holding ``model`` and a ``messages`` list;
        False otherwise.
    """
    # json.loads can return a list, str, number, bool or None; membership
    # tests and key lookups below are only meaningful on a dict.
    if not isinstance(obj, dict):
        print(f" - [ERROR] 顶层必须是JSON对象, 实际类型: {type(obj).__name__}")
        return False

    valid = True
    missing_fields = [field for field in _REQUIRED_FIELDS if field not in obj]
    if missing_fields:
        print(f" - [ERROR] 缺少必需字段: {missing_fields}")
        valid = False
    else:
        print(" - 必需字段: [OK]")
        print(f" - custom_id: {obj['custom_id']}")
        print(f" - method: {obj['method']}")
        print(f" - url: {obj['url']}")

    # A missing body has already been reported through missing_fields.
    if "body" not in obj:
        return valid

    body = obj["body"]
    if not isinstance(body, dict):
        print(f" - [ERROR] body必须是JSON对象, 实际类型: {type(body).__name__}")
        return False

    if "model" in body:
        print(f" - model: {body['model']}")
    else:
        print(" - [ERROR] body中缺少model字段")
        valid = False

    if "messages" not in body:
        print(" - [ERROR] body中缺少messages字段")
        valid = False
    elif not isinstance(body["messages"], list):
        # len() on a number would raise, and on a string it would count
        # characters rather than messages.
        print(" - [ERROR] body中的messages字段必须是数组")
        valid = False
    else:
        print(f" - messages: {len(body['messages'])} 条消息")

    return valid


def check_jsonl_file(file_path):
    """Validate a JSONL request file and print a line-by-line report.

    The checks performed are: the path exists and is a regular file, the file
    is at most 500 MB, it decodes as UTF-8, and every line is non-blank, at
    most 6 MB (measured on the UTF-8 encoding of the line as read, newline
    included), parses as JSON, and passes the field checks in
    ``_check_request_object``.

    Args:
        file_path: Path of the JSONL file to inspect.

    Returns:
        True if every check passed, False otherwise. Problems are reported on
        stdout instead of being raised.
    """
    print("=" * 60)
    print("检查 JSONL 文件格式")
    print("=" * 60)
    print(f"文件: {file_path}")
    print()

    if not os.path.exists(file_path):
        print("[ERROR] 文件不存在!")
        return False
    # A directory passes the existence check but cannot be opened as a file.
    if not os.path.isfile(file_path):
        print("[ERROR] 路径不是文件")
        return False

    file_size = os.path.getsize(file_path)
    file_size_mb = file_size / (1024 * 1024)
    print(f"文件大小: {file_size} 字节 ({file_size_mb:.2f} MB)")

    if file_size > _MAX_FILE_BYTES:
        print("[ERROR] 文件大小超过500MB限制")
        return False

    print()
    print("-" * 60)
    print("逐行检查:")
    print("-" * 60)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError as e:
        # Report undecodable input as a validation failure, not a traceback.
        print(f"[ERROR] 文件不是有效的UTF-8编码: {e}")
        return False

    print(f"总行数: {len(lines)}")
    print()

    all_valid = True
    for i, line in enumerate(lines, 1):
        line_size = len(line.encode("utf-8"))
        line_size_mb = line_size / (1024 * 1024)
        print(f"{i} 行:")
        print(f" - 长度: {line_size} 字节 ({line_size_mb:.2f} MB)")

        if not line.strip():
            print(" - [ERROR] 空白行API不允许空白行")
            all_valid = False
            continue

        if line_size > _MAX_LINE_BYTES:
            print(" - [ERROR] 单行超过6MB限制")
            all_valid = False
            continue

        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            print(f" - [ERROR] JSON解析失败: {e}")
            all_valid = False
        else:
            print(" - JSON格式: [OK]")
            if not _check_request_object(obj):
                all_valid = False
        print()

    print("=" * 60)
    if all_valid:
        print("[SUCCESS] 文件格式检查通过!")
    else:
        print("[FAILED] 文件格式存在问题,请修复后再试")
    print("=" * 60)
    return all_valid
if __name__ == "__main__":
    # Script entry point: run the checker on the hard-coded demo file path
    # (relative to the current working directory).
    demo_path = "demo/test_model.jsonl"
    check_jsonl_file(demo_path)