CategorizeLabel/demo/check_jsonl.py

110 lines
3.2 KiB
Python
Raw Normal View History

2025-10-15 17:19:26 +08:00
"""检查JSONL文件是否符合API要求"""
import json
import os
def _check_request_object(obj):
    """Validate one decoded JSONL entry and print its field report.

    Args:
        obj: The value produced by ``json.loads`` for a single line.

    Returns:
        True when ``obj`` is a JSON object containing ``custom_id``,
        ``method``, ``url`` and ``body``, and ``body`` is an object holding
        ``model`` and a list-valued ``messages``. False otherwise; every
        problem found is printed.
    """
    # json.loads may yield any JSON value (number, null, string, list, ...).
    # Membership tests on e.g. int/None raise TypeError, so reject early.
    if not isinstance(obj, dict):
        print(" - [ERROR] JSON顶层必须是对象(dict)")
        return False

    valid = True

    required_fields = ["custom_id", "method", "url", "body"]
    missing_fields = [field for field in required_fields if field not in obj]
    if missing_fields:
        print(f" - [ERROR] 缺少必需字段: {missing_fields}")
        valid = False
    else:
        print(" - 必需字段: [OK]")
        print(f" - custom_id: {obj['custom_id']}")
        print(f" - method: {obj['method']}")
        print(f" - url: {obj['url']}")

    # A missing body has already been reported through missing_fields.
    if "body" not in obj:
        return valid

    body = obj["body"]
    # A null/string/list body cannot be inspected by key; report it
    # instead of failing on body['model'].
    if not isinstance(body, dict):
        print(" - [ERROR] body字段必须是对象(dict)")
        return False

    if "model" in body:
        print(f" - model: {body['model']}")
    else:
        print(" - [ERROR] body中缺少model字段")
        valid = False

    if "messages" not in body:
        print(" - [ERROR] body中缺少messages字段")
        valid = False
    elif not isinstance(body["messages"], list):
        # len() would either fail (numbers) or count characters (strings).
        print(" - [ERROR] messages字段必须是列表(list)")
        valid = False
    else:
        print(f" - messages: {len(body['messages'])} 条消息")

    return valid


def check_jsonl_file(file_path):
    """Check a JSONL request file against the API's format rules.

    Prints a human-readable report to stdout while checking, in order:
    that the file exists, that it is no larger than 500 MB, and that every
    line is non-blank, no larger than 6 MB (UTF-8 encoded), valid JSON, and
    a request object with the required fields.

    Args:
        file_path: Path of the JSONL file to check.

    Returns:
        True if every check passed, False otherwise. A missing file, an
        oversized file, or a file that is not valid UTF-8 returns False
        before any per-line checks run.
    """
    print("=" * 60)
    print("检查 JSONL 文件格式")
    print("=" * 60)
    print(f"文件: {file_path}")
    print()

    if not os.path.exists(file_path):
        print("[ERROR] 文件不存在!")
        return False

    # Report the file size and enforce the whole-file limit.
    file_size = os.path.getsize(file_path)
    file_size_mb = file_size / (1024 * 1024)
    print(f"文件大小: {file_size} 字节 ({file_size_mb:.2f} MB)")

    if file_size > 500 * 1024 * 1024:
        print("[ERROR] 文件大小超过500MB限制")
        return False

    print()
    print("-" * 60)
    print("逐行检查:")
    print("-" * 60)

    # All lines are read up front because the total count is printed before
    # the per-line report; the size check above bounds the memory used.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError as e:
        # Report an encoding problem as a failed check rather than a crash.
        print(f"[ERROR] 文件不是有效的UTF-8编码: {e}")
        return False

    print(f"总行数: {len(lines)}")
    print()

    all_valid = True
    for i, line in enumerate(lines, 1):
        line_size = len(line.encode("utf-8"))
        line_size_mb = line_size / (1024 * 1024)
        print(f"{i} 行:")
        print(f" - 长度: {line_size} 字节 ({line_size_mb:.2f} MB)")

        # Blank lines are rejected.
        if not line.strip():
            print(" - [ERROR] 空白行API不允许空白行")
            all_valid = False
            continue

        # Per-line size limit.
        if line_size > 6 * 1024 * 1024:
            print(" - [ERROR] 单行超过6MB限制")
            all_valid = False
            continue

        # Keep the try body minimal: only decoding can raise JSONDecodeError.
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            print(f" - [ERROR] JSON解析失败: {e}")
            all_valid = False
        else:
            print(" - JSON格式: [OK]")
            if not _check_request_object(obj):
                all_valid = False
        print()

    print("=" * 60)
    if all_valid:
        print("[SUCCESS] 文件格式检查通过!")
    else:
        print("[FAILED] 文件格式存在问题,请修复后再试")
    print("=" * 60)
    return all_valid
if __name__ == "__main__":
    # Imported locally: sys is only needed when the file is run as a script.
    import sys

    # An optional first command-line argument selects the file to check;
    # with no argument the demo file is checked, as before.
    target_path = sys.argv[1] if len(sys.argv) > 1 else "demo/test_model.jsonl"
    check_jsonl_file(target_path)