110 lines
3.2 KiB
Python
110 lines
3.2 KiB
Python
"""检查JSONL文件是否符合API要求"""
|
||
|
||
import json
|
||
import os
|
||
|
||
|
||
def check_jsonl_file(file_path):
    """Validate a JSONL batch-request file and print a line-by-line report.

    The checks are: the file exists, is at most 500 MB, decodes as UTF-8,
    contains no blank lines, every line is at most 6 MB once encoded as
    UTF-8, every line parses as a JSON object carrying ``custom_id``,
    ``method``, ``url`` and ``body``, and ``body`` is an object containing
    ``model`` and ``messages``.

    Args:
        file_path: Path of the JSONL file to inspect.

    Returns:
        True when no check failed (a file with zero lines therefore passes),
        False otherwise, including when the file is missing, too large or
        not valid UTF-8.
    """
    max_file_bytes = 500 * 1024 * 1024
    max_line_bytes = 6 * 1024 * 1024

    print("=" * 60)
    print("检查 JSONL 文件格式")
    print("=" * 60)
    print(f"文件: {file_path}")
    print()

    if not os.path.exists(file_path):
        print("[ERROR] 文件不存在!")
        return False

    file_size = os.path.getsize(file_path)
    file_size_mb = file_size / (1024 * 1024)
    print(f"文件大小: {file_size} 字节 ({file_size_mb:.2f} MB)")

    if file_size > max_file_bytes:
        print("[ERROR] 文件大小超过500MB限制!")
        return False

    print()
    print("-" * 60)
    print("逐行检查:")
    print("-" * 60)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError as e:
        # Report an undecodable file as a validation failure instead of
        # letting the decoder's exception escape from a checker tool.
        print(f"[ERROR] 文件不是有效的UTF-8编码: {e}")
        return False

    print(f"总行数: {len(lines)}")
    print()

    all_valid = True
    for i, line in enumerate(lines, 1):
        # The size is measured on the decoded text re-encoded as UTF-8 and
        # includes the line terminator kept by readlines().
        line_size = len(line.encode("utf-8"))
        line_size_mb = line_size / (1024 * 1024)

        print(f"第 {i} 行:")
        print(f" - 长度: {line_size} 字节 ({line_size_mb:.2f} MB)")

        if not line.strip():
            print(" - [ERROR] 空白行!API不允许空白行")
            all_valid = False
            continue

        if line_size > max_line_bytes:
            print(" - [ERROR] 单行超过6MB限制!")
            all_valid = False
            continue

        # Only the parse itself can raise JSONDecodeError, so keep the try
        # body down to that single call.
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            print(f" - [ERROR] JSON解析失败: {e}")
            all_valid = False
        else:
            print(" - JSON格式: [OK]")
            if not _check_jsonl_record(obj):
                all_valid = False

        print()

    print("=" * 60)
    if all_valid:
        print("[SUCCESS] 文件格式检查通过!")
    else:
        print("[FAILED] 文件格式存在问题,请修复后再试")
    print("=" * 60)

    return all_valid


def _check_jsonl_record(obj):
    """Print the field-level report for one parsed line; return True if it is valid."""
    # Valid JSON is not necessarily an object: scalars raise TypeError on the
    # membership tests below and lists raise it on the string-key lookups.
    if not isinstance(obj, dict):
        print(f" - [ERROR] 顶层必须是JSON对象, 实际类型: {type(obj).__name__}")
        return False

    valid = True

    required_fields = ["custom_id", "method", "url", "body"]
    missing_fields = [field for field in required_fields if field not in obj]
    if missing_fields:
        print(f" - [ERROR] 缺少必需字段: {missing_fields}")
        valid = False
    else:
        print(" - 必需字段: [OK]")
        print(f" - custom_id: {obj['custom_id']}")
        print(f" - method: {obj['method']}")
        print(f" - url: {obj['url']}")

    if "body" not in obj:
        return valid

    body = obj["body"]
    # On a string, `"model" in body` is a substring test; on a number or None
    # it raises TypeError. Only an object can actually hold the fields.
    if not isinstance(body, dict):
        print(f" - [ERROR] body必须是JSON对象, 实际类型: {type(body).__name__}")
        return False

    if "model" in body:
        print(f" - model: {body['model']}")
    else:
        print(" - [ERROR] body中缺少model字段")
        valid = False

    if "messages" in body:
        messages = body["messages"]
        try:
            message_count = len(messages)
        except TypeError:
            # Numbers, booleans and null have no length.
            print(f" - [ERROR] messages字段类型无效: {type(messages).__name__}")
            valid = False
        else:
            print(f" - messages: {message_count} 条消息")
    else:
        print(" - [ERROR] body中缺少messages字段")
        valid = False

    return valid
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run the checker against the demo batch file.
    demo_path = "demo/test_model.jsonl"
    check_jsonl_file(demo_path)
|