110 lines
3.2 KiB
Python
110 lines
3.2 KiB
Python
|
|
"""检查JSONL文件是否符合API要求"""
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Size limits enforced by this checker (values taken from the original checks).
_MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024
_MAX_LINE_SIZE_BYTES = 6 * 1024 * 1024

# Top-level keys every request record must carry.
_REQUIRED_FIELDS = ("custom_id", "method", "url", "body")


def _check_body(body):
    """Validate the ``body`` value of one record and print the findings.

    Returns:
        True when ``body`` is a JSON object that has a ``model`` key and a
        list-valued ``messages`` key; False otherwise (an ``[ERROR]`` line is
        printed for every problem found).
    """
    if not isinstance(body, dict):
        # ``"model" in body`` would raise on numbers/null and would silently
        # do substring or element matching on strings/arrays.
        print(" - [ERROR] body字段必须是JSON对象")
        return False

    body_ok = True

    if "model" in body:
        print(f" - model: {body['model']}")
    else:
        print(" - [ERROR] body中缺少model字段")
        body_ok = False

    if "messages" not in body:
        print(" - [ERROR] body中缺少messages字段")
        body_ok = False
    elif isinstance(body["messages"], list):
        print(f" - messages: {len(body['messages'])} 条消息")
    else:
        # len() on a non-list would either raise or report a meaningless count.
        print(" - [ERROR] messages字段必须是数组")
        body_ok = False

    return body_ok


def _check_record(line):
    """Parse one non-blank line as JSON and validate its structure.

    Prints one report line per check and returns True only if the line is a
    JSON object containing all required fields and a valid ``body``.
    """
    try:
        obj = json.loads(line)
    except json.JSONDecodeError as e:
        print(f" - [ERROR] JSON解析失败: {e}")
        return False

    print(" - JSON格式: [OK]")

    if not isinstance(obj, dict):
        # Valid JSON that is not an object (e.g. ``123`` or ``null``) cannot
        # hold the required keys; membership tests on it may even raise.
        print(" - [ERROR] 每行必须是一个JSON对象")
        return False

    record_ok = True

    missing_fields = [field for field in _REQUIRED_FIELDS if field not in obj]
    if missing_fields:
        print(f" - [ERROR] 缺少必需字段: {missing_fields}")
        record_ok = False
    else:
        print(" - 必需字段: [OK]")
        print(f" - custom_id: {obj['custom_id']}")
        print(f" - method: {obj['method']}")
        print(f" - url: {obj['url']}")

    # A missing body was already reported above; only inspect it if present.
    if "body" in obj and not _check_body(obj["body"]):
        record_ok = False

    return record_ok


def check_jsonl_file(file_path):
    """Check a JSONL file against the size and structure rules and print a report.

    The checks are: the path is an existing regular file, the file is at most
    500 MB and valid UTF-8, no line is blank, no line exceeds 6 MB, and every
    line is a JSON object with ``custom_id``, ``method``, ``url`` and a
    ``body`` object that contains ``model`` and a ``messages`` list.

    Args:
        file_path: Path of the JSONL file to check.

    Returns:
        True if every check passed, False otherwise. Problems are reported on
        stdout rather than raised.
    """
    print("=" * 60)
    print("检查 JSONL 文件格式")
    print("=" * 60)
    print(f"文件: {file_path}")
    print()

    # isfile() also rejects directories, which would otherwise fail in open().
    if not os.path.isfile(file_path):
        print("[ERROR] 文件不存在!")
        return False

    file_size = os.path.getsize(file_path)
    file_size_mb = file_size / (1024 * 1024)
    print(f"文件大小: {file_size} 字节 ({file_size_mb:.2f} MB)")

    if file_size > _MAX_FILE_SIZE_BYTES:
        print("[ERROR] 文件大小超过500MB限制!")
        return False

    print()
    print("-" * 60)
    print("逐行检查:")
    print("-" * 60)

    try:
        # The whole file is read up front because the total line count is
        # printed before the per-line report.
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except UnicodeDecodeError as e:
        print(f"[ERROR] 文件不是有效的UTF-8编码: {e}")
        return False

    print(f"总行数: {len(lines)}")
    print()

    all_valid = True
    for i, line in enumerate(lines, 1):
        # Size is measured on the UTF-8 encoding, including the line ending.
        line_size = len(line.encode("utf-8"))
        line_size_mb = line_size / (1024 * 1024)

        print(f"第 {i} 行:")
        print(f" - 长度: {line_size} 字节 ({line_size_mb:.2f} MB)")

        if not line.strip():
            print(" - [ERROR] 空白行!API不允许空白行")
            all_valid = False
            continue

        if line_size > _MAX_LINE_SIZE_BYTES:
            print(" - [ERROR] 单行超过6MB限制!")
            all_valid = False
            continue

        if not _check_record(line):
            all_valid = False

        print()

    print("=" * 60)
    if all_valid:
        print("[SUCCESS] 文件格式检查通过!")
    else:
        print("[FAILED] 文件格式存在问题,请修复后再试")
    print("=" * 60)

    return all_valid
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default demo file.
    target_path = sys.argv[1] if len(sys.argv) > 1 else "demo/test_model.jsonl"

    # Exit non-zero when the check fails so shells and CI jobs can detect it.
    sys.exit(0 if check_jsonl_file(target_path) else 1)
|