90 lines
2.8 KiB
Python
90 lines
2.8 KiB
Python
|
import json
|
||
|
import os
|
||
|
import pandas as pd
|
||
|
import sqlite3
|
||
|
|
||
|
# Read and parse a JSON file (any path, not only 1.json)
|
||
|
def read_json_file(file_path):
    """Load a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty dict when the file cannot be
        opened or parsed. Failures are reported on stdout, not raised.
    """
    parsed = {}
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            parsed = json.loads(handle.read())
    except Exception as err:
        # Best-effort read: report the problem and fall back to an empty dict.
        print(f"读取文件 {file_path} 时出错: {err}")
        parsed = {}
    return parsed
|
||
|
|
||
|
# Build the classification prompt payload for a single item
|
||
|
def create_prompt_json(data, category_data):
    """Assemble the classification prompt payload for one item.

    Args:
        data: Item information; stored unchanged under ``input.item_info``.
        category_data: Category system; stored unchanged under
            ``input.category_system``.

    Returns:
        A dict with the keys ``instruction``, ``task_description``,
        ``input`` and ``output_requirements`` (in that order).
    """
    # Human-readable description of each field the answer should contain.
    expected_fields = {
        "primary_category": "一级分类名称",
        "secondary_category": "二级分类名称(如果有)",
        "tertiary_category": "三级分类名称(如果有)",
        "confidence": "分类置信度(0-1)",
        "reasoning": "简要说明分类理由",
    }

    # The two caller-supplied objects are embedded as-is (no copy).
    prompt_inputs = {
        "item_info": data,
        "category_system": category_data,
    }

    return {
        "instruction": "将基本信息根据分类体系进行分类判断",
        "task_description": "你是一个精准的内容分类专家。请阅读提供的基本信息(item_info),并根据分类体系(category_system)为该项目确定最合适的分类。",
        "input": prompt_inputs,
        "output_requirements": {"fields": expected_fields},
    }
|
||
|
|
||
|
# Script entry point: sample rows from the database and export them to Excel
|
||
|
if __name__ == "__main__":
    # Sample random rows of selected product types from the SQLite database,
    # build a classification prompt for each kept row, and export the kept
    # rows (plus a running index) to an Excel workbook.

    sort_json_path = "sort.json"  # category-system file
    category_data = read_json_file(sort_json_path)

    table_name = 'data'  # also the stem of the database file name
    sort_name = '产品类型'  # column used to filter rows
    sort_value = ('住宿', '门票', '抢购')  # accepted values of that column
    num_limit = 500  # maximum number of random rows fetched
    # NOTE(review): the original loop stopped once i reached 5, so only the
    # first five fetched rows are exported. This looks like a debugging cap;
    # raise or remove it if all fetched rows should be written.
    preview_limit = 5
    excel_file = '111.xlsx'

    # Values and LIMIT are bound as parameters so that any number of values
    # (including exactly one) yields valid SQL. Identifiers cannot be bound;
    # sort_name is a constant defined above, not external input.
    placeholders = ", ".join("?" for _ in sort_value)
    query = f"""
    SELECT * FROM data
    WHERE {sort_name} IN ({placeholders})
    ORDER BY RANDOM()
    LIMIT ?
    """

    conn = sqlite3.connect(f'{table_name}.db')  # replace with your database file name
    try:
        cursor = conn.cursor()
        cursor.execute(query, (*sort_value, num_limit))
        rows = cursor.fetchall()
        # Use the table's column names as the DataFrame header.
        columns = [description[0] for description in cursor.description]
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    columns.append('index')  # extra column holding the running row number

    results = []
    for i, row in enumerate(rows[:preview_limit]):
        # NOTE(review): the prompt is built but not consumed anywhere in the
        # visible code; presumably a later step is meant to use it - confirm.
        prompt_data = create_prompt_json(row, category_data)
        results.append([*row, i])

    # Passing the header here keeps an empty result set from raising a
    # length-mismatch error when the column names are assigned.
    df = pd.DataFrame(results, columns=columns)

    # Move the third column to the front (only when a third column exists).
    cols = df.columns.tolist()
    if len(cols) > 2:
        third_col = cols.pop(2)
        cols.insert(0, third_col)
        df = df[cols]

    df.to_excel(excel_file, index=False)
    print(f"\n所有生成文本已保存到 {excel_file}")
    print(results)
|
||
|
# Save the results as a JSON file
|