#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sqlite3
import pandas as pd
import argparse
import logging
from datetime import datetime
import json

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Database table schema definitions
DB_SCHEMA = {
    "contents": """
        CREATE TABLE IF NOT EXISTS contents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            entry_id TEXT NOT NULL UNIQUE,
            output_txt_path TEXT,
            poster_path TEXT,
            article_json_path TEXT,
            product TEXT,
            object TEXT,
            date TEXT,
            logic TEXT,
            judge_status INTEGER,
            is_distributed INTEGER DEFAULT 0,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """,

    "users": """
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            email TEXT NOT NULL UNIQUE,
            username TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """,

    "distributions": """
        CREATE TABLE IF NOT EXISTS distributions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            content_id INTEGER,
            user_id INTEGER,
            distribution_date TIMESTAMP,
            send_status TEXT,
            batch_id TEXT,
            FOREIGN KEY (content_id) REFERENCES contents(id),
            FOREIGN KEY (user_id) REFERENCES users(id)
        )
    """
}

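# The three tables form a many-to-many link: each distributions row points at
# contents(id) and users(id). An illustrative query (not part of this script)
# for "which users received a given entry":
#
#   SELECT u.email
#   FROM distributions d
#   JOIN users u ON d.user_id = u.id
#   JOIN contents c ON d.content_id = c.id
#   WHERE c.entry_id = 'entry_0001';
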
def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description='Content distribution database manager')

    # Database path
    parser.add_argument('--db-path', type=str, default='/root/autodl-tmp/TravelContentCreator/distribution.db',
                        help='Path to the database file')

    # Sub-command parsers
    subparsers = parser.add_subparsers(dest='command', help='Sub-commands')

    # Initialize the database
    init_parser = subparsers.add_parser('init', help='Initialize the database')

    # Import contents
    import_contents_parser = subparsers.add_parser('import-contents', help='Import the content manifest')
    import_contents_parser.add_argument('--file', type=str, required=True,
                                        help='Path to the content manifest CSV file')

    # Import users
    import_users_parser = subparsers.add_parser('import-users', help='Import user data')
    import_users_parser.add_argument('--file', type=str, required=True,
                                     help='Path to the user CSV file')
    import_users_parser.add_argument('--email-column', type=str, default='达人邮箱',
                                     help='Name of the email column')
    import_users_parser.add_argument('--username-column', type=str, default='小红书ID',
                                     help='Name of the username column')

    # Import distribution records
    import_dist_parser = subparsers.add_parser('import-distribution', help='Import distribution records')
    import_dist_parser.add_argument('--file', type=str, required=True,
                                    help='Path to the distribution record CSV file')
    import_dist_parser.add_argument('--batch-id', type=str, default=None,
                                    help='Distribution batch ID; defaults to the file name')

    # Import distribution results (from the distribution script)
    import_dist_result_parser = subparsers.add_parser('import-distribution-result', help='Import distribution results')
    import_dist_result_parser.add_argument('--distribution-csv', type=str, required=True,
                                           help='Path to the distribution record CSV file')
    import_dist_result_parser.add_argument('--result-csv', type=str, required=True,
                                           help='Path to the send result CSV file')
    import_dist_result_parser.add_argument('--manifest-csv', type=str, required=True,
                                           help='Path to the content manifest CSV file')
    import_dist_result_parser.add_argument('--batch-id', type=str, default=None,
                                           help='Distribution batch ID; defaults to a timestamp')

    # Query content status
    query_content_parser = subparsers.add_parser('query-content', help='Query content distribution status')
    query_content_parser.add_argument('--entry-id', type=str, default=None,
                                      help='Content entry ID')
    query_content_parser.add_argument('--product', type=str, default=None,
                                      help='Product name')
    query_content_parser.add_argument('--object', type=str, default=None,
                                      help='Attraction name')
    query_content_parser.add_argument('--output', type=str, default=None,
                                      help='Path to the output CSV file')

    # Query contents received by a user
    query_user_parser = subparsers.add_parser('query-user', help='Query contents received by a user')
    query_user_parser.add_argument('--email', type=str, required=True,
                                   help='User email address')
    query_user_parser.add_argument('--output', type=str, default=None,
                                   help='Path to the output CSV file')

    # Generate a statistics report
    report_parser = subparsers.add_parser('report', help='Generate a statistics report')
    report_parser.add_argument('--output', type=str, default=None,
                               help='Path to the output file')
    report_parser.add_argument('--format', type=str, choices=['csv', 'json'], default='csv',
                               help='Output format (csv or json)')

    return parser.parse_args()

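# Typical invocations (an illustrative sketch; the script name and all file
# paths below are placeholders, not values taken from this repository):
#
#   python db_manager.py init
#   python db_manager.py import-contents --file manifest.csv
#   python db_manager.py import-users --file users.csv
#   python db_manager.py import-distribution --file dist.csv --batch-id batch_001
#   python db_manager.py query-user --email creator@example.com
#   python db_manager.py report --output report.csv --format json
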
def create_connection(db_path):
    """Create a database connection."""
    try:
        conn = sqlite3.connect(db_path)
        # Enforce foreign key constraints
        conn.execute("PRAGMA foreign_keys = ON")
        # Return rows as dict-like objects
        conn.row_factory = sqlite3.Row
        return conn
    except sqlite3.Error as e:
        logger.error(f"Failed to create database connection: {e}")
        return None

def init_database(conn):
    """Initialize the database by creating the table schema."""
    try:
        cursor = conn.cursor()

        # Create tables
        for table_name, create_sql in DB_SCHEMA.items():
            cursor.execute(create_sql)
            logger.info(f"Created or verified table '{table_name}'")

        # Create indexes
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_contents_entry_id ON contents(entry_id)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_users_email ON users(email)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_dist_content_user ON distributions(content_id, user_id)")

        conn.commit()
        logger.info("Database initialized successfully")
        return True
    except sqlite3.Error as e:
        logger.error(f"Failed to initialize database: {e}")
        return False

def import_contents(conn, file_path):
    """Import the content manifest into the database."""
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        required_columns = ['EntryID']

        # Check for required columns
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            logger.error(f"Content manifest CSV is missing required columns: {', '.join(missing_columns)}")
            return False

        # Record the row count before the import
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM contents")
        before_count = cursor.fetchone()[0]

        # Import the data
        success_count = 0
        for _, row in df.iterrows():
            try:
                # Prepare the data
                data = {
                    'entry_id': row['EntryID'],
                    'output_txt_path': row.get('OutputTxtPath', ''),
                    'poster_path': row.get('PosterPath', ''),
                    'article_json_path': row.get('ArticleJsonPath', ''),
                    'product': row.get('Product', ''),
                    'object': row.get('Object', ''),
                    'date': row.get('Date', ''),
                    'logic': row.get('Logic', ''),
                    'judge_status': int(row.get('JudgeStatus', 0)) if 'JudgeStatus' in row else None,
                    'is_distributed': int(row.get('IsDistributed', 0)) if 'IsDistributed' in row else 0
                }

                # Insert or update the content. An upsert is used instead of
                # INSERT OR REPLACE so that an existing row keeps its id:
                # REPLACE deletes and re-inserts the row, which breaks the
                # content_id foreign keys in distributions.
                cursor.execute("""
                    INSERT INTO contents
                        (entry_id, output_txt_path, poster_path, article_json_path,
                         product, object, date, logic, judge_status, is_distributed)
                    VALUES
                        (:entry_id, :output_txt_path, :poster_path, :article_json_path,
                         :product, :object, :date, :logic, :judge_status, :is_distributed)
                    ON CONFLICT(entry_id) DO UPDATE SET
                        output_txt_path = excluded.output_txt_path,
                        poster_path = excluded.poster_path,
                        article_json_path = excluded.article_json_path,
                        product = excluded.product,
                        object = excluded.object,
                        date = excluded.date,
                        logic = excluded.logic,
                        judge_status = excluded.judge_status,
                        is_distributed = excluded.is_distributed
                """, data)

                success_count += 1
            except Exception as e:
                logger.warning(f"Failed to import content {row.get('EntryID')}: {e}")

        conn.commit()

        # Record the row count after the import
        cursor.execute("SELECT COUNT(*) FROM contents")
        after_count = cursor.fetchone()[0]

        logger.info(f"Content manifest import finished; {success_count} entries imported")
        logger.info(f"The database now holds {after_count} content records, {after_count - before_count} added by this run")

        return True
    except Exception as e:
        logger.error(f"Failed to import content manifest: {e}")
        return False

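# Minimal manifest CSV accepted by import_contents (illustrative values; only
# EntryID is required, all other columns are optional):
#
#   EntryID,Product,Object,JudgeStatus
#   entry_0001,Sample Product,Sample Spot,1
#   entry_0002,Sample Product,Another Spot,0
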
def import_users(conn, file_path, email_column='达人邮箱', username_column='小红书ID'):
    """Import user data into the database."""
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)

        # Check for the required column
        if email_column not in df.columns:
            logger.error(f"User CSV is missing the email column '{email_column}'")
            return False

        # Keep only rows with valid email addresses
        df = df[df[email_column].notna()]
        df = df[df[email_column].astype(str).str.contains('@')]

        # Fall back to the email prefix when the username column is absent
        if username_column not in df.columns:
            df[username_column] = df[email_column].apply(lambda x: x.split('@')[0])

        # Record the row count before the import
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM users")
        before_count = cursor.fetchone()[0]

        # Import the data
        success_count = 0
        for _, row in df.iterrows():
            try:
                email = row[email_column]
                username = row[username_column] if pd.notna(row[username_column]) else email.split('@')[0]

                # Insert or update the user. An upsert keeps the existing row
                # id; INSERT OR REPLACE would re-insert the row and break the
                # user_id foreign keys in distributions.
                cursor.execute("""
                    INSERT INTO users (email, username)
                    VALUES (?, ?)
                    ON CONFLICT(email) DO UPDATE SET username = excluded.username
                """, (email, username))

                success_count += 1
            except Exception as e:
                logger.warning(f"Failed to import user {row.get(email_column)}: {e}")

        conn.commit()

        # Record the row count after the import
        cursor.execute("SELECT COUNT(*) FROM users")
        after_count = cursor.fetchone()[0]

        logger.info(f"User data import finished; {success_count} users imported")
        logger.info(f"The database now holds {after_count} user records, {after_count - before_count} added by this run")

        return True
    except Exception as e:
        logger.error(f"Failed to import user data: {e}")
        return False

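# Illustrative user CSV (column headers match the parse_arguments defaults;
# the values are placeholders):
#
#   达人邮箱,小红书ID
#   creator@example.com,creator_xhs
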
def import_distribution(conn, file_path, batch_id=None):
    """Import distribution records into the database."""
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)

        # Check for required columns
        required_columns = ['email', 'entry_id']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            logger.error(f"Distribution record CSV is missing required columns: {', '.join(missing_columns)}")
            return False

        # Default the batch ID to the file name
        if not batch_id:
            batch_id = os.path.basename(file_path).split('.')[0]

        # Record the row count before the import
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM distributions")
        before_count = cursor.fetchone()[0]

        # Import the data
        success_count = 0
        skipped_count = 0
        for _, row in df.iterrows():
            try:
                email = row['email']
                entry_id = row['entry_id']
                send_status = row.get('send_status', 'unknown')
                # Guard against NaN from an unmatched result-CSV merge
                if pd.isna(send_status):
                    send_status = 'unknown'

                # Look up the user ID
                cursor.execute("SELECT id FROM users WHERE email = ?", (email,))
                user_result = cursor.fetchone()
                if not user_result:
                    # Unknown user; add a record on the fly
                    cursor.execute("""
                        INSERT INTO users (email, username)
                        VALUES (?, ?)
                    """, (email, row.get('username', email.split('@')[0])))
                    user_id = cursor.lastrowid
                else:
                    user_id = user_result[0]

                # Look up the content ID
                cursor.execute("SELECT id FROM contents WHERE entry_id = ?", (entry_id,))
                content_result = cursor.fetchone()
                if not content_result:
                    logger.warning(f"Content {entry_id} is not in the database; adding a stub record")
                    cursor.execute("""
                        INSERT INTO contents (entry_id)
                        VALUES (?)
                    """, (entry_id,))
                    content_id = cursor.lastrowid
                else:
                    content_id = content_result[0]

                # Check whether an identical record already exists
                cursor.execute("""
                    SELECT id FROM distributions
                    WHERE content_id = ? AND user_id = ? AND batch_id = ?
                """, (content_id, user_id, batch_id))

                if cursor.fetchone():
                    # Record exists; update its status
                    cursor.execute("""
                        UPDATE distributions
                        SET send_status = ?, distribution_date = CURRENT_TIMESTAMP
                        WHERE content_id = ? AND user_id = ? AND batch_id = ?
                    """, (send_status, content_id, user_id, batch_id))
                    skipped_count += 1
                else:
                    # Add a new record
                    cursor.execute("""
                        INSERT INTO distributions
                        (content_id, user_id, distribution_date, send_status, batch_id)
                        VALUES (?, ?, CURRENT_TIMESTAMP, ?, ?)
                    """, (content_id, user_id, send_status, batch_id))
                    success_count += 1

            except Exception as e:
                logger.warning(f"Failed to import distribution record {row.get('entry_id')} -> {row.get('email')}: {e}")

        conn.commit()

        # Record the row count after the import
        cursor.execute("SELECT COUNT(*) FROM distributions")
        after_count = cursor.fetchone()[0]

        logger.info(f"Distribution record import finished; {success_count} records added, {skipped_count} updated")
        logger.info(f"The database now holds {after_count} distribution records, {after_count - before_count} added by this run")

        return True
    except Exception as e:
        logger.error(f"Failed to import distribution records: {e}")
        conn.rollback()
        return False

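# Illustrative distribution CSV for import_distribution (email and entry_id
# are required; send_status and username are optional; values are placeholders):
#
#   email,entry_id,send_status
#   creator@example.com,entry_0001,success
#   creator@example.com,entry_0002,failed
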
def import_distribution_result(conn, distribution_csv, result_csv, manifest_csv, batch_id=None):
    """Import the output of the distribution script into the database."""
    try:
        # Default the batch ID to a timestamp
        if not batch_id:
            batch_id = f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # 1. Import the content manifest
        logger.info(f"Importing contents from manifest: {manifest_csv}")
        import_contents(conn, manifest_csv)

        # 2. Load the distribution records
        logger.info(f"Importing distribution records: {distribution_csv}")
        dist_df = pd.read_csv(distribution_csv)

        # 3. Merge in the send results
        if os.path.exists(result_csv):
            logger.info(f"Merging send results: {result_csv}")
            result_df = pd.read_csv(result_csv)

            # Build an email-to-status mapping
            email_status = {}
            for _, row in result_df.iterrows():
                email_status[row['email']] = row['status']

            # Attach the send status
            dist_df['send_status'] = dist_df['email'].map(email_status)
        else:
            logger.warning(f"Result CSV does not exist: {result_csv}; all records will be marked 'unknown'")
            dist_df['send_status'] = 'unknown'

        # Save a temporary CSV for the import
        temp_csv = os.path.join(os.path.dirname(distribution_csv), f"temp_combined_{os.path.basename(distribution_csv)}")
        dist_df.to_csv(temp_csv, index=False)

        # Import the merged distribution records
        success = import_distribution(conn, temp_csv, batch_id)

        # Remove the temporary file
        try:
            os.remove(temp_csv)
        except OSError:
            pass

        if success:
            logger.info(f"Distribution results imported into the database, batch ID: {batch_id}")
            return True
        else:
            logger.error("Failed to import distribution results into the database")
            return False

    except Exception as e:
        logger.error(f"Failed to import distribution results: {e}")
        return False

def record_extracted_content(
    entry_id,
    output_txt_path=None,
    poster_path=None,
    article_json_path=None,
    product=None,
    object=None,
    date=None,
    logic=None,
    judge_status=None,
    is_distributed=0,
    db_path=None
):
    """Record content in the database as it is extracted.

    This function can be called from the extract_and_render.py script to
    record extracted content in real time.

    Args:
        entry_id: Content entry ID
        output_txt_path: Path to the output text file
        poster_path: Path to the poster image
        article_json_path: Path to the article JSON file
        product: Product name
        object: Attraction name
        date: Date
        logic: Logic
        judge_status: Review status (0=rejected, 1=approved)
        is_distributed: Distribution flag (0=not distributed, 1=distributed)
        db_path: Database path; defaults to the configured path

    Returns:
        bool: True if the content was recorded successfully
    """
    try:
        # Fall back to the default database path
        if db_path is None:
            db_path = '/root/autodl-tmp/TravelContentCreator/distribution.db'

        # Connect first, then initialize if the file did not exist; checking
        # the connection before init_database avoids passing it None.
        db_exists = os.path.exists(db_path)
        conn = create_connection(db_path)
        if not conn:
            logger.error(f"Could not connect to database: {db_path}")
            return False
        if not db_exists:
            logger.warning(f"Database file does not exist: {db_path}; it will be created")
            init_database(conn)

        try:
            cursor = conn.cursor()

            # Prepare the data
            data = {
                'entry_id': entry_id,
                'output_txt_path': output_txt_path or '',
                'poster_path': poster_path or '',
                'article_json_path': article_json_path or '',
                'product': product or '',
                'object': object or '',
                'date': date or '',
                'logic': logic or '',
                'judge_status': judge_status,
                'is_distributed': is_distributed
            }

            # Insert or update the content; the upsert keeps the row id
            # stable (see import_contents).
            cursor.execute("""
                INSERT INTO contents
                    (entry_id, output_txt_path, poster_path, article_json_path,
                     product, object, date, logic, judge_status, is_distributed)
                VALUES
                    (:entry_id, :output_txt_path, :poster_path, :article_json_path,
                     :product, :object, :date, :logic, :judge_status, :is_distributed)
                ON CONFLICT(entry_id) DO UPDATE SET
                    output_txt_path = excluded.output_txt_path,
                    poster_path = excluded.poster_path,
                    article_json_path = excluded.article_json_path,
                    product = excluded.product,
                    object = excluded.object,
                    date = excluded.date,
                    logic = excluded.logic,
                    judge_status = excluded.judge_status,
                    is_distributed = excluded.is_distributed
            """, data)

            conn.commit()
            logger.info(f"Recorded content {entry_id} in the database")
            return True

        except Exception as e:
            logger.error(f"Failed to record content in the database: {e}")
            conn.rollback()
            return False
        finally:
            conn.close()

    except Exception as e:
        logger.error(f"An error occurred while recording extracted content: {e}")
        return False

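# Illustrative call from an extraction pipeline (a sketch; the module name and
# every value below are placeholders, not taken from this repository):
#
#   from db_manager import record_extracted_content
#   record_extracted_content(
#       entry_id="entry_0001",
#       output_txt_path="output/entry_0001.txt",
#       poster_path="output/entry_0001_poster.png",
#       product="Sample Product",
#       judge_status=1,
#   )
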
def query_content_status(conn, entry_id=None, product=None, object=None, output=None):
    """Query content distribution status."""
    try:
        cursor = conn.cursor()

        # Build the query conditions
        conditions = []
        params = []

        if entry_id:
            conditions.append("c.entry_id = ?")
            params.append(entry_id)

        if product:
            conditions.append("c.product LIKE ?")
            params.append(f"%{product}%")

        if object:
            conditions.append("c.object LIKE ?")
            params.append(f"%{object}%")

        where_clause = " AND ".join(conditions) if conditions else "1=1"

        # Run the query
        query = f"""
            SELECT
                c.entry_id,
                c.product,
                c.object,
                c.judge_status,
                COUNT(DISTINCT d.user_id) as distribution_count,
                SUM(CASE WHEN d.send_status LIKE '%success%' THEN 1 ELSE 0 END) as success_count,
                GROUP_CONCAT(DISTINCT u.email) as recipients
            FROM
                contents c
            LEFT JOIN
                distributions d ON c.id = d.content_id
            LEFT JOIN
                users u ON d.user_id = u.id
            WHERE
                {where_clause}
            GROUP BY
                c.id
            ORDER BY
                distribution_count DESC, c.entry_id
        """

        cursor.execute(query, params)
        results = [dict(row) for row in cursor.fetchall()]

        # Report the results
        if results:
            logger.info(f"Found {len(results)} content status records")

            # Save to a file if an output path was given
            if output:
                df = pd.DataFrame(results)
                df.to_csv(output, index=False)
                logger.info(f"Query results saved to: {output}")
            else:
                # Print a short summary
                for row in results[:10]:  # show at most 10 records
                    status = "approved" if row['judge_status'] == 1 else "rejected" if row['judge_status'] == 0 else "unknown"
                    logger.info(f"Entry ID: {row['entry_id']}, product: {row['product']}, attraction: {row['object']}, "
                                f"status: {status}, distributions: {row['distribution_count']}, successes: {row['success_count']}")

                if len(results) > 10:
                    logger.info(f"... {len(results) - 10} more records not shown")
        else:
            logger.info("No matching content records found")

        return results
    except Exception as e:
        logger.error(f"Failed to query content status: {e}")
        return []

def query_user_contents(conn, email, output=None):
    """Query the contents a user has received."""
    try:
        cursor = conn.cursor()

        # Check that the user exists
        cursor.execute("SELECT id FROM users WHERE email = ?", (email,))
        user_result = cursor.fetchone()

        if not user_result:
            logger.info(f"User {email} is not in the database")
            return []

        user_id = user_result[0]

        # Query the contents the user has received
        query = """
            SELECT
                u.email,
                u.username,
                c.entry_id,
                c.product,
                c.object,
                d.distribution_date,
                d.send_status,
                d.batch_id
            FROM
                distributions d
            JOIN
                contents c ON d.content_id = c.id
            JOIN
                users u ON d.user_id = u.id
            WHERE
                d.user_id = ?
            ORDER BY
                d.distribution_date DESC
        """

        cursor.execute(query, (user_id,))
        results = [dict(row) for row in cursor.fetchall()]

        # Report the results
        if results:
            logger.info(f"User {email} has received {len(results)} contents")

            # Save to a file if an output path was given
            if output:
                df = pd.DataFrame(results)
                df.to_csv(output, index=False)
                logger.info(f"Query results saved to: {output}")
            else:
                # Print a short summary
                for row in results[:10]:  # show at most 10 records
                    dist_date = row['distribution_date']
                    logger.info(f"Entry ID: {row['entry_id']}, product: {row['product']}, attraction: {row['object']}, "
                                f"sent: {dist_date}, status: {row['send_status']}, batch: {row['batch_id']}")

                if len(results) > 10:
                    logger.info(f"... {len(results) - 10} more records not shown")
        else:
            logger.info(f"User {email} has not received any contents")

        return results
    except Exception as e:
        logger.error(f"Failed to query user contents: {e}")
        return []

def generate_report(conn, output=None, format='csv'):
    """Generate a statistics report."""
    try:
        cursor = conn.cursor()

        # Basic statistics
        statistics = {}

        # Total number of contents
        cursor.execute("SELECT COUNT(*) FROM contents")
        statistics['total_contents'] = cursor.fetchone()[0]

        # Total number of users
        cursor.execute("SELECT COUNT(*) FROM users")
        statistics['total_users'] = cursor.fetchone()[0]

        # Total number of distribution records
        cursor.execute("SELECT COUNT(*) FROM distributions")
        statistics['total_distributions'] = cursor.fetchone()[0]

        # Number of successfully sent distribution records
        cursor.execute("SELECT COUNT(*) FROM distributions WHERE send_status LIKE '%success%'")
        statistics['successful_distributions'] = cursor.fetchone()[0]

        # Distribution counts per batch
        cursor.execute("""
            SELECT batch_id, COUNT(*) as count
            FROM distributions
            GROUP BY batch_id
            ORDER BY count DESC
        """)
        statistics['batch_statistics'] = [dict(row) for row in cursor.fetchall()]

        # Distribution statistics per product
        cursor.execute("""
            SELECT
                c.product,
                COUNT(DISTINCT c.id) as total_contents,
                COUNT(DISTINCT d.id) as distribution_count,
                COUNT(DISTINCT d.user_id) as user_count,
                SUM(CASE WHEN d.send_status LIKE '%success%' THEN 1 ELSE 0 END) as success_count
            FROM
                contents c
            LEFT JOIN
                distributions d ON c.id = d.content_id
            WHERE
                c.product IS NOT NULL AND c.product != ''
            GROUP BY
                c.product
            ORDER BY
                distribution_count DESC
        """)
        statistics['product_statistics'] = [dict(row) for row in cursor.fetchall()]

        # Distribution statistics per attraction
        cursor.execute("""
            SELECT
                c.object,
                COUNT(DISTINCT c.id) as total_contents,
                COUNT(DISTINCT d.id) as distribution_count,
                COUNT(DISTINCT d.user_id) as user_count,
                SUM(CASE WHEN d.send_status LIKE '%success%' THEN 1 ELSE 0 END) as success_count
            FROM
                contents c
            LEFT JOIN
                distributions d ON c.id = d.content_id
            WHERE
                c.object IS NOT NULL AND c.object != ''
            GROUP BY
                c.object
            ORDER BY
                distribution_count DESC
            LIMIT 20
        """)
        statistics['object_statistics'] = [dict(row) for row in cursor.fetchall()]

        # Most-distributed contents
        cursor.execute("""
            SELECT
                c.entry_id,
                c.product,
                c.object,
                COUNT(d.id) as distribution_count,
                SUM(CASE WHEN d.send_status LIKE '%success%' THEN 1 ELSE 0 END) as success_count
            FROM
                contents c
            JOIN
                distributions d ON c.id = d.content_id
            GROUP BY
                c.id
            ORDER BY
                distribution_count DESC
            LIMIT 10
        """)
        statistics['most_distributed_contents'] = [dict(row) for row in cursor.fetchall()]

        # Users who received the most contents
        cursor.execute("""
            SELECT
                u.email,
                u.username,
                COUNT(d.id) as received_count,
                SUM(CASE WHEN d.send_status LIKE '%success%' THEN 1 ELSE 0 END) as success_count
            FROM
                users u
            JOIN
                distributions d ON u.id = d.user_id
            GROUP BY
                u.id
            ORDER BY
                received_count DESC
            LIMIT 10
        """)
        statistics['most_active_users'] = [dict(row) for row in cursor.fetchall()]

        # Print the basic statistics
        logger.info("\n====== Content Distribution Database Report ======")
        logger.info(f"Total contents: {statistics['total_contents']}")
        logger.info(f"Total users: {statistics['total_users']}")
        logger.info(f"Total distribution records: {statistics['total_distributions']}")
        logger.info(f"Successfully sent records: {statistics['successful_distributions']}")

        logger.info("\n----- By product -----")
        for product in statistics['product_statistics'][:5]:
            if product['distribution_count'] > 0:
                logger.info(f"Product: {product['product']}, contents: {product['total_contents']}, "
                            f"distributions: {product['distribution_count']}, successes: {product['success_count']}")

        logger.info("\n----- Most-distributed contents -----")
        for content in statistics['most_distributed_contents'][:5]:
            logger.info(f"Entry ID: {content['entry_id']}, product: {content['product']}, "
                        f"attraction: {content['object']}, distributions: {content['distribution_count']}, "
                        f"successes: {content['success_count']}")

        # Write the report
        if output:
            if format == 'json':
                with open(output, 'w', encoding='utf-8') as f:
                    json.dump(statistics, f, ensure_ascii=False, indent=2)
            else:  # CSV format
                # Write several CSV files, one per report section
                output_base = os.path.splitext(output)[0]

                # Product statistics
                pd.DataFrame(statistics['product_statistics']).to_csv(f"{output_base}_products.csv", index=False)

                # Attraction statistics
                pd.DataFrame(statistics['object_statistics']).to_csv(f"{output_base}_objects.csv", index=False)

                # Content distribution statistics
                pd.DataFrame(statistics['most_distributed_contents']).to_csv(f"{output_base}_contents.csv", index=False)

                # User reception statistics
                pd.DataFrame(statistics['most_active_users']).to_csv(f"{output_base}_users.csv", index=False)

                # Batch statistics
                pd.DataFrame(statistics['batch_statistics']).to_csv(f"{output_base}_batches.csv", index=False)

                # Summary statistics
                basic_stats = {k: v for k, v in statistics.items() if not isinstance(v, list)}
                pd.DataFrame([basic_stats]).to_csv(f"{output_base}_summary.csv", index=False)

            logger.info(f"Report saved to: {output}")

        return statistics
    except Exception as e:
        logger.error(f"Failed to generate report: {e}")
        return {}

def main():
    args = parse_arguments()

    # Create the database connection
    conn = create_connection(args.db_path)
    if not conn:
        logger.error("Could not connect to the database; exiting")
        return

    try:
        # Dispatch on the sub-command
        if args.command == 'init':
            init_database(conn)

        elif args.command == 'import-contents':
            import_contents(conn, args.file)

        elif args.command == 'import-users':
            import_users(conn, args.file, args.email_column, args.username_column)

        elif args.command == 'import-distribution':
            import_distribution(conn, args.file, args.batch_id)

        elif args.command == 'import-distribution-result':
            import_distribution_result(conn, args.distribution_csv, args.result_csv,
                                       args.manifest_csv, args.batch_id)

        elif args.command == 'query-content':
            query_content_status(conn, args.entry_id, args.product, args.object, args.output)

        elif args.command == 'query-user':
            query_user_contents(conn, args.email, args.output)

        elif args.command == 'report':
            generate_report(conn, args.output, args.format)

        else:
            logger.error("No valid command given; run with --help for usage")

    finally:
        conn.close()

if __name__ == "__main__":
    main()