201 lines
8.9 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sqlite3
import csv
import sys
# CSV文件路径
csv_file = '/root/autodl-tmp/TravelContentCreator/output/5.15.csv'
# 数据库路径
db_path = '/root/autodl-tmp/TravelContentCreator/distribution.db'
def main():
# 检查CSV文件是否存在
if not os.path.exists(csv_file):
print(f"错误: CSV文件不存在: {csv_file}")
return False
# 连接数据库
try:
conn = sqlite3.connect(db_path)
conn.execute("PRAGMA foreign_keys = OFF") # 禁用外键约束,避免可能的导入问题
cursor = conn.cursor()
print(f"已连接到数据库: {db_path}")
# 检查数据库表结构确保xhs_user_id字段存在
print("检查数据库表结构...")
cursor.execute("PRAGMA table_info(users)")
columns = {col[1] for col in cursor.fetchall()}
if 'xhs_user_id' not in columns:
print("错误: 数据库users表中不存在xhs_user_id字段")
print("请先运行 fix_database.py 脚本修复数据库结构")
return False
print(f"当前用户表字段: {', '.join(columns)}")
# 清除所有历史用户数据
clear_users = input("是否清除所有历史用户数据? (y/n): ").strip().lower()
if clear_users == 'y':
try:
# 先检查是否有关联的分发记录
cursor.execute("SELECT COUNT(*) FROM distributions")
distribution_count = cursor.fetchone()[0]
if distribution_count > 0:
confirm_delete = input(f"警告: 数据库中有 {distribution_count} 条分发记录与用户关联,确定要删除所有用户吗? (y/n): ").strip().lower()
if confirm_delete != 'y':
print("取消清除用户数据")
return False
# 清除所有用户数据
cursor.execute("DELETE FROM users")
conn.commit()
print(f"已清除所有历史用户数据")
except sqlite3.Error as e:
print(f"清除用户数据时出错: {e}")
conn.rollback()
return False
# 检查数据库中已存在的小红书User码
cursor.execute("SELECT xhs_user_id FROM users WHERE xhs_user_id IS NOT NULL")
existing_user_ids = set(row[0] for row in cursor.fetchall() if row[0])
print(f"数据库中已有 {len(existing_user_ids)} 个带有小红书User码的用户")
except sqlite3.Error as e:
print(f"数据库连接错误: {e}")
return False
# 计数器
success_count = 0
skip_count = 0
error_count = 0
update_count = 0
# 记录失败和跳过的条目
failed_entries = []
skipped_entries = []
# 读取CSV文件并导入数据
try:
with open(csv_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
# 跳过标题行
headers = next(reader)
print(f"CSV文件标题: {headers}")
for row_num, row in enumerate(reader, 2): # 从2开始计数因为第1行是标题
if len(row) >= 4: # 确保行至少有4列
try:
# 从CSV提取数据 (小红书User码, 小红书ID, 粉丝数, 邮箱)
xhs_user_id, username, fans, email = row
# 如果邮箱为空,跳过
if not email:
skip_info = f"邮箱为空"
skipped_entries.append((row_num, row, skip_info))
print(f"警告: 跳过无邮箱用户: {username} (行 {row_num})")
skip_count += 1
continue
# 确保邮箱格式正确
if '@' not in email:
error_info = f"无效邮箱格式"
failed_entries.append((row_num, row, error_info))
print(f"警告: 跳过无效邮箱: {email} (行 {row_num})")
error_count += 1
continue
# 检查小红书User码是否有效
if not xhs_user_id:
error_info = f"小红书User码为空"
failed_entries.append((row_num, row, error_info))
print(f"警告: 小红书User码为空: {email} (行 {row_num})")
error_count += 1
continue
# 检查小红书User码是否已存在如果存在则更新信息
if xhs_user_id in existing_user_ids:
print(f"用户User码已存在将更新: {xhs_user_id} -> {username}, {email}")
cursor.execute("""
UPDATE users
SET username = ?, email = ?
WHERE xhs_user_id = ?
""", (username, email, xhs_user_id))
update_count += 1
skipped_entries.append((row_num, row, "用户User码已存在已更新信息"))
else:
# 插入新用户
cursor.execute("""
INSERT INTO users (email, username, xhs_user_id)
VALUES (?, ?, ?)
""", (email, username, xhs_user_id))
existing_user_ids.add(xhs_user_id) # 添加到已存在列表
success_count += 1
print(f"插入新用户: {xhs_user_id} -> {username}, {email}")
except sqlite3.IntegrityError as e:
# 处理唯一约束冲突
if "UNIQUE constraint failed: users.xhs_user_id" in str(e):
error_info = f"小红书User码重复"
failed_entries.append((row_num, row, error_info))
print(f"错误: 小红书User码重复: {xhs_user_id} (行 {row_num})")
else:
error_info = f"数据库完整性错误: {e}"
failed_entries.append((row_num, row, error_info))
print(f"插入行时出错 (行 {row_num}): {row}, 错误: {e}")
error_count += 1
except Exception as e:
error_info = f"数据库错误: {e}"
failed_entries.append((row_num, row, error_info))
print(f"插入行时出错 (行 {row_num}): {row}, 错误: {e}")
error_count += 1
else:
error_info = f"列数不足需要至少4列"
failed_entries.append((row_num, row, error_info))
print(f"警告: 跳过格式不正确的行 {row_num}: {row}")
error_count += 1
# 提交事务
conn.commit()
print(f"\n导入完成! 成功插入: {success_count}, 更新: {update_count}, 跳过: {skip_count}, 失败: {error_count}")
# 如果有失败的条目,输出详细信息
if failed_entries:
print("\n===== 导入失败的记录 =====")
for row_num, row, error in failed_entries:
print(f"{row_num}: {row} - 失败原因: {error}")
# 将失败记录保存到文件
failure_file = os.path.join(os.path.dirname(csv_file), "import_failures_5.15.csv")
with open(failure_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(['行号', '原始数据', '失败原因'])
for row_num, row, error in failed_entries:
writer.writerow([row_num, ','.join(row), error])
print(f"\n失败记录已保存到: {failure_file}")
# 如果有跳过的条目,输出详细信息
if skipped_entries:
print("\n===== 跳过的记录 =====")
for row_num, row, reason in skipped_entries:
print(f"{row_num}: {row} - 跳过原因: {reason}")
except Exception as e:
print(f"读取CSV文件时出错: {e}")
conn.rollback()
return False
finally:
conn.close()
return True
if __name__ == "__main__":
if main():
print("用户数据导入成功!")
else:
print("用户数据导入失败!")
sys.exit(1)