201 lines
8.9 KiB
Python
201 lines
8.9 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sqlite3
|
|||
|
|
import csv
|
|||
|
|
import sys
|
|||
|
|
|
|||
|
|
# CSV文件路径
|
|||
|
|
csv_file = '/root/autodl-tmp/TravelContentCreator/output/5.15.csv'
|
|||
|
|
|
|||
|
|
# 数据库路径
|
|||
|
|
db_path = '/root/autodl-tmp/TravelContentCreator/distribution.db'
|
|||
|
|
|
|||
|
|
def main():
|
|||
|
|
# 检查CSV文件是否存在
|
|||
|
|
if not os.path.exists(csv_file):
|
|||
|
|
print(f"错误: CSV文件不存在: {csv_file}")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
# 连接数据库
|
|||
|
|
try:
|
|||
|
|
conn = sqlite3.connect(db_path)
|
|||
|
|
conn.execute("PRAGMA foreign_keys = OFF") # 禁用外键约束,避免可能的导入问题
|
|||
|
|
cursor = conn.cursor()
|
|||
|
|
print(f"已连接到数据库: {db_path}")
|
|||
|
|
|
|||
|
|
# 检查数据库表结构,确保xhs_user_id字段存在
|
|||
|
|
print("检查数据库表结构...")
|
|||
|
|
cursor.execute("PRAGMA table_info(users)")
|
|||
|
|
columns = {col[1] for col in cursor.fetchall()}
|
|||
|
|
|
|||
|
|
if 'xhs_user_id' not in columns:
|
|||
|
|
print("错误: 数据库users表中不存在xhs_user_id字段")
|
|||
|
|
print("请先运行 fix_database.py 脚本修复数据库结构")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
print(f"当前用户表字段: {', '.join(columns)}")
|
|||
|
|
|
|||
|
|
# 清除所有历史用户数据
|
|||
|
|
clear_users = input("是否清除所有历史用户数据? (y/n): ").strip().lower()
|
|||
|
|
if clear_users == 'y':
|
|||
|
|
try:
|
|||
|
|
# 先检查是否有关联的分发记录
|
|||
|
|
cursor.execute("SELECT COUNT(*) FROM distributions")
|
|||
|
|
distribution_count = cursor.fetchone()[0]
|
|||
|
|
|
|||
|
|
if distribution_count > 0:
|
|||
|
|
confirm_delete = input(f"警告: 数据库中有 {distribution_count} 条分发记录与用户关联,确定要删除所有用户吗? (y/n): ").strip().lower()
|
|||
|
|
if confirm_delete != 'y':
|
|||
|
|
print("取消清除用户数据")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
# 清除所有用户数据
|
|||
|
|
cursor.execute("DELETE FROM users")
|
|||
|
|
conn.commit()
|
|||
|
|
print(f"已清除所有历史用户数据")
|
|||
|
|
except sqlite3.Error as e:
|
|||
|
|
print(f"清除用户数据时出错: {e}")
|
|||
|
|
conn.rollback()
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
# 检查数据库中已存在的小红书User码
|
|||
|
|
cursor.execute("SELECT xhs_user_id FROM users WHERE xhs_user_id IS NOT NULL")
|
|||
|
|
existing_user_ids = set(row[0] for row in cursor.fetchall() if row[0])
|
|||
|
|
print(f"数据库中已有 {len(existing_user_ids)} 个带有小红书User码的用户")
|
|||
|
|
|
|||
|
|
except sqlite3.Error as e:
|
|||
|
|
print(f"数据库连接错误: {e}")
|
|||
|
|
return False
|
|||
|
|
|
|||
|
|
# 计数器
|
|||
|
|
success_count = 0
|
|||
|
|
skip_count = 0
|
|||
|
|
error_count = 0
|
|||
|
|
update_count = 0
|
|||
|
|
# 记录失败和跳过的条目
|
|||
|
|
failed_entries = []
|
|||
|
|
skipped_entries = []
|
|||
|
|
|
|||
|
|
# 读取CSV文件并导入数据
|
|||
|
|
try:
|
|||
|
|
with open(csv_file, 'r', encoding='utf-8') as f:
|
|||
|
|
reader = csv.reader(f)
|
|||
|
|
# 跳过标题行
|
|||
|
|
headers = next(reader)
|
|||
|
|
print(f"CSV文件标题: {headers}")
|
|||
|
|
|
|||
|
|
for row_num, row in enumerate(reader, 2): # 从2开始计数,因为第1行是标题
|
|||
|
|
if len(row) >= 4: # 确保行至少有4列
|
|||
|
|
try:
|
|||
|
|
# 从CSV提取数据 (小红书User码, 小红书ID, 粉丝数, 邮箱)
|
|||
|
|
xhs_user_id, username, fans, email = row
|
|||
|
|
|
|||
|
|
# 如果邮箱为空,跳过
|
|||
|
|
if not email:
|
|||
|
|
skip_info = f"邮箱为空"
|
|||
|
|
skipped_entries.append((row_num, row, skip_info))
|
|||
|
|
print(f"警告: 跳过无邮箱用户: {username} (行 {row_num})")
|
|||
|
|
skip_count += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 确保邮箱格式正确
|
|||
|
|
if '@' not in email:
|
|||
|
|
error_info = f"无效邮箱格式"
|
|||
|
|
failed_entries.append((row_num, row, error_info))
|
|||
|
|
print(f"警告: 跳过无效邮箱: {email} (行 {row_num})")
|
|||
|
|
error_count += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 检查小红书User码是否有效
|
|||
|
|
if not xhs_user_id:
|
|||
|
|
error_info = f"小红书User码为空"
|
|||
|
|
failed_entries.append((row_num, row, error_info))
|
|||
|
|
print(f"警告: 小红书User码为空: {email} (行 {row_num})")
|
|||
|
|
error_count += 1
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 检查小红书User码是否已存在,如果存在则更新信息
|
|||
|
|
if xhs_user_id in existing_user_ids:
|
|||
|
|
print(f"用户User码已存在,将更新: {xhs_user_id} -> {username}, {email}")
|
|||
|
|
cursor.execute("""
|
|||
|
|
UPDATE users
|
|||
|
|
SET username = ?, email = ?
|
|||
|
|
WHERE xhs_user_id = ?
|
|||
|
|
""", (username, email, xhs_user_id))
|
|||
|
|
update_count += 1
|
|||
|
|
skipped_entries.append((row_num, row, "用户User码已存在,已更新信息"))
|
|||
|
|
else:
|
|||
|
|
# 插入新用户
|
|||
|
|
cursor.execute("""
|
|||
|
|
INSERT INTO users (email, username, xhs_user_id)
|
|||
|
|
VALUES (?, ?, ?)
|
|||
|
|
""", (email, username, xhs_user_id))
|
|||
|
|
existing_user_ids.add(xhs_user_id) # 添加到已存在列表
|
|||
|
|
success_count += 1
|
|||
|
|
print(f"插入新用户: {xhs_user_id} -> {username}, {email}")
|
|||
|
|
|
|||
|
|
except sqlite3.IntegrityError as e:
|
|||
|
|
# 处理唯一约束冲突
|
|||
|
|
if "UNIQUE constraint failed: users.xhs_user_id" in str(e):
|
|||
|
|
error_info = f"小红书User码重复"
|
|||
|
|
failed_entries.append((row_num, row, error_info))
|
|||
|
|
print(f"错误: 小红书User码重复: {xhs_user_id} (行 {row_num})")
|
|||
|
|
else:
|
|||
|
|
error_info = f"数据库完整性错误: {e}"
|
|||
|
|
failed_entries.append((row_num, row, error_info))
|
|||
|
|
print(f"插入行时出错 (行 {row_num}): {row}, 错误: {e}")
|
|||
|
|
error_count += 1
|
|||
|
|
except Exception as e:
|
|||
|
|
error_info = f"数据库错误: {e}"
|
|||
|
|
failed_entries.append((row_num, row, error_info))
|
|||
|
|
print(f"插入行时出错 (行 {row_num}): {row}, 错误: {e}")
|
|||
|
|
error_count += 1
|
|||
|
|
else:
|
|||
|
|
error_info = f"列数不足,需要至少4列"
|
|||
|
|
failed_entries.append((row_num, row, error_info))
|
|||
|
|
print(f"警告: 跳过格式不正确的行 {row_num}: {row}")
|
|||
|
|
error_count += 1
|
|||
|
|
|
|||
|
|
# 提交事务
|
|||
|
|
conn.commit()
|
|||
|
|
print(f"\n导入完成! 成功插入: {success_count}, 更新: {update_count}, 跳过: {skip_count}, 失败: {error_count}")
|
|||
|
|
|
|||
|
|
# 如果有失败的条目,输出详细信息
|
|||
|
|
if failed_entries:
|
|||
|
|
print("\n===== 导入失败的记录 =====")
|
|||
|
|
for row_num, row, error in failed_entries:
|
|||
|
|
print(f"行 {row_num}: {row} - 失败原因: {error}")
|
|||
|
|
|
|||
|
|
# 将失败记录保存到文件
|
|||
|
|
failure_file = os.path.join(os.path.dirname(csv_file), "import_failures_5.15.csv")
|
|||
|
|
with open(failure_file, 'w', newline='', encoding='utf-8') as f:
|
|||
|
|
writer = csv.writer(f)
|
|||
|
|
writer.writerow(['行号', '原始数据', '失败原因'])
|
|||
|
|
for row_num, row, error in failed_entries:
|
|||
|
|
writer.writerow([row_num, ','.join(row), error])
|
|||
|
|
|
|||
|
|
print(f"\n失败记录已保存到: {failure_file}")
|
|||
|
|
|
|||
|
|
# 如果有跳过的条目,输出详细信息
|
|||
|
|
if skipped_entries:
|
|||
|
|
print("\n===== 跳过的记录 =====")
|
|||
|
|
for row_num, row, reason in skipped_entries:
|
|||
|
|
print(f"行 {row_num}: {row} - 跳过原因: {reason}")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"读取CSV文件时出错: {e}")
|
|||
|
|
conn.rollback()
|
|||
|
|
return False
|
|||
|
|
finally:
|
|||
|
|
conn.close()
|
|||
|
|
|
|||
|
|
return True
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
if main():
|
|||
|
|
print("用户数据导入成功!")
|
|||
|
|
else:
|
|||
|
|
print("用户数据导入失败!")
|
|||
|
|
sys.exit(1)
|