#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Base class for hot-topic crawlers."""

import asyncio
import logging
from abc import ABC, abstractmethod
from typing import List, Optional

import aiohttp

from ..models import HotTopic, HotTopicSource

logger = logging.getLogger(__name__)


class BaseCrawler(ABC):
    """Abstract base class for crawlers that return lists of hot topics.

    Subclasses must implement :meth:`fetch`. The base class supplies a
    retry wrapper with exponential backoff, a lazily created
    ``aiohttp.ClientSession`` with default browser-like headers, and
    async context-manager support that closes the session on exit.
    """

    # Class-level defaults; the values here are placeholders for the base class.
    source: HotTopicSource = HotTopicSource.CUSTOM
    name: str = "基础爬虫"

    def __init__(self) -> None:
        """Set up a per-class logger and an empty (not yet created) session."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self._session: Optional[aiohttp.ClientSession] = None

    @abstractmethod
    async def fetch(self) -> List[HotTopic]:
        """Fetch the list of hot topics.

        Returns:
            The list of hot topics.
        """

    async def fetch_with_retry(self, max_retries: int = 3) -> List[HotTopic]:
        """Call :meth:`fetch`, retrying on failure with exponential backoff.

        Args:
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so that ``fetch`` is always tried at least
                once instead of silently returning an empty list.

        Returns:
            The result of the first successful ``fetch`` call, or an empty
            list if every attempt raised.
        """
        attempts = max(1, max_retries)
        last_error: Optional[Exception] = None

        for attempt in range(attempts):
            try:
                return await self.fetch()
            except Exception as exc:  # retry boundary: any fetch error is retried
                last_error = exc
                self.logger.warning(
                    "获取失败 (尝试 %d/%d): %s", attempt + 1, attempts, exc
                )
                # Back off 1s, 2s, 4s, ... but do not sleep after the last attempt.
                if attempt < attempts - 1:
                    await asyncio.sleep(2 ** attempt)

        # Attach the last exception so the traceback is not lost.
        self.logger.error(
            "%s 获取失败,已达最大重试次数", self.name, exc_info=last_error
        )
        return []

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared HTTP session, creating it if missing or closed.

        A new session uses the headers from :meth:`_get_default_headers`
        and a total timeout of 30 seconds.
        """
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(
                headers=self._get_default_headers(),
                timeout=aiohttp.ClientTimeout(total=30),
            )
        return self._session

    async def close(self) -> None:
        """Close the HTTP session if one exists and is still open."""
        if self._session and not self._session.closed:
            await self._session.close()

    def _get_default_headers(self) -> dict:
        """Return the default request headers used for new sessions."""
        return {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }

    async def __aenter__(self):
        """Enter the async context; returns the crawler itself."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave the async context, closing the HTTP session."""
        await self.close()