
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Crawler base class.
"""
import logging
from abc import ABC, abstractmethod
from typing import List, Optional
import aiohttp
import asyncio
from ..models import HotTopic, HotTopicSource
logger = logging.getLogger(__name__)
class BaseCrawler(ABC):
    """Abstract base class for hot-topic crawlers.

    Subclasses implement :meth:`fetch`; this base provides retrying with
    exponential backoff, lazy aiohttp session management, and async
    context-manager support (``async with crawler: ...``).
    """

    # Which source this crawler scrapes; subclasses override.
    source: HotTopicSource = HotTopicSource.CUSTOM
    # Human-readable crawler name, used in log messages; subclasses override.
    name: str = "基础爬虫"

    def __init__(self) -> None:
        # Per-subclass logger so log records identify the concrete crawler.
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        # Created lazily by _get_session(); None until first use.
        self._session: Optional[aiohttp.ClientSession] = None

    @abstractmethod
    async def fetch(self) -> List[HotTopic]:
        """Fetch the list of hot topics from this crawler's source.

        Returns:
            The fetched hot topics. Implementations should raise on
            failure so that :meth:`fetch_with_retry` can retry.
        """

    async def fetch_with_retry(self, max_retries: int = 3) -> List[HotTopic]:
        """Call :meth:`fetch`, retrying with exponential backoff.

        Args:
            max_retries: Maximum number of attempts (expected >= 1).

        Returns:
            The fetched topics, or an empty list if every attempt failed
            (failures are logged, never raised to the caller).
        """
        for attempt in range(max_retries):
            try:
                return await self.fetch()
            except Exception as e:  # broad by design: retry on any fetch error
                self.logger.warning(f"获取失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    # Exponential backoff: 1s, 2s, 4s, ...
                    await asyncio.sleep(2 ** attempt)
        self.logger.error(f"{self.name} 获取失败,已达最大重试次数")
        return []

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, creating a new one if absent or closed."""
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(
                headers=self._get_default_headers(),
                timeout=aiohttp.ClientTimeout(total=30)
            )
        return self._session

    async def close(self) -> None:
        """Close the HTTP session if one is open."""
        if self._session and not self._session.closed:
            await self._session.close()
        # Drop the reference so _get_session() builds a fresh session next time
        # instead of holding on to a closed object.
        self._session = None

    def _get_default_headers(self) -> dict:
        """Return the default browser-like request headers."""
        return {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Ensure the session is released when leaving the async context.
        await self.close()