148 lines
5.2 KiB
Python
Raw Normal View History

2025-12-10 10:07:40 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Bing 热门搜索爬虫
数据源: Bing 热门搜索 API
"""
import json
import logging
import re
from datetime import datetime, timedelta
from typing import List, Optional
from urllib.parse import quote_plus

import aiohttp

from .base import BaseCrawler
from ..models import HotTopic, HotTopicSource, HotTopicCategory
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class BingCrawler(BaseCrawler):
    """Crawler that turns Bing search suggestions into travel hot topics.

    For every seed keyword the crawler queries the Bing suggestion endpoint
    and converts the returned suggestion strings into ``HotTopic`` records
    tagged with the ``TRAVEL`` category.
    """

    source = HotTopicSource.BING
    name = "Bing热搜"

    # Bing API URL for the China market (zh-CN). Not referenced by the
    # methods of this class; kept for callers that may read it.
    API_URL_CN = "https://cn.bing.com/HPImageArchive.aspx?format=js&idx=0&n=1&mkt=zh-CN"

    # Trending-search results page. Not referenced by the methods of this class.
    TRENDING_URL = "https://cn.bing.com/search?q=热门搜索&form=QBLH"

    # Bing search-suggestion API template (JSONP variant with a callback).
    # Kept for backward compatibility; requests are built from
    # ``_SUGGEST_ENDPOINT`` plus query parameters instead.
    SUGGEST_API = "https://api.bing.com/qsonhs.aspx?type=cb&q={keyword}&cb=callback"

    # Default travel-related seed keywords.
    TRAVEL_KEYWORDS = [
        "旅游攻略", "景点推荐", "酒店预订", "机票", "自驾游",
        "周边游", "亲子游", "温泉度假", "滑雪场", "海岛游",
        "三亚旅游", "云南旅游", "成都旅游", "杭州旅游", "西安旅游",
    ]

    # Suggestion endpoint without a query string; parameters are passed
    # separately so that the HTTP client encodes them.
    _SUGGEST_ENDPOINT = "https://api.bing.com/qsonhs.aspx"

    # At most this many seed keywords are queried per fetch() call.
    _MAX_KEYWORDS = 10

    # At most this many suggestions are taken per seed keyword.
    _SUGGESTIONS_PER_KEYWORD = 3

    # Extracts the JSON object from the response text (first "{" to last "}").
    # DOTALL lets the match span line breaks inside the payload.
    _JSON_PAYLOAD_RE = re.compile(r'\{.*\}', re.DOTALL)

    def __init__(self, keywords: Optional[List[str]] = None):
        """Create the crawler.

        Args:
            keywords: Seed keywords to query. When omitted or empty,
                ``TRAVEL_KEYWORDS`` is used.
        """
        super().__init__()
        self.keywords = keywords or self.TRAVEL_KEYWORDS

    async def fetch(self) -> List[HotTopic]:
        """Collect hot topics from the suggestions of the seed keywords.

        Only the first ``_MAX_KEYWORDS`` keywords are queried and only the
        first ``_SUGGESTIONS_PER_KEYWORD`` suggestions of each are used.
        Titles are de-duplicated while collecting (first occurrence wins),
        so ``rank`` is contiguous and ``heat`` decreases by 1000 per topic
        without gaps.

        Returns:
            The unique topics in discovery order. If an error occurs, the
            topics collected up to that point are returned.
        """
        topics: List[HotTopic] = []
        seen_titles = set()
        try:
            session = await self._get_session()
            for keyword in self.keywords[:self._MAX_KEYWORDS]:
                suggestions = await self._get_suggestions(session, keyword)
                for suggestion in suggestions[:self._SUGGESTIONS_PER_KEYWORD]:
                    if suggestion in seen_titles:
                        continue
                    seen_titles.add(suggestion)
                    position = len(topics)
                    topics.append(self._build_topic(
                        suggestion,
                        rank=position + 1,
                        heat=50000 - position * 1000,  # decreasing heat
                        ttl=timedelta(hours=4),
                        tags=[keyword],
                        extra={'source_keyword': keyword, 'engine': 'bing'},
                    ))
            self.logger.info("获取到 %d 条 Bing 搜索建议", len(topics))
        except Exception as e:
            # Best effort: log and fall through to return what was gathered.
            self.logger.error("获取 Bing 热搜失败: %s", e)
        return topics

    async def _get_suggestions(self, session, keyword: str) -> List[str]:
        """Request the search suggestions for ``keyword``.

        Args:
            session: HTTP session object obtained from ``_get_session()``.
            keyword: The text to get suggestions for.

        Returns:
            Suggestion strings that differ from ``keyword``. An empty list
            is returned on a non-200 status or on any error (errors are
            logged at debug level).
        """
        try:
            # Passing the query via ``params`` lets the client percent-encode
            # it, so keywords containing "&", "#", "+" or spaces stay intact.
            async with session.get(
                self._SUGGEST_ENDPOINT,
                params={'type': 'cb', 'q': keyword},
            ) as response:
                if response.status != 200:
                    return []
                text = await response.text()
            return self._parse_suggestions(text, keyword)
        except Exception as e:
            self.logger.debug("获取建议失败 [%s]: %s", keyword, e)
            return []

    @classmethod
    def _parse_suggestions(cls, text: str, keyword: str) -> List[str]:
        """Extract suggestion strings from a suggestion-API response body.

        The expected payload looks like
        ``/* {"AS":{"Query":"xxx","FullResults":1,"Results":[{"Type":"AS","Suggests":[...]}]}} */``.
        Entries that do not have the expected shape are skipped.

        Args:
            text: Raw response text.
            keyword: The queried keyword; suggestions equal to it are dropped.

        Returns:
            The non-empty ``Txt`` values that differ from ``keyword``.

        Raises:
            ValueError: If the extracted payload is not valid JSON.
        """
        match = cls._JSON_PAYLOAD_RE.search(text)
        if match is None:
            return []
        data = json.loads(match.group())
        if not isinstance(data, dict):
            return []
        section = data.get('AS')
        if not isinstance(section, dict):
            return []
        results = section.get('Results')
        if not isinstance(results, list):
            return []

        suggestions = []
        for result in results:
            if not isinstance(result, dict):
                continue
            suggests = result.get('Suggests')
            if not isinstance(suggests, list):
                continue
            for suggest in suggests:
                if not isinstance(suggest, dict):
                    continue
                txt = suggest.get('Txt', '')
                if isinstance(txt, str) and txt and txt != keyword:
                    suggestions.append(txt)
        return suggestions

    def _build_topic(self, title: str, rank: int, heat: int,
                     ttl: timedelta, **fields) -> HotTopic:
        """Build a travel ``HotTopic`` for a suggestion title.

        Args:
            title: Suggestion text; also used as the search query of ``url``.
            rank: 1-based position of the topic.
            heat: Heat score of the topic.
            ttl: Lifetime; ``expires_at`` is ``fetched_at + ttl``.
            **fields: Additional ``HotTopic`` keyword arguments
                (e.g. ``tags``, ``extra``).
        """
        # One timestamp so that expires_at - fetched_at is exactly ``ttl``.
        now = datetime.now()
        return HotTopic(
            title=title,
            source=self.source,
            rank=rank,
            heat=heat,
            category=HotTopicCategory.TRAVEL,
            # Percent-encode the title so spaces and reserved characters do
            # not break the link.
            url=f"https://cn.bing.com/search?q={quote_plus(title)}",
            fetched_at=now,
            expires_at=now + ttl,
            **fields,
        )

    async def search_travel(self, query: str) -> List[HotTopic]:
        """Return hot topics built from the suggestions for ``query``.

        Args:
            query: Search text. An empty or whitespace-only query yields an
                empty list without issuing a request.

        Returns:
            One topic per suggestion, ranked in the order returned by
            ``_get_suggestions``; heat drops by 2000 per rank and is
            floored at 0. On error the topics built so far are returned.
        """
        topics: List[HotTopic] = []
        if not query or not query.strip():
            return topics
        try:
            session = await self._get_session()
            suggestions = await self._get_suggestions(session, query)
            for idx, suggestion in enumerate(suggestions):
                topics.append(self._build_topic(
                    suggestion,
                    rank=idx + 1,
                    heat=max(0, 50000 - idx * 2000),
                    ttl=timedelta(hours=2),
                    extra={'query': query, 'engine': 'bing'},
                ))
        except Exception as e:
            self.logger.error("搜索失败: %s", e)
        return topics