#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Bing trending-search crawler.

Data source: Bing trending search API
"""

import logging
|
|
import re
|
|
from typing import List, Optional
|
|
from datetime import datetime, timedelta
|
|
import aiohttp
|
|
|
|
from .base import BaseCrawler
|
|
from ..models import HotTopic, HotTopicSource, HotTopicCategory
|
|
|
|
# Module-level logger named after this module. Note that the BingCrawler
# methods in this file log through ``self.logger`` rather than this object.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BingCrawler(BaseCrawler):
    """Crawler that turns Bing search suggestions into travel hot topics.

    Each seed keyword is sent to the Bing suggestion endpoint and the
    returned suggestion texts are wrapped in ``HotTopic`` records tagged
    with ``HotTopicCategory.TRAVEL``.
    """

    source = HotTopicSource.BING
    name = "Bing热搜"

    # Bing trending-search API (China region).
    # NOTE: not referenced by any method of this class; kept as a public
    # attribute for backward compatibility.
    API_URL_CN = "https://cn.bing.com/HPImageArchive.aspx?format=js&idx=0&n=1&mkt=zh-CN"
    # Trending-search page (not referenced by any method of this class).
    TRENDING_URL = "https://cn.bing.com/search?q=热门搜索&form=QBLH"

    # Bing search-suggestion API template (can be used to obtain popular
    # terms). The request in ``_get_suggestions`` uses the same endpoint but
    # without the ``cb`` callback parameter.
    SUGGEST_API = "https://api.bing.com/qsonhs.aspx?type=cb&q={keyword}&cb=callback"

    # Preset travel-related seed keywords.
    TRAVEL_KEYWORDS = [
        "旅游攻略", "景点推荐", "酒店预订", "机票", "自驾游",
        "周边游", "亲子游", "温泉度假", "滑雪场", "海岛游",
        "三亚旅游", "云南旅游", "成都旅游", "杭州旅游", "西安旅游",
    ]

    # Limits used by ``fetch``.
    MAX_KEYWORDS = 10            # at most this many seed keywords are queried
    SUGGESTIONS_PER_KEYWORD = 3  # suggestions considered per seed keyword
    # Synthetic heat given to the first topic; later topics get less.
    BASE_HEAT = 50000

    def __init__(self, keywords: Optional[List[str]] = None):
        """Create the crawler.

        Args:
            keywords: Seed keywords to query. ``None`` or an empty list
                falls back to ``TRAVEL_KEYWORDS``.
        """
        super().__init__()
        self.keywords = keywords or self.TRAVEL_KEYWORDS

    @staticmethod
    def _search_url(term: str) -> str:
        """Return the Bing (China) search URL for *term*, percent-encoded."""
        # Local import: keeps this class self-contained without touching the
        # module's import block.
        from urllib.parse import quote

        # safe='' so that '&', '#', '/' etc. inside the term cannot be
        # mistaken for URL syntax.
        return f"https://cn.bing.com/search?q={quote(term, safe='')}"

    def _build_topic(self, title, rank, heat, ttl_hours, extra, tags=None):
        """Wrap one suggestion text in a ``HotTopic``.

        Args:
            title: Suggestion text used as the topic title.
            rank: 1-based position of the topic.
            heat: Synthetic heat value; clamped so it never goes below 0.
            ttl_hours: Hours after ``fetched_at`` at which the topic expires.
            extra: Dict stored in the topic's ``extra`` field.
            tags: Optional tag list; only passed to ``HotTopic`` when given.
        """
        # Take a single timestamp so expires_at is exactly fetched_at + TTL.
        now = datetime.now()
        fields = dict(
            title=title,
            source=self.source,
            rank=rank,
            heat=max(heat, 0),
            category=HotTopicCategory.TRAVEL,
            url=self._search_url(title),
            fetched_at=now,
            expires_at=now + timedelta(hours=ttl_hours),
            extra=extra,
        )
        if tags is not None:
            fields["tags"] = tags
        return HotTopic(**fields)

    async def fetch(self) -> List[HotTopic]:
        """Collect hot topics from the suggestions of the seed keywords.

        Queries up to ``MAX_KEYWORDS`` seed keywords and considers the first
        ``SUGGESTIONS_PER_KEYWORD`` suggestions of each. Titles are
        de-duplicated while collecting (first occurrence wins), so ``rank``
        and ``heat`` are contiguous in the returned list.

        Returns:
            The unique topics gathered. If an error interrupts the run, the
            topics gathered up to that point are returned and the error is
            logged rather than raised.
        """
        topics: List[HotTopic] = []
        seen_titles = set()

        try:
            session = await self._get_session()

            for keyword in self.keywords[:self.MAX_KEYWORDS]:
                suggestions = await self._get_suggestions(session, keyword)

                for suggestion in suggestions[:self.SUGGESTIONS_PER_KEYWORD]:
                    if suggestion in seen_titles:
                        continue
                    seen_titles.add(suggestion)

                    position = len(topics)
                    topics.append(
                        self._build_topic(
                            title=suggestion,
                            rank=position + 1,
                            # Heat decreases by a fixed step per position.
                            heat=self.BASE_HEAT - position * 1000,
                            ttl_hours=4,
                            extra={'source_keyword': keyword, 'engine': 'bing'},
                            tags=[keyword],
                        )
                    )

            self.logger.info(f"获取到 {len(topics)} 条 Bing 搜索建议")

        except Exception as e:
            # Best-effort crawler boundary: log and return what we have.
            self.logger.error(f"获取 Bing 热搜失败: {e}")

        return topics

    async def _get_suggestions(self, session, keyword: str) -> List[str]:
        """Fetch Bing search suggestions for *keyword*.

        Args:
            session: HTTP session whose ``get`` is used as an async context
                manager yielding a response with ``status`` and ``text()``
                (presumably an ``aiohttp.ClientSession`` - confirm against
                ``BaseCrawler._get_session``).
            keyword: Query text; it is percent-encoded before being sent.

        Returns:
            Suggestion texts that are non-empty and differ from *keyword*.
            An empty list is returned for an empty keyword or a non-200
            response; on any other error the suggestions parsed so far are
            returned and the error is logged at debug level.
        """
        import json
        from urllib.parse import quote

        suggestions: List[str] = []
        if not keyword:
            return suggestions

        try:
            url = (
                "https://api.bing.com/qsonhs.aspx?type=cb&q="
                f"{quote(keyword, safe='')}"
            )

            async with session.get(url) as response:
                if response.status != 200:
                    return []
                text = await response.text()

            # Parse the JSONP-style response.
            # Format: /* {"AS":{"Query":"xxx","FullResults":1,"Results":[{"Type":"AS","Suggests":[...]}]}} */
            # DOTALL lets the match span newlines inside the payload.
            match = re.search(r'\{.*\}', text, re.DOTALL)
            if match:
                data = json.loads(match.group())

                # ``or`` guards against explicit JSON nulls.
                results = (data.get('AS') or {}).get('Results') or []
                for result in results:
                    for suggest in result.get('Suggests') or []:
                        txt = suggest.get('Txt', '')
                        if txt and txt != keyword:
                            suggestions.append(txt)

        except Exception as e:
            self.logger.debug(f"获取建议失败 [{keyword}]: {e}")

        return suggestions

    async def search_travel(self, query: str) -> List[HotTopic]:
        """Return hot topics built from the Bing suggestions for *query*.

        Every suggestion becomes one topic, ranked in the order returned by
        ``_get_suggestions``. Heat decreases per position and is clamped at
        0. Errors are logged and the topics built so far are returned.
        """
        topics: List[HotTopic] = []

        try:
            session = await self._get_session()
            suggestions = await self._get_suggestions(session, query)

            for idx, suggestion in enumerate(suggestions):
                topics.append(
                    self._build_topic(
                        title=suggestion,
                        rank=idx + 1,
                        heat=self.BASE_HEAT - idx * 2000,
                        ttl_hours=2,
                        extra={'query': query, 'engine': 'bing'},
                    )
                )

        except Exception as e:
            self.logger.error(f"搜索失败: {e}")

        return topics
|