深入分析大众点评 Ajax 接口:直接请求 JSON 数据高效获取评论
一、Ajax接口分析与逆向工程
1.1 接口识别与分析
大众点评网站采用前后端分离架构,通过Ajax接口动态加载数据。我们可以通过浏览器开发者工具来识别这些接口:
- 打开大众点评网站,进入任意商户页面
- 按F12打开开发者工具,切换到Network(网络)面板
- 刷新页面,筛选XHR或Fetch请求
- 查看请求响应,寻找包含评论数据的JSON接口
通过分析,我们发现大众点评评论数据主要通过以下接口获取:
接口地址如下:
https://www.dianping.com/ajax/json/shopDynamic/allReview?shopId={shopId}&cityId={cityId}&shopType={shopType}&page={page}&pageSize={pageSize}
1.2 接口参数分析
对接口进行深入分析后,我们识别出以下关键参数:
参数名 | 含义 | 示例 |
---|---|---|
shopId | 商户ID | H9dNInbVZA9jJ5qH |
cityId | 城市ID | 1(上海) |
shopType | 商户类型 | 10(美食) |
page | 页码 | 1 |
pageSize | 每页数量 | 20 |
_token | 加密令牌 | 动态生成 |
其中,`_token` 参数是大众点评的反爬机制核心,需要特别处理。
1.3 加密参数逆向分析
大众点评使用JavaScript生成加密参数,我们可以通过以下方式分析:
// 大众点评token生成逻辑(简化版)
function generateToken() {var e = Math.floor((new Date).getTime() / 1e3);return md5("".concat(e).concat("some_secret_key"));
}
实际环境中,我们需要使用Python重现这一加密逻辑。
二、环境准备与依赖安装
2.1 JavaScript环境配置
由于大众点评使用JavaScript生成加密参数,我们需要在Python中执行JS代码:
# 安装Node.js(用于执行JavaScript代码)
# 访问 https://nodejs.org/ 下载并安装
# 安装PyExecJS
pip install pyexecjs
三、核心代码实现
3.1 加密参数生成器
import execjs
import time
import hashlib


class DianpingTokenGenerator:
    """Produces the ``_token`` and ``sign`` parameters required by Dianping's Ajax API.

    The token itself is computed by the site's own JavaScript (loaded from
    ``dianping_encrypt.js`` and executed through PyExecJS), while the request
    signature is a plain MD5 over the sorted query string plus a secret key.
    """

    def __init__(self):
        # Compile the client-side encryption routine once, at construction time.
        with open('dianping_encrypt.js', 'r', encoding='utf-8') as js_file:
            self.ctx = execjs.compile(js_file.read())

    def generate_token(self):
        """Return the ``_token`` value the JS routine derives from the current time."""
        now = int(time.time())
        return self.ctx.call('generateToken', now)

    def generate_signature(self, params):
        """Return an MD5 hex digest over the alphabetically sorted parameters.

        The parameters are joined as ``k=v&k=v`` and suffixed with a shared
        secret before hashing, mirroring the site's request-signing scheme.
        """
        ordered = sorted(params.items(), key=lambda item: item[0])
        joined = '&'.join(f'{key}={value}' for key, value in ordered)
        secret_key = 'dianping_secret_key_2023'
        return hashlib.md5((joined + secret_key).encode('utf-8')).hexdigest()


# Corresponding JavaScript file: dianping_encrypt.js
"""
function generateToken(timestamp) {var e = timestamp.toString();var n = "your_secret_key_here"; // 需要根据实际分析获取return md5(e + n);
}function md5(input) {// 简单的MD5实现,实际中可能需要更完整的实现var crypto = require('crypto');return crypto.createHash('md5').update(input).digest('hex');
}
"""
3.2 接口请求类
import requests
import json
import time
import random
from urllib.parse import urlencode, quote


class DianpingAPI:
    """Client for Dianping's review Ajax endpoint.

    Handles token/signature generation, paginated fetching with polite random
    delays, and flattening of the raw review JSON into plain dictionaries.
    """

    def __init__(self):
        self.session = requests.Session()
        self.token_generator = DianpingTokenGenerator()
        # Browser-like headers to reduce the chance of being blocked.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': 'https://www.dianping.com/',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
        }
        # Optional proxy pool; leave empty to connect directly.
        self.proxies = [
            # add proxy server entries here
        ]

    def get_reviews(self, shop_id, city_id=1, shop_type=10, max_pages=10):
        """Fetch up to ``max_pages`` pages of reviews for a shop.

        Stops early on an empty page or any per-page failure and returns the
        reviews collected so far as a flat list of dicts.
        """
        all_reviews = []
        for page in range(1, max_pages + 1):
            try:
                reviews = self._get_single_page_reviews(shop_id, city_id, shop_type, page)
                if not reviews:
                    break
                all_reviews.extend(reviews)
                print(f"已获取第{page}页评论,共{len(reviews)}条")
                # Random delay between pages to avoid hammering the server.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"获取第{page}页评论失败: {str(e)}")
                break
        return all_reviews

    def _get_single_page_reviews(self, shop_id, city_id, shop_type, page):
        """Fetch and parse one page of reviews; returns [] on any failure."""
        base_params = {
            'shopId': shop_id,
            'cityId': city_id,
            'shopType': shop_type,
            'page': page,
            'pageSize': 20,
            'sort': 0,  # sort order: 0 = default, 1 = newest
            'originUrl': f'/shop/{shop_id}',
            't': int(time.time() * 1000),  # millisecond timestamp
        }
        # Anti-crawl parameters: _token first, then a signature over all params
        # (including the token itself).
        base_params['_token'] = self.token_generator.generate_token()
        base_params['sign'] = self.token_generator.generate_signature(base_params)

        api_url = 'https://www.dianping.com/ajax/json/shopDynamic/allReview'
        url = f"{api_url}?{urlencode(base_params)}"

        # Rotate through the proxy pool if one is configured.
        # NOTE(review): this mutates session-wide proxies; passing proxies per
        # request would be cleaner, but is kept for compatibility.
        if self.proxies:
            proxy = random.choice(self.proxies)
            self.session.proxies = {'http': proxy, 'https': proxy}

        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            if data.get('code') == 200 and data.get('msg') == 'success':
                return self._parse_reviews(data.get('data', {}).get('reviewList', []))
            # FIX: the original f-string contained a raw line break, which is a
            # syntax error; the message is now emitted on one line.
            print(f"接口返回错误: {data.get('msg')}")
            return []
        except requests.exceptions.RequestException as e:
            print(f"请求失败: {str(e)}")
            return []
        except json.JSONDecodeError as e:
            print(f"JSON解析失败: {str(e)}")
            return []

    def _parse_reviews(self, review_list):
        """Flatten raw review JSON objects into plain dictionaries."""
        reviews = []
        for review_data in review_list:
            review = {
                'review_id': review_data.get('reviewId'),
                'user_id': review_data.get('userId'),
                'user_name': review_data.get('userName'),
                'user_level': review_data.get('userLevel'),
                'rating': review_data.get('reviewScore'),
                'rating_text': self._convert_rating_to_text(review_data.get('reviewScore')),
                'content': review_data.get('reviewBody'),
                'publish_time': review_data.get('reviewTime'),
                'publish_timestamp': self._convert_time_to_timestamp(review_data.get('reviewTime')),
                'like_count': review_data.get('likeCount'),
                'view_count': review_data.get('viewCount'),
                'reply_count': review_data.get('replyCount'),
                'pictures': review_data.get('reviewPics', []),
                'merchant_reply': review_data.get('merchantReply'),
                'additional_tags': review_data.get('additionalTags', []),
                'crawl_time': time.strftime('%Y-%m-%d %H:%M:%S'),
            }
            reviews.append(review)
        return reviews

    def _convert_rating_to_text(self, rating):
        """Map Dianping's 10-50 score scale to a human-readable label."""
        rating_map = {
            50: '超赞',
            40: '很好',
            30: '一般',
            20: '较差',
            10: '很差',
        }
        return rating_map.get(rating, '未知')

    def _convert_time_to_timestamp(self, time_str):
        """Parse a review time string into a Unix timestamp; 0 when unparseable."""
        from dateutil import parser
        try:
            dt = parser.parse(time_str)
            return int(dt.timestamp())
        except (ValueError, TypeError, OverflowError):
            # FIX: narrowed from a bare `except:`; only parse/overflow failures
            # are expected here, other errors should surface.
            return 0
3.3 数据存储模块
import json
import csv
import pymongo
from datetime import datetime


class DataStorage:
    """Persists review dictionaries to JSON, CSV, or MongoDB.

    The backend is chosen at construction time via ``storage_type``
    ('json', 'csv', or 'mongo'); Mongo connection settings come from kwargs.
    """

    def __init__(self, storage_type='json', **kwargs):
        self.storage_type = storage_type
        if storage_type == 'mongo':
            self.client = pymongo.MongoClient(kwargs.get('mongo_uri', 'mongodb://localhost:27017/'))
            self.db = self.client[kwargs.get('db_name', 'dianping')]
            self.collection = self.db[kwargs.get('collection_name', 'reviews')]

    def save_reviews(self, reviews, filename=None):
        """Dispatch the review list to the configured storage backend."""
        if self.storage_type == 'json':
            self._save_to_json(reviews, filename)
        elif self.storage_type == 'csv':
            self._save_to_csv(reviews, filename)
        elif self.storage_type == 'mongo':
            self._save_to_mongo(reviews)

    def _save_to_json(self, reviews, filename):
        """Write reviews as pretty-printed UTF-8 JSON, auto-naming by timestamp."""
        if not filename:
            filename = f'dianping_reviews_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(reviews, f, ensure_ascii=False, indent=2)
        # FIX: the message previously printed a literal "(unknown)" placeholder
        # instead of the actual output file name.
        print(f"数据已保存到 {filename}")

    def _save_to_csv(self, reviews, filename):
        """Write reviews as a CSV file whose columns come from the first record."""
        if not filename:
            filename = f'dianping_reviews_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        if reviews:
            fieldnames = reviews[0].keys()
            with open(filename, 'w', encoding='utf-8', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(reviews)
            # FIX: same placeholder bug as the JSON path.
            print(f"数据已保存到 {filename}")

    def _save_to_mongo(self, reviews):
        """Bulk-insert reviews into the configured MongoDB collection."""
        if reviews:
            result = self.collection.insert_many(reviews)
            print(f"已插入 {len(result.inserted_ids)} 条数据到MongoDB")
3.4 完整示例代码
def main():
    """Demo entry point: fetch reviews for one shop, persist them, preview a few."""
    client = DianpingAPI()

    # Target shop parameters.
    shop_id = "H9dNInbVZA9jJ5qH"  # example shop ID
    city_id = 1                   # Shanghai
    shop_type = 10                # food category

    print("开始获取评论数据...")
    reviews = client.get_reviews(shop_id, city_id, shop_type, max_pages=5)
    print(f"共获取到 {len(reviews)} 条评论")

    # Persist to a timestamped JSON file.
    storage = DataStorage(storage_type='json')
    storage.save_reviews(reviews)

    # Print the first few reviews as a sanity check.
    for idx, review in enumerate(reviews[:3], start=1):
        print(f"\n--- 评论 {idx} ---")
        print(f"用户: {review['user_name']} (等级: {review['user_level']})")
        print(f"评分: {review['rating_text']} ({review['rating']})")
        print(f"内容: {review['content'][:100]}...")
        print(f"时间: {review['publish_time']}")
        print(f"点赞数: {review['like_count']}")


if __name__ == "__main__":
    main()
四、高级技巧与优化策略
4.1 并发请求优化
import concurrent.futures
from tqdm import tqdm


class ConcurrentDianpingAPI(DianpingAPI):
    """DianpingAPI variant that fetches review pages with a thread pool."""

    def get_reviews_concurrently(self, shop_id, city_id=1, shop_type=10, max_pages=10, max_workers=5):
        """Fetch up to ``max_pages`` pages in parallel.

        Results are accumulated as futures complete, so page order is not
        preserved. A tqdm bar reports per-page progress.
        """
        all_reviews = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            # One task per page; remember which page each future serves.
            page_by_future = {
                pool.submit(self._get_single_page_reviews, shop_id, city_id, shop_type, page_no): page_no
                for page_no in range(1, max_pages + 1)
            }
            with tqdm(total=len(page_by_future), desc="获取评论") as progress:
                for done in concurrent.futures.as_completed(page_by_future):
                    page = page_by_future[done]
                    try:
                        page_reviews = done.result()
                    except Exception:
                        progress.set_postfix_str(f"第{page}页: 错误")
                    else:
                        if page_reviews:
                            all_reviews.extend(page_reviews)
                            progress.set_postfix_str(f"第{page}页: {len(page_reviews)}条")
                        else:
                            progress.set_postfix_str(f"第{page}页: 无数据")
                    finally:
                        progress.update(1)
        return all_reviews
4.2 错误处理与重试机制
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type


class RobustDianpingAPI(DianpingAPI):
    """DianpingAPI variant whose page fetches retry on transient failures."""

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type((requests.exceptions.RequestException, json.JSONDecodeError)),
    )
    def _get_single_page_reviews_with_retry(self, shop_id, city_id, shop_type, page):
        """Fetch one page via the base implementation under an exponential-backoff retry policy."""
        return self._get_single_page_reviews(shop_id, city_id, shop_type, page)
五、伦理与法律考量
在实施大众点评数据采集时,必须注意以下法律和伦理问题:
- 遵守Robots协议:尊重网站的robots.txt文件规定
- 控制请求频率:避免对目标网站造成过大负担
- 数据使用限制:仅将数据用于合法合规的研究和分析目的
- 用户隐私保护:对采集到的用户信息进行脱敏处理
- 知识产权尊重:不侵犯大众点评的商业秘密和知识产权