当前位置：首页 > news >正文

Python爬虫数据可视化：深度分析贝壳成交价格趋势与分布

news 2025/10/16 6:09:25

一、数据获取：精准捕获目标信息

任何数据分析的基石都是高质量的数据。由于贝壳的成交数据并未提供公开的API，我们需要通过Python爬虫来模拟浏览器行为，从网页中提取结构化信息。

技术栈选择：

**requests**: 用于发送HTTP请求，获取网页源代码。
**BeautifulSoup** 或 **lxml**: 用于解析HTML/XML，提取所需数据。
**pandas**: 用于数据清洗、整理和存储。

实现思路与代码：

贝壳的成交数据通常存在于详情页或通过异步接口加载。为了简化示例，我们假设从一个包含成交列表的页面开始。在实际操作中，你可能需要先爬取列表页获取每个房源的ID，再构造详情页URL或找到背后的Ajax接口。

以下是一个简化的爬虫示例，重点在于数据提取和整理的逻辑：

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random# 代理配置信息
# Proxy endpoint and credentials used for every outgoing request.
# NOTE(review): credentials are hard-coded sample values; in real use they
# should come from the environment or a config file rather than source code.
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"

# Request headers that imitate a real browser, to get past simple anti-bot checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}

# Option 1 (recommended): embed the credentials in the proxy URL.
# The dict KEYS select the scheme of the target URL; the VALUES say how to
# reach the proxy. A forward proxy is contacted over plain HTTP (HTTPS targets
# are tunnelled with CONNECT), so both values use the http:// scheme. Using
# https:// here would make requests try a TLS handshake with the proxy itself.
# NOTE(review): assumes the provider exposes a plain-HTTP proxy port - confirm.
proxies = {
    'http': f'http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}',
    'https': f'http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}',
}

# Option 2 (alternative): pass the credentials through requests' auth parameter.
# proxies = {
#     'http': f'http://{proxyHost}:{proxyPort}',
#     'https': f'http://{proxyHost}:{proxyPort}'
# }


def _node_text(item, tag, css_class, default=None):
    """Return the stripped text of the first matching child of *item*, or *default*."""
    node = item.find(tag, class_=css_class)
    return node.get_text(strip=True) if node else default


def _to_float(text):
    """Parse *text* as a float; return None when it is empty or not numeric."""
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def _parse_deal_item(item):
    """Extract one deal record (a dict) from a single list-item node.

    All CSS selectors below are examples and must be verified against the
    live page with the browser developer tools.
    """
    # "layout | area | ..." style summary string; '' when the node is missing.
    house_info = _node_text(item, 'div', 'houseInfo', default='')
    parts = [part.strip() for part in house_info.split('|')]
    area = parts[1].replace('平米', '').strip() if len(parts) > 1 else ''

    # Total price is shown with a trailing "万" (10,000 yuan) unit marker.
    price_text = _node_text(item, 'div', 'totalPrice')
    total_price = _to_float(price_text.replace('万', '')) if price_text else None

    # Unit price in yuan per square metre; None when either input is
    # missing, non-numeric or zero.
    unit_price = None
    if total_price and area:
        try:
            unit_price = total_price * 10000 / float(area)
        except (ValueError, ZeroDivisionError):
            unit_price = None

    return {
        'district': _node_text(item, 'div', 'district'),
        'layout': parts[0],
        'area': area,
        'total_price': total_price,
        'deal_date': _node_text(item, 'div', 'dealDate'),
        'unit_price': unit_price,
    }


def crawl_beike_deal_data(city_code='sh', max_pages=5):
    """Crawl Beike deal listings page by page and return them as a DataFrame.

    This is an example framework: the URL pattern and the parsing rules must
    be adapted to the real site.

    Args:
        city_code: Sub-domain prefix of the city site, e.g. ``'sh'``.
        max_pages: Number of list pages to request, starting from page 1.

    Returns:
        A ``pandas.DataFrame`` with one row per deal and the columns
        ``district``, ``layout``, ``area``, ``total_price``, ``deal_date``
        and ``unit_price``. Pages that fail, look like an anti-bot page, or
        contain no items are reported on stdout and skipped; the frame is
        empty when nothing could be collected.
    """
    all_deals = []
    for page in range(1, max_pages + 1):
        # Random pause before every request except the first, so failed or
        # blocked pages are throttled as well as successful ones.
        if page > 1:
            time.sleep(random.uniform(2, 5))

        # Example URL; build it from the real city / paging scheme.
        url = f'https://{city_code}.ke.com/chengjiao/pg{page}'
        try:
            # Option 1: credentials are embedded in the proxies dict.
            # For option 2 (auth-based proxy config), additionally pass
            # auth=HTTPProxyAuth(proxyUser, proxyPass) from requests.auth.
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            response.raise_for_status()  # raises on 4xx / 5xx responses
        except requests.RequestException as e:
            print(f"爬取第{page}页时发生错误: {e}")
            # Proxy rotation logic could be added here.
            continue

        # Heuristic anti-bot detection based on marker words in the body.
        if "验证" in response.text or "antispider" in response.text:
            print(f"第{page}页可能触发了反爬机制，请检查代理IP是否有效")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # List items of sold properties (example selector).
        deal_items = soup.find_all('li', class_='cluster_item')
        if not deal_items:
            print(f"第{page}页未找到数据，可能页面结构已更新或IP被限制")
            continue

        all_deals.extend(_parse_deal_item(item) for item in deal_items)
        print(f"第{page}页爬取完成，共{len(deal_items)}条数据。")

    return pd.DataFrame(all_deals)


# Test the proxy connection.
def test_proxy():
    """Check whether the configured proxy can be used.

    Sends one GET request to httpbin's IP echo endpoint through the
    module-level ``proxies`` mapping.

    Returns:
        True when the endpoint answers with HTTP 200 (the reported origin IP
        is printed), False for any other status or when any exception occurs.
    """
    probe_url = "http://httpbin.org/ip"
    try:
        reply = requests.get(probe_url, proxies=proxies, timeout=10)
        if reply.status_code != 200:
            print("代理测试失败")
            return False
        print(f"代理测试成功，当前IP: {reply.json()['origin']}")
        return True
    except Exception as err:
        print(f"代理测试异常: {err}")
        return False


# Test the proxy before running the crawl.
# Entry sequence: verify the proxy, crawl a few pages, persist the result.
if not test_proxy():
    print("代理配置有误，请检查代理信息")
else:
    print("代理配置正确，开始爬取数据...")
    # Small-scale trial run.
    df = crawl_beike_deal_data(max_pages=3)
    if df.empty:
        print("未获取到数据，请检查代理IP或网站结构")
    else:
        # Save to CSV so the data does not have to be crawled again.
        df.to_csv('beike_deal_data.csv', index=False, encoding='utf-8-sig')
        print(f"数据爬取完成，共{len(df)}条记录，已保存到 beike_deal_data.csv")
print("爬虫程序执行完毕。")

重要提示： 贝壳找房拥有严格的反爬虫机制，直接运行上述代码可能无法成功。在实际项目中，你可能需要处理动态加载（使用Selenium或Playwright）、验证码、IP代理等问题。本文的重点在于可视化，因此我们将假设已经成功获取了一份数据并保存为beike_deal_data.csv。

二、数据清洗与准备：为可视化奠定基础

原始数据往往存在缺失值、异常值和格式不一致等问题，直接可视化会产生误导。

# 假设我们从CSV文件加载数据
import pandas as pd
import numpy as np# 加载数据
df = pd.read_csv('beike_deal_data.csv')

# 1. Overview of the raw data.
print("数据概览：")
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(df.head())

# 2. Cleaning.
# Coerce the numeric columns first: anything that is not a number becomes NaN
# and is removed together with the genuinely missing values below.
numeric_cols = ['total_price', 'unit_price', 'area']
df_clean = df.copy()
df_clean[numeric_cols] = df_clean[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Parse the deal date; unparseable values become NaT.
df_clean['deal_date'] = pd.to_datetime(df_clean['deal_date'], errors='coerce')

# Drop rows that miss any field required by the analysis.
df_clean = df_clean.dropna(subset=numeric_cols + ['deal_date'])

# Outliers: keep only unit prices inside the 1.5 * IQR fences.
Q1 = df_clean['unit_price'].quantile(0.25)
Q3 = df_clean['unit_price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
df_clean = df_clean[df_clean['unit_price'].between(lower_bound, upper_bound)].copy()

# 3. Feature engineering: derive year and year-month from the deal date.
df_clean['deal_year'] = df_clean['deal_date'].dt.year
df_clean['deal_year_month'] = df_clean['deal_date'].dt.to_period('M')
print(f"清洗后数据量: {len(df_clean)}")

三、数据可视化：洞察价格趋势与分布

我们将使用matplotlib和seaborn这两个强大的可视化库来创建图表。

1. 价格趋势分析：时间序列下的市场冷暖

目标： 观察成交单价随时间的变化趋势。

import matplotlib.pyplot as plt
import seaborn as sns# 设置中文字体和图形样式
# Apply the seaborn style FIRST: set_style() also rewrites 'font.sans-serif',
# so calling it after the font override would discard the Chinese font and
# the labels would render as missing-glyph boxes.
sns.set_style("whitegrid")
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly with that font

# Average unit price per year-month.
price_trend = df_clean.groupby('deal_year_month')['unit_price'].mean().reset_index()
# Convert Period values to strings so they can be used as x tick labels.
price_trend['deal_year_month'] = price_trend['deal_year_month'].astype(str)

# Draw the trend line.
plt.figure(figsize=(14, 7))
plt.plot(price_trend['deal_year_month'], price_trend['unit_price'], marker='o', linewidth=2, markersize=4)
plt.title('贝壳成交房源平均单价趋势图', fontsize=16, fontweight='bold')
plt.xlabel('成交年月')
plt.ylabel('平均单价 (元/平米)')
plt.xticks(rotation=45)  # rotate x tick labels to avoid overlap
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()  # adjust the layout automatically
plt.show()

分析解读： 通过这条时间序列折线图，我们可以清晰地看到在观测时间段内，该区域房价的整体走势。是平稳上升、急速攀升，还是高位盘整甚至略有回落？这为判断市场周期提供了最直观的依据。

2. 价格分布分析：市场的集中与离散

目标： 了解成交单价主要集中在哪个区间，分布形态如何。

# Distribution of unit prices: histogram with KDE next to a box plot.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 7))

# Left panel: histogram with a kernel density estimate overlay.
sns.histplot(df_clean['unit_price'], kde=True, bins=30, color='skyblue', ax=hist_ax)
hist_ax.set_title('成交单价分布直方图', fontsize=14)
hist_ax.set_xlabel('单价 (元/平米)')
hist_ax.set_ylabel('频数')

# Right panel: box plot of the same column.
sns.boxplot(y=df_clean['unit_price'], color='lightgreen', ax=box_ax)
box_ax.set_title('成交单价箱线图', fontsize=14)
box_ax.set_ylabel('单价 (元/平米)')

fig.tight_layout()
plt.show()

分析解读：

直方图展示了价格的集中趋势。一个明显的单峰表示价格集中在一个主流区间；双峰或多峰则可能暗示市场被分割（例如，老破小和豪宅两个截然不同的市场）。
箱线图则清晰地展示了数据的五个关键统计量：最小值、第一四分位数（Q1）、中位数、第三四分位数（Q3）和最大值。箱体外的点可能是离群值（异常高或低的豪宅/特殊房源）。中位数相较于均值，能更好地避免极端值的影响，反映价格的“典型”水平。

3. 区域对比分析：地理空间上的价格差异

目标： 比较不同行政区的房价水平。

# Assumes the data has a 'district' column holding the administrative district.
# NOTE(review): the crawler sample fills 'district' with the community name -
# confirm the column really holds districts before reading this chart.
# Median unit price per district, highest first.
median_by_district = df_clean.groupby('district')['unit_price'].median()
district_price = median_by_district.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=district_price.values, y=district_price.index, palette='rocket', ax=ax)
ax.set_title('各行政区成交单价中位数对比', fontsize=16)
ax.set_xlabel('单价中位数 (元/平米)')
fig.tight_layout()
plt.show()

分析解读： 条形图是进行类别比较的最佳工具之一。通过这个图表，我们可以一目了然地看出哪个区是“房价高地”，哪个区是“价值洼地”。这种区域差异通常与配套设施、产业布局、学区资源等密切相关。

四、高级可视化：使用PyEcharts打造交互式图表

静态图表虽然清晰，但交互式图表能提供更深入的探索体验。PyEcharts是一个优秀的库，可以生成基于Web的交互式图表。

from pyecharts import options as opts
from pyecharts.charts import Bar, Line, Scatter
from pyecharts.globals import ThemeType# 示例：创建一个交互式的月度价格趋势图（双Y轴，显示均价和成交量）
# Monthly aggregates: mean unit price, and number of deals (count of total_price).
trend_df = (
    df_clean.groupby('deal_year_month')
    .agg({'unit_price': 'mean', 'total_price': 'count'})
    .reset_index()
)
trend_df['deal_year_month'] = trend_df['deal_year_month'].astype(str)

# Shared x axis labels for both series.
month_labels = trend_df['deal_year_month'].tolist()

# Secondary y axis (index 1) for the deal count.
volume_axis = opts.AxisOpts(
    name="成交量",
    type_="value",
    min_=0,
    axislabel_opts=opts.LabelOpts(formatter="{value} 套"),
    name_textstyle_opts=opts.TextStyleOpts(color="#5793f3"),
)

# Primary y axis (index 0) for the average unit price.
price_axis = opts.AxisOpts(
    name="单价",
    type_="value",
    axislabel_opts=opts.LabelOpts(formatter="{value} 元/平米"),
    name_textstyle_opts=opts.TextStyleOpts(color="#d14a61"),
)

# Line series: average unit price on the primary axis.
line = Line(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
line.add_xaxis(month_labels)
line.add_yaxis(
    "平均单价",
    trend_df['unit_price'].round(2).tolist(),
    yaxis_index=0,
    color="#d14a61",
    label_opts=opts.LabelOpts(is_show=False),
)
line.extend_axis(yaxis=volume_axis)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="贝壳成交价格与成交量趋势", pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
    yaxis_opts=price_axis,
)

# Bar series: deal count on the secondary axis.
bar = Bar()
bar.add_xaxis(month_labels)
bar.add_yaxis(
    "成交量",
    trend_df['total_price'].tolist(),
    yaxis_index=1,
    color="#5793f3",
    label_opts=opts.LabelOpts(is_show=False),
)

# Draw the bars on top of the line chart.
line.overlap(bar)
# Render to an HTML file that can be opened and explored in a browser.
line.render("beike_price_volume_trend.html")
print("交互式图表已生成，请打开 'beike_price_volume_trend.html' 文件查看。")