Web Scraping Guide: From Principles to Practice
Web crawlers are an essential tool in modern data science and artificial intelligence: they automate the collection of data from the internet. This article digs into how crawlers work, the ethical rules they should follow, and how to implement them, giving you the core skills of data acquisition.
1. Crawler Basics: How Web Crawlers Work
What Is a Web Crawler?
A web crawler is an automated program that mimics browser behaviour to fetch and extract data from websites in bulk.
Manual approach vs. crawler approach:
- Manual approach: open a browser → visit the page → copy and paste the data
- Crawler approach: the program sends requests automatically → parses the pages → extracts data in bulk → stores the results automatically
Crawler Workflow
class Spider:
    """Basic crawler workflow:
    1. Send request -> 2. Get response -> 3. Parse data -> 4. Store results
    """

    def __init__(self):
        self.url = ""           # target URL
        self.item_info = []     # parsed data
        self.data_list = []     # raw data

    def get_data(self):
        """Fetch the page data."""
        pass

    def parse_data(self):
        """Parse the data."""
        pass

    def save_data(self):
        """Save the results."""
        pass
2. Crawler Ethics and Legal Compliance
The Robots Protocol
The Robots protocol (robots.txt) is a rules file through which a website tells crawlers which content may be fetched and which may not; a short code sketch for checking it follows the example URLs below.
Some robots.txt files worth looking at:
https://www.taobao.com/robots.txt
https://www.baidu.com/robots.txt
https://www.douban.com/robots.txt
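These rules can also be checked programmatically. Below is a minimal sketch using the standard library's urllib.robotparser; the URL, path, and User-Agent string are illustrative only.

from urllib import robotparser

# Load and parse the site's robots.txt (illustrative URL)
rp = robotparser.RobotFileParser()
rp.set_url("https://www.douban.com/robots.txt")
rp.read()

# Ask whether a hypothetical crawler may fetch a given path
print(rp.can_fetch("MySpider/1.0", "https://www.douban.com/group/"))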
Key principles:
- Respect each site's robots.txt rules
- Throttle your request rate so you do not put pressure on the server
- Do not scrape sensitive or private data
- Respect intellectual property rights
Legitimate Uses of Crawlers
- Search engine indexing
- Data collection for academic research
- Analysis of public data
- Price monitoring (within legal bounds)
3. Web Page Fundamentals
URL Structure
https://movie.douban.com/top250?start=0&filter=
- Protocol (scheme): https
- Domain: movie.douban.com
- Path: /top250
- Query parameters: start=0&filter=
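The same breakdown can be reproduced in code with urllib.parse from the standard library; a minimal sketch:

from urllib.parse import urlparse, parse_qs

url = "https://movie.douban.com/top250?start=0&filter="
parts = urlparse(url)

print(parts.scheme)   # https
print(parts.netloc)   # movie.douban.com
print(parts.path)     # /top250
print(parse_qs(parts.query, keep_blank_values=True))  # {'start': ['0'], 'filter': ['']}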
HTML Basics
HTML is the skeleton of a web page, built from tags:
<!DOCTYPE html>
<html>
<head>
    <title>Page Title</title>
</head>
<body>
    <div class="movie-item">
        <span class="title">Movie Name</span>
        <span class="rating">9.5</span>
    </div>
</body>
</html>
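To preview how a crawler targets these tags and class names, here is a minimal sketch that parses the snippet above with BeautifulSoup (the library itself is covered in section 5):

from bs4 import BeautifulSoup

html = '''
<div class="movie-item">
    <span class="title">Movie Name</span>
    <span class="rating">9.5</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
item = soup.find('div', class_='movie-item')             # locate the container by its class
title = item.find('span', class_='title').get_text()     # "Movie Name"
rating = item.find('span', class_='rating').get_text()   # "9.5"
print(title, rating)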
Using the Developer Tools
Press F12 to open the browser's developer tools:
- Elements: inspect the page's HTML structure
- Network: monitor network requests
- Console: run JavaScript code
4. Fetching Data: The urllib Library
Basic Requests
import urllib.request
import urllib.parse
import urllib.error

def basic_request(url="https://www.baidu.com/"):
    """Send a basic page request and return the HTML."""
    try:
        # Build the request object
        request = urllib.request.Request(url)
        # Send the request and get the response
        response = urllib.request.urlopen(request)
        # Read the response body
        html_data = response.read().decode('utf-8')
        print(f"Status code: {response.getcode()}")
        print(f"Page size: {len(html_data)} characters")
        return html_data
    except urllib.error.HTTPError as e:
        print(f"HTTP error: {e.code} - {e.reason}")
    except urllib.error.URLError as e:
        print(f"URL error: {e.reason}")

# Usage example
html_content = basic_request()
output:
Status code: 200
Page size: 28918 characters
Disguising Request Headers

def advanced_request_with_headers():
    """Advanced request with custom headers."""
    url = "https://movie.douban.com/top250"
    # Build headers that mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive'
    }
    try:
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        # Check the response status
        if response.getcode() == 200:
            return response.read().decode('utf-8')
        else:
            print(f"Request failed, status code: {response.getcode()}")
    except Exception as e:
        print(f"Request exception: {e}")
    return None

# Use the disguised request
html_content = advanced_request_with_headers()
Error Handling

def robust_request(url):
    """Robust request function with full error handling."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request, timeout=10)
        # Check the HTTP status code
        if response.getcode() == 200:
            return response.read().decode('utf-8')
        elif response.getcode() == 403:
            print("Access forbidden; try a different User-Agent")
        elif response.getcode() == 404:
            print("Page not found")
        elif response.getcode() == 418:
            print("Identified as a crawler; better disguise needed")
        else:
            print(f"HTTP error: {response.getcode()}")
    except urllib.error.HTTPError as e:
        print(f"HTTP error: {e.code} - {e.reason}")
    except urllib.error.URLError as e:
        print(f"Network error: {e.reason}")
    except Exception as e:
        print(f"Other error: {e}")
    return None
5. Parsing Data: The BeautifulSoup Library
BeautifulSoup Basics
from bs4 import BeautifulSoup
import re

def parse_html_basic(html_content):
    """Basic HTML parsing."""
    # Create the BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')

    print("=== The Four BeautifulSoup Object Types ===")

    # 1. Tag object
    title_tag = soup.title
    print(f"Tag object: {title_tag}")
    print(f"Tag name: {title_tag.name}")
    print(f"Tag content: {title_tag.string}")

    # 2. NavigableString object
    if title_tag.string:
        print(f"NavigableString: {type(title_tag.string)}")

    # 3. BeautifulSoup object
    print(f"BeautifulSoup object: {type(soup)}")

    # 4. Comment object
    # Comment nodes, if present, show up as the Comment type
    return soup

# Parsing example
soup = parse_html_basic(html_content)
output:
=== The Four BeautifulSoup Object Types ===
Tag object: <title>
豆瓣电影 Top 250
</title>
Tag name: title
Tag content:
豆瓣电影 Top 250
NavigableString: <class 'bs4.element.NavigableString'>
BeautifulSoup object: <class 'bs4.BeautifulSoup'>
Searching and Extracting Data

def advanced_parsing(soup):
    """Advanced parsing techniques."""
    print("\n=== Multiple Search Methods ===")

    # find_all() method
    all_links = soup.find_all('a')
    print(f"Found {len(all_links)} links")

    # Search by class
    movie_items = soup.find_all('div', class_='movie-item')
    print(f"Found {len(movie_items)} movie items")

    # Search by attribute
    images = soup.find_all('img', src=True)
    print(f"Found {len(images)} images")

    # CSS selectors
    titles = soup.select('.title')
    ratings = soup.select('.rating_num')
    print(f"Found {len(titles)} titles")
    print(f"Found {len(ratings)} ratings")

    return titles, ratings

def extract_movie_info(soup):
    """Extract movie information."""
    movies = []

    # Find all movie items
    movie_items = soup.find_all('div', class_='item')

    for item in movie_items:
        movie_info = {}
        try:
            # Title
            title_elem = item.find('span', class_='title')
            if title_elem:
                movie_info['title'] = title_elem.get_text().strip()

            # Rating
            rating_elem = item.find('span', class_='rating_num')
            if rating_elem:
                movie_info['rating'] = rating_elem.get_text().strip()

            # Quote
            quote_elem = item.find('span', class_='inq')
            if quote_elem:
                movie_info['quote'] = quote_elem.get_text().strip()

            # Link
            link_elem = item.find('a')
            if link_elem and link_elem.get('href'):
                movie_info['link'] = link_elem['href']

            if movie_info:  # only keep movies that actually have data
                movies.append(movie_info)
        except Exception as e:
            print(f"Error extracting movie info: {e}")
            continue

    return movies
6. Regular Expressions: Precise Data Extraction
Regular Expression Basics

import re

def regex_demo():
    """Regular expression demo."""
    text = "My phone number is 138-1234-5678 and my email is example@email.com"

    # Match the phone number
    phone_pattern = r'\d{3}-\d{4}-\d{4}'
    phones = re.findall(phone_pattern, text)
    print(f"Phone numbers found: {phones}")

    # Match the email address
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    print(f"Emails found: {emails}")

    # Compile a pattern object (more efficient for repeated use)
    phone_pat = re.compile(phone_pattern)
    result = phone_pat.search(text)
    if result:
        print(f"Phone found via search: {result.group()}")

# Common regular expression symbols
"""
.      matches any character (except newline)
*      matches the preceding character 0 or more times
+      matches the preceding character 1 or more times
?      matches the preceding character 0 or 1 time
\d     matches a digit
\w     matches a letter, digit, or underscore
\s     matches a whitespace character
[abc]  matches a, b, or c
[^abc] matches any character except a, b, or c
^      matches the start of the string
$      matches the end of the string
"""

def create_movie_regex_patterns():
    """Build regex patterns for movie information."""
    patterns = {
        'title': r'<span class="title">(.*?)</span>',
        'rating': r'<span class="rating_num".*?>(.*?)</span>',
        'quote': r'<span class="inq">(.*?)</span>',
        'link': r'<a href="(.*?)".*?>',
        'year': r'(\d{4})',
        'director': r'导演:\s*(.*?)\s*主演'  # matches "Director: ... Starring" in the Douban page text
    }
    return patterns

def parse_with_regex(html_content, patterns):
    """Parse data with regular expressions."""
    movie_data = {}

    for key, pattern in patterns.items():
        matches = re.findall(pattern, html_content, re.DOTALL)
        if matches:
            movie_data[key] = matches[0]  # keep the first match
            print(f"{key}: {matches[0]}")

    return movie_data
7. Complete Crawler in Practice: Douban Movie Top 250
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import time
import json
import re

class DoubanSpider:
    """Crawler for the Douban Movie Top 250."""

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250"
        self.movies = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

    def get_html(self, url):
        """Fetch the HTML of a page."""
        try:
            request = urllib.request.Request(url, headers=self.headers)
            response = urllib.request.urlopen(request, timeout=10)
            if response.getcode() == 200:
                return response.read().decode('utf-8')
            else:
                print(f"Request failed, status code: {response.getcode()}")
                return None
        except Exception as e:
            print(f"Failed to fetch page: {e}")
            return None

    def parse_single_movie(self, item):
        """Parse the information of a single movie."""
        movie = {}
        try:
            # Title
            title_elem = item.find('span', class_='title')
            if title_elem:
                movie['title'] = title_elem.get_text().strip()

            # Rating
            rating_elem = item.find('span', class_='rating_num')
            if rating_elem:
                movie['rating'] = float(rating_elem.get_text().strip())

            # Number of ratings
            eval_elem = item.find('div', class_='star')
            if eval_elem:
                eval_text = eval_elem.get_text()
                people_match = re.search(r'(\d+)人评价', eval_text)  # "人评价" = "people rated" in the page text
                if people_match:
                    movie['evaluation_count'] = int(people_match.group(1))

            # Quote
            quote_elem = item.find('span', class_='inq')
            if quote_elem:
                movie['quote'] = quote_elem.get_text().strip()

            # Detailed info
            info_elem = item.find('div', class_='bd')
            if info_elem:
                info_text = info_elem.get_text()

                # Extract the director ("导演:" = "Director:" in the page text)
                director_match = re.search(r'导演:\s*(.*?)\s*\n', info_text)
                if director_match:
                    movie['director'] = director_match.group(1).strip()

                # Extract year, country, and genre
                year_country_match = re.search(r'(\d{4})\s*/\s*([^/]+?)\s*/\s*([^\n]+)', info_text)
                if year_country_match:
                    movie['year'] = int(year_country_match.group(1))
                    movie['country'] = year_country_match.group(2).strip()
                    movie['genre'] = year_country_match.group(3).strip()

            # Link
            link_elem = item.find('a')
            if link_elem and link_elem.get('href'):
                movie['link'] = link_elem['href']
        except Exception as e:
            print(f"Failed to parse movie info: {e}")

        return movie if movie else None

    def crawl_all_pages(self):
        """Crawl every page."""
        start = 0
        page_size = 25
        total_movies = 250

        while start < total_movies:
            url = f"{self.base_url}?start={start}&filter="
            print(f"Crawling: {url}")

            html_content = self.get_html(url)
            if not html_content:
                print(f"Failed to fetch page {start//25 + 1}")
                break

            soup = BeautifulSoup(html_content, 'html.parser')
            movie_items = soup.find_all('div', class_='item')

            for item in movie_items:
                movie = self.parse_single_movie(item)
                if movie:
                    self.movies.append(movie)

            print(f"Page {start//25 + 1} done, {len(self.movies)} movies collected so far")
            start += page_size
            time.sleep(2)  # polite delay to avoid putting pressure on the server

        print(f"Crawling finished! Collected info on {len(self.movies)} movies")

    def find_movie_by_title(self, title):
        """Find a specific movie by title."""
        for movie in self.movies:
            if title in movie.get('title', ''):
                return movie
        return None

    def save_to_json(self, filename='douban_top250.json'):
        """Save the results as a JSON file."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.movies, f, ensure_ascii=False, indent=2)
        print(f"Data saved to {filename}")

    def save_to_xls(self, filename='douban_top250.xls'):
        """Save the results as an Excel file (requires the xlwt library)."""
        try:
            import xlwt

            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet = workbook.add_sheet('Douban Movie Top 250')

            # Write the header row
            headers = ['Title', 'Rating', 'Rating count', 'Year', 'Country', 'Genre', 'Director', 'Quote', 'Link']
            for col, header in enumerate(headers):
                worksheet.write(0, col, header)

            # Write the data rows
            for row, movie in enumerate(self.movies, 1):
                worksheet.write(row, 0, movie.get('title', ''))
                worksheet.write(row, 1, movie.get('rating', 0))
                worksheet.write(row, 2, movie.get('evaluation_count', 0))
                worksheet.write(row, 3, movie.get('year', ''))
                worksheet.write(row, 4, movie.get('country', ''))
                worksheet.write(row, 5, movie.get('genre', ''))
                worksheet.write(row, 6, movie.get('director', ''))
                worksheet.write(row, 7, movie.get('quote', ''))
                worksheet.write(row, 8, movie.get('link', ''))

            workbook.save(filename)
            print(f"Data saved to {filename}")
        except ImportError:
            print("Please install xlwt first: pip install xlwt")
        except Exception as e:
            print(f"Failed to save Excel file: {e}")

# Using the crawler
def main():
    spider = DoubanSpider()

    # Crawl all the data
    spider.crawl_all_pages()

    # Look up a specific movie
    forest_gump = spider.find_movie_by_title('阿甘正传')
    if forest_gump:
        print("\n=== Forrest Gump (阿甘正传) info ===")
        for key, value in forest_gump.items():
            print(f"{key}: {value}")

    # Save the data
    spider.save_to_json()
    spider.save_to_xls()

if __name__ == "__main__":
    main()
8. Data Storage Techniques
Storing as JSON

import json

def save_json_data(data, filename):
    """Save data in JSON format."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Data saved to {filename}")

def load_json_data(filename):
    """Load data from a JSON file."""
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File {filename} does not exist")
        return []
Storing as Excel

def save_excel_data(data, filename):
    """Save data in Excel format."""
    try:
        import xlwt

        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Data')

        # Write the header row
        if data:
            headers = list(data[0].keys())
            for col, header in enumerate(headers):
                worksheet.write(0, col, header)

            # Write the data rows
            for row, item in enumerate(data, 1):
                for col, key in enumerate(headers):
                    worksheet.write(row, col, str(item.get(key, '')))

        workbook.save(filename)
        print(f"Data saved to {filename}")
    except ImportError:
        print("Please install xlwt: pip install xlwt")
Storing as CSV

import csv

def save_csv_data(data, filename):
    """Save data in CSV format."""
    if not data:
        print("No data to save")
        return

    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {filename}")
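A minimal usage sketch for the three helpers above; the records and filenames are hypothetical:

records = [
    {'title': 'Forrest Gump', 'rating': 9.5},
    {'title': 'The Shawshank Redemption', 'rating': 9.7},
]

save_json_data(records, 'movies.json')    # JSON
save_excel_data(records, 'movies.xls')    # Excel (needs xlwt)
save_csv_data(records, 'movies.csv')      # CSV

print(load_json_data('movies.json'))      # round-trip check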
9. Advanced Techniques and Best Practices
Request Delays and Polite Crawling
import random
import time

class PoliteSpider:
    """A polite crawler that avoids putting pressure on the server."""

    def __init__(self, min_delay=1, max_delay=3):
        self.min_delay = min_delay
        self.max_delay = max_delay

    def get_html(self, url):
        """Fetch a page (reuses the robust_request helper from section 4)."""
        return robust_request(url)

    def polite_request(self, url):
        """Polite request that includes a random delay."""
        # Random delay
        delay = random.uniform(self.min_delay, self.max_delay)
        time.sleep(delay)
        # Send the request
        return self.get_html(url)

    def batch_crawl(self, urls):
        """Crawl a batch of URLs politely."""
        results = []
        for i, url in enumerate(urls, 1):
            print(f"Progress: {i}/{len(urls)}")
            html = self.polite_request(url)
            if html:
                results.append(html)
            else:
                print(f"Skipping URL: {url}")
        return results
Using Proxy IPs

def use_proxy_request(url, proxy_url):
    """Send a request through a proxy IP."""
    proxy_handler = urllib.request.ProxyHandler({'http': proxy_url, 'https': proxy_url})
    opener = urllib.request.build_opener(proxy_handler)
    try:
        response = opener.open(url, timeout=10)
        return response.read().decode('utf-8')
    except Exception as e:
        print(f"Proxy request failed: {e}")
        return None
10. Exercise: Fetching the Forrest Gump (阿甘正传) Entry

def get_forest_gump_info():
    """Fetch the HTML block for Forrest Gump (阿甘正传)."""
    base_url = "https://movie.douban.com/top250"

    # Crawl the first page and look for the movie
    html_content = basic_request(base_url)
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the item whose title contains "阿甘正传"
    movie_items = soup.find_all('div', class_='item')
    for item in movie_items:
        title_elem = item.find('span', class_='title')
        if title_elem and '阿甘正传' in title_elem.get_text():
            print("Found Forrest Gump!")
            print("Full HTML content:")
            print("=" * 50)
            print(item.prettify())
            print("=" * 50)
            return item.prettify()

    print("Forrest Gump not found")
    return None

# Run the exercise
forest_gump_html = get_forest_gump_info()
11. Error Handling and Debugging
Common Errors and Solutions

def debug_spider():
    """Crawler debugging helper."""
    common_issues = {
        'HTTP 403': 'Check the User-Agent and other request headers',
        'HTTP 404': 'Check that the URL is correct',
        'HTTP 418': 'Identified as a crawler; better disguise needed',
        'Encoding error': 'Check the page encoding',
        'Timeout': 'Increase the timeout or check the network connection',
        'SSL error': 'You may need to skip SSL certificate verification'
    }

    print("=== Common Crawler Problems and Solutions ===")
    for issue, solution in common_issues.items():
        print(f"{issue}: {solution}")

def robust_spider_template():
    """Robust crawler template."""
    url = "your target URL here"

    try:
        # request code goes here
        pass
    except urllib.error.HTTPError as e:
        if e.code == 403:
            print("Fix: update the User-Agent and add headers such as Referer")
        elif e.code == 404:
            print("Fix: check that the URL is correct")
        elif e.code == 418:
            print("Fix: use a more realistic browser fingerprint")
    except Exception as e:
        print(f"Other error: {e}")
12. Summary and Best Practices
Crawler Development Principles
- Obey the law: strictly follow robots.txt and the applicable laws and regulations
- Respect the server: throttle request rates so you do not put pressure on the site
- Error handling: build thorough exception-handling mechanisms
- Maintainable code: modular design that is easy to maintain and extend
- Data validation: validate and clean the data you collect
Technical Takeaways
- Data fetching: send HTTP requests with the urllib library
- Data parsing: parse HTML with BeautifulSoup; extract precisely with regular expressions
- Data storage: JSON, Excel, CSV, and other formats
- Anti-scraping countermeasures: header disguises, proxy IPs, request delays
