当前位置：首页 > news >正文

Python爬虫第5课：正则表达式与数据清洗技术

news 2025/10/18 16:46:28

- 课程目标
- 1. 正则表达式基础
  - 1.1 基本语法
  - 1.2 字符类和量词
  - 1.3 分组和捕获
- 2. 爬虫中的正则表达式应用
  - 2.1 提取URL链接
  - 2.2 提取邮箱和电话
  - 2.3 提取价格信息
- 3. 数据清洗技术
  - 3.1 文本清理
  - 3.2 数据验证
  - 3.3 数据标准化
- 4. 实战案例：新闻数据清洗
- 5. 实践练习
  - 练习1：邮箱提取器
  - 练习2：价格监控
  - 练习3：数据清洗管道
- 6. 课程小结
- 7. 下节预告
- 8. 作业

专栏导读

🌸 欢迎来到Python办公自动化专栏—Python处理办公问题，解放您的双手

🏳️‍🌈 个人博客主页：请点击——> 个人的博客主页求收藏

🏳️‍🌈 Github主页：请点击——> Github主页求Star⭐

🏳️‍🌈 知乎主页：请点击——> 知乎主页求关注

🏳️‍🌈 CSDN博客主页：请点击——> CSDN的博客主页求关注

👍 该系列文章专栏：请点击——>Python办公自动化专栏求订阅

🕷 此外还有爬虫专栏：请点击——>Python爬虫基础专栏求订阅

📕 此外还有python基础专栏：请点击——>Python基础学习专栏求订阅

文章作者技术和水平有限，如果文中出现错误，希望大家能指正🙏

❤️ 欢迎各位佬关注！ ❤️

课程目标

掌握正则表达式的基本语法和高级用法
学会在爬虫中使用正则表达式提取数据
掌握数据清洗和预处理技术
理解数据质量控制的重要性

1. 正则表达式基础

正则表达式（Regular Expression）是一种强大的文本处理工具，在爬虫中常用于数据提取和清洗。

1.1 基本语法

import re

# Basic matching: re.search returns the first match object, or None when
# the pattern does not occur anywhere in the text.
text = "Hello World 123"
pattern = r"World"
match = re.search(pattern, text)
if match:
    print(f"找到匹配：{match.group()}")

# Metacharacters
# . : any character except a newline
# * : the preceding item zero or more times
# + : the preceding item one or more times
# ? : the preceding item zero or one time
# ^ : start of the string
# $ : end of the string

# Example
text = "abc123def456"
numbers = re.findall(r'\d+', text)  # every run of digits
print(numbers)  # ['123', '456']

letters = re.findall(r'[a-zA-Z]+', text)  # every run of ASCII letters
print(letters)  # ['abc', 'def']

1.2 字符类和量词

# Character classes
# \d : a digit
# \w : a word character (letter, digit or underscore)
# \s : a whitespace character
# \D : anything that is not a digit
# \W : anything that is not a word character
# \S : anything that is not whitespace
text = "Phone: 138-1234-5678, Email: user@example.com"

# Pull the mobile number out of the text.
phone_pattern = r'\d{3}-\d{4}-\d{4}'
phone = re.search(phone_pattern, text)
if phone is not None:
    print(f"手机号：{phone.group()}")

# Pull the e-mail address out of the text.
email_pattern = r'\w+@\w+\.\w+'
email = re.search(email_pattern, text)
if email is not None:
    print(f"邮箱：{email.group()}")

# Quantifiers
# {n}   : exactly n times
# {n,}  : at least n times
# {n,m} : between n and m times
# *     : same as {0,}
# +     : same as {1,}
# ?     : same as {0,1}

1.3 分组和捕获

# Grouping
text = "姓名：张三，年龄：25，电话：138-1234-5678"

# Parentheses create numbered capture groups (group 1, 2, 3, ...).
pattern = r'姓名：(\w+)，年龄：(\d+)，电话：(\d{3}-\d{4}-\d{4})'
match = re.search(pattern, text)
if match:
    name = match.group(1)
    age = match.group(2)
    phone = match.group(3)
    print(f"姓名：{name}, 年龄：{age}, 电话：{phone}")

# Named groups: (?P<name>...) lets the group be fetched by name instead of
# by position.
pattern = r'姓名：(?P<name>\w+)，年龄：(?P<age>\d+)，电话：(?P<phone>\d{3}-\d{4}-\d{4})'
match = re.search(pattern, text)
if match:
    print(f"姓名：{match.group('name')}")
    print(f"年龄：{match.group('age')}")
    print(f"电话：{match.group('phone')}")

2. 爬虫中的正则表达式应用

2.1 提取URL链接

import re
from urllib.parse import urljoin

import requests


def extract_links(url, timeout=10):
    """Download a page and return the http(s) links found in its href attributes.

    Links that already start with ``http://`` or ``https://`` are returned
    unchanged. Every other href (root-relative, document-relative or
    protocol-relative) is resolved against ``url``. Fragment-only links
    (``#section``) and links that resolve to a non-http scheme such as
    ``mailto:`` or ``javascript:`` are skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawler forever.

    Returns:
        A list of absolute URLs in document order (duplicates are kept),
        or an empty list when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"提取链接失败：{e}")
        return []

    # Capture the value of every quoted href attribute.
    link_pattern = r'href=["\']([^"\']+)["\']'

    clean_links = []
    for link in re.findall(link_pattern, response.text):
        if link.startswith(('http://', 'https://')):
            clean_links.append(link)
            continue
        if link.startswith('#'):
            # An in-page anchor does not lead to another document.
            continue
        try:
            absolute = urljoin(url, link)
        except ValueError:
            # urljoin rejects some malformed inputs (e.g. a broken IPv6 host).
            continue
        if absolute.startswith(('http://', 'https://')):
            clean_links.append(absolute)
    return clean_links


# Usage example
links = extract_links('https://example.com')
for link in links[:10]:  # show the first 10 links
    print(link)

2.2 提取邮箱和电话

def extract_contact_info(text):
    """Extract e-mail addresses, mobile numbers and landline numbers from text.

    The patterns use explicit ASCII lookarounds instead of ``\\b``. In
    Python 3 Chinese characters count as word characters, so ``\\b`` never
    matches between a Chinese character and an address and such addresses
    would be missed. Digit lookarounds keep the phone patterns from matching
    a slice of a longer digit run (an ID number, for example).

    Args:
        text: The text to search.

    Returns:
        A dict with the keys ``'emails'``, ``'mobile_phones'`` and
        ``'landlines'``, each mapping to a list of the matched strings.
        Mobile numbers are grouped by pattern (plain, hyphenated, US style),
        not by position in the text.
    """
    # E-mail: the top-level domain is letters only. The original class
    # "[A-Z|a-z]" also accepted a literal "|".
    email_pattern = (
        r'(?<![A-Za-z0-9._%+-])'
        r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
        r'(?![A-Za-z0-9])'
    )
    emails = re.findall(email_pattern, text)

    # Mobile numbers (mainland China) plus one US-style format.
    phone_patterns = [
        r'(?<!\d)1[3-9]\d{9}(?!\d)',           # 11-digit mobile number
        r'(?<!\d)1[3-9]\d-\d{4}-\d{4}(?!\d)',  # mobile number with hyphens
        r'\(\d{3}\)\s?\d{3}-\d{4}',            # US format: (123) 456-7890
    ]
    phones = []
    for pattern in phone_patterns:
        phones.extend(re.findall(pattern, text))

    # Landline: area code starting with 0, a hyphen, then 7 or 8 digits.
    landline_pattern = r'(?<!\d)0\d{2,3}-\d{7,8}(?!\d)'
    landlines = re.findall(landline_pattern, text)

    return {
        'emails': emails,
        'mobile_phones': phones,
        'landlines': landlines,
    }


# Demo
text = """
联系我们：
邮箱：contact@example.com, support@test.org
手机：13812345678, 139-1234-5678
固话：010-12345678, 021-87654321
"""

contact_info = extract_contact_info(text)
print(contact_info)

2.3 提取价格信息

def extract_prices(text):
    """Extract the distinct prices mentioned in text.

    Every pattern shares one amount sub-pattern that understands thousands
    separators and any number of decimals. Without that, ``价格：1,299.00``
    was also reported as ``1.0`` and ``19.5元`` as ``5.0``.

    Args:
        text: The text to search.

    Returns:
        The distinct prices as floats, sorted in ascending order. Note that
        the last pattern is a catch-all, so any bare number in the text is
        reported as a price.
    """
    # An amount: either comma-grouped thousands or plain digits, followed by
    # an optional decimal part.
    amount = r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?'

    price_patterns = [
        rf'[￥¥]\s*({amount})',      # ￥100.00
        rf'\$\s*({amount})',         # $100.00
        rf'({amount})\s*元',         # 100.00元
        rf'价格[：:]\s*({amount})',  # 价格：100.00
        rf'({amount})',              # bare number such as 1,000.00
    ]

    all_prices = []
    for pattern in price_patterns:
        all_prices.extend(re.findall(pattern, text))

    # Drop the thousands separators, convert to float and de-duplicate.
    return sorted({float(price.replace(',', '')) for price in all_prices})


# Demo
text = """
商品价格：￥199.99
原价：$299.00
现价：158元
会员价格：1,299.00
"""

prices = extract_prices(text)
print(f"提取到的价格：{prices}")

3. 数据清洗技术

3.1 文本清理

import re
import html


class TextCleaner:
    """Helpers for turning scraped HTML fragments into plain, normalised text."""

    # Maps the ideographic space and the full-width ASCII block
    # (U+FF01..U+FF5E) to their half-width counterparts for str.translate.
    _FULL_TO_HALF = {0x3000: 0x0020}
    _FULL_TO_HALF.update(
        {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
    )

    def __init__(self):
        # Compile the frequently used patterns once.
        self.html_tag_pattern = re.compile(r'<[^>]+>')
        self.whitespace_pattern = re.compile(r'\s+')
        self.special_chars_pattern = re.compile(r'[^\w\s\u4e00-\u9fff]')
        # With re.ASCII, \w and \s only cover ASCII, so Chinese is removed too.
        self.ascii_special_chars_pattern = re.compile(r'[^\w\s]', re.ASCII)

    def clean_html(self, text):
        """Remove HTML tags and decode HTML entities.

        Tags are stripped before entities are decoded. In the opposite order
        an escaped "&lt;" becomes "<" and everything up to the next ">" is
        then deleted as if it were a tag.
        """
        text = self.html_tag_pattern.sub('', text)
        return html.unescape(text)

    def clean_whitespace(self, text):
        """Collapse every run of whitespace to one space and trim both ends."""
        return self.whitespace_pattern.sub(' ', text).strip()

    def clean_special_chars(self, text, keep_chinese=True):
        """Remove punctuation and other special characters.

        Args:
            text: The text to clean.
            keep_chinese: When true, keep every Unicode word character
                (which includes Chinese) and whitespace. When false, keep
                only ASCII letters, digits, underscore and ASCII whitespace.

        Returns:
            The text with all other characters removed.
        """
        if keep_chinese:
            return self.special_chars_pattern.sub('', text)
        return self.ascii_special_chars_pattern.sub('', text)

    def normalize_text(self, text):
        """Lower-case the text and convert full-width characters to half-width."""
        text = text.lower()
        return self.full_to_half_width(text)

    def full_to_half_width(self, text):
        """Convert full-width characters (and the ideographic space) to half-width."""
        return text.translate(self._FULL_TO_HALF)

    def clean_all(self, text):
        """Run clean_html, clean_whitespace and normalize_text, in that order."""
        text = self.clean_html(text)
        text = self.clean_whitespace(text)
        text = self.normalize_text(text)
        return text


# Usage example
cleaner = TextCleaner()

dirty_text = """
<div class="content"><p>这是一个&nbsp;&nbsp;包含<strong>HTML标签</strong>的文本。</p><p>还有一些特殊字符：！@#￥%……&*（）</p>
</div>
"""

clean_text = cleaner.clean_all(dirty_text)
print(f"清理后的文本：{clean_text}")

3.2 数据验证

import re
from datetime import datetime


class DataValidator:
    """Format checks for the kinds of fields a crawler typically collects.

    Every ``validate_*`` method returns a bool. A value that is not a string
    (``None``, for example) is reported as invalid instead of raising.
    """

    def __init__(self):
        self.email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
        self.phone_pattern = re.compile(r'^1[3-9]\d{9}$')
        self.url_pattern = re.compile(r'^https?://[^\s/$.?#].[^\s]*$')

    def validate_email(self, email):
        """Return True when email is a well-formed e-mail address."""
        # fullmatch, because "$" alone also matches just before a trailing
        # newline and would let "user@example.com\n" through.
        return isinstance(email, str) and bool(self.email_pattern.fullmatch(email))

    def validate_phone(self, phone):
        """Return True when phone is an 11-digit mobile number.

        Spaces and hyphens are ignored, so "138-1234-5678" is accepted.
        """
        if not isinstance(phone, str):
            return False
        clean_phone = re.sub(r'[\s-]', '', phone)
        return bool(self.phone_pattern.fullmatch(clean_phone))

    def validate_url(self, url):
        """Return True when url is an http or https URL without whitespace."""
        return isinstance(url, str) and bool(self.url_pattern.fullmatch(url))

    def validate_date(self, date_str, format='%Y-%m-%d'):
        """Return True when date_str parses as a real date in the given format."""
        try:
            datetime.strptime(date_str, format)
        except (TypeError, ValueError):
            return False
        return True

    def validate_price(self, price_str):
        """Return True when price_str holds a non-negative number.

        Currency symbols and other decoration are ignored. The minus sign is
        kept while cleaning; stripping it (as before) turned "-5" into 5 and
        made the non-negative check always pass.
        """
        if not isinstance(price_str, str):
            return False
        try:
            price = float(re.sub(r'[^\d.\-]', '', price_str))
        except ValueError:
            return False
        return price >= 0

    def validate_data_dict(self, data, rules):
        """Validate several fields at once.

        Args:
            data: Mapping of field name to value.
            rules: Mapping of field name to rule name. The rule names are
                'email', 'phone', 'url' and 'price'. Fields without a rule,
                and unknown rule names, are skipped.

        Returns:
            A list of "<field>: <message>" strings, one per failed field, in
            the iteration order of ``data``. Empty when everything passes.
        """
        checks = {
            'email': (self.validate_email, '邮箱格式不正确'),
            'phone': (self.validate_phone, '手机号格式不正确'),
            'url': (self.validate_url, 'URL格式不正确'),
            'price': (self.validate_price, '价格格式不正确'),
        }

        errors = []
        for field, value in data.items():
            rule = rules.get(field)
            check = checks.get(rule) if isinstance(rule, str) else None
            if check is None:
                continue
            is_valid, message = check
            if not is_valid(value):
                errors.append(f"{field}: {message}")
        return errors


# Usage example
validator = DataValidator()

# Sample data
test_data = {
    'email': 'user@example.com',
    'phone': '138-1234-5678',
    'url': 'https://www.example.com',
    'price': '￥199.99',
}

# Validation rules
validation_rules = {
    'email': 'email',
    'phone': 'phone',
    'url': 'url',
    'price': 'price',
}

errors = validator.validate_data_dict(test_data, validation_rules)
if errors:
    print("验证错误：")
    for error in errors:
        print(f"  - {error}")
else:
    print("所有数据验证通过")

3.3 数据标准化

import re
from datetime import datetime


class DataNormalizer:
    """Convert phone numbers, prices, dates, names and addresses to one canonical form."""

    def __init__(self):
        # Separators commonly found inside a phone number.
        self.phone_pattern = re.compile(r'[\s()-]')
        # Everything that cannot be part of a signed decimal number.
        self.price_pattern = re.compile(r'[^\d.-]')

    def normalize_phone(self, phone):
        """Return the phone number as 11 plain digits, or None if it is not one.

        Whitespace, parentheses and hyphens are removed first. The result
        must then be exactly 11 ASCII digits starting with 1; checking only
        the length let strings such as "1abcdefghij" through.
        """
        clean_phone = self.phone_pattern.sub('', phone)
        if re.fullmatch(r'1[0-9]{10}', clean_phone):
            return clean_phone
        return None

    def normalize_price(self, price_str):
        """Return the price as a float, or None when no number can be read.

        Currency symbols, thousands separators and units are dropped. The
        sign is kept, so "-5" becomes -5.0 rather than 5.0.
        """
        clean_price = self.price_pattern.sub('', price_str)
        try:
            return float(clean_price)
        except ValueError:
            return None

    def normalize_date(self, date_str, default_year=None):
        """Return the date as 'YYYY-MM-DD', or None when no format matches.

        Args:
            date_str: The date in one of the supported formats:
                2024-01-31, 2024/01/31, 31/01/2024, 31-01-2024,
                2024年1月31日, or the year-less form 1月31日.
            default_year: Year to assume for the year-less form. Defaults to
                the current year. (Parsing without a year made strptime fall
                back to 1900, which also rejected 2月29日.)

        Returns:
            The ISO-formatted date string, or None.
        """
        date_str = date_str.strip()

        date_formats = [
            '%Y-%m-%d',
            '%Y/%m/%d',
            '%d/%m/%Y',
            '%d-%m-%Y',
            '%Y年%m月%d日',
        ]
        for fmt in date_formats:
            try:
                return datetime.strptime(date_str, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue

        # Year-less form: prepend the year and parse it as a full date.
        year = datetime.now().year if default_year is None else default_year
        try:
            parsed = datetime.strptime(f"{year:04d}年{date_str}", '%Y年%m月%d日')
        except ValueError:
            return None
        return parsed.strftime('%Y-%m-%d')

    def normalize_name(self, name):
        """Collapse repeated whitespace and title-case the name."""
        name = re.sub(r'\s+', ' ', name.strip())
        return name.title()

    def normalize_address(self, address):
        """Collapse whitespace and put one space after each 省/市/区/县.

        NOTE(review): the split is purely character based, so a word that
        merely contains one of these characters is split as well.
        """
        address = re.sub(r'\s+', ' ', address.strip())
        address = re.sub(r'([省市区县])', r'\1 ', address)
        address = re.sub(r'\s+', ' ', address)
        return address.strip()


# Usage example
normalizer = DataNormalizer()

# Sample data
test_data = [
    {'phone': '138 1234 5678', 'price': '￥199.99', 'date': '2024/01/01'},
    {'phone': '(139) 1234-5678', 'price': '$299.00', 'date': '2024年1月2日'},
    {'phone': '150-1234-5678', 'price': '158元', 'date': '1月3日'},
]

for i, data in enumerate(test_data):
    print(f"数据 {i+1}:")
    print(f"  原始手机号: {data['phone']}")
    print(f"  标准化手机号: {normalizer.normalize_phone(data['phone'])}")
    print(f"  原始价格: {data['price']}")
    print(f"  标准化价格: {normalizer.normalize_price(data['price'])}")
    print(f"  原始日期: {data['date']}")
    print(f"  标准化日期: {normalizer.normalize_date(data['date'])}")
    print()

4. 实战案例：新闻数据清洗

import json
import re
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup


class NewsDataCleaner:
    """Crawl news pages and turn them into cleaned, validated records.

    Relies on the TextCleaner, DataValidator and DataNormalizer classes
    defined in the earlier sections.
    """

    def __init__(self):
        self.text_cleaner = TextCleaner()
        self.validator = DataValidator()
        self.normalizer = DataNormalizer()

        # Patterns for the metadata line of a news article.
        self.author_pattern = re.compile(r'作者[：:]\s*([^\s]+)')
        self.source_pattern = re.compile(r'来源[：:]\s*([^\s]+)')
        self.time_pattern = re.compile(r'(\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?)')

    def crawl_and_clean_news(self, url, timeout=10):
        """Download one news page and return its cleaned record.

        Args:
            url: Address of the article.
            timeout: Seconds to wait for the server, so one stalled site
                cannot block the whole batch.

        Returns:
            The cleaned record (see clean_news_data), or None when the
            download or the processing fails.
        """
        # Deliberately broad: one bad page must not abort a batch run.
        try:
            response = requests.get(url, timeout=timeout)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            raw_data = self.extract_raw_data(soup)
            return self.clean_news_data(raw_data)
        except Exception as e:
            print(f"爬取新闻失败：{e}")
            return None

    def extract_raw_data(self, soup):
        """Pull the raw title, content, metadata text and canonical URL from the page.

        The selectors are only an example and have to be adapted to the
        target site. Anything that is missing is returned as ''.
        """
        title = soup.find('h1', class_='title')
        content = soup.find('div', class_='content')
        meta = soup.find('div', class_='meta')
        canonical = soup.find('link', {'rel': 'canonical'})

        return {
            'title': title.get_text() if title else '',
            'content': content.get_text() if content else '',
            'meta': meta.get_text() if meta else '',
            # .get avoids a KeyError when the <link> tag has no href.
            'url': canonical.get('href', '') if canonical else '',
        }

    def clean_news_data(self, raw_data):
        """Clean one raw record.

        Args:
            raw_data: Dict with the keys 'title', 'content', 'meta' and
                'url', as produced by extract_raw_data.

        Returns:
            Dict with the keys 'title', 'content', 'author', 'source',
            'publish_time', 'url', 'clean_time' and 'content_length'.
            Fields that cannot be determined are ''.
        """
        clean_data = {}

        # Title. NOTE(review): clean_all also lower-cases the text — confirm
        # that this is wanted for titles.
        clean_data['title'] = self.text_cleaner.clean_all(raw_data['title'])

        # Content
        content = self.text_cleaner.clean_html(raw_data['content'])
        content = self.text_cleaner.clean_whitespace(content)
        clean_data['content'] = content

        # Author
        author_match = self.author_pattern.search(raw_data['meta'])
        clean_data['author'] = author_match.group(1) if author_match else ''

        # Source
        source_match = self.source_pattern.search(raw_data['meta'])
        clean_data['source'] = source_match.group(1) if source_match else ''

        # Publication time. normalize_date returns None for a date it cannot
        # parse; store '' so every text field stays a string.
        time_match = self.time_pattern.search(raw_data['meta'])
        if time_match:
            clean_data['publish_time'] = (
                self.normalizer.normalize_date(time_match.group(1)) or ''
            )
        else:
            clean_data['publish_time'] = ''

        # URL
        if self.validator.validate_url(raw_data['url']):
            clean_data['url'] = raw_data['url']
        else:
            clean_data['url'] = ''

        # Time of cleaning
        clean_data['clean_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Length of the cleaned content
        clean_data['content_length'] = len(clean_data['content'])

        return clean_data

    def batch_clean_news(self, urls, delay=1):
        """Crawl and clean several pages, pausing between requests.

        Args:
            urls: Sequence of article URLs.
            delay: Seconds to wait between two requests.

        Returns:
            The cleaned records of the pages that could be processed.
        """
        results = []
        for i, url in enumerate(urls):
            if i and delay > 0:
                # Throttle: wait before every request except the first.
                time.sleep(delay)

            print(f"处理第 {i+1}/{len(urls)} 条新闻...")
            clean_data = self.crawl_and_clean_news(url)
            if clean_data:
                results.append(clean_data)
        return results

    def save_clean_data(self, data_list, filename='clean_news.json'):
        """Write the cleaned records to a UTF-8 JSON file."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data_list, f, ensure_ascii=False, indent=2)
        print(f"清洗后的数据已保存到 {filename}")

    def generate_report(self, data_list):
        """Print a data-quality report: record count, average content length
        and, per field, the percentage of records with a non-empty value."""
        if not data_list:
            print("没有数据可分析")
            return

        total_count = len(data_list)

        # Completeness per field. "or ''" treats a None value as empty
        # instead of failing on None.strip().
        fields = ['title', 'content', 'author', 'source', 'publish_time', 'url']
        completeness = {}
        for field in fields:
            non_empty_count = sum(
                1 for item in data_list if str(item.get(field) or '').strip()
            )
            completeness[field] = non_empty_count / total_count * 100

        # Content length statistics
        content_lengths = [item.get('content_length', 0) for item in data_list]
        avg_length = sum(content_lengths) / len(content_lengths)

        print("数据质量报告")
        print("=" * 50)
        print(f"总记录数：{total_count}")
        print(f"平均内容长度：{avg_length:.0f} 字符")
        print("\n字段完整性：")
        for field, percentage in completeness.items():
            print(f"  {field}: {percentage:.1f}%")


# Usage example
if __name__ == "__main__":
    cleaner = NewsDataCleaner()

    # Example URL list
    news_urls = [
        'https://example-news1.com/article/1',
        'https://example-news2.com/article/2',
        # more URLs...
    ]

    # Clean in batch
    clean_results = cleaner.batch_clean_news(news_urls)

    # Save the result
    cleaner.save_clean_data(clean_results)

    # Print the report
    cleaner.generate_report(clean_results)

5. 实践练习

练习1：邮箱提取器

编写程序从网页中提取所有邮箱地址，并验证其有效性。

练习2：价格监控

创建一个价格提取和标准化工具，处理各种价格格式。

练习3：数据清洗管道

设计一个通用的数据清洗管道，可以处理不同类型的爬虫数据。

6. 课程小结

本课程我们学习了：

正则表达式的基本语法和高级用法
在爬虫中使用正则表达式提取数据
文本清理和标准化技术
数据验证和质量控制
实战案例和最佳实践

7. 下节预告

下一课我们将学习：

处理JavaScript动态内容
Selenium自动化浏览器
模拟用户行为
处理复杂的交互式网站

8. 作业

编写一个通用的联系信息提取器
创建一个数据清洗工具类
实现一个数据质量检查器
练习处理各种格式的日期和时间数据

提示：数据清洗是爬虫项目中非常重要的环节，高质量的数据是后续分析的基础。

结尾

希望对初学者有帮助；致力于办公自动化的小小程序员一枚

希望能得到大家的【❤️一个免费关注❤️】感谢！

求个 🤞 关注 🤞 +❤️ 喜欢 ❤️ +👍 收藏 👍

此外还有办公自动化专栏，欢迎大家订阅：Python办公自动化专栏

此外还有爬虫专栏，欢迎大家订阅：Python爬虫基础专栏

此外还有Python基础专栏，欢迎大家订阅：Python基础学习专栏