当前位置：首页 > news >正文

Python3 正则表达式详解

news 2025/10/21 11:24:42

正则表达式是用于处理字符串的强大工具，Python 通过 re 模块提供完整的正则表达式功能。

1. 基本语法和特殊字符

常用元字符

import re

# . - matches any single character except a newline
text = "cat bat hat rat"
result = re.findall(r".at", text)
print(result)  # ['cat', 'bat', 'hat', 'rat']

# ^ - anchors the match at the start of the string
text = "hello world"
result = re.findall(r"^hello", text)
print(result)  # ['hello']

# $ - anchors the match at the end of the string
text = "hello world"
result = re.findall(r"world$", text)
print(result)  # ['world']

# * - matches the preceding character zero or more times
text = "ct cat caat caaat"
result = re.findall(r"ca*t", text)
print(result)  # ['ct', 'cat', 'caat', 'caaat']

# + - matches the preceding character one or more times
result = re.findall(r"ca+t", text)
print(result)  # ['cat', 'caat', 'caaat']

# ? - matches the preceding character zero or one time
result = re.findall(r"ca?t", text)
print(result)  # ['ct', 'cat']

字符集和范围

# [] - character set: matches any one of the characters inside the brackets
text = "cat bat hat rat"
result = re.findall(r"[bc]at", text)
print(result)  # ['cat', 'bat']

# [^] - negated character set: matches any character NOT listed
result = re.findall(r"[^bc]at", text)
print(result)  # ['hat', 'rat']

# [a-z] - character range
text = "a1 b2 c3 d4 A5 B6"
result = re.findall(r"[a-z]\d", text)
print(result)  # ['a1', 'b2', 'c3', 'd4']

# | - alternation ("or")
text = "cat dog bird"
result = re.findall(r"cat|dog", text)
print(result)  # ['cat', 'dog']

2. 预定义字符集

# \d - decimal digit; equivalent to [0-9] only with re.ASCII, because for
# str patterns it also matches other Unicode decimal digits
text = "电话：123-4567-890"
result = re.findall(r"\d+", text)
print(result)  # ['123', '4567', '890']

# \D - any non-digit character
result = re.findall(r"\D+", text)
print(result)  # ['电话：', '-', '-']

# \w - word character; equivalent to [a-zA-Z0-9_] only with re.ASCII, because
# for str patterns it also matches Unicode letters such as CJK characters
text = "user_name123 @email.com"
result = re.findall(r"\w+", text)
print(result)  # ['user_name123', 'email', 'com']

# \W - any non-word character
result = re.findall(r"\W+", text)
print(result)  # [' @', '.']  (the space and '@' are adjacent, so they form one match)

# \s - whitespace (space, tab, newline, ...)
text = "hello\tworld\npython"
result = re.findall(r"\s+", text)
print(result)  # ['\t', '\n']

# \S - any non-whitespace character
result = re.findall(r"\S+", text)
print(result)  # ['hello', 'world', 'python']

3. 量词和分组

量词

# {n} - matches exactly n times.
# The example below uses the related ? quantifier (zero or one occurrence).
text = "color colour colouur"
result = re.findall(r"colou?r", text)  # ? matches zero or one "u"
print(result)  # ['color', 'colour']

# {n,} - matches at least n times
# {n,m} - matches between n and m times
text = "a aa aaa aaaa aaaaa"
result = re.findall(r"a{2,4}", text)
print(result)  # ['aa', 'aaa', 'aaaa', 'aaaa']  note: 'aaaaa' yields a single 'aaaa'

# Greedy vs. non-greedy matching
text = "<div>content</div><div>more</div>"
# Greedy: .* grabs as much as possible, spanning both <div> elements
greedy = re.findall(r"<div>.*</div>", text)
print("贪婪:", greedy)  # ['<div>content</div><div>more</div>']

# Non-greedy: .*? stops at the first closing tag it can reach
non_greedy = re.findall(r"<div>.*?</div>", text)
print("非贪婪:", non_greedy)  # ['<div>content</div>', '<div>more</div>']

分组

# () - 捕获分组
text = "2023-10-25 2024-01-15"
result = re.findall(r"(\d{4})-(\d{2})-(\d{2})", text)
print(result)  # [('2023', '10', '25'), ('2024', '01', '15')]# 命名分组 (?P<name>...)
text = "姓名: 张三, 年龄: 25"
pattern = r"姓名: (?P<name>\w+), 年龄: (?P<age>\d+)"
match = re.search(pattern, text)
if match:print(f"姓名: {match.group('name')}")  # 姓名: 张三print(f"年龄: {match.group('age')}")   # 年龄: 25# 非捕获分组 (?:...)
text = "hello world hello python"
result = re.findall(r"(?:hello) (world|python)", text)
print(result)  # ['world', 'python']

4. re模块主要函数

re.match() - 从字符串开头匹配

def test_match():
    """Demonstrate that re.match() only matches at the start of the string."""
    text = "hello world"

    # The pattern occurs at position 0, so a match object is returned.
    result = re.match(r"hello", text)
    if result:
        print("match found:", result.group())  # hello

    # "world" is present in the text but not at the beginning, so match()
    # returns None.
    result = re.match(r"world", text)
    if not result:
        print("no match for 'world' at beginning")


test_match()

re.search() - 搜索整个字符串

def test_search():
    """Demonstrate that re.search() finds a match anywhere in the string."""
    text = "hello world"

    # Unlike match(), search() scans the whole string for the first hit.
    result = re.search(r"world", text)
    if result:
        print("search found:", result.group())  # world
        print("position:", result.span())      # (6, 11)


test_search()

re.findall() - 查找所有匹配

def test_findall():
    """Demonstrate re.findall() with and without capturing groups."""
    text = "苹果 10元, 香蕉 5元, 橙子 8元"

    # No groups: findall() returns the full text of every match.
    prices = re.findall(r"\d+元", text)
    print("所有价格:", prices)  # ['10元', '5元', '8元']

    # Two groups: findall() returns a (name, price) tuple per match.
    items = re.findall(r"(\w+) (\d+)元", text)
    print("商品和价格:", items)  # [('苹果', '10'), ('香蕉', '5'), ('橙子', '8')]


test_findall()

re.finditer() - 返回迭代器

def test_finditer():
    """Demonstrate re.finditer(), which yields match objects lazily."""
    text = "苹果 10元, 香蕉 5元, 橙子 8元"

    # Each yielded match object exposes both the groups and their position.
    for match in re.finditer(r"(\w+) (\d+)元", text):
        print(f"商品: {match.group(1)}, 价格: {match.group(2)}元")
        print(f"位置: {match.span()}")


test_finditer()

re.sub() - 替换匹配内容

def test_sub():
    """Demonstrate re.sub() with a template string and with a callable."""
    text = "今天是2023-10-25，明天是2023-10-26"

    # Template replacement: \1, \2, \3 refer to the captured year/month/day.
    result = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"\1年\2月\3日", text)
    print(result)  # 今天是2023年10月25日，明天是2023年10月26日

    # Callable replacement: the function receives each match object and
    # returns the string that should replace it.
    def double_number(match):
        num = int(match.group())
        return str(num * 2)

    text2 = "数字: 1, 2, 3, 4, 5"
    result2 = re.sub(r"\d+", double_number, text2)
    print(result2)  # 数字: 2, 4, 6, 8, 10


test_sub()

re.split() - 根据模式分割字符串

def test_split():
    """Demonstrate re.split() with several delimiters and with a capture group."""
    text = "苹果,香蕉;橙子 葡萄|梨"

    # Any run of comma, semicolon, space or pipe acts as one separator.
    result = re.split(r"[,; \|]+", text)
    print("分割结果:", result)  # ['苹果', '香蕉', '橙子', '葡萄', '梨']

    # Wrapping the separator in a capturing group keeps it in the output.
    # Only ',' and ';' are separators here, so the rest stays in one piece.
    result2 = re.split(r"([,;])", text)
    print("保留分隔符:", result2)  # ['苹果', ',', '香蕉', ';', '橙子 葡萄|梨']


test_split()

5. 编译正则表达式

对于需要重复使用的模式，可以编译以提高效率：

def compiled_regex():
    """Demonstrate reusing one compiled pattern for several operations."""
    # Compile once; \b...\b restricts the match to whole 4-character words.
    pattern = re.compile(r"\b\w{4}\b")
    text = "This is a test string with some words"

    # The compiled pattern object offers the same methods as the re module.
    result1 = pattern.findall(text)
    print("4字母单词:", result1)  # ['This', 'test', 'with', 'some']

    # Every 4-letter word is replaced, including "with" and "some".
    result2 = pattern.sub("****", text)
    print("替换后:", result2)  # **** is a **** string **** **** words


compiled_regex()

6. 标志参数

def regex_flags():
    """Demonstrate the IGNORECASE, MULTILINE and DOTALL flags."""
    text = "Hello\nWorld\nPython"

    # re.IGNORECASE - case-insensitive matching
    result1 = re.findall(r"hello", text, re.IGNORECASE)
    print("忽略大小写:", result1)  # ['Hello']

    # re.MULTILINE - ^ (and $) also match at the start (end) of every line
    result2 = re.findall(r"^.+", text, re.MULTILINE)
    print("多行模式:", result2)  # ['Hello', 'World', 'Python']

    # re.DOTALL - lets . match newlines as well
    result3 = re.findall(r"Hello.*Python", text, re.DOTALL)
    print("DOTALL模式:", result3)  # ['Hello\nWorld\nPython']

    # Flags are combined with the bitwise OR operator
    result4 = re.findall(r"^[a-z]+", text, re.IGNORECASE | re.MULTILINE)
    print("组合标志:", result4)  # ['Hello', 'World', 'Python']


regex_flags()

7. 实际应用示例

邮箱验证

def validate_email(email):
    """Return True when *email* has the shape ``local@domain.tld``.

    This is a simple syntactic check (ASCII letters, digits and a few
    punctuation characters), not a full RFC 5322 validator.

    Args:
        email: The string to check.

    Returns:
        bool: True if the whole string matches the pattern, else False.
    """
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch() rather than match(): "$" on its own also matches just before
    # a trailing newline, which would let "user@example.com\n" pass.
    return bool(re.fullmatch(pattern, email))


# Exercise the validator with valid and invalid samples
emails = [
    "user@example.com",
    "user.name@domain.co.uk",
    "invalid-email",
    "user@domain",
    "user@.com",
]
for email in emails:
    print(f"{email}: {validate_email(email)}")

电话号码提取

def extract_phone_numbers(text):
    """Return every phone number found in *text*, in order of appearance.

    Recognises mobile numbers (11 digits starting with 1[3-9], optionally
    prefixed by "+86") and landline numbers (7-8 digits, optionally
    prefixed by a 3-4 digit area code and a hyphen).

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched numbers exactly as they appear in *text*.
    """
    # re.VERBOSE ignores whitespace and treats "#" as a comment up to the end
    # of the line, so every part of the pattern must sit on its own line.
    pattern = r'''
        (?:\+86\s?)?              # 可选国际码
        (?:1[3-9]\d{9})           # 手机号
        |                          # 或
        (?:\d{3,4}-)?\d{7,8}      # 座机号
    '''
    phones = re.findall(pattern, text, re.VERBOSE)
    return phones


text = """
联系方式：
手机：13812345678，13587654321
电话：010-12345678，021-87654321
国际：+86 13900001111
"""
phones = extract_phone_numbers(text)
print("提取的电话号码:", phones)

URL解析

def parse_url(url):
    """Split an http(s) URL into protocol, domain, port and path.

    Args:
        url: The URL string to parse.

    Returns:
        dict | None: A dict with the string keys ``protocol``, ``domain``,
        ``port`` and ``path``, or None when *url* is not an http/https URL.
        A missing port falls back to the scheme default ('80' for http,
        '443' for https); a missing path falls back to '/'.
    """
    # re.VERBOSE: each component sits on its own line so that the "#"
    # comments do not swallow the rest of the pattern.
    pattern = r'''
        ^(https?)://          # 协议
        ([^/:]+)             # 域名
        (?:[:](\d+))?        # 端口（可选）
        (/.*)?               # 路径（可选）
        $
    '''
    match = re.match(pattern, url, re.VERBOSE)
    if match:
        protocol = match.group(1)
        # The default port depends on the scheme; https uses 443, not 80.
        default_port = '443' if protocol == 'https' else '80'
        return {
            'protocol': protocol,
            'domain': match.group(2),
            'port': match.group(3) or default_port,
            'path': match.group(4) or '/',
        }
    return None


urls = [
    "https://www.example.com",
    "http://localhost:8080/api/v1/users",
    "https://example.com:443/path/to/page",
]
for url in urls:
    parsed = parse_url(url)
    print(f"{url} -> {parsed}")

HTML内容提取

def extract_html_content(html):
    """Pull link targets, image sources and plain text out of an HTML string.

    This is a regex-based illustration for simple, well-formed markup; it is
    not a substitute for a real HTML parser.

    Args:
        html: The HTML source to scan.

    Returns:
        dict: ``links`` (href values of <a> tags), ``images`` (src values of
        <img> tags) and ``text`` (the input with tags removed and whitespace
        runs collapsed to single spaces).
    """
    # href values of all <a> tags
    links = re.findall(r'<a[^>]*href="([^"]*)"[^>]*>', html)
    # src values of all <img> tags
    images = re.findall(r'<img[^>]*src="([^"]*)"[^>]*>', html)
    # Plain text (simple version): drop every tag, then normalise whitespace
    text = re.sub(r'<[^>]+>', '', html)
    text = re.sub(r'\s+', ' ', text).strip()
    return {
        'links': links,
        'images': images,
        'text': text,
    }


html_content = """
<div><h1>标题</h1><p>这是一个<a href="https://example.com">链接</a></p><img src="image.jpg" alt="图片"><p>另一个<a href="/page.html">页面</a></p>
</div>
"""
result = extract_html_content(html_content)
print("HTML解析结果:", result)

8. 调试和测试技巧

def debug_regex():
    """Print everything a match object exposes, as a regex debugging aid."""
    pattern = r"(\d{3})-(\d{4})-(\d{4})"
    text = "电话：010-1234-5678"
    match = re.search(pattern, text)
    if match:
        print("完整匹配:", match.group(0))
        print("分组:", match.groups())
        # groupdict() only contains named groups; this pattern has none,
        # so an empty dict is printed.
        print("分组字典:", match.groupdict())
        print("匹配位置:", match.span())

        # Per-group details: value and position of every capturing group
        print("\n调试信息:")
        for i, group in enumerate(match.groups(), 1):
            print(f"  分组{i}: '{group}' (位置: {match.span(i)})")


debug_regex()

9. 性能优化建议

def optimize_regex():
    """Time a backtracking-prone pattern against an equivalent simple one.

    Prints three timings: the nested-quantifier pattern on a non-matching
    input, the flat pattern on the same input, and 1000 matches using a
    pre-compiled pattern.
    """
    import time

    text = "a" * 1000 + "b"
    # Catastrophic backtracking only shows up when the match FAILS: the
    # engine then tries every way of splitting the run of "a" between the
    # inner and outer "+". The input is kept short because the cost roughly
    # doubles with each extra character.
    failing_text = "a" * 18 + "c"

    # Bad: nested quantifiers -> exponential backtracking on failure
    start = time.perf_counter()
    re.match(r"(a+)+b", failing_text)
    print(f"灾难性回溯耗时: {time.perf_counter() - start:.4f}s")

    # Good: a single quantifier fails quickly on the same input
    start = time.perf_counter()
    re.match(r"a+b", failing_text)
    print(f"优化后耗时: {time.perf_counter() - start:.4f}s")

    # Compile once and reuse the pattern object inside the loop
    pattern = re.compile(r"a+b")
    start = time.perf_counter()
    for _ in range(1000):
        pattern.match(text)
    print(f"编译后1000次匹配: {time.perf_counter() - start:.4f}s")


optimize_regex()

查看全文

http://www.dtcms.com/a/508223.html