当前位置：首页 > news >正文

Python 字符串处理与正则表达式（之八）

news 2025/10/31 17:42:40

第8章：字符串处理与正则表达式

字符串基础

字符串是由字符组成的序列，在Python中是不可变的。

字符串创建

# Single quotes
str1 = 'Hello'

# Double quotes
str2 = "World"

# Triple quotes (multi-line strings)
str3 = '''这是第一行
这是第二行'''

str4 = """这也是多行字符串
可以包含"双引号"和'单引号'"""

# Escape sequences
str5 = "Hello\nWorld"  # newline
str6 = "Hello\tWorld"  # tab
str7 = "He said, \"Hello\""  # escaped double quote
str8 = 'It\'s a beautiful day'  # escaped single quote

# Raw string (escape sequences are not processed)
raw_str = r"C:\Users\Name\Documents"  # backslashes in the path stay literal

字符串索引与切片

text = "Python Programming"

# Index access
print(text[0])    # Output: P
print(text[5])    # Output: n
print(text[-1])   # Output: g (negative indices count from the end)

# Slicing
print(text[0:6])     # Output: Python
print(text[7:])      # Output: Programming
print(text[:6])      # Output: Python
print(text[::2])     # Output: Pto rgamn (every second character)
print(text[::-1])    # Output: gnimmargorP nohtyP (reversed string)

字符串格式化

Python提供了多种字符串格式化方法。

% 格式化（旧式）

name = "张三"
age = 25
height = 175.5

# Basic formatting
formatted = "姓名：%s，年龄：%d，身高：%.1f" % (name, age, height)
print(formatted)  # Output: 姓名：张三，年龄：25，身高：175.5

# Common format specifiers
# %s - string
# %d - integer
# %f - floating-point number
# %.2f - floating-point number with two decimal places
# %x - hexadecimal
# %o - octal

format() 方法（新式）

name = "张三"
age = 25
height = 175.5

# Positional arguments
formatted = "姓名：{}，年龄：{}，身高：{:.1f}".format(name, age, height)
print(formatted)

# Indexed arguments
formatted = "姓名：{0}，年龄：{1}，身高：{2:.1f}".format(name, age, height)
print(formatted)

# Keyword arguments
formatted = "姓名：{name}，年龄：{age}，身高：{height:.1f}".format(name=name, age=age, height=height)
print(formatted)

# Dictionary unpacked into keyword arguments
person = {"name": "张三", "age": 25, "height": 175.5}
formatted = "姓名：{name}，年龄：{age}，身高：{height:.1f}".format(**person)
print(formatted)

f-string（推荐，Python 3.6+）

name = "张三"
age = 25
height = 175.5

# Basic f-string
formatted = f"姓名：{name}，年龄：{age}，身高：{height:.1f}"
print(formatted)

# Expressions inside the braces are evaluated
x, y = 10, 20
result = f"{x} + {y} = {x + y}"
print(result)  # Output: 10 + 20 = 30


# Function calls inside the braces
def get_greeting():
    """Return the greeting word used by the f-string example below."""
    return "Hello"


message = f"{get_greeting()}, {name}!"
print(message)  # Output: Hello, 张三!

# Format specifiers
pi = 3.14159
print(f"圆周率：{pi:.2f}")     # Output: 圆周率：3.14
print(f"数字：{42:0>5}")       # Output: 数字：00042
print(f"百分比：{0.854:.1%}")  # Output: 百分比：85.4%

字符串方法

Python字符串提供了丰富的内置方法。

大小写转换

text = "Hello, World!"

# Convert to upper case
print(text.upper())  # Output: HELLO, WORLD!

# Convert to lower case
print(text.lower())  # Output: hello, world!

# Capitalize the first character only
print(text.capitalize())  # Output: Hello, world!

# Capitalize the first letter of every word
print(text.title())  # Output: Hello, World!

# Swap upper and lower case
print(text.swapcase())  # Output: hELLO, wORLD!

去除空白字符

text = "  Hello, World!  "

# Strip whitespace from both ends
print(text.strip())   # Output: "Hello, World!"

# Strip whitespace from the left end (trailing spaces remain)
print(text.lstrip())  # Output: "Hello, World!  "

# Strip whitespace from the right end (leading spaces remain)
print(text.rstrip())  # Output: "  Hello, World!"

# Strip specific characters
text2 = "...Hello, World!..."
print(text2.strip("."))  # Output: Hello, World!

查找和替换

text = "Hello, Python! Welcome to Python world!"

# Find a substring
print(text.find("Python"))     # Output: 7 (index of the first occurrence)
print(text.find("Java"))       # Output: -1 (not found)
print(text.rfind("Python"))    # Output: 26 (index of the last occurrence)

# Count occurrences of a substring
print(text.count("Python"))    # Output: 2

# Replace a substring
print(text.replace("Python", "Java"))  # Output: Hello, Java! Welcome to Java world!

# Replace a limited number of occurrences
print(text.replace("Python", "Java", 1))  # replaces only the first occurrence

# Test whether a substring is present
print("Python" in text)        # Output: True
print(text.startswith("Hello"))  # Output: True
print(text.endswith("world!"))   # Output: True

分割和连接

# Splitting strings
text = "apple,banana,orange,grape"

# Split on a separator
fruits = text.split(",")
print(fruits)  # Output: ['apple', 'banana', 'orange', 'grape']

# Split into lines
lines = "第一行\n第二行\n第三行".splitlines()
print(lines)   # Output: ['第一行', '第二行', '第三行']

# Limit the number of splits
text2 = "a-b-c-d-e"
parts = text2.split("-", 2)  # split at most twice
print(parts)   # Output: ['a', 'b', 'c-d-e']

# Joining strings
fruits = ["apple", "banana", "orange"]
result = ",".join(fruits)
print(result)  # Output: apple,banana,orange

# Join with a different separator
result2 = " | ".join(fruits)
print(result2)  # Output: apple | banana | orange

判断方法

# Classifying string content
print("123".isdigit())      # Output: True (all characters are digits)
print("abc".isalpha())      # Output: True (all characters are letters)
print("abc123".isalnum())   # Output: True (all characters are letters or digits)
print("Hello".islower())    # Output: False (not all lower case)
print("HELLO".isupper())    # Output: True (all upper case)
print("Hello World".istitle())  # Output: True (title-cased)
print("   ".isspace())      # Output: True (all whitespace)

# Test whether every character is printable
print("Hello\nWorld".isprintable())  # Output: False (contains a newline)
print("Hello World".isprintable())   # Output: True

对齐和填充

text = "Hello"

# Left-justify
print(text.ljust(10, "*"))  # Output: Hello*****

# Right-justify
print(text.rjust(10, "*"))  # Output: *****Hello

# Center
print(text.center(10, "*"))  # Output: **Hello***

# Zero-fill
print("42".zfill(5))        # Output: 00042

字符串编码

Python 3中字符串默认使用Unicode编码。

编码转换

# Encode a string to bytes
text = "你好，世界"
utf8_bytes = text.encode("utf-8")
print(utf8_bytes)  # Output: b'\xe4\xbd\xa0\xe5\xa5\xbd\xef\xbc\x8c\xe4\xb8\x96\xe7\x95\x8c'

# Decode bytes back to a string
decoded_text = utf8_bytes.decode("utf-8")
print(decoded_text)  # Output: 你好，世界

# Another encoding
gbk_bytes = text.encode("gbk")
print(gbk_bytes)     # Output: b'\xc4\xe3\xba\xc3\xa3\xac\xca\xc0\xbd\xe7'

# Name the same encoding when decoding
decoded_gbk = gbk_bytes.decode("gbk")
print(decoded_gbk)   # Output: 你好，世界

常见编码问题处理

# Handling encoding errors
text = "Hello, 世界"

# Drop characters that cannot be encoded
encoded = text.encode("ascii", errors="ignore")
print(encoded)  # Output: b'Hello, '

# Replace characters that cannot be encoded with "?"
encoded2 = text.encode("ascii", errors="replace")
print(encoded2)  # Output: b'Hello, ??'

# Replace them with XML character references instead
encoded3 = text.encode("ascii", errors="xmlcharrefreplace")
print(encoded3)  # Output: b'Hello, &#19990;&#30028;'

正则表达式基础

正则表达式是一种强大的文本处理工具，用于匹配、查找和替换文本模式。

正则表达式语法

import re

# Basic matching
pattern = r"hello"
text = "hello world"
match = re.search(pattern, text)
if match:
    print("找到匹配：", match.group())  # Output: 找到匹配： hello

# Character classes
pattern = r"[aeiou]"  # matches a vowel
text = "hello"
matches = re.findall(pattern, text)
print(matches)  # Output: ['e', 'o']

# Quantifiers
pattern = r"a+"  # matches one or more "a"
text = "aaabbb"
match = re.search(pattern, text)
if match:
    print("找到匹配：", match.group())  # Output: 找到匹配： aaa

# Anchors
pattern = r"^hello"  # matches "hello" at the start of the line
text = "hello world"
match = re.search(pattern, text)
if match:
    print("找到匹配：", match.group())  # Output: 找到匹配： hello

re模块

Python的re模块提供了正则表达式的支持。

基本函数

import re

text = "我的电话号码是13812345678，邮箱是example@email.com"

# search() - find the first match
phone_pattern = r"1[3-9]\d{9}"
match = re.search(phone_pattern, text)
if match:
    print("电话号码：", match.group())  # Output: 电话号码： 13812345678

# findall() - find every match
# In Python 3, \w also matches CJK characters, so r"\w+@\w+\.\w+" would
# swallow the "邮箱是" prefix. Spell out the ASCII character class instead.
email_pattern = r"[A-Za-z0-9_]+@[A-Za-z0-9_]+\.[A-Za-z0-9_]+"
emails = re.findall(email_pattern, text)
print("邮箱：", emails)  # Output: 邮箱： ['example@email.com']

# finditer() - iterate over match objects
for match in re.finditer(phone_pattern, text):
    print("找到电话号码：", match.group())

# sub() - replace the matched text
masked_text = re.sub(phone_pattern, "***********", text)
print(masked_text)  # Output: 我的电话号码是***********，邮箱是example@email.com

# split() - split a string on a pattern
text2 = "apple,banana;orange:grape"
parts = re.split(r"[,;:]", text2)
print(parts)  # Output: ['apple', 'banana', 'orange', 'grape']

编译正则表达式

import re

# Compile the regular expression once so the pattern object can be reused
phone_pattern = re.compile(r"1[3-9]\d{9}")

text = "联系电话：13812345678，备用：15987654321"
matches = phone_pattern.findall(text)
print(matches)  # Output: ['13812345678', '15987654321']

# Reuse the compiled pattern for further operations
for match in phone_pattern.finditer(text):
    print("电话号码：", match.group())

正则表达式模式

正则表达式提供了丰富的模式匹配语法。

常用模式

import re

# Character classes: (pattern, description) pairs
patterns = [
    (r"[abc]", "匹配a、b或c"),
    (r"[^abc]", "匹配除了a、b、c之外的字符"),
    (r"[a-z]", "匹配小写字母"),
    (r"[A-Z]", "匹配大写字母"),
    (r"[0-9]", "匹配数字"),
    (r"[a-zA-Z0-9]", "匹配字母和数字"),
]

# Quantifiers
quantifiers = [
    (r"a*", "匹配0个或多个a"),
    (r"a+", "匹配1个或多个a"),
    (r"a?", "匹配0个或1个a"),
    (r"a{3}", "匹配恰好3个a"),
    (r"a{3,}", "匹配3个或更多a"),
    (r"a{3,5}", "匹配3到5个a"),
]

# Special sequences
special_chars = [
    (r".", "匹配任意字符（除了换行符）"),
    (r"\d", "匹配数字，等价于[0-9]"),
    (r"\D", "匹配非数字"),
    (r"\w", "匹配字母、数字或下划线"),
    (r"\W", "匹配非字母、数字或下划线"),
    (r"\s", "匹配空白字符"),
    (r"\S", "匹配非空白字符"),
]

# Anchors and word boundaries
boundaries = [
    (r"^hello", "匹配行首的hello"),
    (r"world$", "匹配行尾的world"),
    (r"\bword\b", "匹配完整单词word"),
    (r"\Bword\B", "匹配非单词边界的word"),
]

分组和捕获

import re

# Basic (numbered) groups
text = "张三的电话是13812345678"
pattern = r"(.+)的电话是(1[3-9]\d{9})"
match = re.search(pattern, text)
if match:
    name = match.group(1)
    phone = match.group(2)
    print(f"姓名：{name}，电话：{phone}")
    # Output: 姓名：张三，电话：13812345678

# Named groups
pattern = r"(?P<name>.+)的电话是(?P<phone>1[3-9]\d{9})"
match = re.search(pattern, text)
if match:
    name = match.group("name")
    phone = match.group("phone")
    print(f"姓名：{name}，电话：{phone}")

# Non-capturing group: (?:...) groups without adding to findall()'s result
pattern = r"(?:Mr|Mrs|Ms)\. (\w+)"
text = "Mr. Smith and Mrs. Johnson"
matches = re.findall(pattern, text)
print(matches)  # Output: ['Smith', 'Johnson']

正则表达式修饰符

修饰符可以改变正则表达式的行为。

import re

# re.IGNORECASE (re.I) - case-insensitive matching
pattern = r"hello"
text = "Hello World"
match = re.search(pattern, text, re.IGNORECASE)
if match:
    print("找到匹配：", match.group())  # Output: 找到匹配： Hello

# re.MULTILINE (re.M) - "^" and "$" also match at line boundaries
text = "第一行\n第二行\n第三行"
pattern = r"^第二"
match = re.search(pattern, text, re.MULTILINE)
if match:
    print("找到匹配：", match.group())  # Output: 找到匹配： 第二

# re.DOTALL (re.S) - "." also matches a newline
text = "第一行\n第二行"
pattern = r"第一.*第二"
match = re.search(pattern, text, re.DOTALL)
if match:
    print("找到匹配：", match.group())

# Flags are combined with the bitwise OR operator
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)

实际应用示例

import re# 邮箱验证
def validate_email(email):
    """Return True if *email* has the shape ``local@domain.tld``.

    The local part may contain ASCII letters, digits and ``._%+-``; the domain
    may contain ASCII letters, digits, ``.`` and ``-``; the final label must be
    at least two ASCII letters.

    Args:
        email: The string to check.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # fullmatch() anchors both ends. re.match() with "$" would also accept a
    # string ending in a newline, because "$" matches just before it.
    return re.fullmatch(pattern, email) is not None


print(validate_email("test@example.com"))  # Output: True
print(validate_email("invalid.email"))     # Output: False

# Mobile phone number validation
def validate_phone(phone):
    """Return True if *phone* is an 11-digit number of the form 1[3-9]xxxxxxxxx.

    Args:
        phone: The string to check.

    Returns:
        bool: True when the whole string is "1", a digit 3-9, then nine ASCII
        digits; otherwise False.
    """
    # [0-9] rather than \d: on str patterns \d also matches non-ASCII decimal
    # digits (for example full-width digits). fullmatch() rejects a trailing
    # newline that re.match() with "$" would let through.
    pattern = r"1[3-9][0-9]{9}"
    return re.fullmatch(pattern, phone) is not None


print(validate_phone("13812345678"))  # Output: True
print(validate_phone("12345678901"))  # Output: False

# Password strength check
def check_password_strength(password):
    """Score *password* against five requirements.

    Each requirement is a regular expression searched for in the password:
    minimum length of 8, a lower-case ASCII letter, an upper-case ASCII
    letter, a digit, and one of a fixed set of special characters.

    Args:
        password: The password string to evaluate.

    Returns:
        tuple[int, list[str]]: The number of satisfied requirements (0-5) and
        the descriptions of the unmet ones, in the order they are checked.
    """
    checks = [
        (r".{8,}", "至少8个字符"),
        (r"[a-z]", "包含小写字母"),
        (r"[A-Z]", "包含大写字母"),
        (r"\d", "包含数字"),
        (r"[!@#$%^&*(),.?\":{}|<>]", "包含特殊字符"),
    ]
    missing = [
        requirement
        for pattern, requirement in checks
        if not re.search(pattern, password)
    ]
    # Every check either passes or lands in `missing`.
    strength = len(checks) - len(missing)
    return strength, missing


password = "MyPass123!"
strength, missing = check_password_strength(password)
print(f"密码强度：{strength}/5")
if missing:
    print("缺少：", "、".join(missing))

# Text cleaning and formatting
def clean_text(text):
    """Normalize whitespace and spacing around ASCII punctuation in *text*.

    Only the ASCII punctuation marks ``,.!?;:`` are handled; full-width
    punctuation such as "，" and "。" is left as it is.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned string.
    """
    # Collapse every run of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)
    # Drop leading and trailing whitespace.
    text = text.strip()
    # Remove whitespace in front of punctuation...
    text = re.sub(r"\s+([,.!?;:])", r"\1", text)
    # ...and make sure punctuation is followed by a space.
    text = re.sub(r"([,.!?;:])([^\s])", r"\1 \2", text)
    return text


messy_text = "  这是   一段  很   乱的   文本  ，    需要  清理  。  "
cleaned = clean_text(messy_text)
print(f"原文：{messy_text}")
print(f"清理后：{cleaned}")

# Log analysis
def analyze_log(log_text):
    """Extract IP addresses, timestamps and HTTP status codes from log text.

    Args:
        log_text: Raw log text; the sample below uses one request per line.

    Returns:
        dict: ``{"ips": [...], "timestamps": [...], "statuses": [...]}``, each
        value being the list of strings found, in order of appearance.
    """
    # IP addresses. The repeated group must be non-capturing: with a capturing
    # group, findall() returns only the group's last repetition (e.g. "1.")
    # instead of the whole address.
    ip_pattern = r"(?:\d{1,3}\.){3}\d{1,3}"
    ips = re.findall(ip_pattern, log_text)

    # Timestamps: whatever sits between square brackets.
    timestamp_pattern = r"\[(.*?)\]"
    timestamps = re.findall(timestamp_pattern, log_text)

    # HTTP status codes: three digits right after the closing quote of the
    # request line.
    status_pattern = r'" (\d{3}) '
    statuses = re.findall(status_pattern, log_text)

    return {
        "ips": ips,
        "timestamps": timestamps,
        "statuses": statuses,
    }


# Simulated log data
log_data = '''
192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234
192.168.1.2 - - [25/Dec/2023:10:01:00 +0000] "POST /login HTTP/1.1" 404 567
'''

analysis = analyze_log(log_data)
print("日志分析结果：")
print(f"IP地址：{analysis['ips']}")
print(f"时间戳：{analysis['timestamps']}")
print(f"状态码：{analysis['statuses']}")

性能优化建议

import re
import time# 1. 编译正则表达式以提高性能
# Less efficient: the pattern string is handed to the re module on every call
def validate_email_slow(email):
    """Return True if *email* matches the e-mail pattern given inline."""
    found = re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", email)
    return found is not None


# Better: precompile the pattern
EMAIL_PATTERN = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")


def validate_email_fast(email):
    """Return True if *email* matches the module-level compiled EMAIL_PATTERN."""
    return EMAIL_PATTERN.match(email) is not None


# 2. Use more specific patterns
# Bad practice: the pattern is far too broad
def extract_numbers_slow(text):
    """Return findall() results for ``.*\\d+.*`` over *text*.

    Because of the surrounding ``.*``, each result is a whole line that
    contains a digit, not the number itself.
    """
    too_broad = r".*\d+.*"
    return re.findall(too_broad, text)


# Better: match exactly what is wanted
def extract_numbers_fast(text):
    """Return every run of digits in *text* as a list of strings."""
    digit_runs = re.findall(r"\d+", text)
    return digit_runs


# 3. Avoid backtracking
# Bad practice: nested quantifiers can cause heavy backtracking
def bad_pattern(text):
    """Match *text* against ``^(a+)+$`` and return the match object or None."""
    nested_quantifier = r"^(a+)+$"
    return re.match(nested_quantifier, text)


# Better: avoid nested quantifiers
def good_pattern(text):
    """Match *text* against ``^a+$`` and return the match object or None."""
    flat_quantifier = r"^a+$"
    return re.match(flat_quantifier, text)


# 4. Use raw strings
# Good practice: write the pattern as a raw string
pattern = r"\d+\.\d+"
# Avoid the doubly-escaped equivalent:
# pattern = "\\d+\\.\\d+"