当前位置：首页 > news >正文

3.Python高级数据结构与文本处理

news 2025/9/10 11:42:28

学习目标：完善Python数据结构知识体系，掌握文本数据的处理方法，建立数据预处理的基础能力

3.1 集合：处理唯一数据的专家

> 什么是集合

想象一下，你在整理书架上的书籍，发现有些书买了两本相同的。你想要去掉重复的，只保留每种书一本。集合（Set）就像一个神奇的书架，它会自动去掉重复的书籍，只保留独一无二的那些。

# Different ways to create a set
favorite_colors = {"红色", "蓝色", "绿色", "红色", "黄色"}
# Prints the four distinct colours, e.g. {'红色', '蓝色', '绿色', '黄色'}.
# Sets are unordered, so the displayed order may differ.
# Note: the duplicate "红色" is dropped automatically.
print(favorite_colors)

# Build a set from a list (duplicates are removed automatically)
fruit_list = ["苹果", "香蕉", "苹果", "橘子", "香蕉", "葡萄"]
fruit_set = set(fruit_list)
print(fruit_set)  # e.g. {'苹果', '香蕉', '橘子', '葡萄'}

# Create an empty set
empty_set = set()  # Note: {} cannot be used here, it creates an empty dict

> 集合的基本操作

# Add elements to a set
my_hobbies = {"读书", "游泳"}
my_hobbies.add("画画")  # add one hobby
print(my_hobbies)  # e.g. {'读书', '游泳', '画画'} (display order may vary)

my_hobbies.add("读书")  # adding a hobby that is already present...
print(my_hobbies)  # ...changes nothing: still the same three hobbies

# Remove an element (raises KeyError if the element is missing)
my_hobbies.remove("游泳")
print(my_hobbies)  # e.g. {'读书', '画画'}

# Safe removal: discard() does not raise when the element is missing
my_hobbies.discard("跑步")

# Membership test
if "读书" in my_hobbies:
    print("我喜欢读书")
else:
    print("我不喜欢读书")

> 集合的实用功能：数据清洗

集合在数据处理中非常有用，特别是需要去除重复数据的时候：

# De-duplicate student exam scores
all_scores = [85, 92, 78, 85, 96, 78, 88, 92, 85]
unique_scores = list(set(all_scores))  # convert the set back to a list
print(f"原始成绩：{all_scores}")
print(f"去重后的成绩：{sorted(unique_scores)}")  # sorted for tidier output

# Count how many distinct hobbies the class has
student_hobbies = ["篮球", "读书", "游戏", "篮球", "音乐", "读书", "绘画", "游戏", "篮球", "音乐"
]
unique_hobbies = set(student_hobbies)
print(f"班级爱好种类数：{len(unique_hobbies)}")
print(f"具体爱好：{unique_hobbies}")

3.2 集合运算：数据分析的利器

> 集合的数学运算

集合支持数学中的集合运算，这在数据分析中非常有用：

# Hobbies of the students in two classes
class_a_hobbies = {"篮球", "读书", "游戏", "音乐"}
class_b_hobbies = {"篮球", "绘画", "音乐", "舞蹈"}

# Intersection: hobbies present in both classes
common_hobbies = class_a_hobbies & class_b_hobbies
print(f"共同爱好：{common_hobbies}")  # e.g. {'篮球', '音乐'}

# Union: every hobby found in either class
all_hobbies = class_a_hobbies | class_b_hobbies
print(f"所有爱好：{all_hobbies}")  # e.g. {'篮球', '读书', '游戏', '音乐', '绘画', '舞蹈'}

# Difference: hobbies class A has but class B does not
only_a_hobbies = class_a_hobbies - class_b_hobbies
print(f"只有A班有的爱好：{only_a_hobbies}")  # e.g. {'读书', '游戏'}

# Symmetric difference: hobbies that appear in exactly one class
unique_to_each = class_a_hobbies ^ class_b_hobbies
print(f"各班独有的爱好：{unique_to_each}")  # e.g. {'读书', '游戏', '绘画', '舞蹈'}

> 实际应用示例：用户行为分析

# Simulated user-behaviour analysis for an e-commerce site
def analyze_user_behavior():
    """Print an activity breakdown of users across three time slots.

    Uses hard-coded morning/afternoon/evening visitor sets and classifies
    users with set algebra: active in all three slots, active in at least
    two slots, and active in exactly one slot. Returns nothing; all results
    are printed.
    """
    # Users who visited the site in each time slot
    morning_users = {"用户A", "用户B", "用户C", "用户D"}
    afternoon_users = {"用户B", "用户C", "用户E", "用户F"}
    evening_users = {"用户A", "用户C", "用户F", "用户G"}

    # Every user seen in any slot; reused below for the "one slot only" group
    total_users = morning_users | afternoon_users | evening_users

    print("用户活跃度分析")
    print("-" * 30)

    # Active in all three slots
    very_active = morning_users & afternoon_users & evening_users
    print(f"超级活跃用户：{very_active}")

    # Active in at least two slots: union of the pairwise intersections
    quite_active = (
        (morning_users & afternoon_users)
        | (morning_users & evening_users)
        | (afternoon_users & evening_users)
    )
    print(f"比较活跃用户：{quite_active}")

    # Active in exactly one slot: everyone minus the "two or more" group
    less_active = total_users - quite_active
    print(f"较少活跃用户：{less_active}")

    # Overall totals
    print(f"总用户数：{len(total_users)}")
    print(f"所有用户：{total_users}")


# Run the analysis
analyze_user_behavior()

3.3 字符串处理进阶：文本数据的艺术

> 常用字符串方法

字符串是文本数据处理的基础，Python提供了丰富的字符串处理方法：

# Basic string handling
user_input = "  Hello Python World!  "

# Clean up the string
clean_text = user_input.strip()  # remove leading and trailing whitespace
print(f"去空格：'{clean_text}'")

lower_text = clean_text.lower()  # lower case
upper_text = clean_text.upper()  # upper case
title_text = clean_text.title()  # title case (first letter of each word capitalised)

print(f"小写：{lower_text}")
print(f"大写：{upper_text}")
print(f"标题：{title_text}")

# Searching and replacing
text = "Python是一门编程语言，Python很好学"

# Find a substring
position = text.find("Python")  # index of the first "Python" (-1 if absent)
print(f"第一个'Python'在位置：{position}")

count = text.count("Python")  # number of occurrences of "Python"
print(f"'Python'出现了{count}次")

# Replace a substring (every occurrence is replaced)
new_text = text.replace("Python", "编程")
print(f"替换后：{new_text}")

> 字符串分割和连接

# Splitting strings: handy for CSV-like data and text parsing
student_info = "张三,20,计算机,北京"
parts = student_info.split(",")  # split on commas
print(f"分割结果：{parts}")

# Split multi-line text
poem = """床前明月光，
疑是地上霜。
举头望明月，
低头思故乡。"""

lines = poem.split("\n")  # split into lines
print(f"诗句行数：{len(lines)}")
for i, line in enumerate(lines, 1):
    print(f"第{i}行：{line}")

# Joining strings
words = ["Python", "是", "最好的", "编程语言"]
sentence = " ".join(words)  # join with single spaces
print(f"连接结果：{sentence}")

# Joining the parts of a file path
path_parts = ["home", "user", "documents", "python", "project.py"]
file_path = "/".join(path_parts)
print(f"文件路径：{file_path}")

> 字符串格式化：让输出更美观

from datetime import datetime

# f-strings are the formatting style recommended for modern Python
name = "小明"
age = 20
score = 85.67

# f-string formatting (recommended)
info = f"学生{name}今年{age}岁，考试得了{score:.1f}分"
print(info)

# Number formatting
price = 123.456
print(f"价格：￥{price:.2f}")  # keep two decimal places
print(f"价格：￥{price:,.2f}")  # thousands separator (only visible from 1,000 upwards)

# Date and time formatting
now = datetime.now()
print(f"当前时间：{now:%Y年%m月%d日 %H:%M:%S}")

# String alignment
items = ["苹果", "香蕉", "橘子"]
prices = [3.5, 2.8, 4.2]

print("商品价目表")
print("-" * 20)
# A distinct loop variable keeps the module-level `price` above intact.
for item, unit_price in zip(items, prices):
    print(f"{item:<6} ￥{unit_price:>6.2f}")  # name left-aligned, price right-aligned

3.4 文件操作：数据持久化的基础

> 文件读取：获取外部数据

文件操作让我们能够处理存储在硬盘上的数据：

# Create a sample text file (in a real application this file would already exist)
def create_sample_file():
    """Write the demo file ``student_scores.txt`` to the current directory.

    The file holds five ``name,score,subject`` records, one per line, with
    no trailing newline, encoded as UTF-8. An existing file is overwritten.
    """
    records = [
        "张三,85,数学",
        "李四,92,物理",
        "王五,78,化学",
        "赵六,96,数学",
        "钱七,88,物理",
    ]
    with open("student_scores.txt", "w", encoding="utf-8") as file:
        file.write("\n".join(records))
    print("示例文件已创建")


# Different ways to read a file
def read_file_examples():
    """Demonstrate three ways of reading ``student_scores.txt``.

    Reads the file (UTF-8, from the current directory) three times: all at
    once with ``read()``, line by line by iterating the file object, and
    into a list with ``readlines()``. Everything is printed; nothing is
    returned.

    Raises:
        FileNotFoundError: if ``student_scores.txt`` does not exist.
    """
    separator = "\n" + "-"*30 + "\n"

    # Method 1: read the whole file in one call
    with open("student_scores.txt", "r", encoding="utf-8") as file:
        content = file.read()
    print("整个文件内容：")
    print(content)

    print(separator)

    # Method 2: iterate over the file line by line
    with open("student_scores.txt", "r", encoding="utf-8") as file:
        print("逐行读取：")
        for line_number, line in enumerate(file, 1):
            # strip() drops the trailing newline (and any other surrounding whitespace)
            line = line.strip()
            print(f"第{line_number}行：{line}")

    print(separator)

    # Method 3: read all lines into a list
    with open("student_scores.txt", "r", encoding="utf-8") as file:
        lines = file.readlines()
    print("所有行的列表：")
    for line in lines:
        print(repr(line))  # repr shows the raw string, including the \n
# Create the sample file first, then demonstrate reading it back.
create_sample_file()
read_file_examples()

> 文件写入：保存数据

def write_file_examples():
    """Demonstrate overwriting and appending to ``processed_scores.txt``.

    First writes a score report (header, one line per student, average) in
    ``"w"`` mode, which replaces any existing file, then appends a short
    block of extra information in ``"a"`` mode. The file is written as
    UTF-8 in the current directory. Returns nothing.
    """
    # Method 1: write a new file (overwrites an existing one)
    students = [
        {"name": "小明", "score": 85},
        {"name": "小红", "score": 92},
        {"name": "小刚", "score": 78},
    ]
    average = sum(student["score"] for student in students) / len(students)

    with open("processed_scores.txt", "w", encoding="utf-8") as file:
        file.write("学生成绩报告\n")
        file.write("="*20 + "\n")
        for student in students:
            file.write(f"{student['name']}：{student['score']}分\n")
        file.write(f"\n平均分：{average:.1f}分\n")
    print("成绩报告已写入processed_scores.txt")

    # Method 2: append to the end of the file
    with open("processed_scores.txt", "a", encoding="utf-8") as file:
        file.write("\n补充信息：\n")
        file.write("本次考试难度：中等\n")
        file.write("及格率：100%\n")
    print("补充信息已追加到文件")


# Demonstrate writing the file
write_file_examples()

# Verify what was written
print("\n文件写入结果：")
with open("processed_scores.txt", "r", encoding="utf-8") as file:
    print(file.read())

> 文件操作的安全实践

import os


def safe_file_operations():
    """Demonstrate defensive file handling on ``important_data.txt``.

    If the file exists in the current directory, its size is reported;
    otherwise it is created with one line of text. The file is then read
    back inside a ``try`` block so that a missing file, a permission problem
    or any other I/O or decoding error is reported instead of propagating.
    Returns nothing.
    """
    filename = "important_data.txt"

    # Check whether the file exists
    if os.path.exists(filename):
        print(f"文件{filename}存在")
        # Report basic file information
        file_size = os.path.getsize(filename)
        print(f"文件大小：{file_size}字节")
    else:
        print(f"文件{filename}不存在，创建新文件")
        with open(filename, "w", encoding="utf-8") as file:
            file.write("这是重要数据\n")

    # Read the file safely, handling the errors that can realistically occur
    try:
        with open(filename, "r", encoding="utf-8") as file:
            content = file.read()
    except FileNotFoundError:
        print(f"错误：找不到文件{filename}")
    except PermissionError:
        print(f"错误：没有权限访问文件{filename}")
    except (OSError, UnicodeDecodeError) as e:
        # Remaining I/O failures and undecodable bytes
        print(f"读取文件时发生错误：{e}")
    else:
        print(f"文件内容：{content}")


# Demonstrate the safe operations
safe_file_operations()

3.5 数据结构综合对比与选择指南

> 四种数据结构的全面对比

数据结构	有序性	可变性	重复元素	访问方式	主要用途	性能特点
列表	有序	可变	允许重复	索引访问	存储序列数据	查找慢O(n)
字典	无序*	可变	键唯一，值可重复	键名访问	键值对映射	查找快O(1)
集合	无序	可变	不允许重复	成员测试	去重、集合运算	查找快O(1)
元组	有序	不可变	允许重复	索引访问	不变数据组合	内存效率高

*注：Python 3.7+字典保持插入顺序

def demonstrate_data_structures():
    """Show a typical use case for list, dict, set and tuple.

    Runs four hard-coded scenarios and prints the results: order-preserving
    de-duplication of a list, lookup by key in a dict, set algebra on course
    enrolments, and path length over a list of coordinate tuples. Returns
    nothing.
    """
    section_break = "\n" + "-"*30 + "\n"

    print("数据结构使用场景演示")
    print("="*40)

    # Scenario 1: student roster (ordered, may contain duplicate entries)
    print("1. 学生名单管理 - 使用列表")
    student_list = ["张三", "李四", "王五", "张三"]  # may contain duplicates
    print(f"原始名单：{student_list}")

    # De-duplicate while keeping first-seen order: dict keys are unique and
    # keep insertion order, which avoids a list membership scan per element.
    unique_students = list(dict.fromkeys(student_list))
    print(f"去重名单：{unique_students}")

    print(section_break)

    # Scenario 2: fast lookup of student details by name
    print("2. 学生信息查询 - 使用字典")
    student_info = {
        "张三": {"age": 20, "major": "计算机"},
        "李四": {"age": 19, "major": "数学"},
        "王五": {"age": 21, "major": "物理"},
    }

    search_name = "李四"
    info = student_info.get(search_name)
    if info is not None:
        print(f"{search_name}的信息：年龄{info['age']}岁，专业{info['major']}")

    print(section_break)

    # Scenario 3: analysing course enrolment with sets
    print("3. 选课分析 - 使用集合")
    math_students = {"张三", "李四", "王五", "赵六"}
    physics_students = {"李四", "王五", "钱七", "孙八"}

    both_courses = math_students & physics_students
    only_math = math_students - physics_students
    all_students = math_students | physics_students

    print(f"同时选择数学和物理的学生：{both_courses}")
    print(f"只选数学的学生：{only_math}")
    print(f"选课学生总数：{len(all_students)}")

    print(section_break)

    # Scenario 4: storing coordinate points as tuples
    print("4. 坐标点存储 - 使用元组")
    points = [(0, 0), (1, 2), (3, 4), (5, 6)]
    print(f"路径点：{points}")

    # Total path length: sum of the Euclidean distances between consecutive points
    total_distance = 0
    for start, end in zip(points, points[1:]):
        (x1, y1), (x2, y2) = start, end
        distance = ((x2-x1)**2 + (y2-y1)**2)**0.5
        total_distance += distance
        print(f"从{start}到{end}的距离：{distance:.2f}")

    print(f"总路径长度：{total_distance:.2f}")


# Run the demonstration
demonstrate_data_structures()

> 选择决策树

def _print_decision_tree(tree, depth=0):
    """Print a nested ``{question: {answer: outcome}}`` tree, indented by depth.

    An outcome is either a recommendation string (a leaf) or another tree of
    the same shape, which is printed recursively one level deeper.
    """
    indent = "  " * depth
    for question, answers in tree.items():
        print(f"{indent}{question}")
        for answer, outcome in answers.items():
            if isinstance(outcome, dict):
                print(f"{indent}  {answer} ->")
                _print_decision_tree(outcome, depth + 2)
            else:
                print(f"{indent}  {answer} -> {outcome}")


def choose_data_structure():
    """Print guidance for choosing between list, tuple, dict and set.

    Prints the decision tree (a series of yes/no questions leading to a
    recommended structure), followed by a list of concrete scenarios with
    the recommended structure for each. Returns nothing.
    """
    print("数据结构选择决策指南")
    print("="*30)

    # Question -> answer -> (recommendation | follow-up question)
    decisions = {
        "需要保持元素顺序吗？": {
            "是": {
                "数据会改变吗？": {
                    "是": "使用列表（List）",
                    "否": "使用元组（Tuple）",
                }
            },
            "否": {
                "需要通过名字快速查找吗？": {
                    "是": "使用字典（Dictionary）",
                    "否": {
                        "需要去除重复元素吗？": {
                            "是": "使用集合（Set）",
                            "否": "使用列表（List）",
                        }
                    },
                }
            },
        }
    }
    # The tree used to be built and then discarded; print it so the guide
    # actually shows the decision path.
    _print_decision_tree(decisions)
    print()

    # Some practical examples of the choice
    examples = [
        ("存储购物清单", "列表 - 有顺序，可修改"),
        ("存储学生姓名和成绩的对应关系", "字典 - 通过姓名快速查找成绩"),
        ("存储班级里所有不同的兴趣爱好", "集合 - 自动去重"),
        ("存储地图上的坐标点", "元组 - 坐标不变，配对存储"),
        ("存储网站用户的访问记录", "列表 - 需要保持时间顺序"),
        ("存储产品ID和产品信息", "字典 - 通过ID快速查找产品"),
    ]

    for scenario, recommendation in examples:
        print(f"场景：{scenario}")
        print(f"推荐：{recommendation}")
        print()


# Show the selection guide
choose_data_structure()

3.6 实战项目：文本数据分析工具

现在让我们综合运用本课学到的所有知识，创建一个完整的文本分析工具：

def create_text_analyzer():
    """Build and run a small end-to-end text analysis demo.

    Writes ``sample_texts.txt`` to the current directory, analyses it (line,
    character and token statistics, a token-frequency ranking and a keyword
    check), saves a report next to it as ``analysis_result_sample_texts.txt``
    and finally prints a set-based similarity comparison of two keyword sets.

    Returns:
        A ``(word_count, unique_chars)`` tuple: a dict mapping each token to
        its number of occurrences, and the set of distinct characters in the
        file excluding newlines and spaces. Both are ``None`` when the
        analysis failed.
    """
    import re
    from collections import Counter
    from pathlib import Path

    # A token is a run of CJK ideographs or a run of ASCII letters. This is
    # only a rough split on punctuation and script changes, not real Chinese
    # word segmentation. Compiled once instead of inside the per-line loop.
    token_pattern = re.compile(r'[\u4e00-\u9fff]+|[A-Za-z]+')

    def create_sample_data():
        """Write the sample sentences to ``sample_texts.txt``, one per line."""
        sample_texts = [
            "Python是一门简单易学的编程语言",
            "机器学习需要大量的数据和计算资源",
            "Python在数据科学领域应用广泛",
            "深度学习是机器学习的一个重要分支",
            "编程语言有很多种，Python是其中最受欢迎的",
            "数据科学家需要掌握统计学和编程技能",
            "Python的语法简洁明了，适合初学者学习",
        ]

        with open("sample_texts.txt", "w", encoding="utf-8") as file:
            file.writelines(text + "\n" for text in sample_texts)

        print("示例数据文件已创建：sample_texts.txt")

    def analyze_text_file(filename):
        """Analyse ``filename``, print a report and save it to disk.

        Returns ``(word_count, unique_chars)``, or ``(None, None)`` after
        printing a message when the file is missing or another error occurs.
        """
        try:
            # 1. Read the file
            with open(filename, "r", encoding="utf-8") as file:
                lines = file.readlines()
            full_text = "".join(lines)

            print(f"文本分析报告：{filename}")
            print("="*50)

            # 2. Basic statistics (the character total includes newlines)
            total_lines = len(lines)
            total_chars = sum(len(line) for line in lines)

            # 3. Tokenise every non-blank line
            all_words = []
            for line in lines:
                clean_line = line.strip()
                if clean_line:
                    all_words.extend(token_pattern.findall(clean_line))

            # 4. Token frequencies, returned as a plain dict in first-seen order
            frequencies = Counter(all_words)
            word_count = dict(frequencies)

            # 5. Distinct characters, ignoring newlines and spaces
            unique_chars = set(full_text)
            unique_chars.discard('\n')
            unique_chars.discard(' ')

            # 6. Report
            print(f"文本行数：{total_lines}")
            print(f"总字符数：{total_chars}")
            print(f"总词汇数：{len(all_words)}")
            print(f"独特词汇数：{len(frequencies)}")
            print(f"独特字符数：{len(unique_chars)}")

            # 7. Frequency ranking (ties keep first-seen order)
            print(f"\n词频排行榜（前10名）：")
            print("-" * 30)
            for i, (word, count) in enumerate(frequencies.most_common(10), 1):
                percentage = (count / len(all_words)) * 100
                print(f"{i:2d}. {word:<8} {count:2d}次 ({percentage:5.1f}%)")

            # 8. Keyword analysis. Tokens are whole runs of Chinese text, so a
            # keyword such as "数据" never equals a token; search the raw text
            # for each keyword instead of comparing against tokens.
            print(f"\n关键词分析：")
            print("-" * 30)
            keywords = {"Python", "数据", "学习", "编程"}
            found_keywords = {keyword for keyword in keywords if keyword in full_text}

            print(f"发现的关键词：{found_keywords}")
            print(f"关键词覆盖率：{len(found_keywords)}/{len(keywords)} ({len(found_keywords)/len(keywords)*100:.1f}%)")

            # 9. Save the analysis results
            save_analysis_results(filename, {
                'total_lines': total_lines,
                'total_chars': total_chars,
                'word_count': word_count,
                'unique_chars': len(unique_chars),
                'keywords_found': found_keywords,
            })

            return word_count, unique_chars

        except FileNotFoundError:
            print(f"错误：找不到文件 {filename}")
            return None, None
        except Exception as e:
            # Deliberate catch-all at the demo boundary: report and carry on.
            print(f"分析文件时发生错误：{e}")
            return None, None

    def save_analysis_results(original_filename, results):
        """Write the analysis ``results`` dict to a report file.

        The report is named after the stem of ``original_filename`` and is
        created in the current directory.
        """
        # Path.stem removes only the final extension and any directory part,
        # unlike replace('.txt', ''), which would also touch ".txt" elsewhere
        # in the name and keep directory separators in the result name.
        result_filename = f"analysis_result_{Path(original_filename).stem}.txt"

        with open(result_filename, "w", encoding="utf-8") as file:
            file.write(f"文件 {original_filename} 的分析结果\n")
            file.write("=" * 50 + "\n\n")

            file.write("基本统计：\n")
            file.write(f"  总行数：{results['total_lines']}\n")
            file.write(f"  总字符数：{results['total_chars']}\n")
            file.write(f"  独特字符数：{results['unique_chars']}\n\n")

            file.write("词频统计（按频率排序）：\n")
            sorted_words = sorted(results['word_count'].items(), key=lambda x: x[1], reverse=True)
            for word, count in sorted_words:
                file.write(f"  {word}: {count}次\n")

            # Sorted so that the saved report is the same on every run
            file.write(f"\n发现的关键词：{sorted(results['keywords_found'])}\n")

        print(f"分析结果已保存到：{result_filename}")

    def compare_texts():
        """Print a set-based similarity comparison of two sample keyword sets."""
        print(f"\n文本相似性分析：")
        print("-" * 30)

        # Two sample keyword sets to compare
        text1_words = {"Python", "编程", "学习", "语言", "简单"}
        text2_words = {"Python", "数据", "学习", "科学", "分析"}

        # Similarity = shared words / all words (intersection over union)
        common_words = text1_words & text2_words
        all_words = text1_words | text2_words
        unique_to_text1 = text1_words - text2_words
        unique_to_text2 = text2_words - text1_words

        similarity = len(common_words) / len(all_words) * 100

        print(f"文本1关键词：{text1_words}")
        print(f"文本2关键词：{text2_words}")
        print(f"共同词汇：{common_words}")
        print(f"文本1独有：{unique_to_text1}")
        print(f"文本2独有：{unique_to_text2}")
        print(f"相似度：{similarity:.1f}%")

    # Run the analysis pipeline
    print("启动文本数据分析工具")
    print("="*40)

    # Create the sample data
    create_sample_data()
    print()

    # Analyse the text file
    word_freq, unique_chars = analyze_text_file("sample_texts.txt")

    # Compare two texts
    compare_texts()

    return word_freq, unique_chars


# Run the text analysis tool
word_frequencies, character_set = create_text_analyzer()