当前位置：首页 > news >正文

Python 数据清洗与预处理指南

news 2025/10/27 11:25:31

数据清洗和预处理是数据科学项目中最为关键但经常被忽视的环节。据统计，数据科学家花费约80%的时间在数据清洗和预处理上。本文将深入探讨数据清洗的各个方面，帮助你掌握打造高质量数据集的必备技能。

1. 数据质量问题概述

常见数据质量问题

在实际数据工作中，我们经常会遇到三种主要的数据质量问题：

数据缺失 - 某些字段没有值
数据重复 - 存在完全或部分相同的记录
数据异常 - 超出正常范围的异常值

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns# 创建包含各种数据质量问题的示例数据集
def create_sample_data():
    """Build a small demo DataFrame that deliberately contains quality problems.

    The frame holds ten base records with two missing ages (NaN), one
    implausible age (150) and one negative salary (-1000). Rows 0 and 2 are
    then appended again so the result also contains exact duplicates.

    Returns:
        A 12-row DataFrame with the columns 姓名, 年龄, 工资, 城市, 部门 and a
        fresh 0..11 index.
    """
    names = ['张三', '李四', '王五', '赵六', '钱七', '孙八', '周九', '吴十', '郑一', '王二']
    # Contains missing values and an outlier.
    ages = [25, 32, 28, np.nan, 35, 150, 29, np.nan, 31, 28]
    # Contains an outlier.
    salaries = [5000, 8000, 6000, 7000, 9000, -1000, 7500, 8200, 6800, 6000]
    cities = ['北京', '上海', '北京', '广州', '深圳', '北京', '上海', '广州', '北京', '北京']
    departments = ['技术部', '销售部', '技术部', '市场部', '技术部', '销售部', '技术部', '市场部', '技术部', '技术部']

    base = pd.DataFrame({
        '姓名': names,
        '年龄': ages,
        '工资': salaries,
        '城市': cities,
        '部门': departments,
    })

    # Re-append rows 0 and 2 so the frame contains exact duplicate records.
    repeated = base.iloc[[0, 2]]
    return pd.concat([base, repeated], ignore_index=True)


# Inspect the sample data
df = create_sample_data()
print("原始数据集:")
print(df)

output:

原始数据集:
    姓名     年龄    工资  城市   部门
0   张三   25.0  5000  北京  技术部
1   李四   32.0  8000  上海  销售部
2   王五   28.0  6000  北京  技术部
3   赵六    NaN  7000  广州  市场部
4   钱七   35.0  9000  深圳  技术部
5   孙八  150.0 -1000  北京  销售部
6   周九   29.0  7500  上海  技术部
7   吴十    NaN  8200  广州  市场部
8   郑一   31.0  6800  北京  技术部
9   王二   28.0  6000  北京  技术部
10  张三   25.0  5000  北京  技术部
11  王五   28.0  6000  北京  技术部

2. 数据缺失处理

检测缺失数据

def detect_missing_data(df):
    """Print a missing-value report for *df* and plot where the gaps are.

    Prints the frame shape, the per-column count and percentage of missing
    values and the first five rows of the boolean missing mask, then shows a
    two-panel figure (heatmap of the mask, bar chart of percentages).

    Args:
        df: DataFrame to inspect.

    Returns:
        Boolean DataFrame shaped like *df*, True where a value is missing.
    """
    print("=== 数据缺失检测 ===")

    # 1. Basic missing-value information
    null_flags = df.isnull()
    null_counts = null_flags.sum()
    print(f"数据集形状: {df.shape}")
    print("\n每列缺失值数量:")
    print(null_counts)
    print("\n每列缺失值比例:")
    print((null_counts / len(df) * 100).round(2))

    # 2. Boolean mask produced by isnull()
    print("\n缺失值布尔数组 (前5行):")
    print(null_flags.head())

    # 3. Visualise the missing data
    plt.figure(figsize=(10, 6))

    plt.subplot(1, 2, 1)
    sns.heatmap(null_flags, cbar=True, cmap='viridis')
    plt.title('缺失数据热力图')

    plt.subplot(1, 2, 2)
    null_share = (null_counts / len(df)) * 100
    null_share.plot(kind='bar')
    plt.title('各列缺失值比例')
    plt.ylabel('缺失比例 (%)')

    plt.tight_layout()
    plt.show()

    return null_flags


# Detect missing data
missing_mask = detect_missing_data(df)

output:

=== 数据缺失检测 ===
数据集形状: (12, 5)

每列缺失值数量:
姓名    0
年龄    2
工资    0
城市    0
部门    0
dtype: int64

每列缺失值比例:
姓名     0.00
年龄    16.67
工资     0.00
城市     0.00
部门     0.00
dtype: float64

缺失值布尔数组 (前5行):
      姓名     年龄     工资     城市     部门
0  False  False  False  False  False
1  False  False  False  False  False
2  False  False  False  False  False
3  False   True  False  False  False
4  False  False  False  False  False

筛选缺失数据

def filter_missing_data(df):
    """Select the rows and the column names of *df* that contain missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        A ``(rows, columns)`` tuple: the sub-frame of rows having at least one
        missing value, and the list of column labels having at least one.
    """
    print("=== 筛选缺失数据 ===")
    null_flags = df.isnull()

    # 1. Rows with at least one missing value
    row_has_gap = null_flags.any(axis=1)
    incomplete_rows = df[row_has_gap]
    print(f"包含缺失值的行数: {len(incomplete_rows)}")
    print("包含缺失值的行:")
    print(incomplete_rows)

    # 2. Columns with at least one missing value
    column_has_gap = null_flags.any()
    incomplete_cols = df.columns[column_has_gap].tolist()
    print(f"\n包含缺失值的列: {incomplete_cols}")

    return incomplete_rows, incomplete_cols


rows_with_missing, cols_with_missing = filter_missing_data(df)

output:

=== 筛选缺失数据 ===
包含缺失值的行数: 2
包含缺失值的行:
   姓名  年龄    工资  城市   部门
3  赵六 NaN  7000  广州  市场部
7  吴十 NaN  8200  广州  市场部

包含缺失值的列: ['年龄']

删除缺失数据

def handle_missing_by_dropping(df):
    """Demonstrate the different ways ``dropna`` can remove missing data.

    Each variant is applied to *df* independently (the input is not
    modified) and the resulting shape is printed.

    Args:
        df: DataFrame to process; must contain a '年龄' column for variant 4.

    Returns:
        Dict mapping 'dropped_rows', 'dropped_cols', 'dropped_all_na',
        'dropped_specific' and 'dropped_threshold' to the resulting frames.
    """
    print("=== 删除缺失数据 ===")

    # 1. Drop every row that contains any missing value
    df_dropped_rows = df.dropna()
    print(f"删除缺失行后数据集形状: {df_dropped_rows.shape}")

    # 2. Drop every column that contains any missing value
    df_dropped_cols = df.dropna(axis=1)
    print(f"删除缺失列后数据集形状: {df_dropped_cols.shape}")

    # 3. Drop only rows in which all values are missing
    df_dropped_all_na = df.dropna(how='all')
    print(f"删除全缺失行后数据集形状: {df_dropped_all_na.shape}")

    # 4. Drop rows that are missing a value in one specific column
    df_dropped_specific = df.dropna(subset=['年龄'])
    print(f"删除'年龄'列缺失行后数据集形状: {df_dropped_specific.shape}")

    # 5. Row-level threshold: keep a row only if it has at least N-1 non-null
    #    values, i.e. tolerate at most one missing field per row.
    min_non_null = len(df.columns) - 1
    df_dropped_threshold = df.dropna(thresh=min_non_null)
    print(f"阈值删除后数据集形状: {df_dropped_threshold.shape}")

    return {
        'dropped_rows': df_dropped_rows,
        'dropped_cols': df_dropped_cols,
        'dropped_all_na': df_dropped_all_na,
        'dropped_specific': df_dropped_specific,
        'dropped_threshold': df_dropped_threshold,
    }


drop_results = handle_missing_by_dropping(df)

output:

=== 删除缺失数据 ===
删除缺失行后数据集形状: (10, 5)
删除缺失列后数据集形状: (12, 4)
删除全缺失行后数据集形状: (12, 5)
删除'年龄'列缺失行后数据集形状: (10, 5)
阈值删除后数据集形状: (12, 5)

填充缺失数据

def handle_missing_by_filling(df):
    """Demonstrate several strategies for filling missing values.

    Every strategy starts from the same untouched copy of *df*; the unique
    values of the '年龄' column are printed after each one so the effect can
    be compared.

    Args:
        df: DataFrame containing at least the columns '姓名', '部门' and '年龄'.

    Returns:
        Dict mapping 'fixed', 'ffill', 'bfill', 'mean', 'median',
        'group_fill' and 'interpolated' to the filled frames.
    """
    print("=== 填充缺失数据 ===")

    # Work on a copy so the caller's frame is never modified.
    df_filled = df.copy()

    # 1. Fill with a fixed value
    df_fixed = df_filled.fillna(0)
    print("用0填充后的年龄列:")
    print(df_fixed['年龄'].unique())

    # 2. Forward fill. ffill()/bfill() replace fillna(method=...), which is
    #    deprecated in pandas 2.x.
    df_ffill = df_filled.ffill()
    print("\n前向填充后的年龄列:")
    print(df_ffill['年龄'].unique())

    # 3. Backward fill
    df_bfill = df_filled.bfill()
    print("\n后向填充后的年龄列:")
    print(df_bfill['年龄'].unique())

    # 4. Fill with a summary statistic
    age_mean = df_filled['年龄'].mean()
    df_mean = df_filled.fillna({'年龄': age_mean})
    print(f"\n用均值({age_mean:.2f})填充后的年龄列:")
    print(df_mean['年龄'].unique())

    age_median = df_filled['年龄'].median()
    df_median = df_filled.fillna({'年龄': age_median})
    print(f"\n用中位数({age_median})填充后的年龄列:")
    print(df_median['年龄'].unique())

    # 5. Group-wise fill with the department median. A department whose ages
    #    are all missing has a NaN median, so its gaps stay unfilled.
    df_group_fill = df_filled.copy()
    df_group_fill['年龄'] = df_group_fill.groupby('部门')['年龄'].transform(
        lambda x: x.fillna(x.median())
    )
    print("\n按部门分组用中位数填充后的年龄列:")
    print(df_group_fill[['姓名', '部门', '年龄']].head(8))

    # 6. Fill by interpolating between neighbouring values
    df_interpolated = df_filled.copy()
    df_interpolated['年龄'] = df_interpolated['年龄'].interpolate()
    print("\n插值填充后的年龄列:")
    print(df_interpolated['年龄'].unique())

    return {
        'fixed': df_fixed,
        'ffill': df_ffill,
        'bfill': df_bfill,
        'mean': df_mean,
        'median': df_median,
        'group_fill': df_group_fill,
        'interpolated': df_interpolated,
    }


fill_results = handle_missing_by_filling(df)

output:

=== 填充缺失数据 ===
用0填充后的年龄列:
[ 25.  32.  28.   0.  35. 150.  29.  31.]

前向填充后的年龄列:
[ 25.  32.  28.  35. 150.  29.  31.]

后向填充后的年龄列:
[ 25.  32.  28.  35. 150.  29.  31.]

用均值(41.10)填充后的年龄列:
[ 25.   32.   28.   41.1  35.  150.   29.   31. ]

用中位数(28.5)填充后的年龄列:
[ 25.   32.   28.   28.5  35.  150.   29.   31. ]

按部门分组用中位数填充后的年龄列:
   姓名   部门     年龄
0  张三  技术部   25.0
1  李四  销售部   32.0
2  王五  技术部   28.0
3  赵六  市场部    NaN
4  钱七  技术部   35.0
5  孙八  销售部  150.0
6  周九  技术部   29.0
7  吴十  市场部    NaN

插值填充后的年龄列:
[ 25.   32.   28.   31.5  35.  150.   29.   30.   31. ]

3. 数据重复处理

检测重复数据

def detect_duplicate_data(df):
    """Report duplicated rows and columns of *df* using ``duplicated``.

    Args:
        df: DataFrame to inspect; must contain a '姓名' column.

    Returns:
        Dict mapping 'duplicate_rows', 'duplicate_names',
        'duplicate_keep_first', 'duplicate_keep_last' and 'duplicate_all'
        to the matching sub-frames.
    """
    print("=== 重复数据检测 ===")

    # 1. Rows that repeat an earlier row in every column
    full_dupes = df[df.duplicated()]
    print(f"完全重复的行数: {len(full_dupes)}")
    print("完全重复的行:")
    print(full_dupes)

    # 2. Rows that repeat an earlier row in one specific column
    name_dupes = df[df.duplicated(subset=['姓名'])]
    print(f"\n姓名重复的行数: {len(name_dupes)}")
    print("姓名重复的行:")
    print(name_dupes)

    # 3. Effect of the `keep` argument of duplicated()
    after_first = df[df.duplicated(keep='first')]
    print(f"\n保留第一次出现外的重复行数: {len(after_first)}")

    before_last = df[df.duplicated(keep='last')]
    print(f"保留最后一次出现外的重复行数: {len(before_last)}")

    every_copy = df[df.duplicated(keep=False)]
    print(f"所有重复行数(包括所有重复): {len(every_copy)}")

    # 4. Duplicated columns are found by transposing first
    column_flags = df.T.duplicated()
    print(f"\n重复的列: {column_flags[column_flags].index.tolist()}")

    return {
        'duplicate_rows': full_dupes,
        'duplicate_names': name_dupes,
        'duplicate_keep_first': after_first,
        'duplicate_keep_last': before_last,
        'duplicate_all': every_copy,
    }


duplicate_results = detect_duplicate_data(df)

output:

=== 重复数据检测 ===
完全重复的行数: 2
完全重复的行:
    姓名    年龄    工资  城市   部门
10  张三  25.0  5000  北京  技术部
11  王五  28.0  6000  北京  技术部

姓名重复的行数: 2
姓名重复的行:
    姓名    年龄    工资  城市   部门
10  张三  25.0  5000  北京  技术部
11  王五  28.0  6000  北京  技术部

保留第一次出现外的重复行数: 2
保留最后一次出现外的重复行数: 2
所有重复行数(包括所有重复): 4

重复的列: []

删除重复数据

def remove_duplicate_data(df):
    """Demonstrate the different ways ``drop_duplicates`` can be applied.

    Each variant is applied to *df* independently and its resulting shape is
    printed.

    Args:
        df: DataFrame to process; must contain '姓名' and '城市' columns.

    Returns:
        Dict mapping 'dedup_first', 'dedup_last', 'dedup_none',
        'dedup_subset', 'dedup_multi' and 'dedup_cols' to the resulting frames.
    """
    print("=== 删除重复数据 ===")

    # (result key, printed label, operation), evaluated in this order.
    variants = [
        # 1. Drop exact duplicate rows, keeping the first occurrence
        ('dedup_first', "删除重复行(保留第一个)后形状",
         lambda frame: frame.drop_duplicates()),
        # 2. Drop exact duplicate rows, keeping the last occurrence
        ('dedup_last', "删除重复行(保留最后一个)后形状",
         lambda frame: frame.drop_duplicates(keep='last')),
        # 3. Drop every row that has a duplicate, keeping none of them
        ('dedup_none', "删除所有重复行后形状",
         lambda frame: frame.drop_duplicates(keep=False)),
        # 4. De-duplicate on a single column
        ('dedup_subset', "基于姓名删除重复后形状",
         lambda frame: frame.drop_duplicates(subset=['姓名'])),
        # 5. De-duplicate on a combination of columns
        ('dedup_multi', "基于姓名和城市删除重复后形状",
         lambda frame: frame.drop_duplicates(subset=['姓名', '城市'])),
        # 6. Drop duplicated columns by transposing, de-duplicating, transposing back
        ('dedup_cols', "删除重复列后形状",
         lambda frame: frame.T.drop_duplicates().T),
    ]

    results = {}
    for key, label, operation in variants:
        deduped = operation(df)
        print(f"{label}: {deduped.shape}")
        results[key] = deduped
    return results


dedup_results = remove_duplicate_data(df)

output:

=== 删除重复数据 ===
删除重复行(保留第一个)后形状: (10, 5)
删除重复行(保留最后一个)后形状: (10, 5)
删除所有重复行后形状: (8, 5)
基于姓名删除重复后形状: (10, 5)
基于姓名和城市删除重复后形状: (10, 5)
删除重复列后形状: (12, 5)

4. 数据异常处理

识别异常值

def detect_outliers(df):
    """Identify outliers in every numeric column of *df*.

    For each numeric column the missing values are dropped, then outliers are
    found with the 1.5 * IQR rule and with the |z-score| > 3 rule. The
    statistics are printed and a box plot per column is shown.

    Args:
        df: DataFrame to inspect.

    Returns:
        Dict keyed by column name; each value is a dict with 'IQR_outliers'
        (Series), 'z_outliers' (Series) and 'bounds' (lower, upper).
    """
    # Imported once here rather than on every loop iteration; scipy is only
    # needed by this function.
    from scipy import stats

    print("=== 异常值检测 ===")

    # Only numeric columns are analysed.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    print(f"数值列: {list(numeric_cols)}")

    outlier_results = {}
    for col in numeric_cols:
        print(f"\n--- {col}列异常值分析 ---")

        # Basic statistics on the non-missing values
        data = df[col].dropna()
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        print(f"Q1(25%分位数): {Q1:.2f}")
        print(f"Q3(75%分位数): {Q3:.2f}")
        print(f"IQR: {IQR:.2f}")
        print(f"正常值范围: [{lower_bound:.2f}, {upper_bound:.2f}]")

        # IQR rule
        outliers = data[(data < lower_bound) | (data > upper_bound)]
        print(f"异常值数量: {len(outliers)}")
        print(f"异常值: {list(outliers)}")

        # Z-score rule; `data` already has its missing values removed.
        z_scores = np.abs(stats.zscore(data))
        z_outliers = data[z_scores > 3]
        print(f"Z-score异常值(>3σ): {len(z_outliers)}")

        outlier_results[col] = {
            'IQR_outliers': outliers,
            'z_outliers': z_outliers,
            'bounds': (lower_bound, upper_bound),
        }

    # Visualise the outliers with one box plot per numeric column
    plt.figure(figsize=(12, 6))
    for i, col in enumerate(numeric_cols, 1):
        plt.subplot(1, len(numeric_cols), i)
        df[col].dropna().plot(kind='box')
        plt.title(f'{col}分布')
    plt.tight_layout()
    plt.show()

    return outlier_results


outlier_results = detect_outliers(df)

output:

=== 异常值检测 ===
数值列: ['年龄', '工资']

--- 年龄列异常值分析 ---
Q1(25%分位数): 28.00
Q3(75%分位数): 31.75
IQR: 3.75
正常值范围: [22.38, 37.38]
异常值数量: 1
异常值: [150.0]
Z-score异常值(>3σ): 0

--- 工资列异常值分析 ---
Q1(25%分位数): 5750.00
Q3(75%分位数): 7625.00
IQR: 1875.00
正常值范围: [2937.50, 10437.50]
异常值数量: 1
异常值: [-1000]
Z-score异常值(>3σ): 0

处理异常值

def handle_outliers(df):
    """Demonstrate four ways of treating IQR outliers in numeric columns.

    For every numeric column the 1.5 * IQR bounds are computed from the
    non-missing values, then four treatments are applied, each to a fresh
    copy of the frame: drop the outlier rows, winsorize, replace with the
    median, and cap at the bounds. Missing values are never treated as
    outliers. A grid of box plots compares the treatments for one column.

    Args:
        df: DataFrame to process.

    Returns:
        Dict keyed by column name; each value is a dict with the keys
        'remove', 'winsor', 'median' and 'cap' mapping to the treated frames.
    """
    print("=== 异常值处理 ===")

    df_cleaned = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    handling_methods = {}
    for col in numeric_cols:
        print(f"\n处理 {col} 列的异常值:")

        data = df[col].dropna()
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # NaN compares False on both sides, so missing values are neither
        # "too low" nor "too high" and survive every treatment below.
        too_low = df_cleaned[col] < lower_bound
        too_high = df_cleaned[col] > upper_bound
        capped = np.where(too_low, lower_bound,
                          np.where(too_high, upper_bound, df_cleaned[col]))

        # Method 1: drop the rows holding an outlier
        df_remove = df_cleaned[~(too_low | too_high)].copy()
        print(f"删除异常值后行数: {len(df_remove)}")

        # Method 2: winsorizing (pull outliers back to the nearest bound)
        df_winsor = df_cleaned.copy()
        df_winsor[col] = capped
        print(f"缩尾处理后{col}范围: [{df_winsor[col].min():.2f}, {df_winsor[col].max():.2f}]")

        # Method 3: replace outliers with the column median
        df_median = df_cleaned.copy()
        median_val = data.median()
        df_median[col] = np.where(too_low | too_high, median_val, df_median[col])
        print(f"中位数替换后{col}范围: [{df_median[col].min():.2f}, {df_median[col].max():.2f}]")

        # Method 4: replace outliers with the boundary value. In this demo it
        # is numerically the same as method 2; it gets its own array copy so
        # the two result frames never share data.
        df_cap = df_cleaned.copy()
        df_cap[col] = capped.copy()
        print(f"边界值替换后{col}范围: [{df_cap[col].min():.2f}, {df_cap[col].max():.2f}]")

        handling_methods[col] = {
            'remove': df_remove,
            'winsor': df_winsor,
            'median': df_median,
            'cap': df_cap,
        }

    # Nothing to plot when the frame has no numeric columns.
    if not handling_methods:
        return handling_methods

    # Visualise the effect; prefer the '年龄' column used throughout the
    # article and fall back to the first numeric column otherwise.
    example_col = '年龄' if '年龄' in handling_methods else next(iter(handling_methods))
    treated = handling_methods[example_col]
    methods = ['原始数据', '删除异常值', '缩尾处理', '中位数替换', '边界值替换']
    datasets = [
        df_cleaned[example_col].dropna(),
        treated['remove'][example_col].dropna(),
        treated['winsor'][example_col].dropna(),
        treated['median'][example_col].dropna(),
        treated['cap'][example_col].dropna(),
    ]

    plt.figure(figsize=(15, 10))
    for i, (method, data) in enumerate(zip(methods, datasets), 1):
        plt.subplot(2, 3, i)
        plt.boxplot(data)
        plt.title(f'{method}\n范围: [{data.min():.1f}, {data.max():.1f}]')
    plt.tight_layout()
    plt.show()

    return handling_methods


outlier_handling = handle_outliers(df)

output:

=== 异常值处理 ===

处理 年龄 列的异常值:
删除异常值后行数: 11
缩尾处理后年龄范围: [25.00, 37.38]
中位数替换后年龄范围: [25.00, 35.00]
边界值替换后年龄范围: [25.00, 37.38]

处理 工资 列的异常值:
删除异常值后行数: 11
缩尾处理后工资范围: [2937.50, 9000.00]
中位数替换后工资范围: [5000.00, 9000.00]
边界值替换后工资范围: [2937.50, 9000.00]

5. 数据转换

使用map方法进行数据转换

def data_transformation_with_map(df):
    """Demonstrate ``Series.map`` with a dict, a named function and a lambda.

    Adds the derived columns '部门英文', '工资等级', '城市代码', '城市英文' and
    '城市英文安全' to a copy of *df*, printing the first rows after each step.

    Args:
        df: DataFrame containing the columns '部门', '工资' and '城市'.

    Returns:
        The copy of *df* with the derived columns appended.
    """
    print("=== 数据转换 - map方法 ===")
    result = df.copy()

    # 1. Basic use: look values up in a dict
    dept_names = {
        '技术部': 'Technology',
        '销售部': 'Sales',
        '市场部': 'Marketing',
    }
    result['部门英文'] = result['部门'].map(dept_names)
    print("部门名称映射:")
    print(result[['部门', '部门英文']].head())

    # 2. A function for more involved mappings
    def salary_level(salary):
        if salary < 0:
            return '无效数据'
        if salary < 6000:
            return '初级'
        if salary < 8000:
            return '中级'
        return '高级'

    result['工资等级'] = result['工资'].map(salary_level)
    print("\n工资等级映射:")
    print(result[['工资', '工资等级']].head())

    # 3. A lambda: first character of the city name, or a placeholder
    result['城市代码'] = result['城市'].map(
        lambda x: x[0] if pd.notna(x) else '未知'
    )
    print("\n城市代码映射:")
    print(result[['城市', '城市代码']].head())

    # 4. Keys absent from the mapping (广州 and 深圳 are left out on purpose)
    #    come back as NaN.
    known_cities = {'北京': 'Beijing', '上海': 'Shanghai'}
    result['城市英文'] = result['城市'].map(known_cities)
    print("\n部分映射结果(缺失值变为NaN):")
    print(result[['城市', '城市英文']].head())

    # 5. The same mapping with map's na_action parameter
    result['城市英文安全'] = result['城市'].map(known_cities, na_action='ignore')
    print("\n安全映射结果:")
    print(result[['城市', '城市英文安全']].head())

    return result


df_transformed = data_transformation_with_map(df)

output:

=== 数据转换 - map方法 ===
部门名称映射:
    部门        部门英文
0  技术部  Technology
1  销售部       Sales
2  技术部  Technology
3  市场部   Marketing
4  技术部  Technology

工资等级映射:
     工资 工资等级
0  5000   初级
1  8000   高级
2  6000   中级
3  7000   中级
4  9000   高级

城市代码映射:
   城市 城市代码
0  北京    北
1  上海    上
2  北京    北
3  广州    广
4  深圳    深

部分映射结果(缺失值变为NaN):
   城市      城市英文
0  北京   Beijing
1  上海  Shanghai
2  北京   Beijing
3  广州       NaN
4  深圳       NaN

安全映射结果:
   城市    城市英文安全
0  北京   Beijing
1  上海  Shanghai
2  北京   Beijing
3  广州       NaN
4  深圳       NaN

综合数据清洗管道

def create_data_cleaning_pipeline(df):
    """Run the full cleaning pipeline on *df* and plot before/after histograms.

    Steps, in order: copy the input; fill missing values (median for numeric
    columns, mode for the rest); drop exact duplicate rows; cap numeric
    outliers at the 1.5 * IQR bounds; derive the '年龄分组' and '工资标准化'
    columns.

    Args:
        df: DataFrame containing at least the columns '年龄' and '工资'.

    Returns:
        The cleaned DataFrame, including the two derived columns.
    """
    print("=== 数据清洗完整管道 ===")

    # Step 1: work on a copy
    df_clean = df.copy()
    print(f"初始数据形状: {df_clean.shape}")

    # Step 2: fill missing values. is_numeric_dtype covers every numeric
    # width (int32, float32, ...), not only int64/float64, so such columns
    # get a median rather than a mode.
    for col in df_clean.columns:
        if pd.api.types.is_numeric_dtype(df_clean[col]):
            fill_value = df_clean[col].median()
        else:
            modes = df_clean[col].mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else '未知'
        df_clean[col] = df_clean[col].fillna(fill_value)
    print(f"处理缺失值后形状: {df_clean.shape}")

    # Step 3: drop exact duplicate rows
    df_clean = df_clean.drop_duplicates()
    print(f"处理重复数据后形状: {df_clean.shape}")

    # Step 4: cap outliers at the IQR bounds
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean[col] = np.where(
            df_clean[col] < lower_bound, lower_bound,
            np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
        )
    print(f"处理异常值后形状: {df_clean.shape}")

    # Step 5: transformation and normalisation
    def age_group(age):
        """Bucket an age into one of three labels."""
        if age <= 25:
            return '青年'
        elif age <= 35:
            return '中年'
        else:
            return '资深'

    df_clean['年龄分组'] = df_clean['年龄'].map(age_group)

    # Min-max scale the salary to 0-1. When every salary is identical the
    # range is zero; use 0.0 instead of dividing by zero.
    salary_min = df_clean['工资'].min()
    salary_range = df_clean['工资'].max() - salary_min
    if salary_range == 0:
        df_clean['工资标准化'] = 0.0
    else:
        df_clean['工资标准化'] = (df_clean['工资'] - salary_min) / salary_range

    print("清洗后的数据:")
    print(df_clean.head())

    # Compare the distributions before and after cleaning
    plt.figure(figsize=(15, 6))

    # Age distribution
    plt.subplot(1, 2, 1)
    plt.hist(df['年龄'].dropna(), alpha=0.7, label='清洗前', bins=10)
    plt.hist(df_clean['年龄'], alpha=0.7, label='清洗后', bins=10)
    plt.legend()
    plt.title('年龄分布对比')

    # Salary distribution
    plt.subplot(1, 2, 2)
    plt.hist(df['工资'], alpha=0.7, label='清洗前', bins=10)
    plt.hist(df_clean['工资'], alpha=0.7, label='清洗后', bins=10)
    plt.legend()
    plt.title('工资分布对比')

    plt.tight_layout()
    plt.show()

    return df_clean


df_final_clean = create_data_cleaning_pipeline(df)

output:

=== 数据清洗完整管道 ===
初始数据形状: (12, 5)
处理缺失值后形状: (12, 5)
处理重复数据后形状: (10, 5)
处理异常值后形状: (10, 5)
清洗后的数据:
   姓名    年龄      工资  城市   部门 年龄分组     工资标准化
0  张三  25.0  5000.0  北京  技术部   青年  0.311828
1  李四  32.0  8000.0  上海  销售部   中年  0.827957
2  王五  28.0  6000.0  北京  技术部   中年  0.483871
3  赵六  28.5  7000.0  广州  市场部   中年  0.655914
4  钱七  35.0  9000.0  深圳  技术部   中年  1.000000

6. 实战练习：花卉名称翻译

def flower_name_translation_exercise():
    """Hands-on exercise: translate English flower names to Chinese and clean.

    Builds a ten-row flower table, injects a missing price, a negative price
    and two duplicated rows, adds a '中文名' column through ``Series.map``,
    then de-duplicates and replaces missing or negative prices with the
    median price. Each stage is printed.

    Returns:
        The cleaned flower DataFrame.
    """
    # Build the flower data
    flowers = pd.DataFrame({
        '英文名': ['rose', 'lily', 'tulip', 'sunflower', 'orchid',
                'daisy', 'carnation', 'peony', 'chrysanthemum', 'jasmine'],
        '颜色': ['red', 'white', 'yellow', 'yellow', 'pink',
               'white', 'pink', 'pink', 'yellow', 'white'],
        '价格': [5.0, 4.5, 3.5, 2.5, 8.0, 3.0, 4.0, 6.5, 3.8, 5.5],
    })

    # Inject some data-quality problems
    flowers.loc[5, '价格'] = np.nan    # missing value
    flowers.loc[8, '价格'] = -10.0     # outlier
    repeated = flowers.iloc[[0, 1]]
    flowers = pd.concat([flowers, repeated])    # duplicated rows

    print("原始花卉数据:")
    print(flowers)

    # Translation dictionary
    english_to_chinese = {
        'rose': '玫瑰',
        'lily': '百合',
        'tulip': '郁金香',
        'sunflower': '向日葵',
        'orchid': '兰花',
        'daisy': '雏菊',
        'carnation': '康乃馨',
        'peony': '牡丹',
        'chrysanthemum': '菊花',
        'jasmine': '茉莉',
    }

    # Translate with map
    flowers['中文名'] = flowers['英文名'].map(english_to_chinese)
    print("\n翻译后的花卉数据:")
    print(flowers)

    # Cleaning
    # 1. Remove duplicated rows
    cleaned = flowers.copy().drop_duplicates()

    # 2. Replace missing and negative prices with the median price
    typical_price = cleaned['价格'].median()
    prices = cleaned['价格'].fillna(typical_price)
    cleaned['价格'] = prices.mask(prices < 0, typical_price)

    print("\n清洗后的花卉数据:")
    print(cleaned)

    return cleaned


# Run the exercise
cleaned_flowers = flower_name_translation_exercise()

output:

原始花卉数据:
             英文名      颜色    价格
0           rose     red   5.0
1           lily   white   4.5
2          tulip  yellow   3.5
3      sunflower  yellow   2.5
4         orchid    pink   8.0
5          daisy   white   NaN
6      carnation    pink   4.0
7          peony    pink   6.5
8  chrysanthemum  yellow -10.0
9        jasmine   white   5.5
0           rose     red   5.0
1           lily   white   4.5

翻译后的花卉数据:
             英文名      颜色    价格  中文名
0           rose     red   5.0   玫瑰
1           lily   white   4.5   百合
2          tulip  yellow   3.5  郁金香
3      sunflower  yellow   2.5  向日葵
4         orchid    pink   8.0   兰花
5          daisy   white   NaN   雏菊
6      carnation    pink   4.0  康乃馨
7          peony    pink   6.5   牡丹
8  chrysanthemum  yellow -10.0   菊花
9        jasmine   white   5.5   茉莉
0           rose     red   5.0   玫瑰
1           lily   white   4.5   百合

清洗后的花卉数据:
             英文名      颜色   价格  中文名
0           rose     red  5.0   玫瑰
1           lily   white  4.5   百合
2          tulip  yellow  3.5  郁金香
3      sunflower  yellow  2.5  向日葵
4         orchid    pink  8.0   兰花
5          daisy   white  4.5   雏菊
6      carnation    pink  4.0  康乃馨
7          peony    pink  6.5   牡丹
8  chrysanthemum  yellow  4.5   菊花
9        jasmine   white  5.5   茉莉

7. 最佳实践总结

数据清洗检查清单

def data_cleaning_checklist(df):
    """Run a completeness checklist on *df* and print a pass/fail line per item.

    Three checks are computed from the data: no missing values, no
    duplicated rows, and no column of dtype 'object' (a deliberately
    simplified type check). The outlier and consistency items are
    placeholders that always pass, because they depend on business rules.

    Args:
        df: DataFrame to check.

    Returns:
        Dict mapping each checklist item name to its boolean status.
    """
    no_missing = df.isnull().sum().sum() == 0
    no_duplicates = df.duplicated().sum() == 0
    no_object_columns = all(df.dtypes != 'object')    # simplified check

    checklist = {}
    checklist['缺失值检查'] = no_missing
    checklist['重复值检查'] = no_duplicates
    checklist['数据类型检查'] = no_object_columns
    checklist['异常值检查'] = True        # depends on the business rules
    checklist['数据一致性检查'] = True    # depends on the business rules

    print("=== 数据清洗检查清单 ===")
    for item in checklist:
        if checklist[item]:
            status_icon = "✅"
        else:
            status_icon = "❌"
        print(f"{status_icon} {item}")

    return checklist


# Run the checklist
checklist_results = data_cleaning_checklist(df_final_clean)  # df_final_clean is the frame returned by create_data_cleaning_pipeline(df)

自动化数据清洗类

class DataCleaner:
    """Chainable helper that applies common cleaning steps to a DataFrame.

    The instance works on its own copy of the frame it is given, appends a
    line to ``cleaning_log`` for the steps it performs, and returns ``self``
    from every step so that calls can be chained.
    """

    def __init__(self, df):
        """Store a private copy of *df* and start with an empty log."""
        self.df = df.copy()
        self.cleaning_log = []

    def handle_missing_values(self, strategy='median', fill_value=None):
        """Fill missing values in every column that has any.

        Args:
            strategy: 'median' or 'mean' (numeric columns only) or 'mode'
                (any column).
            fill_value: Used when *strategy* does not apply to a column, for
                example 'median' on a text column. If it is None as well, the
                placeholder '未知' is used.

        Returns:
            self, for chaining.
        """
        df_clean = self.df.copy()

        for col in df_clean.columns:
            if df_clean[col].isnull().sum() == 0:
                continue

            # is_numeric_dtype accepts every numeric width (int32, float32,
            # ...). Matching only int64/float64 would fill such columns with
            # the text placeholder.
            is_numeric = pd.api.types.is_numeric_dtype(df_clean[col])

            if strategy == 'median' and is_numeric:
                fill_val = df_clean[col].median()
            elif strategy == 'mean' and is_numeric:
                fill_val = df_clean[col].mean()
            elif strategy == 'mode':
                modes = df_clean[col].mode()
                fill_val = modes.iloc[0] if len(modes) > 0 else '未知'
            elif fill_value is not None:
                fill_val = fill_value
            else:
                fill_val = '未知'

            df_clean[col] = df_clean[col].fillna(fill_val)
            self.cleaning_log.append(f"填充 {col} 列的缺失值")

        self.df = df_clean
        return self

    def remove_duplicates(self, subset=None, keep='first'):
        """Drop duplicated rows; arguments are passed to ``drop_duplicates``.

        A log line is added only when at least one row was removed.

        Returns:
            self, for chaining.
        """
        initial_shape = self.df.shape
        self.df = self.df.drop_duplicates(subset=subset, keep=keep)
        removed_count = initial_shape[0] - self.df.shape[0]

        if removed_count > 0:
            self.cleaning_log.append(f"删除了 {removed_count} 个重复行")

        return self

    def handle_outliers(self, method='cap', multiplier=1.5):
        """Treat IQR outliers in every numeric column.

        Args:
            method: 'cap' clamps values to the bounds; 'remove' drops the rows
                holding an outlier. Any other value leaves the data unchanged.
            multiplier: Number of IQRs beyond Q1/Q3 at which a value counts
                as an outlier.

        Returns:
            self, for chaining.
        """
        df_clean = self.df.copy()
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            Q1 = df_clean[col].quantile(0.25)
            Q3 = df_clean[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - multiplier * IQR
            upper_bound = Q3 + multiplier * IQR

            if method == 'cap':
                df_clean[col] = np.where(
                    df_clean[col] < lower_bound, lower_bound,
                    np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
                )
            elif method == 'remove':
                in_range = (df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)
                # A missing value is not an outlier; keep those rows, as the
                # standalone handle_outliers() demo does.
                df_clean = df_clean[in_range | df_clean[col].isna()]

            self.cleaning_log.append(f"处理 {col} 列的异常值")

        self.df = df_clean
        return self

    def transform_data(self, column, mapping_dict):
        """Replace the values of *column* through ``Series.map(mapping_dict)``.

        Returns:
            self, for chaining.
        """
        self.df[column] = self.df[column].map(mapping_dict)
        self.cleaning_log.append(f"转换 {column} 列的数据")
        return self

    def get_cleaned_data(self):
        """Return the current (cleaned) DataFrame."""
        return self.df

    def get_cleaning_log(self):
        """Return the list of log lines recorded so far."""
        return self.cleaning_log


# Use the automated cleaning class
cleaner = DataCleaner(df)

# Every step returns the cleaner itself, so running the steps one after the
# other is equivalent to a single chained expression.
cleaner.handle_missing_values(strategy='median')
cleaner.remove_duplicates()
cleaner.handle_outliers(method='cap')
cleaned_df = cleaner.get_cleaned_data()

print("自动化清洗日志:")
for log in cleaner.get_cleaning_log():
    print(f"- {log}")