当前位置: 首页 > news >正文

MySQL(149)如何进行数据清洗?

数据清洗在数据处理和分析过程中至关重要,确保数据质量和一致性。以下是一个详细的指南,展示如何使用Java进行数据清洗,包括处理缺失值、重复值、异常值、数据类型转换以及标准化等步骤。

一、准备工作

确保安装有Java开发环境(JDK)和Maven或Gradle等依赖管理工具。我们将使用Apache Commons CSV库来处理CSV文件,并使用Java标准库进行数据清洗操作。

二、加载数据

首先,我们加载数据。假设我们有一个CSV文件(data.csv)作为数据源。

1. 添加依赖

在Maven项目的pom.xml文件中添加Apache Commons CSV库的依赖:

<!-- Apache Commons CSV: the CSV parsing/printing library used by the Java examples. -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-csv</artifactId>
    <version>1.8</version>
</dependency>
2. 加载CSV文件
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;import java.nio.file.Files;
import java.nio.file.Paths;
import java.io.IOException;
import java.util.List;
import java.nio.charset.StandardCharsets;

/**
 * Loads a CSV file into memory with Apache Commons CSV.
 */
public class DataLoader {

    /**
     * Reads every data record of a CSV file, treating the first line as the header.
     *
     * <p>The file is decoded as UTF-8 (the {@code Path} overload of
     * {@code CSVParser.parse} requires an explicit charset). The parser is closed
     * before returning; the records are fully materialised by {@code getRecords()}.
     *
     * @param filePath path of the CSV file to read
     * @return all data records of the file, in file order (the header row is not included)
     * @throws IOException if the file cannot be opened or read
     */
    public static List<CSVRecord> loadData(String filePath) throws IOException {
        try (CSVParser parser = CSVParser.parse(
                Paths.get(filePath),
                StandardCharsets.UTF_8,
                CSVFormat.DEFAULT.withFirstRecordAsHeader())) {
            return parser.getRecords();
        }
    }

    /**
     * Demo entry point: loads {@code data.csv} and prints its first record.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            List<CSVRecord> records = loadData("data.csv");
            System.out.println("Data loaded successfully.");
            // A header-only or empty file yields no records; avoid get(0) on an empty list.
            if (records.isEmpty()) {
                System.out.println("No data records found.");
            } else {
                System.out.println(records.get(0));
            }
        } catch (IOException e) {
            System.err.println("Failed to load data: " + e.getMessage());
        }
    }
}

三、处理缺失值

1. 检查缺失值
/**
 * Prints a message for every field that is {@code null}, empty, or whitespace-only.
 *
 * <p>A record with several such fields is reported once per field.
 *
 * @param records the records to inspect
 */
public static void checkMissingValues(List<CSVRecord> records) {
    for (CSVRecord row : records) {
        for (int column = 0; column < row.size(); column++) {
            String cell = row.get(column);
            boolean present = cell != null && !cell.trim().isEmpty();
            if (present) {
                continue;
            }
            System.out.println("Missing value found in record: " + row);
        }
    }
}
2. 填充缺失值

假设需要对数值类型的列使用该列的均值来填充缺失值。

import java.util.HashMap;
import java.util.Map;public class MissingValueHandler {public static List<CSVRecord> fillMissingValuesWithMean(List<CSVRecord> records, String columnName) {double sum = 0;int count = 0;Map<CSVRecord, Double> missingRecords = new HashMap<>();for (CSVRecord record : records) {String value = record.get(columnName);if (value == null || value.trim().isEmpty()) {missingRecords.put(record, null);} else {double numericValue = Double.parseDouble(value);sum += numericValue;count++;}}double mean = sum / count;for (CSVRecord record : missingRecords.keySet()) {record.toMap().put(columnName, String.valueOf(mean));}return records;}
}

四、处理重复值

1. 检查重复值
import java.util.HashSet;
import java.util.Set;
import java.util.ArrayList;
import java.util.List;

/**
 * Reports duplicate CSV rows.
 */
public class DuplicateHandler {

    /**
     * Prints a message for every record whose cell values equal those of an
     * earlier record in the list.
     *
     * <p>Rows are compared by their cell values: {@code CSVRecord} does not define
     * value-based {@code equals}/{@code hashCode}, so a set of records would only
     * detect the very same object appearing twice.
     *
     * @param records the records to inspect
     */
    public static void checkDuplicates(List<CSVRecord> records) {
        Set<List<String>> seenRows = new HashSet<>();
        for (CSVRecord record : records) {
            List<String> cells = new ArrayList<>(record.size());
            for (String cell : record) {
                cells.add(cell);
            }
            if (!seenRows.add(cells)) {
                System.out.println("Duplicate record found: " + record);
            }
        }
    }
}
2. 删除重复值
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * Removes duplicate CSV rows.
 */
public class DuplicateHandler {

    /**
     * Returns the records with later duplicates dropped, keeping the first
     * occurrence of each row and the original order.
     *
     * <p>Rows are compared by their cell values: {@code CSVRecord} does not define
     * value-based {@code equals}/{@code hashCode}, so a set of records would not
     * collapse rows that merely have identical content.
     *
     * @param records the records to de-duplicate; not modified
     * @return a new list containing the first occurrence of every distinct row
     */
    public static List<CSVRecord> removeDuplicates(List<CSVRecord> records) {
        Set<List<String>> seenRows = new LinkedHashSet<>();
        List<CSVRecord> uniqueRecords = new ArrayList<>();
        for (CSVRecord record : records) {
            List<String> cells = new ArrayList<>(record.size());
            for (String cell : record) {
                cells.add(cell);
            }
            if (seenRows.add(cells)) {
                uniqueRecords.add(record);
            }
        }
        return uniqueRecords;
    }
}

五、处理异常值

1. 使用统计方法(四分位距 IQR)检测并删除异常值
import java.util.ArrayList;
import java.util.List;

/**
 * Removes outliers from a numeric column using the interquartile-range rule.
 */
public class OutlierHandler {

    /**
     * Returns the records whose value in {@code columnName} lies within
     * {@code [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]}, where Q1 and Q3 are the 25th and
     * 75th percentiles of the column and IQR is {@code Q3 - Q1}.
     *
     * @param records    records to filter; not modified
     * @param columnName header name of the numeric column
     * @return a new list with the in-range records, in their original order;
     *         empty if {@code records} is empty
     * @throws NumberFormatException if a value in the column is not a valid number
     */
    public static List<CSVRecord> removeOutliers(List<CSVRecord> records, String columnName) {
        List<CSVRecord> filteredRecords = new ArrayList<>();
        if (records.isEmpty()) {
            // No data means no percentiles; nothing to filter.
            return filteredRecords;
        }

        // Parse each cell once and reuse the numbers for both passes.
        List<Double> values = new ArrayList<>(records.size());
        for (CSVRecord record : records) {
            values.add(Double.parseDouble(record.get(columnName)));
        }

        // Sort a copy once so both percentile lookups share it and `values`
        // keeps the record order.
        List<Double> sortedValues = new ArrayList<>(values);
        sortedValues.sort(Double::compareTo);

        double q1 = getPercentile(sortedValues, 25);
        double q3 = getPercentile(sortedValues, 75);
        double iqr = q3 - q1;
        double lowerBound = q1 - 1.5 * iqr;
        double upperBound = q3 + 1.5 * iqr;

        int rowIndex = 0;
        for (CSVRecord record : records) {
            double value = values.get(rowIndex++);
            if (value >= lowerBound && value <= upperBound) {
                filteredRecords.add(record);
            }
        }
        return filteredRecords;
    }

    /**
     * Returns the nearest-rank percentile of an ascending-sorted, non-empty list.
     * The rank is clamped to the valid range so a percentile of 0 maps to the
     * smallest element instead of index -1.
     */
    private static double getPercentile(List<Double> sortedValues, double percentile) {
        int rank = (int) Math.ceil(percentile / 100.0 * sortedValues.size());
        int clampedRank = Math.min(Math.max(rank, 1), sortedValues.size());
        return sortedValues.get(clampedRank - 1);
    }
}

六、数据类型转换

1. 检查数据类型
import java.util.List;

/**
 * Reports, per record, whether a column's text parses as an integer, a double,
 * or neither.
 */
public class DataTypeChecker {

    /**
     * Prints one line per record naming the narrowest type the value of
     * {@code columnName} parses as: Integer, then Double, otherwise String.
     *
     * @param records    the records to inspect
     * @param columnName header name of the column to classify
     */
    public static void checkDataTypes(List<CSVRecord> records, String columnName) {
        for (CSVRecord row : records) {
            System.out.println(describe(row.get(columnName)));
        }
    }

    /** Builds the report line for one cell by attempting the parses in order. */
    private static String describe(String field) {
        try {
            Integer.parseInt(field);
            return "Field " + field + " is of type Integer";
        } catch (NumberFormatException notAnInteger) {
            // Not an int; try the wider numeric type next.
        }
        try {
            Double.parseDouble(field);
            return "Field " + field + " is of type Double";
        } catch (NumberFormatException notADouble) {
            return "Field " + field + " is of type String";
        }
    }
}
2. 转换数据类型
import java.util.List;public class DataTypeConverter {public static void convertToDouble(List<CSVRecord> records, String columnName) {for (CSVRecord record : records) {String value = record.get(columnName);try {double doubleValue = Double.parseDouble(value);record.toMap().put(columnName, String.valueOf(doubleValue));} catch (NumberFormatException e) {System.err.println("Failed to convert " + value + " to Double");}}}
}

七、数据标准化与规范化

1. 标准化(Z-score:减去均值后除以标准差)
import java.util.ArrayList;
import java.util.List;public class DataNormalizer {public static void standardizeData(List<CSVRecord> records, String columnName) {List<Double> values = new ArrayList<>();for (CSVRecord record : records) {values.add(Double.parseDouble(record.get(columnName)));}double mean = values.stream().mapToDouble(Double::doubleValue).average().orElse(0.0);double stdDev = Math.sqrt(values.stream().mapToDouble(v -> Math.pow(v - mean, 2)).average().orElse(0.0));for (CSVRecord record : records) {double value = Double.parseDouble(record.get(columnName));double standardizedValue = (value - mean) / stdDev;record.toMap().put(columnName, String.valueOf(standardizedValue));}}
}
2. 规范化(Min-Max:将数值线性缩放到 [0, 1] 区间)
import java.util.ArrayList;
import java.util.List;public class DataNormalizer {public static void normalizeData(List<CSVRecord> records, String columnName) {List<Double> values = new ArrayList<>();for (CSVRecord record : records) {values.add(Double.parseDouble(record.get(columnName)));}double min = values.stream().min(Double::compareTo).orElse(0.0);double max = values.stream().max(Double::compareTo).orElse(1.0);for (CSVRecord record : records) {double value = Double.parseDouble(record.get(columnName));double normalizedValue = (value - min) / (max - min);record.toMap().put(columnName, String.valueOf(normalizedValue));}}
}

八、保存清洗后的数据

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVRecord;import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

/**
 * Writes cleaned records back to a CSV file.
 */
public class DataSaver {

    /**
     * Writes {@code records} to {@code filePath} as UTF-8 CSV, using the column
     * names of the first record as the header row.
     *
     * <p>An I/O failure is reported on standard error rather than thrown. When
     * there are no records nothing is written and the target file is left
     * untouched, because the header is derived from the first record.
     *
     * @param filePath path of the file to create or overwrite
     * @param records  records to write; must have been parsed with a header
     */
    public static void saveData(String filePath, List<CSVRecord> records) {
        if (records == null || records.isEmpty()) {
            System.err.println("Failed to save data: no records to write");
            return;
        }
        // Resolve the header before opening the file so a failure here cannot
        // leave a truncated file behind.
        String[] header = records.get(0).toMap().keySet().toArray(new String[0]);
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8);
             CSVPrinter printer = new CSVPrinter(writer, CSVFormat.DEFAULT.withHeader(header))) {
            for (CSVRecord record : records) {
                printer.printRecord(record.toMap().values());
            }
        } catch (IOException e) {
            System.err.println("Failed to save data: " + e.getMessage());
        }
    }
}
http://www.dtcms.com/a/290536.html

相关文章:

  • 09_Spring Boot 整合 Freemarker 模板引擎的坑
  • 【C++】stack和queue拓展学习
  • 库卡气体保护焊机器人省气的方法
  • Mac上安装Homebrew的详细步骤
  • 【CNN】卷积神经网络池化- part2
  • Pytorch01:深度学习中的专业名词及基本介绍
  • 有关Maven的个人笔记总结
  • Zetane:让深度学习不再抽象,一键3D可视化
  • SpringSecurity 详细介绍(认证和授权)
  • 直播专用域名租用全解析:开启直播新境界
  • 板凳-------Mysql cookbook学习 (十二--------3_2)
  • 基于 STM32 的数字闹钟系统 Proteus 仿真设计与实现
  • ASP .NET Core 8高效集成Redis缓存实战
  • C++中的deque容器
  • C#/.NET/.NET Core技术前沿周刊 | 第 47 期(2025年7.14-7.20)
  • 解决vscode中vue格式化后缩进太小的问题,并去除分号 - 设置Vetur tabSize从2到4,设置prettier取消分号semi
  • Hyperledger Fabric V2.5 生产环境部署及安装Java智能合约
  • 从0开始学习R语言--Day53--AFT模型
  • 什么是“差分“?
  • WebkitSpeechRecognition 语音识别
  • 【备忘录】Ubuntu 配置虚拟显示器
  • 嵌入式学习-土堆目标检测(1)-day26
  • 每日数据推荐:一线城市基于手机信令的职住数据
  • 主流监控设备RTSP地址
  • 从env到mm_struct:环境变量与虚拟内存的底层实现
  • Linux 任务调度在进程管理中的关系和运行机制
  • STM32键盘带蓝牙功能
  • 时空数据可视化新范式:基于Three.js的生产全流程时间轴回溯技术解析
  • 首家!数巅AskBI通过中国信通院数据分析智能体专项测试
  • Shell实现服务检查看门程序