Java 使用 Apache POI 读取 Word 文件内容
本文介绍了一个简单的 Java 程序,该程序能够从指定路径的 .doc/.docx 文件中读取文本内容。程序使用 Apache POI 库解析 Microsoft Word 文档:.doc 文件通过 HWPFDocument 与 WordExtractor 类读取,.docx 文件通过 XWPFDocument 与 XWPFWordExtractor 类读取。
1、Maven Jar包
<!-- .docx -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<!-- .doc -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.3</version>
</dependency>
2、Java代码
package org.example.utils;import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;public class WordUtils {public String read(String path) {try {if (path.toLowerCase().endsWith(".docx"))return readDocx(path);else if (path.toLowerCase().endsWith(".doc"))return readDoc(path);elsethrow new IllegalArgumentException("不支持的文件格式");} catch (Exception e){e.printStackTrace();}return null;}public String readDocx(String path) throws IOException {try (InputStream in = new FileInputStream(path);XWPFDocument doc = new XWPFDocument(in)) {return new XWPFWordExtractor(doc).getText();}}public String readDoc(String path) throws IOException {try (InputStream in = new FileInputStream(path);HWPFDocument doc = new HWPFDocument(in)) {return new WordExtractor(doc).getText();}}public static void main(String[] args) {WordUtils wordUtils = new WordUtils();try {String docx = wordUtils.read("/Users/work/Documents/数据分析报告.doc");System.out.println(docx);} catch (Exception e) {throw new RuntimeException(e);}}}