当前位置: 首页 > news >正文

Java实现word、pdf转html保留格式

一、word转html

依赖:

<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>

<!--word转html-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>${poi.version}</version>
</dependency>
<!--word转html-->
<dependency><groupId>fr.opensagres.xdocreport</groupId><artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId><version>${xhtml.version}</version>
</dependency>
<!--处理office文档表格相关 2007+版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>${poi.version}</version>
</dependency>
<!--处理office文档表格相关 2003版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>${poi.version}</version>
</dependency>

代码:

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;

/**
 * Converts Word documents to HTML, embedding pictures inline as Base64.
 *
 * <p>{@code .docx} files are converted with xdocreport's {@link XHTMLConverter};
 * legacy {@code .doc} files are converted with POI's {@link WordToHtmlConverter}.
 */
public class WordUtil {

    /**
     * Downloads a Word document from a URL and converts it to HTML.
     *
     * @param fileUrl    URL of the Word document
     * @param fileSuffix file extension including the leading dot; {@code ".doc"} or
     *                   {@code ".docx"}, compared case-insensitively
     * @return the HTML produced by the matching converter
     * @throws Exception        if the URL is malformed or the stream cannot be opened
     * @throws RuntimeException if the suffix is unsupported or the conversion fails
     */
    public static String wordToHtml(String fileUrl, String fileSuffix) throws Exception {
        URL url = new URL(fileUrl);

        // Validate the suffix before touching the network; the constant-first comparison
        // is also null-safe, so a null suffix is reported as an unsupported suffix.
        boolean isDocx = ".docx".equalsIgnoreCase(fileSuffix);
        boolean isDoc = ".doc".equalsIgnoreCase(fileSuffix);
        if (!isDocx && !isDoc) {
            throw new RuntimeException("错误的文件后缀");
        }

        try (InputStream inputStream = url.openStream()) {
            return isDocx ? word2007ToHtml(inputStream) : word2003ToHtml(inputStream);
        } catch (RuntimeException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to an HTML fragment.
     *
     * @param inputStream stream positioned at the start of the .docx content
     * @return the generated HTML fragment with images embedded as Base64
     * @throws RuntimeException if reading or converting the document fails
     */
    public static String word2007ToHtml(InputStream inputStream) {
        try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
             XWPFDocument docxDocument = new XWPFDocument(inputStream)) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep style definitions even when nothing in the document uses them.
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: output is wrapped in a <div> instead of a full HTML page.
            options.setFragment(true);
            // Embed images as Base64 instead of writing them to disk.
            options.setImageManager(new Base64EmbedImgManager());
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): toString() decodes with the platform default charset;
            // confirm this matches the encoding the converter writes with.
            return htmlStream.toString();
        } catch (Exception e) {
            System.out.println("Word转Html过程出现异常!");
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts a Word 97-2003 ({@code .doc}) document to HTML.
     *
     * @param inputStream stream positioned at the start of the .doc content
     * @return the generated HTML with images embedded as Base64 data URIs
     * @throws Exception        declared for backward compatibility with existing callers
     * @throws RuntimeException if reading or converting the document fails
     */
    public static String word2003ToHtml(InputStream inputStream) throws Exception {
        try (StringWriter writer = new StringWriter();
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // Embed each picture as a data URI, labelled with its real MIME type
            // rather than always claiming PNG.
            wordToHtmlConverter.setPicturesManager(
                    (bytes, pictureType, suggestedName, widthInches, heightInches) -> {
                        String mime = pictureType != null ? pictureType.getMime() : "image/png";
                        return "data:" + mime + ";base64," + Base64.encodeBase64String(bytes);
                    });
            wordToHtmlConverter.processDocument(document);

            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
            DOMSource domSource = new DOMSource(htmlDocument);

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            System.out.println("Word转Html过程出现异常!");
            throw new RuntimeException(e.getMessage(), e);
        }
    }
}

来源博客:Java实现word转html_java word转html-CSDN博客

二、pdf转html

依赖:

<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
</dependency>
<dependency>
    <groupId>net.mabboud.fontverter</groupId>
    <artifactId>FontVerter</artifactId>
</dependency>
<dependency>
    <groupId>org.reflections</groupId>
    <artifactId>reflections</artifactId>
</dependency>
<!--pdf转文本-->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
</dependency>

 代码:

import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;import java.io.*;
import java.net.URL;public class PDFUtil {public static String pdfToHtml(String fileUrl) throws IOException {URL url = new URL(fileUrl);try (InputStream inputStream = url.openStream()){return pdfToHtml(inputStream);}catch (Exception e){throw new IOException(e.getMessage());}}public static String pdfToHtml(InputStream inputStream) throws IOException {String outFilePath = "mypdf.html";String pdfContent = "";PDDocument document = PDDocument.load(inputStream);Writer writer = new PrintWriter(outFilePath, "UTF-8");new PDFDomTree().writeText(document, writer);writer.close();document.close();// 获取html内容try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) {StringBuilder htmlContent = new StringBuilder();String line;while ((line = reader.readLine()) != null) {htmlContent.append(line).append("\n"); // 追加每一行内容,并添加换行符}pdfContent = String.valueOf(htmlContent);return pdfContent;} catch (IOException e) {e.printStackTrace();System.err.println("读取 HTML 文件时出错。");}return null;}
}

 来源博客:使用Java实现PDF到HTML的转换_java pdf转html-CSDN博客


文章转载自:
http://allay.lbooon.cn
http://caodaism.lbooon.cn
http://antiandrogen.lbooon.cn
http://caritas.lbooon.cn
http://champaign.lbooon.cn
http://astasia.lbooon.cn
http://chirographer.lbooon.cn
http://chiphead.lbooon.cn
http://adsmith.lbooon.cn
http://calciphylaxis.lbooon.cn
http://brimstone.lbooon.cn
http://augsburg.lbooon.cn
http://cabas.lbooon.cn
http://budgie.lbooon.cn
http://cete.lbooon.cn
http://bureaucracy.lbooon.cn
http://amperometric.lbooon.cn
http://aweary.lbooon.cn
http://acrogenous.lbooon.cn
http://boxing.lbooon.cn
http://cca.lbooon.cn
http://boggle.lbooon.cn
http://ameslan.lbooon.cn
http://andesite.lbooon.cn
http://anode.lbooon.cn
http://caracas.lbooon.cn
http://chaeta.lbooon.cn
http://afflatus.lbooon.cn
http://alkene.lbooon.cn
http://burnable.lbooon.cn
http://www.dtcms.com/a/280439.html

相关文章:

  • 19.如何将 Python 字符串转换为 Slug
  • 全面安装指南:在Linux、Windows和macOS上部署Apache Cassandra
  • 基于STM32与中航ZH-E3L字符卡通信在LED屏显示数据
  • 华为敏态开发流程敏捷开发费用估算敏态IT财务分析模板
  • 进程探秘:从 PCB 到 fork 的核心原理之旅
  • Lang3
  • Spring Ioc Bean 到底是什么
  • 朝鲜升级供应链恶意软件XORIndex,再次瞄准npm生态系统
  • 从springcloud-gateway了解同步和异步,webflux webMvc、共享变量
  • 四种高效搭建SpringBoot项目的方式详解
  • 基于UDP/IP网络游戏加速高级拥塞控制算法(示意:一)
  • SpringBoot 实现 Redis读写分离
  • 【PTA数据结构 | C语言版】根据前序序列重构二叉树
  • npm install failed如何办?
  • 【10】MFC入门到精通——MFC 创建向导对话框、属性页类、属性表类、代码
  • centos 安装java 环境
  • FreeRTOS学习笔记——总览
  • 【Docker基础】Dockerfile构建与运行流程完全指南:从原理到实践优化
  • CentOS 8-BClinux8.2更换为阿里云镜像源:保姆级教程
  • 【第二章自定义功能菜单_MenuItemAttribute_顶部菜单栏(本章进度1/7)】
  • Rust基础-part5-引用
  • 【jvm|基本原理】第四天
  • 游戏行业中的恶梦:不断升级的DDoS攻击
  • 深入理解C++11 std::iota:从原理到实践
  • UDP和TCP的主要区别是什么
  • 17. 什么是 webSocket ?
  • 力扣 hot100 Day45
  • ZYNQ千兆光通信实战:Tri Mode Ethernet MAC深度解析
  • Keepalived双机热备概述
  • 基于深度学习的LSTM、GRU对大数据交通流量分析与预测的研究