当前位置: 首页 > news >正文

Java 实现将 Word 转换成 Markdown

日常开发中,经常需要将 Word 等文档内容转换成格式化的标记语言,本文介绍如何使用 Java 将 Word(.docx)文档转换成 Markdown。

1、引入 jar 包(示例代码还用到了 fastjson 的 JSONObject,需自行引入对应依赖)

  <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.3</version>
        </dependency>

2、使用以下代码进行编写

package com.nd.ai.test.service.utils;

import com.alibaba.fastjson.JSONObject;
import com.nd.ai.test.service.dto.apitest.FileMarkdownDTO;
import org.apache.commons.compress.utils.Lists;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * Converts a Word ({@code .docx}) document into a Markdown file.
 *
 * <p>The Markdown file and a directory holding the extracted images are both created next to
 * the source document, using a random UUID as their base name.
 *
 * @author Administrator
 */
public class WordToMarkdownConverter {

    /** Prefix of the paragraph style IDs that are treated as headings (e.g. {@code Heading1}). */
    private static final String HEADING_STYLE_PREFIX = "Heading";

    /** Markdown only supports heading levels 1 through 6. */
    private static final int MAX_HEADING_LEVEL = 6;

    /** Number of spaces emitted per list nesting level. */
    private static final int INDENT_SPACES_PER_LEVEL = 4;

    /**
     * Converts the Word document at {@code wordPath} to Markdown.
     *
     * <p>Writes {@code <uuid>.md} and a {@code <uuid>-images} directory into the directory of the
     * source file, then reads the generated Markdown back into the returned DTO.
     *
     * @param wordPath path of the {@code .docx} file to convert
     * @return a DTO carrying the Markdown path, the Markdown text, the saved image paths and
     *         the parser status {@code "success"}
     * @throws IOException if the document cannot be read or any output file cannot be written
     */
    public static FileMarkdownDTO convertWordToMarkdown(String wordPath) throws IOException {
        List<String> imagePathList = new ArrayList<>();
        FileMarkdownDTO dto = new FileMarkdownDTO();

        File file = new File(wordPath);
        // getParent() is null for a bare file name such as "a.docx"; fall back to the current
        // directory instead of concatenating the literal string "null" into the output paths.
        String baseDir = file.getParent() != null ? file.getParent() : ".";
        String uuid = UUID.randomUUID().toString();
        String outputFilePath = baseDir + File.separator + uuid + ".md";
        File outputFile = new File(outputFilePath);
        String imageDir = baseDir + File.separator + uuid + "-images";

        try {
            // try-with-resources closes the input stream, the document and the writer even when
            // the conversion fails half-way. UTF-8 is forced so non-ASCII text survives on hosts
            // whose platform charset is not UTF-8.
            try (FileInputStream fis = new FileInputStream(file);
                 XWPFDocument document = new XWPFDocument(fis);
                 Writer writer = new BufferedWriter(new OutputStreamWriter(
                         new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {

                dto.setMdPath(outputFilePath);

                File imageDirFile = new File(imageDir);
                if (!imageDirFile.isDirectory() && !imageDirFile.mkdirs()) {
                    throw new IOException("Unable to create image directory: " + imageDir);
                }

                for (IBodyElement element : document.getBodyElements()) {
                    if (element instanceof XWPFParagraph) {
                        processParagraph((XWPFParagraph) element, writer, imageDir, imagePathList);
                    } else if (element instanceof XWPFTable) {
                        processTable((XWPFTable) element, writer, imageDir, imagePathList);
                    }
                }
            }

            // Read the generated Markdown back (the writer is closed at this point).
            StringBuilder markdownContent = new StringBuilder();
            try (BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(outputFile), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    markdownContent.append(line).append("\n");
                }
            }

            dto.setMarkdownContent(markdownContent.toString());
            dto.setParserStatus("success");
        } catch (IOException e) {
            dto.setParserStatus("error");
            // Rethrow the original exception; wrapping an IOException in another IOException
            // only hides its concrete type and message from the caller.
            throw e;
        }
        dto.setFileImagePathList(imagePathList);
        return dto;
    }

    /**
     * Writes one body paragraph as a heading, a list item or a plain paragraph.
     * Paragraphs without any text or image are skipped.
     *
     * @param paragraph     paragraph to render
     * @param writer        destination of the Markdown text
     * @param imageDir      directory that receives embedded images
     * @param imageNamePath collector for the paths of the saved images
     * @throws IOException if writing the Markdown or an image fails
     */
    private static void processParagraph(XWPFParagraph paragraph, Writer writer, String imageDir,
                                         List<String> imageNamePath) throws IOException {
        String content = processParagraphContent(paragraph, imageDir, imageNamePath);
        if (content.isEmpty()) {
            return;
        }

        int level = headingLevel(paragraph.getStyle());
        if (level > 0) {
            StringBuilder heading = new StringBuilder();
            for (int i = 0; i < level; i++) {
                heading.append("#");
            }
            heading.append(" ").append(content).append("\n\n");
            writer.write(heading.toString());
        } else if (isListParagraph(paragraph)) {
            // The marker is added here only; the inline content never carries one.
            writer.write(getListMark(paragraph) + " " + content + "\n");
        } else {
            writer.write(content + "\n\n");
        }
    }

    /**
     * Derives the Markdown heading level from a paragraph style ID.
     *
     * @param style paragraph style ID, may be {@code null}
     * @return a level between 1 and {@value #MAX_HEADING_LEVEL}, or 0 when the style is not
     *         {@code Heading} followed by a positive number
     */
    private static int headingLevel(String style) {
        if (style == null || !style.startsWith(HEADING_STYLE_PREFIX)) {
            return 0;
        }
        String suffix = style.substring(HEADING_STYLE_PREFIX.length()).trim();
        if (suffix.isEmpty()) {
            return 0;
        }
        int level = 0;
        for (int i = 0; i < suffix.length(); i++) {
            char c = suffix.charAt(i);
            // Accept ASCII digits only: Character.getNumericValue would map letters to 10..35
            // and turn styles such as "HeadingChar" into headings.
            if (c < '0' || c > '9') {
                return 0;
            }
            // Clamping while accumulating also rules out integer overflow.
            level = Math.min(level * 10 + (c - '0'), MAX_HEADING_LEVEL);
        }
        return level;
    }

    /**
     * Builds the Markdown list marker for a list paragraph: indentation for the nesting level
     * followed by {@code 1.} for ordered lists or {@code -} for unordered ones.
     *
     * @param para a paragraph for which {@link #isListParagraph(XWPFParagraph)} is true
     * @return the indented marker, without a trailing space
     */
    private static String getListMark(XWPFParagraph para) {
        int indentLevel = para.getNumIlvl() != null ? para.getNumIlvl().intValue() : 0;

        StringBuilder mark = new StringBuilder();
        for (int i = 0; i < indentLevel * INDENT_SPACES_PER_LEVEL; i++) {
            mark.append(" ");
        }
        // Markdown renumbers ordered items on rendering, so a constant "1." is sufficient.
        mark.append(isOrderedList(para) ? "1." : "-");
        return mark.toString();
    }

    /** Returns whether the paragraph belongs to an ordered or an unordered list. */
    private static boolean isListParagraph(XWPFParagraph paragraph) {
        return isOrderedList(paragraph) || isUnorderedList(paragraph);
    }

    /** Returns whether the paragraph's numbering format is one of the recognised ordered formats. */
    private static boolean isOrderedList(XWPFParagraph paragraph) {
        String numFmt = paragraph.getNumFmt();
        if (numFmt == null) {
            return false;
        }
        return "decimal".equals(numFmt) || "upperRoman".equals(numFmt) || "lowerRoman".equals(numFmt)
                || "upperLetter".equals(numFmt) || "lowerLetter".equals(numFmt);
    }

    /** Returns whether the paragraph's numbering format is {@code bullet}. */
    private static boolean isUnorderedList(XWPFParagraph paragraph) {
        return "bullet".equals(paragraph.getNumFmt());
    }

    /**
     * Writes a table as a Markdown pipe table; the first row is used as the header row.
     * A table without rows produces no output.
     *
     * @param table         table to render
     * @param writer        destination of the Markdown text
     * @param imageDir      directory that receives embedded images
     * @param imageNamePath collector for the paths of the saved images
     * @throws IOException if writing the Markdown or an image fails
     */
    private static void processTable(XWPFTable table, Writer writer, String imageDir,
                                     List<String> imageNamePath) throws IOException {
        List<XWPFTableRow> rows = table.getRows();
        if (rows.isEmpty()) {
            return;
        }

        StringBuilder mdTable = new StringBuilder();
        for (int i = 0; i < rows.size(); i++) {
            List<XWPFTableCell> cells = rows.get(i).getTableCells();

            mdTable.append("|");
            for (XWPFTableCell cell : cells) {
                mdTable.append(renderCell(cell, imageDir, imageNamePath)).append("|");
            }
            mdTable.append("\n");

            // The header separator goes right after the first row.
            if (i == 0) {
                mdTable.append("|");
                for (int j = 0; j < cells.size(); j++) {
                    mdTable.append(" --- |");
                }
                mdTable.append("\n");
            }
        }
        writer.write(mdTable + "\n\n");
    }

    /**
     * Renders the paragraphs of one table cell on a single line.
     * Paragraphs are joined with {@code <br>} and pipe characters are escaped so that cell text
     * cannot break the table layout.
     */
    private static String renderCell(XWPFTableCell cell, String imageDir,
                                     List<String> imageNamePath) throws IOException {
        StringBuilder cellContent = new StringBuilder();
        for (XWPFParagraph para : cell.getParagraphs()) {
            String text = processParagraphContent(para, imageDir, imageNamePath);
            if (text.isEmpty()) {
                continue;
            }
            if (isListParagraph(para)) {
                // Indentation is meaningless inside a cell, so only the bare marker is kept.
                text = getListMark(para).trim() + " " + text;
            }
            if (cellContent.length() > 0) {
                cellContent.append("<br>");
            }
            cellContent.append(text.replace("\n", "<br>").replace("|", "\\|"));
        }
        return cellContent.toString();
    }

    /**
     * Renders the inline content of a paragraph: embedded images followed by styled run text.
     * No heading or list marker is added here.
     *
     * @param paragraph     paragraph whose runs are rendered
     * @param imageDir      directory that receives embedded images
     * @param imageNamePath collector for the paths of the saved images
     * @return the trimmed inline Markdown, empty when the paragraph has no text or image
     * @throws IOException if an image cannot be written
     */
    private static String processParagraphContent(XWPFParagraph paragraph, String imageDir,
                                                  List<String> imageNamePath) throws IOException {
        StringBuilder sb = new StringBuilder();

        for (XWPFRun run : paragraph.getRuns()) {
            for (XWPFPicture picture : run.getEmbeddedPictures()) {
                String imageMarkdown = saveImage(picture, imageDir, imageNamePath);
                if (!imageMarkdown.isEmpty()) {
                    sb.append(imageMarkdown).append(" ");
                }
            }

            String text = run.getText(0);
            if (text == null) {
                continue;
            }
            sb.append(applyTextStyles(run, text));
        }

        return sb.toString().trim();
    }

    /**
     * Wraps run text in Markdown emphasis markers according to the run's formatting.
     * Surrounding whitespace is kept outside the markers because emphasis delimiters that touch
     * whitespace are not rendered; whitespace-only text is returned unchanged.
     */
    private static String applyTextStyles(XWPFRun run, String text) {
        int start = 0;
        int end = text.length();
        while (start < end && Character.isWhitespace(text.charAt(start))) {
            start++;
        }
        while (end > start && Character.isWhitespace(text.charAt(end - 1))) {
            end--;
        }
        if (start == end) {
            return text;
        }

        String core = text.substring(start, end);
        if (run.isBold()) {
            core = "**" + core + "**";
        }
        if (run.isItalic()) {
            core = "*" + core + "*";
        }
        if (run.getUnderline() != UnderlinePatterns.NONE) {
            // Markdown has no underline syntax ("__x__" renders as bold), so inline HTML is used.
            core = "<u>" + core + "</u>";
        }
        return text.substring(0, start) + core + text.substring(end);
    }

    /**
     * Saves an embedded picture into {@code imageDir} under a random file name.
     *
     * @param picture       picture to save
     * @param imageDir      target directory
     * @param imageNamePath collector that receives the path of the saved file
     * @return the Markdown image reference, or an empty string when the picture has no data
     * @throws IOException if the image file cannot be written
     */
    private static String saveImage(XWPFPicture picture, String imageDir,
                                    List<String> imageNamePath) throws IOException {
        XWPFPictureData picData = picture.getPictureData();
        if (picData == null) {
            return "";
        }

        String extension = picData.suggestFileExtension();
        String suffix = (extension == null || extension.isEmpty()) ? "" : "." + extension;
        String fileName = "img_" + UUID.randomUUID() + suffix;
        File output = new File(imageDir, fileName);

        try (FileOutputStream fos = new FileOutputStream(output)) {
            fos.write(picData.getData());
        }
        // Record the path only once the file has actually been written.
        imageNamePath.add(output.getPath());

        return "![" + fileName + "](" + imageDir + "/" + fileName + ")";
    }

    /** Demo entry point: converts a document and prints the resulting DTO as JSON. */
    public static void main(String[] args) throws Exception {
        System.out.println(JSONObject.toJSONString(convertWordToMarkdown("word path")));
    }
}

获得信息

{
"fileImagePathList":["文件中图片路径1","文件中图片路径2"],
"markdownContent": "Markdown 内容",
"mdPath": "Markdown 文件地址"
}

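运行上方的程序将会得到:
1、解析文件中的所有图片,并保存到 Word 文件同级的 “<uuid>-images” 目录下(路径见 fileImagePathList)
2、将 Word 文档转换成 Markdown,生成的 “<uuid>.md” 文件同样位于 Word 文件所在目录(路径见 mdPath)
3、读取生成的 Markdown 文件内容(见 markdownContent)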

相关文章:

  • 便携版:随时随地,高效处理 PDF 文件
  • 数据驱动的教育革命:让学习更智能、更个性化
  • BNB Chain 何以打造 AI 驱动链上应用新世界?
  • 【数据仓库】湖仓一体的核心建模理论
  • 如何用 Postman 进行高效的 Mock 测试?
  • 【区块链 + 文化版权】基于 FISCO BCOS 的方言大数据语料库 | FISCO BCOS 应用案例
  • Spring的SPEL(Spring Expression Language)的使用说明,包含语法、示例和常见场景
  • Android kill 进程的三种方式
  • Redis 中的过期策略和内存淘汰策略
  • Android SystemUI深度定制实战:QSPanel下拉状态栏动态日期显示全解析
  • Rust从入门到精通之精通篇:24.高级异步编程
  • 虚拟机与Docker与K8s的比较
  • firewall-cmd添加访问规则
  • AI写一个视频转图片帧工具(python)
  • Postman 如何高效地转换时间戳?
  • HarmonyOS:解决UIAbility调用terminateSelf()后设置不保留最近任务列表中的快照
  • [异步监听事件、异步绑定属性]通过vue的this.$refs.组件.$props和.$on实现异步绑定组件属性和事件监听
  • 前端性能优化有哪些方法?
  • 思维跃迁:生成式人工智能(GAI)认证重塑AI时代核心竞争力范式
  • Axure RP9.0教程: 多级联动【设置选项改变时->情形->面板状态】(给动态面板元件设置相关交互事件的情形,来控制其他面板不同的状态。)
  • 中国队夺跳水世界杯总决赛首金
  • 西湖大学2025年上海市综合评价招生简章发布
  • 美国第一季度经济环比萎缩0.3%
  • 夜读丨跷脚牛肉乐翘脚
  • 贵州茅台一季度净利268亿元增长11.56%,系列酒营收增近两成
  • 《中国奇谭》首部动画电影《浪浪山小妖怪》定档8月2日