Java提取markdown中的表格
Java提取markdown中的表格
说明
这篇博文是一个舍近求远的操作:如果只需要对markdown中的表格数据进行提取,完全可以通过正则表达式或者字符串切分来完成。但是出于学习的目的,这次采用了commonmark包中的工具来完成。具体实现过程如下。
实现步骤
引入pom依赖
<!-- commonmark core parser -->
<dependency>
    <groupId>org.commonmark</groupId>
    <artifactId>commonmark</artifactId>
    <version>0.21.0</version>
</dependency>
<!-- GFM tables extension for commonmark -->
<dependency>
    <groupId>org.commonmark</groupId>
    <artifactId>commonmark-ext-gfm-tables</artifactId>
    <version>0.21.0</version>
</dependency>
自定义visitor
import org.commonmark.ext.gfm.tables.*;
import org.commonmark.node.*;import java.util.ArrayList;
import java.util.List;public class TableVisitor extends AbstractVisitor {private boolean inHeader = false;private boolean inBody = false;private List<String> currentRow = null;private List<String> headers = new ArrayList<>();private final List<List<String>> rows = new ArrayList<>();@Overridepublic void visit(CustomBlock customBlock) {if (customBlock instanceof TableBlock) {handleTableBlock((TableBlock) customBlock);} else {super.visit(customBlock);}}@Overridepublic void visit(CustomNode customNode) {if (customNode instanceof TableHead) {handleTableHead((TableHead) customNode);} else if (customNode instanceof TableBody) {handleTableBody((TableBody) customNode);} else if (customNode instanceof TableRow) {handleTableRow((TableRow) customNode);} else if (customNode instanceof TableCell) {handleTableCell((TableCell) customNode);} else {super.visit(customNode);}}private void handleTableBlock(TableBlock tableBlock) {// 重置状态inHeader = false;inBody = false;visitChildren(tableBlock);}private void handleTableHead(TableHead tableHead) {inHeader = true;visitChildren(tableHead);inHeader = false;}private void handleTableBody(TableBody tableBody) {inBody = true;visitChildren(tableBody);inBody = false;}private void handleTableRow(TableRow tableRow) {currentRow = new ArrayList<>();visitChildren(tableRow);if (inHeader) {this.headers = currentRow;} else if (inBody) {this.rows.add(currentRow);}}private void handleTableCell(TableCell tableCell) {if (currentRow != null) {currentRow.add(getTextContent(tableCell));}visitChildren(tableCell);}private String getTextContent(Node node) {StringBuilder sb = new StringBuilder();Node child = node.getFirstChild();while (child != null) {if (child instanceof Text) {sb.append(((Text) child).getLiteral());}child = child.getNext();}return sb.toString().trim();}public List<String> getTableHeaders() {return headers;}public List<List<String>> getTableRows() {return rows;}
}
测试用例
public static void main(String[] args) {String content = """| 姓名 | 性别 | 班级 | 年龄 ||--------------|------|--------------------|--------------------|| 张三 | 男 | 兴趣一班 | 17 || 李四 | 男 | 兴趣一班 | 16 |""";List<Extension> extensions = Arrays.asList(TablesExtension.create());Parser parser = Parser.builder().extensions(extensions).build();Node document = parser.parse(content);TableVisitor visitor = new TableVisitor();document.accept(visitor);List<String> tableHeaders = visitor.getTableHeaders();List<List<String>> tableRows = visitor.getTableRows();System.out.println("表头: " + tableHeaders);System.out.println("表格行数据: "+ tableRows);}
总结
由于没有在commonmark中找到我们需要的visitor,所以自定义了visitor。希望可以对其他同学有所帮助吧。