Java 中的 HtmlTableToMarkdown(HTML 表格转 Markdown)

该文章展示了如何使用Java的Jsoup库将HTML表格内容转换成Node对象矩阵,包括处理跨列和跨行元素的方法。
摘要由CSDN通过智能技术生成
package com.medipath.helper.web;

import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import com.medipath.helper.web.vo.Node;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * Flattens an HTML table into pipe-delimited text rows.
 *
 * <p>Cells carrying {@code colspan} / {@code rowspan} are expanded: the cell's text is
 * repeated in every grid position the cell covers, so each output row lists one value
 * per column.
 */
public class TableToMatrix {

    /** Upper bound applied to {@code colspan}; the HTML specification caps it at 1000. */
    private static final int MAX_COLSPAN = 1000;

    /**
     * Demo entry point: converts a sample table that mixes {@code colspan} and
     * {@code rowspan} cells and prints the result to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = "<table>\n" +
                " <tr>\n" +
                "  <td>\n" +
                "   <p>公司</p></td>\n" +
                "  <td>\n" +
                "   <p>克隆号</p></td>\n" +
                "  <td   colspan=\"4\" >\n" +
                "   <p>即用型(ml)</p></td>\n" +
                "  <td   colspan=\"2\" >\n" +
                "   <p>原液(ml)</p></td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                "  <td>\n" +
                "   <p>中杉金桥</p></td>\n" +
                "  <td>\n" +
                "   <p>OTI1A4</p></td>\n" +
                "  <td>\n" +
                "   <p>1.5</p></td>\n" +
                "  <td>\n" +
                "   <p>3</p></td>\n" +
                "  <td>\n" +
                "   <p>6</p></td>\n" +
                "  <td>\n" +
                "   <p>/</p></td>\n" +
                "  <td>\n" +
                "   <p>0.1</p></td>\n" +
                "  <td>\n" +
                "   <p>0.2</p></td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>安必平</p></td>\n" +
                "  <td>\n" +
                "   <p>ALK-1</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>1.5</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>3</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>6</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>/</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>0.1</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>0.2</p></td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                "  <td>\n" +
                "   <p>5A4</p></td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>基因科技</p></td>\n" +
                "  <td>\n" +
                "   <p>5A4</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>/</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>2</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>4</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>7</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>/</p></td>\n" +
                "  <td   rowspan=\"2\" >\n" +
                "   <p>0.2</p></td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                "  <td>\n" +
                "   <p>SP8</p></td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                "  <td>\n" +
                "   <p>百凌生物</p></td>\n" +
                "  <td>\n" +
                "   <p>BP6165</p></td>\n" +
                "  <td>\n" +
                "   <p>1.5</p></td>\n" +
                "  <td>\n" +
                "   <p>/</p></td>\n" +
                "  <td>\n" +
                "   <p>/</p></td>\n" +
                "  <td>\n" +
                "   <p>7</p></td>\n" +
                "  <td>\n" +
                "   <p>0.1</p></td>\n" +
                "  <td>\n" +
                "   <p>1.0</p></td>\n" +
                " </tr>\n" +
                "</table>";
        System.out.println(tableToJson(html));
    }

    /**
     * Converts the table rows found in {@code html} into pipe-delimited text.
     *
     * <p>Despite its name, this method returns pipe-delimited text, not JSON. Every
     * non-empty row is rendered as {@code " | v1 | v2 |"} and consecutive rows are
     * joined with {@code "\n "}. No Markdown header-delimiter row is emitted and cell
     * text is not escaped.
     *
     * <p>Layout rules:
     * <ul>
     *   <li>A cell is placed in the first column of its row that is not already covered
     *       by a {@code rowspan} cell from an earlier row.</li>
     *   <li>The cell text is repeated across all positions covered by its
     *       {@code colspan} / {@code rowspan}; row spans are clipped to the last
     *       {@code <tr>}.</li>
     *   <li>Missing, non-numeric or non-positive span values are treated as 1.</li>
     *   <li>Positions inside a row that no cell covers are rendered as empty text;
     *       rows without any cell are skipped.</li>
     * </ul>
     *
     * @param html HTML markup containing a table; {@code null} or blank yields {@code ""}
     * @return the rendered rows, or {@code ""} when no cells are found
     */
    public static String tableToJson(String html) {
        if (StrUtil.isBlank(html)) {
            return "";
        }
        Document doc = Jsoup.parse(html);

        // NOTE: the selectors match descendants, so rows and cells of nested tables
        // are picked up as well; nested tables get no special treatment.
        Elements rows = doc.select("table tr");
        int rowCount = rows.size();

        // grid.get(r).get(c) holds the text at (r, c); null marks a position that no
        // cell has claimed yet.
        List<List<String>> grid = new ArrayList<>(rowCount);
        for (int r = 0; r < rowCount; r++) {
            grid.add(new ArrayList<>());
        }

        for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
            List<String> currentRow = grid.get(rowIndex);
            Elements cells = rows.get(rowIndex).select("td, th");

            int col = 0;
            for (Element cell : cells) {
                // Step over columns already taken by a rowspan from an earlier row.
                while (col < currentRow.size() && currentRow.get(col) != null) {
                    col++;
                }

                int colspan = Math.min(parseSpan(cell.attr("colspan")), MAX_COLSPAN);
                int rowspan = parseSpan(cell.attr("rowspan"));
                // long arithmetic: a huge rowspan must not overflow before clipping.
                int rowEnd = (int) Math.min((long) rowIndex + rowspan, rowCount);

                String value = cell.text();
                for (int r = rowIndex; r < rowEnd; r++) {
                    List<String> target = grid.get(r);
                    for (int c = col; c < col + colspan; c++) {
                        place(target, c, value);
                    }
                }
                col += colspan;
            }
        }

        StringBuilder out = new StringBuilder();
        for (List<String> row : grid) {
            if (row.isEmpty()) {
                continue;
            }
            if (out.length() > 0) {
                out.append("\n ");
            }
            for (String value : row) {
                out.append(" | ").append(value == null ? "" : value);
            }
            out.append(" |");
        }
        return out.toString();
    }

    /** Parses a span attribute, falling back to 1 when it is blank, malformed or below 1. */
    private static int parseSpan(String raw) {
        if (StrUtil.isBlank(raw)) {
            return 1;
        }
        try {
            return Math.max(1, Integer.parseInt(raw.trim()));
        } catch (NumberFormatException ignored) {
            // A malformed span should not abort the whole conversion.
            return 1;
        }
    }

    /** Stores {@code value} at {@code col}, growing the row as needed; an existing value is kept. */
    private static void place(List<String> row, int col, String value) {
        while (row.size() <= col) {
            row.add(null);
        }
        if (row.get(col) == null) {
            row.set(col, value);
        }
    }
}
package com.medipath.helper.web.vo;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * Value holder pairing a row index and a column index with a text value.
 *
 * <p>Apart from the three fields, every member is generated by Lombok:
 * {@code @Data} supplies getters, setters, {@code equals}, {@code hashCode} and
 * {@code toString}; {@code @AllArgsConstructor} and {@code @NoArgsConstructor} supply
 * the constructors; {@code @Builder} supplies the fluent {@code Node.builder()}.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
@Builder
public class Node {
    // Row index; boxed, so it stays null until assigned.
    private Integer row;

    // Column index; boxed, so it stays null until assigned.
    private Integer col;

    // Text value associated with the (row, col) position.
    private String value;
}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值