一、写作背景
公司和某大学合作了一个编辑器项目,其中涉及两篇文章的比对功能,领导制定的逻辑是“点击左侧文章列表,右侧页面会展示多篇文章的详情页面,点击第1篇文章的目录,其他几篇文章也要滚动到同一级别目录上”
二、实现思路
- 将文章(docx格式)使用onlyoffice的文章转换功能转成html网页
- 根据正则表达式(h1~h6标签)从html网页中提取目录信息
- 按照“X-X-X”格式为所有目录生成编号,并将该编号作为对应标签的id属性值
- 生成目录结构树,每个目录对象中包含目录名称、id属性值、目录级别,可以用作前端展示目录,以及实现“写作背景”中提到的多篇目录联动
- 修改原有html网页文件,往h1~h6标签上添加id属性,可以给前端同事展示用
三、Maven依赖
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.3.2</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.36</version>
</dependency>
四、代码
import lombok.Data;
import lombok.ToString;
import org.jsoup.Jsoup;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Test {
public static void main(String[] args) {
// 0、准备数据
String html = "<html><head><meta http-equiv=\\\"Content-Type\\\" content=\\\"text/html; charset=utf-8\\\" /></head><body><h1 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:18pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:20pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">1</span></h1><h2 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">1.1</span></h2><h3 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">1.1.1</span></h3><h2 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">1.2</span></h2><h3 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span 
style=\\\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">1.2.1</span></h3><h3 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">1.2.2</span></h3><h1 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:18pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:20pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">2</span></h1><h2 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">2.1</span></h2><h2 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">2.2</span></h2><h3 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\\\"><span 
style=\\\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\\\">2.2.1</span></h3><h6 style=\\\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:2pt;margin-bottom:0pt;border:none;border-left:none;border-top:none;border-right:none;border-bottom:none;mso-border-between:none\\\"><span style=\\\"font-family:'Arial';font-size:11pt;color:#595959;mso-style-textfill-fill-color:#595959\\\"><i>2.2.1.1.1.1</i></span></h6></body></html>";
// 1、提取目录信息
List<Integer> levelList = new ArrayList<>();
List<String> nameList = new ArrayList<>();
List<String> headingHtmlList = new ArrayList<>();
extractHeading(html, levelList, nameList, headingHtmlList);
// 打印目录信息
System.out.println("start 打印目录标题和级别");
for (int i = 0; i < nameList.size(); i++) {
System.out.print("标题:" + nameList.get(i) + "、级别:" + levelList.get(i) + ";");
}
System.out.println("\nend 打印目录标题和级别\n");
// 2、获取目录级别树
Heading heading = headingTree(levelList, nameList);
// 3、获取目录对应id(对应h1~6标签的属性)属性值集合
List<String> ids = getHeadingIdList(heading);
// 打印目录对应id属性值集合
System.out.println("start 打印目录对应id属性值集合");
System.out.println("目录对应id属性值集合:" + ids);
System.out.println("end 打印目录对应id属性值集合\n");
// 4、往目录标签后面添加id属性值
String newHtml = addId2Html(html, headingHtmlList, ids);
// 打印新html
System.out.println("start 打印添加id属性后的html值");
System.out.println("添加id属性后的html值:" + newHtml);
System.out.println("end 打印添加id属性后的html值\n");
// 打印目录级别树(在getHeadingIdList方法中填充了id属性,所以在此处打印目录级别树)
System.out.println("start 打印目录级别树");
System.out.println("目录级别树:" + heading);
System.out.println("end 打印目录级别树");
}
/**
* 将id属性值添加到html中
*
* @author guoming
* @date 2025/1/22 14:37
* @param oldHtml 原始网页文本
* @param headingHtmlList 提取出来的目录网页文本
* @param ids 目录id属性值集合
* @return 新网页文本
**/
private static String addId2Html(String oldHtml, List<String> headingHtmlList, List<String> ids) {
for (int i = 0; i < headingHtmlList.size(); i++) {
String headingHtml = headingHtmlList.get(i);
int index = oldHtml.indexOf(headingHtml);
StringBuilder sb = new StringBuilder(oldHtml);
sb.insert(index + 4, "id=\\\"" + ids.get(i) + "\\\"");
oldHtml = sb.toString();
}
return oldHtml;
}
/**
* 获取目录对应id(对应h1~6标签的属性)属性值集合
*
* @author 明快de玄米61
* @date 2025/1/22 14:32
* @param heading 顶级目录对象
* @return 目录对应id(对应h1~6标签的属性)属性值集合
**/
private static List<String> getHeadingIdList(Heading heading) {
List<String> ids = new ArrayList<>();
for (int i = 0; i < heading.getChildren().size(); i++) {
Heading child = heading.getChildren().get(i);
String nextPrefix = getNextLevelHeadingIdPrefix(child.getLevel(), 0, "", i + 1);
String id = nextPrefix.substring(0, nextPrefix.length() - 1);
child.setId(id);
ids.add(id);
generateHeadingId(child, nextPrefix, ids);
}
return ids;
}
/**
* 获取目录信息
*
* @author 明快de玄米61
* @date 2025/1/22 14:33
* @param html 最初网页
* @param levelList 目录级别集合
* @param nameList 目录名称集合
* @param headingHtmlList 单个目录网页代码集合
* @return
**/
private static void extractHeading(String html, List<Integer> levelList, List<String> nameList, List<String> headingHtmlList) {
Pattern pattern = Pattern.compile("<h[1-6][\\s\\S]*?</h([1-6])>");
Matcher matcher = pattern.matcher(html);
while (matcher.find()) {
levelList.add(Integer.valueOf(matcher.group(1)));
// 获取目录名称集合
nameList.add(Jsoup.parse(matcher.group()).text().trim());
headingHtmlList.add(matcher.group());
}
}
/**
* 组装目录级别树
*
* @author 明快de玄米61
* @date 2025/1/22 14:12
* @param levelList 目录级别集合
* @param nameList 目录名称集合
* @return 目录级别树-顶节点
**/
private static Heading headingTree(List<Integer> levelList, List<String> nameList) {
List<Heading> processed = new ArrayList<>();
Heading heading = new Heading();
heading.setChildren(new ArrayList<Heading>());
for (int i = 0; i < levelList.size(); i++) {
Integer level = levelList.get(i);
// 当前对象
Heading entity = new Heading();
entity.setLevel(level);
entity.setName(nameList.get(i));
entity.setChildren(new ArrayList<Heading>());
// 获取父级对象
Heading parent = getParentHeading(processed, level);
if (parent == null) {
heading.getChildren().add(entity);
} else {
parent.getChildren().add(entity);
}
// 放入集合
processed.add(entity);
}
return heading;
}
/**
* 获取下一个级别的id前缀
*
* @author 明快de玄米61
* @date 2025/1/22 14:29
* @param currentLevel 当前目录级别
* @param parentLevel 父级目录级别
* @return 下一个级别的id前缀
**/
private static String getNextLevelHeadingIdPrefix(Integer currentLevel, Integer parentLevel, String prefix, Integer sort) {
StringBuilder sb = new StringBuilder();
if (prefix.length() == 0) {
sb.append(sort).append("-");
for (Integer i = 2; i <= currentLevel; i++) {
sb.append(1).append("-");
}
} else {
if (currentLevel - 1 > parentLevel) {
sb.append(prefix);
for (Integer i = parentLevel + 2; i <= currentLevel; i++) {
sb.append(1).append("-");
}
sb.append(sort).append("-");
} else {
sb.append(prefix).append(sort).append("-");
}
}
return sb.toString();
}
/**
* 生成目录对应的id,最后会添加到html中当做id属性值
*
* @param entity 目录对象
* @param prefix 目录id前缀
**/
private static void generateHeadingId(Heading entity, String prefix, List<String> ids) {
List<Heading> childs = entity.getChildren();
if (childs == null || childs.size() == 0) {
return;
}
for (int i = 0; i < childs.size(); i++) {
Heading child = childs.get(i);
String newPrefix = getNextLevelHeadingIdPrefix(child.getLevel(), entity.getLevel(), prefix, i + 1);
String id = newPrefix.substring(0, newPrefix.length() - 1);
child.setId(id);
ids.add(id);
generateHeadingId(child, newPrefix, ids);
}
}
/**
* 获取父类目录(采用反向思维查找)
*
* @param headings 目录对象集合
* @param level 目录级别
* @return 父级目录对象
**/
private static Heading getParentHeading(List<Heading> headings, Integer level) {
if (headings.size() == 0) {
return null;
}
for (int i = headings.size() - 1; i >= 0; i--) {
Heading entity = headings.get(i);
if (entity.getLevel() < level) {
return entity;
}
}
return null;
}
}
/**
* 目录类
*/
@Data
@ToString
class Heading {
// 对应id属性值
private String id;
// 名称
private String name;
// 级别
private Integer level;
// 子级目录集合
private List<Heading> children;
}
五、结果
start 打印目录标题和级别
标题:1、级别:1;标题:1.1、级别:2;标题:1.1.1、级别:3;标题:1.2、级别:2;标题:1.2.1、级别:3;标题:1.2.2、级别:3;标题:2、级别:1;标题:2.1、级别:2;标题:2.2、级别:2;标题:2.2.1、级别:3;标题:2.2.1.1.1.1、级别:6;
end 打印目录标题和级别
start 打印目录对应id属性值集合
目录对应id属性值集合:[1, 1-1, 1-1-1, 1-2, 1-2-1, 1-2-2, 2, 2-1, 2-2, 2-2-1, 2-2-1-1-1-1]
end 打印目录对应id属性值集合
start 打印添加id属性后的html值
添加id属性后的html值:<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body><h1 id=\"1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:18pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:20pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1</span></h1><h2 id=\"1-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.1</span></h2><h3 id=\"1-1-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.1.1</span></h3><h2 id=\"1-2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.2</span></h2><h3 id=\"1-2-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span 
style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.2.1</span></h3><h3 id=\"1-2-2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.2.2</span></h3><h1 id=\"2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:18pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:20pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2</span></h1><h2 id=\"2-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2.1</span></h2><h2 id=\"2-2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2.2</span></h2><h3 id=\"2-2-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span 
style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2.2.1</span></h3><h6 id=\"2-2-1-1-1-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:2pt;margin-bottom:0pt;border:none;border-left:none;border-top:none;border-right:none;border-bottom:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:11pt;color:#595959;mso-style-textfill-fill-color:#595959\"><i>2.2.1.1.1.1</i></span></h6></body></html>
end 打印添加id属性后的html值
start 打印目录级别树
目录级别树:Heading(id=null, name=null, level=null, children=[Heading(id=1, name=1, level=1, children=[Heading(id=1-1, name=1.1, level=2, children=[Heading(id=1-1-1, name=1.1.1, level=3, children=[])]), Heading(id=1-2, name=1.2, level=2, children=[Heading(id=1-2-1, name=1.2.1, level=3, children=[]), Heading(id=1-2-2, name=1.2.2, level=3, children=[])])]), Heading(id=2, name=2, level=1, children=[Heading(id=2-1, name=2.1, level=2, children=[]), Heading(id=2-2, name=2.2, level=2, children=[Heading(id=2-2-1, name=2.2.1, level=3, children=[Heading(id=2-2-1-1-1-1, name=2.2.1.1.1.1, level=6, children=[])])])])])
end 打印目录级别树