import java.util.LinkedList; import java.util.List; import java.util.Stack; /** * @Author yinrushuai * @Email yinrushuai430528@foxmail.com * 2022/10/12 星期三 11:00 */ public abstract class ArticleSplitUtils { //0.将文章主标题入栈,作为哨兵。 //1.依次顺序遍历字符,捕获到《标题》时,比较old标题和new标题的等级。 //1.1 old.no<new.no,那么new标题就是old标题的子标题。new入栈。(其中,若new.no-old.no>1,报告文章格式错误!) //1.2 old.no>=new.no,那么就重新开启标题。old出栈,一直出到比new.no小;new入栈。 //直到文章末尾。 public TitleHolder split(StringBuilder content, String mainTile) { Stack<TitleHolder> stack = new Stack<>(); //0.将文章主标题入栈,作为哨兵。 TitleHolder mainT = new TitleHolder(0, mainTile); stack.push(mainT); TitleHolder oldT = mainT; int cursor = 0; TitleHolder newT = null; while ((newT = captureNextTitle(content, cursor)) != null) {//1.依次顺序遍历字符,捕获到《标题》时,比较old标题和new标题的等级。 String paragraph = content.substring(cursor, newT.startIndex); oldT.appendParagraph(paragraph); if (oldT.no < newT.no) {//1.1 old.no<new.no,那么new标题就是old标题的子标题。new入栈。(其中,若new.no-old.no>1,报告文章格式错误!) if (newT.no - oldT.no > 1) System.err.println("文章格式错误!"); } else {//1.2 old.no>=new.no,那么就重新开启标题。old出栈,一直出到比new.no小;new入栈。 while (oldT.no > newT.no) { oldT = stack.pop(); } if (oldT.no < newT.no) { stack.push(oldT); } oldT = stack.peek(); } oldT.sons.add(newT); stack.push(newT); oldT = newT; cursor = newT.endIndex + 1; } oldT.appendParagraph(content.substring(cursor)); stack.clear();//help gc return mainT; } /** * @param content * @param cursor 当前位置 * @return null表示没有标题,非null表示最近一个标题。 */ protected abstract TitleHolder captureNextTitle(StringBuilder content, int cursor); public static class TitleHolder { public int no; public String title; public int startIndex; public int endIndex; private List<TitleHolder> sons = new LinkedList<>(); private StringBuilder paragraph = new StringBuilder(); public TitleHolder() { } public TitleHolder(int no, String title) { this.no = no; this.title = title; } public void appendParagraph(String paragraphContent) { paragraph.append(paragraphContent); } public int getNo() { return no; } public String getTitle() { return title; } public int getStartIndex() { return startIndex; } public int getEndIndex() { return endIndex; } public List<TitleHolder> getSons() { return sons; } public StringBuilder getParagraph() { return paragraph; } } }
/** * @Author yinrushuai * @Email yinrushuai430528@foxmail.com * 2022/10/15 星期六 16:41 */ public class HtmlArticleSplitUtils extends ArticleSplitUtils { private final static String startStr = "<h"; private final static int startStrLen = startStr.length(); private final static String finishStr = ">"; private final static int finishStrLen = finishStr.length(); private final static String endStr = "</h"; private final static int endStrLen = endStr.length(); /** * @param content * @param cursor 当前位置 * @return null表示没有标题,非null表示最近一个标题。 */ protected TitleHolder captureNextTitle(StringBuilder content, int cursor) {//搜索到<h,并定位,然后搜索</h**> TitleHolder titleHolder = new TitleHolder(); try { //1.找头 int start1 = content.indexOf(startStr, cursor); if (start1 == -1) return null; int start2 = content.indexOf(finishStr, start1 + startStrLen); if (start2 == -1) return null; //2.解析数字 String numStr = content.substring(start1 + startStrLen, start2); int no = parseNum(numStr); if (no == -1) captureNextTitle(content, start1 + startStrLen);//必须从start1+startStrLen开始,防止<h<h<h3>...</h3>这种格式的数据。 titleHolder.no = no; //3.找尾 int end = content.indexOf(endStr + numStr + finishStr, start2 + finishStrLen); if (end == -1) captureNextTitle(content, start2 + finishStrLen); //4.填充内容 titleHolder.startIndex = start1; titleHolder.endIndex = end + endStrLen + numStr.length() + finishStrLen - 1; titleHolder.title = content.substring(start1, titleHolder.endIndex + 1); return titleHolder; } catch (IndexOutOfBoundsException ioobe) {//抛了越界异常,就说明没有标题了。 ioobe.printStackTrace(); return null; } } private static int parseNum(String str) { try { return Integer.valueOf(str); } catch (Exception e) { e.printStackTrace(); } return -1; } }