如何将一篇文章拆解为标题和段落?

import java.util.LinkedList;
import java.util.List;
import java.util.Stack;

/**
 * Splits an article into a tree of titles, each carrying the paragraph text that follows it.
 *
 * <p>Algorithm:
 * <ol>
 *   <li>The article's main title (level 0) is pushed onto a stack as a sentinel root.</li>
 *   <li>The content is scanned front to back. Each time a title is captured, the text between
 *       the cursor and the title is appended to the most recently captured title.</li>
 *   <li>Every title on the stack whose level is greater than or equal to the new title's level
 *       is popped (the sentinel is never popped). The title left on top is the parent of the
 *       new title; the new title is then pushed.</li>
 *   <li>When no further title is found, the remaining text is appended to the last title.</li>
 * </ol>
 *
 * <p>How a title is recognised is left to subclasses via
 * {@link #captureNextTitle(StringBuilder, int)}.
 *
 * @Author yinrushuai
 * @Email yinrushuai430528@foxmail.com
 * 2022/10/12 Wednesday 11:00
 */
public abstract class ArticleSplitUtils {

    /**
     * Breaks {@code content} into a hierarchy of titles and paragraphs.
     *
     * @param content  the full article text to scan
     * @param mainTile the article's main title; becomes the level-0 root of the result
     * @return the root holder; nested titles are reachable through {@link TitleHolder#getSons()}
     */
    public TitleHolder split(StringBuilder content, String mainTile) {
        Stack<TitleHolder> stack = new Stack<>();

        // The main title is the sentinel: it stays at the bottom of the stack for the whole scan.
        TitleHolder mainT = new TitleHolder(0, mainTile);
        stack.push(mainT);

        // Invariant: `previous` is always the element on top of the stack.
        TitleHolder previous = mainT;
        int cursor = 0;

        TitleHolder next;
        while ((next = captureNextTitle(content, cursor)) != null) {
            // Text between the cursor and the new title belongs to the previous title.
            previous.appendParagraph(content.substring(cursor, next.startIndex));

            // A title that is more than one level deeper than the previous one skips a level.
            if (next.no - previous.no > 1) {
                System.err.println("文章格式错误!");
            }

            // Unwind to the nearest strictly shallower title. Using >= (not >) makes a title of
            // the same level a sibling rather than a child; the size guard keeps the sentinel.
            while (stack.size() > 1 && stack.peek().no >= next.no) {
                stack.pop();
            }

            stack.peek().sons.add(next);
            stack.push(next);
            previous = next;
            cursor = next.endIndex + 1;
        }

        // Whatever follows the last title is that title's paragraph.
        previous.appendParagraph(content.substring(cursor));
        return mainT;
    }

    /**
     * Locates the next title at or after {@code cursor}.
     *
     * @param content the article text
     * @param cursor  the position to start searching from
     * @return {@code null} when there is no further title; otherwise the nearest title, with
     *         {@code no}, {@code startIndex} and {@code endIndex} filled in
     */
    protected abstract TitleHolder captureNextTitle(StringBuilder content, int cursor);

    /**
     * One node of the title tree: a title, its level, its position in the source text, the
     * paragraph text collected for it, and its child titles.
     */
    public static class TitleHolder {
        /** Title level; the main title uses 0. */
        public int no;

        /** Title text as captured. */
        public String title;

        /** Index in the content where the title begins. */
        public int startIndex;

        /** Index in the content of the title's last character (inclusive). */
        public int endIndex;

        /** Child titles, in the order they were added. */
        private List<TitleHolder> sons = new LinkedList<>();

        /** Paragraph text accumulated for this title. */
        private StringBuilder paragraph = new StringBuilder();

        public TitleHolder() {
        }

        public TitleHolder(int no, String title) {
            this.no = no;
            this.title = title;
        }

        /** Appends {@code paragraphContent} to this title's paragraph text. */
        public void appendParagraph(String paragraphContent) {
            paragraph.append(paragraphContent);
        }

        public int getNo() {
            return no;
        }

        public String getTitle() {
            return title;
        }

        public int getStartIndex() {
            return startIndex;
        }

        public int getEndIndex() {
            return endIndex;
        }

        public List<TitleHolder> getSons() {
            return sons;
        }

        public StringBuilder getParagraph() {
            return paragraph;
        }
    }
}

/**
 * @Author yinrushuai
 * @Email yinrushuai430528@foxmail.com
 * 2022/10/15 星期六 16:41
 */
public class HtmlArticleSplitUtils extends ArticleSplitUtils {

    private final static String startStr = "<h";
    private final static int startStrLen = startStr.length();
    private final static String finishStr = ">";
    private final static int finishStrLen = finishStr.length();
    private final static String endStr = "</h";
    private final static int endStrLen = endStr.length();

    /**
     * @param content
     * @param cursor  当前位置
     * @return null表示没有标题,非null表示最近一个标题。
     */
    protected TitleHolder captureNextTitle(StringBuilder content, int cursor) {//搜索到<h,并定位,然后搜索</h**>
        TitleHolder titleHolder = new TitleHolder();
        try {
            //1.找头
            int start1 = content.indexOf(startStr, cursor);
            if (start1 == -1) return null;
            int start2 = content.indexOf(finishStr, start1 + startStrLen);
            if (start2 == -1) return null;

            //2.解析数字
            String numStr = content.substring(start1 + startStrLen, start2);
            int no = parseNum(numStr);
            if (no == -1)
                captureNextTitle(content, start1 + startStrLen);//必须从start1+startStrLen开始,防止<h<h<h3>...</h3>这种格式的数据。
            titleHolder.no = no;

            //3.找尾
            int end = content.indexOf(endStr + numStr + finishStr, start2 + finishStrLen);
            if (end == -1) captureNextTitle(content, start2 + finishStrLen);

            //4.填充内容
            titleHolder.startIndex = start1;
            titleHolder.endIndex = end + endStrLen + numStr.length() + finishStrLen - 1;
            titleHolder.title = content.substring(start1, titleHolder.endIndex + 1);
            return titleHolder;
        } catch (IndexOutOfBoundsException ioobe) {//抛了越界异常,就说明没有标题了。
            ioobe.printStackTrace();
            return null;
        }
    }

    private static int parseNum(String str) {
        try {
            return Integer.valueOf(str);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值