java输出到html换行_使用jsoup将html转换为纯文本时如何保留换行符？

最新推荐文章于 2024-05-11 08:58:44 发布

郑小武

最新推荐文章于 2024-05-11 08:58:44 发布

阅读量724

点赞数

文章标签： java输出到html换行

本文链接：https://blog.csdn.net/weixin_34651473/article/details/114723174

版权

根据其他答案以及对这个问题的评论来看，大多数来到这里的人其实是在寻找一个通用的解决方案，用来生成HTML文档的格式良好的纯文本表示。我自己就是这样。

幸运的是，JSoup已经提供了一个非常全面的例子来说明如何实现这个目标：HtmlToPlainText.java

示例中的 FormattingVisitor 可以很容易地根据您的喜好进行调整，并且已经处理了大多数块级元素和换行。

为了避免链接失效，这里给出 Jonathan Hedley 的完整解决方案：

package org.jsoup.examples;

import org.jsoup.Jsoup;

import org.jsoup.helper.StringUtil;

import org.jsoup.helper.Validate;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.nodes.Node;

import org.jsoup.nodes.TextNode;

import org.jsoup.select.Elements;

import org.jsoup.select.NodeTraversor;

import org.jsoup.select.NodeVisitor;

import java.io.IOException;

/**

* HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted

* plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a

* scrape.

* Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.

* To invoke from the command line, assuming you've downloaded the jsoup jar to your current directory:

java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]

* where url is the URL to fetch, and selector is an optional CSS selector.

* @author Jonathan Hedley, jonathan@hedley.net

public class HtmlToPlainText {

private static final String userAgent = "Mozilla/5.0 (jsoup)";

private static final int timeout = 5 * 1000;

public static void main(String... args) throws IOException {

Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]");

final String url = args[0];

final String selector = args.length == 2 ? args[1] : null;

// fetch the specified URL and parse to a HTML DOM

Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get();

HtmlToPlainText formatter = new HtmlToPlainText();

if (selector != null) {

Elements elements = doc.select(selector); // get each element that matches the CSS selector

for (Element element : elements) {

String plainText = formatter.getPlainText(element); // format that element to plain text

System.out.println(plainText);

}

} else { // format the whole doc

String plainText = formatter.getPlainText(doc);

System.out.println(plainText);

}

/**

* Format an Element to plain-text

* @param element the root element to format

* @return formatted text

public String getPlainText(Element element) {

FormattingVisitor formatter = new FormattingVisitor();

NodeTraversor traversor = new NodeTraversor(formatter);

traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node

return formatter.toString();

}

// the formatting rules, implemented in a breadth-first DOM traverse

private class FormattingVisitor implements NodeVisitor {

private static final int maxWidth = 80;

private int width = 0;

private StringBuilder accum = new StringBuilder(); // holds the accumulated text

// hit when the node is first seen

public void head(Node node, int depth) {

String name = node.nodeName();

if (node instanceof TextNode)

append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.

else if (name.equals("li"))

append("\n * ");

else if (name.equals("dt"))

append(" ");

else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr"))

append("\n");

}

// hit when all of the node's children (if any) have been visited

public void tail(Node node, int depth) {

String name = node.nodeName();

if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5"))

append("\n");

else if (name.equals("a"))

append(String.format(" ", node.absUrl("href")));

}

// appends text to the string builder with a simple word wrap method

private void append(String text) {

if (text.startsWith("\n"))

width = 0; // reset counter if starts with a newline. only from formats above, not in natural text

if (text.equals(" ") &&

(accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))

return; // don't accumulate long runs of empty spaces

if (text.length() + width > maxWidth) { // won't fit, needs to wrap

String words[] = text.split("\\s+");

for (int i = 0; i < words.length; i++) {

String word = words[i];

boolean last = i == words.length - 1;

if (!last) // insert a space if not the last word

word = word + " ";

if (word.length() + width > maxWidth) { // wrap and reset counter

accum.append("\n").append(word);

width = word.length();

} else {

accum.append(word);

width += word.length();

}

} else { // fits as is, without need to wrap text

accum.append(text);

width += text.length();

}

@Override

public String toString() {

return accum.toString();

}

郑小武

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java输出到html换行_使用jsoup将html转换为纯文本时如何保留换行符？

基于其他答案和对这个问题的评论，似乎大多数人来到这里真的在寻找一个通用的解决方案，它将提供HTML文档的格式良好的纯文本表示 . 我知道我是 .幸运的是，JSoup已经提供了一个非常全面的例子来说明如何实现这个目标：HtmlToPlainText.java示例 FormattingVisitor 可以很容易地根据您的喜好进行调整，并处理大多数块元素和换行 .为了避免链接腐烂，这里完全是_25955...
复制链接

扫一扫