java输出到html换行_使用jsoup将html转换为纯文本时如何保留换行符?

根据其他答案以及对这个问题的评论,似乎大多数来到这里的人真正想找的是一个通用的解决方案,能够为HTML文档生成格式良好的纯文本表示。我知道我就是这样。

幸运的是,JSoup已经提供了一个非常全面的例子来说明如何实现这个目标:HtmlToPlainText.java

示例中的 FormattingVisitor 可以很容易地根据您的需要进行调整,它能处理大多数块级元素和换行。

为了避免链接失效,这里给出 Jonathan Hedley 的完整解决方案:

package org.jsoup.examples;

import org.jsoup.Jsoup;

import org.jsoup.helper.StringUtil;

import org.jsoup.helper.Validate;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.nodes.Node;

import org.jsoup.nodes.TextNode;

import org.jsoup.select.Elements;

import org.jsoup.select.NodeTraversor;

import org.jsoup.select.NodeVisitor;

import java.io.IOException;

/**

* HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted

* plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a

* scrape.

*

* Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.

*

*

* To invoke from the command line, assuming you've downloaded the jsoup jar to your current directory:

*

java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]

* where url is the URL to fetch, and selector is an optional CSS selector.

*

* @author Jonathan Hedley, jonathan@hedley.net

*/

public class HtmlToPlainText {

private static final String userAgent = "Mozilla/5.0 (jsoup)";

private static final int timeout = 5 * 1000;

public static void main(String... args) throws IOException {

Validate.isTrue(args.length == 1 || args.length == 2, "usage: java -cp jsoup.jar org.jsoup.examples.HtmlToPlainText url [selector]");

final String url = args[0];

final String selector = args.length == 2 ? args[1] : null;

// fetch the specified URL and parse to a HTML DOM

Document doc = Jsoup.connect(url).userAgent(userAgent).timeout(timeout).get();

HtmlToPlainText formatter = new HtmlToPlainText();

if (selector != null) {

Elements elements = doc.select(selector); // get each element that matches the CSS selector

for (Element element : elements) {

String plainText = formatter.getPlainText(element); // format that element to plain text

System.out.println(plainText);

}

} else { // format the whole doc

String plainText = formatter.getPlainText(doc);

System.out.println(plainText);

}

}

/**

* Format an Element to plain-text

* @param element the root element to format

* @return formatted text

*/

public String getPlainText(Element element) {

FormattingVisitor formatter = new FormattingVisitor();

NodeTraversor traversor = new NodeTraversor(formatter);

traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node

return formatter.toString();

}

// the formatting rules, implemented in a breadth-first DOM traverse

private class FormattingVisitor implements NodeVisitor {

private static final int maxWidth = 80;

private int width = 0;

private StringBuilder accum = new StringBuilder(); // holds the accumulated text

// hit when the node is first seen

public void head(Node node, int depth) {

String name = node.nodeName();

if (node instanceof TextNode)

append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.

else if (name.equals("li"))

append("\n * ");

else if (name.equals("dt"))

append(" ");

else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr"))

append("\n");

}

// hit when all of the node's children (if any) have been visited

public void tail(Node node, int depth) {

String name = node.nodeName();

if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5"))

append("\n");

else if (name.equals("a"))

append(String.format(" ", node.absUrl("href")));

}

// appends text to the string builder with a simple word wrap method

private void append(String text) {

if (text.startsWith("\n"))

width = 0; // reset counter if starts with a newline. only from formats above, not in natural text

if (text.equals(" ") &&

(accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))

return; // don't accumulate long runs of empty spaces

if (text.length() + width > maxWidth) { // won't fit, needs to wrap

String words[] = text.split("\\s+");

for (int i = 0; i < words.length; i++) {

String word = words[i];

boolean last = i == words.length - 1;

if (!last) // insert a space if not the last word

word = word + " ";

if (word.length() + width > maxWidth) { // wrap and reset counter

accum.append("\n").append(word);

width = word.length();

} else {

accum.append(word);

width += word.length();

}

}

} else { // fits as is, without need to wrap text

accum.append(text);

width += text.length();

}

}

@Override

public String toString() {

return accum.toString();

}

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值