Java 将 HTML 转为纯文本并保留文本格式

最新推荐文章于 2024-07-13 02:44:55 发布

weixin_33675507

最新推荐文章于 2024-07-13 02:44:55 发布

阅读量1.1k

点赞数

文章标签： java 数据库

原文链接：http://blog.51cto.com/itstyle/1369023

版权

/**

*@Function: html 转为纯文本保留格式

*@Class Name: WebFormatter

*@Author: zhangZhiPeng

*@Date: 2013-10-29

*@Modifications:

*@Modifier Name; Date; The Reason for Modifying

public class WebFormatter{

public static void main(String[] args){

String content = html2text("<p>The Nobel(撒娇的空间卡死的快乐? Peace Prize for 2008 was given to Martti Ahtisaari. He was the president of Finland from 1994 to 2000. He won the prize for his work in solving (瑙ｅ喅)international conflicts (鍐茬獊) for more than 30 years. </p><p>   During all his life, both as president and as an international officer, he has worked for peace. For the past 20 years, he has done a lot to resolve several serious international conflicts. Some of these conflicts had lasted for years. In 1989-90, he played an important role in helping Namibia鈥檚 independence(鐙珛); in 2005 he did his best to help solve the Aceh question in Indonesia. In 1999 and again in 2005-2007, under very difficult situation he found ways to help solve the conflict in Kosovo. In 2008, together with other organizations, he has tried to help solve many of the problems in Iraq. He has also made great contributions(璐＄尞) to solving the conflict in Northern Ireland, Central Asia, and on the Horn of Africa.</p><p>    鈥淭his work has made a more peaceful world in Nobel鈥檚 spirit,鈥?the officer said, 鈥渟o he has won the prize.鈥?</p>");

System.out.println(content);

String txtSrc = "D://tomcat6_beta//webapps//acts_english//file//paper//111.txt";

createTextFile(txtSrc,content);

}

public static void createTextFile(String src, String text) {

try {

FileWriter fw = new FileWriter(src);

BufferedWriter bw = new BufferedWriter(fw);

bw.write(text);

bw.flush();

bw.close();

fw.close();

} catch (Exception e) {

e.printStackTrace();

}

public static String html2text(String html) {

StringBuffer sb = new StringBuffer(html.length());

char[] data = html.toCharArray();

int start = 0;

boolean previousIsPre = false;

Token token = null;

for(;;) {

token = parse(data, start, previousIsPre);

if(token==null)

break;

previousIsPre = token.isPreTag();

sb = sb.append(token.getText());

start += token.getLength();

}

return sb.toString();

}

private static Token parse(char[] data, int start, boolean previousIsPre) {

if(start>=data.length)

return null;

// try to read next char:

char c = data[start];

if(c=='<') {

// this is a tag or comment or script:

int end_index = indexOf(data, start+1, '>');

if(end_index==(-1)) {

// the left is all text!

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

}

String s = new String(data, start, end_index-start+1);

// now we got s="<...>":

if(s.startsWith("<!--")) { // this is a comment!

int end_comment_index = indexOf(data, start+1, "-->");

if(end_comment_index==(-1)) {

// illegal end, but treat as comment:

return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);

}

else

return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);

}

String s_lowerCase = s.toLowerCase();

if(s_lowerCase.startsWith("<script")) { // this is a script:

int end_script_index = indexOf(data, start+1, "</script>");

if(end_script_index==(-1))

// illegal end, but treat as script:

return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);

else

return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);

}

else { // this is a tag:

return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);

}

// this is a text:

int next_tag_index = indexOf(data, start+1, '<');

if(next_tag_index==(-1))

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);

}

private static int indexOf(char[] data, int start, String s) {

char[] ss = s.toCharArray();

// TODO: performance can improve!

for(int i=start; i<(data.length-ss.length); i++) {

// compare from data[i] with ss[0]:

boolean match = true;

for(int j=0; j<ss.length; j++) {

if(data[i+j]!=ss[j]) {

match = false;

break;

}

if(match)

return i;

}

return (-1);

}

private static int indexOf(char[] data, int start, char c) {

for(int i=start; i<data.length; i++) {

if(data[i]==c)

return i;

}

return (-1);

}

/**
 * One piece of an HTML document (a text run, a tag, a comment or a script)
 * together with its plain-text rendering.
 *
 * <p>Rendering rules:
 * <ul>
 *   <li>{@code <br>} and {@code <p>} tags become {@code "\r\n"}; {@code <li>} becomes
 *       {@code "\n* "}; {@code <hr>} becomes {@code "\n--------\n"}; {@code </td>}
 *       becomes a tab; {@code </tr>} and {@code </li>} become {@code "\n"}.</li>
 *   <li>A {@code <pre>} tag produces no text but marks the token as a pre tag.</li>
 *   <li>Text tokens have whitespace collapsed and character entities decoded.</li>
 *   <li>Comments, scripts and all other tags produce no text.</li>
 * </ul>
 */
class Token {

    public static final int TOKEN_TEXT = 0; // html text.

    public static final int TOKEN_COMMENT = 1; // comment like <!-- ... -->

    public static final int TOKEN_TAG = 2; // tag like <pre>, <font>, etc.

    public static final int TOKEN_SCRIPT = 3;

    private static final char[] TAG_BR = "<br".toCharArray();

    private static final char[] TAG_P = "<p".toCharArray();

    private static final char[] TAG_LI = "<li".toCharArray();

    private static final char[] TAG_PRE = "<pre".toCharArray();

    private static final char[] TAG_HR = "<hr".toCharArray();

    private static final char[] END_TAG_TD = "</td>".toCharArray();

    private static final char[] END_TAG_TR = "</tr>".toCharArray();

    private static final char[] END_TAG_LI = "</li>".toCharArray();

    /** Longest entity, counted from '&' through ';', that is looked for. */
    private static final int MAX_ENTITY_LENGTH = 10;

    // Named entities and their replacements. Keys are the complete entity as it
    // appears in the markup, because lookups use the text from '&' through ';'.
    // Types are fully qualified so this class needs no import section.
    private static final java.util.Map<String, String> SPECIAL_CHARS =
            new java.util.HashMap<String, String>();

    private final int type;

    private final String html; // original html

    private String text = null; // text!

    private final int length; // html length

    private boolean isPre = false; // isPre tag?

    static {
        SPECIAL_CHARS.put("&quot;", "\"");
        SPECIAL_CHARS.put("&lt;", "<");
        SPECIAL_CHARS.put("&gt;", ">");
        SPECIAL_CHARS.put("&amp;", "&");
        SPECIAL_CHARS.put("&reg;", "(r)");
        SPECIAL_CHARS.put("&copy;", "(c)");
        SPECIAL_CHARS.put("&nbsp;", " ");
        SPECIAL_CHARS.put("&pound;", "?");
    }

    /**
     * Creates a token from {@code data[start..end)} and renders its text.
     *
     * @param type          one of the {@code TOKEN_*} constants
     * @param data          the whole document
     * @param start         index of the first character of the token
     * @param end           index just past the last character of the token
     * @param previousIsPre whether the preceding token was a pre tag; for text
     *                      tokens this switches whitespace collapsing off
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** Returns the number of characters of original HTML covered by this token. */
    public int getLength() {
        return length;
    }

    /** Returns {@code true} if this token is a {@code <pre>} opening tag. */
    public boolean isPreTag() {
        return isPre;
    }

    /** Returns the plain-text rendering of this token; never {@code null}. */
    public String getText() {
        return text == null ? "" : text;
    }

    /** Returns the original HTML of this token. */
    @Override
    public String toString() {
        return html;
    }

    /** Computes {@link #text} and {@link #isPre} from the token type and its HTML. */
    private void parseText(boolean previousIsPre) {
        if (type == TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\r\n";
            } else if (compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if (compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if (compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if (compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if (type == TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
        // Comments and scripts keep text == null and therefore render as "".
    }

    /**
     * Renders a run of HTML text as plain text.
     *
     * <p>Outside pre-formatted text, runs of spaces collapse to one space and line
     * breaks are removed. Inside pre-formatted text spaces are kept and every line
     * break becomes {@code '\n'}. Character entities are decoded in both modes.
     *
     * @param html  the text run
     * @param isPre whether the run is pre-formatted
     * @return the rendered text
     */
    private String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuilder buffer = new StringBuilder(cs.length);
        boolean continueSpace = false;
        int start = 0;

        while (start < cs.length) {
            char current = cs[start];
            char next = (start + 1 < cs.length) ? cs[start + 1] : '\0';

            if (current == ' ') {
                if (isPre || !continueSpace) {
                    buffer.append(' ');
                }
                continueSpace = true;
                start++;
                continue;
            }

            // NOTE(review): outside <pre> a line break is dropped, not turned into a
            // space, so "a\nb" renders as "ab". Kept as is; confirm this is intended.
            if (current == '\r' && next == '\n') {
                if (isPre) {
                    buffer.append('\n');
                }
                start += 2;
                continue;
            }
            if (current == '\n' || current == '\r') {
                if (isPre) {
                    buffer.append('\n');
                }
                start++;
                continue;
            }

            // Any other character ends a run of spaces.
            continueSpace = false;

            if (current == '&') {
                start += appendEntity(cs, start, next, buffer);
            } else {
                buffer.append(current);
                start++;
            }
        }
        return buffer.toString();
    }

    /**
     * Decodes what may be a character entity starting at {@code cs[start]} (which is
     * {@code '&'}), appends the result to {@code buffer} and reports how many input
     * characters were consumed.
     *
     * @param next the character following the {@code '&'}, or {@code '\0'} at the end
     * @return the number of characters of {@code cs} consumed, always at least 1
     */
    private int appendEntity(char[] cs, int start, char next, StringBuilder buffer) {
        int entityLength = readUtil(cs, start, ';', MAX_ENTITY_LENGTH);
        if (entityLength == -1) {
            // No ';' close enough: a plain ampersand.
            buffer.append('&');
            return 1;
        }

        String replacement = SPECIAL_CHARS.get(new String(cs, start, entityLength));
        if (replacement != null) {
            buffer.append(replacement);
            return entityLength;
        }

        if (next != '#') {
            // Unknown named entity: emit the ampersand, the rest follows as text.
            buffer.append('&');
            return 1;
        }

        // Numeric entity "&#NNN;": the digits sit between "&#" and ";".
        try {
            int code = Integer.parseInt(new String(cs, start + 2, entityLength - 3));
            if (code > 0 && code < 65536) {
                buffer.append((char) code);
                // Skip the whole entity, not only the '&', so the digits are not
                // emitted a second time as ordinary text.
                return entityLength;
            }
        } catch (NumberFormatException ignored) {
            // Not a decimal number: fall through and emit the characters literally.
        }
        buffer.append("&#");
        return 2;
    }

    /**
     * Looks for {@code util} in {@code cs}, starting at {@code start} and examining
     * at most {@code maxLength} characters, never reading past the end of the array.
     *
     * @return the number of characters from {@code start} through the match,
     *         inclusive, or {@code -1} if not found
     */
    private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
        int end = Math.min(start + maxLength, cs.length);
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tells whether {@code tag} (e.g. {@code "<INPUT value=aa>"}) is the tag named by
     * the lower-case prefix {@code ori_tag} (e.g. {@code "<input"}). The comparison
     * ignores case and requires that the character after the prefix is not a letter
     * a-z, so that {@code "<p"} does not match {@code "<pre>"}.
     */
    private boolean compareTag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        char following = Character.toLowerCase(tag[ori_tag.length]);
        return following < 'a' || following > 'z';
    }

    /** Tells whether {@code comp} starts with the lower-case text {@code ori}, ignoring case. */
    private boolean compareString(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }
}

转载于:https://blog.51cto.com/itstyle/1369023

weixin_33675507

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

java操作 html 转为纯文本 并且保留文本格式

java操作 html 转为纯文本并且保留文本格式