java 文本转 html格式_java操作 html 转为纯文本 并且保留文本格式

/**

*@Function: html 转为纯文本 保留格式

*@Class Name: WebFormatter

*@Author: zhangZhiPeng

*@Date: 2013-10-29

*@Modifications:

*@Modifier Name; Date; The Reason for Modifying

*

*/

public class WebFormatter{

public static void main(String[] args){

String content = html2text("

The Nobel(撒娇的空间卡死的快乐? Peace Prize for 2008 was given to Martti Ahtisaari. He was the president of Finland from 1994 to 2000. He won the prize for his work in solving (瑙e喅)international conflicts (鍐茬獊) for more than 30 years.

   During all his life, both as president and as an international officer, he has worked for peace. For the past 20 years, he has done a lot to resolve several serious international conflicts. Some of these conflicts had lasted for years. In 1989-90, he played an important role in helping Namibia鈥檚 independence(鐙珛); in 2005 he did his best to help solve the Aceh question in Indonesia. In 1999 and again in 2005-2007, under very difficult situation he found ways to help solve the conflict in Kosovo. In 2008, together with other organizations, he has tried to help solve many of the problems in Iraq. He has also made great contributions(璐$尞) to solving the conflict in Northern Ireland, Central Asia, and on the Horn of Africa.

    鈥淭his work has made a more peaceful world in Nobel鈥檚 spirit,鈥?the officer said, 鈥渟o he has won the prize.鈥?

");

System.out.println(content);

String txtSrc = "D://tomcat6_beta//webapps//acts_english//file//paper//111.txt";

createTextFile(txtSrc,content);

}

public static void createTextFile(String src, String text) {

try {

FileWriter fw = new FileWriter(src);

BufferedWriter bw = new BufferedWriter(fw);

bw.write(text);

bw.flush();

bw.close();

fw.close();

} catch (Exception e) {

e.printStackTrace();

}

}

public static String html2text(String html) {

StringBuffer sb = new StringBuffer(html.length());

char[] data = html.toCharArray();

int start = 0;

boolean previousIsPre = false;

Token token = null;

for(;;) {

token = parse(data, start, previousIsPre);

if(token==null)

break;

previousIsPre = token.isPreTag();

sb = sb.append(token.getText());

start += token.getLength();

}

return sb.toString();

}

private static Token parse(char[] data, int start, boolean previousIsPre) {

if(start>=data.length)

return null;

// try to read next char:

char c = data[start];

if(c=='

// this is a tag or comment or script:

int end_index = indexOf(data, start+1, '>');

if(end_index==(-1)) {

// the left is all text!

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

}

String s = new String(data, start, end_index-start+1);

// now we got s="<...>":

if(s.startsWith("");

if(end_comment_index==(-1)) {

// illegal end, but treat as comment:

return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);

}

else

return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);

}

String s_lowerCase = s.toLowerCase();

if(s_lowerCase.startsWith("");

if(end_script_index==(-1))

// illegal end, but treat as script:

return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);

else

return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);

}

else { // this is a tag:

return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);

}

}

// this is a text:

int next_tag_index = indexOf(data, start+1, '

if(next_tag_index==(-1))

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);

}

private static int indexOf(char[] data, int start, String s) {

char[] ss = s.toCharArray();

// TODO: performance can improve!

for(int i=start; i

// compare from data[i] with ss[0]:

boolean match = true;

for(int j=0; j

if(data[i+j]!=ss[j]) {

match = false;

break;

}

}

if(match)

return i;

}

return (-1);

}

private static int indexOf(char[] data, int start, char c) {

for(int i=start; i

if(data[i]==c)

return i;

}

return (-1);

}

}

/**
 * One lexical piece of an HTML document (text run, tag, comment or script)
 * together with its plain-text rendering.
 */
class Token {

    public static final int TOKEN_TEXT    = 0; // html text.
    public static final int TOKEN_COMMENT = 1; // comment like <!-- ... -->
    public static final int TOKEN_TAG     = 2; // tag like <pre>, <font>, etc.
    public static final int TOKEN_SCRIPT  = 3; // script element including its content

    // Lower-case tag prefixes; matched case-insensitively by compareTag().
    private static final char[] TAG_BR  = "<br".toCharArray();
    private static final char[] TAG_P   = "<p".toCharArray();
    private static final char[] TAG_LI  = "<li".toCharArray();
    private static final char[] TAG_PRE = "<pre".toCharArray();
    private static final char[] TAG_HR  = "<hr".toCharArray();

    // Complete end tags; matched case-insensitively by compareString().
    private static final char[] END_TAG_TD = "</td>".toCharArray();
    private static final char[] END_TAG_TR = "</tr>".toCharArray();
    private static final char[] END_TAG_LI = "</li>".toCharArray();

    // Named character references and their plain-text replacements.
    // Fully qualified names: this file has no import section.
    private static final java.util.Map<String, String> SPECIAL_CHARS =
            new java.util.HashMap<String, String>();

    static {
        SPECIAL_CHARS.put("&quot;", "\"");
        SPECIAL_CHARS.put("&lt;", "<");
        SPECIAL_CHARS.put("&gt;", ">");
        SPECIAL_CHARS.put("&amp;", "&");
        SPECIAL_CHARS.put("&reg;", "(r)");
        SPECIAL_CHARS.put("&copy;", "(c)");
        SPECIAL_CHARS.put("&nbsp;", " ");
        SPECIAL_CHARS.put("&pound;", "\u00A3");
    }

    private final int type;
    private final String html;     // original html of this token
    private final int length;      // html length
    private String text = null;    // plain-text rendering, null means "nothing"
    private boolean isPre = false; // true when this token is a <pre> tag

    /**
     * Creates a token from {@code data[start..end)} and computes its text form.
     *
     * @param type          one of the {@code TOKEN_*} constants
     * @param data          the whole document
     * @param start         index of the first character of the token (inclusive)
     * @param end           index just past the last character of the token (exclusive)
     * @param previousIsPre whether the preceding token was a {@code <pre>} tag; if so,
     *                      a text token keeps its spaces and line breaks
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** Returns the number of source characters this token covers. */
    public int getLength() {
        return length;
    }

    /** Returns {@code true} when this token is a {@code <pre>} start tag. */
    public boolean isPreTag() {
        return isPre;
    }

    /** Returns the plain-text rendering of this token, never {@code null}. */
    public String getText() {
        return text == null ? "" : text;
    }

    /** Returns the original HTML of this token. */
    @Override
    public String toString() {
        return html;
    }

    /**
     * Computes {@link #text} and {@link #isPre} from the token type and source.
     * Comments, scripts and unrecognised tags produce no text.
     */
    private void parseText(boolean previousIsPre) {
        if (type == TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\r\n";
            } else if (compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if (compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if (compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if (compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if (type == TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
    }

    /**
     * Converts a run of HTML text to plain text: decodes character references and,
     * unless {@code isPre} is set, collapses runs of spaces and drops line breaks.
     */
    private String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuilder buffer = new StringBuilder(cs.length);
        int start = 0;
        boolean continueSpace = false; // true while inside a run of spaces

        while (start < cs.length) {
            char current = cs[start];
            char next = (start + 1 < cs.length) ? cs[start + 1] : '\0';

            if (current == ' ') {
                // Outside <pre>, only the first space of a run is kept.
                if (isPre || !continueSpace) {
                    buffer.append(' ');
                }
                continueSpace = true;
                start++;
                continue;
            }

            if (current == '\r' && next == '\n') {
                if (isPre) {
                    buffer.append('\n');
                }
                start += 2;
                continue;
            }

            if (current == '\n' || current == '\r') {
                if (isPre) {
                    buffer.append('\n');
                }
                start++;
                continue;
            }

            continueSpace = false;

            if (current != '&') {
                buffer.append(current);
                start++;
                continue;
            }

            // Possibly a character reference: look for ';' within the next 10 chars.
            int refLength = readUtil(cs, start, ';', 10);
            if (refLength == -1) {
                buffer.append('&');
                start++;
                continue;
            }

            String replacement = SPECIAL_CHARS.get(new String(cs, start, refLength));
            if (replacement != null) {
                buffer.append(replacement);
                start += refLength;
                continue;
            }

            if (next == '#') {
                // Numeric reference such as &#1234; or &#x4D2; (digits sit between "&#" and ";").
                int code = parseCharRef(cs, start + 2, refLength - 3);
                if (code > 0) {
                    buffer.appendCodePoint(code);
                    start += refLength; // consume the whole reference, not just '&'
                } else {
                    // Not a valid number: emit "&#" literally and continue after it.
                    buffer.append("&#");
                    start += 2;
                }
                continue;
            }

            // Unknown named reference: keep the '&' and let the rest pass through as text.
            buffer.append('&');
            start++;
        }

        return buffer.toString();
    }

    /**
     * Parses the digits of a numeric character reference (decimal, or hexadecimal
     * when prefixed with {@code x}/{@code X}).
     *
     * @return the code point, or -1 when the digits are missing, malformed,
     *         zero or outside the Unicode range
     */
    private static int parseCharRef(char[] cs, int offset, int count) {
        if (count <= 0) {
            return -1;
        }
        String digits = new String(cs, offset, count);
        try {
            char first = digits.charAt(0);
            int code = (first == 'x' || first == 'X')
                    ? Integer.parseInt(digits.substring(1), 16)
                    : Integer.parseInt(digits);
            return (code > 0 && Character.isValidCodePoint(code)) ? code : -1;
        } catch (NumberFormatException notANumber) {
            // Malformed reference; the caller emits it as literal text.
            return -1;
        }
    }

    /**
     * Scans {@code cs} from {@code start} for the character {@code util}, looking at
     * no more than {@code maxLength} characters.
     *
     * @return the number of characters from {@code start} up to and including the
     *         match, or -1 if it was not found
     */
    private static int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
        int end = Math.min(start + maxLength, cs.length);
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tests whether {@code tag} begins with the lower-case prefix {@code ori_tag}
     * (ignoring case) and the prefix is not followed by a letter, so that
     * {@code "<p"} matches {@code "<p>"} and {@code "<P class=x>"} but not {@code "<pre>"}.
     */
    private static boolean compareTag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        // The character after the prefix must not be a-z, otherwise it is a longer tag name.
        char c = Character.toLowerCase(tag[ori_tag.length]);
        return c < 'a' || c > 'z';
    }

    /**
     * Tests whether {@code comp} begins with the lower-case sequence {@code ori},
     * ignoring case.
     */
    private static boolean compareString(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值