java html转txt_java将HTML转换成TXT

package html2txt;

import java.util.*;

import java.io.BufferedReader;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.text.SimpleDateFormat;

/**

* Do some format on web display.

*

* @author Xuefeng

*/

public class WebFormatter {

public static void main(String[] args)

{

BufferedReader br;

String HtmlStr="";

try {

br = new BufferedReader(

new FileReader("e:\\a.html"));

String t=null;

while((t=br.readLine())!=null){//读一行

HtmlStr=HtmlStr+t;

}

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

//HtmlStr="

  三、改进措施
  1、 多关注优等生,让他们一定要保持稳定。多关注中等生,在一定程度上能够让他们与优等生逐一雌雄。多鼓励下游生!努力使两极分化最小化!
  2、一如既往抓学风。要做到勤说(每天每时常提醒)、勤问(每天能够与个别学生进行交流)、勤查(每天检查各科作业的完成情况),加大管理力度,让学生能在安静的环境中学习。
  3、做好学生的思想工作,阐明各校之间竞争的严峻形势,让学生有忧患意识,从而调动学习的积极性。
  4、 多与各科教师联系,及时了解学生动态,接受科任老师的建议。多与家长交流,形成合力,共同督促学生学习,使其进步。
  5、 继续进行“一帮一”互助学习帮教活动,让优生带动差生学习,共同取得进步。
  6、 学生进行深刻的自我反思,对自己的学习提出具体的要求。
  7、 举行一次学法交流会,促成每个学生形成适合自己的良好学习方法。
";

String str=html2text(HtmlStr);

System.out.println(str);

}

public static String html2text(String html) {

if(html==null||html=="")

{

return "";

}

StringBuffer sb = new StringBuffer(html.length());

char[] data = html.toCharArray();

int start = 0;

boolean previousIsPre = false;

Token token = null;

for(;;) {

token = parse(data, start, previousIsPre);

if(token==null)

break;

previousIsPre = token.isPreTag();

sb = sb.append(token.getText());

start += token.getLength();

}

return sb.toString();

}

private static Token parse(char[] data, int start, boolean previousIsPre) {

if(start>=data.length)

return null;

// try to read next char:

char c = data[start];

if(c=='

// this is a tag or comment or script:

int end_index = indexOf(data, start+1, '>');

if(end_index==(-1)) {

// the left is all text!

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

}

String s = new String(data, start, end_index-start+1);

// now we got s="<...>":

if(s.startsWith("");

if(end_comment_index==(-1)) {

// illegal end, but treat as comment:

return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);

}

else

return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);

}

String s_lowerCase = s.toLowerCase();

if(s_lowerCase.startsWith("");

if(end_script_index==(-1))

// illegal end, but treat as script:

return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);

else

return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);

}

else { // this is a tag:

return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);

}

}

// this is a text:

int next_tag_index = indexOf(data, start+1, '

if(next_tag_index==(-1))

return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);

return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);

}

private static int indexOf(char[] data, int start, String s) {

char[] ss = s.toCharArray();

// TODO: performance can improve!

for(int i=start; i

// compare from data[i] with ss[0]:

boolean match = true;

for(int j=0; j

if(data[i+j]!=ss[j]) {

match = false;

break;

}

}

if(match)

return i;

}

return (-1);

}

private static int indexOf(char[] data, int start, char c) {

for(int i=start; i

if(data[i]==c)

return i;

}

return (-1);

}

}

/**
 * One lexical piece of an HTML document (text, tag, comment or script) together with
 * its plain-text rendering.
 *
 * <p>The rendering is computed once in the constructor:
 * <ul>
 *   <li>text tokens have whitespace collapsed (unless preformatted) and character
 *       references decoded;</li>
 *   <li>a few tags are turned into line breaks, bullets, rules or tabs;</li>
 *   <li>every other tag, and all comments and scripts, render as the empty string.</li>
 * </ul>
 */
class Token {

    public static final int TOKEN_TEXT = 0;    // html text.
    public static final int TOKEN_COMMENT = 1; // comment like <!-- comments... -->
    public static final int TOKEN_TAG = 2;     // tag like <pre>, <font>, etc.
    public static final int TOKEN_SCRIPT = 3;  // script block, including its content

    // Opening-tag prefixes; matched case-insensitively by compareTag.
    private static final char[] TAG_BR = "<br".toCharArray();
    private static final char[] TAG_P = "<p".toCharArray();
    private static final char[] TAG_LI = "<li".toCharArray();
    private static final char[] TAG_PRE = "<pre".toCharArray();
    private static final char[] TAG_HR = "<hr".toCharArray();

    // Complete closing tags; matched case-insensitively by compareString.
    private static final char[] END_TAG_TD = "</td>".toCharArray();
    private static final char[] END_TAG_TR = "</tr>".toCharArray();
    private static final char[] END_TAG_LI = "</li>".toCharArray();

    /** Longest character reference (from '&' to ';' inclusive) that is looked for. */
    private static final int MAX_ENTITY_LENGTH = 10;

    /** Named character references and their plain-text replacements. */
    private static final Map<String, String> SPECIAL_CHARS = new HashMap<String, String>();

    static {
        SPECIAL_CHARS.put("&quot;", "\"");
        SPECIAL_CHARS.put("&lt;", "<");
        SPECIAL_CHARS.put("&gt;", ">");
        SPECIAL_CHARS.put("&amp;", "&");
        SPECIAL_CHARS.put("&reg;", "(r)");
        SPECIAL_CHARS.put("&copy;", "(c)");
        SPECIAL_CHARS.put("&nbsp;", " ");
        SPECIAL_CHARS.put("&pound;", "\u00A3");
    }

    private final int type;
    private final String html;    // original html
    private final int length;     // html length
    private String text = null;   // plain-text rendering; null means "renders as nothing"
    private boolean isPre = false; // true only when this token is an opening pre tag

    /**
     * Creates the token covering {@code data[start, end)} and computes its rendering.
     *
     * @param type          one of the {@code TOKEN_*} constants
     * @param data          the complete HTML source
     * @param start         index of the first character of the token (inclusive)
     * @param end           index just past the last character of the token (exclusive)
     * @param previousIsPre whether the preceding token was an opening {@code <pre>} tag;
     *                      when {@code true}, whitespace in a text token is preserved
     */
    public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
        this.type = type;
        this.length = end - start;
        this.html = new String(data, start, length);
        parseText(previousIsPre);
    }

    /** Returns the number of source characters this token covers. */
    public int getLength() {
        return length;
    }

    /** Returns {@code true} when this token is an opening {@code <pre>} tag. */
    public boolean isPreTag() {
        return isPre;
    }

    /** Returns the plain-text rendering of this token; never {@code null}. */
    public String getText() {
        return text == null ? "" : text;
    }

    /** Returns the original HTML covered by this token. */
    @Override
    public String toString() {
        return html;
    }

    /** Computes {@code text} and {@code isPre} from the token type and its HTML. */
    private void parseText(boolean previousIsPre) {
        if (type == TOKEN_TAG) {
            char[] cs = html.toCharArray();
            if (compareTag(TAG_BR, cs) || compareTag(TAG_P, cs)) {
                text = "\n";
            } else if (compareTag(TAG_LI, cs)) {
                text = "\n* ";
            } else if (compareTag(TAG_PRE, cs)) {
                isPre = true;
            } else if (compareTag(TAG_HR, cs)) {
                text = "\n--------\n";
            } else if (compareString(END_TAG_TD, cs)) {
                text = "\t";
            } else if (compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs)) {
                text = "\n";
            }
        } else if (type == TOKEN_TEXT) {
            text = toText(html, previousIsPre);
        }
        // Comments and scripts keep text == null and therefore render as "".
    }

    /**
     * Renders a run of HTML text as plain text.
     *
     * <p>Outside preformatted text, runs of spaces collapse to one space and line breaks
     * are dropped; inside preformatted text both are kept (CR LF becomes LF). Named
     * references listed in {@code SPECIAL_CHARS} and numeric references (decimal
     * {@code &#1234;} or hexadecimal {@code &#x4D2;}) are decoded. Anything else that
     * starts with {@code '&'} is copied through unchanged.
     */
    private static String toText(String html, final boolean isPre) {
        char[] cs = html.toCharArray();
        StringBuilder buffer = new StringBuilder(cs.length);
        int start = 0;
        boolean continueSpace = false; // true while inside a run of collapsible whitespace

        while (start < cs.length) {
            char current = cs[start];
            char next = (start + 1 < cs.length) ? cs[start + 1] : '\0';

            if (current == ' ') {
                if (isPre || !continueSpace) {
                    buffer.append(' ');
                }
                continueSpace = true;
                start++;
                continue;
            }
            if (current == '\r' && next == '\n') {
                if (isPre) {
                    buffer.append('\n');
                }
                start += 2;
                continue;
            }
            if (current == '\n' || current == '\r') {
                if (isPre) {
                    buffer.append('\n');
                }
                start++;
                continue;
            }

            // Any other character ends a run of spaces.
            continueSpace = false;

            if (current != '&') {
                buffer.append(current);
                start++;
                continue;
            }

            // Possibly a character reference: look for the terminating ';'.
            int length = readUtil(cs, start, ';', MAX_ENTITY_LENGTH);
            if (length != -1) {
                String specChar = SPECIAL_CHARS.get(new String(cs, start, length));
                if (specChar != null) {
                    buffer.append(specChar);
                    start += length;
                    continue;
                }
                if (next == '#') {
                    // Strip the leading "&#" and the trailing ';'.
                    int code = parseCharRef(new String(cs, start + 2, length - 3));
                    if (code != -1) {
                        buffer.appendCodePoint(code);
                        // Skip the whole reference, not just the '&'.
                        start += length;
                        continue;
                    }
                }
            }

            // Not a recognised reference: keep the '&' and let the following
            // characters be copied as ordinary text.
            buffer.append('&');
            start++;
        }
        return buffer.toString();
    }

    /**
     * Parses the digits of a numeric character reference.
     *
     * @param digits the text between {@code "&#"} and {@code ';'}; a leading
     *               {@code x}/{@code X} selects hexadecimal
     * @return the code point, or {@code -1} when the text is not a valid code point
     */
    private static int parseCharRef(String digits) {
        try {
            int code;
            if (digits.length() > 1 && (digits.charAt(0) == 'x' || digits.charAt(0) == 'X')) {
                code = Integer.parseInt(digits.substring(1), 16);
            } else {
                code = Integer.parseInt(digits);
            }
            return (code > 0 && code <= Character.MAX_CODE_POINT) ? code : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Scans from {@code cs[start]} for the character {@code util}, looking at no more
     * than {@code maxLength} characters.
     *
     * @return the number of characters from {@code start} up to and including the match,
     *         or {@code -1} when it is not found within the limit
     */
    private static int readUtil(final char[] cs, final int start, final char util,
            final int maxLength) {
        int end = Math.min(start + maxLength, cs.length);
        for (int i = start; i < end; i++) {
            if (cs[i] == util) {
                return i - start + 1;
            }
        }
        return -1;
    }

    /**
     * Tests whether {@code tag} is the opening tag named by {@code ori_tag}.
     *
     * <p>{@code tag} must start with {@code ori_tag} (ignoring case) and the character
     * right after that prefix must not be a letter a-z, so that {@code "<p"} matches
     * {@code "<p>"} and {@code "<P class=x>"} but not {@code "<pre>"}.
     */
    private static boolean compareTag(final char[] ori_tag, char[] tag) {
        if (ori_tag.length >= tag.length) {
            return false;
        }
        for (int i = 0; i < ori_tag.length; i++) {
            if (Character.toLowerCase(tag[i]) != ori_tag[i]) {
                return false;
            }
        }
        // The length check above guarantees there is a character after the prefix.
        char c = Character.toLowerCase(tag[ori_tag.length]);
        return c < 'a' || c > 'z';
    }

    /** Tests whether {@code comp} starts with {@code ori}, ignoring the case of {@code comp}. */
    private static boolean compareString(final char[] ori, char[] comp) {
        if (ori.length > comp.length) {
            return false;
        }
        for (int i = 0; i < ori.length; i++) {
            if (Character.toLowerCase(comp[i]) != ori[i]) {
                return false;
            }
        }
        return true;
    }

}

参与评论 您还未登录,请先 登录 后发表或查看评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
©️2022 CSDN 皮肤主题:1024 设计师:我叫白小胖 返回首页

打赏作者

weixin_39787792

你的鼓励将是我创作的最大动力

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值