java读取html

package test;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.net.MalformedURLException;

import java.net.URL;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Downloads web pages and extracts pieces of them (title, links, scripts,
 * style sheets, plain text) with regular expressions.
 *
 * <p>Originally written as a test that grabs article titles and bodies from
 * Yahoo Knowledge Hall ("雅虎知识堂"); URLs are typed in by hand, and the
 * original author noted it could be extended to crawl the whole site.
 *
 * <p>NOTE(review): the HTML tags inside several regular expressions were lost
 * in the listing this class was recovered from. They have been reconstructed
 * from the surrounding comments; each reconstructed pattern is marked below
 * and should be confirmed against real pages.
 *
 * @author Xiaofeng (晓峰), 2007-01-18
 */
public class WebContent
{

    /** Matches a whole {@code <title>} element; tag case is ignored. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Matches a whole anchor element that carries an {@code href} attribute
     * (double-quoted, single-quoted or unquoted value).
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a whole {@code <script>} element (reconstructed pattern). */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script\\b.*?</script>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a whole {@code <style>} element (reconstructed pattern). */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style\\b.*?</style>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches any single tag, including tags that span several lines. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Matches the article body container on a Yahoo Knowledge Hall page.
     * NOTE(review): reconstructed; only {@code class="original"} survives in
     * the recovered listing - confirm the element name and closing tags.
     */
    private static final Pattern YAHOO_CONTENT_PATTERN =
            Pattern.compile("<div class=\"original\">(.*?)</p></div>", Pattern.DOTALL);

    /**
     * Matches an image caption paragraph inside the article body.
     * NOTE(review): reconstructed from the "remove image captions" comment -
     * confirm the markup.
     */
    private static final Pattern YAHOO_CAPTION_PATTERN = Pattern.compile("<p><em>.*?</em></p>");

    /** Site name suffix that the site appends to every page title. */
    private static final String YAHOO_TITLE_SUFFIX = "_雅虎知识堂";

    /**
     * Reads the complete content of one web page, decoded as UTF-8.
     *
     * <p>Line terminators are discarded (the lines are concatenated without a
     * separator), so the result is a single line of text.
     *
     * @param htmlurl absolute URL of the page to read
     * @return the page content with all line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException
    {
        final URL url;
        try
        {
            url = new URL(htmlurl);
        }
        catch (final MalformedURLException me)
        {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }

        final StringBuilder sb = new StringBuilder();
        final BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
        try
        {
            String line;
            while ((line = in.readLine()) != null)
            {
                sb.append(line);
            }
        }
        finally
        {
            // Close the connection even when reading fails half-way.
            in.close();
        }
        return sb.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return the text of every {@code <title>} element, concatenated, with
     *         the tags removed; empty if there is none
     */
    public String getTitle(final String s)
    {
        final StringBuilder title = new StringBuilder();
        for (final String match : findAll(TITLE_PATTERN, s))
        {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Extracts the links.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return every anchor element that has an {@code href}, as raw HTML
     */
    public List<String> getLink(final String s)
    {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts the script code.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return every {@code <script>} element, as raw HTML
     */
    public List<String> getScript(final String s)
    {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts the embedded CSS.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return every {@code <style>} element, as raw HTML
     */
    public List<String> getCSS(final String s)
    {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes all tags.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return {@code s} with every {@code <...>} sequence removed
     */
    public String outTag(final String s)
    {
        if (s == null)
        {
            return "";
        }
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a Yahoo Knowledge Hall article and extracts its title and body.
     *
     * <p>A download failure is reported on standard error and then treated as
     * an empty page, so the returned map always contains both keys.
     *
     * @param s URL of the article page
     * @return a map with the keys {@code "title"} and {@code "original"}
     *         (the article body as plain text)
     */
    public HashMap<String, String> getFromYahoo(final String s)
    {
        String html = "";
        System.out.println("\n------------------开始读取网页(" + s + ")--------------------");
        try
        {
            html = getOneHtml(s);
        }
        catch (final Exception e)
        {
            // Deliberately broad: one unreadable URL must not abort a batch run.
            System.err.println("Failed to read " + s + ": " + e);
        }
        System.out.println("------------------读取网页(" + s + ")结束--------------------\n");
        System.out.println("------------------分析(" + s + ")结果如下--------------------\n");
        return extractYahooArticle(html);
    }

    /** Pulls the title and the plain-text body out of an already downloaded article page. */
    private HashMap<String, String> extractYahooArticle(final String html)
    {
        final String title = getTitle(html).replace(YAHOO_TITLE_SUFFIX, "");

        final StringBuilder body = new StringBuilder();
        for (final String match : findAll(YAHOO_CONTENT_PATTERN, html))
        {
            body.append(match);
        }
        // Turn explicit line breaks into newlines before the tags are stripped.
        String text = body.toString().replace("<br>", "\n");
        // Drop image captions.
        text = YAHOO_CAPTION_PATTERN.matcher(text).replaceAll("");

        final HashMap<String, String> hm = new HashMap<String, String>();
        hm.put("title", title);
        hm.put("original", outTag(text));
        return hm;
    }

    /** Collects every match of {@code pattern} in {@code s}; a {@code null} input yields an empty list. */
    private static List<String> findAll(final Pattern pattern, final String s)
    {
        final List<String> matches = new ArrayList<String>();
        if (s == null)
        {
            return matches;
        }
        final Matcher ma = pattern.matcher(s);
        while (ma.find())
        {
            matches.add(ma.group());
        }
        return matches;
    }

    /**
     * Reads URLs from standard input, one per line, until a line containing
     * {@code go} (or end of input), then prints the title and body of each
     * page. Blank lines are ignored.
     *
     * <p>Sample URLs listed by the original author:
     * {@code http://ks.cn.yahoo.com/question/1307121201133.html},
     * {@code http://ks.cn.yahoo.com/question/1307121101907.html}.
     *
     * @param args unused
     */
    public static void main(final String args[])
    {
        final List<String> urls = new ArrayList<String>();
        System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行: \n");

        final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try
        {
            String line;
            // readLine() returns null at end of input; treat that like "go".
            while ((line = br.readLine()) != null)
            {
                final String url = line.trim();
                if ("go".equals(url))
                {
                    break;
                }
                if (url.length() > 0)
                {
                    urls.add(url);
                }
            }
        }
        catch (final IOException e)
        {
            System.err.println("Failed to read URLs from standard input: " + e);
        }

        final WebContent wc = new WebContent();
        for (final String url : urls)
        {
            final HashMap<String, String> hm = wc.getFromYahoo(url);
            System.out.println("标题: " + hm.get("title"));
            System.out.println("内容: \n" + hm.get("original"));
        }
    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值