java抓取网页标题内容_java正则 读取html 获取标题/超链接/链接文本/内容

1 /**

2 *3 *@paramargs4 * 测试一组网页,针对雅虎知识堂5 */

6 public static void main(finalString args[])7 {8 String url = "";9 final List list = new ArrayList();10 System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行: \n");11 /*

12 *http://ks.cn.yahoo.com/question/1307121201133.html

13 *http://ks.cn.yahoo.com/question/1307121101907.html

14 *http://ks.cn.yahoo.com/question/1307121101907_2.html

15 *http://ks.cn.yahoo.com/question/1307121101907_3.html

16 *http://ks.cn.yahoo.com/question/1307121101907_4.html

17 *http://ks.cn.yahoo.com/question/1307121101907_5.html

18 *http://ks.cn.yahoo.com/question/1307121101907_6.html

19 *http://ks.cn.yahoo.com/question/1307121101907_7.html

20 *http://ks.cn.yahoo.com/question/1307121101907_8.html

21 */

22 final BufferedReader br = new BufferedReader(newInputStreamReader(System.in));23 try

24 {25 while (!(url = br.readLine()).equals("go"))26 {27 list.add(url);28 }29 }30 catch (finalException e)31 {32 e.getMessage();33 }34 final WebContent wc = newWebContent();35 HashMap hm = new HashMap();36 for (int i = 0; i < list.size(); i++)37 {38 hm =wc.getFromYahoo(list.get(i));39 System.out.println("标题: " + hm.get("title"));40 System.out.println("内容: \n" + hm.get("original"));41 }42 /*

43 * String htmlurl[] = {44 * "http://ks.cn.yahoo.com/question/1307121201133.html",45 * "http://ks.cn.yahoo.com/question/1307121101907.html",46 * "http://ks.cn.yahoo.com/question/1307121101907_2.html",47 * "http://ks.cn.yahoo.com/question/1307121101907_3.html",48 * "http://ks.cn.yahoo.com/question/1307121101907_4.html",49 * "http://ks.cn.yahoo.com/question/1307121101907_5.html",50 * "http://ks.cn.yahoo.com/question/1307121101907_6.html",51 * "http://ks.cn.yahoo.com/question/1307121101907_7.html",52 * "http://ks.cn.yahoo.com/question/1307121101907_8.html" }; WebContent53 * wc = new WebContent(); HashMap hm = new HashMap(); for (int i = 0; i < htmlurl.length; i++) { hm =55 * wc.getFromYahoo(htmlurl[i]); System.out.println("标题: " +56 * hm.get("title")); System.out.println("内容: \n" + hm.get("original")); }57 */

58 /*

59 * String html=""; String link=""; String sscript=""; String content="";60 * System.out.println(htmlurl+" 开始读取网页内容:");61 * html=wc.getOneHtml(htmlurl); System.out.println(htmlurl+"62 * 读取完毕开始分析……"); html=html.replaceAll("()","63 * ");//去除脚本 html=html.replaceAll("()","64 * ");//去掉CSS html=html.replaceAll(".*?"," ");//除去页面标题65 * html=html.replaceAll("]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)","66 * ");//去掉链接 html=html.replaceAll("(\\s){2,}?"," ");//除去多余空格67 * html=wc.outTag(html);//多余标记 System.out.println(html);68 */

69

70 /*

71 * String s[]=html.split(" +"); for(int i=0;is[i].length())?content:s[i]; }73 * System.out.println(content);74 */

75

76 //System.out.println(htmlurl+"网页内容结束");

77 /*

78 * System.out.println(htmlurl+"网页脚本开始:"); List79 * script=wc.getScript(html); for(int i=0;i

93 }94 }95

96

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值