java 获取网页标题,java 网页页面抓取标题和正文

最新推荐文章于 2023-01-10 12:03:49 发布

肖昱璟

最新推荐文章于 2023-01-10 12:03:49 发布

阅读量582

点赞数

文章标签： java 获取网页标题

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

publicclassWebContent

{

/**

* 读取一个网页全部内容

publicString getOneHtml(finalString htmlurl)throwsIOException

{

URL url;

String temp;

finalStringBuffer sb =newStringBuffer();

try

{

url = newURL(htmlurl);

finalBufferedReader in =newBufferedReader(newInputStreamReader(url.openStream(),"utf-8"));// 读取网页全部内容

while((temp = in.readLine()) !=null)

{

sb.append(temp);

}

in.close();

}

catch(finalMalformedURLException me)

{

System.out.println("你输入的URL格式有问题！请仔细输入");

me.getMessage();

throwme;

}

catch(finalIOException e)

{

e.printStackTrace();

throwe;

}

returnsb.toString();

}

/**

* @param s

* @return 获得网页标题

publicString getTitle(finalString s)

{

String regex;

String title = "";

finalList list =newArrayList();

regex = "

.*?";

finalPattern pa = Pattern.compile(regex, Pattern.CANON_EQ);

finalMatcher ma = pa.matcher(s);

while(ma.find())

{

list.add(ma.group());

}

for(inti =0; i

{

title = title + list.get(i);

}

returnoutTag(title);

}

/**

* @param s

* @return 获得链接

publicList getLink(finalString s)

{

String regex;

finalList list =newArrayList();

regex = "]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)";

finalPattern pa = Pattern.compile(regex, Pattern.DOTALL);

finalMatcher ma = pa.matcher(s);

while(ma.find())

{

list.add(ma.group());

}

returnlist;

}

/**

* @param s

* @return 获得脚本代码

publicList getScript(finalString s)

{

String regex;

finalList list =newArrayList();

regex = "";

finalPattern pa = Pattern.compile(regex, Pattern.DOTALL);

finalMatcher ma = pa.matcher(s);

while(ma.find())

{

list.add(ma.group());

}

returnlist;

}

/**

* @param s

* @return 获得CSS

publicList getCSS(finalString s)

{

String regex;

finalList list =newArrayList();

regex = "";

finalPattern pa = Pattern.compile(regex, Pattern.DOTALL);

finalMatcher ma = pa.matcher(s);

while(ma.find())

{

list.add(ma.group());

}

returnlist;

}

/**

* @param s

* @return 去掉标记

publicString outTag(finalString s)

{

returns.replaceAll("<.>","");

}

/**

* @param s

* @return 获取雅虎知识堂文章标题及内容

publicHashMap getFromYahoo(finalString s)

{

finalHashMap hm =newHashMap();

finalStringBuffer sb =newStringBuffer();

String html = "";

System.out.println("\n------------------开始读取网页("+ s +")--------------------");

try

{

html = getOneHtml(s);

}

catch(finalException e)

{

e.getMessage();

}

// System.out.println(html);

System.out.println("------------------读取网页("+ s +")结束--------------------\n");

System.out.println("------------------分析("+ s +")结果如下--------------------\n");

String title = outTag(getTitle(html));

title = title.replaceAll("_雅虎知识堂","");

// Pattern pa=Pattern.compile("

// class=\"original\">(.*?)((\r\n)*)(.*?)((\r\n)*)(.*?)

",Pattern.DOTALL);

finalPattern pa = Pattern.compile("

(.*?)

", Pattern.DOTALL);

finalMatcher ma = pa.matcher(html);

while(ma.find())

{

sb.append(ma.group());

}

String temp = sb.toString();

temp = temp.replaceAll("(
)+?","\n");// 转化换行

temp = temp.replaceAll("

.*?

","");// 去图片注释

hm.put("title", title);

hm.put("original", outTag(temp));

returnhm;

}

/**

* @param args

* 测试一组网页，针对雅虎知识堂

publicstaticvoidmain(finalString args[])

{

String url = "";

finalList list =newArrayList();

System.out.print("输入URL，一行一个，输入结束后输入 go 程序开始运行: \n");

* http://ks.cn.yahoo.com/question/1307121201133.html

* http://ks.cn.yahoo.com/question/1307121101907.html

&, nbsp;* http://ks.cn.yahoo.com/question/1307121101907_2.html

* http://ks.cn.yahoo.com/question/1307121101907_3.html

* http://ks.cn.yahoo.com/question/1307121101907_4.html

* http://ks.cn.yahoo.com/question/1307121101907_5.html

* http://ks.cn.yahoo.com/question/1307121101907_6.html

* http://ks.cn.yahoo.com/question/1307121101907_7.html

* http://ks.cn.yahoo.com/question/1307121101907_8.html

finalBufferedReader br =newBufferedReader(newInputStreamReader(System.in));

try

{

while(!(url = br.readLine()).equals("go"))

{

list.add(url);

}

catch(finalException e)

{

e.getMessage();

}

finalWebContent wc =newWebContent();

HashMap hm = newHashMap();

for(inti =0; i

{

hm = wc.getFromYahoo(list.get(i));

System.out.println("标题： "+ hm.get("title"));

System.out.println("内容： \n"+ hm.get("original"));

}

* String htmlurl[] = {

* "http://ks.cn.yahoo.com/question/1307121201133.html",

* "http://ks.cn.yahoo.com/question/1307121101907.html",

* "http://ks.cn.yahoo.com/question/1307121101907_2.html",

* "http://ks.cn.yahoo.com/question/1307121101907_3.html",

* "http://ks.cn.yahoo.com/question/1307121101907_4.html",

* "http://ks.cn.yahoo.com/question/1307121101907_5.html",

* "http://ks.cn.yahoo.com/question/1307121101907_6.html",

* "http://ks.cn.yahoo.com/question/1307121101907_7.html",

* "http://ks.cn.yahoo.com/question/1307121101907_8.html" }; WebContent

* wc = new WebContent(); HashMap hm = new HashMap

* String>(); for (int i = 0; i

* wc.getFromYahoo(htmlurl[i]); System.out.println("标题： " +

* hm.get("title")); System.out.println("内容： \n" + hm.get("original")); }

* String html=""; String link=""; String sscript=""; String content="";

* System.out.println(htmlurl+" 开始读取网页内容：");

* html=wc.getOneHtml(htmlurl); System.out.println(htmlurl+"

* 读取完毕开始分析……"); html=html.replaceAll("()","

* ");//去除脚本 html=html.replaceAll("()","

* ");//去掉CSS html=html.replaceAll("

.*?"," ");//除去页面标题

* html=html.replaceAll("]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)","

* ");//去掉链接 html=html.replaceAll("(\\s){2,}?"," ");//除去多余空格

* html=wc.outTag(html);//多余标记 System.out.println(html);

* String s[]=html.split(" +"); for(int i=0;i

* content=(content.length()>s[i].length())?content:s[i]; }

* System.out.println(content);

// System.out.println(htmlurl+"网页内容结束");

* System.out.println(htmlurl+"网页脚本开始："); List

* script=wc.getScript(html); for(int i=0;i

* System.out.println(script.get(i)); }

* System.out.println(htmlurl+"网页脚本结束：");

* System.out.println(htmlurl+"CSS开始："); List css=wc.getCSS(html);

* for(int i=0;i

* System.out.println(htmlurl+"CSS结束：");

* System.out.println(htmlurl+"全部链接内容开始："); List list=wc.getLink(html);

* for(int i=0;i

* System.out.println(htmlurl+"全部链接内容结束：");

* System.out.println("内容"); System.out.println(wc.outTag(html));

}

肖昱璟

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java 获取网页标题,java 网页页面抓取标题和正文

importjava.io.BufferedReader;importjava.io.IOException;importjava.io.InputStreamReader;importjava.net.MalformedURLException;importjava.net.URL;importjava.util.ArrayList;importjava.util.HashMap;importj...
复制链接

扫一扫