import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and extracts pieces of it (title, links, scripts,
 * style sheets, article body) with regular expressions.
 *
 * <p>The extraction is regex based, not a real HTML parser: it is meant for
 * simple, well-formed pages and will not cope with every legal HTML document.
 */
public class WebContent {

    /** Flags shared by the tag-matching patterns: HTML tag names are case-insensitive and may span lines. */
    private static final int HTML_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    /** A complete {@code <title>} element, including its tags. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title\\b[^>]*>.*?</title>", HTML_FLAGS);

    /** A complete {@code <a ... href=...>...</a>} element; the href may be double-, single- or un-quoted. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", HTML_FLAGS);

    /** A complete {@code <script>} element, including its tags. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script\\b.*?</script>", HTML_FLAGS);

    /** A complete {@code <style>} element, including its tags. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style\\b.*?</style>", HTML_FLAGS);

    /** Any single tag; {@code [^>]} also matches line breaks, so multi-line tags are covered. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /** The article container that {@link #getFromYahoo(String)} looks for. */
    private static final Pattern ORIGINAL_DIV_PATTERN =
            Pattern.compile("<div class=\"original\">(.*?)</p></div>", Pattern.DOTALL);

    /** Picture captions inside the article body, which are dropped from the extracted text. */
    private static final Pattern IMAGE_CAPTION_PATTERN = Pattern.compile("<p><em>.*?</em></p>");

    /**
     * Downloads the document at the given URL and returns it as one string.
     *
     * <p>The content is decoded as UTF-8. Lines are concatenated without any
     * separator, i.e. the line terminators of the document are dropped.
     *
     * @param htmlurl absolute URL of the page to read
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL (a hint is also printed to stdout)
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }

        final StringBuilder content = new StringBuilder();
        // try-with-resources closes the stream even when reading fails half-way.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                content.append(line);
            }
        }
        // I/O failures propagate to the caller, which decides how to report them.
        return content.toString();
    }

    /**
     * Extracts the text of every {@code <title>} element in the given HTML.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return the concatenated title texts with all tags removed, or an empty string if there is no title
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        for (final String element : findAll(TITLE_PATTERN, s)) {
            title.append(element);
        }
        return outTag(title.toString());
    }

    /**
     * Finds all anchor elements that carry an {@code href} attribute.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return each matching {@code <a ...>...</a>} element as it appears in the input, in document order
     */
    public List<String> getLink(final String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Finds all {@code <script>} elements.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return each {@code <script>...</script>} element including its tags, in document order
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Finds all {@code <style>} elements.
     *
     * @param s HTML text; {@code null} is treated as empty
     * @return each {@code <style>...</style>} element including its tags, in document order
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes every tag ({@code <...>}) from the given text, keeping the text between tags.
     *
     * @param s text that may contain tags; {@code null} is treated as empty
     * @return the text without tags
     */
    public String outTag(final String s) {
        if (s == null) {
            return "";
        }
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a page and extracts its title and article body.
     *
     * <p>Progress banners are printed to stdout. If the page cannot be read the
     * failure is reported on stderr and the extraction continues on an empty
     * document, so both map values are then empty strings.
     *
     * @param s URL of the page to analyse
     * @return a map with the keys {@code "title"} (page title without the site suffix) and
     *         {@code "original"} (article text with tags removed and {@code <br>} turned into line breaks)
     */
    public HashMap<String, String> getFromYahoo(final String s) {
        final HashMap<String, String> hm = new HashMap<>();

        System.out.println("\n------------------开始读取网页(" + s + ")--------------------");
        String html = "";
        try {
            html = getOneHtml(s);
        } catch (final IOException | RuntimeException e) {
            // Best effort: keep going with an empty page, but say why nothing was extracted.
            System.err.println("读取网页失败(" + s + "): " + e);
        }
        System.out.println("------------------读取网页(" + s + ")结束--------------------\n");
        System.out.println("------------------分析(" + s + ")结果如下--------------------\n");

        // getTitle already strips the tags; only the site suffix is left to remove.
        final String title = getTitle(html).replace("_雅虎知识堂", "");

        final StringBuilder body = new StringBuilder();
        final Matcher ma = ORIGINAL_DIV_PATTERN.matcher(html);
        while (ma.find()) {
            body.append(ma.group());
        }
        String text = body.toString().replace("<br>", "\n"); // convert line breaks
        text = IMAGE_CAPTION_PATTERN.matcher(text).replaceAll(""); // drop picture captions

        hm.put("title", title);
        hm.put("original", outTag(text));
        return hm;
    }

    /**
     * Reads URLs from standard input, one per line, until the line {@code go}
     * (or end of input) and prints the title and article text of each page.
     *
     * @param args unused
     */
    public static void main(final String[] args) {
        final List<String> urls = new ArrayList<>();
        System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行: \n");

        // System.in is deliberately not closed: it belongs to the JVM, not to this method.
        final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try {
            String line;
            // readLine() returns null at end of input; treat that like "go".
            while ((line = br.readLine()) != null) {
                final String url = line.trim();
                if ("go".equals(url)) {
                    break;
                }
                if (!url.isEmpty()) {
                    urls.add(url);
                }
            }
        } catch (final IOException e) {
            System.err.println("读取输入失败: " + e);
        }

        final WebContent wc = new WebContent();
        for (final String url : urls) {
            final HashMap<String, String> hm = wc.getFromYahoo(url);
            System.out.println("标题: " + hm.get("title"));
            System.out.println("内容: \n" + hm.get("original"));
        }
    }

    /** Returns every match of {@code pattern} in {@code text} in order; an empty list for {@code null} text. */
    private static List<String> findAll(final Pattern pattern, final String text) {
        final List<String> matches = new ArrayList<>();
        if (text == null) {
            return matches;
        }
        final Matcher ma = pattern.matcher(text);
        while (ma.find()) {
            matches.add(ma.group());
        }
        return matches;
    }
}
Java读取网页数据并分析
最新推荐文章于 2022-05-31 16:05:12 发布