一个简单的网页抓取例子

17 篇文章 0 订阅
package net;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Minimal regex-based web page scraper.
 *
 * <p>Downloads a page as text and extracts its title, hyperlinks, inline
 * scripts and inline style sheets with regular expressions. This is a
 * best-effort extraction, not an HTML parser: malformed or unusual markup
 * may be missed or matched loosely.
 */
public class WebContent {
	/** Flags shared by every tag pattern: HTML tag names are case-insensitive and elements may span lines. */
	private static final int TAG_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

	/** A complete {@code <title>} element, with or without attributes on the opening tag. */
	private static final Pattern TITLE_PATTERN = Pattern.compile(
			"<title\\b[^>]*>.*?</title>", TAG_FLAGS);

	/**
	 * A complete {@code <a ... href=...>...</a>} element. Whitespace is required
	 * after {@code <a} so that other tags starting with "a" (for example
	 * {@code <area href=...>}) are not mistaken for anchors.
	 */
	private static final Pattern LINK_PATTERN = Pattern.compile(
			"<a\\s[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", TAG_FLAGS);

	/** A complete {@code <script>} element including its body. */
	private static final Pattern SCRIPT_PATTERN = Pattern.compile(
			"<script\\b.*?</script>", TAG_FLAGS);

	/** A complete {@code <style>} element including its body. */
	private static final Pattern STYLE_PATTERN = Pattern.compile(
			"<style\\b.*?</style>", TAG_FLAGS);

	/** Any single tag; DOTALL so that tags broken across lines are matched as well. */
	private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>", Pattern.DOTALL);

	/**
	 * Downloads the resource at the given URL and returns it as text.
	 *
	 * <p>The content is decoded as UTF-8 and returned verbatim, line
	 * terminators included, so that tokens on adjacent lines are never glued
	 * together. The URL's protocol is printed to standard output.
	 *
	 * @param htmlurl the URL to read, in a form accepted by {@link URL#URL(String)}
	 * @return the full content of the resource
	 * @throws MalformedURLException if {@code htmlurl} is not a valid URL; a
	 *             hint is printed to standard output before rethrowing
	 * @throws IOException if the resource cannot be opened or read
	 */
	public String getOneHtml(final String htmlurl) throws IOException {
		final URL url;
		try {
			url = new URL(htmlurl);
		} catch (final MalformedURLException me) {
			System.out.println("你输入的URL格式有问题!请仔细输入");
			throw me;
		}
		System.out.println(url.getProtocol());

		final StringBuilder htmlContent = new StringBuilder();
		final BufferedReader in = new BufferedReader(new InputStreamReader(
				url.openStream(), "utf-8"));
		try {
			// Copy raw characters instead of using readLine(), which would
			// silently drop every line terminator.
			final char[] buffer = new char[8192];
			int count;
			while ((count = in.read(buffer)) != -1) {
				htmlContent.append(buffer, 0, count);
			}
		} finally {
			// Always release the connection, even when reading fails midway.
			in.close();
		}
		return htmlContent.toString();
	}


	/**
	 * Extracts the page title.
	 *
	 * <p>All {@code <title>} elements found are concatenated in document
	 * order and their tags are stripped.
	 *
	 * @param s the HTML text to search
	 * @return the title text, or an empty string if there is no title element
	 */
	public String getTitle(final String s) {
		final StringBuilder joined = new StringBuilder();
		for (final String element : findAll(TITLE_PATTERN, s)) {
			joined.append(element);
		}
		return outTag(joined.toString());
	}


	/**
	 * Extracts every anchor element that carries an {@code href} attribute.
	 *
	 * @param s the HTML text to search
	 * @return the matched {@code <a ...>...</a>} elements (markup included),
	 *         in document order; empty if none are found
	 */
	public List<String> getLink(final String s) {
		return findAll(LINK_PATTERN, s);
	}


	/**
	 * Extracts every {@code <script>} element.
	 *
	 * @param s the HTML text to search
	 * @return the matched script elements (markup included), in document
	 *         order; empty if none are found
	 */
	public List<String> getScript(final String s) {
		return findAll(SCRIPT_PATTERN, s);
	}


	/**
	 * Extracts every {@code <style>} element.
	 *
	 * @param s the HTML text to search
	 * @return the matched style elements (markup included), in document
	 *         order; empty if none are found
	 */
	public List<String> getCSS(final String s) {
		return findAll(STYLE_PATTERN, s);
	}


	/**
	 * Removes every tag from the given text.
	 *
	 * @param s the text to clean
	 * @return {@code s} with each {@code <...>} span removed
	 */
	public String outTag(final String s) {
		return TAG_PATTERN.matcher(s).replaceAll("");
	}


	/**
	 * Downloads a page and extracts its title, styles, scripts and links.
	 *
	 * <p>Progress banners are printed to standard output. If the page cannot
	 * be read, the failure is reported on standard error.
	 *
	 * @param url the URL of the page to analyse
	 * @return a map with the keys {@code "title"}, {@code "css"},
	 *         {@code "script"} and {@code "link"}, or {@code null} if the
	 *         page could not be read
	 */
	public HashMap<String, List<String>> getFromUrls(final String url) {
		System.out.println("\n------------------开始读取网页(" + url
				+ ")--------------------");
		final String content;
		try {
			content = getOneHtml(url);
		} catch (final IOException e) {
			// Report the failure here, where it is consumed, instead of dropping it.
			System.err.println("读取网页失败(" + url + "): " + e);
			return null;
		}
		System.out.println("------------------读取网页(" + url
				+ ")结束--------------------\n");
		System.out.println("------------------分析网页(" + url
				+ ")结果如下--------------------\n");

		final HashMap<String, List<String>> result = new HashMap<String, List<String>>();
		final List<String> title = new ArrayList<String>();
		title.add(getTitle(content));
		result.put("title", title);
		result.put("css", getCSS(content));
		result.put("script", getScript(content));
		result.put("link", getLink(content));
		return result;
	}


	/**
	 * Reads URLs from standard input, one per line, until a line containing
	 * {@code go} (or end of input) is seen, then analyses each URL and prints
	 * the extracted parts.
	 *
	 * @param args unused
	 */
	public static void main(final String args[]) {
		final List<String> urls = new ArrayList<String>();
		System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行:   \n");
		final BufferedReader br = new BufferedReader(new InputStreamReader(
				System.in));
		try {
			String line;
			// readLine() returns null at end of input; treat that like "go".
			while ((line = br.readLine()) != null) {
				line = line.trim();
				if ("go".equals(line)) {
					break;
				}
				if (line.length() > 0) {
					urls.add(line);
				}
			}
		} catch (final IOException e) {
			System.err.println("读取输入失败: " + e);
		}

		final WebContent wc = new WebContent();
		for (final String url : urls) {
			// Fetch each page exactly once, and skip pages that failed so
			// that an earlier page's results are never printed again.
			final HashMap<String, List<String>> parts = wc.getFromUrls(url);
			if (parts == null) {
				continue;
			}
			for (final String key : parts.keySet()) {
				System.out.println("--" + key + "内容如下:");
				for (final String value : parts.get(key)) {
					System.out.println(value);
				}
			}
		}
	}

	/** Collects every match of {@code pattern} in {@code s}, in order of appearance. */
	private static List<String> findAll(final Pattern pattern, final String s) {
		final List<String> matches = new ArrayList<String>();
		final Matcher matcher = pattern.matcher(s);
		while (matcher.find()) {
			matches.add(matcher.group());
		}
		return matches;
	}
}


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值