// KDD 2014 paper downloader (original title: KDD2014文章下载)

package cn.zhuqi.kdd.sample;

import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Collects the {@code ft_gateway.cfm} full-text links from a proceedings page
 * on dl.acm.org and downloads each linked PDF into a local directory.
 * <p>
 * The page is either fetched live ({@link #praseHtml()}) or read from a saved
 * copy named {@code kdd2014.html} ({@link #praseStaticHtml()}). Downloads run
 * asynchronously on a small fixed-size worker pool; the entry points return as
 * soon as all downloads have been queued.
 *
 * @author zhuqi259
 */
public class KDDParser {

	/** Site root that the full-text links are resolved against. */
	private static final String BASE_URL = "http://dl.acm.org/";

	/** Proceedings page that lists the full-text links. */
	private static final String TOC_URL = "http://dl.acm.org/citation.cfm?id=2623330&preflayout=flat";

	/** jsoup selector for anchors whose href contains the full-text gateway. */
	private static final String LINK_SELECTOR = "a[href~=(ft_gateway.cfm)]";

	/** Directory the PDFs are written to. */
	private static final String SAVE_DIR = "C:\\KDD\\2014";

	/**
	 * A file name made of word characters/hyphens followed by a literal
	 * {@code .pdf}, anchored at the end of the input.
	 */
	private static final Pattern PDF_NAME = Pattern.compile("[\\w-]+\\.pdf$");

	/** Upper bound on simultaneous downloads, to avoid flooding the server. */
	private static final int MAX_PARALLEL_DOWNLOADS = 4;

	/** Connect timeout for a single download, in milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

	/** Per-read timeout for a single download, in milliseconds. */
	private static final int READ_TIMEOUT_MS = 60 * 1000;

	/**
	 * Fetches the live proceedings page and queues a download for every
	 * full-text link found on it. Failures are reported on standard output
	 * and never propagate to the caller.
	 */
	public static void parseHtml() {
		try {
			Document doc = Jsoup
					.connect(TOC_URL)
					.header("User-Agent",
							"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2 Googlebot/2.1")
					.timeout(200 * 1000).get();
			// Debug aid: dump the fetched page so a failed scrape can be diagnosed.
			System.out.println(doc.html());
			downloadAll(doc.select(LINK_SELECTOR));
		} catch (Exception ex) {
			// Top-level boundary: report and give up, as the original did.
			System.out.println(ex);
			System.out.println("ERROR");
		}
	}

	/**
	 * Misspelled original name of {@link #parseHtml()}, kept so existing
	 * callers continue to work.
	 */
	public static void praseHtml() {
		parseHtml();
	}

	/**
	 * Reads a saved copy of the proceedings page from {@code kdd2014.html} in
	 * the working directory, queues a download for every full-text link, and
	 * prints the number of links found. Failures are reported on standard
	 * output and never propagate to the caller.
	 */
	public static void parseStaticHtml() {
		try {
			File input = new File("kdd2014.html");
			Document doc = Jsoup.parse(input, "UTF-8");
			Elements links = doc.select(LINK_SELECTOR);
			downloadAll(links);
			System.out.println(links.size());
		} catch (Exception ex) {
			// Top-level boundary: report and give up, as the original did.
			System.out.println(ex);
			System.out.println("ERROR");
		}
	}

	/**
	 * Misspelled original name of {@link #parseStaticHtml()}, kept so
	 * existing callers continue to work.
	 */
	public static void praseStaticHtml() {
		parseStaticHtml();
	}

	/**
	 * Queues one download per link on a bounded worker pool and returns
	 * without waiting for the downloads to finish.
	 *
	 * @param links anchors whose {@code href} points at a full-text gateway
	 */
	private static void downloadAll(Elements links) {
		ExecutorService pool = Executors
				.newFixedThreadPool(MAX_PARALLEL_DOWNLOADS);
		try {
			int index = 0;
			for (Element link : links) {
				String href = link.attr("href");
				System.out.println(href);
				index++;
				final String url = resolve(href);
				if (url == null) {
					continue;
				}
				// Unique, file-system-safe name used only when the real file
				// name cannot be read from the redirected URL.
				final String fallbackName = "paper-" + index + ".pdf";
				pool.execute(new Runnable() {
					@Override
					public void run() {
						downloadSomething(url, fallbackName, SAVE_DIR);
					}
				});
			}
		} finally {
			// Already queued downloads still run; the idle workers then exit
			// so the JVM can terminate.
			pool.shutdown();
		}
	}

	/**
	 * Resolves a (possibly relative) link against {@link #BASE_URL}.
	 *
	 * @param href the raw {@code href} attribute value
	 * @return the absolute URL, or {@code null} if the link is malformed
	 */
	private static String resolve(String href) {
		try {
			return new URL(new URL(BASE_URL), href).toString();
		} catch (IOException ex) {
			System.out.println("Skipping malformed link " + href + ": " + ex);
			return null;
		}
	}

	/**
	 * Extracts the PDF file name from the end of a URL's path.
	 *
	 * @param url the URL to inspect; the query string and fragment are ignored
	 * @return the trailing {@code name.pdf} part, or {@code null} if the path
	 *         does not end in one
	 */
	private static String extractPdfName(String url) {
		if (url == null) {
			return null;
		}
		int end = url.length();
		int query = url.indexOf('?');
		if (query >= 0) {
			end = query;
		}
		int fragment = url.indexOf('#');
		if (fragment >= 0 && fragment < end) {
			end = fragment;
		}
		Matcher m = PDF_NAME.matcher(url.substring(0, end));
		return m.find() ? m.group() : null;
	}

	/**
	 * Downloads one URL into {@code savePath}. Redirects followed by
	 * {@link HttpURLConnection} determine the stored file name; when the final
	 * URL does not end in a PDF name, {@code filename} is used instead. Errors
	 * are reported on standard output and a partially written file is removed.
	 *
	 * @param urlString the URL to download
	 * @param filename  fallback file name used when none can be extracted
	 * @param savePath  target directory; created if it does not exist
	 */
	private static void downloadSomething(String urlString, String filename,
			String savePath) {
		InputStream is = null;
		OutputStream os = null;
		HttpURLConnection conn = null;
		File target = null;
		boolean complete = false;
		try {
			URL url = new URL(urlString);
			conn = (HttpURLConnection) url.openConnection();
			conn.setRequestProperty("User-Agent",
					"Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
			// Timeouts must be configured before the connection is opened,
			// otherwise they have no effect.
			conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
			conn.setReadTimeout(READ_TIMEOUT_MS);

			// Forces the request (and any redirects) to happen now, so that
			// conn.getURL() below reports the final, redirected location.
			int status = conn.getResponseCode();
			if (status != HttpURLConnection.HTTP_OK) {
				System.out.println("Skipping " + urlString + ": HTTP " + status);
				return;
			}
			String realUrl = conn.getURL().toString();
			System.out.println(realUrl);

			String extracted = extractPdfName(realUrl);
			if (extracted != null) {
				filename = extracted;
			}
			System.out.println(filename);

			File dir = new File(savePath);
			// mkdirs() can lose a race against another worker creating the
			// same directory, so re-check before treating it as a failure.
			if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
				throw new IOException("Cannot create directory " + dir);
			}

			is = conn.getInputStream();
			File out = new File(dir, filename);
			os = new FileOutputStream(out);
			target = out;

			byte[] buffer = new byte[8192];
			int len;
			while ((len = is.read(buffer)) != -1) {
				os.write(buffer, 0, len);
			}
			complete = true;
		} catch (IOException e) {
			System.out.println("Download failed for " + urlString + ": " + e);
		} finally {
			// Close the streams first, then release the connection.
			closeQuietly(os);
			closeQuietly(is);
			if (conn != null) {
				conn.disconnect();
			}
			// Do not leave a truncated PDF behind.
			if (target != null && !complete && !target.delete()) {
				System.out.println("Could not remove partial file " + target);
			}
		}
	}

	/**
	 * Closes a resource, reporting (but not propagating) any failure.
	 *
	 * @param resource the resource to close; may be {@code null}
	 */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ex) {
			System.out.println(ex);
		}
	}

	/**
	 * Manual check of the file-name extraction: prints a sample redirected
	 * URL followed by the PDF name extracted from it.
	 */
	public static void test() {
		String realUrl = "http://delivery.acm.org/10.1145/2640000/2630816/p1-etzioni.pdf?ip=59.72.109.114&id=2630816&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2EC01CA9BA055CFEA8%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&CFID=414284820&CFTOKEN=14761593&__acm__=1409406799_d23f35ea5d160c7e1816379be27389d1";
		System.out.println(realUrl);
		String name = extractPdfName(realUrl);
		if (name != null) {
			System.out.println(name);
		}
	}

	/**
	 * Runs the live download.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		parseHtml();
		// To work from a saved copy of the page instead, call parseStaticHtml().
	}
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值