package cn.zhuqi.kdd.sample;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Collects every {@code ft_gateway.cfm} link from an ACM Digital Library
 * proceedings page (fetched live or read from a saved HTML file) and downloads
 * the PDF each link leads to into a local directory.
 *
 * <p>Downloads run on a small fixed-size thread pool; the entry points return
 * as soon as all downloads have been queued. Progress and errors are reported
 * on standard output.
 *
 * <p>The public method names (including their spelling) are kept unchanged
 * because they are the class's external interface.
 *
 * @author zhuqi259
 */
public class KDDParser {

    /** Site root that the {@code href} of each download link is resolved against. */
    private static final String BASE_URL = "http://dl.acm.org/";

    /** Directory the downloaded PDFs are written to. */
    private static final String SAVE_DIR = "C:\\KDD\\2014";

    /** Selects anchors whose {@code href} contains the full-text gateway script. */
    private static final String LINK_SELECTOR = "a[href~=(ft_gateway.cfm)]";

    /**
     * Matches a PDF file name such as {@code p1-etzioni.pdf}. The dot is
     * escaped so that text like {@code type=pdf} is not mistaken for a file
     * name, and at least one name character is required.
     */
    private static final Pattern PDF_NAME = Pattern.compile("[\\w-]+\\.pdf");

    /** Upper bound on simultaneous downloads, instead of one thread per link. */
    private static final int MAX_PARALLEL_DOWNLOADS = 4;

    /** Connect timeout in milliseconds (5 s). */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Per-read timeout in milliseconds, so a stalled transfer cannot block a worker forever. */
    private static final int READ_TIMEOUT_MS = 60 * 1000;

    /**
     * Fetches the proceedings page over HTTP, prints its HTML, and queues a
     * download for every full-text link found on it. Any failure while
     * fetching or parsing is printed followed by {@code ERROR}.
     */
    public static void praseHtml() {
        try {
            String url = "http://dl.acm.org/citation.cfm?id=2623330&preflayout=flat";
            Document doc = Jsoup
                    .connect(url)
                    .header("User-Agent",
                            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2 Googlebot/2.1")
                    .timeout(200 * 1000).get();
            System.out.println(doc.html());
            downloadAll(doc.select(LINK_SELECTOR));
        } catch (Exception ex) {
            System.out.println(ex);
            System.out.println("ERROR");
        }
    }

    /**
     * Reads a previously saved copy of the proceedings page from
     * {@code kdd2014.html} in the working directory, queues a download for
     * every full-text link in it, and prints the number of links found. Any
     * failure while reading or parsing is printed followed by {@code ERROR}.
     */
    public static void praseStaticHtml() {
        try {
            File input = new File("kdd2014.html");
            Document doc = Jsoup.parse(input, "UTF-8");
            int count = downloadAll(doc.select(LINK_SELECTOR));
            System.out.println(count);
        } catch (Exception ex) {
            System.out.println(ex);
            System.out.println("ERROR");
        }
    }

    /**
     * Prints a sample delivery URL and the PDF file name extracted from it;
     * a quick manual check of the file-name pattern.
     */
    public static void test() {
        String realUrl = "http://delivery.acm.org/10.1145/2640000/2630816/p1-etzioni.pdf?ip=59.72.109.114&id=2630816&acc=ACTIVE%20SERVICE&key=BF85BBA5741FDC6E%2EC01CA9BA055CFEA8%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&CFID=414284820&CFTOKEN=14761593&__acm__=1409406799_d23f35ea5d160c7e1816379be27389d1";
        System.out.println(realUrl);
        String name = pdfNameFrom(realUrl, null);
        if (name != null) {
            System.out.println(name);
        }
    }

    public static void main(String[] args) {
        praseHtml();
        // praseStaticHtml();
    }

    /**
     * Prints each link's {@code href} and queues its download on a bounded
     * thread pool. The pool is shut down after queuing, so already-queued
     * downloads still run to completion while this method returns at once.
     *
     * @param links anchors pointing at the full-text gateway
     * @return the number of links in {@code links}
     */
    private static int downloadAll(Elements links) {
        ExecutorService pool = Executors.newFixedThreadPool(MAX_PARALLEL_DOWNLOADS);
        try {
            for (Element link : links) {
                String href = link.attr("href");
                System.out.println(href);
                final String target = resolveLink(href);
                if (target == null) {
                    continue;
                }
                pool.execute(new Runnable() {
                    @Override
                    public void run() {
                        // No fallback name: skip the file when the final URL names no PDF.
                        downloadSomething(target, null, SAVE_DIR);
                    }
                });
            }
        } finally {
            pool.shutdown();
        }
        return links.size();
    }

    /**
     * Resolves {@code href} against {@link #BASE_URL}, so relative,
     * root-relative and absolute links are all handled.
     *
     * @param href raw {@code href} attribute value
     * @return the absolute URL, or {@code null} (after printing the error)
     *         when the link is malformed
     */
    private static String resolveLink(String href) {
        try {
            return new URL(new URL(BASE_URL), href).toString();
        } catch (IOException e) {
            System.out.println(e);
            return null;
        }
    }

    /**
     * Returns the first PDF file name found in {@code url}.
     *
     * @param url      URL text to search
     * @param fallback value returned when no PDF file name is present
     * @return the matched file name, or {@code fallback}
     */
    private static String pdfNameFrom(String url, String fallback) {
        Matcher m = PDF_NAME.matcher(url);
        return m.find() ? m.group() : fallback;
    }

    /**
     * Downloads {@code urlString} into {@code savePath}, naming the file
     * after the PDF name found in the final (post-redirect) URL. I/O errors
     * are printed, not thrown.
     *
     * @param urlString URL to download
     * @param filename  name to use when the final URL contains no PDF file
     *                  name; {@code null} means skip the download in that case
     * @param savePath  target directory, created if missing
     */
    private static void downloadSomething(String urlString, String filename,
            String savePath) {
        InputStream is = null;
        OutputStream os = null;
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlString);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            // Timeouts must be configured before the connection is opened.
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            // Important: requesting the response code performs the request and
            // follows redirects, so getURL() below reports the final address, e.g.
            // http://delivery.acm.org/10.1145/2640000/2630816/p1-etzioni.pdf?ip=...
            int status = conn.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                System.out.println("HTTP " + status + " for " + urlString);
                return;
            }

            String realUrl = conn.getURL().toString();
            System.out.println(realUrl);
            String name = pdfNameFrom(realUrl, filename);
            if (name == null) {
                System.out.println("No PDF file name in " + realUrl + ", skipped");
                return;
            }
            System.out.println(name);

            File dir = new File(savePath);
            // Re-check after mkdirs(): another worker may have created it meanwhile.
            if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
                throw new IOException("Cannot create directory " + dir);
            }

            is = conn.getInputStream();
            // File(parent, child) picks the platform's separator.
            os = new FileOutputStream(new File(dir, name));
            byte[] buffer = new byte[8192];
            int len;
            while ((len = is.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
        } catch (IOException e) {
            System.out.println(e);
        } finally {
            // Release the streams first, then the connection.
            closeQuietly(os);
            closeQuietly(is);
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Closes {@code resource} if non-null, printing (not propagating) any failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ex) {
            System.out.println(ex);
        }
    }
}
// KDD2014 paper download
// Latest recommended article published on 2023-08-10 19:58:21