package com.rquest.webSpider;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.apache.poi.util.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Small command-line crawler for the index-sample list published under
 * {@code http://www.shclearing.com/cpgz/zqjqysp/zqzs/ybqmd/}.
 *
 * <p>The flow is: read the list page(s) and collect the relative paths of the
 * detail pages ({@link #getFirstURLs()}), open every detail page and derive the
 * URL of its {@code .xls} attachment ({@link #getdownLoadURL(List)}), build a
 * dated local file name ({@link #downLoadDir(List)}) and finally download the
 * first attachment ({@link #downloadFile(String, String)}).
 */
public class App
{
    /** Sentinel returned by {@link #RegexString(String, String)} when nothing matches. */
    private static final String NO_MATCH = "Nothing";

    /** Common prefix of the list pages, the detail pages and the attachments. */
    private static final String URL_BASE = "http://www.shclearing.com/cpgz/zqjqysp/zqzs/ybqmd/";

    /** Relative path of a detail page as it appears in the list-page links. */
    private static final Pattern DETAIL_PAGE_PATTERN =
            Pattern.compile("\\d{6}\\/t\\d{8}\\w\\d{6}\\.html");

    /** Attachment file name: one upper-case letter, 21 digits, {@code .xls}. */
    private static final Pattern XLS_NAME_PATTERN = Pattern.compile("[A-Z]\\d{21}\\.xls");

    /** HTTP status code of a successful response. */
    private static final int HTTP_OK = 200;

    /**
     * Crawls the first list page and downloads the first attachment found.
     * Exits quietly (with a message on stderr) when no attachment URL could be
     * resolved, instead of failing on an empty list.
     *
     * @param args unused
     */
    public static void main( String[] args ){
        List<String> list = getFirstURLs();
        List<String> downLoadURLs = getdownLoadURL(list);
        if (downLoadURLs.isEmpty()) {
            System.err.println("No downloadable .xls URL was found; nothing to do.");
            return;
        }
        String dir = downLoadDir(downLoadURLs);
        try {
            System.out.println(downLoadURLs.get(0));
            downloadFile(downLoadURLs.get(0), dir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the local target file path for the first URL of the list.
     *
     * <p>The path has the form
     * {@code D:\WebSpider\<today as yyyy-MM-dd>\指数样本数据库<stamp>.xls}, where
     * {@code <stamp>} is the 8-character stamp taken from the attachment file
     * name (presumably a {@code yyyyMMdd} date -- confirm against the site).
     * The path is also printed to stdout.
     *
     * @param downLoadURLs attachment URLs; only the first element is used
     * @return the absolute path of the file to write
     * @throws IllegalArgumentException if the list is {@code null} or empty, or
     *         if no stamp can be extracted from the first URL
     */
    public static String downLoadDir(List<String> downLoadURLs) {
        if (downLoadURLs == null || downLoadURLs.isEmpty()) {
            throw new IllegalArgumentException("downLoadURLs must contain at least one URL");
        }
        // Lower-case "yyyy" is the calendar year. Upper-case "YYYY" is the
        // week-based year, which differs from the calendar year around New Year.
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        String dirPri = "D:\\WebSpider\\" + dayFormat.format(new Date()) + "\\指数样本数据库";
        String dirSuf = ".xls";
        String dir = dirPri + extractDateStamp(downLoadURLs.get(0)) + dirSuf;
        System.out.println(dir);
        return dir;
    }

    /**
     * Extracts the 8-character stamp that follows the two leading characters of
     * the attachment file name.
     *
     * <p>For URLs built by {@link #getdownLoadURL(List)} this is the same text
     * as the fixed offsets 59..67 of the URL, which remain as a fallback for
     * URLs whose file name does not match the expected pattern.
     */
    private static String extractDateStamp(String url) {
        if (url == null) {
            throw new IllegalArgumentException("download URL must not be null");
        }
        String fileName = firstMatch(XLS_NAME_PATTERN, url);
        if (fileName != null) {
            return fileName.substring(2, 10);
        }
        if (url.length() >= 67) {
            return url.substring(59, 67);
        }
        throw new IllegalArgumentException("Cannot extract a date stamp from URL: " + url);
    }

    /**
     * Resolves the real attachment URL for every detail page.
     *
     * <p>Each detail page is fetched, the first element with class
     * {@code attachments} is searched for an {@code .xls} file name, and the
     * attachment URL is assembled as
     * {@code <base>/<chars 2..7 of the file name>/<file name>}. Pages that cannot
     * be fetched, have no attachment element, or contain no matching file name
     * are skipped. Every resolved URL and the final count are printed to stdout.
     *
     * @param list relative detail-page paths as returned by {@link #getFirstURLs()};
     *        {@code null} entries and the {@code "Nothing"} sentinel are ignored
     * @return the attachment URLs that could be resolved, possibly empty, never {@code null}
     */
    public static List<String> getdownLoadURL(List<String> list) {
        List<String> downloadUrls = new ArrayList<String>();
        if (list != null) {
            for (String urlSuf : list) {
                if (urlSuf == null || NO_MATCH.equals(urlSuf)) {
                    // Not a real path; requesting it could only produce an error page.
                    continue;
                }
                String pageUrl = URL_BASE + urlSuf;
                try {
                    Document doc = Jsoup.connect(pageUrl).get();
                    Elements attachments = doc.getElementsByClass("attachments");
                    if (attachments.isEmpty()) {
                        System.err.println("No attachments element on " + pageUrl);
                        continue;
                    }
                    String fileName = firstMatch(XLS_NAME_PATTERN, attachments.get(0).toString());
                    if (fileName == null) {
                        System.err.println("No .xls attachment found on " + pageUrl);
                        continue;
                    }
                    // The attachment lives in a sub-directory named after
                    // characters 2..7 of its own file name.
                    String sub = fileName.substring(2, 8) + "/";
                    String fileUrl = URL_BASE + sub + fileName;
                    downloadUrls.add(fileUrl);
                    System.out.println(fileUrl);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println(downloadUrls.size());
        return downloadUrls;
    }

    /**
     * Collects the relative detail-page paths from the first list page only.
     *
     * @return the matched paths, possibly empty, never {@code null}
     * @see #getFirstURLs(int)
     */
    public static List<String> getFirstURLs() {
        return getFirstURLs(1);
    }

    /**
     * Collects the relative detail-page paths from the first {@code pageCount}
     * list pages.
     *
     * <p>Page 0 is {@code index.html}; page {@code i > 0} is {@code index_i.html}.
     * On every page the links inside elements with class {@code list} are
     * scanned, and each link that contains a detail-page path contributes that
     * path (also printed to stdout). Links without such a path are skipped, and a
     * page that cannot be fetched is reported and skipped.
     *
     * @param pageCount number of list pages to read; values below 1 read nothing
     * @return the matched paths in page order, possibly empty, never {@code null}
     */
    public static List<String> getFirstURLs(int pageCount) {
        String urlPre = URL_BASE + "index";
        String urlSuf = ".html";
        List<String> detailPaths = new ArrayList<String>();
        for (int page = 0; page < pageCount; page++) {
            String url = (page == 0) ? urlPre + urlSuf : urlPre + "_" + page + urlSuf;
            try {
                Document doc = Jsoup.connect(url).get();
                Elements links = doc.getElementsByClass("list").select("a[href]");
                for (org.jsoup.nodes.Element link : links) {
                    String path = firstMatch(DETAIL_PAGE_PATTERN, link.toString());
                    if (path == null) {
                        continue;
                    }
                    detailPaths.add(path);
                    System.out.println(path);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return detailPaths;
    }

    /**
     * Returns the first substring of {@code targetStr} that matches the regular
     * expression {@code patternStr}.
     *
     * @param targetStr text to search
     * @param patternStr regular expression to look for
     * @return the first match, or the literal string {@code "Nothing"} when the
     *         text contains no match
     */
    static String RegexString(String targetStr, String patternStr) {
        String match = firstMatch(Pattern.compile(patternStr), targetStr);
        return match != null ? match : NO_MATCH;
    }

    /** Returns the first match of {@code pattern} in {@code text}, or {@code null} if there is none. */
    private static String firstMatch(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Downloads {@code url} into the file {@code dir}, creating missing parent
     * directories, and prints a confirmation line on success.
     *
     * <p>Failures are reported to the caller instead of being printed and
     * swallowed, so a caller can tell whether the file was actually written.
     * The HTTP client's connections are always released.
     *
     * @param url absolute URL of the file to download
     * @param dir full path of the local file to write (despite the name, this is
     *        a file path, not a directory)
     * @throws Exception if the URL is malformed, the request fails, the server
     *         does not answer with status 200, or the file cannot be written
     */
    public static void downloadFile(String url,String dir)throws Exception{
        DefaultHttpClient httpClient = new DefaultHttpClient();
        InputStream input = null;
        FileOutputStream output = null;
        try {
            HttpProtocolParams.setUserAgent(httpClient.getParams(),
                    "Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.1.9)Gecko/20100315Firefox/3.5.9" );
            HttpGet httpGet = new HttpGet();
            httpGet.setURI(new java.net.URI(url));
            HttpResponse response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            if (status != HTTP_OK) {
                // Do not store an error page under an .xls name.
                throw new IOException("Unexpected HTTP status " + status + " for " + url);
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("Empty response body for " + url);
            }
            input = entity.getContent();
            output = FileUtils.openOutputStream(new File(dir));
            IOUtils.copy(input, output);
            System.out.println("成功下载至:"+ dir);
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(input);
            // Release pooled connections; the client is not reused.
            httpClient.getConnectionManager().shutdown();
        }
    }
}
项目依赖如下(以下内容为独立的 pom.xml 文件,不属于上面的 Java 源文件):
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the webSpider crawler (class com.rquest.webSpider.App). -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates; the artifact is packaged as a plain jar. -->
<groupId>com.rquest</groupId>
<artifactId>webSpider</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>webSpider</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Sources are compiled as UTF-8; the Java code contains non-ASCII string literals. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Test-scoped only: not on the runtime classpath of the crawler. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<!-- Provides org.apache.poi.util.IOUtils, which App uses to copy and close streams. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.9</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- Provides Jsoup, Document and Elements, which App uses to fetch and query the HTML pages. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- Provides DefaultHttpClient, HttpGet and HttpProtocolParams, which App uses for the file download. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-io -->
<!-- Provides org.apache.commons.io.FileUtils, which App uses to open the output file. -->
<!-- NOTE(review): Commons IO is normally published as commons-io:commons-io; confirm that
     these org.apache.commons coordinates resolve in the repositories you build against. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>1.3.2</version>
</dependency>
</dependencies>
</project>