jsoup抓取URL下载Excel文件

最新推荐文章于 2024-03-27 07:15:00 发布

daily_strong

最新推荐文章于 2024-03-27 07:15:00 发布

阅读量2.5k

点赞数

分类专栏： maven 文章标签： jsoup

本文链接：https://blog.csdn.net/litter_Strong/article/details/61922966

版权

maven 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

package com.rquest.webSpider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.apache.poi.util.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Small crawler that downloads index-sample Excel workbooks from www.shclearing.com.
 *
 * <p>Flow: read the listing page(s) to collect detail-page paths, open every detail page to
 * find the attached {@code .xls} file name, build the real download URL from it, and finally
 * save the first workbook into a directory named after the current date.
 */
public class App
{
    /** Base URL of the listing; detail pages and attachments are resolved against it. */
    private static final String BASE_URL = "http://www.shclearing.com/cpgz/zqjqysp/zqzs/ybqmd/";

    /** Sentinel returned by {@link #RegexString(String, String)} when nothing matches. */
    private static final String NO_MATCH = "Nothing";

    /** Relative path of a detail page as it appears in the listing links. */
    private static final String DETAIL_PAGE_REGEX = "\\d{6}\\/t\\d{8}\\w\\d{6}\\.html";

    /** File name of the attached workbook: one upper-case letter, 21 digits, {@code .xls}. */
    private static final String ATTACHMENT_REGEX = "[A-Z]\\d{21}\\.xls";

    /** HTTP status that signals a successful download. */
    private static final int HTTP_OK = 200;

    /**
     * Entry point: resolves the download URLs and saves the first workbook.
     *
     * @param args unused
     */
    public static void main( String[] args ){
        List<String> list = getFirstURLs();
        List<String> downLoadURLs = getdownLoadURL(list);
        // Nothing resolved (site layout changed, network down, ...): stop instead of
        // failing on get(0).
        if (downLoadURLs.isEmpty()) {
            System.err.println("No downloadable attachment was found; nothing to do.");
            return;
        }
        try {
            String dir = downLoadDir(downLoadURLs);
            System.out.println(downLoadURLs.get(0));
            downloadFile(downLoadURLs.get(0), dir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the local target path for the first URL of the list.
     *
     * <p>The path is {@code D:\WebSpider\<today>\指数样本数据库<yyyyMMdd>.xls}, where
     * {@code <yyyyMMdd>} is taken from characters 2..9 of the attachment file name.
     *
     * @param downLoadURLs download URLs as produced by {@link #getdownLoadURL(List)}
     * @return the absolute path of the file to write
     * @throws IllegalArgumentException if the list is null/empty or the first URL does not end
     *         in a file name long enough to carry the date
     */
    public static String downLoadDir(List<String> downLoadURLs) {
        if (downLoadURLs == null || downLoadURLs.isEmpty()) {
            throw new IllegalArgumentException("downLoadURLs must contain at least one URL");
        }
        String firstUrl = downLoadURLs.get(0);
        // Locate the date relative to the file name rather than at a fixed offset in the
        // full URL, so the result does not depend on the length of the base URL.
        String fileName = firstUrl.substring(firstUrl.lastIndexOf('/') + 1);
        if (fileName.length() < 10) {
            throw new IllegalArgumentException("Unexpected attachment URL: " + firstUrl);
        }

        // "yyyy" is the calendar year; "YYYY" would be the week-based year and yields the
        // wrong year for dates around New Year.
        SimpleDateFormat sim = new SimpleDateFormat("yyyy-MM-dd");
        String dirPri = "D:\\WebSpider\\" + sim.format(new Date()) + "\\指数样本数据库";
        String dirSuf = ".xls";
        String dir = dirPri + fileName.substring(2, 10) + dirSuf;
        System.out.println(dir);
        return dir;
    }

    /**
     * Resolves the real file download URL for every detail page.
     *
     * <p>Detail pages that cannot be fetched, have no {@code attachments} element, or whose
     * attachment name does not match the expected pattern are reported and skipped.
     *
     * @param list detail-page paths relative to the base URL, as returned by
     *        {@link #getFirstURLs()}
     * @return the download URLs that could be resolved, in input order
     */
    public static List<String> getdownLoadURL(List<String> list) {
        List<String> downloadUrls = new ArrayList<String>();

        for (String urlSuf : list) {
            // A sentinel entry can never be a valid page; do not waste a request on it.
            if (NO_MATCH.equals(urlSuf)) {
                continue;
            }
            String detailUrl = BASE_URL + urlSuf;
            try {
                Document doc = Jsoup.connect(detailUrl).get();
                Elements attachments = doc.getElementsByClass("attachments");
                if (attachments.isEmpty()) {
                    System.err.println("No attachments element on " + detailUrl);
                    continue;
                }
                String fileName = RegexString(attachments.get(0).toString(), ATTACHMENT_REGEX);
                if (NO_MATCH.equals(fileName)) {
                    System.err.println("No .xls attachment found on " + detailUrl);
                    continue;
                }
                // Characters 2..7 of the file name are used as the sub-directory (yyyyMM).
                String monthDir = fileName.substring(2, 8) + "/";
                String downloadUrl = BASE_URL + monthDir + fileName;
                downloadUrls.add(downloadUrl);
                System.out.println(downloadUrl);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println(downloadUrls.size());
        return downloadUrls;
    }

    /**
     * Collects the detail-page paths from the first listing page.
     *
     * @return detail-page paths relative to the base URL
     */
    public static List<String> getFirstURLs() {
        return getFirstURLs(1);
    }

    /**
     * Collects the detail-page paths from the first {@code pageCount} listing pages.
     *
     * <p>Page 0 is {@code index.html}; page {@code i > 0} is requested as
     * {@code index_<i>.html}. Links that do not look like a detail page are skipped, and a
     * page that cannot be fetched is reported without aborting the remaining pages.
     *
     * @param pageCount number of listing pages to read; values below 1 yield an empty list
     * @return detail-page paths relative to the base URL
     */
    public static List<String> getFirstURLs(int pageCount) {
        String urlPre = BASE_URL + "index";
        String urlSuf = ".html";
        List<String> list = new ArrayList<String>();

        for (int i = 0; i < pageCount; i++) {
            String url = (i == 0) ? urlPre + urlSuf : urlPre + "_" + i + urlSuf;
            try {
                Document doc = Jsoup.connect(url).get();
                Elements links = doc.getElementsByClass("list").select("a[href]");
                for (org.jsoup.nodes.Element link : links) {
                    String firstURL = RegexString(link.toString(), DETAIL_PAGE_REGEX);
                    // Navigation and other unrelated links do not match; keep them out of
                    // the result instead of storing the sentinel.
                    if (NO_MATCH.equals(firstURL)) {
                        continue;
                    }
                    list.add(firstURL);
                    System.out.println(firstURL);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Returns the first substring of {@code targetStr} that matches {@code patternStr}.
     *
     * @param targetStr text to search
     * @param patternStr regular expression to look for
     * @return the first match, or the literal {@code "Nothing"} when there is none
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        if (matcher.find()) {
            return matcher.group();
        }
        return NO_MATCH;
    }

    /**
     * Downloads {@code url} and writes the response body to the file {@code dir}.
     *
     * <p>Missing parent directories of {@code dir} are created by
     * {@code FileUtils.openOutputStream}. The streams and the HTTP client are always released.
     *
     * @param url absolute URL of the file to download
     * @param dir path of the local file to write
     * @throws Exception if the URL is malformed, the request fails, the server does not answer
     *         with status 200, the response has no body, or the file cannot be written
     */
    public static void downloadFile(String url,String dir)throws Exception{
        DefaultHttpClient httpClient = new DefaultHttpClient();
        InputStream input = null;
        FileOutputStream output = null;
        try {
            HttpProtocolParams.setUserAgent(httpClient.getParams(),
                    "Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.1.9)Gecko/20100315Firefox/3.5.9");
            HttpGet httpGet = new HttpGet();
            httpGet.setURI(new java.net.URI(url));

            HttpResponse response = httpClient.execute(httpGet);
            // Without this check an error page would be saved under the .xls name.
            int status = response.getStatusLine().getStatusCode();
            if (status != HTTP_OK) {
                throw new IOException("Download failed with HTTP status " + status + " for " + url);
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("Empty response body for " + url);
            }
            input = entity.getContent();
            output = FileUtils.openOutputStream(new File(dir));
            IOUtils.copy(input, output);
            System.out.println("成功下载至："+ dir);
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(input);
            // Releases the pooled connections held by the client.
            httpClient.getConnectionManager().shutdown();
        }
    }
}

项目依赖如下：

<!-- Maven build descriptor for the webSpider crawler: declares the coordinates, the source
     encoding and the libraries imported by com.rquest.webSpider.App. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Artifact coordinates; the build produces a plain jar. -->
  <groupId>com.rquest</groupId>
  <artifactId>webSpider</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>webSpider</name>
  <url>http://maven.apache.org</url>

  <properties>
    <!-- Sources are UTF-8; App.java contains non-ASCII string literals and comments. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Unit-test framework; test scope only. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    
    <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
    <!-- App imports org.apache.poi.util.IOUtils (copy / closeQuietly) from this artifact. -->
	<dependency>
	    <groupId>org.apache.poi</groupId>
	    <artifactId>poi</artifactId>
	    <version>3.9</version>
	</dependency>
    
    
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <!-- HTML fetching and parsing: App uses Jsoup.connect, Document and Elements. -->
	<dependency>
	    <groupId>org.jsoup</groupId>
	    <artifactId>jsoup</artifactId>
	    <version>1.7.2</version>
	</dependency>
    
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <!-- File download: App uses DefaultHttpClient, HttpGet and HttpProtocolParams. -->
	<dependency>
	    <groupId>org.apache.httpcomponents</groupId>
	    <artifactId>httpclient</artifactId>
	    <version>4.3.4</version>
	</dependency>
	
	<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-io -->
	<!-- App uses org.apache.commons.io.FileUtils.openOutputStream to create the target file. -->
	<!-- NOTE(review): commons-io is normally published as commons-io:commons-io; confirm
	     that the org.apache.commons groupId is the intended artifact. -->
	<dependency>
	    <groupId>org.apache.commons</groupId>
	    <artifactId>commons-io</artifactId>
	    <version>1.3.2</version>
	</dependency>
	
	
  </dependencies>
</project>