Java抓取数据demo(Excel导入+HTML抓取+HTML解析+Excel数据导出)

import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


public class HttpClientTest {

    public static void main(String[] args) {
        //解析Excel
        try {
            Workbook workbook = Workbook.getWorkbook(new File("E://beijing.xls"));
            Sheet sheet = workbook.getSheet(0);
             //单条:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/11/01/01/110101001.html
            System.out.println("excel总行数:"+sheet.getRows());
            String [][] values = new String[8000][3];
            int size = 0;
            for (int i = 0; i<sheet.getRows(); i++){
                System.out.println(sheet.getCell(0,i).getContents());
                int size1 = getHtml(sheet.getCell(0, i).getContents(), values, size);
                size += size1;
            }
            //System.out.println("111111111111");
            getHSSFWorkbook("xingzhengquhua.xls",null,values,null);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BiffException e) {
            e.printStackTrace();
        }
        //------------------------------------------
        //1.生成httpclient,相当于该打开一个浏览器

    }

    public static int getHtml(String html1, String [][] values,int size){
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        //2.创建get请求,相当于在浏览器地址栏输入 网址
        HttpGet request = new HttpGet(html1);
        try {
            //3.执行get请求,相当于在输入地址栏后敲回车键
            response = httpClient.execute(request);

            //4.判断响应状态为200,进行处理
            if(response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                //5.获取响应内容
                HttpEntity httpEntity = response.getEntity();
                String html = EntityUtils.toString(httpEntity, "gb2312");
                System.out.println("获取内容:"+html);
                org.jsoup.nodes.Document parse = Jsoup.parse(html);
                org.jsoup.nodes.Element title = parse.getElementsByTag("title").first();
                System.out.println(title);
                Elements elementsByClass = parse.getElementsByClass("villagetr");
                System.out.println("获取数据:"+elementsByClass);
                int z=size;
                for (Element byClass : elementsByClass) {
                    Elements td = byClass.getElementsByTag("td");
                    String[] strings = new String[3];
                    int k=0;
                    for (Element element : td) {
                        String text = element.text();
                        System.out.println("td数据:"+text);
                        strings[k]=text;
                        k++;
                    }
                    values[z]=strings;
                    z++;
                }
                return elementsByClass.size();
            } else {
                //如果返回状态不是200,比如404(页面不存在)等,根据情况做处理,这里略
                System.out.println("返回状态不是200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            //6.关闭
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return 0;
    }
    public static HSSFWorkbook getHSSFWorkbook(String sheetName,String []title,String [][]values, HSSFWorkbook wb){
        if(wb == null){
            wb = new HSSFWorkbook();
        }
        HSSFSheet sheet = wb.createSheet(sheetName);
        sheet.createRow(0);
        HSSFCellStyle style = wb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);

        for(int i=0;i<values.length;i++){
            HSSFRow row = sheet.createRow(i + 1);
            for(int j=0;j<values[i].length;j++){
                row.createCell(j).setCellValue(values[i][j]);
            }
        }
        try {
            FileOutputStream fileOutputStream = new FileOutputStream("E:/beijingshi.xls");
            wb.write(fileOutputStream);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return wb;
    }
}

/*所需jar

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<dependency>
    <groupId>net.sourceforge.jexcelapi</groupId>
    <artifactId>jxl</artifactId>
    <version>2.6.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.6</version>
</dependency>

*/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值