java实现maven项目最简单的网络爬虫并导出爬取的数据

一、pom包引入

         <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpmime</artifactId>
            <version>4.5.3</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>

       <!--   poi的依赖包 -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-excelant</artifactId>
            <version>3.16</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-examples</artifactId>
            <version>3.16</version>
        </dependency>

二、创建实体类 Product,属性可以根据自己的业务定义(生成 set/get 方法的快捷键:alt+insert)

     private String productName; // product name
     private String specs;// specification
     private String brand; // brand
     private String lowerPrice; // price
     private String OfferArea; // quotation region. NOTE(review): starts with an uppercase letter unlike the sibling fields; renaming would need the accessors (not shown here) updated too
     private String OfferCompany; // quoting company. NOTE(review): same capitalisation inconsistency as OfferArea
     private String offerDate; // quotation date

三、抓取数据业务代码

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.dark.pojo.Product;
import com.dark.util.POItoExcel;

public class Reptile {
       public static void main(String[] args) throws FileNotFoundException, IOException {
    	   List<Product> list=getInfor("https://xfm.dazpin.com/prices/");    //要抓取数据的 url 返回多条数据,
    	   POItoExcel.toExcel(list); //导出到Excel 根据业务可以存入数据库
	}

       public static List<Product> getInfor(String url){
    	   List<Product> proList=new ArrayList<Product>();
    		 try {
				 Document doc=Jsoup.connect(url).get(); //使用Jsoup 解析HTML
				 Elements pages=doc.select(".pages a"); //.pages a  HTML类选择器 pages  下面的 a 标签,这里是获取最大页数
				 for (int i = 0; i < pages.size(); i++) { //遍历页数 抓取每页数据
				 	 url = "https://xfm.dazpin.com/prices/"+ (i + 1) +".html";
					 doc=Jsoup.connect(url).get();
					 Elements table=doc.select(".b-j-con table:first-child"); // .b-j-con 找到要抓取数据的table表 (.xx 代表类选择器)
					 Elements tbody=table.select("tbody");//获取到表单的体
					 Elements trList=tbody.select("tr");//找到 tr 标签 里面的数据
					 trList.remove(0);
					 for(Element tr:trList){ //遍历抓取数据
						 Elements tdList=tr.select("td");
						 Product product=new Product();
                              //该td 标签下面有二级标签 product.setProductName(tdList.get(0).select("p a").html().toString());//品名 product.setSpecs(tdList.get(1).html().toString());//规格 product.setBrand(tdList.get(2).html().toString());//品牌 product.setLowerPrice(tdList.get(3).html().toString());//价格 product.setOfferArea(tdList.get(4).select("p").html().toString());//报价区域 product.setOfferCompany(tdList.get(5).select("p").html().toString());//报价企业 product.setOfferDate(tdList.get(6).html().toString()); proList.add(product); //数据封装List } } } catch (IOException e) { e.printStackTrace(); } return proList; } }

 四、导出数据 (根据业务可以存入数据库) 

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.HorizontalAlignment;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.VerticalAlignment;
import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import com.dark.pojo.Product;


public class POItoExcel {
	/** Captions of the header row, in column order. */
	private static final String[] HEADERS = {
		"品名", "规格型号", "品牌", "价格", "报价区域", "报价企业", "报价时间"
	};

	/**
	 * Destination of the generated workbook. XSSFWorkbook writes the OOXML
	 * format, so the file carries the .xlsx extension (it was .xls, which does
	 * not match the content that is written).
	 */
	private static final String OUTPUT_PATH = "E:\\\\测试.xlsx";

	/**
	 * Writes the given products into a new workbook and saves it to
	 * {@link #OUTPUT_PATH}.
	 *
	 * <p>Layout: rows 0-1 hold a merged, centred title across the seven columns,
	 * row 2 holds the column captions, and product rows start at row 3.
	 *
	 * @param list products to export; {@code null} is treated as an empty list
	 * @throws FileNotFoundException if the output file cannot be opened for writing
	 * @throws IOException if writing or closing the workbook fails
	 */
	public static void toExcel(List<Product> list) throws FileNotFoundException, IOException{
		XSSFWorkbook workBook = new XSSFWorkbook();
		try {
			XSSFSheet sheet = workBook.createSheet();

			// Title: merge rows 0-1 over columns 0-6 and centre the text.
			sheet.addMergedRegion(new CellRangeAddress(0, 1, 0, 6));
			Cell titleCell = sheet.createRow(0).createCell(0);
			titleCell.setCellValue("新凤鸣报价单");
			CellStyle titleStyle = workBook.createCellStyle();
			titleStyle.setAlignment(HorizontalAlignment.CENTER);
			titleStyle.setVerticalAlignment(VerticalAlignment.CENTER);
			titleStyle.setFillBackgroundColor((short) 59);
			titleCell.setCellStyle(titleStyle);

			// Column captions.
			Row headerRow = sheet.createRow(2);
			for (int col = 0; col < HEADERS.length; col++) {
				headerRow.createCell(col).setCellValue(HEADERS[col]);
			}

			// One row per product, starting right below the captions.
			int count = (list == null) ? 0 : list.size();
			for (int i = 0; i < count; i++) {
				Product product = list.get(i);
				Row dataRow = sheet.createRow(i + 3);
				dataRow.createCell(0).setCellValue(product.getProductName());
				dataRow.createCell(1).setCellValue(product.getSpecs());
				dataRow.createCell(2).setCellValue(product.getBrand());
				dataRow.createCell(3).setCellValue(product.getLowerPrice());
				dataRow.createCell(4).setCellValue(product.getOfferArea());
				dataRow.createCell(5).setCellValue(product.getOfferCompany());
				dataRow.createCell(6).setCellValue(product.getOfferDate());
			}

			// Close the stream even when write() fails so the file handle is not leaked.
			FileOutputStream out = new FileOutputStream(new File(OUTPUT_PATH));
			try {
				workBook.write(out);
			} finally {
				out.close();
			}
		} finally {
			workBook.close();
		}
	}
}

  

 

转载于:https://www.cnblogs.com/citime/p/10058636.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值