简单的爬取网站数据demo

简单的爬取网站数据demo

思路

  1. 网站如果有获取数据的api,我们可以考虑利用api来获取数据
  2. 网站没有api或者api封装的比较谨慎,我们则需要获取页面源码并解析html来获取数据.

引入依赖

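该依赖(jsoup)的主要作用是将html网页源码解析为可操作的Document对象。注意:下面的demo还使用了Apache POI(org.apache.poi,用于生成Excel文件)和JUnit(用于运行测试方法),运行前也需要在项目中引入这两个依赖。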

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>1.13.1</version>
</dependency>

获取html源码并解析demo

package jpgk.text.demo;

import org.apache.poi.hssf.usermodel.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Demo that scrapes unit prices from paginated listing pages and exports them to an .xls file.
 *
 * <p>Each page is downloaded with Jsoup; the text of every element carrying the class
 * {@code unit-price} inside the element with id {@code houselist-mod-new} is collected
 * (with the "元/m²" suffix stripped), and the collected values are written one per row
 * into the first column of a sheet.
 */
public class GetHtmlPage {
    /** Number of the last listing page requested; pages 1..LAST_PAGE are fetched. */
    private static final int LAST_PAGE = 99;

    /** Path the generated workbook is written to. */
    private static final String OUTPUT_PATH = "D:\\花桥放房价.xls";

    /** Color index applied to both fonts created by the style helpers. */
    private static final short FONT_COLOR_INDEX = (short) 56;

    /**
     * Fetches every listing page, gathers the unit prices and writes them to {@link #OUTPUT_PATH}.
     *
     * <p>A page that fails to download is reported on stderr and skipped, so the export
     * contains whatever could be retrieved from the remaining pages.
     *
     * @throws Exception if the workbook cannot be written to disk
     */
    @Test
    public void test1() throws Exception {
        // Create the Excel workbook.
        HSSFWorkbook workbook = new HSSFWorkbook();
        // Header and title styles are created on the workbook here; no cell in this
        // demo applies them yet.
        HSSFCellStyle headStyle = getHeadCellStyle(workbook);
        HSSFCellStyle titleStyle = getTitleCellStyle(workbook);
        // Create the sheet that receives the prices.
        HSSFSheet sheet = workbook.createSheet("花桥房价");

        List<String> unitPrices = new ArrayList<>();
        for (int page = 1; page <= LAST_PAGE; page++) {
            System.err.println("开始抓取第" + page + "页数据");
            try {
                unitPrices.addAll(fetchUnitPrices(buildPageUrl(page)));
            } catch (IOException e) {
                // Best effort: a page that cannot be downloaded is reported and skipped.
                e.printStackTrace();
            }
        }

        System.err.println("花桥房屋单价:" + unitPrices);
        System.err.println("花桥房屋单价数据条数:" + unitPrices.size());

        // One price per row, always in the first column.
        for (int rowIndex = 0; rowIndex < unitPrices.size(); rowIndex++) {
            HSSFRow row = sheet.createRow(rowIndex);
            row.createCell(0).setCellValue(unitPrices.get(rowIndex));
        }

        // try-with-resources closes the stream even when write() throws.
        try (FileOutputStream out = new FileOutputStream(OUTPUT_PATH)) {
            workbook.write(out);
        }
    }

    /**
     * Builds the request URL for one listing page.
     *
     * @param page 1-based page number; page 1 has no {@code p<N>-} path segment
     * @return the URL of that page
     */
    private static String buildPageUrl(int page) {
        StringBuilder url = new StringBuilder("https://shanghai.*****.com/sale/");
        if (page > 1) {
            url.append("p").append(page).append("-");
        }
        url.append("rd1/?from=zjsr&kw=%E8%8A%B1%E6%A1%A5");
        url.append("#filtersort");
        return url.toString();
    }

    /**
     * Downloads one listing page and extracts the unit prices found on it.
     *
     * @param url page to download
     * @return the price texts with the "元/m²" suffix removed; empty when the page has no
     *     element with id {@code houselist-mod-new}
     * @throws IOException if the page cannot be downloaded
     */
    private static List<String> fetchUnitPrices(String url) throws IOException {
        // Note: this is Jsoup's Connection, not a JDBC/network one.
        Connection connection = Jsoup.connect(url);
        Document document = connection.get();

        List<String> prices = new ArrayList<>();
        Element listing = document.getElementById("houselist-mod-new");
        if (listing == null) {
            // getElementById yields null when the id is absent; skip instead of hitting an NPE.
            System.err.println("页面中未找到houselist-mod-new元素,跳过: " + url);
            return prices;
        }

        Elements priceElements = listing.getElementsByClass("unit-price");
        for (Element priceElement : priceElements) {
            prices.add(priceElement.text().replace("元/m²", ""));
        }
        return prices;
    }

    /**
     * Creates the header cell style: 18pt Arial, centered horizontally and vertically.
     *
     * @param workbook workbook that owns the created font and style
     * @return the new style
     */
    private HSSFCellStyle getHeadCellStyle(HSSFWorkbook workbook) {
        return createCenteredStyle(workbook, (short) 18);
    }

    /**
     * Creates the title cell style: 11pt Arial, centered horizontally and vertically.
     *
     * @param workbook workbook that owns the created font and style
     * @return the new style
     */
    private HSSFCellStyle getTitleCellStyle(HSSFWorkbook workbook) {
        return createCenteredStyle(workbook, (short) 11);
    }

    /**
     * Creates a centered cell style using an Arial font of the given size.
     *
     * @param workbook workbook that owns the created font and style
     * @param fontHeightInPoints font size in points
     * @return the new style
     */
    private static HSSFCellStyle createCenteredStyle(HSSFWorkbook workbook, short fontHeightInPoints) {
        HSSFFont font = workbook.createFont();
        font.setFontName("Arial");
        font.setFontHeightInPoints(fontHeightInPoints);
        font.setColor(FONT_COLOR_INDEX);

        HSSFCellStyle style = workbook.createCellStyle();
        // Vertical alignment.
        style.setVerticalAlignment(HSSFCellStyle.VERTICAL_CENTER);
        // Horizontal alignment.
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
        style.setFont(font);
        return style;
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值