爬虫入门案例

java爬虫入门案例

本博客仅供学习参考,示例为抓取京东上搜索"手机"得到的商品数据。

jd

1.sql脚本

-- Schema for the crawled JD product table.
-- The table name must match @Table(name = "jd_item") on the Item entity.
SET FOREIGN_KEY_CHECKS = 0;

DROP TABLE IF EXISTS `jd_item`;
CREATE TABLE `jd_item`  (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) NULL DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) NULL DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '商品标题',
  -- decimal keeps the fractional part of the price; the entity field is a Double
  `price` decimal(10, 2) NULL DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime(0) NULL DEFAULT NULL COMMENT '创建时间',
  `updated` datetime(0) NULL DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `sku`(`sku`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '京东商品表' ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;

2.pom依赖

 <!-- Spring MVC web starter -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA starter (used by ItemDao) -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.11</version>
</dependency>
<!-- Apache HttpClient (used by HttpUtils); no version given here, presumably managed by the parent POM - confirm -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
</dependency>
<!-- jsoup HTML parser (used by ItemTask) -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>
<!-- Utility library; no version given here, presumably managed by the parent POM - confirm -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
</dependency>

3.application.properties

# Data source
spring.datasource.url=jdbc:mysql://localhost:3306/dailytest?useUnicode=true&characterEncoding=UTF-8&useSSL=false&serverTimezone=Asia/Shanghai&zeroDateTimeBehavior=CONVERT_TO_NULL&allowPublicKeyRetrieval=true
spring.datasource.driverClassName=com.mysql.cj.jdbc.Driver
spring.datasource.username=root
spring.datasource.password=123456

# JPA: target database and SQL statement logging.
# (The keys spring.jpa.datasource / spring.jpa.show are not Spring Boot properties and were ignored.)
spring.jpa.database=MYSQL
spring.jpa.show-sql=true

4.pojo

/**
 * JPA entity for one crawled product row in the {@code jd_item} table.
 *
 * <p>All properties are nullable wrapper/reference types so that a partially populated
 * instance can be handed to {@code Example.of(...)} as a query probe.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    /** Standard product unit: id of the product group. */
    private Long spu;
    /** Stock keeping unit: id of the smallest sellable variant. */
    private Long sku;
    /** Product title. */
    private String title;
    /** Product price. */
    private Double price;
    /** Product image. */
    private String pic;
    /** URL of the product detail page. */
    private String url;
    /** Creation time. */
    private Date created;
    /** Last update time. */
    private Date updated;

    /** @return the primary key, or {@code null} if not set */
    public Long getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the product group id, or {@code null} if not set */
    public Long getSpu() {
        return spu;
    }

    /** @param spu the product group id */
    public void setSpu(Long spu) {
        this.spu = spu;
    }

    /** @return the variant id, or {@code null} if not set */
    public Long getSku() {
        return sku;
    }

    /** @param sku the variant id */
    public void setSku(Long sku) {
        this.sku = sku;
    }

    /** @return the product title, or {@code null} if not set */
    public String getTitle() {
        return title;
    }

    /** @param title the product title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the product price, or {@code null} if not set */
    public Double getPrice() {
        return price;
    }

    /** @param price the product price */
    public void setPrice(Double price) {
        this.price = price;
    }

    /** @return the product image value, or {@code null} if not set */
    public String getPic() {
        return pic;
    }

    /** @param pic the product image value */
    public void setPic(String pic) {
        this.pic = pic;
    }

    /** @return the detail page URL, or {@code null} if not set */
    public String getUrl() {
        return url;
    }

    /** @param url the detail page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the creation time, or {@code null} if not set */
    public Date getCreated() {
        return created;
    }

    /** @param created the creation time */
    public void setCreated(Date created) {
        this.created = created;
    }

    /** @return the last update time, or {@code null} if not set */
    public Date getUpdated() {
        return updated;
    }

    /** @param updated the last update time */
    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}

5.dao

/**
 * Repository for {@link Item} entities keyed by their {@code Long} id.
 *
 * <p>Declares no methods of its own; everything it offers is inherited from
 * {@code JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}

6.service

/**
 * Service-layer operations for crawled {@link Item} records.
 */
public interface ItemService {

    /**
     * Saves a product.
     *
     * @param item the product to save
     */
    void save(Item item);

    /**
     * Looks up products matching the given criteria.
     *
     * @param item an item whose populated properties act as the query condition
     * @return the matching products
     */
    List<Item> findAll(Item item);
}

7.service实现

/**
 * {@link ItemService} implementation that delegates every call to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Hands the item to the repository for saving.
     *
     * @param item the product to save
     */
    @Override
    public void save(Item item) {
        this.itemDao.save(item);
    }

    /**
     * Wraps {@code item} in an {@code Example} probe and returns whatever the repository
     * finds for it.
     *
     * @param item the item used as the query condition
     * @return the list returned by the repository
     */
    @Override
    public List<Item> findAll(Item item) {
        return this.itemDao.findAll(Example.of(item));
    }
}

8.启动类

/**
 * Spring Boot entry point of the crawler application.
 */
@SpringBootApplication
// Enables scheduled tasks (required for the @Scheduled crawler job).
@EnableScheduling
public class Application {
    /**
     * Starts the Spring application.
     *
     * @param args command-line arguments passed through to {@code SpringApplication.run}
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}

9.封装Httpclient

/**
 * Small wrapper around Apache HttpClient used by the crawler to download pages and images.
 *
 * <p>One pooled client is built in the constructor and reused for every request instead of
 * building a new client per call.
 */
@Component
public class HttpUtils {

    /** Directory prefix that downloaded images are written to. */
    private static final String IMAGE_DIR = "E:\\学习\\文件夹\\自学系列\\spider\\OutPutPic\\";

    private final PoolingHttpClientConnectionManager connectionManager;

    private final CloseableHttpClient httpClient;

    /** Creates the connection pool (100 connections in total, 10 per route) and the shared client. */
    public HttpUtils() {
        this.connectionManager = new PoolingHttpClientConnectionManager();
        // Maximum number of pooled connections overall and per route.
        connectionManager.setMaxTotal(100);
        connectionManager.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(connectionManager).build();
    }

    /**
     * Downloads the page at the given address.
     *
     * @param url the address to request
     * @return the response body decoded as UTF-8, or an empty string when the status is not
     *         200, the response has no body, or an I/O error occurs
     */
    public String doGetHtml(String url) {
        // try-with-resources closes the response (and releases the pooled connection) on every path.
        try (CloseableHttpResponse response = httpClient.execute(buildRequest(url))) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads the image at the given address into the image directory under a random name.
     *
     * @param url the image address
     * @return the generated file name (UUID plus the extension taken from the URL), or an
     *         empty string when the status is not 200, the response has no body, or an I/O
     *         error occurs
     */
    public String doGetImage(String url) {
        try (CloseableHttpResponse response = httpClient.execute(buildRequest(url))) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Rename the image: random UUID plus the original extension.
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                File target = new File(IMAGE_DIR + picName);
                File parent = target.getParentFile();
                if (parent != null) {
                    // Best effort; if the directory cannot be created, opening the stream fails below.
                    parent.mkdirs();
                }
                // The stream must be closed, otherwise the file handle leaks and data may stay unflushed.
                try (OutputStream outputStream = new FileOutputStream(target)) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /** Builds a GET request for {@code url} carrying the browser-like headers and the timeouts. */
    private HttpGet buildRequest(String url) {
        HttpGet httpGet = setJdHeader(new HttpGet(url));
        httpGet.setConfig(getConfig());
        return httpGet;
    }

    /**
     * Returns the file extension (including the leading dot) of the path part of {@code url},
     * ignoring any query string or fragment, or an empty string when the last path segment
     * has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        int dot = path.lastIndexOf('.');
        // A dot before the last '/' belongs to the host or a directory, not to the file name.
        if (dot < 0 || dot < path.lastIndexOf('/')) {
            return "";
        }
        return path.substring(dot);
    }

    /** Request timeouts, all in milliseconds. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectionRequestTimeout(500)   // max wait for a connection from the pool
                .setConnectTimeout(1000)            // max time to establish the connection
                .setSocketTimeout(10000)
                .build();
    }

    /**
     * Adds browser-like request headers.
     *
     * <p>The HTTP/2 pseudo-headers ({@code :authority}, {@code :method}, {@code :path},
     * {@code :scheme}) copied from browser dev tools are deliberately not set: they are not
     * valid header names on the HTTP/1.1 requests this client sends. {@code br} is not
     * advertised because no brotli decoder is configured on this client.
     */
    private HttpGet setJdHeader(HttpGet httpGet) {
        httpGet.setHeader("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpGet.setHeader("accept-encoding", "gzip, deflate");
        httpGet.setHeader("accept-language","zh-CN,zh;q=0.9");
        httpGet.setHeader("cache-control","no-cache");
        httpGet.setHeader("pragma","no-cache");
        httpGet.setHeader("referer","https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&suggest=1.his.0.0&wq=&pvid=d1daa37ee21e48a187842cccb7432d30");
        httpGet.setHeader("sec-fetch-dest","document");
        httpGet.setHeader("sec-fetch-mode","navigate");
        httpGet.setHeader("sec-fetch-site","same-origin");
        httpGet.setHeader("sec-fetch-user","?1");
        httpGet.setHeader("upgrade-insecure-requests","1");
        httpGet.setHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36");
        return httpGet;
    }
}

10.定时任务类

@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ItemService itemService;

    private static final ObjectMapper objectMapper = new ObjectMapper();

    //下载任务完成后,间隔多长时间进行下一次任务(单位:毫秒)
    @Scheduled(fixedDelay = 100*1000)
    public void itemTask()throws Exception{
        //声明需要解析的初始地址
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&s=61&click=0&page=";
        //按照页码对手机搜索结果进行解析
        for (int i=1; i<2; i+=2){
            String html = httpUtils.doGetHtml(url + i);
            parse(html);
        }
        System.out.println("手机数据抓取完成!");
    }

    private void parse(String html) throws JsonProcessingException {
        Document document = Jsoup.parse(html);
        Elements spuEles = document.select("div#J_goodsList > ul > li");
        for (Element spuEle:
             spuEles) {
            //获取spu
            long spu = Long.parseLong(spuEle.attr("data-spu"));
            //获取sku
            Elements skuEles = spuEle.select("li.ps-item");
            for (Element skuEle :
                    skuEles) {
                long sku = Long.parseLong(skuEle.select("[data-sku]").attr("data-sku"));
                //根据sku查询商品数据
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = itemService.findAll(item);
                if (list.size()>0){
                    continue;
                }
                item.setSpu(spu);
                //获取商品详情url
                String itemUrl = "https://item.jd.com/" + sku +".html";
                item.setUrl(itemUrl);
                String src = "https:"+ skuEle.select("img[data-sku]").first().attr("data-lazy-img");
                src = src.replace("/n9/", "/n1/");
                String image = httpUtils.doGetImage(src);
                item.setPic(src);
                //获取商品价格
                String priceHtml = httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                double price = objectMapper.readTree(priceHtml).get(0).get("p").asDouble();
                item.setPrice(price);

                //获取标题
                String itemInfo = httpUtils.doGetHtml(item.getUrl());
                String text = Jsoup.parse(itemInfo).select("div.sku-name").text();
                item.setTitle(text);
                item.setCreated(new Date());
                item.setUpdated(item.getCreated());
                //保存到数据库
                itemService.save(item);
            }
        }
    }
}

11.结果

爬jd结果
爬jd手机图片

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值