爬虫之爬取JD商品

前言:

案例:搜索JD官网的内存条信息,搜索地址为:https://search.jd.com/search?keyword=8g%E5%86%85%E5%AD%98%E6%9D%A1%20ddr4&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.def.0.V07&wq=8G&uc=0#J_searchWrap

并将搜索到的商品信息保存到本地数据库,保存的表为 product(原文此处的表结构图片已丢失;按下文 ProductDao.addProduct 的插入顺序,各列依次为 pid、title、price、pname、url、brand):

整个案例使用的Maven配置(pom.xml)如下:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the spiderDay02 module (child of com.xucj:spiderParent). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>spiderParent</artifactId>
        <groupId>com.xucj</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>spiderDay02</artifactId>
    <dependencies>
        <!-- Apache HttpClient: used by HttpClientUtils to send GET/POST requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- jsoup: HTML parsing and CSS-selector queries in the spider -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.2</version>
        </dependency>
        <!-- MySQL JDBC driver (com.mysql.jdbc.Driver is configured in ProductDao) -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.34</version>
        </dependency>
        <!-- Gson: parses the JSON returned by the price endpoint -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.0</version>
        </dependency>
        <!-- Spring JDBC: provides JdbcTemplate, the base class of ProductDao -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.0.6.RELEASE</version>
        </dependency>
        <!-- c3p0: provides ComboPooledDataSource, the connection pool used by ProductDao -->
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>
        <!-- Lombok: generates accessors/constructors for the Product POJO -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.18</version>
        </dependency>
        <!-- commons-lang3: NOTE(review) not referenced by the classes shown in this article; confirm it is still needed -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.3.2</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Compiler plugin: compiles with Java 1.8 source/target and UTF-8 source encoding -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>utf-8</encoding>
                </configuration>
            </plugin>
            <!-- Packaging plugin: assembles the project using the jar-with-dependencies descriptor -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest> <!-- Note: this sets the program's main entry point (Main-Class) -->
                            <mainClass>com.xucj.index.IndexJdSpider</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

爬取类:

import com.google.gson.Gson;
import com.xucj.dao.ProductDao;
import com.xucj.entity.Product;
import com.xucj.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

//获取搜索内存条的pid
/**
 * Crawls JD.com search results for DDR4 8G memory modules and stores each product.
 *
 * <p>Pipeline: {@link #manyPage()} (producer) downloads the search-result pages and
 * pushes every product id (the {@code data-sku} attribute) onto a bounded queue;
 * the workers started by {@link #manyThreadRun()} (consumers) take ids from the
 * queue, download the product detail page and price, and persist the result via
 * {@link ProductDao}.
 */
public class IndexJdSpider {

    /** Number of consumer threads (also the size of the thread pool). */
    private static final int WORKER_COUNT = 30;
    /** Number of search-result pages to crawl. */
    private static final int PAGE_COUNT = 100;
    /** Price stored when the real price could not be fetched or parsed. */
    private static final double UNKNOWN_PRICE = -1;

    /** The search page this example is based on (kept for reference; paging builds its own URLs). */
    private static String url = "https://search.jd.com/search?keyword=8g%E5%86%85%E5%AD%98%E6%9D%A1%20ddr4&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.def.0.V07&wq=8G&uc=0#J_searchWrap";
    private static ProductDao productDao = new ProductDao();
    /**
     * Set to {@code true} once the producer has queued its last product id.
     * Volatile because it is written by the main thread and read by the
     * monitor thread and every worker.
     */
    private static volatile boolean isEnd = false;
    /** Bounded queue of product ids waiting to be processed. */
    private static BlockingQueue<String> blockingQueue = new ArrayBlockingQueue<String>(1000);
    /** Thread pool running the consumer workers. */
    private static ExecutorService executorService = Executors.newFixedThreadPool(WORKER_COUNT);

    /**
     * Entry point: starts a queue-size monitor, starts the workers, crawls all
     * pages, then asks the pool to shut down once the workers have drained the queue.
     *
     * @param args unused
     * @throws IOException          declared for backward compatibility
     * @throws InterruptedException declared for backward compatibility
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (true){
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        // Restore the flag and stop monitoring instead of spinning on.
                        Thread.currentThread().interrupt();
                        return;
                    }
                    System.out.println("当前队列个数:"+blockingQueue.size());
                    if(isEnd){
                        break;
                    }
                }
            }
        }).start();
        manyThreadRun();
        manyPage();
        // Workers exit on their own once the queue is drained; shutdown() lets the
        // pool threads terminate afterwards so the JVM can exit.
        executorService.shutdown();
    }

    /**
     * Starts {@code WORKER_COUNT} consumers. Each one repeatedly takes a product id
     * from the queue, builds the product and stores it. A worker stops only when the
     * producer has finished AND the queue is empty, so no queued id is dropped.
     */
    public static void manyThreadRun(){
        for(int i=0;i<WORKER_COUNT;i++){
            executorService.execute(new Runnable() {
                @Override
                public void run() {
                    while (true){
                        // Read the flag BEFORE polling: if the producer was already done,
                        // an empty poll afterwards proves the queue is fully drained.
                        boolean producerDone = isEnd;
                        String pid;
                        try {
                            pid = blockingQueue.poll(1, TimeUnit.SECONDS);
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                            return;
                        }
                        if(pid == null){
                            if(producerDone){
                                return;
                            }
                            continue;
                        }
                        try {
                            productDao.addProduct(parseProduct(pid));
                        } catch (Exception e) {
                            // One bad product must not kill the worker.
                            e.printStackTrace();
                        }
                    }
                }
            });
        }
    }

    /**
     * Crawls {@code PAGE_COUNT} search-result pages and queues every product id found.
     * A page that fails to download is logged and skipped; {@code isEnd} is always
     * set when this method returns so that the monitor and the workers can stop.
     */
    public static void manyPage() {
        try {
            for(int i=1;i<=PAGE_COUNT;i++) {
                // JD numbers its result pages 1, 3, 5, ... hence (i * 2 - 1).
                String nextPageUrl = "https://search.jd.com/search?keyword=8g%E5%86%85%E5%AD%98%E6%9D%A1%20ddr4&page=" + (i * 2 - 1);
                try {
                    parseProductListHtml(HttpClientUtils.doGet(nextPageUrl));
                } catch (IOException e) {
                    e.printStackTrace();
                }
                if(Thread.currentThread().isInterrupted()){
                    // parseProductListHtml was interrupted while queueing; stop producing.
                    break;
                }
            }
        } finally {
            isEnd = true;
        }
    }

    /**
     * Parses one search-result page and queues the {@code data-sku} of every
     * list item under {@code #J_goodsList}. Items without a sku are skipped.
     * Blocks while the queue is full.
     *
     * @param html HTML of a search-result page
     */
    public static void parseProductListHtml(String html) {
        Document document = Jsoup.parse(html);
        Elements liEl = document.select("#J_goodsList ul li");
        for (Element li : liEl) {
            String pid = li.attr("data-sku");
            if (pid.isEmpty()) {
                // attr() returns "" when the attribute is missing; such an id would
                // produce a request for "https://item.jd.com/.html".
                continue;
            }
            try {
                blockingQueue.put(pid);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Builds a {@link Product} for the given product id from its detail page and
     * the separate price endpoint.
     *
     * @param pid product id (sku)
     * @return the product; never {@code null}. If the page or the price cannot be
     *         fetched or parsed the failure is logged, the fields gathered so far
     *         are kept and the price is {@code -1}.
     */
    public static Product parseProduct(String pid)  {
        Product product = new Product();
        String iter_url = "https://item.jd.com/"+pid+".html";
        // Set the identifying fields first so they survive a failed download.
        product.setPid(pid);
        product.setUrl(iter_url);
        double price = UNKNOWN_PRICE;
        try {
            String html = HttpClientUtils.doGet(iter_url);

            Document document = Jsoup.parse(html);
            // title
            Elements title = document.select(".sku-name");
            product.setTitle(title.text());
            // brand
            Elements brand = document.select("#parameter-brand");
            product.setBrand(brand.text());
            // product name
            Elements name = document.select("[class=parameter2 p-parameter-list] li:first-child");
            product.setPname(name.attr("title"));
            // The price is not part of the detail page; it is fetched separately.
            String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_"+pid;
            String priceJsonStr = HttpClientUtils.doGet(priceUrl);
            Gson gson = new Gson();
            System.out.println(priceUrl);
            @SuppressWarnings("unchecked") // a JSON array of objects is deserialized as a List of Maps
            List<Map<String,Object>> list = gson.fromJson(priceJsonStr, List.class);
            if(list != null && !list.isEmpty()){
                Object p = list.get(0).get("p");
                if(p != null){
                    // toString() works whether the value arrives as a JSON string or number.
                    price = Double.parseDouble(p.toString());
                }
            }
        } catch (IOException | RuntimeException e) {
            // Malformed JSON or a non-numeric price ends up here as well; the
            // product is still returned with the unknown-price marker.
            e.printStackTrace();
        }
        product.setPrice(price);
        return product;
    }

}

Dao层:

import com.mchange.v2.c3p0.ComboPooledDataSource;
import com.xucj.entity.Company;
import com.xucj.entity.PostInfo;
import com.xucj.entity.Product;
import org.springframework.jdbc.core.JdbcTemplate;

import java.beans.PropertyVetoException;

/**
 * JDBC access object for the spider's tables. It is itself a {@link JdbcTemplate}
 * backed by a c3p0 pooled data source pointing at the local {@code spider13}
 * MySQL database.
 */
public class ProductDao extends JdbcTemplate{

    /**
     * Creates the pooled data source and installs it on this template.
     * A driver-class veto is printed and otherwise ignored.
     */
    public ProductDao()  {
        ComboPooledDataSource pool = new ComboPooledDataSource();
        pool.setJdbcUrl("jdbc:mysql://localhost:3306/spider13");
        pool.setUser("root");
        pool.setPassword("root");
        try {
            // Only this setter declares PropertyVetoException.
            pool.setDriverClass("com.mysql.jdbc.Driver");
        } catch (PropertyVetoException vetoed) {
            vetoed.printStackTrace();
        }
        super.setDataSource(pool);
    }

    /**
     * Inserts one row into {@code product}; values are bound positionally in the
     * order pid, title, price, pname, url, brand.
     *
     * @param product the product to store
     */
    public void addProduct(Product product){
        update("insert into product values(?,?,?,?,?,?)",
                product.getPid(),
                product.getTitle(),
                product.getPrice(),
                product.getPname(),
                product.getUrl(),
                product.getBrand());
    }

    /**
     * Inserts one row into {@code company}; values are bound positionally in the
     * order cid, cname, city, size, type, url.
     *
     * @param company the company to store
     * @return the row count reported by {@code update}
     */
    public int addCompany(Company company){
        return update("insert into company values(?,?,?,?,?,?)",
                company.getCid(),
                company.getCname(),
                company.getCity(),
                company.getSize(),
                company.getType(),
                company.getUrl());
    }

    /**
     * Inserts one row into {@code postInfo}; the sixteen values are bound
     * positionally in the order listed below.
     *
     * @param postInfo the job posting to store
     * @return the row count reported by {@code update}
     */
    public int addPostInfo(PostInfo postInfo){
        return update("insert into postInfo values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                postInfo.getPid(),
                postInfo.getJobName(),
                postInfo.getSalary(),
                postInfo.getPostAddres(),
                postInfo.getWorkingExp(),
                postInfo.getEduLevel(),
                postInfo.getEmplType(),
                postInfo.getPeopleNum(),
                postInfo.getWelfare(),
                postInfo.getPositionInfo(),
                postInfo.getCid(),
                postInfo.getPositionURL(),
                postInfo.getUpdateDate(),
                postInfo.getCreateDate(),
                postInfo.getEndDate(),
                postInfo.getIntroduce());
    }
}

POJO类:

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;

/**
 * One crawled product. Accessors, {@code equals}/{@code hashCode}/{@code toString}
 * and both constructors are generated by Lombok; setters are chainable.
 * The all-args constructor takes the fields in declaration order.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
@Accessors(chain = true)
@SuppressWarnings("serial")
public class Product {
    /** Product id (sku). */
    private String pid;
    /** URL of the product detail page. */
    private String url;
    /** Product price. */
    private double price;
    /** Brand text. */
    private String brand;
    /** Title text. */
    private String title;
    /** Product name. */
    private String pname;
}

HttpClientUtils连接池工具类:

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Small HTTP helper built on a pooled Apache HttpClient.
 *
 * <p>A single client backed by one connection pool is shared by all callers;
 * every response is closed after its body has been read so the underlying
 * connection always goes back to the pool.
 */
public class HttpClientUtils {

    /** Charset used both to decode response bodies and to encode POST forms. */
    private static final String CHARSET = "UTF-8";

    private static final PoolingHttpClientConnectionManager connectionManager;

    /** Shared client; building one per request is wasteful and none of them was ever closed. */
    private static final CloseableHttpClient httpClient;

    /**
     * Per-request timeouts (milliseconds):
     * connection-request timeout = max wait for a connection from the pool,
     * connect timeout = max time to establish the connection,
     * socket timeout = max wait for data while transferring.
     */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(10 * 1000)
            .build();

    static {
        // Create the connection pool.
        connectionManager = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool.
        // NOTE(review): callers using more than 20 concurrent threads will queue for
        // connections and may hit the 5 s connection-request timeout.
        connectionManager.setMaxTotal(20);
        // Maximum number of concurrent connections per host (route).
        connectionManager.setDefaultMaxPerRoute(20);
        httpClient = HttpClients.custom().setConnectionManager(connectionManager).build();
    }

    /** Returns the shared pooled client. */
    private static CloseableHttpClient getCloseableHttpClient(){
        return httpClient;
    }

    /**
     * Executes the request with a browser User-Agent and the standard timeouts
     * and returns the response body decoded as UTF-8.
     *
     * @param httpRequestBase the request to send
     * @return the response body, or an empty string when the response has no body
     * @throws IOException on connection, timeout or read failure
     */
    private static String execute(HttpRequestBase httpRequestBase) throws IOException {
        httpRequestBase.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36");
        httpRequestBase.setConfig(REQUEST_CONFIG);

        // try-with-resources guarantees the connection is released even when
        // reading the body fails or the response carries no entity.
        try (CloseableHttpResponse response = getCloseableHttpClient().execute(httpRequestBase)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            return EntityUtils.toString(entity, CHARSET);
        }
    }

    /**
     * Sends a GET request.
     *
     * @param url the URL to fetch
     * @return the response body
     * @throws IOException on connection, timeout or read failure
     */
    public static String doGet(String url) throws IOException {
        return execute(new HttpGet(url));
    }

    /**
     * Sends a POST request with the given parameters as a URL-encoded form.
     *
     * @param url   the URL to post to
     * @param param form fields; {@code null} or empty sends an empty form
     * @return the response body
     * @throws IOException on connection, timeout or read failure
     */
    public static String doPost(String url, Map<String,String> param) throws IOException {
        HttpPost httpPost = new HttpPost(url);
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        if (param != null) {
            for (Map.Entry<String,String> entry : param.entrySet()) {
                list.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        // Encode explicitly as UTF-8: the one-argument constructor falls back to
        // ISO-8859-1, which corrupts non-Latin (e.g. Chinese) values.
        HttpEntity httpEntity = new UrlEncodedFormEntity(list, CHARSET);
        httpPost.setEntity(httpEntity);
        return execute(httpPost);
    }

}

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值