前言:
本案例使用多线程爬取京东搜索结果中的商品信息,并将搜索到的商品信息保存到本地数据库,保存的表为 product 表。
整个案例使用的 Maven 配置(pom.xml)如下:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the spiderDay02 crawler module (child of com.xucj:spiderParent). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>spiderParent</artifactId>
<groupId>com.xucj</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>spiderDay02</artifactId>
<dependencies>
<!-- HTTP client used by HttpClientUtils to download pages -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<!-- HTML parser used to select elements from the downloaded pages -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<!-- MySQL JDBC driver (com.mysql.jdbc.Driver) -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.34</version>
</dependency>
<!-- JSON parsing of the price response -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.0</version>
</dependency>
<!-- JdbcTemplate, the base class of ProductDao -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-jdbc</artifactId>
<version>4.0.6.RELEASE</version>
</dependency>
<!-- c3p0 JDBC connection pool (ComboPooledDataSource) -->
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
</dependency>
<!-- Lombok annotations used by the entity classes -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.18</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.2</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compiler plugin: sets the Java source/target level (1.8) and the source encoding -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>utf-8</encoding>
</configuration>
</plugin>
<!-- Packaging plugin: uses the jar-with-dependencies descriptor -->
<!-- NOTE(review): no <version> or <executions> is declared for this plugin here; presumably it is
     invoked explicitly (e.g. assembly:single) or configured in the parent POM - confirm. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest> <!-- Note: this sets the program's main entry point -->
<mainClass>com.xucj.index.IndexJdSpider</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</build>
</project>
爬取类:
import com.google.gson.Gson;
import com.google.gson.JsonParseException;
import com.xucj.dao.ProductDao;
import com.xucj.entity.Product;
import com.xucj.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Crawls JD search-result pages for memory modules, collects the product ids (pids)
 * found on each page into a bounded queue, and lets a pool of worker threads turn every
 * pid into a {@link Product} that is stored through {@link ProductDao}.
 *
 * <p>Threading model: the main thread is the only producer ({@link #manyPage()}); the
 * workers started by {@link #manyThreadRun()} are the consumers. The producer sets
 * {@code isEnd} once it will enqueue nothing more; workers exit when the flag is set and
 * the queue is drained.
 */
public class IndexJdSpider {
    /** Search-result URL without the page number; the page number is appended. */
    private static final String SEARCH_PAGE_URL =
            "https://search.jd.com/search?keyword=8g%E5%86%85%E5%AD%98%E6%9D%A1%20ddr4&page=";
    /** Number of search-result pages to crawl. */
    private static final int PAGE_COUNT = 100;
    /** Number of consumer threads (also the pool size). */
    private static final int WORKER_COUNT = 30;
    /** Price stored when the real price could not be obtained. */
    private static final double UNKNOWN_PRICE = -1;
    /** Gson instance shared by all workers. */
    private static final Gson GSON = new Gson();

    private static ProductDao productDao = new ProductDao();
    /** Set by the producer when no more pids will be enqueued; volatile because workers read it. */
    private static volatile boolean isEnd = false;
    /** Bounded queue of pids waiting to be processed. */
    private static BlockingQueue<String> blockingQueue = new ArrayBlockingQueue<>(1000);
    /** Pool that runs the consumer loops. */
    private static ExecutorService executorService = Executors.newFixedThreadPool(WORKER_COUNT);

    /**
     * Starts a queue-size monitor and the consumers, crawls all pages, then requests pool
     * shutdown so the JVM can exit once the consumers have drained the queue.
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        Thread monitor = new Thread(() -> {
            while (true) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
                System.out.println("当前队列个数:" + blockingQueue.size());
                if (isEnd) {
                    return;
                }
            }
        });
        monitor.start();
        manyThreadRun();
        manyPage();
        // Already-submitted consumer loops keep running until they finish on their own.
        executorService.shutdown();
    }

    /** Submits {@code WORKER_COUNT} consumer loops to the thread pool. */
    public static void manyThreadRun() {
        for (int i = 0; i < WORKER_COUNT; i++) {
            executorService.execute(IndexJdSpider::consumeQueue);
        }
    }

    /**
     * Consumer loop: processes pids until the producer has finished and the queue is empty.
     * Every pid that is removed from the queue is processed; a failure on one pid is
     * reported and does not stop the worker.
     */
    private static void consumeQueue() {
        while (true) {
            String pid;
            try {
                // Timed poll instead of take(): a worker must be able to notice isEnd
                // even when nothing more will ever arrive.
                pid = blockingQueue.poll(1, TimeUnit.SECONDS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
            if (pid == null) {
                if (!isEnd) {
                    continue;
                }
                // isEnd is written after the last put, so one more non-blocking poll
                // sees anything enqueued while the timed poll was expiring.
                pid = blockingQueue.poll();
                if (pid == null) {
                    return;
                }
            }
            try {
                productDao.addProduct(parseProduct(pid));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads {@code PAGE_COUNT} search-result pages and enqueues the pids found on each.
     * A page that fails to download is reported and skipped. {@code isEnd} is always set
     * when this method returns, even on an unexpected failure, so consumers can terminate.
     */
    public static void manyPage() {
        try {
            for (int page = 1; page <= PAGE_COUNT; page++) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                String nextPageUrl = SEARCH_PAGE_URL + (page * 2 - 1);
                try {
                    parseProductListHtml(HttpClientUtils.doGet(nextPageUrl));
                } catch (IOException e) {
                    // One unreachable page must not abort the remaining pages.
                    e.printStackTrace();
                }
            }
        } finally {
            isEnd = true;
        }
    }

    /**
     * Extracts the {@code data-sku} value of every list item under {@code #J_goodsList}
     * and puts it on the queue, blocking while the queue is full.
     *
     * @param html search-result page markup; {@code null} is ignored
     */
    public static void parseProductListHtml(String html) {
        if (html == null) {
            return;
        }
        Document document = Jsoup.parse(html);
        Elements items = document.select("#J_goodsList ul li");
        for (Element item : items) {
            String pid = item.attr("data-sku");
            // attr() yields an empty string when the attribute is absent; such items
            // are not products and would produce an invalid item URL.
            if (pid.isEmpty()) {
                continue;
            }
            try {
                blockingQueue.put(pid);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Builds a {@link Product} for the given pid from its item page and the separate
     * price endpoint.
     *
     * @param pid product id
     * @return the product; fields that could not be fetched are left unset and the price
     *         is {@code -1} when it could not be obtained
     */
    public static Product parseProduct(String pid) {
        Product product = new Product();
        double price = UNKNOWN_PRICE;
        try {
            String itemUrl = "https://item.jd.com/" + pid + ".html";
            Document document = Jsoup.parse(HttpClientUtils.doGet(itemUrl));
            product.setTitle(document.select(".sku-name").text());
            product.setUrl(itemUrl);
            product.setPid(pid);
            product.setBrand(document.select("#parameter-brand").text());
            Elements name = document.select("[class=parameter2 p-parameter-list] li:first-child");
            product.setPname(name.attr("title"));
            // The price is not part of the item page; it is served by a separate endpoint.
            String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + pid;
            String priceJsonStr = HttpClientUtils.doGet(priceUrl);
            System.out.println(priceUrl);
            price = parsePrice(priceJsonStr);
        } catch (IOException e) {
            e.printStackTrace();
        }
        product.setPrice(price);
        return product;
    }

    /**
     * Reads the {@code "p"} member of the first element of the price JSON array.
     *
     * @return the parsed price, or {@code -1} when the JSON is not an array of objects,
     *         has no {@code "p"} member, or the value is not a number
     */
    private static double parsePrice(String priceJson) {
        try {
            List<?> entries = GSON.fromJson(priceJson, List.class);
            if (entries == null || entries.isEmpty() || !(entries.get(0) instanceof Map)) {
                return UNKNOWN_PRICE;
            }
            Object raw = ((Map<?, ?>) entries.get(0)).get("p");
            // Gson yields String for quoted values and Double for bare numbers; handle both.
            return raw == null ? UNKNOWN_PRICE : Double.parseDouble(raw.toString());
        } catch (JsonParseException | NumberFormatException e) {
            e.printStackTrace();
            return UNKNOWN_PRICE;
        }
    }
}
Dao层:
import com.mchange.v2.c3p0.ComboPooledDataSource;
import com.xucj.entity.Company;
import com.xucj.entity.PostInfo;
import com.xucj.entity.Product;
import org.springframework.jdbc.core.JdbcTemplate;
import java.beans.PropertyVetoException;
/**
 * Data-access object for the crawler. It is itself a {@link JdbcTemplate} whose data
 * source is a c3p0 connection pool configured in the constructor.
 */
public class ProductDao extends JdbcTemplate{

    /** Configures the pooled MySQL data source and installs it on this template. */
    public ProductDao() {
        ComboPooledDataSource pool = new ComboPooledDataSource();
        pool.setJdbcUrl("jdbc:mysql://localhost:3306/spider13");
        pool.setUser("root");
        pool.setPassword("root");
        try {
            pool.setDriverClass("com.mysql.jdbc.Driver");
        } catch (PropertyVetoException vetoed) {
            vetoed.printStackTrace();
        }
        super.setDataSource(pool);
    }

    /**
     * Inserts one row into {@code product}.
     * Values are bound positionally: pid, title, price, pname, url, brand.
     *
     * @param product the product to store
     */
    public void addProduct(Product product){
        update("insert into product values(?,?,?,?,?,?)",
                product.getPid(),
                product.getTitle(),
                product.getPrice(),
                product.getPname(),
                product.getUrl(),
                product.getBrand());
    }

    /**
     * Inserts one row into {@code company}.
     * Values are bound positionally: cid, cname, city, size, type, url.
     *
     * @param company the company to store
     * @return the row count reported by {@code update}
     */
    public int addCompany(Company company){
        return update("insert into company values(?,?,?,?,?,?)",
                company.getCid(),
                company.getCname(),
                company.getCity(),
                company.getSize(),
                company.getType(),
                company.getUrl());
    }

    /**
     * Inserts one row into {@code postInfo}; the sixteen values are bound positionally
     * in the order listed below.
     *
     * @param postInfo the job posting to store
     * @return the row count reported by {@code update}
     */
    public int addPostInfo(PostInfo postInfo){
        return update("insert into postInfo values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                postInfo.getPid(),
                postInfo.getJobName(),
                postInfo.getSalary(),
                postInfo.getPostAddres(),
                postInfo.getWorkingExp(),
                postInfo.getEduLevel(),
                postInfo.getEmplType(),
                postInfo.getPeopleNum(),
                postInfo.getWelfare(),
                postInfo.getPositionInfo(),
                postInfo.getCid(),
                postInfo.getPositionURL(),
                postInfo.getUpdateDate(),
                postInfo.getCreateDate(),
                postInfo.getEndDate(),
                postInfo.getIntroduce());
    }
}
POJO类:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
/**
 * Value object for one crawled product.
 *
 * <p>Accessors and constructors are generated by Lombok from the annotations below;
 * {@code @Accessors(chain=true)} makes the generated setters return {@code this}.
 * The declaration order of the fields is the parameter order of the generated
 * all-args constructor, so do not reorder them casually.
 */
// NOTE(review): "serial" is suppressed although the class, as shown, does not implement Serializable - confirm it is needed.
@SuppressWarnings("serial")
@AllArgsConstructor
@NoArgsConstructor
@Data
@Accessors(chain=true)
public class Product {
// Product id (the crawler fills it from the list item's data-sku attribute).
private String pid;
// URL of the product's item page.
private String url;
// Price; primitive, so it can never be null (the crawler stores -1 when no price was obtained).
private double price;
// Brand text taken from the item page.
private String brand;
// Title text taken from the item page.
private String title;
// Product name taken from the item page's parameter list.
private String pname;
}
HttpClientUtils连接池工具类:
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Static helpers that issue GET/POST requests through one pooled HTTP client and
 * return the response body as a UTF-8 string.
 */
public class HttpClientUtils {
    /** Connection pool shared by every request. */
    private static final PoolingHttpClientConnectionManager connectionManager;
    /** Single client bound to the pool; built once instead of once per request. */
    private static final CloseableHttpClient httpClient;
    /**
     * Per-request timeouts:
     * connection-request timeout = max wait for a pooled connection,
     * connect timeout = max time to establish the connection,
     * socket timeout = max wait for data during transfer.
     */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(10 * 1000)
            .build();

    static {
        connectionManager = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool.
        connectionManager.setMaxTotal(20);
        // Maximum number of concurrent connections per route (host).
        connectionManager.setDefaultMaxPerRoute(20);
        httpClient = HttpClients.custom().setConnectionManager(connectionManager).build();
    }

    /** Returns the shared pooled client. */
    private static CloseableHttpClient getCloseableHttpClient(){
        return httpClient;
    }

    /**
     * Executes the request and returns the response body decoded as UTF-8.
     *
     * @param httpRequestBase the prepared request
     * @return the body text, or an empty string when the response carries no entity
     * @throws IOException on connection, timeout or read failure
     */
    private static String execute(HttpRequestBase httpRequestBase) throws IOException {
        httpRequestBase.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36");
        httpRequestBase.setConfig(REQUEST_CONFIG);
        // Closing the response releases its connection on every path, including
        // when reading the body fails; otherwise the pool can run dry.
        try (CloseableHttpResponse response = getCloseableHttpClient().execute(httpRequestBase)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            return EntityUtils.toString(entity, "UTF-8");
        }
    }

    /**
     * Sends a GET request.
     *
     * @param url target URL
     * @return the response body
     * @throws IOException if the request fails
     */
    public static String doGet(String url) throws IOException {
        return execute(new HttpGet(url));
    }

    /**
     * Sends a POST request with the given form fields, URL-encoded as UTF-8.
     *
     * @param url   target URL
     * @param param form fields; {@code null} is treated as no fields
     * @return the response body
     * @throws IOException if the request fails
     */
    public static String doPost(String url, Map<String,String> param) throws IOException {
        HttpPost httpPost = new HttpPost(url);
        List<NameValuePair> fields = new ArrayList<NameValuePair>();
        if (param != null) {
            for (Map.Entry<String, String> entry : param.entrySet()) {
                fields.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        // Explicit charset: the single-argument constructor falls back to ISO-8859-1,
        // which corrupts non-ASCII (e.g. Chinese) form values.
        HttpEntity httpEntity = new UrlEncodedFormEntity(fields, "UTF-8");
        httpPost.setEntity(httpEntity);
        return execute(httpPost);
    }
}