前言:
为什么我们要爬取数据
在大数据时代,想要获取更多数据,就需要进行数据的挖掘、分析和筛选。比如做项目时需要大量真实数据,就可以到某些网站上进行爬取。有些网站的数据爬取并保存到数据库后还不能直接使用,需要先经过清洗、过滤才能使用;要知道,有些数据是非常珍贵的。
一: 引入pom文件依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
二:新建一个测试类
/**
 * Minimal HttpClient example: sends a GET request without parameters and
 * prints the response body to standard output.
 */
public class Demo {
    /**
     * Requests https://www.baidu.com and prints the body when the status code is 200.
     *
     * @param args unused
     * @throws Exception if executing the request or reading the response fails
     */
    public static void main(String[] args) throws Exception {
        // Build the GET request for the target address.
        HttpGet httpGet = new HttpGet("https://www.baidu.com");
        // Create the client and execute the request. try-with-resources closes the
        // response first and then the client, even when an exception is thrown.
        try (CloseableHttpClient http = HttpClients.createDefault();
             CloseableHttpResponse response = http.execute(httpGet)) {
            // Only read the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String s = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(s);
            }
        }
    }
}
三 查看控制台
上面我们讲的是get无参数请求,那么带参数的get请求又是什么样的呢?如下所示:
/**
 * Sends a GET request with query parameters and prints the response body
 * when the status code is 200.
 *
 * @param args unused
 * @throws Exception if the URI is invalid, or executing the request or reading the response fails
 */
public static void main(String[] args) throws Exception {
    // Build the request URI; chain further setParameter calls to add more parameters, e.g.
    // uriBuilder.setParameter("key","xx").setParameter("key","xx")
    URIBuilder uriBuilder = new URIBuilder("https://www.baidu.com");
    uriBuilder.setParameter("key", "xx");
    HttpGet httpGet = new HttpGet(uriBuilder.build());
    // Create the client and execute the request. try-with-resources closes the
    // response first and then the client, even when an exception is thrown.
    try (CloseableHttpClient http = HttpClients.createDefault();
         CloseableHttpResponse response = http.execute(httpGet)) {
        // Only read the body of a successful response.
        if (response.getStatusLine().getStatusCode() == 200) {
            String s = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println(s);
        }
    }
}
post无参数请求和get无参数请求是类似的,这里我就不写了,要改的地方就是把HttpGet换成HttpPost。
接下来我们看下post带参数请求,如下:
/**
 * Sends a POST request with url-encoded form parameters and prints the
 * response body when the status code is 200.
 *
 * @param args unused
 * @throws Exception if encoding the form, executing the request or reading the response fails
 */
public static void main(String[] args) throws Exception {
    // Collect the form parameters and wrap them in a url-encoded entity.
    List<NameValuePair> list = new ArrayList<NameValuePair>();
    list.add(new BasicNameValuePair("key", "value"));
    UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(list, "utf-8");
    // Attach the form entity to the POST request.
    HttpPost httpPost = new HttpPost("https://www.baidu.com");
    httpPost.setEntity(urlEncodedFormEntity);
    // Create the client and execute the request. try-with-resources closes the
    // response first and then the client, even when an exception is thrown.
    try (CloseableHttpClient http = HttpClients.createDefault();
         CloseableHttpResponse execute = http.execute(httpPost)) {
        // Only read the body of a successful response.
        if (execute.getStatusLine().getStatusCode() == 200) {
            String s = EntityUtils.toString(execute.getEntity(), "utf-8");
            System.out.println(s);
        }
    }
}
为了解析数据,我们引入了Jsoup(以及commons-io),依赖如下:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
新建一个工具类HttpUtils
package com.server.cloud.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
/**
 * HTTP helper built on a pooled Apache HttpClient connection manager.
 * Offers one method that downloads a page as text and one that downloads
 * an image to a local directory.
 */
@Component
public class HttpUtils {
    /** Directory that downloaded images are written to. */
    private static final String IMAGE_DIR = "C:\\Users\\Administrator\\Desktop\\image4";

    /** Connection pool shared by every request issued through this helper. */
    private final PoolingHttpClientConnectionManager cm;

    /** Creates the connection pool and sets its connection limits. */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in total and per route.
        this.cm.setMaxTotal(10000);
        this.cm.setDefaultMaxPerRoute(10000);
    }

    /**
     * Issues a GET request and returns the response body decoded as UTF-8.
     *
     * @param url address to load with a GET request
     * @return the response body, or an empty string when the status code is not 200,
     *         the response has no entity, or an I/O error occurs
     */
    public String doget(String url) {
        // The client is deliberately not closed: it borrows connections from the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        // try-with-resources always closes the response, and never touches a null
        // response when execute() itself fails.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads an image and stores it in the image directory under a random
     * UUID-based file name that keeps the extension found in the URL.
     *
     * @param url address of the image
     * @return the generated image file name, or an empty string when the status code is
     *         not 200, the response has no entity, or an I/O error occurs
     */
    public String doGetImage(String url) {
        // The client is deliberately not closed: it borrows connections from the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(this.getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                // Random file name plus the extension taken from the URL.
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                File directory = new File(IMAGE_DIR);
                // Create the target directory if needed; a failure surfaces as an
                // IOException when the file is opened below.
                directory.mkdirs();
                // Close the file stream even when writing fails.
                try (OutputStream outputStream = new FileOutputStream(new File(directory, picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Returns the file extension of the last path segment of {@code url}, including the
     * leading dot, or an empty string when that segment contains no dot.
     */
    private static String extensionOf(String url) {
        int dot = url.lastIndexOf('.');
        int slash = url.lastIndexOf('/');
        // A dot before the last slash belongs to the host or a directory, not the file name.
        return dot > slash ? url.substring(dot) : "";
    }

    /** Builds the per-request timeout configuration (values in milliseconds). */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(1000)
                .build();
    }
}
新建task类 这里我们是定时跑的 所以叫Task
package com.server.cloud.Task;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.server.cloud.enetity.Item;
import com.server.cloud.service.ItemService;
import com.server.cloud.utils.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.List;
/**
 * Scheduled crawler task: downloads search result pages, extracts the
 * products listed on them and stores every product that is not saved yet.
 */
@Component
public class TestTask {
    /** Shared JSON parser used to read the price response. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Search URL up to and including {@code page=}; the page number is inserted after it. */
    private static final String SEARCH_URL_PREFIX = "https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=shouj&ev=exbrand_%E5%8D%8E%E4%B8%BA%EF%BC%88HUAWEI%EF%BC%89%5E&page=";

    /** Remainder of the search URL that follows the page number. */
    private static final String SEARCH_URL_SUFFIX = "&s=1&click=0";

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ItemService itemService;

    /**
     * Runs on the configured cron schedule, then downloads and parses search pages 1 to 9.
     *
     * @throws Exception if parsing a page or saving an item fails
     */
    @Scheduled(cron = "0 */2 * * * ?")
    public void itemTask() throws Exception {
        // NOTE(review): assumes the site numbers its result pages consecutively - confirm.
        for (int page = 1; page < 10; page++) {
            String html = httpUtils.doget(SEARCH_URL_PREFIX + page + SEARCH_URL_SUFFIX);
            // Parse the page and store its items.
            this.parse(html);
        }
    }

    /**
     * Extracts every sku from a search result page and saves those that are not stored yet.
     *
     * @param html markup of a search result page
     * @throws Exception if reading the price response or saving an item fails
     */
    private void parse(String html) throws Exception {
        Document document = Jsoup.parse(html);
        Elements spuElements = document.select("div#J_goodsList > ul > li");
        for (Element spuElement : spuElements) {
            String spu = spuElement.attr("data-spu");
            Elements skuElements = spuElement.select("li.ps-item");
            for (Element skuElement : skuElements) {
                String sku = skuElement.select("[data-sku]").attr("data-sku");
                // Skip skus that are already stored.
                List<Item> stored = itemService.selectAll(sku);
                if (!stored.isEmpty()) {
                    continue;
                }
                Item item = new Item();
                item.setSku(sku);
                item.setSpu(spu);
                // Detail page address of this sku.
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);
                // Download the picture; the name stays empty when the element has no image.
                Element image = skuElement.select("img[data-sku]").first();
                String pic = "";
                if (image != null) {
                    String src = "https://" + image.attr("data-lazy-img");
                    // Strings are immutable, so the result of replace must be kept.
                    src = src.replace("/n9/", "/n1/");
                    pic = this.httpUtils.doGetImage(src);
                }
                item.setPic(pic);
                // NOTE(review): this address is a fixed sample page plus the sku and is
                // unlikely to return the JSON that readTree expects; the correct price
                // endpoint is not visible in this file - confirm and replace.
                String price = this.httpUtils.doget("https://item.jd.com/100002071812.html" + sku);
                String p = String.valueOf(MAPPER.readTree(price).get(0).get("p"));
                item.setPrice(p);
                // The title is read from the detail page.
                String info = this.httpUtils.doget(itemUrl);
                String title = Jsoup.parse(info).select("div.sku-name").text();
                item.setTitle(title);
                // Save the item.
                this.itemService.insert(item);
            }
        }
    }
}
如有不足之处请指出来 一起共同学习