文章目录
文献:https://www.kuangstudy.com/bbs/1354069127022583809
1.数据来源:数据库、mq、爬虫
2.爬虫:获取想要的页面数据
1.导入依赖
jsoup:适合爬取解析网页信息等
tika:适合解析电影、视频、音频等文件的内容与元数据(Apache Tika 是内容提取工具,本身不负责抓取页面)
<!-- jsoup HTML parser. Use 1.15.3 or newer: older releases (including 1.10.2)
     are affected by published advisories such as CVE-2021-37714 and CVE-2022-36033. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>
2.爬取核心部分编码
/**
 * Manual smoke test: runs {@code parseJD("vue")} and prints every returned item.
 *
 * @param args unused
 * @throws Exception propagated from {@code parseJD}
 */
public static void main(String[] args) throws Exception {
    HtmlParseUtil parser = new HtmlParseUtil();
    for (Content item : parser.parseJD("vue")) {
        System.out.println(item);
    }
}
/**
 * Fetches the JD search page for a keyword and extracts the listed products.
 *
 * <p>The keyword is URL-encoded as UTF-8 before it is appended to the query string, so
 * keywords containing spaces, non-ASCII text or reserved characters such as {@code &}
 * are sent as one search term instead of corrupting the URL.
 *
 * @param keywords the search term, e.g. {@code "java"}; must not be null
 * @return one {@link Content} per {@code <li>} found under the element with id
 *     {@code J_goodsList}; an empty list when the page has no such element
 * @throws Exception if the page cannot be fetched or parsed
 */
public List<Content> parseJD(String keywords) throws Exception {
    // Request shape: https://search.jd.com/Search?keyword=java
    String url = "https://search.jd.com/Search?keyword=" + URLEncoder.encode(keywords, "UTF-8");
    // Fetch and parse the page with a 30 second timeout.
    Document document = Jsoup.parse(new URL(url), 30000);
    // Container that holds the product list.
    Element goodsContainer = document.getElementById("J_goodsList");
    if (goodsContainer == null) {
        // getElementById returns null when the id is absent; report "no products"
        // instead of failing with a NullPointerException.
        return new ArrayList<>();
    }
    // Each <li> under the container is treated as one product.
    Elements items = goodsContainer.getElementsByTag("li");
    List<Content> goodsList = new ArrayList<>(items.size());
    for (Element item : items) {
        String img = item.getElementsByTag("img").eq(0).attr("source-data-lazy-img");
        String price = item.getElementsByClass("p-price").eq(0).text();
        String title = item.getElementsByClass("p-name").eq(0).text();
        goodsList.add(new Content().setImg(img).setTitle(title).setPrice(price));
    }
    return goodsList;
}
3.测试解析成功
4.封装对象
package com.bie.pojo;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
/**
 * One product extracted from a search-result page.
 *
 * <p>Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a no-arg constructor and an all-args constructor. Because of
 * {@code @Accessors(chain = true)} every setter returns {@code this}, which allows
 * {@code new Content().setImg(..).setTitle(..).setPrice(..)}.
 *
 * @author bjh
 * @date 2022/12/12
 */
@Accessors(chain = true)
@AllArgsConstructor
@NoArgsConstructor
@Data
public class Content {

    /** Image reference of the product. */
    private String img;

    /** Product title text. */
    private String title;

    /** Price text, kept as a string. */
    private String price;
}
5.引入es配置类
package com.bie.config;
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration that exposes an Elasticsearch {@link RestHighLevelClient} bean.
 *
 * @author bjh
 * @date 2022/12/9
 */
@Configuration
public class ElasticSearchClientConfig {

    /**
     * Builds the client for the single node at {@code 192.168.229.132:9200} over HTTP.
     *
     * @return the client registered as a Spring bean
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost node = new HttpHost("192.168.229.132", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(node));
    }
}
6.将HtmlParseUtil注册到spring
package com.bie.utils;
import com.bie.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
/**
 * Spring-managed helper that scrapes product data from JD search result pages.
 *
 * @author bjh
 * @date 2022/12/12
 */
@Component
public class HtmlParseUtil {

    /**
     * Manual smoke test: runs {@link #parseJD(String)} for "vue" and prints every item.
     *
     * @param args unused
     * @throws Exception propagated from {@link #parseJD(String)}
     */
    public static void main(String[] args) throws Exception {
        new HtmlParseUtil().parseJD("vue").forEach(System.out::println);
    }

    /**
     * Fetches the JD search page for a keyword and extracts the listed products.
     *
     * <p>The keyword is URL-encoded as UTF-8 before it is appended to the query string, so
     * keywords containing spaces, non-ASCII text or reserved characters such as {@code &}
     * are sent as one search term instead of corrupting the URL.
     *
     * @param keywords the search term, e.g. {@code "java"}; must not be null
     * @return one {@link Content} per {@code <li>} found under the element with id
     *     {@code J_goodsList}; an empty list when the page has no such element
     * @throws Exception if the page cannot be fetched or parsed
     */
    public List<Content> parseJD(String keywords) throws Exception {
        // Request shape: https://search.jd.com/Search?keyword=java
        String url = "https://search.jd.com/Search?keyword=" + URLEncoder.encode(keywords, "UTF-8");
        // Fetch and parse the page with a 30 second timeout.
        Document document = Jsoup.parse(new URL(url), 30000);
        // Container that holds the product list.
        Element goodsContainer = document.getElementById("J_goodsList");
        if (goodsContainer == null) {
            // getElementById returns null when the id is absent; report "no products"
            // instead of failing with a NullPointerException.
            return new ArrayList<>();
        }
        // Each <li> under the container is treated as one product.
        Elements items = goodsContainer.getElementsByTag("li");
        List<Content> goodsList = new ArrayList<>(items.size());
        for (Element item : items) {
            String img = item.getElementsByTag("img").eq(0).attr("source-data-lazy-img");
            String price = item.getElementsByClass("p-price").eq(0).text();
            String title = item.getElementsByClass("p-name").eq(0).text();
            goodsList.add(new Content().setImg(img).setTitle(title).setPrice(price));
        }
        return goodsList;
    }
}
7.爬取的数据入es库
1.在es创建索引"jd_goods"
2.编写controller
package com.bie.controller;
import com.bie.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST controller exposing the parse endpoint.
 *
 * @author bjh
 * @date 2022/12/12
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Handles {@code GET /parse/{keyword}} by delegating to
     * {@link ContentService#parseContent(String)}.
     *
     * @param keyword the search term taken from the URL path
     * @return the boolean reported by the service
     * @throws Exception propagated from the service
     */
    @GetMapping(path = "/parse/{keyword}")
    public boolean parse(@PathVariable("keyword") String keyword) throws Exception {
        boolean result = contentService.parseContent(keyword);
        return result;
    }
}
3.编写service
package com.bie.service;
import com.alibaba.fastjson.JSON;
import com.bie.pojo.Content;
import com.bie.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.List;
/**
 * Scrapes products for a keyword and bulk-indexes them into the {@code jd_goods} index.
 *
 * @author bjh
 * @date 2022/12/12
 */
@Service
public class ContentService {

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    // HtmlParseUtil is a Spring @Component, so use the managed bean rather than
    // constructing a new instance on every call.
    @Autowired
    private HtmlParseUtil htmlParseUtil;

    /**
     * Parses the search results for {@code keywords} and stores each item as a JSON
     * document in the {@code jd_goods} index using a single bulk request.
     *
     * @param keywords the search term passed to {@link HtmlParseUtil#parseJD(String)}
     * @return {@code true} when the bulk response reports no failures; {@code false} when
     *     at least one item failed or when nothing was scraped (no request is sent then)
     * @throws Exception if scraping or the Elasticsearch call fails
     */
    public boolean parseContent(String keywords) throws Exception {
        List<Content> contents = htmlParseUtil.parseJD(keywords);
        if (contents.isEmpty()) {
            // A bulk request without any action is rejected by request validation,
            // so there is nothing useful to send.
            return false;
        }

        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout(TimeValue.timeValueSeconds(1));
        for (Content content : contents) {
            bulkRequest.add(new IndexRequest("jd_goods")
                    .source(JSON.toJSONString(content), XContentType.JSON));
        }

        BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulkResponse.hasFailures();
    }
}
4.测试
爬取java相关:
访问:http://localhost:9090/parse/java
爬取vue相关:
访问:http://localhost:9090/parse/vue
8.空白文件夹初始化vue
初始化vue项目
# 1. Create an empty working directory (-p: do not fail if it already exists)
mkdir -p /home/test && cd /home/test
# 2. Initialise npm
npm init
# 3. Install vue
npm install vue
# Download the minified Vue 2.6.11 build and save its content as vue-min.js
curl -fsSL -o vue-min.js https://cdn.staticfile.org/vue/2.6.11/vue.min.js
# 4. Install axios for HTTP communication
npm install axios
# 5. Copy the initialised vue directory into the Spring Boot project