创建一个SpringBoot项目
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.73</version>
</dependency>
<!-- jsoup解析页面 -->
<!-- 解析网页;如需爬取视频、音乐等内容,可以研究 Apache Tika -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- thymeleaf -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
配置文件
server:
  port: 9090
#关闭thymeleaf的缓存
spring:
  thymeleaf:
    cache: false
爬取数据
下面这个依赖可以解析网页;如果想要解析电影、音乐等文件,可以学习 Apache Tika。
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
工具类:
查看id
完善工具类
package com.dongmu.util;
import com.dongmu.pojo.JdGoodsContent;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * Scrapes product listings from the JD.com search result page using Jsoup.
 *
 * <p>Requires network access; every call performs an HTTP request.
 */
@Component
public class HtmlParseUtil {

    /** Timeout, in milliseconds, passed to Jsoup when fetching the search page. */
    private static final int FETCH_TIMEOUT_MILLIS = 30000;

    /**
     * Fetches the JD search page for the given keyword and extracts one
     * {@code JdGoodsContent} (name, image URL, price) per {@code <li>} found inside
     * the element with id {@code J_goodsList}.
     *
     * @param keyword the search keyword; must not be {@code null}
     * @return the extracted goods; empty (never {@code null}) when the page does not
     *         contain the {@code J_goodsList} container
     * @throws IllegalArgumentException if {@code keyword} is {@code null}
     * @throws IOException if the page cannot be fetched or parsed
     */
    public List<JdGoodsContent> parseJd(String keyword) throws IOException {
        if (keyword == null) {
            throw new IllegalArgumentException("keyword must not be null");
        }

        // Percent-encode the keyword so that non-ASCII text (e.g. Chinese), spaces
        // and reserved characters such as '&' still produce a valid query string.
        String url = "https://search.jd.com/Search?keyword="
                + java.net.URLEncoder.encode(keyword, "UTF-8");

        // Fetch and parse the page; the returned Document is the page's DOM.
        Document document = Jsoup.parse(new URL(url), FETCH_TIMEOUT_MILLIS);

        List<JdGoodsContent> list = new ArrayList<>();

        // getElementById returns null when the id is absent, so guard before
        // dereferencing instead of failing with a NullPointerException.
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            return list;
        }

        // Each <li> under the container is treated as one product entry.
        for (Element item : goodsList.getElementsByTag("li")) {
            // Images on this page are lazily loaded, so the real address is read from
            // the "data-lazy-img" attribute of the first <img> rather than from "src".
            String imgUrl = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String goodPrice = item.getElementsByClass("p-price").eq(0).text();
            String goodName = item.getElementsByClass("p-name").eq(0).text();
            list.add(new JdGoodsContent(goodName, imgUrl, goodPrice));
        }
        return list;
    }
}
编写业务
service类
package com.dongmu.service;
import com.alibaba.fastjson.JSON;
import com.dongmu.pojo.JdGoodsContent;
import com.dongmu.util.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.List;
/**
 * Business service that scrapes JD goods and stores them in Elasticsearch.
 */
@Service
public class JdGoodsService {

    /** Elasticsearch index that receives the scraped goods. */
    private static final String INDEX_NAME = "jd_goods";

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Scrapes the goods matching {@code keyword} and bulk-indexes them into the
     * {@code jd_goods} index, one JSON document per item.
     *
     * @param keyword the search keyword forwarded to {@code HtmlParseUtil.parseJd}
     * @return {@code true} if the bulk request reported no failures; {@code false} if
     *         any item failed or if nothing was scraped
     * @throws IOException if scraping or the Elasticsearch call fails
     */
    public boolean parseGoods(String keyword) throws IOException {
        List<JdGoodsContent> list = new HtmlParseUtil().parseJd(keyword);

        // A BulkRequest without any actions is rejected by client-side validation
        // ("no requests added"), so bail out before building one.
        if (list == null || list.isEmpty()) {
            return false;
        }

        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2m");
        for (JdGoodsContent jdGoodsContent : list) {
            // No explicit document id is set on the IndexRequest.
            bulkRequest.add(new IndexRequest(INDEX_NAME)
                    .source(JSON.toJSONString(jdGoodsContent), XContentType.JSON));
        }

        BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }
}
把业务服务改成高亮的版本
package com.dongmu.service;
import com.alibaba.fastjson.JSON;
import com.dongmu.pojo.JdGoodsContent;
import com.dongmu.util.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
 * Business service that scrapes JD goods into Elasticsearch and offers a paged,
 * highlighted search over them.
 */
@Service
public class JdGoodsService {

    /** Elasticsearch index that stores the scraped goods. */
    private static final String INDEX_NAME = "jd_goods";

    /** Document field that is queried and highlighted. */
    private static final String TITLE_FIELD = "title";

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Scrapes the goods matching {@code keyword} and bulk-indexes them into the
     * {@code jd_goods} index, one JSON document per item.
     *
     * @param keyword the search keyword forwarded to {@code HtmlParseUtil.parseJd}
     * @return {@code true} if the bulk request reported no failures; {@code false} if
     *         any item failed or if nothing was scraped
     * @throws IOException if scraping or the Elasticsearch call fails
     */
    public boolean parseGoods(String keyword) throws IOException {
        List<JdGoodsContent> list = new HtmlParseUtil().parseJd(keyword);

        // A BulkRequest without any actions is rejected by client-side validation
        // ("no requests added"), so bail out before building one.
        if (list == null || list.isEmpty()) {
            return false;
        }

        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2m");
        for (JdGoodsContent jdGoodsContent : list) {
            bulkRequest.add(new IndexRequest(INDEX_NAME)
                    .source(JSON.toJSONString(jdGoodsContent), XContentType.JSON));
        }

        BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }

    /**
     * Runs a term query on the {@code title} field and returns one page of hits,
     * with the title replaced by its highlighted version where a highlight exists.
     *
     * @param keyword  the exact term to look up in {@code title}
     * @param pageNo   1-based page number; values below 1 are treated as 1
     * @param pageSize number of hits per page
     * @return the source map of every hit on the requested page, in result order
     * @throws IOException if the Elasticsearch call fails
     */
    public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize)
            throws IOException {
        if (pageNo <= 1) {
            pageNo = 1;
        }

        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();

        // Paging: "from" is the offset of the first hit, not a page number, so the
        // 1-based page number has to be converted (page 1 -> offset 0).
        searchSourceBuilder.from((pageNo - 1) * pageSize);
        searchSourceBuilder.size(pageSize);

        // Exact (term) match on the title field.
        searchSourceBuilder.query(QueryBuilders.termQuery(TITLE_FIELD, keyword));

        // Wrap matched terms of the title in a red <span>.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field(TITLE_FIELD);
        highlightBuilder.preTags("<span style='color:red'>");
        highlightBuilder.postTags("</span>");
        searchSourceBuilder.highlighter(highlightBuilder);

        // Upper bound on how long the search may run.
        searchSourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        SearchRequest searchRequest = new SearchRequest(INDEX_NAME);
        searchRequest.source(searchSourceBuilder);
        SearchResponse search = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

        SearchHits hits = search.getHits();
        List<Map<String, Object>> list = new ArrayList<>();
        for (SearchHit hit : hits) {
            // Fetch the source map once and mutate that same instance, instead of
            // relying on repeated getSourceAsMap() calls returning the same object.
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();

            HighlightField title = hit.getHighlightFields().get(TITLE_FIELD);
            if (title != null) {
                // Concatenate all highlighted fragments into the replacement title.
                StringBuilder newTitle = new StringBuilder();
                for (Text fragment : title.fragments()) {
                    newTitle.append(fragment.string());
                }
                sourceAsMap.put(TITLE_FIELD, newTitle.toString());
            }
            list.add(sourceAsMap);
        }
        return list;
    }
}