<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
<scope>test</scope>
</dependency>
<!-- springBoot 测试依赖-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
</dependency>
<!-- hutool -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.11</version>
</dependency>
<!-- 分页插件 -->
<dependency>
<groupId>com.github.pagehelper</groupId>
<artifactId>pagehelper-spring-boot-starter</artifactId>
<version>1.4.1</version>
</dependency>
</dependencies>
XML
* application.yml
# Embedded web server settings
server:
  port: 8989
spring:
  # JDBC connection used by the persistence layer
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/eee?serverTimezone=UTC&useUnicode=true&characterEncoding=utf-8
    username: root
    password: root
  # Elasticsearch REST client endpoint
  elasticsearch:
    rest:
      uris: 192.168.126.133:9200
# MyBatis-Plus integration
mybatis-plus:
  mapper-locations: classpath*:mapper/*.xml
  configuration:
    # Print executed SQL to standard output
    log-impl: org.apache.ibatis.logging.stdout.StdOutImpl
YAML
##### 4.2、爬取房源信息
* 编写测试类,要爬取的网址:[北京租房信息|北京租房租金\_价格\_房价|房产网-安居客租房网](https://bj.zu.anjuke.com/)
package com.llh;
import com.llh.domain.WebHouse;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
-
User: sunjunfu
-
DateTime: 2023/9/23 8:54
*/
public class House implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);@Override
public void process(Page page) {
// System.err.println(page.getHtml());// 标题 List<String> titles = page.getHtml().xpath("//div[@class='zu-info']/h3/a/b/text()").all(); // 价格 List<String> prices = page.getHtml().xpath("//div[@class='zu-side']/p/strong/b/text()").all(); // 中间人 List<String> brokers = page.getHtml().xpath("//p[@class='details-item tag']/text()").all(); // 规格 List<String> b1s = page.getHtml().xpath("//p[@class='details-item tag']/b[1]/text()").all(); List<String> b2s = page.getHtml().xpath("//p[@class='details-item tag']/b[2]/text()").all(); System.err.println(titles); System.err.println(prices); System.err.println(brokers); System.err.println(b1s); System.err.println(b2s); List<WebHouse> list = new ArrayList<>(); for (int i = 0; i < titles.size(); i++) { String title = titles.get(i); String price = prices.get(i); // 拼接规格 String b1 = b1s.get(i); String b2 = b2s.get(i); String area = b1 + "室" + b2 + "厅"; // 去掉空格再截取,或者中间人 String str = brokers.get(i); String trim = str.trim(); String broker = trim.split(" ")[1]; // 给对象赋值 WebHouse house = new WebHouse(); house.setTitle(title); house.setPrice(price); house.setArea(area); house.setBroker(broker); // 添加到集合 list.add(house); } System.out.println(list); page.putField("house", list);
}
@Override
public Site getSite() {
return site;
}public static void main(String[] args) {
Spider.create(new House())
.addUrl(“https://bj.zu.anjuke.com/?t=1&from=0&comm_exist=on&kw=3%E5%B1%85%E5%AE%A4”)
.addPipeline(new Pipeline() {
@Override
public void process(ResultItems resultItems, Task task) {
WebHouse house = resultItems.get(“house”);
}
})
.thread(5)
.run();
}
}
Java
##### 4.3、同步到es
* 同步之前,需要在实体类加上 `@Document` 注解,以及要查询的字段指定分词器和类型 [![house](https://img-blog.csdnimg.cn/img_convert/808e0b0bae975ae71d1a1626f8dd875b.png)](https://assets.llhnp.com/usr/images/web_house/house.png)
* 使用es提供的同步接口
package com.llh.repository;
import com.llh.domain.WebHouse;
import org.springframework.data.repository.CrudRepository;
import org.springframework.stereotype.Repository;
/**
 * Spring Data repository for {@link WebHouse} entities identified by an {@code Integer} id.
 * It declares no methods of its own and relies on the operations inherited from
 * {@link CrudRepository}.
 *
 * <p>User: sunjunfu
 * <p>DateTime: 2023/9/23 9:56
 */
@Repository
public interface WebHouseRepository extends CrudRepository<WebHouse, Integer> {
}
Java
* 在测试类调用爬取的数据同步到es
@Test
public void test3(){
Spider.create(new House())
.addUrl(“https://bj.zu.anjuke.com/?t=1&from=0&comm_exist=on&kw=3%E5%B1%85%E5%AE%A4”)
.addPipeline(new Pipeline() {
@Override
public void process(ResultItems resultItems, Task task) {
List house = resultItems.get(“house”);
// 存入数据库
webHouseService.saveBatch(house);
// 全量同步到es
webHouseRepository.saveAll(house);
}
})
.thread(5)
.run();
}
Java
##### 4.4、编写测试接口
* WebHouseController.java
package com.llh.controller;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.llh.domain.WebHouse;
import com.llh.form.HouseForm;
import com.llh.repository.WebHouseRepository;
import com.llh.service.WebHouseService;
import com.llh.utils.Result;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.elasticsearch.core.ElasticsearchRestTemplate;
import org.springframework.data.elasticsearch.core.SearchHit;
import org.springframework.data.elasticsearch.core.SearchHits;
import org.springframework.data.elasticsearch.core.query.NativeSearchQuery;
import org.springframework.data.elasticsearch.core.query.NativeSearchQueryBuilder;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import javax.annotation.PostConstruct;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
-
User: sunjunfu
-
DateTime: 2023/9/23 10:19
*/
@RestController
@RequestMapping(“/house”)
public class WebHouseController {
@Autowired
WebHouseService webHouseService;@Autowired
WebHouseRepository webHouseRepository;@Autowired
ElasticsearchRestTemplate elasticsearchRestTemplate;// 初始化 同步到es
@PostConstruct
public void init(){
List houses = webHouseService.list();
webHouseRepository.saveAll(houses);
}@RequestMapping(“/list”)
public Result list(WebHouse house){// 高亮 HighlightBuilder.Field[] highFiles = new HighlightBuilder.Field[1]; // 将title字段高亮显示 highFiles[0] = new HighlightBuilder.Field("title") .preTags("<font style='color: red'>") .postTags("</font>"); // 分页( es 中 0 代表第一页) Pageable page = PageRequest.of(house.getPageNum()-1,house.getPageSize()); // 组合查询 BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery(); if(StrUtil.isNotBlank(house.getTitle())){ // 模糊查询 MatchQueryBuilder matchQuery = QueryBuilders.matchQuery("title", house.getTitle()); // 同时满足条件 boolQueryBuilder.must(matchQuery); } // 构建查询条件 NativeSearchQuery query = new NativeSearchQueryBuilder() .withQuery(boolQueryBuilder) .withPageable(page) .withHighlightFields(highFiles) .build(); // 查询数据 SearchHits<WebHouse> search = elasticsearchRestTemplate.search(query, WebHouse.class); // 获取总条数 long totalHits = search.getTotalHits(); List<WebHouse> list = new ArrayList<>(); for (SearchHit<WebHouse> houseSearchHit : search) { // 获取数据对象 WebHouse webHouse = houseSearchHit.getContent(); // 获取高亮的属性 Map<String, List<String>> highlightFields = houseSearchHit.getHighlightFields(); // 获取title字段的数据 List<String> title = highlightFields.get("title"); if(CollUtil.isNotEmpty(title)){ // 将高亮后的数据覆盖 webHouse.setTitle(title.get(0)); } // 存入集合 list.add(webHouse); } // 将总条数和数据封装到对象 HouseForm houseForm = new HouseForm(totalHits, list); return Result.success(houseForm);
}
}
Java
### 5、前端页面
##### 5.1、编写展示页面
* ListView.vue
自我介绍一下,小编13年上海交大毕业,曾经在小公司待过,也去过华为、OPPO等大厂,18年进入阿里一直到现在。
深知大多数大数据工程师,想要提升技能,往往是自己摸索成长或者是报班学习,但对于培训机构动辄几千的学费,着实压力不小。自己不成体系的自学效果低效又漫长,而且极易碰到天花板技术停滞不前!
因此收集整理了一份《2024年大数据全套学习资料》,初衷也很简单,就是希望能够帮助到想自学提升又不知道该从何学起的朋友。
既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,基本涵盖了95%以上大数据开发知识点,真正体系化!
由于文件比较大,这里只是将部分目录大纲截图出来,每个节点里面都包含大厂面经、学习笔记、源码讲义、实战项目、讲解视频,并且后续会持续更新
如果你觉得这些内容对你有帮助,可以添加VX:vip204888 (备注大数据获取)
[外链图片转存中…(img-zMDv7ETe-1712830432388)]
[外链图片转存中…(img-7dzW6m46-1712830432388)]
[外链图片转存中…(img-6F69DWoe-1712830432388)]
[外链图片转存中…(img-kVWDJdnB-1712830432389)]
既有适合小白学习的零基础资料,也有适合3年以上经验的小伙伴深入学习提升的进阶课程,基本涵盖了95%以上大数据开发知识点,真正体系化!
由于文件比较大,这里只是将部分目录大纲截图出来,每个节点里面都包含大厂面经、学习笔记、源码讲义、实战项目、讲解视频,并且后续会持续更新
如果你觉得这些内容对你有帮助,可以添加VX:vip204888 (备注大数据获取)
[外链图片转存中…(img-zTlj5fmm-1712830432389)]