其他链接:
java爬虫入门jsoup入门(简单示例,五分钟)
https://blog.csdn.net/qq_33745371/article/details/109064543
文章目录
前言
这次用 jsoup 爬取电影天堂的数据,并通过 MyBatis-Plus 将结果存储到 MySQL 数据库中。
网址:https://www.bt-tt.com/
1.创建 Spring Boot 项目(使用 Spring Initializr 快速创建)
添加依赖
引入 Spring Boot Starter 父工程
<!-- Inherit from the Spring Boot starter parent POM, version 2.3.5.RELEASE. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.3.5.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
其他依赖
<dependencies>
    <!-- Spring MVC / embedded web server -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- MySQL JDBC driver, only needed at runtime -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <scope>runtime</scope>
    </dependency>
    <!-- Spring Boot test support; the JUnit 4 vintage engine is excluded -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
        <exclusions>
            <exclusion>
                <groupId>org.junit.vintage</groupId>
                <artifactId>junit-vintage-engine</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- MyBatis-Plus Spring Boot starter -->
    <dependency>
        <groupId>com.baomidou</groupId>
        <artifactId>mybatis-plus-boot-starter</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- Code generator; kept on the same version as the starter so that
         the MyBatis-Plus modules on the classpath are not mixed -->
    <dependency>
        <groupId>com.baomidou</groupId>
        <artifactId>mybatis-plus-generator</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- Velocity template engine (used by the code generator) -->
    <dependency>
        <groupId>org.apache.velocity</groupId>
        <artifactId>velocity-engine-core</artifactId>
        <version>2.2</version>
    </dependency>
    <!-- Lombok annotations (@Data, @Slf4j, ...) -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.16.10</version>
    </dependency>
    <!-- Druid connection pool -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>druid</artifactId>
        <version>1.1.6</version>
    </dependency>
    <!-- JUnit 4 API -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <!-- jsoup HTML parser used by the crawler -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.2</version>
    </dependency>
</dependencies>
2.配置文件、数据库建表
配置文件-----application.yml
# HTTP port of the embedded server
server:
  port: 8080
spring:
  datasource:
    # Use the Druid connection pool as the DataSource implementation
    type: com.alibaba.druid.pool.DruidDataSource
    url: jdbc:mysql://localhost:3306/dytt?useUnicode=true&characterEncoding=UTF-8&serverTimezone=GMT%2B8
    username: root
    password: 123456
# Logging configuration: print MyBatis SQL statements to standard output
mybatis-plus:
  configuration:
    log-impl: org.apache.ibatis.logging.stdout.StdOutImpl
mysql建表语句
-- Schema for the `movie` table that stores one row per crawled movie.
-- NOTE: this script drops any existing `movie` table before recreating it.
SET NAMES utf8mb4;
-- Foreign key checks are switched off while the table is dropped and recreated.
SET FOREIGN_KEY_CHECKS = 0;
DROP TABLE IF EXISTS `movie`;
-- Column names are camelCase and match the @TableField values on the Movie entity.
CREATE TABLE `movie` (
`id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'id',
`movieName` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '电影名称',
`movieUrl` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '电影地址',
-- Download links and their labels are stored as ';'-joined text.
`downloadUrl` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '下载链接',
`downloadDetails` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '下载详情',
`movieType` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '视频类型',
`gmtCreate` datetime(0) NULL DEFAULT NULL COMMENT '创建时间',
`gmtModify` datetime(0) NULL DEFAULT NULL COMMENT '修改时间',
-- Version number column (mapped with @Version on the entity).
`attributeCc` bigint(20) NULL DEFAULT 0 COMMENT '版本号',
-- Logical-delete flag (mapped with @TableLogic on the entity).
`deleted` int(1) NULL DEFAULT 0 COMMENT '删除标记',
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 8181 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- Restore foreign key checks.
SET FOREIGN_KEY_CHECKS = 1;
3.配置类、策略类、实体类、mapper类
配置类
import com.baomidou.mybatisplus.extension.plugins.MybatisPlusInterceptor;
import com.baomidou.mybatisplus.extension.plugins.OptimisticLockerInterceptor;
import com.baomidou.mybatisplus.extension.plugins.inner.PaginationInnerInterceptor;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.transaction.annotation.EnableTransactionManagement;
/**
 * MyBatis-Plus configuration: enables transaction management, scans the
 * mapper package and registers the optimistic-lock and pagination plugins.
 */
@EnableTransactionManagement // transaction support
@MapperScan("com.qwl.dytt.mapper") // mapper package to scan
@Configuration
public class MybatisPlusConfig {

    /**
     * Registers the optimistic-lock plugin.
     *
     * @return a new optimistic-lock interceptor
     */
    @Bean
    public OptimisticLockerInterceptor optimisticLockerInterceptor() {
        return new OptimisticLockerInterceptor();
    }

    /**
     * Creates the pagination inner interceptor. It only takes effect once it
     * has been added to a {@link MybatisPlusInterceptor}, which
     * {@link #mybatisPlusInterceptor(PaginationInnerInterceptor)} does.
     *
     * @return a new pagination inner interceptor
     */
    @Bean
    public PaginationInnerInterceptor paginationInnerInterceptor() {
        return new PaginationInnerInterceptor();
    }

    /**
     * Wraps the pagination inner interceptor in a {@link MybatisPlusInterceptor}.
     * An inner interceptor exposed as a bean on its own is not a MyBatis
     * interceptor, so without this wrapper pagination would not be applied.
     *
     * @param paginationInnerInterceptor the pagination inner interceptor bean
     * @return the MyBatis-Plus interceptor carrying the pagination plugin
     */
    @Bean
    public MybatisPlusInterceptor mybatisPlusInterceptor(
            PaginationInnerInterceptor paginationInnerInterceptor) {
        MybatisPlusInterceptor interceptor = new MybatisPlusInterceptor();
        interceptor.addInnerInterceptor(paginationInnerInterceptor);
        return interceptor;
    }
}
策略类
import com.baomidou.mybatisplus.core.handlers.MetaObjectHandler;
import lombok.extern.slf4j.Slf4j;
import org.apache.ibatis.reflection.MetaObject;
import org.springframework.stereotype.Component;
import java.util.Date;
/**
 * Auto-fill handler that populates the {@code gmtCreate} and {@code gmtModify}
 * properties of entities on insert and update.
 */
@Slf4j
@Component
public class MyMetaObjectHandler implements MetaObjectHandler {

    /**
     * Fill strategy for inserts: sets both the creation and the modification
     * time to the same instant.
     *
     * @param metaObject meta object of the entity being inserted
     */
    @Override
    public void insertFill(MetaObject metaObject) {
        log.info("start insert fill....");
        // Read the clock once so both columns get exactly the same timestamp.
        long now = System.currentTimeMillis();
        this.setFieldValByName("gmtCreate", new Date(now), metaObject);
        this.setFieldValByName("gmtModify", new Date(now), metaObject);
    }

    /**
     * Fill strategy for updates: refreshes the modification time.
     *
     * @param metaObject meta object of the entity being updated
     */
    @Override
    public void updateFill(MetaObject metaObject) {
        log.info("start update fill....");
        // The entity property is "gmtModify"; the previous name "updateTime"
        // does not exist on the Movie entity, so nothing was being filled.
        this.setFieldValByName("gmtModify", new Date(), metaObject);
    }
}
实体类
import com.baomidou.mybatisplus.annotation.*;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.Date;
/**
 * Entity mapped to the {@code movie} table; one instance per crawled movie.
 * Column names are camelCase, so every mapped field names its column explicitly.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Movie {

    /** Primary key, generated by the database (auto increment). */
    @TableId(type = IdType.AUTO)
    private Long id;

    /** Movie title. */
    @TableField("movieName")
    private String movieName;

    /** URL of the movie's detail page. */
    @TableField("movieUrl")
    private String movieUrl;

    /** Download links. */
    @TableField("downloadUrl")
    private String downloadUrl;

    /** Download link details. */
    @TableField("downloadDetails")
    private String downloadDetails;

    /** Video category. */
    @TableField("movieType")
    private String movieType;

    /** Version number. */
    @Version
    @TableField("attributeCc")
    private Long attributeCc;

    /** Logical-delete flag. */
    @TableLogic
    private Integer deleted;

    /** Creation time, filled on insert. */
    @TableField(value = "gmtCreate", fill = FieldFill.INSERT)
    private Date gmtCreate;

    /** Modification time, filled on insert and update. */
    @TableField(value = "gmtModify", fill = FieldFill.INSERT_UPDATE)
    private Date gmtModify;
}
mapper类
/**
 * Mapper for {@link Movie}; declares no methods of its own and relies on
 * what {@code BaseMapper} provides.
 */
public interface MovieMapper extends BaseMapper<Movie> {}
4.爬虫代码
@Slf4j
@SpringBootTest
class DyttApplicationTests {
@Resource
private MovieMapper movieMapper;
@Test
void contextLoads() throws IOException {
String movieType = "欧美电影";//选择电影类型,map中的都可以选择,"欧美电影"
HashMap<String, String> map = new HashMap<>();
map.put("欧美电影", "oumeidianying");
map.put("日韩电影", "rihandianying");
map.put("港台电影", "gangtaidianying");
map.put("大陆电影", "daludianying");
map.put("海外电影", "haiwaidianying");
map.put("欧美剧", "oumeiju");
map.put("日韩剧", "rihanju");
map.put("港台剧", "gangtaiju");
map.put("国产剧", "guochanju");
map.put("海外剧", "haiwaiju");//海外剧就几条数据,还出现了问题,你们可以修改一下代码试试,不难的,练练手吧
map.put("动漫", "dongman");
map.put("综艺", "zongyi");
map.put("纪录片", "jilupian");
String movieTypeUrl = map.get(movieType);
String url = "https://www.bt-tt.com/html/" + movieTypeUrl + "/";//https://www.bt-tt.com/html/为电影天堂网站首页
Document document = Jsoup.parse(new URL(url), 30000);//获取到了document对象
int sameCount = 0;//重复条数,如果15条,则停止遍历
//获取页数,第一页右上角有显示总页数
Integer pageSize = 1;
try {
Element page = document.getElementsByClass("page").get(0);
String pageSizeStr = page.getElementsByTag("strong").get(0).text();
pageSize = Integer.valueOf(pageSizeStr);
} catch (Throwable throwable) {
throwable.printStackTrace();
return;
}
log.info(movieType+"页数:"+pageSize);
for (int i = 1; i <= pageSize; i++) {//多页
if (i == 1) {
url = "https://www.bt-tt.com/html/" + movieTypeUrl + "/";
} else {
url = "https://www.bt-tt.com/html/" + movieTypeUrl + "/index_" + i + ".html";
}
Document doc = Jsoup.parse(new URL(url), 30000);
Element list = doc.getElementById("list");
Elements dlList = list.getElementsByTag("dl");
for (int j = 1; j < dlList.size(); j++) {//每一页的多个电影
Element dl = dlList.get(j);
Element dd = dl.getElementsByTag("dd").get(0);
Element strong = dd.getElementsByTag("strong").get(0);
Element a = strong.getElementsByTag("a").get(0);
String movieUrl = a.attr("href");
Elements fonts = a.getElementsByTag("font");
String movieName = "";
if(fonts != null && fonts.size() != 0){//大陆电影 72页 空天猎 和其他的不同,所以这里有个判断
Element font = fonts.get(0);
movieName = font.text();
}else{//空天猎
movieName = a.getElementsByTag("strong").get(0).text();
}
String u = "https://www.bt-tt.com" + movieUrl;
Document document1 = Jsoup.parse(new URL(u), 30000);
String downloadUrl = "";
String downloadDetails = "";
Elements tbodyList = document1.getElementsByTag("tbody");
for (int k=0;k<tbodyList.size();k++){//有多个下载地址
Element tbody = tbodyList.get(k);
Elements trList = tbody.getElementsByTag("tr");
for (int x=0;x<trList.size();x++){//有多个下载链接,用;拼接起来
Element tr = trList.get(x);
Element a1 = tr.getElementsByTag("a").get(0);
String href = a1.attr("href");
String downloadDetail = a1.text();
if(k==0 && x==0){
downloadUrl = href;
downloadDetails = downloadDetail;
}else{
downloadUrl = downloadUrl + ";"+href;
downloadDetails = downloadDetails + ";" + downloadDetail;
}
}
}
// System.out.println("===========");
// System.out.println(movieUrl);
// System.out.println(movieName);
// System.out.println(downloadUrl);
QueryWrapper<Movie> wrapper = new QueryWrapper<>();
wrapper
.eq("movieName", movieName)
.eq("movieUrl", "https://www.bt-tt.com" + movieUrl)
.eq("downloadUrl", downloadUrl)
.eq("downloadDetails", downloadDetails)
.eq("movieType", movieType);
Movie movieDetail = movieMapper.selectOne(wrapper);
if (movieDetail == null) {
Movie movie = new Movie();
movie.setMovieName(movieName);
movie.setMovieUrl("https://www.bt-tt.com" + movieUrl);
movie.setDownloadUrl(downloadUrl);
movie.setDownloadDetails(downloadDetails);
movie.setMovieType(movieType);
movieMapper.insert(movie);
} else {
sameCount++;
if (sameCount >= 15) {
log.info("重复15条=========结束========");
return;
}
}
}
}
log.info(movieType+"=========遍历结束========");
}
}
讲解
题外话:非常中意!
注意:
1.可能下载地址里不止一个下载链接。
2.可能不止一个"下载地址"。
(以上两点是我在运行代码出现报错后才注意到的。)
5.项目下载地址
点击打开百度网盘下载链接--------提取码:bws8