Crawling Articles with Jsoup and Saving Them to a Database

Getting started

We use Jsoup to crawl a web page and save its content to a database. This post builds a small project with Spring Boot; the concrete steps are as follows:

Create the model

import lombok.Data;
import org.springframework.data.annotation.CreatedDate;
import org.springframework.data.jpa.domain.support.AuditingEntityListener;

import javax.persistence.*;
import java.util.Date;

@Entity
@Table(name = "crawl_content")
@Data
@EntityListeners(AuditingEntityListener.class)
public class CrawlContent {

    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Integer id;

    private String title;

    // Article bodies can be long, so store the content as a large object
    @Lob
    private String content;

    // Filled in automatically by Spring Data's JPA auditing
    @CreatedDate
    private Date createtime;
}
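
Note that @CreatedDate only takes effect when JPA auditing is enabled: the entity registers AuditingEntityListener above, and the application class at the end of this post adds @EnableJpaAuditing to switch auditing on.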

Add dependencies

<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.7</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
</dependencies>

We use Spring Data JPA (spring-boot-starter-data-jpa) to make database access easy; a repository interface is all that is needed:

import com.felix.project.model.CrawlContent;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;

@Repository
public interface CrawlContentMapper extends JpaRepository<CrawlContent,Integer> {

}
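
Spring Data JPA also needs a datasource. Below is a minimal application.properties sketch assuming a local MySQL database; the database name, credentials, and ddl-auto setting are illustrative, not from the original project:

spring.datasource.url=jdbc:mysql://localhost:3306/crawl_demo?useUnicode=true&characterEncoding=utf-8&useSSL=false
spring.datasource.username=root
spring.datasource.password=root
# Let Hibernate create or update the crawl_content table from the entity
spring.jpa.hibernate.ddl-auto=update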

Start crawling

package com.felix.project.jsoup;

import com.felix.project.model.CrawlContent;
import com.felix.project.repository.CrawlContentMapper;
import com.felix.project.util.HttpClientUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Slf4j
@Component
public class CrawlFzop {

    private static final String HTTPURL = "https://www.fangzhipeng.com/java-basic.html";

    // Static so the static crawlContent() called from main() can use it;
    // Spring assigns it through this constructor when the bean is created
    private static CrawlContentMapper crawlContentMapper;

    @Autowired
    public CrawlFzop(CrawlContentMapper crawlContentMapper) {
        CrawlFzop.crawlContentMapper = crawlContentMapper;
    }

    public static void crawlContent() throws IOException {
        // Fetch the listing page of the target site
        String html = HttpClientUtils.getHtml(HTTPURL, "utf-8");
        // Parse it into a DOM
        Document doc = Jsoup.parse(html);
        // Select the article links
        Elements select = doc.select("article #posts-list li a");
        // Loop over each a element
        for (Element e : select) {
            int i = select.indexOf(e) + 1;
            // Article title
            String title = e.text();
            // Detail page URL
            String href = e.attr("href");
            String chtml = HttpClientUtils.getHtml(href, "utf-8");
            log.info("Fetching article #" + i);
            log.info("title: " + title);
            log.info("url--> " + href);
            // Parse the detail page
            Document texthtml = Jsoup.parse(chtml);
            // Extract the article body
            Elements select1 = texthtml.select("article .row .markdown-body");
            String htmltext = select1.html();
            CrawlContent content = new CrawlContent();
            content.setTitle(title);
            content.setContent(htmltext);
            // Save to the database
            crawlContentMapper.save(content);
        }
    }
}
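
Both select() calls use Jsoup's CSS-style selector syntax: "article #posts-list li a" matches every a element inside an li under the element with id posts-list, itself inside an article. A self-contained sketch of the same idea on inline HTML:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        String html = "<article><ul id=\"posts-list\">"
                + "<li><a href=\"/post-1\">First post</a></li>"
                + "<li><a href=\"/post-2\">Second post</a></li>"
                + "</ul></article>";
        Document doc = Jsoup.parse(html);
        // Same selector shape as in CrawlFzop
        for (Element a : doc.select("article #posts-list li a")) {
            System.out.println(a.text() + " -> " + a.attr("href"));
        }
    }
}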

The utility class used above

package com.felix.project.util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientUtils {

    public static String getHtml(String url, String charset) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Some sites reject requests without a browser-like User-Agent
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // try-with-resources closes the client and response even if an exception is thrown
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            // Read the response entity as a string in the given charset
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, charset);
        }
    }
}
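
As an aside, Jsoup can also fetch pages on its own, so HttpClient is optional for simple cases. A minimal sketch (the timeout value is an arbitrary choice):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class JsoupFetchDemo {
    public static void main(String[] args) throws IOException {
        // Jsoup performs the HTTP request and parses the response in one call
        Document doc = Jsoup.connect("https://www.fangzhipeng.com/java-basic.html")
                .userAgent("Mozilla/5.0")
                .timeout(10000) // connect/read timeout in milliseconds
                .get();
        System.out.println(doc.title());
    }
}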

Start the application and crawl

package com.felix.project;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.data.jpa.repository.config.EnableJpaAuditing;
import org.springframework.scheduling.annotation.EnableScheduling;

import java.io.IOException;

import static com.felix.project.jsoup.CrawlFzop.crawlContent;

@SpringBootApplication
@EnableScheduling
@EnableJpaAuditing
public class JsoupdemoApplication {

    public static void main(String[] args) throws IOException {
        SpringApplication.run(JsoupdemoApplication.class, args);
        // Start crawling once the application context is up
        crawlContent();
    }
}
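
Calling the static crawlContent() after SpringApplication.run works because Spring has already constructed the CrawlFzop bean and wired the repository into its static field. A more idiomatic alternative is a CommandLineRunner bean inside JsoupdemoApplication; a sketch (the bean method name is illustrative):

import org.springframework.boot.CommandLineRunner;
import org.springframework.context.annotation.Bean;

// Runs once the application context is fully initialized
@Bean
public CommandLineRunner crawlOnStartup() {
    return args -> crawlContent();
}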