Getting started
We'll use Jsoup to crawl web pages and save their content to a database. This post builds a small project with Spring Boot; the steps are as follows:
Create the model
```java
import lombok.Data;
import org.springframework.data.annotation.CreatedDate;
import org.springframework.data.jpa.domain.support.AuditingEntityListener;

import javax.persistence.*;
import java.util.Date;

@Entity
@Table(name = "crawl_content")
@Data
@EntityListeners(AuditingEntityListener.class)
public class CrawlContent {

    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Integer id;

    private String title;

    // Article bodies can be long, so map the column as a large object
    @Lob
    private String content;

    // Populated automatically by JPA auditing (enabled via @EnableJpaAuditing below)
    @CreatedDate
    private Date createtime;
}
```
Add dependencies
```xml
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.7</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
</dependencies>
```
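The original post doesn't show the datasource configuration, but with spring-boot-starter-data-jpa and mysql-connector-java on the classpath the application needs one to start. A minimal application.properties sketch; the schema name, credentials, and ddl-auto strategy here are assumptions, so adjust them to your environment:

```properties
# Assumed local MySQL instance and schema name; change to match your setup
spring.datasource.url=jdbc:mysql://localhost:3306/jsoupdemo?useUnicode=true&characterEncoding=utf-8
spring.datasource.username=root
spring.datasource.password=root
# Let Hibernate create/update the crawl_content table from the entity
spring.jpa.hibernate.ddl-auto=update
```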
This project uses Spring Data JPA to simplify database access. The repository only needs to extend JpaRepository; the inherited CRUD methods are enough, so no custom methods are declared:
```java
import com.felix.project.model.CrawlContent;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;

@Repository
public interface CrawlContentMapper extends JpaRepository<CrawlContent, Integer> {
}
```
Start crawling
```java
package com.felix.project.jsoup;

import com.felix.project.model.CrawlContent;
import com.felix.project.repository.CrawlContentMapper;
import com.felix.project.util.HttpClientUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Slf4j
@Component
public class CrawlFzop {

    private static final String HTTPURL = "https://www.fangzhipeng.com/java-basic.html";

    // Static so the crawl can be triggered from main(); Spring fills it in via constructor injection
    private static CrawlContentMapper crawlContentMapper;

    @Autowired
    public CrawlFzop(CrawlContentMapper crawlContentMapper) {
        CrawlFzop.crawlContentMapper = crawlContentMapper;
    }

    public static void crawlContent() throws IOException {
        // Fetch the index page and collect the links to the individual articles
        String html = HttpClientUtils.getHtml(HTTPURL, "utf-8");
        Document doc = Jsoup.parse(html);
        Elements select = doc.select("article #posts-list li a");
        for (Element e : select) {
            int i = select.indexOf(e) + 1;
            String title = e.text();
            String href = e.attr("href");
            // Fetch each article page and extract its rendered body
            String chtml = HttpClientUtils.getHtml(href, "utf-8");
            log.info("Fetching article {}", i);
            log.info("Title: {}", title);
            log.info("url --> {}", href);
            Document texthtml = Jsoup.parse(chtml);
            Elements select1 = texthtml.select("article .row .markdown-body");
            String htmltext = select1.html();
            // Persist each article as it is crawled
            CrawlContent content = new CrawlContent();
            content.setTitle(title);
            content.setContent(htmltext);
            crawlContentMapper.save(content);
        }
    }
}
```
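The two select() calls use standard CSS selectors: the first grabs every link under the #posts-list element inside the index page's article tag, the second extracts the .markdown-body container of each post. A self-contained sketch of how the first selector behaves; the markup here is made up for illustration and only mimics the target page's structure:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        // Simplified stand-in for the index page's structure
        String html = "<article><ul id='posts-list'>"
                + "<li><a href='/post-1.html'>First post</a></li>"
                + "<li><a href='/post-2.html'>Second post</a></li>"
                + "</ul></article>";
        Document doc = Jsoup.parse(html);
        // Same selector as the crawler: links under #posts-list inside <article>
        for (Element a : doc.select("article #posts-list li a")) {
            System.out.println(a.text() + " -> " + a.attr("href"));
        }
    }
}
```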
The utility class used above
```java
package com.felix.project.util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientUtils {

    public static String getHtml(String url, String charset) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Send a browser-like User-Agent so the site doesn't reject the request
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        // try-with-resources closes the response and client even if reading the body fails
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, charset);
        }
    }
}
```
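A quick usage check; example.com is just a placeholder URL for this sketch:

```java
package com.felix.project.util;

public class HttpClientUtilsDemo {
    public static void main(String[] args) throws Exception {
        // example.com is a placeholder; swap in any reachable page
        String html = HttpClientUtils.getHtml("https://example.com", "utf-8");
        // Print a short prefix just to confirm the fetch worked
        System.out.println(html.substring(0, Math.min(200, html.length())));
    }
}
```

Creating a fresh CloseableHttpClient per request is fine for a one-off crawl like this one; a long-running crawler would normally reuse a single client or a pooled connection manager instead.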
Launch the application and start crawling
```java
package com.felix.project;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.data.jpa.repository.config.EnableJpaAuditing;
import org.springframework.scheduling.annotation.EnableScheduling;

import java.io.IOException;

import static com.felix.project.jsoup.CrawlFzop.crawlContent;

@SpringBootApplication
@EnableScheduling
// Required for @CreatedDate on CrawlContent to be populated
@EnableJpaAuditing
public class JsoupdemoApplication {

    public static void main(String[] args) throws IOException {
        SpringApplication.run(JsoupdemoApplication.class, args);
        // Trigger the crawl once the Spring context (and the injected repository) is ready
        crawlContent();
    }
}
```
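Calling crawlContent() from main() runs the crawl exactly once at startup, so @EnableScheduling is not actually exercised here. If you wanted the crawl to repeat, a sketch along these lines would move the call into a @Scheduled component instead; the class name and the fixed-rate interval below are arbitrary choices, not from the original project:

```java
package com.felix.project.jsoup;

import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Slf4j
@Component
public class CrawlScheduler {

    // Re-run the crawl every 6 hours; the interval is an example value
    @Scheduled(fixedRate = 6 * 60 * 60 * 1000)
    public void scheduledCrawl() {
        try {
            CrawlFzop.crawlContent();
        } catch (IOException e) {
            log.error("Crawl failed", e);
        }
    }
}
```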