对新笔趣网的热门小说的爬取
技术选型:Spring Boot + jsoup
第一步:导入maven依赖
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.2.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<!-- Uncomment the two dependencies below if you need to insert data into MySQL -->
<!--<dependency>-->
<!--<groupId>mysql</groupId>-->
<!--<artifactId>mysql-connector-java</artifactId>-->
<!--</dependency>-->
<!--<dependency>-->
<!--<groupId>com.alibaba</groupId>-->
<!--<artifactId>druid-spring-boot-starter</artifactId>-->
<!--<version>1.1.10</version>-->
<!--</dependency>-->
<!-- jsoup: HTML fetching and CSS-selector parsing -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<!-- HtmlUnit: headless browser, used when pages render content via JavaScript -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.30</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<!-- NOTE(review): fastjson 1.2.51 predates several autoType deserialization
     security fixes - consider upgrading; verify against current CVE advisories -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.51</version>
</dependency>
</dependencies>
第二步:application.yml
# Crawl-target URLs
book:
biqu:
## Base address of the BiQu novel site (injected via @Value("${book.biqu.url}"))
url: http://www.xbiquge.la/
第三步: 核心代码
爬虫接口 interface: CommonPuller 便于扩展
/**
 * Generic crawler contract: each target site provides its own
 * {@link #pullNews()} implementation; page fetching is shared via the
 * default {@link #getHtmlFromUrl(String, boolean)} helper.
 *
 * @author zcp
 * @create 2019/12/17
 * @since 1.0.0
 */
public interface CommonPuller {

    /** Entry point: pull and process content from the target site. */
    void pullNews();

    /**
     * Fetches a page and returns it as a jsoup {@link Document}.
     *
     * @param url         address of the page to fetch
     * @param useHtmlUnit {@code true} to render the page with HtmlUnit
     *                    (executes JavaScript, slower); {@code false} for a
     *                    plain jsoup HTTP GET
     * @return the parsed HTML document
     * @throws Exception on connection, timeout, or parsing failure
     */
    default Document getHtmlFromUrl(String url, boolean useHtmlUnit) throws Exception {
        if (!useHtmlUnit) {
            // Plain HTTP GET via jsoup. The user-agent is an IE9 string
            // (the original comment said "Firefox", which was misleading).
            return Jsoup.connect(url)
                    .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                    .get();
        }
        // HtmlUnit path: needed when the page builds its content with JavaScript.
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        webClient.getOptions().setJavaScriptEnabled(true);   // run page JavaScript
        webClient.getOptions().setCssEnabled(false);         // skip CSS for speed
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setTimeout(10000);            // 10 s connect/read timeout
        try {
            HtmlPage rootPage = webClient.getPage(url);
            // Give asynchronous JavaScript up to 10 s to finish populating the DOM.
            webClient.waitForBackgroundJavaScript(10000);
            return Jsoup.parse(rootPage.asXml());
        } finally {
            // Always release the simulated browser, even on failure.
            // (The original's catch { throw e; } was a no-op and is removed.)
            webClient.close();
        }
    }
}
例如要爬取笔趣阁,就创建一个 BiQuPuller 实现这个接口。这里没有做插入数据库的操作;请根据情况构建实体类,将爬取到的内容封装进实体类,再存入数据库。
/**
* @author zcp
* @create 2019/12/17
* @since 1.0.0
*/
@Component("biQuPuller")
@Slf4j
public class BiQuPuller implements NewsPuller {
@Value("${book.biqu.url}")
private String url;
@Override
public void pullNews() {
log.info("开始拉取...");
//1.获取首页
Document html=null;
try {
html=getHtmlFromUrl(url, false);
}catch (Exception e){
log.error("拉取失败...url--->{}",url);
return;
}
//获取首页中热点内容
Elements hotBooks = html.select("div#main")
.select("div#hotcontent")
.select("div.l")
.select("a[href~=^http://www.xbiquge.*]");
HashSet<String> urls=new HashSet<>();
hotBooks.forEach(item->{
//小说的地址
String href = item.attr("href");
urls.add(href);
});
//爬取每一个小说,这里爬取了一篇小说,就break了,如果需要多篇热门小说打开break即可
for (String s : urls) {
pull(s);
break;
}
}
/**
* 爬取每一篇小说的章节
*
*/
public void pull(String surl){
log.info("开始拉取...");
//1.获取首页
Document html=null;
try {
html=getHtmlFromUrl(surl, false);
}catch (Exception e){
log.error("拉取失败...url--->{}",surl);
return;
}
Elements elements = html.select("div#list").select("dl").select("dd > a");
HashSet<String> zhangjieUrl=new HashSet<>();
elements.forEach(element -> {
String href = element.attr("href");
String title = element.text(); //章节标题
//章节链接
href=url+href;
System.out.println(title+" --> "+href);
zhangjieUrl.add(href);
});
//爬取小说里每一个章节,这里爬取了一个章节,就break了,如果需要多篇热门小说打开break即可
for (String s : zhangjieUrl) {
pullContent(s);
break;
}
}
/**
* 爬取每一章节内容
* @param contentUrl
*/
public void pullContent(String contentUrl){
log.info("开始拉取...");
Document html=null;
try {
html=getHtmlFromUrl(contentUrl, false);
}catch (Exception e){
log.error("拉取失败...url--->{}",contentUrl);
return;
}
Elements content = html.select("div#content");
// System.out.println(content);
content.forEach(c->{
//获取纯文字内容,不带标签 (章节内容)
System.out.println(c.text());
});
}
}
第四步:测试
/** Spring Boot entry point for the crawler demo. */
@SpringBootApplication
public class CrawlerApplication {

    /** Boots the application context. */
    public static void main(String[] args) {
        SpringApplication.run(CrawlerApplication.class, args);
    }
}
采用springboot test进行测试
/** Boots the Spring context and runs the BiQu puller once. */
@RunWith(SpringRunner.class)
@SpringBootTest
public class PullTest {

    /** The concrete puller bean registered under the name "biQuPuller". */
    @Qualifier("biQuPuller")
    @Autowired
    private CommonPuller biQuPuller;

    /** Triggers one crawl of the BiQu site. */
    @Test
    public void testBiQu() {
        biQuPuller.pullNews();
    }
}
至此,爬虫的简单 demo 已经实现,核心还是在于对 HTML 页面标签结构的分析。
代码地址:https://github.com/itapechang/biqu-crawler.git 克隆下来直接在test包下运行即可