代码实现一
爬虫目标:优酷
爬取字段:总播放量、每日播放增量、评论数、收藏数、赞、踩
实现功能:下载并解析页面
采用技术:HttpClient、HtmlCleaner、XPath、正则表达式
1.Page.java
package work.spider.entity;
/**
 * Entity that stores the information collected for one crawled page.
 *
 * <p>All values are kept as plain strings; a value is {@code null} until its
 * setter has been called. Created on 2020-03-13.
 *
 * @author lwr
 */
public class Page {

    /** Page content. */
    private String content;

    /** Total play count. */
    private String allnumber;

    /** Daily play-count increment. */
    private String daynumber;

    /**
     * Comment count. The field keeps its historical (misspelled) name; use
     * {@link #getCommentNumber()} / {@link #setCommentNumber(String)} in new code.
     */
    private String commentNuber;

    /** Favorites (collection) count. */
    private String collectNumber;

    /** Number of likes. */
    private String supportNumber;

    /** Number of dislikes. */
    private String againstNumber;

    /** Name of the TV series. */
    private String tvName;

    /** URL of the page. */
    private String url;

    /** Sub-set (episode) data. */
    private String episodeNumber;

    /** @return the page content */
    public String getContent() {
        return content;
    }

    /** @param content the page content */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the total play count */
    public String getAllnumber() {
        return allnumber;
    }

    /** @param allnumber the total play count */
    public void setAllnumber(String allnumber) {
        this.allnumber = allnumber;
    }

    /** @return the daily play-count increment */
    public String getDaynumber() {
        return daynumber;
    }

    /** @param daynumber the daily play-count increment */
    public void setDaynumber(String daynumber) {
        this.daynumber = daynumber;
    }

    /** @return the comment count */
    public String getCommentNumber() {
        return commentNuber;
    }

    /** @param commentNumber the comment count */
    public void setCommentNumber(String commentNumber) {
        this.commentNuber = commentNumber;
    }

    /**
     * Misspelled accessor kept so existing callers keep compiling; it reads the
     * same value as {@link #getCommentNumber()}.
     *
     * @return the comment count
     */
    public String getCommentNuber() {
        return commentNuber;
    }

    /**
     * Misspelled mutator kept so existing callers keep compiling; it writes the
     * same value as {@link #setCommentNumber(String)}.
     *
     * @param commentNuber the comment count
     */
    public void setCommentNuber(String commentNuber) {
        this.commentNuber = commentNuber;
    }

    /** @return the favorites count */
    public String getCollectNumber() {
        return collectNumber;
    }

    /** @param collectNumber the favorites count */
    public void setCollectNumber(String collectNumber) {
        this.collectNumber = collectNumber;
    }

    /** @return the number of likes */
    public String getSupportNumber() {
        return supportNumber;
    }

    /** @param supportNumber the number of likes */
    public void setSupportNumber(String supportNumber) {
        this.supportNumber = supportNumber;
    }

    /** @return the number of dislikes */
    public String getAgainstNumber() {
        return againstNumber;
    }

    /** @param againstNumber the number of dislikes */
    public void setAgainstNumber(String againstNumber) {
        this.againstNumber = againstNumber;
    }

    /** @return the TV series name */
    public String getTvName() {
        return tvName;
    }

    /** @param tvName the TV series name */
    public void setTvName(String tvName) {
        this.tvName = tvName;
    }

    /** @return the page URL */
    public String getUrl() {
        return url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the sub-set (episode) data */
    public String getEpisodeNumber() {
        return episodeNumber;
    }

    /** @param episodeNumber the sub-set (episode) data */
    public void setEpisodeNumber(String episodeNumber) {
        this.episodeNumber = episodeNumber;
    }
}
2.interface IDonwLoadService
package work.spider.service;
import work.spider.entity.*;
/**
 * Page download interface.
 *
 * <p>Created on 2020-03-13.
 *
 * @author lwr
 */
public interface IDonwLoadService {

    /**
     * Downloads the page located at the given URL.
     *
     * @param url address of the page to download
     * @return a {@link Page} describing the downloaded page
     */
    Page download(String url);
}
3.接口的实现 HttpClientDownloadService
package work.spider.service.impl;
import work.spider.service.IDonwLoadService;
import work.spider.util.PageDownloadUtil;
import work.spider.entity.*;
/**
*
*HttpClient页面下载实现类
* @auther lwr
* create by 2020-03-13
* */
public class HttpClientDownloadService implements IDonwLoadService {
public Page download(String url) {
Page page = new Page();
page.setContent(PageDownloadUtil.getPageContent(url));
return page;
}
}
4.工具 util --页面下载
package work.spider.util;
import java.io.IOException;
import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import work.spider.entity.Page;
import work.spider.service.impl.HttpClientDownloadService;
/**
 * Page download utility built on Apache HttpClient.
 *
 * <p>Created on 2020-03-13.
 *
 * @author lwr
 */
public class PageDownloadUtil {

    /**
     * Sends an HTTP GET request to {@code url} and returns the response body as text.
     *
     * <p>The body is decoded with the charset declared by the response; when the
     * response declares none, UTF-8 is used. I/O failures are printed to standard
     * error and reported to the caller as a {@code null} result.
     *
     * @param url address of the page to fetch
     * @return the response body, or {@code null} if the request failed with an
     *         I/O error or the response carried no entity
     */
    public static String getPageContent(String url) {
        HttpGet request = new HttpGet(url);
        String content = null;
        // try-with-resources releases the response and then the client on every
        // path; leaving them open leaks the underlying connections.
        try (CloseableHttpClient client = HttpClients.custom().build();
                CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            // EntityUtils.toString rejects a null entity, so keep content null instead.
            if (entity != null) {
                // The second argument is only a fallback for responses without a
                // declared charset; the one-argument overload falls back to
                // ISO-8859-1, which garbles non-Latin (e.g. Chinese) pages.
                content = EntityUtils.toString(entity, "UTF-8");
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Manual smoke test: downloads a sample page through
     * {@link HttpClientDownloadService} and prints its content.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.iqiyi.com/v_19rr7pgg08.html?vfrm="
                + "pcw_home&vfrmblk=L&vfrmrst=712211_dianying_image8";
        HttpClientDownloadService down = new HttpClientDownloadService();
        Page page = down.download(url);
        System.out.println(page.getContent());
    }
}