day09_Crawler Document Parsing Integration & Data Saving Preparation
1 Document Parsing
1.1 Parse Rule Utility Class ParseRuleUtils
The article-parsing code that follows is mostly copied in and is not difficult; this Toutiao crawler system is already built, so there is hardly any development left to do.
Fetching data via HttpClient:
public String getHttpClientRequestData(String url, Map<String, String> parameterMap, List<CrawlerCookie> cookieList, CrawlerProxy crawlerProxy) {
    CookieStore cookieStore = getCookieStore(cookieList);
    String jsonData = null;
    HttpHost proxy = null;
    if (null != crawlerProxy) {
        proxy = CrawlerProxyFactory.getHttpHostProxy(crawlerProxy);
    }
    try {
        long currentTime = System.currentTimeMillis();
        log.info("HttpClient requesting data, url:{}, parameter:{}, cookies:{}, proxy:{}", url, parameterMap, JSON.toJSONString(cookieList), proxy);
        jsonData = HttpClientUtils.get(url, parameterMap, getHeaderMap(), cookieStore, proxy, "UTF-8");
        log.info("HttpClient request finished, url:{}, parameter:{}, cookies:{}, proxy:{}, duration:{}, result:{}", url, parameterMap, JSON.toJSONString(cookieList), proxy, System.currentTimeMillis() - currentTime, jsonData);
    } catch (IOException | URISyntaxException e) {
        log.error("HttpClient request failed, url:{}, parameter:{}, cookies:{}, proxy:{}, errorMsg:{}", url, parameterMap, JSON.toJSONString(cookieList), proxy, e.getMessage());
    }
    return jsonData;
}
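A call might look like the following; the URL and parameter names are made up for illustration, and it is assumed that HttpClientUtils.get tolerates a null cookie store and proxy:
// Hypothetical usage: fetch a JSON list page without cookies or a proxy
Map<String, String> parameterMap = new HashMap<>();
parameterMap.put("page", "1");
String json = getHttpClientRequestData("https://www.example.com/api/articles", parameterMap, null, null);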
Fetching data via Selenium:
public CrawlerHtml getSeleniumRequestData(String url, Map<String, String> parameterMap, CrawlerProxy proxy) {
    String buildUrl = HttpClientUtils.buildGetUrl(url, parameterMap, HttpClientUtils.utf8);
    String cookieName = cookieHelper.getCookieName();
    CrawlerHtml crawlerHtml = seleniumClient.getCrawlerHtml(buildUrl, proxy, cookieName);
    if (null != crawlerHtml) {
        cookieHelper.updateCookie(crawlerHtml.getCrawlerCookieList(), proxy);
    }
    return crawlerHtml;
}
Converting cookies into a CookieStore:
private CookieStore getCookieStore(List<CrawlerCookie> cookieList) {
    BasicCookieStore cookieStore = null;
    if (null != cookieList && !cookieList.isEmpty()) {
        // Create the store once and add every cookie to it;
        // creating it inside the loop would keep only the last cookie
        cookieStore = new BasicCookieStore();
        for (CrawlerCookie cookie : cookieList) {
            if (null != cookie) {
                BasicClientCookie basicClientCookie = new BasicClientCookie(cookie.getName(), cookie.getValue());
                basicClientCookie.setDomain(cookie.getDomain());
                basicClientCookie.setPath(cookie.getPath());
                cookieStore.addCookie(basicClientCookie);
            }
        }
    }
    return cookieStore;
}
process is the core interface method for customizing crawler logic; the extraction logic is written here:
@Override
public void process(Page page) {
    long currentTimeMillis = System.currentTimeMillis();
    String handelType = crawlerHelper.getHandelType(page.getRequest());
    log.info("Start parsing page, url:{}, handelType:{}", page.getUrl(), handelType);
    crawlerPageProcessorManager.handel(page);
    log.info("Finished parsing page, url:{}, handelType:{}, duration:{}", page.getUrl(), handelType, System.currentTimeMillis() - currentTimeMillis);
}
@Override
public Site getSite() {
    Site site = Site.me().setRetryTimes(getRetryTimes()).setRetrySleepTime(getRetrySleepTime()).setSleepTime(getSleepTime()).setTimeOut(getTimeout());
    // header configuration
    Map<String, String> headerMap = getHeaderMap();
    if (null != headerMap && !headerMap.isEmpty()) {
        for (Map.Entry<String, String> entry : headerMap.entrySet()) {
            site.addHeader(entry.getKey(), entry.getValue());
        }
    }
    return site;
}
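getHeaderMap() is referenced above but not listed in these notes. A minimal sketch of what it typically returns; the concrete header values are assumptions:
private Map<String, String> getHeaderMap() {
    Map<String, String> headerMap = new HashMap<>();
    // Pretend to be a regular browser to reduce the chance of being blocked
    headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
    headerMap.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
    return headerMap;
}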
Parsing the URL list:
public List<String> getHelpUrlList(List<ParseRule> helpParseRuleList) {
    List<String> helpUrlList = new ArrayList<>();
    for (ParseRule parseRule : helpParseRuleList) {
        List<String> urlLinks = ParseRuleUtils.getUrlLinks(parseRule.getParseContentList());
        helpUrlList.addAll(urlLinks);
    }
    return helpUrlList;
}
Adding data to the spider request queue:
public void addSpiderRequest(List<String> urlList, Request request, CrawlerEnum.DocumentType documentType) {
    List<ParseItem> parseItemList = new ArrayList<>();
    if (null != urlList && !urlList.isEmpty()) {
        for (String url : urlList) {
            CrawlerParseItem crawlerParseItem = new CrawlerParseItem();
            crawlerParseItem.setUrl(url);
            crawlerParseItem.setHandelType(crawlerHelper.getHandelType(request));
            crawlerParseItem.setDocumentType(documentType.name());
            parseItemList.add(crawlerParseItem);
        }
    }
    addSpiderRequest(parseItemList);
}
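The single-argument addSpiderRequest(parseItemList) overload is not listed here. A rough sketch of the idea, assuming the processor holds a reference to the running WebMagic Spider and passes the handle/document type along as request extras (the field and extra-key names are assumptions):
public void addSpiderRequest(List<ParseItem> parseItemList) {
    if (null != parseItemList && !parseItemList.isEmpty()) {
        for (ParseItem parseItem : parseItemList) {
            Request request = new Request(parseItem.getUrl());
            // Carry the type information so process() can route the page to the right processor
            request.putExtra("handelType", parseItem.getHandelType());
            request.putExtra("documentType", parseItem.getDocumentType());
            spider.addRequest(request);
        }
    }
}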
/**
 * Handle the page
 * @param page
 */
public abstract void handelPage(Page page);

/**
 * Whether this handle type needs to be processed
 * @param handelType
 * @return
 */
public abstract boolean isNeedHandelType(String handelType);

/**
 * Whether this document type needs to be processed
 * @param documentType
 * @return
 */
public abstract boolean isNeedDocumentType(String documentType);
These abstract methods are overridden in the four page-processor subclasses. The init-page processor extracts the entry URLs:
@Override
public void handelPage(Page page) {
    String initXpath = crawlerConfigProperty.getInitCrawlerXpath();
    List<String> helpUrl = page.getHtml().xpath(initXpath).links().all();
    addSpiderRequest(helpUrl, page.getRequest(), CrawlerEnum.DocumentType.HELP);
}
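Besides handelPage, each subclass also implements the two type-check methods. For the init-page processor that might look roughly like the sketch below; the specific enum constants (HandelType.FORWARD, DocumentType.INIT) are assumptions:
@Override
public boolean isNeedHandelType(String handelType) {
    // Hypothetical: this processor only handles the forward (initial) crawl type
    return CrawlerEnum.HandelType.FORWARD.name().equals(handelType);
}

@Override
public boolean isNeedDocumentType(String documentType) {
    // Hypothetical: only interested in the init document type
    return CrawlerEnum.DocumentType.INIT.name().equals(documentType);
}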
Processing the data (help page parsing):
@Override
public void handelPage(Page page) {
    // Get the handle type
    String handelType = crawlerHelper.getHandelType(page.getRequest());
    long currentTime = System.currentTimeMillis();
    String requestUrl = page.getUrl().get();
    log.info("Start parsing help page, url:{}, handelType:{}", requestUrl, handelType);
    // Get the configured extraction rules
    String helpCrawlerXpath = crawlerConfigProperty.getHelpCrawlerXpath();
    Integer crawlerHelpNextPagingSize = crawlerConfigProperty.getCrawlerHelpNextPagingSize();
    List<String> helpUrlList = page.getHtml().xpath(helpCrawlerXpath).links().all();
    if (null != crawlerHelpNextPagingSize && crawlerHelpNextPagingSize > 1) {
        // Pagination handling
        List<String> docPageUrlList = getDocPageUrlList(requestUrl, crawlerHelpNextPagingSize);
        if (null != docPageUrlList && !docPageUrlList.isEmpty()) {
            helpUrlList.addAll(docPageUrlList);
        }
    }
    addSpiderRequest(helpUrlList, page.getRequest(), CrawlerEnum.DocumentType.PAGE);
    log.info("Finished parsing help page, url:{}, handelType:{}, duration:{}", requestUrl, handelType, System.currentTimeMillis() - currentTime);
}
Getting the paginated data:
private List<String> getDocPageUrlList(String url, Integer pageSize) {
    List<String> docPagePagingUrlList = null;
    if (url.endsWith(helpUrlSuffix)) {
        // Generate the paginated URLs
        List<String> pagePagingUrlList = generateHelpPagingUrl(url, pageSize);
        // Extract the target doc URLs from the paginated pages
        docPagePagingUrlList = getHelpPagingDocUrl(pagePagingUrlList);
    }
    return docPagePagingUrlList;
}
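generateHelpPagingUrl is not listed in these notes. A minimal sketch of the idea, assuming the help pages paginate by inserting a page index before helpUrlSuffix (the exact URL pattern is an assumption):
private List<String> generateHelpPagingUrl(String url, Integer pageSize) {
    List<String> pagingUrlList = new ArrayList<>();
    // Assumed pattern: https://xxx/list.html -> https://xxx/list_2.html, https://xxx/list_3.html, ...
    String baseUrl = url.substring(0, url.length() - helpUrlSuffix.length());
    for (int i = 2; i <= pageSize; i++) {
        pagingUrlList.add(baseUrl + "_" + i + helpUrlSuffix);
    }
    return pagingUrlList;
}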
Getting the URLs from the paginated pages (the list of article URLs):
private List<String> getHelpPagingDocUrl(List<String> pagePagingUrlList) {
    long currentTimeMillis = System.currentTimeMillis();
    log.info("Start crawling paginated doc pages");
    List<String> docUrlList = new ArrayList<>();
    int failCount = 0;
    if (!pagePagingUrlList.isEmpty()) {
        for (String url : pagePagingUrlList) {
            log.info("Start processing help page pagination, url:{}", url);
            String htmlData = getOriginalRequestHtmlData(url, null);
            boolean validate = crawlerHelper.getDataValidateCallBack().validate(htmlData);
            if (validate) {
                List<String> urlList = new Html(htmlData).xpath(crawlerConfigProperty.getHelpCrawlerXpath()).links().all();
                if (!urlList.isEmpty()) {
                    docUrlList.addAll(urlList);
                } else {
                    failCount++;
                    // Stop after several consecutive empty pages
                    if (failCount > 2) {
                        break;
                    }
                }
            }
        }
    }
    log.info("Finished crawling paginated doc pages, duration:{}", System.currentTimeMillis() - currentTimeMillis);
    return docUrlList;
}
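crawlerHelper.getDataValidateCallBack().validate(htmlData) decides whether the fetched HTML is usable. A minimal sketch of such a callback, assuming DataValidateCallBack is a single-method interface and that rejecting empty pages and obvious verification pages is enough:
// Hypothetical validation callback: accept only non-empty pages that are not captcha/verification pages
DataValidateCallBack dataValidateCallBack = htmlData ->
        StringUtils.isNotEmpty(htmlData) && !htmlData.contains("验证码");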
Testing done; next we start crawling CSDN data and aggregating it. Personally I think this crawler system is rather poorly built: there are many configuration classes whose internals I don't understand, so I can only follow along at the course's pace.
Interface for saving an article's channel (label) information:
@Override
public String getLabelIds(String labels) {
    long currentTimeMillis = System.currentTimeMillis();
    log.info("Getting channel info, labels:{}", labels);
    List<AdLabel> adLabelList = new ArrayList<>();
    if (StringUtils.isNotEmpty(labels)) {
        // Convert to lower case
        labels = labels.toLowerCase();
        List<String> tmpLabels = Arrays.asList(labels.split(","));
        tmpLabels = new ArrayList<>(tmpLabels);
        adLabelList = adLabelMapper.queryAdLabelByLabels(tmpLabels);
        if (null != adLabelList && !adLabelList.isEmpty()) {
            adLabelList = addLabelList(tmpLabels, adLabelList);
        } else {
            adLabelList = addLabelList(tmpLabels);
        }
    }
    List<String> labelList = adLabelList.stream().map(label -> HMStringUtils.toString(label.getId())).collect(Collectors.toList());
    String resultStr = HMStringUtils.listToStr(labelList, ",");
    log.info("Finished getting channel info, labels:{}, labelIds:{}, duration:{}", labels, resultStr, System.currentTimeMillis() - currentTimeMillis);
    return resultStr;
}
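adLabelMapper.queryAdLabelByLabels looks the labels up by name. A sketch of what that MyBatis mapper method could look like; the table and column names are assumptions:
@Select("<script>" +
        "select * from ad_label where name in " +
        "<foreach collection='list' item='name' open='(' separator=',' close=')'>#{name}</foreach>" +
        "</script>")
List<AdLabel> queryAdLabelByLabels(List<String> labels);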
Filtering before saving:
/**
 * Filter out labels that already exist, then save the rest
 *
 * @param tmpLabels
 * @param adLabelList
 * @return
 */
private List<AdLabel> addLabelList(List<String> tmpLabels, List<AdLabel> adLabelList) {
    if (tmpLabels != null && !tmpLabels.isEmpty()) {
        for (AdLabel adLabel : adLabelList) {
            // Drop label names that already match an existing label;
            // removeIf removes safely during iteration, unlike removing by index inside a for loop
            tmpLabels.removeIf(tmpLabel -> tmpLabel.contains(adLabel.getName()));
        }
    }
    if (tmpLabels != null && !tmpLabels.isEmpty()) {
        // The remaining labels are new and need to be inserted
        adLabelList.addAll(addLabelList(tmpLabels));
    }
    return adLabelList;
}
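The single-argument addLabelList(tmpLabels) overload is not shown in these notes. A rough sketch, assuming it simply inserts each remaining label as a new AdLabel and returns the saved records (field and mapper method names are assumptions):
private List<AdLabel> addLabelList(List<String> tmpLabels) {
    List<AdLabel> adLabelList = new ArrayList<>();
    for (String labelName : tmpLabels) {
        // Hypothetical: persist the new label and collect it so its generated id can be used
        AdLabel adLabel = new AdLabel();
        adLabel.setName(labelName);
        adLabelMapper.insert(adLabel);
        adLabelList.add(adLabel);
    }
    return adLabelList;
}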
Interface test passed.
Data was inserted into the database successfully.
If the test does not run properly, just swap this out and it will work.
The channel-query interface test passed.
The remaining crawler info save and query operations are fairly simple; it is enough to walk through the flow.