2021SC@SDUSC
接上篇
先定位解析出所有的href超链接,即每个列表项对应的文章详情地址,然后解析文章详情的所有文本信息
Category类如下
package com.crawler.gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.spider.HrefBean;
import com.geccocrawler.gecco.spider.HtmlBean;
import java.util.List;
/**
 * Bean describing one category group: a parent name plus the list of links
 * found under it. {@code AllSortPipeline} reads {@link #getCategorys()} to
 * obtain the URLs it schedules.
 *
 * Created by jackie on 18/1/15.
 */
public class Category implements HtmlBean {

    private static final long serialVersionUID = 3018760488621382659L;

    /** Text of the element selected by the CSS path {@code dt a}. */
    @Text
    @HtmlField(cssPath="dt a")
    private String parentName;

    /** One {@link HrefBean} per element selected by the CSS path {@code ul li}. */
    @HtmlField(cssPath="ul li")
    private List<HrefBean> categorys;

    /** @return the parent name of this category group */
    public String getParentName() {
        return this.parentName;
    }

    /** @param name the parent name to store */
    public void setParentName(String name) {
        this.parentName = name;
    }

    /** @return the links collected for this category group */
    public List<HrefBean> getCategorys() {
        return this.categorys;
    }

    /** @param links the links to store for this category group */
    public void setCategorys(List<HrefBean> links) {
        this.categorys = links;
    }
}
categorys即用于收集某个分类下所有列表项对应的网址
下面实现AllSortPipeline类,用于收集所有分类下的url
package com.crawler.gecco;
import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;
import com.geccocrawler.gecco.spider.HrefBean;
import java.util.ArrayList;
import java.util.List;
/**
 * Pipeline that gathers the category groups exposed by an {@link AllSort} bean
 * and schedules every link found in them as a sub-request of the current request.
 *
 * Created by jackie on 18/1/15.
 */
@PipelineName("allSortPipeline")
public class AllSortPipeline implements Pipeline<AllSort> {

    /**
     * Collects all category groups of {@code allSort}, prints each link's title
     * and URL, and pushes a sub-request for every link with a usable URL into
     * the {@link SchedulerContext}.
     *
     * Groups, link lists, and URLs that are {@code null} (or blank, for URLs)
     * are skipped instead of raising a {@link NullPointerException}.
     *
     * @param allSort the parsed bean holding the category groups and the originating request
     */
    @Override
    public void process(AllSort allSort) {
        System.out.println("-=======-");

        List<Category> categorys = new ArrayList<Category>();
        addAllIfPresent(categorys, allSort.getInternet());
        addAllIfPresent(categorys, allSort.getElectric());
        addAllIfPresent(categorys, allSort.getMobileInternet());
        addAllIfPresent(categorys, allSort.getNetGame());
        addAllIfPresent(categorys, allSort.getNetMarket());

        // The originating request is the same for every link, so fetch it once.
        HttpRequest currRequest = allSort.getRequest();

        for (Category category : categorys) {
            if (category == null) {
                continue;
            }
            List<HrefBean> hrefs = category.getCategorys();
            if (hrefs == null) {
                // Nothing was collected for this group; nothing to schedule.
                continue;
            }
            for (HrefBean href : hrefs) {
                String url = href.getUrl();
                System.out.println("title: " + href.getTitle() + " url: " + url);
                if (url == null || url.trim().isEmpty()) {
                    // A link without a target cannot be crawled; do not enqueue it.
                    continue;
                }
                SchedulerContext.into(currRequest.subRequest(url));
            }
        }
    }

    /** Appends every element of {@code source} to {@code target}; a null source is ignored. */
    private static void addAllIfPresent(List<Category> target, Iterable<? extends Category> source) {
        if (source == null) {
            return;
        }
        for (Category category : source) {
            target.add(category);
        }
    }
}
- categorys集合用于添加所有分类下的列表
- 通过遍历的方式获取具体的url和每个url对应的title
- 将url信息存储到SchedulerContext上下文中,用于后面爬虫
到此为止,我们获取了所有的分类列表对应的url信息,并将url存储到上下文中,用于后续爬虫匹配。下面编写用于解析详情页的处理类。
新建注解类ProductDetail,用于匹配上边得到的url
package com.crawler.gecco;
import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.spider.HtmlBean;
/**
 * Bean for a single article detail page. Pages whose URL matches
 * {@code matchUrl} are parsed into this bean and handed to the
 * {@code consolePipeline} and {@code productDetailPipeline} pipelines.
 *
 * The {@code {year}}, {@code {month}} and {@code {code}} placeholders in
 * {@code matchUrl} must carry the same names as the {@link RequestParameter}
 * fields below, otherwise the corresponding field is never populated.
 *
 * Created by jackie on 18/1/15.
 */
@Gecco(matchUrl="http://news.iresearch.cn/content/{year}/{month}/{code}.shtml", pipelines={"consolePipeline", "productDetailPipeline"})
public class ProductDetail implements HtmlBean {

    private static final long serialVersionUID = -377053120283382723L;

    /**
     * Article body, selected by the CSS path below.
     * NOTE(review): {@code @Text} was commented out on this field in the
     * original, so presumably the raw markup rather than plain text is
     * injected here - confirm against the Gecco field-rendering rules.
     */
    @HtmlField(cssPath="body > div.g-content > div.g-bd.f-mt-auto > div > div.g-mn > div > div.g-article > div.m-article")
    private String content;

    /** Value of the {@code {code}} placeholder of the matched URL. */
    @RequestParameter
    private String code;

    /** Value of the {@code {year}} placeholder of the matched URL. */
    @RequestParameter
    private String year;

    /** Value of the {@code {month}} placeholder of the matched URL. */
    @RequestParameter
    private String month;

    /** Article title, taken as text from the element selected by the CSS path below. */
    @Text
    @HtmlField(cssPath="body > div.g-content > div.g-main.f-mt-auto > div > div > div.title > h1")
    private String title;

    /** @return the article body */
    public String getContent() {
        return content;
    }

    /** @param content the article body to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the {@code code} URL parameter */
    public String getCode() {
        return code;
    }

    /** @param code the {@code code} URL parameter to store */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the article title */
    public String getTitle() {
        return title;
    }

    /** @param title the article title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the {@code year} URL parameter */
    public String getYear() {
        return year;
    }

    /** @param year the {@code year} URL parameter to store */
    public void setYear(String year) {
        this.year = year;
    }

    /** @return the {@code month} URL parameter */
    public String getMonth() {
        return month;
    }

    /** @param month the {@code month} URL parameter to store */
    public void setMonth(String month) {
        this.month = month;
    }
}
- matchUrl是每个文章的url格式,year、month和code是注入的参数
- 同理,我们定位到title所在的cssPath和content所在的cssPath,用于解析得到具体的title和content值
下面实现ProductDetailPipeline类,用于解析每篇文章的文本信息,并通过正则抽取所有的中文文本存储到result.txt中
package com.crawler.gecco;
import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.pipeline.Pipeline;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Pipeline that runs each {@link ProductDetail}'s content through
 * {@code RegrexUtil.match} and appends the result to {@code result.txt}.
 *
 * Created by jackie on 18/1/15.
 */
@PipelineName("productDetailPipeline")
public class ProductDetailPipeline implements Pipeline<ProductDetail> {

    /** Output file name; it is relative, so it resolves against the working directory. */
    private static final String RESULT_FILE = "result.txt";

    /**
     * Appends the text extracted from {@code productDetail} to {@link #RESULT_FILE}.
     *
     * I/O failures are reported on standard output and do not propagate. A
     * detail without content, or an extraction that yields {@code null}, is
     * skipped so that no {@link NullPointerException} escapes the pipeline.
     *
     * @param productDetail the parsed article detail bean
     */
    @Override
    public void process(ProductDetail productDetail) {
        System.out.println("~~~~~~~~~productDetailPipeline~~~~~~~~~~~");

        String content = productDetail.getContent();
        if (content == null) {
            System.out.println("productDetail content is null, nothing to write");
            return;
        }

        String text = RegrexUtil.match(content);
        if (text == null) {
            System.out.println("no text extracted, nothing to write");
            return;
        }

        // Opening a FileWriter in append mode creates the file when it is missing,
        // so no separate createNewFile() step is needed. try-with-resources closes
        // (and thereby flushes) the writer on every path, including a failed write.
        // NOTE(review): FileWriter encodes with the platform default charset;
        // confirm that whatever reads result.txt expects the same encoding.
        try (FileWriter fileWriter = new FileWriter(RESULT_FILE, true)) {
            fileWriter.write(text);
        } catch (IOException e) {
            System.out.println("append to " + RESULT_FILE + " failed: " + e);
        }
    }
}
至此,我们通过Gecco获取到了互联网行业各分类下的所有文章,并提取到所有的文本信息