爬虫实战
maven引入jar包
<!-- Hutool utility bundle; presumably the source of ReUtil, HttpUtil and Console used in the example below (their imports are not shown, TODO confirm) -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.6</version>
</dependency>
<!-- jsoup HTML parser; presumably the source of Jsoup, Document and Elements used in the example below (imports not shown, TODO confirm) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- WebMagic crawler core; presumably provides the us.codecraft.webmagic.* classes imported by the processors below (TODO confirm) -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.8.0</version>
</dependency>
<!-- WebMagic extension module, kept at the same version as webmagic-core; NOTE(review): no visible code obviously depends on it, confirm it is needed -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.8.0</version>
</dependency>
示例
// sf6: scrape a Street Fighter 6 character page with Hutool and Jsoup.

// Collect the href value (group 1) of every anchor found in `title`.
// "[^\"]+" stops at the closing quote of the attribute; the earlier greedy
// "(.+)" ran on to the last quote on the line and therefore captured the
// attributes (or even the next anchor) that followed the href.
// NOTE(review): `title` is not defined in this snippet; it has to hold the
// HTML text to scan. Confirm where it comes from.
List<String> hrefs = ReUtil.findAll("<a.*?href=\"([^\"]+)\".*?>(.*?)</a>", title, 1);

// Download the character page and parse it into a DOM.
String link = "https://www.streetfighter.com/6/zh-hans/character/aki";
String contentLink = HttpUtil.get(link);
Document document = Jsoup.parse(contentLink);

// Look elements up by CSS class name; the hashed suffixes come from the page
// markup and are matched literally.
Elements detail_detail_profile = document.getElementsByClass("detail_detail__profile__text__8JGgO");
Elements detail_info_item = document.getElementsByClass("detail_info__item__h33Dg");
// Every anchor that carries an href attribute.
Elements lis = document.select("a[href]");

Console.log(detail_detail_profile);
Console.log(detail_info_item.text());
Console.log(lis);
schInfo
import org.apache.commons.lang3.StringUtils;
import org.jeecg.modules.sch.entity.SchoolInfoTxt;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.HttpConstant;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
/**
 * WebMagic page processor for a school list page of www.creditsailing.com.
 *
 * <p>For every {@code li.sch-item} entry on the page it reads the school name
 * and detail link, then runs a nested {@link SchoolInfoTxtPageProcessor}
 * spider that fills the remaining fields of the {@link SchoolInfoTxt}. The
 * entries collected from the most recently processed page are returned by
 * {@link #getSchoolInfoTxtList()}.
 *
 * <p>NOTE(review): the class is a Spring component but keeps per-crawl mutable
 * state ({@code linkUrl} and the result list); confirm one instance is never
 * shared between concurrent crawls.
 */
@Component
public class PageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Prefix used to turn the detail links of the list page into absolute URLs. */
    private static final String BASE_URL = "http://www.creditsailing.com/";

    /** Selects one list item per school. */
    private static final String SCHOOL_ITEM_XPATH =
            "//div[@class='co_left fl_l']/div[@class='sch-SBSchoolList-list']/ul/li[@class='sch-item']";

    /** Selects the school name inside a single list item. */
    private static final String NAME_XPATH =
            "//li[@class='sch-item']/div[@class='sch-item-text']/div[@class='sch-name']/p[@class='sch-name-detail']/a/text()";

    /** Selects the detail-page link inside a single list item. */
    private static final String HREF_XPATH =
            "//li[@class='sch-item']/div[@class='sch-item-text']/div[@class='sch-name']/p[@class='sch-name-detail']/a/@href";

    private final Site site = Site.me().setDomain("www.creditsailing.com");

    /** Schools extracted by the latest {@link #process(Page)} call; {@code null} before the first call. */
    private List<SchoolInfoTxt> schoolInfoTxtList;

    /** Start URL; also used as the regular expression that selects links to follow. */
    private String linkUrl;

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    public List<SchoolInfoTxt> getSchoolInfoTxtList() {
        return schoolInfoTxtList;
    }

    public void setSchoolInfoTxtList(List<SchoolInfoTxt> schoolInfoTxtList) {
        this.schoolInfoTxtList = schoolInfoTxtList;
    }

    /**
     * Extracts all schools listed on {@code page} and crawls each detail page.
     *
     * <p>The result list is replaced on every call. The {@code "name"} result
     * field is written once per school under the same key, so after the loop it
     * holds the name of the last school on the page.
     *
     * @param page the downloaded list page
     */
    @Override
    public void process(Page page) {
        followMatchingLinks(page);

        List<String> schoolItems = page.getHtml().xpath(SCHOOL_ITEM_XPATH).all();
        schoolInfoTxtList = new ArrayList<>();
        for (String item : schoolItems) {
            // Each item is the markup of one <li>; re-parse it so the XPaths
            // below are evaluated against that single entry.
            Html fragment = Html.create(item);
            String name = fragment.xpath(NAME_XPATH).toString();
            String href = fragment.xpath(HREF_XPATH).toString();
            page.putField("name", name);

            SchoolInfoTxt school = new SchoolInfoTxt();
            school.setName(name);
            // The school code is the part of the link between "ool/" and the
            // first following "."; substringBetween yields null for a null href.
            school.setSchCode(StringUtils.substringBetween(href, "ool/", "."));
            if (href != null) {
                // Without a link there is no detail page; previously this
                // case requested ".../null".
                crawlDetails(school, href);
            }
            schoolInfoTxtList.add(school);
        }
    }

    /** Queues every link on the page that matches {@code linkUrl}, when one is configured. */
    private void followMatchingLinks(Page page) {
        String pattern = getLinkUrl();
        // An unset linkUrl used to be handed to regex(...) as null.
        if (pattern == null || pattern.trim().isEmpty()) {
            return;
        }
        List<String> links = page.getHtml().links().regex(pattern).all();
        page.addTargetRequests(links);
    }

    /** Runs a blocking nested spider that fills {@code school} from its detail page. */
    private void crawlDetails(SchoolInfoTxt school, String href) {
        SchoolInfoTxtPageProcessor detailProcessor = new SchoolInfoTxtPageProcessor();
        detailProcessor.setLinkUrl(toAbsoluteUrl(href));
        detailProcessor.setSchoolInfoTxt(school);
        Spider.create(detailProcessor)
                .addUrl(detailProcessor.getLinkUrl())
                .addPipeline(new ConsolePipeline())
                .run();
    }

    /**
     * Resolves a link taken from the list page against {@link #BASE_URL}.
     * Absolute links are returned unchanged and a leading slash is dropped so
     * the result never contains a double slash after the host.
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return BASE_URL + (href.startsWith("/") ? href.substring(1) : href);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls one list page and prints the collected schools.
     *
     * @param args {@code args[0]} is the URL of the list page to crawl
     */
    public static void main(String[] args) {
        // The start URL used to be read from a freshly created processor,
        // where it was always null.
        if (args.length == 0 || args[0].trim().isEmpty()) {
            System.err.println("Usage: PageProcessor <list-page-url>");
            return;
        }
        PageProcessor pageProcessor = new PageProcessor();
        pageProcessor.setLinkUrl(args[0]);
        Request request = new Request(pageProcessor.getLinkUrl());
        Spider.create(pageProcessor)
                .addRequest(request)
                .addPipeline(new ConsolePipeline())
                .run();
        System.out.println(pageProcessor.getSchoolInfoTxtList());
    }
}
schInfoSub
import org.jeecg.common.util.HTMLUtils;
import org.jeecg.modules.sch.entity.SchoolInfoTxt;
import org.jeecg.modules.sch.service.ISchoolInfoTxtService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import javax.annotation.Resource;
import java.util.List;
/**
 * WebMagic page processor for a single school detail page of
 * www.creditsailing.com.
 *
 * <p>It extracts the introduction, logo, contact line, attribute list, labels
 * and navigation links, publishes them as page result fields and copies them
 * into the {@link SchoolInfoTxt} supplied through
 * {@link #setSchoolInfoTxt(SchoolInfoTxt)} (a new one is created when none was
 * supplied).
 *
 * <p>NOTE(review): the class is a Spring component but keeps per-crawl mutable
 * state; confirm one instance is never shared between concurrent crawls.
 */
@Component
public class SchoolInfoTxtPageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Strips the brackets, "null" entries and spaces left by {@code List.toString()}. */
    private static final String LIST_NOISE_REGEX = "(?:\\[|null|\\]| +)";

    private final Site site = Site.me().setDomain("www.creditsailing.com");

    /** Start URL; also used as the regular expression that selects links to follow. */
    private String linkUrl;

    /** Entity filled by {@link #process(Page)}. */
    private SchoolInfoTxt schoolInfoTxt;

    public SchoolInfoTxt getSchoolInfoTxt() {
        return schoolInfoTxt;
    }

    public void setSchoolInfoTxt(SchoolInfoTxt schoolInfoTxt) {
        this.schoolInfoTxt = schoolInfoTxt;
    }

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    /**
     * Extracts the school details from {@code page}.
     *
     * <p>The attribute list is split into lines and read by position
     * (creation, type, classify, belong, education, city). A position that is
     * missing on the page yields an empty string instead of failing.
     *
     * @param page the downloaded detail page
     */
    @Override
    public void process(Page page) {
        followMatchingLinks(page);

        Html html = page.getHtml();
        String details = html.xpath("//div[@class='co_left fl_l']/div[@class='sch-jieshao']/div[@class='info-detail']/html()").toString();
        String contact = html.xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='contact']/span/text()").toString();
        String schLogo = html.xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='sch-logo']/img/@src").toString();
        String slabs = html.xpath("//div[@class='co_left fl_l']/div[@class='sch-jieshao']/div[@class='college-service mt25']/ul//html()").toString();
        List<String> spans = html.xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='label']/span/text()").all();
        List<String> remarks = html.xpath("//div[@class='co_left fl_l']/ul[@class='school-detail-nav']/li/a/@href").all();

        // One attribute per line of the list's text.
        String[] rels = innerText(slabs).split("\n");
        // Computed once and reused for both the result field and the entity.
        // NOTE(review): assumes intersectionForList_3 has no side effects.
        String famous = HTMLUtils.intersectionForList_3(spans, HTMLUtils.getForList_2())
                .toString()
                .replaceAll(LIST_NOISE_REGEX, "");

        page.putField("remarks", remarks);
        page.putField("details", innerText(details));
        page.putField("schLogo", schLogo);
        page.putField("creation", field(rels, 0));
        page.putField("type", field(rels, 1));
        page.putField("classify", field(rels, 2));
        page.putField("belong", field(rels, 3));
        page.putField("education", field(rels, 4));
        page.putField("cityName", field(rels, 5));
        page.putField("contact", contact);
        page.putField("spans", famous);

        if (schoolInfoTxt == null) {
            schoolInfoTxt = new SchoolInfoTxt();
        }
        schoolInfoTxt.setThumb(schLogo);
        // The entity keeps the raw HTML of the introduction; only the result
        // field above is reduced to text.
        schoolInfoTxt.setDetails(details);
        schoolInfoTxt.setCreation(field(rels, 0).trim());
        schoolInfoTxt.setType(field(rels, 1).trim());
        schoolInfoTxt.setClassify(field(rels, 2).trim());
        schoolInfoTxt.setBelong(field(rels, 3).trim());
        schoolInfoTxt.setEducation(field(rels, 4).trim());
        schoolInfoTxt.setArea(field(rels, 5).trim());
        schoolInfoTxt.setFamous(famous);
        schoolInfoTxt.setRemarks(remarks.toString().replaceAll(LIST_NOISE_REGEX, ""));
        // The contact node may be absent; strip the label only when it exists.
        schoolInfoTxt.setContact(contact == null ? null : contact.replace("招生电话:", ""));
    }

    /** Queues every link on the page that matches {@code linkUrl}, when one is configured. */
    private void followMatchingLinks(Page page) {
        String pattern = getLinkUrl();
        // An unset linkUrl used to be handed to regex(...) as null.
        if (pattern == null || pattern.trim().isEmpty()) {
            return;
        }
        List<String> links = page.getHtml().links().regex(pattern).all();
        page.addTargetRequests(links);
    }

    /**
     * Returns the text of {@code html}, or an empty string when the node was
     * not found. HTMLUtils.getInnerText is only called with non-null input
     * because its null handling is not visible here.
     */
    private static String innerText(String html) {
        if (html == null) {
            return "";
        }
        String text = HTMLUtils.getInnerText(html);
        return text == null ? "" : text;
    }

    /** Returns {@code values[index]}, or an empty string when the array is too short. */
    private static String field(String[] values, int index) {
        return index < values.length ? values[index] : "";
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Crawls one sample detail page and prints the extracted fields to the console. */
    public static void main(String[] args) {
        SchoolInfoTxtPageProcessor processor = new SchoolInfoTxtPageProcessor();
        processor.setLinkUrl("http://www.creditsailing.com/school/1438920.html");
        Spider.create(processor)
                .addUrl(processor.getLinkUrl())
                .addPipeline(new ConsolePipeline())
                .run();
    }
}
schScore
import org.apache.commons.lang3.StringUtils;
import org.jeecg.common.util.HTMLUtils;
import org.jeecg.modules.sch.entity.SchoolScoreTxt;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import java.util.ArrayList;
import java.util.List;
/**
 * WebMagic page processor for a school score page of www.creditsailing.com.
 *
 * <p>Every element selected from the score table is converted into a
 * {@link SchoolScoreTxt}; the rows of the most recently processed page are
 * returned by {@link #getSchoolScoreTxtList()}.
 */
public class SchoolScoreTxtPageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Number of cells read from each row (indices 0 to 7). */
    private static final int SCORE_COLUMNS = 8;

    private Site site = Site.me().setDomain("www.creditsailing.com");

    /** Start URL; used as the link-following regex and as the source of the school code. */
    private String linkUrl;

    /** Rows extracted by the latest {@link #process(Page)} call; {@code null} before the first call. */
    private List<SchoolScoreTxt> schoolScoreTxtList;

    public void setSite(Site site) {
        this.site = site;
    }

    public List<SchoolScoreTxt> getSchoolScoreTxtList() {
        return schoolScoreTxtList;
    }

    public void setSchoolScoreTxtList(List<SchoolScoreTxt> schoolScoreTxtList) {
        this.schoolScoreTxtList = schoolScoreTxtList;
    }

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    /**
     * Extracts the score rows of {@code page} and publishes the school name as
     * the {@code "content"} result field.
     *
     * <p>Rows that provide fewer than {@value #SCORE_COLUMNS} cells are skipped
     * instead of failing on a missing index.
     *
     * @param page the downloaded score page
     */
    @Override
    public void process(Page page) {
        followMatchingLinks(page);

        // NOTE(review): the expression ends in "tbody/" with no final step; it
        // looks as if "tbody/tr" was intended. Left unchanged; confirm against
        // the page and the XPath engine.
        List<String> rows = page.getHtml().xpath("//div[@class='sch-SBSchoolList-attr']/div[@class='score']/table[@class='layui-table']/tbody/").all();
        String name = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='title']/text()").toString();
        // Identical for every row, so it is derived once: the part of the URL
        // between "all/all/" and the first following ".".
        String schCode = StringUtils.substringBetween(linkUrl, "all/all/", ".");

        schoolScoreTxtList = new ArrayList<>();
        for (String row : rows) {
            String[] cells = HTMLUtils.getTdText(row);
            if (cells == null || cells.length < SCORE_COLUMNS) {
                continue;
            }
            SchoolScoreTxt score = new SchoolScoreTxt();
            score.setSchCode(schCode);
            score.setSchName(name);
            score.setProvinces(cells[0]);
            score.setCourse(cells[1]);
            score.setYeard(cells[2]);
            score.setEnroll(cells[3]);
            score.setSpecialitd(cells[4]);
            score.setLined(cells[5]);
            score.setPosited(cells[6]);
            score.setControlined(cells[7]);
            schoolScoreTxtList.add(score);
        }
        page.putField("content", name);
    }

    /** Queues every link on the page that matches {@code linkUrl}, when one is configured. */
    private void followMatchingLinks(Page page) {
        String pattern = getLinkUrl();
        // An unset linkUrl used to be handed to regex(...) as null.
        if (pattern == null || pattern.trim().isEmpty()) {
            return;
        }
        List<String> links = page.getHtml().links().regex(pattern).all();
        page.addTargetRequests(links);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Crawls one sample score page with a browser User-Agent and prints the result fields. */
    public static void main(String[] args) {
        SchoolScoreTxtPageProcessor processor = new SchoolScoreTxtPageProcessor();
        String url = "http://www.creditsailing.com/school/sch_score/all/all/1443119.html";
        processor.setLinkUrl(url);
        Request request = new Request(url);
        request.setMethod("GET");
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57");
        Spider.create(processor)
                .addRequest(request)
                .addPipeline(new ConsolePipeline())
                .run();
    }
}
subjectScore
import org.apache.commons.lang3.StringUtils;
import org.jeecg.common.util.HTMLUtils;
import org.jeecg.modules.sch.entity.SchoolScoreTxt;
import org.jeecg.modules.sch.entity.SchoolSubjectScoreTxt;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import java.util.ArrayList;
import java.util.List;
/**
 * WebMagic page processor for a per-major score page of www.creditsailing.com.
 *
 * <p>Every element selected from the score table is converted into a
 * {@link SchoolSubjectScoreTxt}; the rows of the most recently processed page
 * are returned by {@link #getSubjectScoreTxts()}.
 */
public class SubjectScoreTxtPageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Number of cells read from each row (indices 0 to 6). */
    private static final int SCORE_COLUMNS = 7;

    /** Strips the brackets, "null" entries and spaces left by {@code List.toString()}. */
    private static final String LIST_NOISE_REGEX = "(?:\\[|null|\\]| +)";

    private Site site = Site.me().setDomain("www.creditsailing.com");

    /** Start URL; also used as the regular expression that selects links to follow. */
    private String linkUrl;

    /** Rows extracted by the latest {@link #process(Page)} call; {@code null} before the first call. */
    private List<SchoolSubjectScoreTxt> subjectScoreTxts;

    public void setSite(Site site) {
        this.site = site;
    }

    public List<SchoolSubjectScoreTxt> getSubjectScoreTxts() {
        return subjectScoreTxts;
    }

    public void setSubjectScoreTxts(List<SchoolSubjectScoreTxt> subjectScoreTxts) {
        this.subjectScoreTxts = subjectScoreTxts;
    }

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    /**
     * Extracts the per-major score rows of {@code page}.
     *
     * <p>Rows that provide fewer than {@value #SCORE_COLUMNS} cells are skipped.
     * The first extracted row is published as the {@code "content"} result
     * field; when no row was extracted the field is left unset instead of
     * failing on an empty list.
     *
     * @param page the downloaded score page
     */
    @Override
    public void process(Page page) {
        followMatchingLinks(page);

        // NOTE(review): the expression ends in "tbody/" with no final step; it
        // looks as if "tbody/tr" was intended. Left unchanged; confirm against
        // the page and the XPath engine.
        List<String> rows = page.getHtml().xpath("//div[@class='sch-SBSchoolList-attr']/div[@class='score']/table[@class='layui-table']/tbody/").all();
        String name = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='title']/text()").toString();
        String href = page.getHtml().xpath("//div[@class='co_left fl_l']/ul[@class='school-detail-nav']/li/a/@href").toString();
        List<String> res = page.getHtml().xpath("//div[@id='sch_types']/a/@href").all();

        // Both values are identical for every row, so they are derived once.
        // The school code is the part of the first navigation link between
        // "ool/" and the first following "." (null when there is no link).
        String schCode = StringUtils.substringBetween(href, "ool/", ".");
        String remarks = res.toString().replaceAll(LIST_NOISE_REGEX, "");

        subjectScoreTxts = new ArrayList<>();
        for (String row : rows) {
            String[] cells = HTMLUtils.getTdText(row);
            if (cells == null || cells.length < SCORE_COLUMNS) {
                continue;
            }
            SchoolSubjectScoreTxt score = new SchoolSubjectScoreTxt();
            score.setSchCode(schCode);
            score.setSchName(name);
            score.setProvinces(cells[0]);
            score.setYeard(cells[1]);
            score.setCourse(cells[2]);
            score.setEnroll(cells[3]);
            score.setSpecialitd(cells[4]);
            score.setMinLined(cells[5]);
            score.setMinPosited(cells[6]);
            score.setRemarks(remarks);
            subjectScoreTxts.add(score);
        }

        // get(0) on an empty list used to abort pages without score rows.
        if (!subjectScoreTxts.isEmpty()) {
            page.putField("content", subjectScoreTxts.get(0));
        }
    }

    /** Queues every link on the page that matches {@code linkUrl}, when one is configured. */
    private void followMatchingLinks(Page page) {
        String pattern = getLinkUrl();
        // An unset linkUrl used to be handed to regex(...) as null.
        if (pattern == null || pattern.trim().isEmpty()) {
            return;
        }
        List<String> links = page.getHtml().links().regex(pattern).all();
        page.addTargetRequests(links);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Crawls one sample per-major score page with a browser User-Agent and prints the result fields. */
    public static void main(String[] args) {
        SubjectScoreTxtPageProcessor processor = new SubjectScoreTxtPageProcessor();
        String url = "http://www.creditsailing.com/school/major_score/all/2022/2792339.html";
        processor.setLinkUrl(url);
        Request request = new Request(url);
        request.setMethod("GET");
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57");
        Spider.create(processor)
                .addRequest(request)
                .addPipeline(new ConsolePipeline())
                .run();
    }
}