WebMagic 爬虫实战

爬虫实战

Maven 引入依赖（jar 包）

	<!-- Maven dependencies used by the samples that follow. -->
	<!-- Hutool all-in-one utility bundle; presumably the source of ReUtil / HttpUtil / Console used in the sample fragment (confirm). -->
	<dependency>
		<groupId>cn.hutool</groupId>
		<artifactId>hutool-all</artifactId>
		<version>5.8.6</version>
	</dependency>
	<!-- jsoup HTML parser; the sample fragment calls Jsoup.parse(...) on the downloaded page. -->
	<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.10.2</version>
		</dependency>
		<!-- WebMagic crawler framework; the processor classes below import us.codecraft.webmagic.* from it. -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.8.0</version>
		</dependency>
		<!-- WebMagic extension module, pinned to the same version as webmagic-core. -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.8.0</version>
		</dependency>

示例


      // sf6: download one Street Fighter 6 character page and print the profile text,
      // the info items and every anchor that carries an href.

      // The href group is [^"]+ rather than a greedy .+ so the capture stops at the
      // attribute's closing quote instead of running on to the last quote on the line.
      // NOTE(review): `title` is not defined in this fragment; it must be supplied by
      // the surrounding code, and `hrefs` is not used further down.
      List<String> hrefs = ReUtil.findAll("<a.*?href=\"([^\"]+)\".*?>(.*?)</a>", title, 1);

      String link = "https://www.streetfighter.com/6/zh-hans/character/aki";
      // Raw HTML of the character page.
      String contentLink = HttpUtil.get(link);
      Document document = Jsoup.parse(contentLink);

      // Class names are copied from the live page markup; the trailing suffixes look
      // build-generated, so they may change when the site is redeployed (confirm).
      Elements detail_detail_profile = document.getElementsByClass("detail_detail__profile__text__8JGgO");
      Elements detail_info_item = document.getElementsByClass("detail_info__item__h33Dg");
      Elements lis = document.select("a[href]");

      Console.log(detail_detail_profile);
      Console.log(detail_info_item.text());
      Console.log(lis);

schInfo

import org.apache.commons.lang3.StringUtils;
import org.jeecg.modules.sch.entity.SchoolInfoTxt;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.HttpConstant;

import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
/**
 * Crawls a school list page on www.creditsailing.com.
 *
 * <p>For every list entry the school name and detail-page href are extracted, a
 * {@link SchoolInfoTxt} is created, and a nested {@link SchoolInfoTxtPageProcessor}
 * spider is run synchronously to fill that entity from the detail page. The entities
 * collected from the most recently processed page are exposed through
 * {@link #getSchoolInfoTxtList()}.
 *
 * <p>The processor keeps mutable state ({@code linkUrl} and the result list), so one
 * instance should not be shared between concurrently running spiders.
 */
@Component
public class PageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Site root used to turn relative detail-page hrefs into absolute URLs. */
    private static final String BASE_URL = "http://www.creditsailing.com/";

    /** Selects each school entry of the list page. */
    private static final String SCHOOL_ITEM_XPATH =
            "//div[@class='co_left fl_l']/div[@class='sch-SBSchoolList-list']/ul/li[@class='sch-item']";

    /** Selects the anchor text (school name) inside one list entry. */
    private static final String SCHOOL_NAME_XPATH =
            "//li[@class='sch-item']/div[@class='sch-item-text']/div[@class='sch-name']/p[@class='sch-name-detail']/a/text()";

    /** Selects the anchor href (detail page) inside one list entry. */
    private static final String SCHOOL_HREF_XPATH =
            "//li[@class='sch-item']/div[@class='sch-item-text']/div[@class='sch-name']/p[@class='sch-name-detail']/a/@href";

    /** Schools parsed from the last processed page; {@code null} until a page was processed. */
    private List<SchoolInfoTxt> schoolInfoTxtList;

    /** Start URL; also used as the regex that decides which discovered links are followed. */
    private String linkUrl;

    private Site site = Site.me().setDomain("www.creditsailing.com");

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    public List<SchoolInfoTxt> getSchoolInfoTxtList() {
        return schoolInfoTxtList;
    }

    public void setSchoolInfoTxtList(List<SchoolInfoTxt> schoolInfoTxtList) {
        this.schoolInfoTxtList = schoolInfoTxtList;
    }

    /**
     * Extracts all schools from the list page and crawls each school's detail page.
     *
     * @param page the downloaded list page
     */
    @Override
    public void process(Page page) {
        // Only follow links when a pattern is configured; a blank pattern cannot be
        // compiled into a selector and would make every page fail.
        String followPattern = getLinkUrl();
        if (StringUtils.isNotBlank(followPattern)) {
            List<String> links = page.getHtml().links().regex(followPattern).all();
            page.addTargetRequests(links);
        }

        List<String> schoolItems = page.getHtml().xpath(SCHOOL_ITEM_XPATH).all();

        // The list is replaced for every processed page, as before.
        schoolInfoTxtList = new ArrayList<>();
        for (String item : schoolItems) {
            Html itemHtml = Html.create(item);
            String name = itemHtml.xpath(SCHOOL_NAME_XPATH).toString();
            String href = itemHtml.xpath(SCHOOL_HREF_XPATH).toString();

            // The same key is written for every entry, so the pipeline sees the last name.
            page.putField("name", name);

            SchoolInfoTxt school = new SchoolInfoTxt();
            school.setName(name);
            // The school code is the part of the href between "ool/" and the next ".".
            school.setSchCode(StringUtils.substringBetween(href, "ool/", "."));

            // Without an href there is no detail page; keep the entry with name/code only
            // instead of crawling a bogus ".../null" URL.
            if (StringUtils.isNotBlank(href)) {
                crawlDetail(school, toAbsoluteUrl(href));
            }

            schoolInfoTxtList.add(school);
        }
    }

    /** Runs a blocking nested spider that fills {@code school} from its detail page. */
    private static void crawlDetail(SchoolInfoTxt school, String detailUrl) {
        SchoolInfoTxtPageProcessor detailProcessor = new SchoolInfoTxtPageProcessor();
        detailProcessor.setLinkUrl(detailUrl);
        detailProcessor.setSchoolInfoTxt(school);

        Spider.create(detailProcessor).addUrl(detailProcessor.getLinkUrl())
                .addPipeline(new ConsolePipeline()).run();
    }

    /** Resolves a site-relative href against {@link #BASE_URL}; absolute URLs pass through. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        // Drop leading slashes so the join never produces "//" after the host.
        return BASE_URL + StringUtils.stripStart(href, "/");
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the list-page URL to start from; it also serves as
     *             the link-follow regex
     */
    public static void main(String[] args) {
        // The start URL used to be read from an unset field (null); require it explicitly.
        if (args.length == 0 || StringUtils.isBlank(args[0])) {
            System.err.println("usage: PageProcessor <school-list-url>");
            return;
        }

        PageProcessor pageProcessor = new PageProcessor();
        pageProcessor.setLinkUrl(args[0]);

        Request request = new Request(pageProcessor.getLinkUrl());

        Spider.create(pageProcessor).addRequest(request)
                .addPipeline(new ConsolePipeline()).run();
        System.out.println(pageProcessor.getSchoolInfoTxtList());
    }

}

schInfoSub

import org.jeecg.common.util.HTMLUtils;
import org.jeecg.modules.sch.entity.SchoolInfoTxt;
import org.jeecg.modules.sch.service.ISchoolInfoTxtService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

import javax.annotation.Resource;
import java.util.List;

/**
 * Crawls one school detail page on www.creditsailing.com and copies the extracted
 * values both into the page's result fields and into a {@link SchoolInfoTxt} entity.
 *
 * <p>The entity can be supplied up front with {@link #setSchoolInfoTxt(SchoolInfoTxt)};
 * if none was set, one is created while processing. Values that are missing from the
 * page are stored as {@code null} rather than aborting the page.
 */
@Component
public class SchoolInfoTxtPageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Strips list brackets, the literal "null" and runs of spaces from a List#toString() value. */
    private static final String LIST_NOISE_REGEX = "(?:\\[|null|\\]| +)";

    private Site site = Site.me().setDomain("www.creditsailing.com");

    /** Detail-page URL; also used as the regex that decides which links are followed. */
    private String linkUrl;

    /** Entity filled by {@link #process(Page)}. */
    private SchoolInfoTxt schoolInfoTxt;

    public SchoolInfoTxt getSchoolInfoTxt() {
        return schoolInfoTxt;
    }

    public void setSchoolInfoTxt(SchoolInfoTxt schoolInfoTxt) {
        this.schoolInfoTxt = schoolInfoTxt;
    }

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    /**
     * Extracts the school's description, logo, contact, attribute list and labels.
     *
     * @param page the downloaded detail page
     */
    @Override
    public void process(Page page) {
        // Only follow links when a pattern is configured.
        String followPattern = getLinkUrl();
        if (followPattern != null && !followPattern.trim().isEmpty()) {
            List<String> links = page.getHtml().links().regex(followPattern).all();
            page.addTargetRequests(links);
        }

        String details = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='sch-jieshao']/div[@class='info-detail']/html()").toString();
        String contact = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='contact']/span/text()").toString();
        String schLogo = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='sch-logo']/img/@src").toString();
        String slabs = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='sch-jieshao']/div[@class='college-service mt25']/ul//html()").toString();
        List<String> spans = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='label']/span/text()").all();
        List<String> remarks = page.getHtml().xpath("//div[@class='co_left fl_l']/ul[@class='school-detail-nav']/li/a/@href").all();

        // The attribute list is flattened to text and read positionally, one value per line:
        // 0 creation, 1 type, 2 classify, 3 belong, 4 education, 5 city/area.
        String[] rels = splitLines(slabs);

        // Computed once; it is needed for both the result field and the entity.
        String famous = HTMLUtils.intersectionForList_3(spans, HTMLUtils.getForList_2())
                .toString().replaceAll(LIST_NOISE_REGEX, "");

        page.putField("remarks", remarks);
        // NOTE(review): details may be null when the block is absent; null handling is
        // left to HTMLUtils.getInnerText, exactly as before.
        page.putField("details", HTMLUtils.getInnerText(details));
        page.putField("schLogo", schLogo);
        page.putField("creation", fieldAt(rels, 0));
        page.putField("type", fieldAt(rels, 1));
        page.putField("classify", fieldAt(rels, 2));
        page.putField("belong", fieldAt(rels, 3));
        page.putField("education", fieldAt(rels, 4));
        page.putField("cityName", fieldAt(rels, 5));
        page.putField("contact", contact);
        page.putField("spans", famous);

        if (schoolInfoTxt == null) {
            schoolInfoTxt = new SchoolInfoTxt();
        }

        schoolInfoTxt.setThumb(schLogo);
        schoolInfoTxt.setDetails(details);
        schoolInfoTxt.setCreation(trimOrNull(fieldAt(rels, 0)));
        schoolInfoTxt.setType(trimOrNull(fieldAt(rels, 1)));
        schoolInfoTxt.setClassify(trimOrNull(fieldAt(rels, 2)));
        schoolInfoTxt.setBelong(trimOrNull(fieldAt(rels, 3)));
        schoolInfoTxt.setEducation(trimOrNull(fieldAt(rels, 4)));
        schoolInfoTxt.setArea(trimOrNull(fieldAt(rels, 5)));
        schoolInfoTxt.setFamous(famous);
        schoolInfoTxt.setRemarks(remarks.toString().replaceAll(LIST_NOISE_REGEX, ""));
        // Drop the "admissions phone" caption so only the number remains.
        schoolInfoTxt.setContact(contact == null ? null : contact.replaceAll("招生电话:", ""));
    }

    /** Converts the attribute-list HTML to text and splits it per line; empty array if absent. */
    private static String[] splitLines(String html) {
        if (html == null) {
            return new String[0];
        }
        String text = HTMLUtils.getInnerText(html);
        return text == null ? new String[0] : text.split("\n");
    }

    /** Returns {@code parts[index]}, or {@code null} when the page supplied fewer values. */
    private static String fieldAt(String[] parts, int index) {
        return index < parts.length ? parts[index] : null;
    }

    /** Null-safe {@link String#trim()}. */
    private static String trimOrNull(String value) {
        return value == null ? null : value.trim();
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Demo entry point: crawls one fixed school detail page and prints the result fields. */
    public static void main(String[] args) {
        SchoolInfoTxtPageProcessor processor = new SchoolInfoTxtPageProcessor();
        processor.setLinkUrl("http://www.creditsailing.com/school/1438920.html");

        Spider.create(processor).addUrl(processor.getLinkUrl())
                .addPipeline(new ConsolePipeline()).run();
    }

}

schScore

import org.apache.commons.lang3.StringUtils;
import org.jeecg.common.util.HTMLUtils;

import org.jeecg.modules.sch.entity.SchoolScoreTxt;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawls a school's admission-score page on www.creditsailing.com and turns each
 * score-table row into a {@link SchoolScoreTxt}.
 *
 * <p>The rows parsed from the most recently processed page are available through
 * {@link #getSchoolScoreTxtList()}.
 */
public class SchoolScoreTxtPageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Highest cell index read from a row is 7, so a usable row has at least 8 cells. */
    private static final int REQUIRED_CELLS = 8;

    private Site site = Site.me().setDomain("www.creditsailing.com");

    /** Score-page URL; also used as the link-follow regex and as the source of the school code. */
    private String linkUrl;

    /** Rows parsed from the last processed page; {@code null} until a page was processed. */
    private List<SchoolScoreTxt> schoolScoreTxtList;

    public void setSite(Site site) {
        this.site = site;
    }

    public List<SchoolScoreTxt> getSchoolScoreTxtList() {
        return schoolScoreTxtList;
    }

    public void setSchoolScoreTxtList(List<SchoolScoreTxt> schoolScoreTxtList) {
        this.schoolScoreTxtList = schoolScoreTxtList;
    }

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    /**
     * Parses the score table of the page into {@link SchoolScoreTxt} rows.
     *
     * @param page the downloaded score page
     */
    @Override
    public void process(Page page) {
        // Only follow links when a pattern is configured.
        String followPattern = getLinkUrl();
        if (StringUtils.isNotBlank(followPattern)) {
            List<String> links = page.getHtml().links().regex(followPattern).all();
            page.addTargetRequests(links);
        }

        // NOTE(review): the expression ends in "tbody/"; presumably this yields the
        // table rows one by one - confirm against the XPath engine in use.
        List<String> rows = page.getHtml().xpath("//div[@class='sch-SBSchoolList-attr']/div[@class='score']/table[@class='layui-table']/tbody/").all();
        String name = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='title']/text()").toString();

        // Same for every row: the URL part between "all/all/" and the next ".".
        String schCode = StringUtils.substringBetween(linkUrl, "all/all/", ".");

        schoolScoreTxtList = new ArrayList<>();
        for (String row : rows) {
            String[] tdText = HTMLUtils.getTdText(row);
            // Skip header or malformed rows instead of failing the whole page on a
            // missing cell.
            if (tdText == null || tdText.length < REQUIRED_CELLS) {
                continue;
            }

            SchoolScoreTxt score = new SchoolScoreTxt();
            score.setSchCode(schCode);
            score.setSchName(name);
            score.setProvinces(tdText[0]);
            score.setCourse(tdText[1]);
            score.setYeard(tdText[2]);
            score.setEnroll(tdText[3]);
            score.setSpecialitd(tdText[4]);
            score.setLined(tdText[5]);
            score.setPosited(tdText[6]);
            score.setControlined(tdText[7]);

            schoolScoreTxtList.add(score);
        }
        page.putField("content", name);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Demo entry point: crawls one fixed score page with a browser User-Agent header. */
    public static void main(String[] args) {
        SchoolScoreTxtPageProcessor processor = new SchoolScoreTxtPageProcessor();
        String url = "http://www.creditsailing.com/school/sch_score/all/all/1443119.html";
        processor.setLinkUrl(url);

        Request request = new Request(url);
        request.setMethod("GET");
        request.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57");

        Spider.create(processor).addRequest(request)
                .addPipeline(new ConsolePipeline()).run();
    }

}

subjectScore

import org.apache.commons.lang3.StringUtils;
import org.jeecg.common.util.HTMLUtils;
import org.jeecg.modules.sch.entity.SchoolScoreTxt;
import org.jeecg.modules.sch.entity.SchoolSubjectScoreTxt;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawls a school's per-major score page on www.creditsailing.com and turns each
 * score-table row into a {@link SchoolSubjectScoreTxt}.
 *
 * <p>The rows parsed from the most recently processed page are available through
 * {@link #getSubjectScoreTxts()}.
 */
public class SubjectScoreTxtPageProcessor implements us.codecraft.webmagic.processor.PageProcessor {

    /** Highest cell index read from a row is 6, so a usable row has at least 7 cells. */
    private static final int REQUIRED_CELLS = 7;

    /** Strips list brackets, the literal "null" and runs of spaces from a List#toString() value. */
    private static final String LIST_NOISE_REGEX = "(?:\\[|null|\\]| +)";

    private Site site = Site.me().setDomain("www.creditsailing.com");

    /** Score-page URL; also used as the regex that decides which links are followed. */
    private String linkUrl;

    /** Rows parsed from the last processed page; {@code null} until a page was processed. */
    private List<SchoolSubjectScoreTxt> subjectScoreTxts;

    public void setSite(Site site) {
        this.site = site;
    }

    public List<SchoolSubjectScoreTxt> getSubjectScoreTxts() {
        return subjectScoreTxts;
    }

    public void setSubjectScoreTxts(List<SchoolSubjectScoreTxt> subjectScoreTxts) {
        this.subjectScoreTxts = subjectScoreTxts;
    }

    public String getLinkUrl() {
        return linkUrl;
    }

    public void setLinkUrl(String linkUrl) {
        this.linkUrl = linkUrl;
    }

    /**
     * Parses the per-major score table of the page into {@link SchoolSubjectScoreTxt} rows
     * and publishes the first row as the "content" result field when at least one row exists.
     *
     * @param page the downloaded score page
     */
    @Override
    public void process(Page page) {
        // Only follow links when a pattern is configured.
        String followPattern = getLinkUrl();
        if (StringUtils.isNotBlank(followPattern)) {
            List<String> links = page.getHtml().links().regex(followPattern).all();
            page.addTargetRequests(links);
        }

        // NOTE(review): the expression ends in "tbody/"; presumably this yields the
        // table rows one by one - confirm against the XPath engine in use.
        List<String> rows = page.getHtml().xpath("//div[@class='sch-SBSchoolList-attr']/div[@class='score']/table[@class='layui-table']/tbody/").all();
        String name = page.getHtml().xpath("//div[@class='co_left fl_l']/div[@class='schlog']/div[@class='c-info']/p[@class='title']/text()").toString();
        String href = page.getHtml().xpath("//div[@class='co_left fl_l']/ul[@class='school-detail-nav']/li/a/@href").toString();
        List<String> res = page.getHtml().xpath("//div[@id='sch_types']/a/@href").all();

        // Both values are identical for every row, so build them once.
        String schCode = StringUtils.substringBetween(href, "ool/", ".");
        String remarks = res.toString().replaceAll(LIST_NOISE_REGEX, "");

        subjectScoreTxts = new ArrayList<>();
        for (String row : rows) {
            String[] tdText = HTMLUtils.getTdText(row);
            // Skip header or malformed rows instead of failing the whole page on a
            // missing cell.
            if (tdText == null || tdText.length < REQUIRED_CELLS) {
                continue;
            }

            SchoolSubjectScoreTxt subjectScoreTxt = new SchoolSubjectScoreTxt();
            subjectScoreTxt.setSchCode(schCode);
            subjectScoreTxt.setSchName(name);
            subjectScoreTxt.setProvinces(tdText[0]);
            subjectScoreTxt.setYeard(tdText[1]);
            subjectScoreTxt.setCourse(tdText[2]);
            subjectScoreTxt.setEnroll(tdText[3]);
            subjectScoreTxt.setSpecialitd(tdText[4]);
            subjectScoreTxt.setMinLined(tdText[5]);
            subjectScoreTxt.setMinPosited(tdText[6]);
            subjectScoreTxt.setRemarks(remarks);
            subjectScoreTxts.add(subjectScoreTxt);
        }

        // A page without usable rows used to throw IndexOutOfBoundsException here.
        if (!subjectScoreTxts.isEmpty()) {
            page.putField("content", subjectScoreTxts.get(0));
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Demo entry point: crawls one fixed per-major score page with a browser User-Agent header. */
    public static void main(String[] args) {
        SubjectScoreTxtPageProcessor processor = new SubjectScoreTxtPageProcessor();
        String url = "http://www.creditsailing.com/school/major_score/all/2022/2792339.html";
        processor.setLinkUrl(url);

        Request request = new Request(url);
        request.setMethod("GET");
        request.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57");

        Spider.create(processor).addRequest(request)
                .addPipeline(new ConsolePipeline()).run();
    }

}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值