爬虫项目文本提取处理

package spider.common.annotation;

import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;

/**
 * Field-level marker carrying the selectors that {@code HtmlProcessor} reads
 * reflectively to pull a value out of a downloaded page.
 *
 * @Author lyr
 * @create 2020/6/24 0:23
 */
@Retention(RetentionPolicy.RUNTIME)// must be retained at runtime; otherwise the reflective lookup returns null and a NullPointerException follows
public @interface HtmlSelector {
    /** CSS selector; the only member without a default, so it must always be given. */
    String css();
    /** Optional regular expression; the empty default means "no regex". */
    String regex() default "";
    /** Optional XPath expression; the empty default means "no xpath". */
    String xpath() default "";
}

package spider.pojo.po;

import lombok.Data;
import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;

import java.util.List;

/**
 * Page model with a single selector-annotated field, {@code title}.
 * Lombok's {@code @Data} supplies the accessors.
 *
 * @Author lyr
 * @create 2020/6/23 19:42
 */
@Html
@Data
public class SimpleBlogHtml {
    // Elements matching ".oneline span"; the regex presumably keeps only the text
    // between the opening tag and the next tag (capture group 1) - confirm against WebMagic.
    @HtmlSelector(css = ".oneline span" ,regex = "<.*?>(.*?)<.*?>")
    private List<String> title;

}

package spider.common.bean;


import lombok.Data;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Selectable;


import java.util.Arrays;



/**
 * @Author lyr
 * @create 2020/6/23 18:50
 * 处理器
 */

@Data
@Slf4j
public class HtmlProcessor<T> implements PageProcessor {

    T data;

    public HtmlProcessor(T object) {
        this.data = object;
    }
    public HtmlProcessor(Class<T> object) {
        try {
            this.data =object.newInstance();
        } catch (InstantiationException | IllegalAccessException e) {
            e.printStackTrace();
        }
    }
    private HtmlProcessor() {

    }

    @Override
    public void process(Page page) {


        T blogHtml = this.data;
        Html html = blogHtml.getClass().getDeclaredAnnotation(Html.class);
        if(html!=null) {
            log.info("设置 字符集 {}",html.charset());
            page.setCharset(html.charset());
        }
        Arrays.stream(blogHtml.getClass().getDeclaredFields())
                .peek(field->field.setAccessible(true))
                .forEach(f->{
                    HtmlSelector selector = f.getAnnotation(HtmlSelector.class);

                    String css = selector.css();
                    String reg = selector.regex();
                    String xpath = selector.xpath();
                    Selectable select =  (page.getHtml().css(css));
                    if(StringUtils.isNotBlank(reg)) {
                        select = select.regex(reg);
                    }
                    if(StringUtils.isNotBlank(xpath)) {
                        select = select.xpath(xpath);
                    }

                    try {
                        f.set(blogHtml,select.all());
                    } catch (IllegalAccessException e) {
                        e.printStackTrace();
                    }
                });



        page.putField("html",blogHtml);



    }
    private final Site site = new Site();


    @Override
    public Site getSite() {

        Html html = this.data.getClass().getAnnotation(Html.class);
        if(html!=null) {
            site.setCharset(html.charset());

        }

        return site;
    }
}

package spider.common.bean;

import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.assertj.core.util.Files;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.*;

/**
 * Pipeline that serializes the object stored under the result key
 * {@code "html"} to JSON and writes it, UTF-8 encoded, to a file. An existing
 * file is overwritten.
 *
 * @Author lyr
 * @create 2020/6/23 19:53
 */
@Slf4j
public class DownloadPipeLine<T> implements Pipeline {

    /** Output location used by the no-argument constructor. */
    private static final String DEFAULT_PATH =
            "F:\\ss1\\testJava\\SpiderJava\\src\\main\\resources\\txt\\blog.csdn.net\\博客2.txt";

    /** File the JSON is written to. */
    private final String path;

    /** Creates a pipeline that writes to the built-in default location. */
    public DownloadPipeLine() {
        this(DEFAULT_PATH);
    }

    /**
     * Creates a pipeline that writes to the given file.
     *
     * @param path output file; missing parent directories are created on first write
     * @throws IllegalArgumentException if {@code path} is null or blank
     */
    public DownloadPipeLine(String path) {
        if (path == null || path.trim().isEmpty()) {
            throw new IllegalArgumentException("path must not be blank");
        }
        this.path = path;
    }

    /**
     * Writes the extracted object as JSON. Does nothing when the results hold
     * no {@code "html"} entry.
     *
     * @param resultItems results of the processed page
     * @param task        owning task (unused)
     * @throws UncheckedIOException if the file cannot be created or written
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        T data = resultItems.get("html");
        if (data == null) {
            // Nothing was extracted for this page; do not write a literal "null".
            return;
        }
        String json = JSONObject.toJSONString(data);

        File target = new File(path);
        File parent = target.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new UncheckedIOException(new IOException("Cannot create directory " + parent));
        }

        // PrintWriter(File, charset) truncates an existing file, so repeated
        // runs succeed instead of failing because the file is already there.
        try (PrintWriter printWriter = new PrintWriter(target, "UTF-8")) {
            printWriter.append(json);
            // PrintWriter never throws on write; its error flag has to be polled.
            if (printWriter.checkError()) {
                throw new IOException("Write failed for " + path);
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Cannot write " + path, e);
        }
    }
}

package spider;

import spider.common.bean.DownloadPipeLine;
import spider.common.bean.HtmlProcessor;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Spider;

/**
 * Entry point: crawls one blog page and hands the extracted model to the
 * download pipeline.
 *
 * @Author lyr
 * @create 2020/6/23 18:52
 */
public class Main {
    /**
     * Wires processor, start URL and pipeline together, then runs the spider
     * in the calling thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HtmlProcessor<SimpleBlogHtml> processor = new HtmlProcessor<>(SimpleBlogHtml.class);

        Spider spider = Spider.create(processor);
        spider.addUrl("https://blog.csdn.net/qq_43923045");
        spider.addPipeline(new DownloadPipeLine<SimpleBlogHtml>());
        spider.run();
    }
}


对代码再次改造升级:通过反射获取 setter 方法来注入值。下面先给出新增的 map 方法,随后是改造后的完整 HtmlProcessor。

/**
 * Converts the selection to the declared type of {@code field} and injects it
 * through the bean setter of {@code object}.
 *
 * <p>{@code List} and {@code String[]} fields receive all matches; {@code String},
 * numeric and {@code BigDecimal} fields receive the first match. Nothing is set
 * when the field type is unsupported, when there is no match, or when the
 * matched text is not a valid number for a numeric field.
 *
 * @param field      annotated field being populated
 * @param object     bean that owns the field
 * @param selectable selection produced by the field's selectors
 * @throws IntrospectionException    if the property lacks a getter or setter
 * @throws InvocationTargetException if the setter itself throws
 * @throws IllegalAccessException    if the setter is not accessible
 */
private void map(Field field, T object, Selectable selectable) throws IntrospectionException, InvocationTargetException, IllegalAccessException {
    Class<?> type = field.getType();
    Object value = null; // stays null when nothing usable was extracted

    if (type == List.class) {
        value = selectable.all();
    } else if (type == String[].class) {
        List<String> all = selectable.all();
        value = all.toArray(new String[0]);
    } else if (type == String.class) {
        value = selectable.get();
    } else if (type == Integer.class || type == int.class
            || type == Long.class || type == long.class
            || type == Double.class || type == double.class
            || type == Float.class || type == float.class
            || type == BigDecimal.class) {
        // Numeric types need explicit parsing; the first match may be missing
        // or padded with whitespace.
        String raw = selectable.get();
        String text = raw == null ? "" : raw.trim();
        if (!text.isEmpty()) {
            try {
                if (type == Integer.class || type == int.class) {
                    value = Integer.valueOf(text);
                } else if (type == Long.class || type == long.class) {
                    value = Long.valueOf(text);
                } else if (type == Double.class || type == double.class) {
                    value = Double.valueOf(text);
                } else if (type == Float.class || type == float.class) {
                    value = Float.valueOf(text);
                } else {
                    value = new BigDecimal(text);
                }
            } catch (NumberFormatException e) {
                // One malformed value must not abort extraction of the other fields.
                log.warn("Field {}: cannot parse '{}' as {}", field.getName(), text, type.getSimpleName());
            }
        }
    }

    if (value == null) {
        return;
    }
    log.info("setValue {}", value);
    // The property descriptor resolves the bean's setter for this field name.
    PropertyDescriptor prop = new PropertyDescriptor(field.getName(), object.getClass());
    prop.getWriteMethod().invoke(object, value);
}
package spider.common.bean;

import lombok.Data;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.selector.Selectable;


import java.beans.IntrospectionException;
import java.beans.PropertyDescriptor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Type;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.List;



/**
 * @Author lyr
 * @create 2020/6/23 18:50
 * 处理器
 */

@Data
@Slf4j
public class HtmlProcessor<T> implements PageProcessor {

    T data;

    public HtmlProcessor(T object) {
        this.data = object;
    }
    public HtmlProcessor(Class<T> object) {
        try {
            this.data =object.newInstance();
        } catch (InstantiationException | IllegalAccessException e) {
            e.printStackTrace();
        }
    }
    private HtmlProcessor() {

    }

    @Override
    public void process(Page page) {


        T blogHtml = this.data;
        Html html = blogHtml.getClass().getDeclaredAnnotation(Html.class);
        if(html!=null) {
            log.info("设置 字符集 {}",html.charset());
            page.setCharset(html.charset());
        }
        Arrays.stream(blogHtml.getClass().getDeclaredFields())
                .peek(field->field.setAccessible(true))
                .forEach(f->{
                    HtmlSelector selector = f.getAnnotation(HtmlSelector.class);

                    String css = selector.css();
                    String reg = selector.regex();
                    String xpath = selector.xpath();
                    Selectable select =  (page.getHtml().css(css));
                    if(StringUtils.isNotBlank(reg)) {
                        select = select.regex(reg);
                    }
                    if(StringUtils.isNotBlank(xpath)) {
                        select = select.xpath(xpath);
                    }

                    try {
                        map(f,this.data,select);
                    } catch (IntrospectionException | IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }

                });



        page.putField("html",blogHtml);



    }
    private final Site site = new Site();


    @Override
    public Site getSite() {

        Html html = this.data.getClass().getAnnotation(Html.class);
        if(html!=null) {
            site.setCharset(html.charset());

        }

        return site;
    }


    private void map(Field field, T object, Selectable selectable) throws IntrospectionException, InvocationTargetException, IllegalAccessException {
        Class<?> clazz = field.getType();

        Object value=null;//数据库列的值
        Type type = field.getType();
        if(type==Integer.class||type==int.class){ //数字类型需要特殊处理
            value=Integer.parseInt(selectable.get());
        }else if(type  ==Double.class||type==double.class){
            value= Double.parseDouble(selectable.get());
        }else if(type== List.class){
            //str all
            value=selectable.all();//字符 时间类型通过这个方法获取
        }else if(type== BigDecimal.class) {
            value = new BigDecimal(selectable.get());
        }else if(type==String.class) {
            value = selectable.get();
        }else if(type==Float.class||type==float.class) {
            value = Float.parseFloat(selectable.get());
        }else if (type==String[].class) {
            List<String> all = selectable.all();
            String[]arr = all.toArray(new String[all.size()]);
            value =arr;
        }


        if(value!=null) {
            log.info("setValue {}",value);
            PropertyDescriptor prop=new PropertyDescriptor(field.getName(), object.getClass());//创建属性描述对象
            //setter方法
            prop.getWriteMethod().invoke(object,value);//执行set方法
        }

    }


}




package spider.pojo.po;

import lombok.Data;
import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;

import java.util.List;

/**
 * Page model with three selector-annotated fields; declares the UTF-8 charset
 * through {@code @Html}. Lombok's {@code @Data} supplies the accessors.
 *
 * @Author lyr
 * @create 2020/6/24 8:37
 */
@Html(charset = "UTF-8")
@Data
public class BlogPage {
    // Text nodes of the spans matched by ".oneline span".
    @HtmlSelector(css = ".oneline span" ,xpath = "//span/text()")
    private List<String> title;
    // Single value: the regex captures what lies between the opening and closing tag.
    @HtmlSelector(css = "#uid .name",regex = "<.*?>(.*?)</.*?>")
    private String author;

    // One entry per anchor matched; the regex captures the anchor's inner content.
    @HtmlSelector(css = "#asideNewArticle .aside-content  a", regex = "<a.*?>(.*?)</a.*?>")
    private List<String>  newArticle;




}

/**
 * Entry point: crawls one blog page into a {@code BlogPage} and hands it to
 * the download pipeline.
 *
 * @Author lyr
 * @create 2020/6/23 18:52
 */
public class Main {
    /**
     * Wires processor, start URL and pipeline together, then runs the spider
     * in the calling thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HtmlProcessor<BlogPage> processor = new HtmlProcessor<>(BlogPage.class);

        Spider spider = Spider.create(processor);
        spider.addUrl("https://blog.csdn.net/qq_43923045");
        spider.addPipeline(new DownloadPipeLine<BlogPage>("F:\\妹子1\\testJava\\SpiderJava\\src\\main\\resources\\txt\\blog.csdn.net\\a.txt"));
        spider.run();
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值