package spider.common.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Declares how the value of a field is extracted from a downloaded HTML page.
 *
 * <p>The CSS selector is applied first; the optional regular expression and XPath
 * expression are then applied, in that order, to the CSS result whenever they are
 * not blank.
 *
 * <p>Retention must be {@link RetentionPolicy#RUNTIME}: the annotation is looked up
 * reflectively on each field, and with any other retention that lookup returns
 * {@code null}, which ends in a {@link NullPointerException} in code that
 * dereferences the result.
 *
 * <p>The annotation is restricted to fields because only fields are inspected for it.
 *
 * @author lyr
 */
@Target(ElementType.FIELD)
@Retention(RetentionPolicy.RUNTIME)
public @interface HtmlSelector {

    /** CSS selector that picks the elements to extract; always required. */
    String css();

    /** Optional regular expression applied to the CSS result; blank means "not used". */
    String regex() default "";

    /** Optional XPath expression applied after the regex step; blank means "not used". */
    String xpath() default "";
}
package spider.pojo.po;
import lombok.Data;
import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;
import java.util.List;
/**
 * Extraction template with a single list-valued field, {@code title}.
 *
 * <p>Marked with {@code @Html} and populated through the {@code @HtmlSelector}
 * declaration on its field; accessors are generated by Lombok's {@code @Data}.
 *
 * @author lyr
 */
@Data
@Html
public class SimpleBlogHtml {

    /**
     * Values selected by the CSS selector {@code .oneline span} and then reduced by
     * the regular expression to the text captured between the surrounding tags.
     */
    @HtmlSelector(
            css = ".oneline span",
            regex = "<.*?>(.*?)<.*?>"
    )
    private List<String> title;
}
package spider.common.bean;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Arrays;
/**
* @Author lyr
* @create 2020/6/23 18:50
* 处理器
*/
@Data
@Slf4j
public class HtmlProcessor<T> implements PageProcessor {
T data;
public HtmlProcessor(T object) {
this.data = object;
}
public HtmlProcessor(Class<T> object) {
try {
this.data =object.newInstance();
} catch (InstantiationException | IllegalAccessException e) {
e.printStackTrace();
}
}
private HtmlProcessor() {
}
@Override
public void process(Page page) {
T blogHtml = this.data;
Html html = blogHtml.getClass().getDeclaredAnnotation(Html.class);
if(html!=null) {
log.info("设置 字符集 {}",html.charset());
page.setCharset(html.charset());
}
Arrays.stream(blogHtml.getClass().getDeclaredFields())
.peek(field->field.setAccessible(true))
.forEach(f->{
HtmlSelector selector = f.getAnnotation(HtmlSelector.class);
String css = selector.css();
String reg = selector.regex();
String xpath = selector.xpath();
Selectable select = (page.getHtml().css(css));
if(StringUtils.isNotBlank(reg)) {
select = select.regex(reg);
}
if(StringUtils.isNotBlank(xpath)) {
select = select.xpath(xpath);
}
try {
f.set(blogHtml,select.all());
} catch (IllegalAccessException e) {
e.printStackTrace();
}
});
page.putField("html",blogHtml);
}
private final Site site = new Site();
@Override
public Site getSite() {
Html html = this.data.getClass().getAnnotation(Html.class);
if(html!=null) {
site.setCharset(html.charset());
}
return site;
}
}
package spider.common.bean;
import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.assertj.core.util.Files;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.io.*;
/**
* @Author lyr
* @create 2020/6/23 19:53
*/
@Slf4j
public class DownloadPipeLine<T> implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
T data = resultItems.get("html");
String json = JSONObject.toJSONString(data);
String path = "F:\\ss1\\testJava\\SpiderJava\\src\\main\\resources\\txt\\blog.csdn.net\\博客2.txt";
try(
PrintWriter printWriter = new PrintWriter(Files.newFile(path));
) {
printWriter.append(json);
} catch (IOException e) {
e.printStackTrace();
}
}
}
package spider;
import spider.common.bean.DownloadPipeLine;
import spider.common.bean.HtmlProcessor;
import spider.pojo.po.SimpleBlogHtml;
import us.codecraft.webmagic.Spider;
/**
 * Entry point: crawls one blog page, extracts it into a {@code SimpleBlogHtml} and
 * hands the result to a {@code DownloadPipeLine}.
 *
 * @author lyr
 */
public class Main {

    /**
     * Builds the spider and runs it in the calling thread.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        HtmlProcessor<SimpleBlogHtml> processor = new HtmlProcessor<>(SimpleBlogHtml.class);
        DownloadPipeLine<SimpleBlogHtml> pipeline = new DownloadPipeLine<SimpleBlogHtml>();

        Spider.create(processor)
                .addUrl("https://blog.csdn.net/qq_43923045")
                .addPipeline(pipeline)
                .run();
    }
}
// Second iteration of the code: values are now injected by looking up each field's setter method via reflection.
/**
 * Converts the extracted content to the declared type of {@code field} and injects
 * it into {@code object} through the field's JavaBean setter.
 *
 * <p>Supported field types: {@code List} and {@code String[]} (all matches),
 * {@code String}, {@code int}/{@code Integer}, {@code double}/{@code Double},
 * {@code float}/{@code Float} and {@code BigDecimal} (first match). Fields of any
 * other type are left untouched, as are fields for which nothing usable was
 * extracted (no match, blank text for a numeric type, or text that is not a number).
 *
 * @param field      field to populate
 * @param object     bean that owns the field
 * @param selectable extraction result for this field
 * @throws IntrospectionException    if the bean does not expose standard accessors
 *                                   for the field
 * @throws InvocationTargetException if the setter itself throws
 * @throws IllegalAccessException    if the setter is not accessible
 */
private void map(Field field, T object, Selectable selectable) throws IntrospectionException, InvocationTargetException, IllegalAccessException {
    Class<?> type = field.getType();
    Object value;
    if (type == List.class) {
        value = selectable.all();
    } else if (type == String[].class) {
        List<String> all = selectable.all();
        value = (all == null) ? null : all.toArray(new String[0]);
    } else {
        String text = selectable.get();
        try {
            value = parseScalar(type, text);
        } catch (NumberFormatException e) {
            // Page content is external input: one malformed number must not abort
            // the processing of the remaining fields.
            log.warn("Cannot convert '{}' to {} for field {}",
                    text, type.getSimpleName(), field.getName());
            return;
        }
    }
    if (value == null) {
        return;
    }

    log.info("setValue {}",value);
    PropertyDescriptor prop = new PropertyDescriptor(field.getName(), object.getClass());
    java.lang.reflect.Method setter = prop.getWriteMethod();
    if (setter == null) {
        // Defensive: a descriptor is allowed to report no write method.
        log.warn("No setter found for field {} of {}",
                field.getName(), object.getClass().getName());
        return;
    }
    setter.invoke(object, value);
}

/**
 * Parses a single extracted text into one of the supported scalar types.
 *
 * @param type target type
 * @param text extracted text, may be {@code null}
 * @return the converted value, or {@code null} when there is nothing to convert or
 *         the type is not supported
 * @throws NumberFormatException if a numeric type is requested and the text is not
 *                               a valid number
 */
private static Object parseScalar(Class<?> type, String text) {
    if (type == String.class) {
        return text;
    }
    if (text == null || text.trim().isEmpty()) {
        return null;
    }
    String number = text.trim();
    if (type == Integer.class || type == int.class) {
        return Integer.valueOf(number);
    }
    if (type == Double.class || type == double.class) {
        return Double.valueOf(number);
    }
    if (type == Float.class || type == float.class) {
        return Float.valueOf(number);
    }
    if (type == BigDecimal.class) {
        return new BigDecimal(number);
    }
    return null;
}
package spider.common.bean;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.beans.IntrospectionException;
import java.beans.PropertyDescriptor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Type;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.List;
/**
* @Author lyr
* @create 2020/6/23 18:50
* 处理器
*/
@Data
@Slf4j
public class HtmlProcessor<T> implements PageProcessor {
T data;
public HtmlProcessor(T object) {
this.data = object;
}
public HtmlProcessor(Class<T> object) {
try {
this.data =object.newInstance();
} catch (InstantiationException | IllegalAccessException e) {
e.printStackTrace();
}
}
private HtmlProcessor() {
}
@Override
public void process(Page page) {
T blogHtml = this.data;
Html html = blogHtml.getClass().getDeclaredAnnotation(Html.class);
if(html!=null) {
log.info("设置 字符集 {}",html.charset());
page.setCharset(html.charset());
}
Arrays.stream(blogHtml.getClass().getDeclaredFields())
.peek(field->field.setAccessible(true))
.forEach(f->{
HtmlSelector selector = f.getAnnotation(HtmlSelector.class);
String css = selector.css();
String reg = selector.regex();
String xpath = selector.xpath();
Selectable select = (page.getHtml().css(css));
if(StringUtils.isNotBlank(reg)) {
select = select.regex(reg);
}
if(StringUtils.isNotBlank(xpath)) {
select = select.xpath(xpath);
}
try {
map(f,this.data,select);
} catch (IntrospectionException | IllegalAccessException | InvocationTargetException e) {
e.printStackTrace();
}
});
page.putField("html",blogHtml);
}
private final Site site = new Site();
@Override
public Site getSite() {
Html html = this.data.getClass().getAnnotation(Html.class);
if(html!=null) {
site.setCharset(html.charset());
}
return site;
}
private void map(Field field, T object, Selectable selectable) throws IntrospectionException, InvocationTargetException, IllegalAccessException {
Class<?> clazz = field.getType();
Object value=null;//数据库列的值
Type type = field.getType();
if(type==Integer.class||type==int.class){ //数字类型需要特殊处理
value=Integer.parseInt(selectable.get());
}else if(type ==Double.class||type==double.class){
value= Double.parseDouble(selectable.get());
}else if(type== List.class){
//str all
value=selectable.all();//字符 时间类型通过这个方法获取
}else if(type== BigDecimal.class) {
value = new BigDecimal(selectable.get());
}else if(type==String.class) {
value = selectable.get();
}else if(type==Float.class||type==float.class) {
value = Float.parseFloat(selectable.get());
}else if (type==String[].class) {
List<String> all = selectable.all();
String[]arr = all.toArray(new String[all.size()]);
value =arr;
}
if(value!=null) {
log.info("setValue {}",value);
PropertyDescriptor prop=new PropertyDescriptor(field.getName(), object.getClass());//创建属性描述对象
//setter方法
prop.getWriteMethod().invoke(object,value);//执行set方法
}
}
}
package spider.pojo.po;
import lombok.Data;
import spider.common.annotation.Html;
import spider.common.annotation.HtmlSelector;
import java.util.List;
/**
 * Extraction template with three fields: {@code title}, {@code author} and
 * {@code newArticle}.
 *
 * <p>Declares the charset {@code UTF-8} through {@code @Html}; each field states its
 * own selectors through {@code @HtmlSelector}. Accessors are generated by Lombok's
 * {@code @Data}.
 *
 * @author lyr
 */
@Data
@Html(charset = "UTF-8")
public class BlogPage {

    /** Text nodes of the spans selected by {@code .oneline span}. */
    @HtmlSelector(
            css = ".oneline span",
            xpath = "//span/text()"
    )
    private List<String> title;

    /** Content of the element selected by {@code #uid .name}, with the enclosing tag removed by the regex. */
    @HtmlSelector(
            css = "#uid .name",
            regex = "<.*?>(.*?)</.*?>"
    )
    private String author;

    /** Link texts of the anchors selected by {@code #asideNewArticle .aside-content a}. */
    @HtmlSelector(
            css = "#asideNewArticle .aside-content a",
            regex = "<a.*?>(.*?)</a.*?>"
    )
    private List<String> newArticle;
}
/**
 * Entry point: crawls one blog page, extracts it into a {@code BlogPage} and hands
 * the result to a {@code DownloadPipeLine} configured with an explicit output file.
 *
 * @author lyr
 */
public class Main {

    /**
     * Builds the spider and runs it in the calling thread.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        HtmlProcessor<BlogPage> processor = new HtmlProcessor<>(BlogPage.class);
        DownloadPipeLine<BlogPage> pipeline = new DownloadPipeLine<BlogPage>(
                "F:\\妹子1\\testJava\\SpiderJava\\src\\main\\resources\\txt\\blog.csdn.net\\a.txt");

        Spider.create(processor)
                .addUrl("https://blog.csdn.net/qq_43923045")
                .addPipeline(pipeline)
                .run();
    }
}