//在gecco的使用文档里没有找到相关说明,看了源码才发现这个问题,希望能给刚入门的人带来一点帮助
/**
 * Spring-side setup that exposes a Gecco crawler engine as a bean.
 *
 * <p>Component scanning covers {@code com.gfound.data} and
 * {@code com.geccocrawler.gecco.spring}.
 *
 * <p>NOTE(review): this class carries no {@code @Configuration} annotation;
 * confirm how it is registered with the Spring context.
 */
@ComponentScan(basePackages = { "com.gfound.data", "com.geccocrawler.gecco.spring" })
public class GeccoConfig {

    /**
     * Creates the crawler engine bean.
     *
     * <p>The returned engine's {@code init()} builds a single GET request for
     * {@code http://www.baidu.com}, sets the request charset explicitly, and
     * hands the request list to a freshly created {@code GeccoEngine}.
     *
     * @return an anonymous {@code SpringGeccoEngine} whose {@code init()} starts the crawl
     */
    @Bean
    public SpringGeccoEngine job1() {
        return new SpringGeccoEngine() {
            @Override
            public void init() {
                List<HttpRequest> requests = Lists.newArrayList();

                HttpRequest request = new HttpGetRequest("http://www.baidu.com");
                // Set the charset used when reading the page data. It has to match
                // the page's real encoding (e.g. gb2312 vs utf-8), otherwise the
                // extracted text comes out garbled.
                request.setCharset("utf-8");
                requests.add(request);

                // springPipelineFactory is not declared in this block; presumably it is
                // provided by SpringGeccoEngine — verify against the Gecco source.
                GeccoEngine.create()
                        .pipelineFactory(springPipelineFactory)
                        .classpath("com.gfound.data")
                        .start(requests)   // registers the initial requests
                        .loop(false)       // presumably a single pass, no re-crawl — TODO confirm
                        .start();          // no-arg start() launches the engine
            }
        };
    }
}
产生乱码的原因在于:页面本身的编码格式与代码读取页面内容时使用的编码格式不一致.
例如:页面的编码格式是gb2312,而代码获取内容时使用的编码格式是utf-8.由于两者不一致,导致乱码.
有人会说,可以将utf-8编码再转成gb2312就可以了.事实上,并不是这样的:内容一旦按错误的编码解码,部分字节信息就已经丢失,事后再转码也无法还原.正确的做法是在发起请求时就通过setCharset指定与页面一致的编码,如下所示.
/**
 * Spring configuration that exposes a Gecco crawler engine as a bean.
 *
 * <p>Component scanning covers {@code com.gfound.data} and
 * {@code com.geccocrawler.gecco.spring}.
 */
@Configuration
@ComponentScan(basePackages = { "com.gfound.data", "com.geccocrawler.gecco.spring" })
public class GeccoConfig {

    /**
     * Creates the crawler engine bean.
     *
     * <p>The returned engine's {@code init()} builds a single GET request for
     * {@code http://www.baidu.com}, sets the request charset explicitly, and
     * hands the request list to a freshly created {@code GeccoEngine}.
     *
     * @return an anonymous {@code SpringGeccoEngine} whose {@code init()} starts the crawl
     */
    @Bean
    public SpringGeccoEngine job1() {
        return new SpringGeccoEngine() {
            @Override
            public void init() {
                List<HttpRequest> requests = Lists.newArrayList();

                HttpRequest request = new HttpGetRequest("http://www.baidu.com");
                // Set the charset used when reading the page data. It has to match
                // the page's real encoding (e.g. gb2312 vs utf-8), otherwise the
                // extracted text comes out garbled.
                request.setCharset("utf-8");
                requests.add(request);

                // springPipelineFactory is not declared in this block; presumably it is
                // provided by SpringGeccoEngine — verify against the Gecco source.
                GeccoEngine.create()
                        .pipelineFactory(springPipelineFactory)
                        .classpath("com.gfound.data")
                        .start(requests)   // registers the initial requests
                        .loop(false)       // presumably a single pass, no re-crawl — TODO confirm
                        .start();          // no-arg start() launches the engine
            }
        };
    }
}