package com.cwkj.qaback.webMagic;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.cwkj.qaback.entity.Webmagic;
import com.cwkj.qaback.entity.WebmagicConfigure;
import com.cwkj.qaback.service.WebmagicService;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.CookieManager;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.thinkgem.jeesite.common.utils.StringUtils;
/**
 * Crawls the notice list pages under {@code http://www.fjqi.gov.cn/xxgk/tzgg/}
 * and saves the title and body text of every linked detail page through the
 * injected {@code WebmagicService}.
 *
 * <p>List pages are loaded with HtmlUnit; detail pages are fetched directly
 * with jsoup. URLs and CSS selectors are hard-coded in this class.
 *
 * @author Administrator
 */
public class Htmlunit {

    /** Base URL of the notice section; it is also the URL of the first list page. */
    private static final String MAIN_URL = "http://www.fjqi.gov.cn/xxgk/tzgg/";

    /** Highest index of the follow-up list pages ({@code index_1.htm} .. {@code index_3.htm}). */
    private static final int LAST_LIST_PAGE = 3;

    /** Matches links whose href starts with "./" and ends with ".htm". */
    private static final String DETAIL_LINK_SELECTOR = "a[href^=./][href$=.htm]";

    /** Length of the "./" prefix stripped from relative detail links. */
    private static final int RELATIVE_PREFIX_LENGTH = 2;

    private final WebmagicService webmagicService;

    /** Stored for callers; not read by the current implementation. */
    private final WebmagicConfigure webmagicConfigure;

    /** Stored for callers; not read by the current implementation. */
    private final String key;

    /**
     * Creates a crawler.
     *
     * @param _webmagicService   service used to save every crawled article
     * @param _webmagicConfigure crawl configuration (currently only stored)
     * @param _key               caller-supplied key (currently only stored)
     */
    public Htmlunit(WebmagicService _webmagicService, WebmagicConfigure _webmagicConfigure, String _key) {
        this.webmagicService = _webmagicService;
        this.key = _key;
        this.webmagicConfigure = _webmagicConfigure;
    }

    /**
     * Crawls the first list page plus {@code index_1.htm} .. {@code index_3.htm}
     * and saves every detail page they link to.
     *
     * <p>A failure on one list page or one detail page is reported on stderr and
     * the crawl continues with the next page, so a single bad page no longer
     * discards the rest of the run. This method does not propagate exceptions.
     */
    public void start() {
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            configure(webClient);
            for (int pageIndex = 0; pageIndex <= LAST_LIST_PAGE; pageIndex++) {
                String listUrl = listPageUrl(pageIndex);
                try {
                    crawlListPage(webClient, listUrl);
                } catch (Exception e) {
                    // Keep going: the other list pages are independent of this one.
                    System.err.println("Failed to crawl list page " + listUrl);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            // Boundary catch so that start() keeps its no-throw behaviour.
            e.printStackTrace();
        } finally {
            webClient.close();
        }
    }

    /** Applies the fixed client options used for every list-page request. */
    private void configure(WebClient webClient) {
        // JavaScript execution is turned off; pages are parsed as served.
        webClient.getOptions().setJavaScriptEnabled(false);
        // CSS is not needed for link extraction.
        webClient.getOptions().setCssEnabled(false);
        // Do not throw when a script on the page fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Request timeout passed to HtmlUnit.
        webClient.getOptions().setTimeout(3000);
        // Follow HTTP redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // Enable cookie handling with a fresh cookie store.
        webClient.setCookieManager(new CookieManager());
        webClient.getCookieManager().setCookiesEnabled(true);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    }

    /** Returns the URL of the list page with the given index (0 is the section root). */
    private static String listPageUrl(int pageIndex) {
        if (pageIndex == 0) {
            return MAIN_URL;
        }
        return MAIN_URL + "index_" + pageIndex + ".htm";
    }

    /** Loads one list page and saves every detail page it links to. */
    private void crawlListPage(WebClient webClient, String listUrl) throws Exception {
        HtmlPage page = webClient.getPage(listUrl);
        // NOTE(review): with JavaScript disabled there should be nothing to wait
        // for; the call is kept so behaviour is unchanged if JS is re-enabled.
        webClient.waitForBackgroundJavaScript(3000);
        Document doc = Jsoup.parse(page.asXml());

        for (String detailUrl : collectDetailUrls(doc)) {
            try {
                fetchAndSaveDetail(detailUrl);
            } catch (Exception e) {
                // One broken article must not stop the remaining ones.
                System.err.println("Failed to crawl detail page " + detailUrl);
                e.printStackTrace();
            }
        }
    }

    /** Extracts the relative "./xxx.htm" links of a list page as absolute URLs, in document order. */
    private static List<String> collectDetailUrls(Document doc) {
        List<String> detailUrls = new ArrayList<String>();
        for (Element link : doc.select(DETAIL_LINK_SELECTOR)) {
            String href = link.attr("href");
            if (StringUtils.isNotEmpty(href)) {
                // The selector guarantees the "./" prefix, which is dropped here.
                detailUrls.add(MAIN_URL + href.substring(RELATIVE_PREFIX_LENGTH));
            }
        }
        return detailUrls;
    }

    /**
     * Downloads one detail page, prints its title and content to stdout and
     * saves both through {@code webmagicService}.
     *
     * @param href absolute URL of the detail page
     * @throws Exception if the URL is malformed, the download fails or saving fails
     */
    private void fetchAndSaveDetail(String href) throws Exception {
        Document doc = Jsoup.parse(new URL(href), 100000);

        // Title: inner HTML of the h1 element(s) inside div.xl_content.
        String title = doc.select("div[class=xl_content] h1").html();
        System.out.println(title);

        // Content: plain text of the div.TRS_Editor element(s).
        String content = doc.select("div[class=TRS_Editor]").text();
        System.out.println(content);

        Webmagic webmagic = new Webmagic();
        webmagic.setTitle(title);
        webmagic.setContent(content);
        webmagicService.save(webmagic);
    }
}
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.cwkj.qaback.entity.Webmagic;
import com.cwkj.qaback.entity.WebmagicConfigure;
import com.cwkj.qaback.service.WebmagicService;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.CookieManager;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.thinkgem.jeesite.common.utils.StringUtils;
/**
 * Crawls the notice list pages under {@code http://www.fjqi.gov.cn/xxgk/tzgg/}
 * and saves the title and body text of every linked detail page through the
 * injected {@code WebmagicService}.
 *
 * <p>List pages are loaded with HtmlUnit; detail pages are fetched directly
 * with jsoup. URLs and CSS selectors are hard-coded in this class.
 *
 * @author Administrator
 */
public class Htmlunit {

    /** Base URL of the notice section; it is also the URL of the first list page. */
    private static final String MAIN_URL = "http://www.fjqi.gov.cn/xxgk/tzgg/";

    /** Highest index of the follow-up list pages ({@code index_1.htm} .. {@code index_3.htm}). */
    private static final int LAST_LIST_PAGE = 3;

    /** Matches links whose href starts with "./" and ends with ".htm". */
    private static final String DETAIL_LINK_SELECTOR = "a[href^=./][href$=.htm]";

    /** Length of the "./" prefix stripped from relative detail links. */
    private static final int RELATIVE_PREFIX_LENGTH = 2;

    private final WebmagicService webmagicService;

    /** Stored for callers; not read by the current implementation. */
    private final WebmagicConfigure webmagicConfigure;

    /** Stored for callers; not read by the current implementation. */
    private final String key;

    /**
     * Creates a crawler.
     *
     * @param _webmagicService   service used to save every crawled article
     * @param _webmagicConfigure crawl configuration (currently only stored)
     * @param _key               caller-supplied key (currently only stored)
     */
    public Htmlunit(WebmagicService _webmagicService, WebmagicConfigure _webmagicConfigure, String _key) {
        this.webmagicService = _webmagicService;
        this.key = _key;
        this.webmagicConfigure = _webmagicConfigure;
    }

    /**
     * Crawls the first list page plus {@code index_1.htm} .. {@code index_3.htm}
     * and saves every detail page they link to.
     *
     * <p>A failure on one list page or one detail page is reported on stderr and
     * the crawl continues with the next page, so a single bad page no longer
     * discards the rest of the run. This method does not propagate exceptions.
     */
    public void start() {
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            configure(webClient);
            for (int pageIndex = 0; pageIndex <= LAST_LIST_PAGE; pageIndex++) {
                String listUrl = listPageUrl(pageIndex);
                try {
                    crawlListPage(webClient, listUrl);
                } catch (Exception e) {
                    // Keep going: the other list pages are independent of this one.
                    System.err.println("Failed to crawl list page " + listUrl);
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            // Boundary catch so that start() keeps its no-throw behaviour.
            e.printStackTrace();
        } finally {
            webClient.close();
        }
    }

    /** Applies the fixed client options used for every list-page request. */
    private void configure(WebClient webClient) {
        // JavaScript execution is turned off; pages are parsed as served.
        webClient.getOptions().setJavaScriptEnabled(false);
        // CSS is not needed for link extraction.
        webClient.getOptions().setCssEnabled(false);
        // Do not throw when a script on the page fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // Request timeout passed to HtmlUnit.
        webClient.getOptions().setTimeout(3000);
        // Follow HTTP redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // Enable cookie handling with a fresh cookie store.
        webClient.setCookieManager(new CookieManager());
        webClient.getCookieManager().setCookiesEnabled(true);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
    }

    /** Returns the URL of the list page with the given index (0 is the section root). */
    private static String listPageUrl(int pageIndex) {
        if (pageIndex == 0) {
            return MAIN_URL;
        }
        return MAIN_URL + "index_" + pageIndex + ".htm";
    }

    /** Loads one list page and saves every detail page it links to. */
    private void crawlListPage(WebClient webClient, String listUrl) throws Exception {
        HtmlPage page = webClient.getPage(listUrl);
        // NOTE(review): with JavaScript disabled there should be nothing to wait
        // for; the call is kept so behaviour is unchanged if JS is re-enabled.
        webClient.waitForBackgroundJavaScript(3000);
        Document doc = Jsoup.parse(page.asXml());

        for (String detailUrl : collectDetailUrls(doc)) {
            try {
                fetchAndSaveDetail(detailUrl);
            } catch (Exception e) {
                // One broken article must not stop the remaining ones.
                System.err.println("Failed to crawl detail page " + detailUrl);
                e.printStackTrace();
            }
        }
    }

    /** Extracts the relative "./xxx.htm" links of a list page as absolute URLs, in document order. */
    private static List<String> collectDetailUrls(Document doc) {
        List<String> detailUrls = new ArrayList<String>();
        for (Element link : doc.select(DETAIL_LINK_SELECTOR)) {
            String href = link.attr("href");
            if (StringUtils.isNotEmpty(href)) {
                // The selector guarantees the "./" prefix, which is dropped here.
                detailUrls.add(MAIN_URL + href.substring(RELATIVE_PREFIX_LENGTH));
            }
        }
        return detailUrls;
    }

    /**
     * Downloads one detail page, prints its title and content to stdout and
     * saves both through {@code webmagicService}.
     *
     * @param href absolute URL of the detail page
     * @throws Exception if the URL is malformed, the download fails or saving fails
     */
    private void fetchAndSaveDetail(String href) throws Exception {
        Document doc = Jsoup.parse(new URL(href), 100000);

        // Title: inner HTML of the h1 element(s) inside div.xl_content.
        String title = doc.select("div[class=xl_content] h1").html();
        System.out.println(title);

        // Content: plain text of the div.TRS_Editor element(s).
        String content = doc.select("div[class=TRS_Editor]").text();
        System.out.println(content);

        Webmagic webmagic = new Webmagic();
        webmagic.setTitle(title);
        webmagic.setContent(content);
        webmagicService.save(webmagic);
    }
}