package rule;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches the HTML page described by a {@link Rule} and extracts the hyperlinks
 * found inside the elements that the rule selects.
 *
 * @author zhy
 */
public class ExtractService {

    /** Timeout handed to jsoup for every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Requests the rule's URL and collects every {@code <a>} element located inside
     * the elements selected by the rule's result tag name and type.
     *
     * @param rule the extraction rule; must carry a non-empty http(s) URL, parameter
     *             arrays of equal length, and a GET or POST request type
     * @return one {@link LinkTypeData} per link found, in document order; empty when
     *         nothing matched or when the request failed with an {@link IOException}
     * @throws RuleException if the rule fails validation
     */
    public static List<LinkTypeData> extract(Rule rule) {
        // Reject malformed rules before any connection is set up.
        validateRule(rule);
        List<LinkTypeData> datas = new ArrayList<LinkTypeData>();
        try {
            Document doc = fetchDocument(rule);
            Elements results = selectResults(doc, rule.getResultTagName(), rule.getType());
            for (Element result : results) {
                for (Element link : result.getElementsByTag("a")) {
                    LinkTypeData data = new LinkTypeData();
                    data.setLinkHref(link.attr("href"));
                    data.setLinkText(link.text());
                    datas.add(data);
                }
            }
        } catch (IOException e) {
            // Deliberate best effort: a failed request is reported on stderr and the
            // caller receives an empty list rather than an exception.
            e.printStackTrace();
        }
        return datas;
    }

    /**
     * Builds the jsoup connection for the rule, attaches the query parameters and
     * executes the request with the rule's HTTP method.
     */
    private static Document fetchDocument(Rule rule) throws IOException {
        Connection conn = Jsoup.connect(rule.getUrl());
        String[] params = rule.getParams();
        if (params != null) {
            // validateRule guarantees values has the same length as params here.
            String[] values = rule.getValues();
            for (int i = 0; i < params.length; i++) {
                conn.data(params[i], values[i]);
            }
        }
        conn.timeout(TIMEOUT_MILLIS);
        // validateRule only lets GET and POST through.
        if (rule.getRequestMoethod() == Rule.POST) {
            return conn.post();
        }
        return conn.get();
    }

    /**
     * Picks the container elements whose links are harvested. An empty
     * {@code resultTagName} always falls back to the {@code <body>} element; an
     * unknown {@code type} with a non-empty name selects nothing.
     */
    private static Elements selectResults(Document doc, String resultTagName, int type) {
        // Checked first so the fallback also applies to CLASS / ID / SELECTION, whose
        // jsoup lookups reject an empty argument.
        if (TextUtil.isEmpty(resultTagName)) {
            return doc.getElementsByTag("body");
        }
        Elements results = new Elements();
        switch (type) {
        case Rule.CLASS:
            results = doc.getElementsByClass(resultTagName);
            break;
        case Rule.ID:
            Element byId = doc.getElementById(resultTagName);
            // getElementById returns null when no element carries the id.
            if (byId != null) {
                results.add(byId);
            }
            break;
        case Rule.SELECTION:
            results = doc.select(resultTagName);
            break;
        default:
            break;
        }
        return results;
    }

    /**
     * Performs the necessary checks on the supplied rule.
     *
     * @throws RuleException if the rule is null, the URL is empty or not http(s),
     *                       the parameter names and values differ in count, or the
     *                       request type is neither GET nor POST
     */
    private static void validateRule(Rule rule) {
        if (rule == null) {
            throw new RuleException("rule不能为空!");
        }
        String url = rule.getUrl();
        if (TextUtil.isEmpty(url)) {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            throw new RuleException("url的格式不正确!");
        }
        String[] params = rule.getParams();
        if (params != null) {
            // A missing values array counts as zero values, so it only passes when
            // there are no parameter names either.
            String[] values = rule.getValues();
            int valueCount = (values == null) ? 0 : values.length;
            if (params.length != valueCount) {
                throw new RuleException("参数的键值对个数不匹配!");
            }
        }
        int requestType = rule.getRequestMoethod();
        if (requestType != Rule.GET && requestType != Rule.POST) {
            throw new RuleException("不支持的请求类型: " + requestType);
        }
    }
}
package rule;
/**
 * Mutable value holder describing one extracted hyperlink.
 */
public class LinkTypeData {

    /** Numeric identifier of this entry. */
    private int id;

    /** Address (href) of the link. */
    private String linkHref;

    /** Title (visible text) of the link. */
    private String linkText;

    /** Summary. */
    private String summary;

    /** Content. */
    private String content;

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @return the link address */
    public String getLinkHref() {
        return this.linkHref;
    }

    /** @return the link title */
    public String getLinkText() {
        return this.linkText;
    }

    /** @return the summary */
    public String getSummary() {
        return this.summary;
    }

    /** @return the content */
    public String getContent() {
        return this.content;
    }

    /** @param id the numeric identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @param linkHref the link address to store */
    public void setLinkHref(String linkHref) {
        this.linkHref = linkHref;
    }

    /** @param linkText the link title to store */
    public void setLinkText(String linkText) {
        this.linkText = linkText;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }
}
package rule;
/**
 * Rule class: describes which page to request and which part of the returned
 * HTML to filter on.
 *
 * @author zhy
 */
public class Rule {

    /** Request type: HTTP GET. */
    public final static int GET = 0;
    /** Request type: HTTP POST. */
    public final static int POST = 1;

    /** Result type: {@code resultTagName} is a CSS class name. */
    public final static int CLASS = 0;
    /** Result type: {@code resultTagName} is an element id. */
    public final static int ID = 1;
    /** Result type: {@code resultTagName} is a selector expression. */
    public final static int SELECTION = 2;

    /** The link (URL) to request. */
    private String url;

    /** Names of the request parameters. */
    private String[] params;

    /** Values matching {@link #params} position by position. */
    private String[] values;

    /**
     * Tag used for the first filtering pass over the returned HTML; set the
     * type first.
     */
    private String resultTagName;

    /** CLASS / ID / SELECTION: how {@code resultTagName} is interpreted; defaults to ID. */
    private int type = ID;

    /** GET / POST: the request type; defaults to GET. */
    private int requestMoethod = GET;

    /** Creates a rule with the default type (ID) and request type (GET). */
    public Rule() {
    }

    /**
     * Creates a fully populated rule.
     *
     * @param url            the link to request
     * @param params         request parameter names, may be null
     * @param values         request parameter values, may be null
     * @param resultTagName  name used to filter the returned HTML
     * @param type           one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
     * @param requestMoethod one of {@link #GET}, {@link #POST}
     */
    public Rule(String url, String[] params, String[] values,
            String resultTagName, int type, int requestMoethod) {
        this.url = url;
        this.params = params;
        this.values = values;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMoethod = requestMoethod;
    }

    /** @return the link to request */
    public String getUrl() {
        return url;
    }

    /** @param url the link to request */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored parameter-name array itself (not a copy) */
    public String[] getParams() {
        return params;
    }

    /** @param params the parameter-name array; stored as-is */
    public void setParams(String[] params) {
        this.params = params;
    }

    /** @return the stored parameter-value array itself (not a copy) */
    public String[] getValues() {
        return values;
    }

    /** @param values the parameter-value array; stored as-is */
    public void setValues(String[] values) {
        this.values = values;
    }

    /** @return the name used for the first filtering pass */
    public String getResultTagName() {
        return resultTagName;
    }

    /** @param resultTagName the name used for the first filtering pass */
    public void setResultTagName(String resultTagName) {
        this.resultTagName = resultTagName;
    }

    /** @return the result type (CLASS / ID / SELECTION) */
    public int getType() {
        return type;
    }

    /** @param type the result type (CLASS / ID / SELECTION) */
    public void setType(int type) {
        this.type = type;
    }

    /**
     * Returns the request type. The misspelled name is kept so existing callers
     * continue to compile; {@link #getRequestMethod()} returns the same value.
     *
     * @return the request type (GET / POST)
     */
    public int getRequestMoethod() {
        return requestMoethod;
    }

    /**
     * Sets the request type. The misspelled name is kept so existing callers
     * continue to compile; {@link #setRequestMethod(int)} is equivalent.
     *
     * @param requestMoethod the request type (GET / POST)
     */
    public void setRequestMoethod(int requestMoethod) {
        this.requestMoethod = requestMoethod;
    }

    /**
     * Correctly spelled alias of {@link #getRequestMoethod()}.
     *
     * @return the request type (GET / POST)
     */
    public int getRequestMethod() {
        return requestMoethod;
    }

    /**
     * Correctly spelled alias of {@link #setRequestMoethod(int)}.
     *
     * @param requestMethod the request type (GET / POST)
     */
    public void setRequestMethod(int requestMethod) {
        this.requestMoethod = requestMethod;
    }
}
package rule;
public class RuleException extends RuntimeException {
public RuleException() {
super();
// TODO Auto-generated constructor stub
}
public RuleException(String message, Throwable cause) {
super(message, cause);
// TODO Auto-generated constructor stub
}
public RuleException(String message) {
super(message);
// TODO Auto-generated constructor stub
}
public RuleException(Throwable cause) {
super(cause);
// TODO Auto-generated constructor stub
}
}
package rule;
/**
 * Small string helpers.
 */
public class TextUtil {

    /**
     * Tells whether a string carries no visible content.
     *
     * @param str the string to inspect, may be null
     * @return {@code true} when {@code str} is null or has zero length after
     *         {@link String#trim()}; {@code false} otherwise
     */
    public static boolean isEmpty(String str) {
        if (str == null) {
            return true;
        }
        String trimmed = str.trim();
        return trimmed.length() == 0;
    }
}
package rule;
import java.util.List;
/**
 * Manual smoke tests for {@link ExtractService}: each test requests a live page
 * and prints the extracted links; nothing is asserted.
 */
public class Test {

    /** Posts a query and harvests links from elements selected by CSS class. */
    @org.junit.Test
    public void getDatasByClass() {
        String[] paramNames = { "query.enterprisename", "query.registationnumber" };
        String[] paramValues = { "dy", "" };
        Rule rule = new Rule(
                "http://www1.sxcredit.gov.cn/public/infocomquery.do?method=publicIndexQuery",
                paramNames, paramValues, "cont_right", Rule.CLASS, Rule.POST);
        printf(ExtractService.extract(rule));
    }

    /** Issues a GET query and harvests links from elements matched by a selector. */
    @org.junit.Test
    public void getDatasByCssQuery() {
        String[] paramNames = { "name" };
        String[] paramValues = { "dy" };
        Rule rule = new Rule("http://www.11315.com/search", paramNames,
                paramValues, "div.g-mn div.con-model", Rule.SELECTION, Rule.GET);
        printf(ExtractService.extract(rule));
    }

    /**
     * Prints the text and href of every link to standard output, followed by a
     * separator line.
     *
     * @param datas the links to print
     */
    public void printf(List<LinkTypeData> datas) {
        for (LinkTypeData entry : datas) {
            String text = entry.getLinkText();
            String href = entry.getLinkHref();
            System.out.println(text);
            System.out.println(href);
            System.out.println("***********************************");
        }
    }
}