// 在网上看了些爬虫视频,然后自己手敲了一份!纪念下自己初学爬虫,让自己以后乐一乐。
package com.cd.spider.rule;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches the page described by a {@link Rule} with jsoup and collects the
 * hyperlinks ({@code <a>} elements) found inside the selected part of it.
 */
public class ExtractService {

    /** Request timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Downloads the rule's URL and returns one {@link LinkTypeData} per
     * {@code <a>} element located under the elements the rule selects.
     *
     * <p>Failures while fetching or parsing are reported on standard error and
     * do not propagate; whatever links were collected up to that point (possibly
     * none) are returned.
     *
     * @param rule the extraction rule: URL, optional query parameters, request
     *             method, and the selector type/value used to narrow the page
     * @return the links that were found; never {@code null}, may be empty
     * @throws RuleException if the rule fails {@link #validateRule(Rule)}
     */
    public static List<LinkTypeData> extract(Rule rule) {
        validateRule(rule);
        List<LinkTypeData> datas = new ArrayList<>();
        try {
            Document doc = fetchDocument(rule);
            Elements results = selectResults(doc, rule.getType(), rule.getResultTagName());
            for (Element result : results) {
                for (Element link : result.getElementsByTag("a")) {
                    LinkTypeData data = new LinkTypeData();
                    data.setLinkHref(link.attr("href"));
                    data.setLinkeText(link.text());
                    datas.add(data);
                }
            }
        } catch (Exception e) {
            // Best-effort extraction: report the problem and hand back what we have.
            e.printStackTrace();
        }
        return datas;
    }

    /** Builds the jsoup connection for the rule and executes it as GET or POST. */
    private static Document fetchDocument(Rule rule) throws java.io.IOException {
        Connection conn = Jsoup.connect(rule.getUrl());

        // Attach the query/form parameters, if any were supplied.
        String[] params = rule.getParams();
        String[] values = rule.getValues();
        if (params != null) {
            for (int i = 0; i < params.length; i++) {
                conn.data(params[i], values[i]);
            }
        }

        int requestType = rule.getRequestMoethod();
        switch (requestType) {
            case Rule.GET:
                return conn.timeout(TIMEOUT_MILLIS).get();
            case Rule.POST:
                return conn.timeout(TIMEOUT_MILLIS).post();
            default:
                // Previously this fell through with a null document and failed later
                // with an uninformative NullPointerException.
                throw new IllegalArgumentException("unsupported request type: " + requestType);
        }
    }

    /** Narrows the document to the elements addressed by the rule's selector type and value. */
    private static Elements selectResults(Document doc, int type, String resultTagName) {
        Elements results = new Elements();
        switch (type) {
            case Rule.CLASS:
                results = doc.getElementsByClass(resultTagName);
                break;
            case Rule.SELECTION:
                results = doc.select(resultTagName);
                break;
            case Rule.ID:
                // getElementById yields null when no element carries the id.
                Element byId = doc.getElementById(resultTagName);
                if (byId != null) {
                    results.add(byId);
                }
                break;
            default:
                // With no selector value, fall back to the whole <body>.
                if (TextUtil.isEmpty(resultTagName)) {
                    results = doc.getElementsByTag("body");
                }
                break;
        }
        return results;
    }

    /**
     * Checks that the rule is usable before any network access happens.
     *
     * @param rule the rule to check
     * @throws RuleException if the URL is empty, does not start with
     *                       {@code http://} or {@code https://}, or the parameter
     *                       names and values do not pair up one-to-one
     */
    private static void validateRule(Rule rule) {
        String url = rule.getUrl();
        if (TextUtil.isEmpty(url)) {
            throw new RuleException("url不能为空!");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            throw new RuleException("url的格式不正确!");
        }
        String[] params = rule.getParams();
        if (params != null) {
            // Treat missing values as zero values so that named parameters
            // without values are rejected here instead of failing later.
            String[] values = rule.getValues();
            int valueCount = (values == null) ? 0 : values.length;
            if (params.length != valueCount) {
                throw new RuleException("参数的键值对个数不匹配!");
            }
        }
    }
}