If you're passing by, feedback and nitpicks are welcome!
How the crawler works (a self-contained sketch of this loop follows the list):
1. Read the links on the current page and check each one against the crawled set; links not yet crawled go into the un-crawled queue, and pages are then downloaded from that queue.
2. Crawled URLs: record each crawled page link and report how many pages have been crawled.
3. Un-crawled URLs: enqueue newly discovered page links, guaranteeing that each URL is visited at most once.
4. Parse each page with jsoup and pull out the content you need.
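To make the loop concrete before the real classes, here is a minimal, self-contained sketch of it. fetchLinks is a hypothetical stub standing in for the jsoup-based extraction shown later:

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;

public class CrawlLoopSketch {
    // Hypothetical stub: in the real code below, jsoup downloads the page
    // and pulls out the href of every <a> tag.
    static List<String> fetchLinks(String url) {
        return Collections.emptyList();
    }

    public static void main(String[] args) {
        Set<String> visited = new HashSet<String>();
        Queue<String> frontier = new ArrayDeque<String>();
        frontier.add("http://news.baidu.com/ns"); // seed URL

        while (!frontier.isEmpty() && visited.size() <= 10) {
            String url = frontier.poll();
            visited.add(url); // mark before expanding so it is never re-queued
            for (String link : fetchLinks(url)) {
                // each URL enters the frontier at most once
                if (!visited.contains(link) && !frontier.contains(link)) {
                    frontier.add(link);
                }
            }
        }
    }
}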
The full code follows:
1. First, define the URL queue class
package com.cmcc.db.webspider;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

public class LinkQueue {
    /* Read the links on the current page; if a link has not been crawled yet,
       add it to the un-visited queue, then download pages from that queue. */

    // Visited URLs: record crawled page links and report how many have been crawled.
    private static Set<String> visitedUrl = new HashSet<String>();

    // Un-visited URLs, polled in FIFO order. (The original used a PriorityQueue;
    // note that Java's PriorityQueue is a min-heap, so it polls the *smallest*
    // element first, which is rarely what a breadth-first crawler wants.
    // A LinkedList gives plain FIFO order.)
    private static Queue<String> unVisitedUrl = new LinkedList<String>();

    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    // Enqueue a link only if it is non-blank, not yet visited, and not already
    // queued, so each URL is visited at most once.
    public static void addUnVisitedUrl(String url) {
        if (url != null && !url.trim().equals("") && !visitedUrl.contains(url)
                && !unVisitedUrl.contains(url))
            unVisitedUrl.add(url);
    }

    // poll() removes and returns the head of the queue (null if empty).
    public static String pollUnVisitedUrl() {
        return unVisitedUrl.poll();
    }

    // Whether the un-visited queue is empty.
    public static boolean unVisitedUrlsEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
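A quick sanity check of the queue semantics. This demo class is not part of the original post, just an illustration of the deduplication rules:

package com.cmcc.db.webspider;

public class LinkQueueDemo {
    public static void main(String[] args) {
        LinkQueue.addUnVisitedUrl("http://news.baidu.com/ns?word=java");
        LinkQueue.addUnVisitedUrl("http://news.baidu.com/ns?word=java"); // duplicate: ignored
        LinkQueue.addUnVisitedUrl("   ");                                // blank: ignored

        String url = LinkQueue.pollUnVisitedUrl();
        LinkQueue.addVisitedUrl(url);
        LinkQueue.addUnVisitedUrl(url); // already visited: ignored

        System.out.println(LinkQueue.getVisitedUrlNum());   // 1
        System.out.println(LinkQueue.unVisitedUrlsEmpty()); // true
    }
}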
2. The page-access rule
package com.cmcc.db.spider;

public class Rule {
    public final static int GET = 0;
    public final static int POST = 1;

    public final static int CLASS = 0;
    public final static int ID = 1;
    public final static int SELECTION = 2;

    /** The URL to fetch. */
    private String url;

    /** Query-parameter names. */
    private String[] params;

    /** The value for each parameter, in the same order as params. */
    private String[] values;

    /** Type of resultTagName: CLASS / ID / SELECTION. Defaults to ID; set this before resultTagName. */
    private int type = ID;

    /** The tag/class/selector used for the first filtering pass over the returned HTML. */
    private String resultTagName;

    /** Request type: GET / POST. Defaults to GET. */
    private int requestMethod = GET;

    public Rule() {
    }

    public Rule(String url, String[] params, String[] values,
            int type, String resultTagName, int requestMethod) {
        this.url = url;
        this.params = params;
        this.values = values;
        this.resultTagName = resultTagName;
        this.type = type;
        this.requestMethod = requestMethod;
    }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public String[] getParams() { return params; }
    public void setParams(String[] params) { this.params = params; }

    public String[] getValues() { return values; }
    public void setValues(String[] values) { this.values = values; }

    public String getResultTagName() { return resultTagName; }
    public void setResultTagName(String resultTagName) { this.resultTagName = resultTagName; }

    public int getType() { return type; }
    public void setType(int type) { this.type = type; }

    public int getRequestMethod() { return requestMethod; }
    public void setRequestMethod(int requestMethod) { this.requestMethod = requestMethod; }
}
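For example, a rule that sends a keyword as a GET parameter and filters the response by class. The parameter name and class name here are made up for illustration:

package com.cmcc.db.spider;

public class RuleDemo {
    public static void main(String[] args) {
        // Illustrative only: "word" and "result" are hypothetical names.
        Rule rule = new Rule(
                "http://news.baidu.com/ns", // url
                new String[] { "word" },    // parameter names
                new String[] { "java" },    // parameter values
                Rule.CLASS,                 // interpret resultTagName as a class name
                "result",                   // resultTagName
                Rule.GET);                  // request method
        System.out.println(rule.getUrl());
    }
}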
3. Parse the page and crawl its links (extracting whatever content you need)
package com.cmcc.db.spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ExtractService {
    /**
     * Fetches the page described by the rule and extracts the links it contains.
     */
    public static List<LinkTypeData> extract(Rule rule) {
        // Validate the rule before doing any network work.
        validateRule(rule);
        List<LinkTypeData> datas = new ArrayList<LinkTypeData>();
        LinkTypeData data = null;
        try {
            // Unpack the rule.
            String url = rule.getUrl();
            String[] params = rule.getParams();
            String[] values = rule.getValues();
            String resultTagName = rule.getResultTagName();
            int type = rule.getType();
            int requestType = rule.getRequestMethod();

            // jsoup connects to the URL, acting like a browser fetching the page.
            Connection conn = Jsoup.connect(url);
            // Attach the query parameters.
            if (params != null) {
                for (int i = 0; i < params.length; i++) {
                    conn.data(params[i], values[i]);
                }
            }
            // Issue the request (100-second timeout) according to the request type.
            Document doc = null;
            switch (requestType) {
            case Rule.GET:
                doc = conn.timeout(100000).get();
                break;
            case Rule.POST:
                doc = conn.timeout(100000).post();
                break;
            }

            // First filtering pass over the returned HTML.
            // Alternatively: Elements listDiv = doc.getElementsByAttributeValue("class", "postBody");
            Elements results = new Elements();
            switch (type) {
            case Rule.CLASS:
                // Select the elements whose class equals resultTagName.
                results = doc.getElementsByClass(resultTagName);
                break;
            case Rule.ID:
                // getElementById may return null; guard against it.
                Element result = doc.getElementById(resultTagName);
                if (result != null) {
                    results.add(result);
                }
                break;
            case Rule.SELECTION:
                results = doc.select(resultTagName);
                break;
            default:
                // When resultTagName is empty, fall back to the <body> tag.
                if (textIsEmpty(resultTagName)) {
                    results = doc.getElementsByTag("body");
                }
            }
            // Collect every <a> tag under the filtered elements.
            for (Element result : results) {
                Elements links = result.getElementsByTag("a");
                for (Element link : links) {
                    String linkHref = link.attr("href"); // the link target
                    String linkText = link.text();       // the anchor text
                    data = new LinkTypeData();
                    data.setLinkHref(linkHref);
                    data.setLinkText(linkText);
                    datas.add(data);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return datas;
    }

    public static boolean textIsEmpty(String str) {
        return str == null || str.trim().length() == 0;
    }

    /**
     * Validates the incoming rule.
     */
    private static void validateRule(Rule rule) {
        String url = rule.getUrl();
        if (textIsEmpty(url)) {
            throw new RuleException("url must not be empty!");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            throw new RuleException("malformed url!");
        }
        if (rule.getParams() != null && rule.getValues() != null
                && rule.getParams().length != rule.getValues().length) {
            throw new RuleException("the number of parameter names and values does not match!");
        }
    }
}
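The post never shows LinkTypeData or RuleException, but their shape follows directly from how they are used above. A minimal sketch of both (assumed, not from the original):

package com.cmcc.db.spider;

// Simple holder for one extracted link: its href and its anchor text.
public class LinkTypeData {
    private String linkHref;
    private String linkText;

    public String getLinkHref() { return linkHref; }
    public void setLinkHref(String linkHref) { this.linkHref = linkHref; }

    public String getLinkText() { return linkText; }
    public void setLinkText(String linkText) { this.linkText = linkText; }
}

package com.cmcc.db.spider;

// Unchecked exception thrown when a Rule fails validation.
public class RuleException extends RuntimeException {
    public RuleException(String message) {
        super(message);
    }
}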
4. Write the test class and save the output to a txt file
package com.cmcc.db.webspider;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

import com.cmcc.db.spider.ExtractService;
import com.cmcc.db.spider.LinkTypeData;
import com.cmcc.db.spider.Rule;

public class MyCrawler {
    /**
     * Initializes the URL queue with the seed URLs.
     *
     * @param seeds the seed URLs
     */
    private void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++)
            LinkQueue.addUnVisitedUrl(seeds[i]);
    }

    public static void printf(List<LinkTypeData> datas) {
        for (LinkTypeData data : datas) {
            System.out.println(data.getLinkText());
            System.out.println(data.getLinkHref());
            System.out.println("***********************************");
        }
    }

    /**
     * The crawl loop.
     *
     * @param seeds the seed URLs
     */
    public void crawling(String[] seeds) {
        // Filter: only follow links that start with http://news.baidu.com/ns
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                return url.startsWith("http://news.baidu.com/ns");
            }
        };
        // Seed the URL queue.
        initCrawlerWithSeeds(seeds);
        // Loop while there are links left to crawl and no more than 10 pages have been crawled.
        while (!LinkQueue.unVisitedUrlsEmpty()
                && LinkQueue.getVisitedUrlNum() <= 10) {
            // Take the URL at the head of the queue.
            String visitUrl = LinkQueue.pollUnVisitedUrl();
            if (visitUrl == null)
                continue;
            // Mark it visited right away so it cannot be re-queued below.
            LinkQueue.addVisitedUrl(visitUrl);
            // type = -1 and resultTagName = null: ExtractService falls back to the <body> tag.
            Rule rule = new Rule(visitUrl,
                    new String[] { "jquery" }, new String[] { "java" },
                    -1, null, Rule.GET);
            List<LinkTypeData> extracts = ExtractService.extract(rule);
            printf(extracts);
            // Append the results to the txt file.
            writeToFile(extracts);
            // Enqueue the new, unvisited URLs that pass the filter.
            for (LinkTypeData data : extracts) {
                if (filter.accept(data.getLinkHref())) {
                    LinkQueue.addUnVisitedUrl(data.getLinkHref());
                }
            }
        }
    }

    // Appends the extracted links to a txt file.
    public boolean writeToFile(List<LinkTypeData> list) {
        FileWriter fw = null;
        BufferedWriter bw = null;
        try {
            fw = new FileWriter("d:/test.txt", true); // append mode
            bw = new BufferedWriter(fw, 100);
            for (LinkTypeData data : list) {
                bw.write(data.getLinkText().trim() + " " + data.getLinkHref() + "\n");
            }
            return true;
        } catch (IOException e) {
            System.out.println("error writing to file");
            return false;
        } finally {
            try {
                if (bw != null) {
                    bw.flush();
                    bw.close();
                }
                if (fw != null)
                    fw.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    // main entry point
    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        crawler.crawling(new String[] { "http://news.baidu.com/ns" });
    }
}
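One more piece the post uses but never defines: the LinkFilter interface. Its shape is fully determined by the anonymous class above; a minimal version:

package com.cmcc.db.webspider;

// Decides whether a discovered link should be followed.
public interface LinkFilter {
    boolean accept(String url);
}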