I have long understood how a web crawler works in principle, but only recently found the spare time to actually write the code. Without further ado, here it is.
For each URL, the crawler needs to know its depth, whether it has been visited, whether its access permission has been checked, and whether it is allowed to be visited. All of this information is encapsulated in the class CrawlerUrl.
package webcrawler;
import java.net.MalformedURLException;
import java.net.URL;
public class CrawlerUrl {
private int depth = 0;
private String urlString = null;
private URL url = null;
private boolean isAllowedToVisit;
private boolean isCheckedForPermission = false;
private boolean isVisited = false;
public CrawlerUrl(String urlString, int depth) {
this.depth = depth;
this.urlString = urlString;
computeUrl();
}
private void computeUrl() {
try {
this.url = new URL(this.urlString);
} catch (MalformedURLException e) {
e.printStackTrace();
}
}
public URL getUrl() {
return this.url;
}
public int getDepth() {
return this.depth;
}
public boolean isAllowedToVisit() {
return this.isAllowedToVisit;
}
public void setAllowedToVisit(boolean isAllowedToVisit) {
this.isAllowedToVisit = isAllowedToVisit;
this.isCheckedForPermission = true;
}
public boolean isCheckedForPermission() {
return this.isCheckedForPermission;
}
public boolean isVisited() {
return this.isVisited;
}
public void setIsVisited() {
this.isVisited = true;
}
public String getUrlString() {
return this.urlString;
}
public String toString() {
return "url=" + this.urlString + "\n" +
"depth=" + this.depth + "\n" +
"visit=" + this.isAllowedToVisit + "\n" +
"check=" + this.isCheckedForPermission;
}
public int hashCode() {
return this.urlString.hashCode();
}
public boolean equals(Object obj) {
// identity is based on the url string; comparing hash codes alone would treat colliding urls as equal
if (!(obj instanceof CrawlerUrl)) return false;
return this.urlString.equals(((CrawlerUrl) obj).getUrlString());
}
}
Next is the class WebCrawler, which encapsulates the crawling process.
package webcrawler;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.BufferedHttpEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class WebCrawler {
private static final String REGEXP_HTTP = "<a [^<>]{0,} href=\"(http://[^\"]{1,})\""; // absolute links
private static final String REGEXP_RELATIVE = "<a [^<>]{0,} href=\"(/[^\"]{1,})\""; // relative links
private static final String REGEXP_CHARSET = "charset=\"{0,1}([^\"]{1,20})\""; // character encoding
private static final String REGEXP_TITLE = "<title>(.*)</title>"; // page title
private int maxNumberUrls; // maximum number of urls to crawl
private long delayBetweenUrls; // delay between requests, in milliseconds
private int maxDepth; // maximum crawl depth allowed
private String urlFilterRegexp; // filters out unwanted urls; null by default. For example, to crawl only www.csdn.net pages, set urlFilterRegexp to "csdn.net"
private Pattern charsetRegexp;
private Pattern httpRegexp;
private Pattern relativeRegexp;
private Pattern titleRegexp;
private Map<String, CrawlerUrl> visitedUrls = null;
private Queue<CrawlerUrl> urlQueue = null; // queue of urls waiting to be visited
private BufferedWriter crawlOutput = null; // writer for the crawl results
// crawler initialization
public WebCrawler(Queue<CrawlerUrl> urlQueue, int maxNumberUrls, int maxDepth, long delayBetweenUrls, String savePath) throws IOException {
this.urlQueue = urlQueue;
this.maxNumberUrls = maxNumberUrls;
this.maxDepth = maxDepth;
this.delayBetweenUrls = delayBetweenUrls;
this.visitedUrls = new HashMap<String, CrawlerUrl>();
this.httpRegexp = Pattern.compile(REGEXP_HTTP);
this.relativeRegexp = Pattern.compile(REGEXP_RELATIVE);
this.charsetRegexp = Pattern.compile(REGEXP_CHARSET);
this.titleRegexp = Pattern.compile(REGEXP_TITLE);
Files.deleteIfExists(Paths.get(savePath));
this.crawlOutput = new BufferedWriter(new FileWriter(savePath));
}
// run the crawl
public void crawl() throws Exception {
while (continueCrawling()) {
CrawlerUrl nextUrl = getNextUrl();
if (nextUrl != null) {
System.out.println("Current url = " + nextUrl.getUrlString());
if (isUrlRelevant(nextUrl)) {
printCrawlInfo(); // print the crawler's status
String content = getContent(nextUrl); // download the page text; pitfall: if the site forbids access to this url, content == null
if (content == null) {
System.out.println("The url is relevant, but it cannot be visited!");
continue;
}
saveContent(nextUrl, content); // save the page text to disk
List<String> urlList = this.getUrlsFromHtml(content, nextUrl);
addUrlsToQueue(nextUrl, urlList);
} else {
System.out.println("The url is not relevant, ignoring!");
}
System.out.println("--------------------------------------------");
Thread.sleep(delayBetweenUrls);
}
}
crawlOutput.close();
}
// to restrict crawling to a single site, call this method before calling crawl()
public void setUrlFilter(String regexp) {
this.urlFilterRegexp = regexp;
}
// whether to keep crawling
private boolean continueCrawling() {
// continue only while the queue is non-empty and the number of visited urls is below the limit
return ((!urlQueue.isEmpty()) && (visitedUrls.size() < maxNumberUrls));
}
private CrawlerUrl getNextUrl() throws Exception {
CrawlerUrl nextUrl = null;
while ((nextUrl == null) && (!urlQueue.isEmpty())) {
CrawlerUrl crawlerUrl = urlQueue.remove(); // remove and return the url at the head of the queue
if ((!isUrlVisited(crawlerUrl))
&& (crawlerUrl.getDepth() <= maxDepth)){
nextUrl = crawlerUrl;
}
}
return nextUrl;
}
// check whether this url has already been visited
private boolean isUrlVisited(CrawlerUrl crawlerUrl) {
if ((crawlerUrl.isVisited())
|| (visitedUrls.containsKey(crawlerUrl.getUrlString()))) {
return true;
}
return false;
}
// print key information about the crawl progress
private void printCrawlInfo() {
StringBuilder sb = new StringBuilder();
sb.append("Queue length = ").append(urlQueue.size()).append(", ")
.append("visited urls = ").append(visitedUrls.size()).append(", ");
System.out.println(sb.toString());
}
// download a page
public String getContent(String urlString) throws Exception {
return getContent(new CrawlerUrl(urlString, 0));
}
public String getContent(CrawlerUrl url) throws Exception {
String content = null;
String urlString = url.getUrlString();
CloseableHttpClient httpclient = HttpClients.createDefault();
// the following is adapted from the official HttpClient example for downloading a page
try {
HttpGet httpget = new HttpGet(urlString);
CloseableHttpResponse response = httpclient.execute(httpget);
try {
int statusCode = response.getStatusLine().getStatusCode();
HttpEntity entity = response.getEntity();
if ((statusCode == HttpStatus.SC_OK) && (entity != null)) {
entity = new BufferedHttpEntity(entity);
StringBuilder sb = new StringBuilder();
String contentType = entity.getContentType().toString();
int charsetStart = contentType.indexOf("charset=");
if (charsetStart != -1) { // the charset is given in the Content-Type header; read the character stream directly
String charset = contentType.substring(charsetStart + 8);
BufferedReader reader = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
int c;
while ((c = reader.read()) != -1) sb.append((char) c);
reader.close();
} else { // first scan the opening lines of the html for its charset, then re-read the whole entity with that encoding
BufferedReader firstReader = new BufferedReader(new InputStreamReader(entity.getContent()));
String charset = null;
String line = null;
int charsetStartInHtml;
while ((line = firstReader.readLine()) != null) {
charsetStartInHtml = line.indexOf("charset=");
if (charsetStartInHtml != -1) {
Matcher charsetMatcher = charsetRegexp.matcher(line);
while (charsetMatcher.find()) charset = charsetMatcher.group(1);
break;
}
}
firstReader.close();
if (charset == null) charset = "UTF-8"; // fall back to UTF-8 if the page does not declare a charset
BufferedReader secondReader = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
int c;
while ((c = secondReader.read()) != -1) sb.append((char) c);
secondReader.close();
}
content = sb.toString();
}
} finally {
response.close();
}
} finally {
httpclient.close();
}
visitedUrls.put(url.getUrlString(), url);
url.setIsVisited();
return content;
}
private List<String> getUrlsFromHtml(String content, CrawlerUrl crawlerUrl) {
List<String> urlList = new ArrayList<String>();
getHttpUrls(urlList, content); // add absolute links found in the html to urlList
getRelativeUrls(urlList, content, crawlerUrl); // add relative links found in the html to urlList
return urlList;
}
private void getHttpUrls(List<String> urlList, String content) {
Matcher urlMatcher = httpRegexp.matcher(content);
String url = null;
while (urlMatcher.find()) {
url = urlMatcher.group(1);
urlList.add(url);
}
}
private void getRelativeUrls(List<String> urlList, String content, CrawlerUrl crawlerUrl) {
Matcher urlMatcher = relativeRegexp.matcher(content);
String url = null;
String host = crawlerUrl.getUrl().getHost();
while (urlMatcher.find()) {
url = urlMatcher.group(1);
url = "http://" + host + url;
urlList.add(url);
}
}
private void addUrlsToQueue(CrawlerUrl crawlerUrl, List<String> urlList) {
int depth = crawlerUrl.getDepth() + 1; // depth assigned to links extracted from crawlerUrl
for (String url: urlList) {
if (!visitedUrls.containsKey(url)) {
urlQueue.add(new CrawlerUrl(url, depth));
}
}
}
// a url is relevant when its host matches the filter; with no filter set, every url is relevant
private boolean isUrlRelevant(CrawlerUrl crawlerUrl) {
String host = crawlerUrl.getUrl().getHost();
if (urlFilterRegexp == null) return true;
return host.contains(urlFilterRegexp);
}
// save the crawled url and its title; to build a search engine later, content would also need to be parsed and stripped of html tags
private void saveContent(CrawlerUrl url, String content) throws IOException {
String urlString = url.getUrlString();
String title = "NA";
Matcher m = titleRegexp.matcher(content);
if (m.find()) title = m.group(1);
crawlOutput.write(urlString + "\t" + title + "\n"); // write the url and its title
}
}
Finally, here is the main method that runs the crawler.
package webcrawler;
import java.util.LinkedList;
import java.util.Queue;
public class CrawlerTest {
public static void main(String[] args) throws Exception {
Queue<CrawlerUrl> urlQueue = new LinkedList<CrawlerUrl>();
String url = "http://www.csdn.net"; // seed url where crawling starts
String savePath = "D:/crawler_result.txt";
urlQueue.add(new CrawlerUrl(url, 0));
WebCrawler crawler = new WebCrawler(urlQueue, 100, 2, 1000L, savePath);
crawler.setUrlFilter("csdn.net"); // restrict crawling to the csdn.net domain
crawler.crawl(); // start crawling
}
}
Run the program; a partial fragment of its console output is shown below.
You can see that CSDN's blog pages are blocked: their source can be viewed in a browser, but requesting them with HttpClient returns a 403 status code.
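A common cause of such a 403 is that the server rejects HttpClient's default User-Agent. As a minimal sketch (the header value below is only an example, and there is no guarantee this particular site accepts it), a browser-like User-Agent could be set on the HttpGet inside getContent() before the request is executed:
HttpGet httpget = new HttpGet(urlString);
httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"); // pretend to be a regular browser
CloseableHttpResponse response = httpclient.execute(httpget);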
At the end, the program writes each page's URL and title to the text file D:/crawler_result.txt; a fragment of its content is shown below.
Possible improvements:
1. The crawler above is single-threaded. It could be improved by adding a pool of worker threads that take URLs from the queue and process them in parallel (a rough sketch follows this list). An even better solution is a distributed crawler, in which the URL queue and the set of visited URLs are stored in a database that every node can access.
2. The code above only saves each page's URL and title to a file. For text mining, the full text of every page needs to be saved; jsoup or Apache Tika are worth trying for stripping the html tags (see the second sketch below).
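As a rough illustration of the first point, here is a minimal sketch of a worker-pool driver. It reuses the CrawlerUrl class from above, but the class name ParallelCrawlerSketch, the thread-safe collections, and the placeholder fetchAndExtract() are assumptions for illustration, not part of the original crawler.
package webcrawler;
import java.util.Collections;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
public class ParallelCrawlerSketch {
private final Queue<CrawlerUrl> urlQueue = new ConcurrentLinkedQueue<CrawlerUrl>(); // thread-safe url queue
private final Set<String> visited = ConcurrentHashMap.newKeySet(); // thread-safe visited set
public void crawl(String seed, int workers) throws InterruptedException {
urlQueue.add(new CrawlerUrl(seed, 0));
ExecutorService pool = Executors.newFixedThreadPool(workers);
for (int i = 0; i < workers; i++) {
pool.submit(() -> {
CrawlerUrl next;
while ((next = urlQueue.poll()) != null) { // take the next url, if any
if (!visited.add(next.getUrlString())) continue; // skip urls another worker already claimed
for (String link : fetchAndExtract(next)) { // download the page and extract its links
urlQueue.add(new CrawlerUrl(link, next.getDepth() + 1));
}
}
// simplification: a worker exits as soon as it sees an empty queue;
// a real implementation needs a proper termination protocol
});
}
pool.shutdown();
pool.awaitTermination(10, TimeUnit.MINUTES);
}
// placeholder standing in for WebCrawler's getContent() + getUrlsFromHtml()
private List<String> fetchAndExtract(CrawlerUrl url) {
return Collections.emptyList();
}
}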
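For the second point, here is a minimal sketch of tag stripping with jsoup, assuming the jsoup library is on the classpath; the class and method names are hypothetical, and the input is the html string that getContent() returns.
package webcrawler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class TextExtractorSketch {
// parse the downloaded html and keep only the title and the visible text, ready for text mining
public static String extractText(String html) {
Document doc = Jsoup.parse(html);
return doc.title() + "\n" + doc.text();
}
}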