A Single-Machine Web Crawler in Java

I have long understood how a web crawler works in principle, but only recently found enough spare time to actually write the code. Without further ado, here it is.

For each url, the crawler needs to know its depth, whether it has been visited, whether its access permission has been checked, and whether access is allowed. All of this information is encapsulated in the class CrawlerUrl:

package webcrawler;

import java.net.MalformedURLException;
import java.net.URL;

public class CrawlerUrl {
	private int depth = 0;
	private String urlString = null;
	private URL url = null;
	private boolean isAllowedToVisit;
	private boolean isCheckedForPermission = false;
	private boolean isVisited = false;
	
	public CrawlerUrl(String urlString, int depth) {
		this.depth = depth;
		this.urlString = urlString;
		computeUrl();
	}
	
	private void computeUrl() {
		try {
			this.url = new URL(this.urlString);
		} catch (MalformedURLException e) {
			e.printStackTrace();
		}
	}
	
	public URL getUrl() {
		return this.url;
	}
	
	public int getDepth() {
		return this.depth;
	}
	
	public boolean isAllowedToVisit() {
		return this.isAllowedToVisit;
	}
	
	public void setAllowedToVisit(boolean isAllowedToVisit) {
		this.isAllowedToVisit = isAllowedToVisit;
		this.isCheckedForPermission = true;
	}
	
	public boolean isCheckedForPermission() {
		return this.isCheckedForPermission;
	}
	
	public boolean isVisited() {
		return this.isVisited;
	}
	
	public void setIsVisited() {
		this.isVisited = true;
	}
	
	public String getUrlString() {
		return this.urlString;
	}
	
	public String toString() {
		return "url=" + this.urlString + "\n" +
				"depth=" + this.depth + "\n" +
				"visit=" + this.isAllowedToVisit + "\n" +
				"check=" + this.isCheckedForPermission;
	}
	
	public int hashCode() {
		return this.urlString.hashCode();
	}
	
	@Override
	public boolean equals(Object obj) {
		if (this == obj) return true;
		if (!(obj instanceof CrawlerUrl)) return false;
		return this.urlString.equals(((CrawlerUrl) obj).getUrlString());
	}
}
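
CrawlerUrl carries isAllowedToVisit and isCheckedForPermission, but nothing in the WebCrawler below ever fills them in. If you want to honour them, one common source of permission information is the site's robots.txt. The following is a deliberately minimal sketch, not part of the original code: the class name RobotsChecker is made up, and real robots.txt parsing also handles Allow rules, wildcards and per-agent sections that this ignores.

package webcrawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

// Minimal robots.txt check: collects the Disallow paths listed under "User-agent: *"
// and marks the CrawlerUrl accordingly.
public class RobotsChecker {

	public void checkPermission(CrawlerUrl crawlerUrl) {
		if (crawlerUrl.getUrl() == null) {
			crawlerUrl.setAllowedToVisit(false);
			return;
		}
		String host = crawlerUrl.getUrl().getHost();
		String path = crawlerUrl.getUrl().getPath();
		List<String> disallowed = new ArrayList<String>();
		try {
			URL robots = new URL("http://" + host + "/robots.txt");
			BufferedReader reader = new BufferedReader(new InputStreamReader(robots.openStream()));
			String line;
			boolean inStarSection = false;
			while ((line = reader.readLine()) != null) {
				line = line.trim();
				if (line.toLowerCase().startsWith("user-agent:")) {
					inStarSection = line.substring(11).trim().equals("*");
				} else if (inStarSection && line.toLowerCase().startsWith("disallow:")) {
					String rule = line.substring(9).trim();
					if (!rule.isEmpty()) disallowed.add(rule);
				}
			}
			reader.close();
		} catch (Exception e) {
			// no robots.txt, or it could not be read: assume crawling is allowed
		}
		boolean allowed = true;
		for (String rule : disallowed) {
			if (path.startsWith(rule)) { allowed = false; break; }
		}
		crawlerUrl.setAllowedToVisit(allowed); // also flips isCheckedForPermission
	}
}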

Below is the class WebCrawler, which encapsulates the crawling process itself:

package webcrawler;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.BufferedHttpEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class WebCrawler {
	private static final String REGEXP_HTTP = "<a [^<>]{0,} href=\"(http://[^\"]{1,})\""; // absolute links
	private static final String REGEXP_RELATIVE = "<a [^<>]{0,} href=\"(/[^\"]{1,})\""; // relative links
	private static final String REGEXP_CHARSET = "charset=\"{0,1}([^\"]{1,20})\""; // character encoding
	private static final String REGEXP_TITLE = "<title>(.*)</title>"; // page title
	
	private int maxNumberUrls; // upper bound on the number of urls to crawl
	private long delayBetweenUrls; // delay between two fetches, in milliseconds
	private int maxDepth; // maximum crawl depth allowed
	private String urlFilterRegexp; // filters out unwanted urls; null by default. For example, to crawl only www.csdn.net pages, set it to csdn.net

	private Pattern charsetRegexp;
	private Pattern httpRegexp;
	private Pattern relativeRegexp;
	private Pattern titleRegexp;
	
	private Map<String, CrawlerUrl> visitedUrls = null;
	private Queue<CrawlerUrl> urlQueue = null; // queue of urls waiting to be visited
	
	private BufferedWriter crawlOutput = null; // writer for the crawl results
	
	// crawler initialization
	public WebCrawler(Queue<CrawlerUrl> urlQueue, int maxNumberUrls, int maxDepth, long delayBetweenUrls, String savePath) throws IOException {
		this.urlQueue = urlQueue;
		this.maxNumberUrls = maxNumberUrls;
		this.maxDepth = maxDepth;
		this.delayBetweenUrls = delayBetweenUrls;
		this.visitedUrls = new HashMap<String, CrawlerUrl>();
		
		this.httpRegexp = Pattern.compile(REGEXP_HTTP);
		this.relativeRegexp = Pattern.compile(REGEXP_RELATIVE);
		this.charsetRegexp = Pattern.compile(REGEXP_CHARSET);
		this.titleRegexp = Pattern.compile(REGEXP_TITLE);
		
		Files.deleteIfExists(Paths.get(savePath));
		this.crawlOutput = new BufferedWriter(new FileWriter(savePath));
	}
	
	// run the crawl (restricted to a single site when a url filter is set)
	public void crawl() throws Exception {
		while (continueCrawling()) {
			CrawlerUrl nextUrl = getNextUrl();
			if (nextUrl != null) {
				System.out.println("Current url = " + nextUrl.getUrlString());
				if (isUrlRelevant(nextUrl)) {
					printCrawlInfo(); // print crawler status
					String content = getContent(nextUrl); // download the page text; pitfall: if the site refuses access to this url, content == null
					if (content == null) {
						System.out.println("The url is relevant, but it could not be visited!");
						continue;
					}
					saveContent(nextUrl, content); // save the page text to disk
					List<String> urlList = this.getUrlsFromHtml(content, nextUrl);
					addUrlsToQueue(nextUrl, urlList);
				} else {
					System.out.println("The url is not relevant, ignoring!");
				}
				System.out.println("--------------------------------------------");
				Thread.sleep(delayBetweenUrls);
			}
		}
		crawlOutput.close();
	}
	
	// to restrict the crawl to a single site, call this method before crawl()
	public void setUrlFilter(String regexp) {
		this.urlFilterRegexp = regexp;
	}
		
	// should crawling continue?
	private boolean continueCrawling() {
		// keep crawling only while the queue is non-empty and the number of visited urls is below the limit
		return ((!urlQueue.isEmpty()) && (visitedUrls.size() < maxNumberUrls));
	}
	
	private CrawlerUrl getNextUrl() throws Exception {
		CrawlerUrl nextUrl = null;
		while ((nextUrl == null) && (!urlQueue.isEmpty())) {
			CrawlerUrl crawlerUrl = urlQueue.remove(); // remove and return the url at the head of the queue
			if ((!isUrlVisited(crawlerUrl))					
					&& (crawlerUrl.getDepth() <= maxDepth)){
				nextUrl = crawlerUrl;
			}
		}
		return nextUrl;
	}

	// has this url already been visited?
	private boolean isUrlVisited(CrawlerUrl crawlerUrl) {
		if ((crawlerUrl.isVisited())
				|| (visitedUrls.containsKey(crawlerUrl.getUrlString()))) {
			return true;
		}
		return false;
	}
	
	// print key information about the crawl in progress
	private void printCrawlInfo() {
		StringBuilder sb = new StringBuilder();
		sb.append("Queue length = ").append(urlQueue.size()).append(", ")
		  .append("visited urls = ").append(visitedUrls.size()).append(", ");
		System.out.println(sb.toString());
	}

	// download a page
	public String getContent(String urlString) throws Exception {
		return getContent(new CrawlerUrl(urlString, 0));
	}
	
	public String getContent(CrawlerUrl url) throws Exception {
		String content = null;		
		String urlString = url.getUrlString();
		CloseableHttpClient httpclient = HttpClients.createDefault();
		// the code below follows the page-download example from the official httpclient documentation
		try {
			HttpGet httpget = new HttpGet(urlString);
			CloseableHttpResponse response = httpclient.execute(httpget);
			try {
				int statusCode = response.getStatusLine().getStatusCode();
				HttpEntity entity = response.getEntity();
				if ((statusCode == HttpStatus.SC_OK) && (entity != null)) {
					entity = new BufferedHttpEntity(entity); // buffered so the content can be read more than once
					StringBuilder sb = new StringBuilder();
					String contentType = entity.getContentType().toString();
					int charsetStart = contentType.indexOf("charset=");
					if (charsetStart != -1) { // the response header declares a charset: read the character stream with it
						String charset = contentType.substring(charsetStart + 8);
						BufferedReader reader = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
						int c;
						while ((c = reader.read()) != -1) sb.append((char) c);
						reader.close();
					} else { // otherwise scan the first lines of the html for a charset declaration, then re-read the whole entity with that encoding
						BufferedReader firstReader = new BufferedReader(new InputStreamReader(entity.getContent()));
						String charset = null;
						String line = null;
						int charsetStartInHtml;
						while ((line = firstReader.readLine()) != null) {
							charsetStartInHtml = line.indexOf("charset=");
							if (charsetStartInHtml != -1) {
								Matcher charsetMatcher = charsetRegexp.matcher(line);
								while (charsetMatcher.find()) charset = charsetMatcher.group(1);
								break;
							}
						}
						firstReader.close();
						if (charset == null) charset = "UTF-8"; // assumption: fall back to UTF-8 when the page declares no charset
						BufferedReader secondReader = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
						int c;
						while ((c = secondReader.read()) != -1) sb.append((char) c);
						secondReader.close();
					}
					content = sb.toString();
				}
			} finally {
				response.close();
			}
		} finally {
			httpclient.close();
		}
		visitedUrls.put(url.getUrlString(), url);
		url.setIsVisited();
		return content;
	}
	
	private List<String> getUrlsFromHtml(String content, CrawlerUrl crawlerUrl) {
		List<String> urlList = new ArrayList<String>();
		getHttpUrls(urlList, content); // add the absolute links found in the html to urlList
		getRelativeUrls(urlList, content, crawlerUrl); // add the relative links, resolved against the host, to urlList
		return urlList;
	}
	
	private void getHttpUrls(List<String> urlList, String content) {
		Matcher urlMatcher = httpRegexp.matcher(content);
		String url = null;
		while (urlMatcher.find()) {
			url = urlMatcher.group(1);
			urlList.add(url);
		}
	}
	
	private void getRelativeUrls(List<String> urlList, String content, CrawlerUrl crawlerUrl) {
		Matcher urlMatcher = relativeRegexp.matcher(content);
		String url = null;
		String host = crawlerUrl.getUrl().getHost();
		while (urlMatcher.find()) {
			url = urlMatcher.group(1);
			url = "http://" + host + url;
			urlList.add(url);
		}
	}
	
	private void addUrlsToQueue(CrawlerUrl crawlerUrl, List<String> urlList) {
		int depth = crawlerUrl.getDepth() + 1; // depth of the links extracted from crawlerUrl
		for (String url: urlList) {
			if (!visitedUrls.containsKey(url)) {
				urlQueue.add(new CrawlerUrl(url, depth));
			}
		}
	}
	
	private boolean isUrlRelevant(CrawlerUrl crawlerUrl) {
		if (crawlerUrl.getUrl() == null) return false; // malformed url, skip it
		if (urlFilterRegexp == null) return true; // no filter set: treat every url as relevant
		String host = crawlerUrl.getUrl().getHost();
		return host.indexOf(urlFilterRegexp) != -1;
	}
	
	// save the crawled url and its title; to build a search engine later, content would also have to be parsed and stripped of html tags
	private void saveContent(CrawlerUrl url, String content) throws IOException {
		String urlString = url.getUrlString();
		String title = "NA";
		Matcher m = titleRegexp.matcher(content);
		if (m.find()) title = m.group(1);
		crawlOutput.write(urlString + "\t" + title + "\n"); // write the page address and its title
	}
}

 

Next comes the main method that runs the crawler:

package webcrawler;

import java.util.LinkedList;
import java.util.Queue;

public class CrawlerTest {

	public static void main(String[] args) throws Exception {
		Queue<CrawlerUrl> urlQueue = new LinkedList<CrawlerUrl>();
		String url = "http://www.csdn.net"; // 爬虫入口
		String savePath = "D:/crawler_result.txt";
		urlQueue.add(new CrawlerUrl(url, 0));
		WebCrawler crawler = new WebCrawler(urlQueue, 100, 2, 1000L, savePath);
		crawler.setUrlFilter("csdn.net"); // 爬取内网
		crawler.crawl(); // 开始爬取
	}

}

Running the program, you can follow the crawl progress in the console output.

One thing that stands out is that csdn's blog pages are off-limits to the crawler: their source can be viewed in a browser, but requesting them with httpclient returns a 403 status code.
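
One guess, and it is only a guess, is that the server rejects requests carrying httpclient's default User-Agent. Below is a small sketch of sending browser-like headers; whether that actually gets past the 403 depends on the site, and the class name HeaderSketch and the header values are illustrative only.

package webcrawler;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HeaderSketch {
	// fetch a url while sending browser-like headers instead of httpclient's defaults,
	// and return only the status code so the effect of the headers can be compared
	public static int fetchStatus(String urlString) throws Exception {
		CloseableHttpClient httpclient = HttpClients.createDefault();
		try {
			HttpGet httpget = new HttpGet(urlString);
			httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"); // illustrative value
			httpget.setHeader("Accept", "text/html,application/xhtml+xml");
			CloseableHttpResponse response = httpclient.execute(httpget);
			try {
				EntityUtils.consume(response.getEntity()); // discard the body, only the status matters here
				return response.getStatusLine().getStatusCode();
			} finally {
				response.close();
			}
		} finally {
			httpclient.close();
		}
	}
}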

The program finishes by writing each page's address and title to the text file D:/crawler_result.txt.

Directions for improvement:

1. The crawler above is single-threaded. One improvement is to add a pool of worker threads that pull urls from the queue and process them in parallel (a minimal sketch follows this list). A better solution still is a distributed crawler, where the url queue and the set of visited urls live in a database that every node can reach.

2. The code above only writes each page's address and title to a file. For text mining you would need to keep the full text of every page; jsoup or Apache Tika are worth trying (a small jsoup sketch is also shown below).
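
As a rough illustration of point 1, here is a minimal sketch of a worker-pool crawl loop. The WebCrawler above is not thread-safe (plain HashMap, LinkedList and one shared BufferedWriter), so the sketch swaps in concurrent collections and leaves the actual download/parse step as a hypothetical fetchAndExtract method that you would have to plug in. The termination logic is also simplified: a worker quits as soon as it sees an empty queue, which a production crawler would need to handle more carefully.

package webcrawler;

import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelCrawlerSketch {
	// thread-safe replacements for the HashMap and LinkedList used by WebCrawler (ConcurrentHashMap.newKeySet() needs Java 8)
	private final Set<String> visited = ConcurrentHashMap.newKeySet();
	private final ConcurrentLinkedQueue<CrawlerUrl> queue = new ConcurrentLinkedQueue<CrawlerUrl>();

	// hypothetical worker step: download one url and return the links found on the page
	private List<String> fetchAndExtract(CrawlerUrl url) throws Exception {
		throw new UnsupportedOperationException("plug the download/parse logic in here");
	}

	public void crawl(String seed, final int maxDepth, int workers) throws InterruptedException {
		queue.add(new CrawlerUrl(seed, 0));
		ExecutorService pool = Executors.newFixedThreadPool(workers);
		for (int i = 0; i < workers; i++) {
			pool.submit(new Runnable() {
				public void run() {
					CrawlerUrl next;
					while ((next = queue.poll()) != null) { // simplification: a worker stops when the queue is momentarily empty
						if (!visited.add(next.getUrlString()) || next.getDepth() > maxDepth) continue;
						try {
							for (String link : fetchAndExtract(next)) {
								if (!visited.contains(link)) queue.add(new CrawlerUrl(link, next.getDepth() + 1));
							}
						} catch (Exception e) {
							e.printStackTrace();
						}
					}
				}
			});
		}
		pool.shutdown();
		pool.awaitTermination(1, TimeUnit.HOURS);
	}
}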
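
For point 2, this is a minimal sketch of what using jsoup could look like. jsoup is not used anywhere in the code above, so this is purely illustrative; the class name JsoupSketch is made up, while the Jsoup calls are its standard API. As a bonus, jsoup's selectors can also replace the hand-written link regexes in WebCrawler.

package webcrawler;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSketch {

	// strip the html tags and return the plain text of a downloaded page
	public static String plainText(String html) {
		return Jsoup.parse(html).text();
	}

	// extract links with a selector; "abs:href" resolves relative links against baseUri
	public static List<String> links(String html, String baseUri) {
		Document doc = Jsoup.parse(html, baseUri);
		List<String> urls = new ArrayList<String>();
		for (Element a : doc.select("a[href]")) {
			urls.add(a.attr("abs:href"));
		}
		return urls;
	}
}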
