/*
 * 功能:
 * 1.返回网页文本内容;
 * 2.正则表达式提取title;
 * 3.自动创建下载目录及目录合法检查;
 * 4.对抓取的网页重命名;
 * 5.文件名乱码问题的解决
 */
package basicLearn;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * Downloads a single web page and saves it locally, named after the page's
 * {@code <title>} element. The HTTP request is issued eagerly in the
 * constructor; call {@link #downLoadPage(String)} to persist the body and
 * {@link #closeResource()} to release streams and the connection.
 */
public class CrawlPage1 {

    /**
     * Captures the text between {@code <title>} and {@code </title>}.
     * Compiled once and reused. NOTE: the previous form
     * {@code <title>([^</title>]*)} was a character class that excluded the
     * individual characters '&lt;', '/', 't', 'i', 'l', 'e', '&gt;' — it
     * truncated any title containing those letters.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private HttpClient httpClient;
    private GetMethod getMethod;
    private int statusCode;
    private InputStream is;
    private OutputStream os;
    private File file;

    /**
     * Fetches the given URL immediately (direct connection, no proxy).
     *
     * @param url URL of the page to crawl
     * @throws RuntimeException wrapping the I/O failure if the request fails
     */
    public CrawlPage1(String url) {
        httpClient = new HttpClient();
        getMethod = new GetMethod(url);
        try {
            // HttpException is a subclass of IOException, so one catch suffices.
            statusCode = httpClient.executeMethod(getMethod);
        } catch (IOException e) {
            // Preserve the cause instead of discarding it.
            throw new RuntimeException("failed to execute GET " + url, e);
        }
        System.out.println("initial over");
    }

    /**
     * Utility: reads the whole text content of a file as UTF-8.
     * The charset must be fixed at stream-construction time; relying on the
     * platform default charset is what produced mojibake before.
     *
     * @param file file to read
     * @return the file content with lines joined by '\n'
     * @throws Exception if the file cannot be read
     */
    public static String readContent(File file) throws Exception {
        System.out.println("read content beginning...");
        StringBuilder content = new StringBuilder();
        // try-with-resources guarantees the reader is closed on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        System.out.println("read content over...");
        return content.toString();
    }

    /**
     * Derives a file name from the page's {@code <title>} element.
     *
     * @param content HTML text of the page
     * @return the first title found, trimmed, with characters illegal in
     *         Windows file names replaced by '_'; {@code null} when the page
     *         has no title
     */
    public static String getFileName(String content) {
        Matcher matcher = TITLE_PATTERN.matcher(content);
        if (matcher.find()) {
            // Sanitize: \ / : * ? " < > | are not allowed in file names.
            String title = matcher.group(1).trim().replaceAll("[\\\\/:*?\"<>|]", "_");
            System.out.println("get file name over..." + title);
            return title;
        }
        return null;
    }

    /**
     * Saves the fetched page under {@code directory}: the body is written to
     * {@code temp.html} first, then renamed to {@code <title>.html}.
     * The directory (including parents) is created when missing.
     *
     * @param directory local directory to save the page into
     * @throws RuntimeException when the path denotes a regular file or an
     *                          I/O error occurs (original cause attached)
     */
    public void downLoadPage(String directory) {
        try {
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("statuscode is " + getMethod.getStatusCode());
                return;
            }
            System.out.println("statuscode is " + getMethod.getStatusCode());

            File dir = new File(directory);
            if (dir.isFile()) {
                throw new RuntimeException("这不是一个目录,而是一个文件");
            }
            if (!dir.exists() && !dir.mkdirs()) {
                System.out.println("你所指定的保存网页的目录并不是一个有效的位置..");
                return;
            }
            this.file = dir;

            // Download to a temporary file first so we can read the <title>.
            File temp = new File(dir, "temp.html");
            System.out.println("begin downLoadPage...");
            is = getMethod.getResponseBodyAsStream();
            os = new FileOutputStream(temp);
            byte[] buf = new byte[2048];
            int n;
            while ((n = is.read(buf)) != -1) {
                os.write(buf, 0, n);
            }
            // Close BOTH streams (the input stream was leaked before) and
            // release the connection as soon as the body is consumed.
            is.close();
            os.close();
            getMethod.releaseConnection();
            System.out.println(" downLoadPage over...");

            String title = getFileName(readContent(temp));
            // Fall back to keeping temp.html when the page has no title.
            File target = new File(dir, (title == null ? "temp" : title) + ".html");
            if (!temp.equals(target) && !temp.renameTo(target)) {
                System.out.println("rename failed: " + target.getAbsolutePath());
            }
            System.out.println("rename the file as" + title);
            System.out.println("download over..");
        } catch (Exception e) {
            // Keep the original cause attached instead of printStackTrace().
            throw new RuntimeException("读写错误了!!!", e);
        }
    }

    /**
     * Releases both streams and the HTTP connection. Each resource is closed
     * independently so a failure closing one no longer leaks the others.
     * Safe to call when {@code downLoadPage} was never invoked or failed.
     */
    public void closeResource() {
        try {
            if (is != null) {
                is.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                if (os != null) {
                    os.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            } finally {
                // Always give the connection back, even if a close failed.
                if (getMethod != null) {
                    getMethod.releaseConnection();
                }
            }
        }
        System.out.println("resource released...");
    }

    /** Demo entry point: download one page and release resources. */
    public static void main(String[] args) {
        CrawlPage1 crawlPage1 = new CrawlPage1(
                "http://hao.360.cn/");
        crawlPage1.downLoadPage("E:\\工作\\搜索引擎\\pageDownload\\temp");
        crawlPage1.closeResource();
        System.out.println("ok");
    }
}