网络爬虫系列之一：通过URL下载网页

最新推荐文章于 2024-06-18 16:33:43 发布

huzhengnan

最新推荐文章于 2024-06-18 16:33:43 发布

阅读量1.1w

点赞数 7

分类专栏：java、搜索引擎、网络爬虫　　文章标签：搜索引擎、网络爬虫、java

本文链接：https://blog.csdn.net/huzhengnan/article/details/22288897

版权

java 同时被 3 个专栏收录

3 篇文章 0 订阅

订阅专栏

搜索引擎

3 篇文章 0 订阅

订阅专栏

网络爬虫

3 篇文章 0 订阅

订阅专栏

世界上第一个爬虫叫做"互联网漫游者（www wanderer）"，是由MIT学生马修·格雷写的。我想他大概也是通过细心细致的观察后发现：互联网上的页面之间是有联系的。比如说，通过分析一个页面的链接，就能下载到其它页面。而且做起来可能没那么困难，就放手去尝试，第一个爬虫就成了！

初学爬虫，顺着自己的思路往下做。

第一个爬虫的第一个部分就是下载页面；第二个部分则是分析已经下载的页面中的链接，进而下载到新的页面。

关键的地方在于java.net.URL类中有openStream()方法，就相当于获得下载的入口，看到这个感觉今天的任务就已经完成一半了。

运行代码过后，刷新工程目录就可以看到新生成的 download.html 文件（见图2）。

TIPS：在写代码的时候出现了乱码问题。第一次是输出到控制台乱码，就在 BufferedReader 所包装的 InputStreamReader 中加了参数 "UTF-8"；然后下载的页面又乱码，就在 OutputStreamWriter 的构造参数中添加了 "utf-8" 就 OK 了。在文章末尾增加了近期更新的代码。

一、工程目录

图1. 工程目录

二、程序源代码

package csdnBlog;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Downloads a web page through its URL.
 * 
 * Implementation: the page is read from the input stream obtained from the
 * URL and copied into a local file.
 * 
 * @author <span style="color:blue"><b>胡征南</b></span>
 * 
 */
public class URLPageDownload {

	/** Number of characters copied per read/write cycle. */
	private static final int BUFFER_SIZE = 8192;

	/**
	 * Downloads the resource at the given address into the file
	 * {@code download.html} in the current working directory.
	 * 
	 * The content is decoded as UTF-8 and written back as UTF-8. It is copied
	 * in character chunks rather than line by line so that the original line
	 * terminators are preserved. Failures are reported on standard error via
	 * {@code printStackTrace()}; no exception is propagated to the caller.
	 * 
	 * @param str
	 *            the address to download
	 */
	public static void downloadPage(String str) {
		BufferedReader br = null;
		FileOutputStream fos = null;
		OutputStreamWriter osw = null;
		try {
			URL url = new URL(str);

			// url.openStream() provides the input stream of the remote page
			br = new BufferedReader(new InputStreamReader(url.openStream(),
					"UTF-8"));

			File file = new File("download.html");
			fos = new FileOutputStream(file);
			osw = new OutputStreamWriter(fos, "utf-8");

			// Copy raw character chunks: readLine() would drop every line
			// terminator and collapse the page onto a single line.
			char[] buffer = new char[BUFFER_SIZE];
			int count;
			while ((count = br.read(buffer)) != -1) {
				osw.write(buffer, 0, count);
			}

			// Flush before announcing success so that write errors are
			// reported by the catch blocks below instead of being missed.
			osw.flush();
			System.err.println("下载完毕!");
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Each resource is closed independently, so a failure while
			// opening one of them can no longer leak the others.
			closeResource(osw);
			closeResource(fos);
			closeResource(br);
		}
	}

	/**
	 * Closes the given resource if it was opened, reporting (but not
	 * propagating) any failure.
	 * 
	 * @param resource
	 *            the resource to close; may be {@code null}
	 */
	private static void closeResource(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public static void main(String[] args) {
		// Manual test
		URLPageDownload.downloadPage("http://www.csu.edu.cn/");
	}

}

三、运行效果

图2. 运行后刷新项目产生的新文件

图3. 运行download.html的效果

四、参考资料

乱码解决方案参考资料：

1、文件读取乱码

http://blog.csdn.net/greenqingqingws/article/details/7395213

2、文件写入乱码

http://hi.baidu.com/duanxzf/item/ff9837e4c2eaea1b8c3ea870

----------------------------------------------------分割线，2014年4月16日更新--------------------------------------------------------------------------------------------

1、采用了HttpURLConnection进行访问；

2、将页面下载到一定路径下，显得更为美观；

3、增加了对页面编码格式的读取（检测），解决了一些下载页面乱码的问题。

相关代码如下：

package csdnBlog;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * 
 * Downloads a page through an HTTP request.
 * 
 * @author <span style="color:blue"><b>胡征南</b></span>
 * 
 */
public class HttpPageDownload {

	/** Number of characters copied per read/write cycle. */
	private static final int BUFFER_SIZE = 8192;

	/**
	 * Downloads the page at the given address into the file whose name is
	 * produced by {@code Utils.urlToFileName(urlStr)}, creating missing parent
	 * directories.
	 * 
	 * The page is decoded and re-encoded with the charset returned by
	 * {@code Utils.getCharsetFormUrl(urlStr)}. Content is copied in character
	 * chunks so that line terminators are preserved. Failures are reported on
	 * standard error; no exception is propagated to the caller.
	 * 
	 * @param urlStr
	 *            the address to download
	 * @return {@code true} if the server answered with status 200 and the
	 *         page was written completely, {@code false} otherwise
	 */
	public static boolean DownloadPages(String urlStr) {
		HttpURLConnection httpConn = null;
		InputStream in = null;
		FileOutputStream out = null;
		BufferedReader br = null;
		BufferedWriter bw = null;
		try {
			URL url = new URL(urlStr);
			httpConn = (HttpURLConnection) url.openConnection();
			// Connection timeout: 5 seconds
			httpConn.setConnectTimeout(5000);
			// Read timeout: 30 seconds
			httpConn.setReadTimeout(30000);
			httpConn.setRequestMethod("GET");
			// HTTP request header
			httpConn.setRequestProperty("User-Agent",
					"Mozilla/4.0(compatible; MSIE 6.0; Windows 2000)");

			// Decide on the numeric status code: the reason phrase ("OK") is
			// free text and may be absent or worded differently.
			if (httpConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
				return false;
			}

			// Local file that receives the page
			File downloadPage = new File(Utils.urlToFileName(urlStr));
			File parent = downloadPage.getParentFile();
			if (parent != null && !parent.exists()) {
				parent.mkdirs();
			}

			// Charset used for both decoding and re-encoding the page
			String charset = Utils.getCharsetFormUrl(urlStr);

			// Wrap the byte streams into character streams. The reader is
			// created first so that an unusable charset name fails before the
			// output file is created.
			in = httpConn.getInputStream();
			br = new BufferedReader(new InputStreamReader(in, charset));
			out = new FileOutputStream(downloadPage);
			bw = new BufferedWriter(new OutputStreamWriter(out, charset));

			// Copy raw character chunks: readLine() would drop every line
			// terminator and collapse the page onto a single line.
			char[] buffer = new char[BUFFER_SIZE];
			int count;
			while ((count = br.read(buffer)) != -1) {
				bw.write(buffer, 0, count);
			}
			// Flush inside the try block so that a failed write yields false
			bw.flush();
			return true;
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (Exception e) {
			System.err.println("下载失败");
			e.printStackTrace();
		} finally {
			// Null-safe, independent cleanup: a failure while setting up the
			// streams must neither leak the others nor raise a
			// NullPointerException here.
			closeResource(bw);
			closeResource(out);
			closeResource(br);
			closeResource(in);
			if (httpConn != null) {
				httpConn.disconnect();
			}
		}
		return false;
	}

	/**
	 * Closes the given resource if it was opened, reporting (but not
	 * propagating) any failure.
	 * 
	 * @param resource
	 *            the resource to close; may be {@code null}
	 */
	private static void closeResource(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	// Manual test
	public static void main(String[] args) {
		DownloadPages("http://mobile.163.com/");
	}
}

工具类代码如下：

package csdnBlog;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 
 * Collection of small helpers used by the page downloaders.
 * 
 * @author <span style="color:blue"><b>胡征南</b></span>
 * 
 */
public class Utils {

	/** Charset reported when a page declares no usable charset. */
	private static final String DEFAULT_CHARSET = "utf-8";

	/** Number of leading bytes of a page inspected for a charset declaration. */
	private static final int SNIFF_LIMIT = 1024;

	/**
	 * Matches a charset declaration inside a {@code <meta>} tag. The captured
	 * name may contain hyphens, dots, colons, plus signs and underscores, so
	 * names such as {@code iso-8859-1} are captured in full.
	 */
	private static final Pattern META_CHARSET = Pattern.compile(
			"<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9][\\w.:+-]*)",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Maps a URL to the relative path of the local file that stores it.
	 * 
	 * The scheme prefix is dropped, characters that are not allowed in file
	 * names ({@code \ : * ? " < > |}) are removed, and the result is placed
	 * below {@code downloads/}. Empty, {@code .} and {@code ..} path segments
	 * are skipped so that the result cannot point outside {@code downloads/}.
	 * A URL without a file part maps to {@code index.html}; a file part
	 * without a dot gets the extension {@code .html}.
	 * 
	 * @param url
	 *            the URL to map; must not be {@code null}
	 * @return the relative file path, always starting with {@code downloads/}
	 */
	public static String urlToFileName(String url) {
		String path = url.replace("http://", "");
		path = path.replace("https://", "");
		// Remove characters that must not appear in a file name
		path = path.replaceAll("[\\\\:*?\"<>|]", "");

		// Everything before the last '/' is directory, the rest is the file
		int lastSlash = path.lastIndexOf('/');
		String dirPart = lastSlash < 0 ? path : path.substring(0, lastSlash);
		String filePart = lastSlash < 0 ? "" : path.substring(lastSlash + 1);

		StringBuilder fileName = new StringBuilder("downloads");
		for (String segment : dirPart.split("/")) {
			if (isSkippedSegment(segment)) {
				continue;
			}
			fileName.append('/').append(segment);
		}

		if (isSkippedSegment(filePart)) {
			// Host-only URL or URL ending in '/': store the directory index
			filePart = "index.html";
		} else if (!filePart.contains(".")) {
			filePart = filePart + ".html";
		}
		return fileName.append('/').append(filePart).toString();
	}

	/**
	 * Tells whether a path segment carries no name of its own (empty,
	 * {@code .} or {@code ..}) and must therefore not become part of a local
	 * path.
	 */
	private static boolean isSkippedSegment(String segment) {
		return segment.length() == 0 || ".".equals(segment)
				|| "..".equals(segment);
	}

	/**
	 * Detects the charset declared by the page at the given address.
	 * 
	 * Only a {@code <meta ... charset=...>} declaration within the first
	 * {@value #SNIFF_LIMIT} bytes of the page is examined. If the address
	 * cannot be read, no declaration is found, or the declared charset is not
	 * supported by this JVM, {@code "utf-8"} is returned. Failures are
	 * reported on standard error; no exception is propagated.
	 * 
	 * @param urlString
	 *            the address of the page
	 * @return the declared charset name as written in the page, or
	 *         {@code "utf-8"} as fallback
	 */
	public static String getCharsetFormUrl(String urlString) {
		String charset = DEFAULT_CHARSET;
		InputStream in = null;
		try {
			URL url = new URL(urlString);
			java.net.URLConnection conn = url.openConnection();
			if (conn instanceof HttpURLConnection) {
				HttpURLConnection httpConn = (HttpURLConnection) conn;
				// Per-connection setting; the static setFollowRedirects()
				// would change the behavior of every connection in the JVM.
				httpConn.setInstanceFollowRedirects(true);
				httpConn.setRequestMethod("GET");
			}
			conn.setRequestProperty("User-Agent",
					"Mozilla/4.0(compatible; MSIE 6.0; Windows 2000)");
			conn.setConnectTimeout(5000);
			conn.setReadTimeout(30000);
			in = conn.getInputStream();

			// Read the head of the page; this much is normally enough to
			// reach the charset declaration.
			byte[] head = new byte[SNIFF_LIMIT];
			int len = 0;
			int count;
			while (len < head.length
					&& (count = in.read(head, len, head.length - len)) != -1) {
				len += count;
			}

			// ISO-8859-1 maps every byte to one char, so the ASCII markup can
			// be searched regardless of the page's real encoding and of the
			// platform default charset.
			String contents = new String(head, 0, len, "ISO-8859-1");
			Matcher m = META_CHARSET.matcher(contents);
			if (m.find() && isSupportedCharset(m.group(1))) {
				charset = m.group(1);
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// The stream is null when the connection could not be opened
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return charset;
	}

	/**
	 * Tells whether this JVM can decode and encode the named charset.
	 */
	private static boolean isSupportedCharset(String name) {
		try {
			return java.nio.charset.Charset.isSupported(name);
		} catch (IllegalArgumentException e) {
			// Syntactically illegal charset name
			return false;
		}
	}

	public static void main(String[] args) {
		// Manual test of the charset detection
		String charset = getCharsetFormUrl("http://cidian.youdao.com/?vendor=topnav");
		System.out.println(charset);
	}
}

相关代码资源下载地址：http://download.csdn.net/detail/huzhengnan/7203185

huzhengnan

关注

7
点赞
踩
19

收藏

觉得还不错? 一键收藏
3
评论
网络爬虫系列之一：通过URL下载网页

世界上第一个爬虫叫做"互联网漫游者（www wanderer）"，是由MIT学生马修·格雷写的。我想他大概也是通过细心细致的观察后发现：互联网上的页面之间是有联系的。比如说，通过分析一个页面的链接，就能下载到其它页面。而且做起来可能没那么困难，就放手去尝试，第一个爬虫就成了！初学爬虫，顺着自己的思路往下做。第一个爬虫的第一个部分就是下载，同时也知道第二个部分
复制链接

扫一扫

专栏目录