/*
 * 功能:
 * 1.返回网页文本内容;
 * 2.正则表达式提取title;
 * 3.自动创建下载目录及目录合法检查;
 * 4.对抓取的网页重命名;
 * 5.文件名乱码问题的解决
 */
package basicLearn;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
/**
 * Downloads a single web page and saves it locally, named after the page's
 * {@code <title>} element. The HTTP request is issued eagerly in the
 * constructor; call {@link #downLoadPage(String)} to persist the body and
 * {@link #closeResource()} to release streams and the connection.
 */
public class CrawlPage1 {

    /**
     * Captures the text between {@code <title>} and {@code </title>}.
     * Compiled once and reused. NOTE: the previous form
     * {@code <title>([^</title>]*)} was a character class that excluded the
     * individual characters '&lt;', '/', 't', 'i', 'l', 'e', '&gt;' — it
     * truncated any title containing those letters.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private HttpClient httpClient;
    private GetMethod getMethod;
    private int statusCode;
    private InputStream is;
    private OutputStream os;
    private File file;

    /**
     * Fetches the given URL immediately (direct connection, no proxy).
     *
     * @param url URL of the page to crawl
     * @throws RuntimeException wrapping the I/O failure if the request fails
     */
    public CrawlPage1(String url) {
        httpClient = new HttpClient();
        getMethod = new GetMethod(url);
        try {
            // HttpException is a subclass of IOException, so one catch suffices.
            statusCode = httpClient.executeMethod(getMethod);
        } catch (IOException e) {
            // Preserve the cause instead of discarding it.
            throw new RuntimeException("failed to execute GET " + url, e);
        }
        System.out.println("initial over");
    }

    /**
     * Utility: reads the whole text content of a file as UTF-8.
     * The charset must be fixed at stream-construction time; relying on the
     * platform default charset is what produced mojibake before.
     *
     * @param file file to read
     * @return the file content with lines joined by '\n'
     * @throws Exception if the file cannot be read
     */
    public static String readContent(File file) throws Exception {
        System.out.println("read content beginning...");
        StringBuilder content = new StringBuilder();
        // try-with-resources guarantees the reader is closed on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        System.out.println("read content over...");
        return content.toString();
    }

    /**
     * Derives a file name from the page's {@code <title>} element.
     *
     * @param content HTML text of the page
     * @return the first title found, trimmed, with characters illegal in
     *         Windows file names replaced by '_'; {@code null} when the page
     *         has no title
     */
    public static String getFileName(String content) {
        Matcher matcher = TITLE_PATTERN.matcher(content);
        if (matcher.find()) {
            // Sanitize: \ / : * ? " < > | are not allowed in file names.
            String title = matcher.group(1).trim().replaceAll("[\\\\/:*?\"<>|]", "_");
            System.out.println("get file name over..." + title);
            return title;
        }
        return null;
    }

    /**
     * Saves the fetched page under {@code directory}: the body is written to
     * {@code temp.html} first, then renamed to {@code <title>.html}.
     * The directory (including parents) is created when missing.
     *
     * @param directory local directory to save the page into
     * @throws RuntimeException when the path denotes a regular file or an
     *                          I/O error occurs (original cause attached)
     */
    public void downLoadPage(String directory) {
        try {
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("statuscode is " + getMethod.getStatusCode());
                return;
            }
            System.out.println("statuscode is " + getMethod.getStatusCode());

            File dir = new File(directory);
            if (dir.isFile()) {
                throw new RuntimeException("这不是一个目录,而是一个文件");
            }
            if (!dir.exists() && !dir.mkdirs()) {
                System.out.println("你所指定的保存网页的目录并不是一个有效的位置..");
                return;
            }
            this.file = dir;

            // Download to a temporary file first so we can read the <title>.
            File temp = new File(dir, "temp.html");
            System.out.println("begin downLoadPage...");
            is = getMethod.getResponseBodyAsStream();
            os = new FileOutputStream(temp);
            byte[] buf = new byte[2048];
            int n;
            while ((n = is.read(buf)) != -1) {
                os.write(buf, 0, n);
            }
            // Close BOTH streams (the input stream was leaked before) and
            // release the connection as soon as the body is consumed.
            is.close();
            os.close();
            getMethod.releaseConnection();
            System.out.println(" downLoadPage over...");

            String title = getFileName(readContent(temp));
            // Fall back to keeping temp.html when the page has no title.
            File target = new File(dir, (title == null ? "temp" : title) + ".html");
            if (!temp.equals(target) && !temp.renameTo(target)) {
                System.out.println("rename failed: " + target.getAbsolutePath());
            }
            System.out.println("rename the file as" + title);
            System.out.println("download over..");
        } catch (Exception e) {
            // Keep the original cause attached instead of printStackTrace().
            throw new RuntimeException("读写错误了!!!", e);
        }
    }

    /**
     * Releases both streams and the HTTP connection. Each resource is closed
     * independently so a failure closing one no longer leaks the others.
     * Safe to call when {@code downLoadPage} was never invoked or failed.
     */
    public void closeResource() {
        try {
            if (is != null) {
                is.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        } finally {
            try {
                if (os != null) {
                    os.close();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            } finally {
                // Always give the connection back, even if a close failed.
                if (getMethod != null) {
                    getMethod.releaseConnection();
                }
            }
        }
        System.out.println("resource released...");
    }

    /** Demo entry point: download one page and release resources. */
    public static void main(String[] args) {
        CrawlPage1 crawlPage1 = new CrawlPage1(
                "http://hao.360.cn/");
        crawlPage1.downLoadPage("E:\\工作\\搜索引擎\\pageDownload\\temp");
        crawlPage1.closeResource();
        System.out.println("ok");
    }
}