Java 下载网页并读取内容

网页下载回来之后,还需要读取其中的内容。完整示例代码如下:

package com.core.crawl;

import java.io.IOException;

import com.util.file.Files;

public class Crawl {

    /**
     * Entry point: downloads {@code http://www.w3c.org/robots.txt} on a
     * worker thread, waits for it to finish, then prints the elapsed
     * time in milliseconds.
     *
     * @param args unused
     * @throws IOException declared for compatibility; download errors are
     *         handled inside {@link WebSpider}
     * @throws InterruptedException if the main thread is interrupted while
     *         waiting for the spider thread to finish
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        long begin = System.currentTimeMillis();

        WebSpider spider1 = new WebSpider();
        spider1.setWebAddress("http://www.w3c.org/robots.txt");
        // Destination prefix; the spider appends the detected file type.
        spider1.setDestFile(Files.getSysPath() + "/" + "robots.");

        Thread t1 = new Thread(spider1);
        t1.start();
        // Block until the download thread has finished.
        t1.join();

        System.out.println("the end");
        System.out.println(System.currentTimeMillis() - begin);
    }
}

package com.core.crawl;

import java.io.BufferedReader;

import java.io.DataInputStream;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.net.HttpURLConnection;

import java.net.URL;

import com.core.http.Http;

public class WebSpider implements Runnable {

    /** Helper that maps a Content-Type header to a file extension. */
    private final Http http = new Http();

    /** URL of the page to download. */
    private String webAddress = "";

    /** Destination file path prefix; the detected file type is appended. */
    private String destFile = "";

    /** Sets the URL to download. */
    public void setWebAddress(String webAddress) {
        this.webAddress = webAddress;
    }

    /** Sets the destination file prefix (extension is appended later). */
    public void setDestFile(String destFile) {
        this.destFile = destFile;
    }

    /**
     * Downloads the page at {@code webAddress}, echoes every line to
     * stdout, and writes the content to {@code destFile} plus the file
     * type derived from the response's Content-Type header.
     *
     * Fixes over the previous version: streams and the output file are
     * closed via try-with-resources (they previously leaked); the stray
     * {@code in.read()} that silently dropped the first byte of the
     * response is gone; the content is now actually written to the
     * destination file (the write loop was commented out); the finally
     * block no longer throws NullPointerException when the connection
     * could not be opened; and the method reports failure via its
     * return value instead of always returning {@code true}.
     *
     * @return {@code true} when the download completed, {@code false}
     *         when an error was caught and logged
     * @throws IOException declared for interface compatibility
     * @throws InterruptedException declared for interface compatibility
     */
    public boolean download() throws IOException, InterruptedException {
        HttpURLConnection httpConn = null;
        try {
            URL url = new URL(webAddress);
            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14");

            String fileType = http.fileType(httpConn.getContentType());
            System.out.println(fileType);

            // NOTE(review): reads with the platform default charset, as the
            // original did — confirm whether the response charset from the
            // Content-Type header should be honored instead.
            try (InputStream in = httpConn.getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in));
                 FileOutputStream out = new FileOutputStream(new File(destFile + fileType))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                    out.write(line.getBytes());
                    out.write('\n');
                }
            }
            return true;
        } catch (Exception ex) {
            // Keep the original best-effort behavior: log and carry on.
            System.out.println(ex.toString());
            return false;
        } finally {
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /** Runnable entry point: prints the thread name and runs the download. */
    public void run() {
        try {
            System.out.println(Thread.currentThread().getName());
            download();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}

package com.util.file;

/**
 * File-system related utility methods.
 */
public class Files {

    /** Utility class — static methods only, not meant to be instantiated. */
    private Files() {
    }

    /**
     * Returns the application's root (working) directory, i.e. the JVM's
     * {@code user.dir} system property.
     *
     * @return the application root directory
     */
    public static String getSysPath() {
        return System.getProperty("user.dir");
    }
}

results:

Thread-0

html

# robots.txt for http://www.w3.org/

#

# $Id: robots.txt,v 1.50 2007/12/13 17:09:37 ted Exp $

#

# For use by search.w3.org

User-agent: W3C-gsa

Disallow: /Out-Of-Date

User-agent: W3T_SE

Disallow: /Out-Of-Date

User-agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS Search 4.0 Robot)

Disallow: /

# W3C Link checker

User-agent: W3C-checklink

Disallow:

# exclude some access-controlled areas

User-agent: *

Disallow: /2004/ontaria/basic

Disallow: /Team

Disallow: /Project

Disallow: /Web

Disallow: /Systems

Disallow: /History

Disallow: /Out-Of-Date

Disallow: /2002/02/mid

Disallow: /mid/

Disallow: /2004/08/W3CTalks

Disallow: /2007/11/Talks/search

Disallow: /People/all/

Disallow: /RDF/Validator/ARPServlet

Disallow: /2003/03/Translations/byLanguage

Disallow: /2003/03/Translations/byTechnology

Disallow: /2005/11/Translations/Query

Disallow: /2003/glossary/subglossary/

#Disallow: /2005/06/blog/

#Disallow: /2001/07/pubrules-checker

#shouldnt get transparent proxies but will ml links of things like pubrules

Disallow: /2000/06/webdata/xslt

Disallow: /2000/09/webdata/xslt

Disallow: /2005/08/online_xslt/xslt

Disallow: /Bugs/

Disallow: /Search/Mail/Public/

Disallow: /2006/02/chartergen

the end

10485

如果想下载其他页面,可以把地址和目标文件改成例如:

spider1.setWebAddress("http://www.w3c.org/");

spider1.setDestFile(Files.getSysPath() + "/"+"w3c.");

这样的设置,自己运行测试即可。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值