Finally wrote a small crawler program; noting it down here. It only implements fetching page content from a given URL.
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
/*
 * Fetch the content of a web page
 */
public class DownloadPage
{
    public String getContentFormUrl(String url, Logger log)
    {
        String content = null;
        /* Retry up to three times before giving up */
        for (int attempt = 0; attempt < 3; attempt++)
        {
            /* Create a fresh client per attempt: once its connection manager
               has been shut down, a client cannot issue further requests */
            HttpClient client = new DefaultHttpClient();
            HttpGet getHttp = new HttpGet(url);
            try
            {
                /* Socket read timeout and connection timeout, in milliseconds */
                client.getParams().setIntParameter(HttpConnectionParams.SO_TIMEOUT, 3000);
                client.getParams().setIntParameter(HttpConnectionParams.CONNECTION_TIMEOUT, 3000);
                /* Execute the request and obtain the response entity */
                HttpResponse response = client.execute(getHttp);
                HttpEntity entity = response.getEntity();
                if (entity != null)
                {
                    /* Convert the entity to text */
                    content = EntityUtils.toString(entity);
                    log.info("Fetched content from " + url);
                    // log.info(content); // uncomment to log the full page
                    return content;
                }
            } catch (Exception e)
            {
                log.error("Exception while accessing " + url + ": " + e.toString());
            } finally
            {
                /* Always release the connection; a new client is created on the next attempt */
                client.getConnectionManager().shutdown();
            }
        }
        return content;
    }
}
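
For reference, a minimal sketch of how the class might be invoked; the class name DownloadPageTest and the target URL below are placeholders of mine, and it assumes log4j is already configured (for example via a log4j.properties on the classpath).

import org.apache.log4j.Logger;

public class DownloadPageTest
{
    public static void main(String[] args)
    {
        /* Placeholder logger and URL, purely for illustration */
        Logger log = Logger.getLogger(DownloadPageTest.class);
        DownloadPage downloader = new DownloadPage();
        String content = downloader.getContentFormUrl("http://www.example.com", log);
        if (content != null)
        {
            System.out.println("Fetched " + content.length() + " characters");
        }
    }
}

One caveat: EntityUtils.toString(entity) falls back to ISO-8859-1 when the response does not declare a charset, so pages in other encodings may come back garbled; the overload EntityUtils.toString(entity, "UTF-8") lets you force a charset explicitly.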