In past work I implemented a few simple web crawlers, but never wrote them up systematically. This post gives a systematic introduction to building a web crawler with Java's HttpClient.
For the theory behind web crawlers, along with design ideas and crawling strategies, see the Baidu Baike entry on "网络爬虫" (web crawler), which already covers them in detail. There is no need to repeat that here; this post focuses on how to implement one.
HTTP requests:
Before starting on the code, let's look at how to inspect HTTP request information in the browser; this is the first step in analyzing a site's resources. The browser's right-click menu has an "Inspect Element" option (if you can't find it, F12 does the same). In Chrome it looks like this:
Clicking "Inspect Element" opens a panel like the following:
The Network tab is the one a crawler author should focus on. Open it and you can see every HTTP request made by the current page, as in the figure below:
Click an entry to see the details of that HTTP request, as shown below:
When a program impersonates a browser, the fields under Request Headers are what you mainly need to pay attention to; sites that require login also rely on these headers. The content under Response is what the server returned. This post only handles text; images, audio, video, and other media are not covered.
Response contains the content our crawler wants. If its formatting is hard to read there, enter the request's URL in the browser's address bar and use right-click --> View Page Source to inspect it.
By analyzing the strings in the page source, you can work out consistent extraction rules and pull out the text you need.
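To make those two steps concrete before the full class is shown, here is a minimal, self-contained sketch (not from the original post; the URL, User-Agent string, and regex are illustrative placeholders) that impersonates a browser by setting a Request Header and then applies a simple extraction rule to the returned source:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class HeaderDemo {
    public static void main(String[] args) throws Exception {
        HttpClient client = new HttpClient();
        // placeholder URL; substitute the page you analyzed in the Network tab
        GetMethod get = new GetMethod("http://www.example.com");
        // impersonate a browser with a Request Header copied from the browser
        get.setRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36");
        try {
            if (client.executeMethod(get) == HttpStatus.SC_OK) {
                String html = get.getResponseBodyAsString();
                // a simple extraction rule: pull the <title> text out of the source
                Matcher m = Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE).matcher(html);
                if (m.find()) {
                    System.out.println(m.group(1));
                }
            }
        } finally {
            get.releaseConnection();
        }
    }
}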
Code implementation:
The CrawlBase class, the base class that issues the simulated HTTP requests:
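Since CrawlBase is abstract, a concrete crawler extends it and calls readPageByGet or readPageByPost. Before the full listing, a hedged usage sketch: the subclass name and URL are made up, and it assumes a getPageSourceCode() accessor defined further down in the class (the field itself is private).

public class MyCrawl extends CrawlBase {
    public static void main(String[] args) throws Exception {
        MyCrawl crawl = new MyCrawl();
        // fetch via GET; "utf-8" is only the fallback charset for detection
        if (crawl.readPageByGet("http://www.example.com", "utf-8", null)) {
            // getPageSourceCode() is assumed to exist later in CrawlBase
            System.out.println(crawl.getPageSourceCode());
        }
    }
}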
/**
*@Description: base class for fetching web page content
*/
package com.lulei.crawl;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import com.lulei.util.CharsetUtil;
public abstract class CrawlBase {
private static Logger log = Logger.getLogger(CrawlBase.class);
// source code of the fetched page
private String pageSourceCode = "";
// response headers returned by the server
private Header[] responseHeaders = null;
// connection timeout in milliseconds
private static int connectTimeout = 3500;
// socket read timeout in milliseconds
private static int readTimeout = 3500;
// default maximum number of fetch attempts
private static int maxConnectTimes = 3;
// default page character encoding
private static String charsetName = "iso-8859-1";
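// a single shared HttpClient (Commons HttpClient 3.x) reused for every request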
private static HttpClient httpClient = new HttpClient();
static {
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(connectTimeout);
httpClient.getHttpConnectionManager().getParams().setSoTimeout(readTimeout);
}
/**
* @param urlStr
* @param charsetName
* @param method
* @param params
* @return
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: fetch the page using the HTTP method named in the method argument
*/
public boolean readPage(String urlStr, String charsetName, String method, HashMap params) throws HttpException, IOException {
if ("post".equals(method) || "POST".equals(method)) {
return readPageByPost(urlStr, charsetName, params);
} else {
return readPageByGet(urlStr, charsetName, params);
}
}
/**
* @param urlStr
* @param charsetName
* @param params
* @return whether the fetch succeeded
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: fetch the page via GET
*/
public boolean readPageByGet(String urlStr, String charsetName, HashMap params) throws HttpException, IOException {
GetMethod getMethod = createGetMethod(urlStr, params);
return readPage(getMethod, charsetName, urlStr);
}
/**
* @param urlStr
* @param charsetName
* @param params
* @return whether the fetch succeeded
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: fetch the page via POST
*/
public boolean readPageByPost(String urlStr, String charsetName, HashMap params) throws HttpException, IOException{
PostMethod postMethod = createPostMethod(urlStr, params);
return readPage(postMethod, charsetName, urlStr);
}
/**
* @param method
* @param defaultCharset
* @param urlStr
* @return whether the fetch succeeded
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: read the page body and response headers
*/
private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
int n = maxConnectTimes;
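// retry until the fetch succeeds or the attempt budget is exhausted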
while (n > 0) {
try {
int statusCode = httpClient.executeMethod(method);
if (statusCode != HttpStatus.SC_OK) {
log.error("can not connect " + urlStr + "\t" + (maxConnectTimes - n + 1) + "\t" + statusCode);
n--;
} else {
// capture the response headers
responseHeaders = method.getResponseHeaders();
// read the page body as a stream
InputStream inputStream = method.getResponseBodyAsStream();
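// decode with iso-8859-1 first: it maps bytes to chars one-to-one,
// so the original bytes can be recovered once the real charset is known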
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
StringBuffer stringBuffer = new StringBuffer();
String lineString = null;
while ((lineString = bufferedReader.readLine()) != null){
stringBuffer.append(lineString);
stringBuffer.append("\n");
}
pageSourceCode = stringBuffer.toString();
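// re-encode with the same single-byte charset to recover the raw bytes, then detect the page's real charset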
InputStream in = new ByteArrayInputStream(pageSourceCode.getBytes(charsetName));
String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
// this special-case check was added specifically for an IP geolocation lookup site
if ("Big5".equals(charset)) {
charset = "gbk";
}
if (!charsetName.equalsIgnoreCase(charset)) {
pageSourceCode = new String(pageSourceCode.getBytes(charsetName), charset);
}
return true;
}
} catch (Exception e) {
log.error(urlStr + " -- can't connect " + (maxConnectTimes - n + 1), e);
n--;
}
}
return false;
}
/**
* @param urlStr
* @param params
* @return GetMethod
* @Author: lulei
* @Description: build a GetMethod and apply params as request headers
*/
@SuppressWarnings("rawtypes")
private GetMethod createGetMethod(String urlStr, HashMap params){
GetMethod getMethod = new GetMethod(urlStr);
if (params == null){
return getMethod;
}
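// note: the entries in params are sent as HTTP request headers (e.g. User-Agent, Cookie), not as query-string parameters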
Iterator iter = params.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry entry = (Map.Entry) iter.next();
String key = (String) entry.getKey();
String val = (String) entry.getValue();
getMethod.setRequestHeader(key, val);
}
return getMethod;
}
/**
* @param urlStr
* @param params