多线程数据采集:第一步是使用 Java 抓取网页数据,获取网页的文本节点。目前我接触过三种抓取方式。
第一种方式:使用 jsoup 库。这种方式相对简单,多说无益,直接上代码。
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Collects link targets from a listing page with jsoup, following the page's
 * pagination link from page to page.
 */
public class jsoup {

    /** Prefix prepended to the href of the pagination link to build the next URL to fetch. */
    private static final String SITE_BASE = "http://www.xxxxx.com/";

    /** CSS selector matching the item links whose href values are collected. */
    private static final String ITEM_LINK_SELECTOR = ".m_book li a";

    /** CSS selector matching the pagination link that is followed after each page. */
    private static final String FOLLOW_LINK_SELECTOR = ".page_list .page_up";

    /** Timeout passed to jsoup for each page fetch, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches {@code listurl}, appends the href of every item link to {@code list},
     * then follows the pagination link and repeats until no such link is found,
     * a URL repeats, or a fetch fails.
     *
     * <p>Any exception is printed and ends the crawl; links gathered up to that
     * point are still returned.
     *
     * @param list    list to append to; a new list is created only when this is {@code null},
     *                so a caller-supplied list (even an empty one) always receives the results
     * @param listurl URL of the first page to fetch; {@code null} leaves the list unchanged
     * @return the list holding all collected href values (the same instance as
     *         {@code list} when it was non-null)
     */
    public static List<String> getSiteUrlList1(List<String> list, String listurl) {
        if (list == null) {
            list = new ArrayList<String>();
        }
        // Remember fetched URLs so a pagination link that points back to an
        // already-seen page cannot make the crawl spin forever.
        Set<String> visited = new HashSet<String>();
        String pageUrl = listurl;
        try {
            // Iterate instead of recursing: deep pagination must not grow the call stack.
            while (pageUrl != null && visited.add(pageUrl)) {
                Document page = Jsoup.connect(pageUrl).timeout(TIMEOUT_MILLIS).get();
                for (Element link : page.select(ITEM_LINK_SELECTOR)) {
                    list.add(link.attr("href"));
                }
                Element follow = page.select(FOLLOW_LINK_SELECTOR).first();
                pageUrl = (follow == null) ? null : SITE_BASE + follow.attr("href");
            }
        } catch (Exception e) {
            // Best effort: report the failure and return what was collected so far.
            e.printStackTrace();
        }
        return list;
    }

    /** Demo entry point: crawls a sample URL into a fresh list. */
    public static void main(String[] args) {
        List<String> list = new ArrayList<String>();
        String listurl = "http://csdn.net";
        getSiteUrlList1(list, listurl);
    }
}
第二种方式:HttpURLConnection
这里可能有人会问:Java 中的 URLConnection 和 HttpURLConnection 有什么区别?
URLConnection 是一个抽象类,只实现了一些基本方法。
抽象类URLConnection是所有表示应用程序与 URL 之间通信链路的类的超类。该类的实例可以用来对由 URL 引用的资源进行读取和写入操作
HttpURLConnection 是 URLConnection 的子类(它本身也是抽象类),专门处理 HTTP 协议相关的操作,是支持 HTTP 特定功能的 URLConnection。
这个答案还行吧?
看看下面的代码,可以帮你尽快了解 HttpURLConnection 的相关知识。这段代码通过模拟浏览器(设置 User-Agent 等请求头)进行抓取,稍微复杂一些。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads the text of a web page with {@link HttpURLConnection}, sending
 * browser-like request headers and a User-Agent picked at random from
 * {@link #UserAgent}.
 */
public class HttpWebUtil {

    /** Charset used to decode the response when the caller supplies none. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Fetches a page and returns its body as text.
     *
     * <p>Each line of the response is re-terminated with {@code "\r\n"}.
     *
     * @param urlString URL to fetch; {@code "http://"} is prepended when the value does not
     *                  start with {@code http://} or {@code https://}
     * @param charset   name of the charset used to decode the body; {@code null} or empty
     *                  falls back to UTF-8
     * @param timeout   connect timeout and read timeout, in milliseconds
     * @return the page text; an empty string when {@code urlString} is null or empty, when the
     *         response status is not 200, or when obtaining the status fails (the exception
     *         message is then printed to standard output)
     * @throws IOException if the URL is malformed, the connection cannot be opened, the charset
     *                     is unsupported, or reading the body fails
     */
    public static String GetWebContent(String urlString, final String charset, int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return "";
        }
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }
        String effectiveCharset = (charset == null || charset.length() == 0) ? DEFAULT_CHARSET : charset;

        HttpURLConnection conn = (HttpURLConnection) new URL(urlString).openConnection();
        try {
            // No request body is written, so output mode is deliberately left off:
            // this is a plain GET.
            conn.setRequestProperty("Pragma", "no-cache");
            conn.setRequestProperty("Cache-Control", "no-cache");
            if (UserAgent != null && UserAgent.length > 0) {
                // Math.random() is in [0, 1), so the index is uniform over the whole array.
                int pick = (int) (Math.random() * UserAgent.length);
                conn.setRequestProperty("User-Agent", UserAgent[pick]);
            }
            // Prefer HTML/XHTML/XML, but accept any other type (*/*) at lower priority.
            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            conn.setConnectTimeout(timeout);
            // Without a read timeout a server that stops sending would block the caller forever.
            conn.setReadTimeout(timeout);

            int status;
            try {
                status = conn.getResponseCode();
            } catch (Exception e) {
                // Best effort: a failed request is reported and yields an empty result.
                System.out.println(e.getMessage());
                return "";
            }
            if (status != HttpURLConnection.HTTP_OK) {
                return "";
            }

            InputStream input = conn.getInputStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, effectiveCharset));
            try {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append("\r\n");
                }
                return sb.toString();
            } finally {
                reader.close();
            }
        } finally {
            // Runs on every exit path, including the early returns above.
            conn.disconnect();
        }
    }

    /** User-Agent strings; one is chosen at random for every request. */
    public static String[] UserAgent = {
        "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.2",
        "Mozilla/5.0 (iPad; U; CPU OS 3_2_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B500 Safari/531.21.11",
        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18121",
        "Nokia5700AP23.01/SymbianOS/9.1 Series60/3.0",
        "UCWEB7.0.2.37/28/998",
        "NOKIA5700/UCWEB7.0.2.37/28/977",
        "Openwave/UCWEB7.0.2.37/28/978",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/989"
    };

    /** Demo entry point: fetches a sample page and prints it. */
    public static void main(String[] args) {
        String urlString = "http://www.csdn.net";
        String charset = "utf-8";
        int timeout = 1000;
        try {
            System.out.print(GetWebContent(urlString, charset, timeout));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
下面这个相对简单一点,就是普通的数据流抓取,很好理解。
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Fetches the content of a web page in Java.
 *
 * @author sunlightcs
 */
public class HtmlTest {

    /** Connect and read timeout for the request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Demo entry point: prints the fetched page to standard output. */
    public static void main(String[] args) throws Exception {
        System.out.println(getURLContent());
    }

    /**
     * Gets the content of a fixed page, decoded as UTF-8, with every line
     * re-terminated by {@code "\r\n"}.
     *
     * <p>Redirects are not followed for this connection.
     *
     * @return the response body as text
     * @throws MalformedURLException        if the hard-coded URL cannot be parsed
     * @throws IOException                  if connecting or reading fails, including on timeout
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    private static String getURLContent() throws MalformedURLException,
            IOException, UnsupportedEncodingException {
        URL urlmy = new URL("http://v.youku.com/v_show/id_XMjU0MjI2NzY0.html"); // URL of the page to fetch
        HttpURLConnection con = (HttpURLConnection) urlmy.openConnection();
        try {
            // Only the per-connection flag is set. The original also called the static
            // HttpURLConnection.setFollowRedirects(true), which changes a JVM-wide default
            // and had no effect on this connection because the instance flag is set to
            // false right here.
            con.setInstanceFollowRedirects(false);
            // Bound both phases so an unresponsive server cannot block forever.
            con.setConnectTimeout(TIMEOUT_MILLIS);
            con.setReadTimeout(TIMEOUT_MILLIS);
            con.connect();
            BufferedReader br = new BufferedReader(new InputStreamReader(
                    con.getInputStream(), "UTF-8"));
            try {
                StringBuilder sb = new StringBuilder();
                String s;
                while ((s = br.readLine()) != null) {
                    sb.append(s).append("\r\n");
                }
                return sb.toString();
            } finally {
                br.close();
            }
        } finally {
            con.disconnect();
        }
    }
}
第三种方式:使用 HttpClient(Apache Commons HttpClient)模拟浏览器请求
import java.awt.image.BufferedImage;
import java.io.InputStream;
import javax.imageio.ImageIO;
import org.apache.commons.httpclient.*;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Posts a guestbook message through Apache Commons HttpClient: first downloads
 * the validation-code image, has {@code ImgIdent} read the code from it, then
 * submits the message form together with that code.
 */
public class MyHttpClient {

    /** URL the validation-code image is downloaded from (placeholder host). */
    private static final String VALIDATE_CODE_URL = "http://www.XXX.com/Guestbook/imgchk/validatecode.asp";

    /** URL the message form is posted to (placeholder host). */
    private static final String ADD_MESSAGE_URL = "http://www.XXX.com/Guestbook/add_msg.asp";

    /**
     * Fetches the validation-code image, recognises the code and posts the message.
     *
     * <p>Failures are reported on standard error and the method returns without
     * posting; no exception raised while talking to the server is propagated.
     *
     * @param title   message subject
     * @param name    name of the person leaving the message
     * @param Content message body
     * @param proIP   proxy host, used only when {@code usePro} is true
     * @param port    proxy port, used only when {@code usePro} is true
     * @param usePro  whether to send the requests through the proxy
     */
    public synchronized void doSomeThing(String title, String name, String Content, String proIP, int port,
            boolean usePro) {
        HttpClient httpClient = new HttpClient();
        HttpClientParams clientParams = new HttpClientParams();
        // Send a browser-like User-Agent instead of the library default.
        clientParams.setParameter("http.useragent", "Mozilla/4.0 (compatible; FIREFOX 9.0; IBM AIX 5)");
        clientParams.setHttpElementCharset("GBK");
        httpClient.setParams(clientParams);
        httpClient.getParams().setParameter(HttpClientParams.HTTP_CONTENT_CHARSET, "GBK");
        httpClient.setState(new HttpState());
        clientParams.setVersion(HttpVersion.HTTP_1_1);
        if (usePro) {
            httpClient.getHostConfiguration().setProxy(proIP, port);
        }

        GetMethod getMethod = new GetMethod(VALIDATE_CODE_URL);
        // Use the library's default retry policy.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
        try {
            String validate;
            try {
                int statusCode = httpClient.executeMethod(getMethod);
                if (statusCode != HttpStatus.SC_OK) {
                    System.err.println("Method failed: " + getMethod.getStatusLine());
                    // An error response is not a validation image; do not try to decode it.
                    return;
                }
                InputStream inStream = getMethod.getResponseBodyAsStream();
                BufferedImage iag = (inStream == null) ? null : ImageIO.read(inStream);
                if (iag == null) {
                    // ImageIO.read returns null when no registered reader can decode the data.
                    System.err.println("Method failed: response is not a readable image");
                    return;
                }
                ImgIdent imgIdent = new ImgIdent(iag);
                validate = imgIdent.getValidatecode(4);
                System.out.println(validate);
            } finally {
                // Release the GET before the POST below reuses this client.
                getMethod.releaseConnection();
            }

            PostMethod method = new PostMethod(ADD_MESSAGE_URL);
            try {
                method.setParameter("subject", title);
                method.setParameter("g_name", name);
                method.setParameter("companyname", "");
                method.setParameter("mail", "");
                method.setParameter("homepageurl", "http://");
                method.setParameter("pic", "p5.gif");
                method.setParameter("validatecode", validate);
                method.setParameter("content", Content);
                httpClient.executeMethod(method);
            } finally {
                // Release even when executeMethod throws.
                method.releaseConnection();
            }
        } catch (Exception e) {
            // Network or decoding failure: report it and give up on this message.
            e.printStackTrace();
        }
    }
}
转载于:https://my.oschina.net/sunzy/blog/153067