java 抓取网站数据

最新推荐文章于 2024-04-29 23:47:34 发布

lijuanabc

最新推荐文章于 2024-04-29 23:47:34 发布

阅读量419

点赞数

文章标签：数据库 java php

假设你需要获取51job 人才网上java 人才的需求数量，首先你需要分析51job 网站的搜索这

一块是怎么运作的，通过解析网页的源代码，我们发现了以下一些信息：

1. 搜索时页面请求的URL 是 http://search.51job.com/jobsearch/search_result.php

2. 请求所用的方法为：POST

3. 返回的页面的编码格式为：GBK

4. 假设我们想获取搜索java 人才时结果页面中显示的需求数量，我们发现数量位于返回的

HTML 数据中这样的一段代码之中：<td>1-30 / 14794</td>，于是我们可以得到这样的一个

模式：".+1-\d+ / (\d+).+"，第一个分组的内容就是我们需要的最终数据，有关java 中的模式，

请参考java 文档中Pattern 类的介绍

5. 另外作为POST 请求，页面向服务器发送的数据如下(这个很容易通过prototype 这样的js

框架抓取到，参考我的其它博客介绍) ：

lang=c&stype=1&postchannel=0000&fromType=1&line=&keywordtype=2&keyword=java&btnJ

obarea=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&jobarea=0000&image=&btn

Funtype=%E9%80%89%E6%8B%A9%2F%E4%BF%AE%E6%94%B9&funtype=0000&btnInd

ustrytype=%E9%80%89%E6%8B%A9%2F%E4%BF%AE%E6%94%B9&industrytype=00

对于第5 条中的数据哪些是服务器真正需要的我们不管，全部发送过去就是了。有了这些准

备，我们就可以真正开始通过java 发送请求，并获得最终数据了。

我们定义Resource 类，这个类封装所有的与请求有关的信息，Resource 包括以下属性：

view plaincopy to clipboardprint?

/**

* 需要获取资源的目标地址，不包含查询串

private String target;

/**

* get 请求时的查询串，或post 请求的请求数据

private String queryData = "";

/**

* 请求方式，get / post

private String method = "GET";

/**

* 返回的数据的编码类型

private String charset = "GBK";

/**

* 抓取数据的模式，将根据模式的分组来返回数据列表

private String pattern;

/**

* 需要获取资源的目标地址，不包含查询串

private String target;

/**

* get 请求时的查询串，或post 请求的请求数据

private String queryData = "";

/**

* 请求方式，get / post

private String method = "GET";

/**

* 返回的数据的编码类型

private String charset = "GBK";

/**

* 抓取数据的模式，将根据模式的分组来返回数据列表

private String pattern;

以下为抓取内容的代码：

view plaincopy to clipboardprint?

// Assumes `res` encapsulates all of the request information.
// res.getTarget() returns the target address; for a GET request that
// address already contains the query string.
URL url = new URL(res.getTarget());
HttpURLConnection con = (HttpURLConnection) url.openConnection(); // connect to the target
con.setRequestMethod(res.getMethod()); // request method

// HTTP request headers
con.setRequestProperty("accept", "*/*");
con.setRequestProperty("connection", "Keep-Alive");
con.setRequestProperty("user-agent",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
con.setDoInput(true);

if ("POST".equals(res.getMethod())) { // a POST request also sends the request data
    con.setDoOutput(true);
    java.io.OutputStream out = con.getOutputStream();
    try {
        out.write(res.getQueryData().getBytes());
        out.flush();
    } finally {
        out.close();
    }
}

// Read the response line by line; change this part if the whole response
// body is needed instead of the first matching line.
BufferedReader br = new BufferedReader(new InputStreamReader(
        con.getInputStream(), res.getCharset()));
try {
    Pattern pattern = Pattern.compile(res.getPattern());
    String s;
    while ((s = br.readLine()) != null) {
        System.out.println(s);
        Matcher m = pattern.matcher(s); // does the current line match the requested pattern?
        if (!m.matches()) {
            continue;
        }
        // Collect every capture group (groups are numbered from 1) and
        // return them as the result list.
        int size = m.groupCount();
        List result = new ArrayList(size);
        for (int i = 0; i < size; i++) {
            result.add(m.group(i + 1));
        }
        return result;
    }
} finally {
    br.close();
}
// No line of the response matched the pattern.
return new ArrayList(0);

远程抓取页面信息并解析XML

XmlTransfer.java 负责连接对方服务器

package untitled1;

import java.net.URL;

import java.net.URLConnection;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.ProtocolException;

import java.io.IOException;

import java.io.InputStream;

import java.io.PrintWriter;

import org.w3c.dom.*;

import javax.xml.parsers.*;

/**
 * Small HTTP helper: opens a connection to a URL, optionally sends a request
 * body, and exposes the response stream or the response text.
 */
public class XmlTransfer{

    private String urlAddr;

    private String xmlStr;

    HttpURLConnection urlCon = null;

    /**
     * @param _urlAddr address to request
     * @param _xmlStr  request body to send; empty or null means no body
     */
    public XmlTransfer(String _urlAddr,String _xmlStr) {
        this.urlAddr = _urlAddr;
        this.xmlStr = _xmlStr;
    }

    /**
     * Sends the request and returns the response stream. The caller is
     * responsible for closing the returned stream.
     *
     * @return the response body stream
     * @throws Exception if the connection could not be created or the
     *                   request fails
     */
    public InputStream get() throws Exception
    {
        if (urlCon == null) {
            urlCon = getUrlConnection();
        }
        if (urlCon == null) {
            throw new Exception("连接失败");
        }
        if (urlCon.getDoOutput()) {
            // Encode the body with the charset announced in the Content-type header.
            java.io.Writer out = new java.io.OutputStreamWriter(urlCon.getOutputStream(), "gb2312");
            try {
                out.write(xmlStr);
                out.flush();
            } finally {
                out.close();
            }
        }
        // The connection must stay open until the response has been read, so
        // it is not disconnected here.
        return urlCon.getInputStream();
    }

    /**
     * Creates and configures the connection: GET when there is no request
     * body, POST when there is one.
     *
     * @return the configured connection, or the previous value of
     *         {@code urlCon} (normally null) when it could not be created
     */
    private HttpURLConnection getUrlConnection(){
        try {
            URL url = new URL(urlAddr);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            boolean hasBody = xmlStr != null && xmlStr.length() > 0;
            conn.setRequestProperty("Content-type", "text/html;charset=gb2312");
            // Output is enabled only when there is something to send; opening
            // the output stream on a GET connection would change the request.
            conn.setDoOutput(hasBody);
            conn.setRequestMethod(hasBody ? "POST" : "GET");
            conn.setUseCaches(false);
            urlCon = conn;
        } catch (MalformedURLException mex) {
            mex.printStackTrace();
        } catch (ProtocolException pex) {
            pex.printStackTrace();
        } catch (IOException iex) {
            iex.printStackTrace();
        }
        return urlCon;
    }

    /**
     * Requests {@code strURL} without a body and returns the response text.
     * On failure the text read so far is followed by an error message instead
     * of an exception being thrown.
     *
     * @param strURL address to request
     * @return the response text, decoded with the platform default charset
     */
    public static String getHttp( String strURL ){
        XmlTransfer xt = new XmlTransfer(strURL, "");
        java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
        String failure = null;
        InputStream is = null;
        try {
            is = xt.get();
            byte[] b = new byte[1024];
            int iCount;
            while ((iCount = is.read(b)) != -1) {
                body.write(b, 0, iCount);
            }
        } catch (Exception e) {
            failure = "An error occurs in XmlTransfer.getHttp\n" + e.getMessage();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (xt.urlCon != null) {
                xt.urlCon.disconnect();
            }
        }
        // Decode once, after all bytes are collected, so a multi-byte
        // character split across two reads is not corrupted.
        // NOTE(review): the platform default charset is kept from the original
        // code; confirm it matches what the server actually sends.
        String text = body.toString();
        return failure == null ? text : text + failure;
    }

    public static void main(String[] args) throws Exception {
        System.out.println(XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person"));
        //http://192.168.0.110/testProvince.html","");
    }
}

UsrDataSync.java 负责抓取页面

package untitled1;

import java.util.Calendar;

import java.util.TimerTask;

import javax.servlet.ServletContext;

import java.io.File;

/**

* Title:

* Description:

* Company:

* @author not attributable

* @version 1.0

public class UsrDataSync {

public UsrDataSync() {

}

public static boolean doSync(){

String strXml;

ParseXML px = new ParseXML();

strXml = XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person");

strXml = strXml.replaceAll("\r\n", "");

px.doParse(strXml);

return false;

}

public static void main(String[] args) throws Exception {

UsrDataSync dd= new UsrDataSync();

dd.doSync();

}

ParseXML.java 解析XML（包括正则表达式）

//import java.awt.*;

//import javax.servlet.*;

//import javax.servlet.http.*;

//import javax.servlet.jsp.*;

//import org.apache.jasper.runtime.*;

package usersync;

import java.io.*;

import java.util.*;

import javax.xml.parsers.*;

import org.w3c.dom.*;

import java.net.URL;

import java.net.URLConnection;

import java.net.HttpURLConnection;

import java.net.MalformedURLException;

import java.net.ProtocolException;

import java.io.IOException;

import java.io.InputStream;

import java.io.PrintWriter;

import javax.swing.*;

import java.sql.*;

/**
 * Parses the person list XML and synchronises every {@code person} element
 * into the {@code blogusers} table of either the running database
 * ({@code blog}) or the history database ({@code blog_history}).
 *
 * @author not attributable
 * @version 1.0
 */
public class ParseXML{

    private static final String RUNNING_DB_URL = "jdbc:mysql://localhost:3306/blog";

    private static final String HISTORY_DB_URL = "jdbc:mysql://localhost:3306/blog_history";

    private static final String DB_USER = "root";

    private static final String DB_PASSWORD = "root";

    Document doc = null;

    /** Connection to the running database; only open while a user is being synchronised. */
    public Connection con=null;

    /** Connection to the history database; only open while a user is being synchronised. */
    public Connection con_history=null;

    /**
     * Parses {@code str} as XML and synchronises every {@code person}
     * element. The id, name and logname come from the element's attributes;
     * duty, department, station, state and description come from the text of
     * the child elements with those names.
     *
     * Any failure is printed and stops the processing of the remaining
     * persons.
     *
     * @param str the XML document text
     * @return always {@code null}
     */
    public String doParse(String str) {
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // The document comes from a remote server: do not resolve external entities.
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            DocumentBuilder builder = factory.newDocumentBuilder();
            // Parse from characters rather than from platform-encoded bytes.
            doc = builder.parse(new org.xml.sax.InputSource(new StringReader(str)));

            NodeList people = doc.getElementsByTagName("person");
            for (int i = 0; i < people.getLength(); i++) {
                Element person = (Element) people.item(i);
                person.normalize();
                String id = person.getAttribute("id");
                String name = person.getAttribute("name");
                String logname = person.getAttribute("logname");
                System.out.println(id + " " + name + " " + logname);

                syncUser(Integer.parseInt(id),
                        name,
                        logname,
                        childText(person, "duty"),
                        childText(person, "department"),
                        childText(person, "station"),
                        childText(person, "state"),
                        childText(person, "description"));
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return null;
    }

    /**
     * Returns the text of the first direct child element of {@code parent}
     * named {@code tagName}, or an empty string when there is none. Looking
     * the child up by name does not depend on whitespace nodes between the
     * elements or on how the parser prints a node.
     */
    private static String childText(Element parent, String tagName) {
        NodeList children = parent.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE && tagName.equals(child.getNodeName())) {
                String text = child.getTextContent();
                return text == null ? "" : text;
            }
        }
        return "";
    }

    /** Returns the FLAG column value: 1 when the department starts with "中央", otherwise 2. */
    private static int flagFor(String department) {
        return department.startsWith("中央") ? 1 : 2;
    }

    /**
     * Brings one user up to date. An existing row is updated first; then the
     * user is inserted or moved so that a user whose state is "在职" ends up
     * in the running database and every other user ends up in the history
     * database.
     *
     * @return {@code true} when all statements succeeded, {@code false} when
     *         an error occurred (the error is printed)
     */
    private boolean syncUser(int uid, String usrname, String logname, String duty, String
            department, String station, String state, String description ){
        try {
            Class.forName("com.mysql.jdbc.Driver");
            con = DriverManager.getConnection(RUNNING_DB_URL, DB_USER, DB_PASSWORD);
            con_history = DriverManager.getConnection(HISTORY_DB_URL, DB_USER, DB_PASSWORD);

            int pos = 0; // 0: new, 1: running, 2: history
            if (exists(con, uid)) {
                updateUser(con, uid, usrname, duty, flagFor(department), department);
                pos = 1;
            } else if (exists(con_history, uid)) {
                // The row lives in the history database, so that is where it is updated.
                updateUser(con_history, uid, usrname, duty, 2, department);
                pos = 2;
            }

            if ("在职".equals(state)) {
                if (pos == 0) {
                    insertUser(con, uid, usrname, duty, department);
                } else if (pos == 2) {
                    // Active user found in history: move history -> running.
                    moveUser(con_history, con, uid, usrname, logname, duty, department,
                            station, state, description);
                }
            } else {
                if (pos == 0) {
                    insertUser(con_history, uid, usrname, duty, department);
                } else if (pos == 1) {
                    // Inactive user found in running: move running -> history.
                    moveUser(con, con_history, uid, usrname, logname, duty, department,
                            station, state, description);
                }
            }
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            closeQuietly(con_history);
            con_history = null;
            closeQuietly(con);
            con = null;
        }
    }

    /**
     * Moves a user between databases: inserts the current values into
     * {@code dest}, then deletes the row from {@code src}. The insert runs
     * first so that a failure cannot lose the user.
     */
    private void moveUser(Connection src, Connection dest, int uid, String usrname, String
            logname, String duty, String department, String station, String state, String description ) throws
            SQLException {
        insertUser(dest, uid, usrname, duty, department);
        deleteUser(src, uid);
    }

    /** Returns whether {@code blogusers} in {@code db} has a row with the given id. */
    private static boolean exists(Connection db, int uid) throws SQLException {
        PreparedStatement ps = db.prepareStatement("select * from blogusers where id=?");
        try {
            ps.setInt(1, uid);
            ResultSet rs = ps.executeQuery();
            try {
                return rs.next();
            } finally {
                rs.close();
            }
        } finally {
            ps.close();
        }
    }

    /** Updates name, duty, flag and department of the row with the given id. */
    private static void updateUser(Connection db, int uid, String usrname, String duty,
            int flag, String department) throws SQLException {
        PreparedStatement ps = db.prepareStatement(
                "update blogusers set TRUENAME=?,DUTYNAME=?,FLAG=?,DEPMENT=? where id=?");
        try {
            ps.setString(1, usrname);
            ps.setString(2, duty);
            ps.setInt(3, flag);
            ps.setString(4, department);
            ps.setInt(5, uid);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /** Inserts a new row; FLAG is derived from the department. */
    private static void insertUser(Connection db, int uid, String usrname, String duty,
            String department) throws SQLException {
        PreparedStatement ps = db.prepareStatement(
                "insert into blogusers(id,TRUENAME,DUTYNAME,FLAG,DEPMENT) values(?,?,?,?,?)");
        try {
            ps.setInt(1, uid);
            ps.setString(2, usrname);
            ps.setString(3, duty);
            ps.setInt(4, flagFor(department));
            ps.setString(5, department);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /** Deletes the row with the given id. */
    private static void deleteUser(Connection db, int uid) throws SQLException {
        PreparedStatement ps = db.prepareStatement("delete from blogusers where id=?");
        try {
            ps.setInt(1, uid);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /** Closes a connection, ignoring null and close failures. */
    private static void closeQuietly(Connection c) {
        if (c != null) {
            try {
                c.close();
            } catch (SQLException ignored) {
                // the work is already done or already failed; nothing to recover
            }
        }
    }
}

抓取网页数据

暂时没有事情做，所以就研究一些小东东，以前经常听人家说抓取网站数据呀，感觉好牛呀，

所以自己也来研究一下下，只是没有成为牛人一族，写了一段代码，以后再慢慢的改，希望

能改成搜索引擎那样子，随意抓取各大网站数据。

// Scans HTML text for href attributes and returns the quoted link targets,
// one per line (each followed by "\r\n"). The surrounding quotes are kept.
private string GetUrl(string strWebContent)
{
    // The "url" group captures only the quoted target, so the attribute name
    // and any spaces around '=' are dropped for every form the pattern
    // accepts (href, HREF, "href = ..."), not just for a literal "href=".
    const string strRef = @"(?:href|HREF)[ ]*=[ ]*(?<url>[""'][^""'#>]+[""'])";

    System.Text.StringBuilder strResult = new System.Text.StringBuilder();
    foreach (Match match in new Regex(strRef).Matches(strWebContent))
    {
        strResult.Append(match.Groups["url"].Value).Append("\r\n");
    }
    return strResult.ToString();
}

// Returns the regular-expression pattern that matches an href attribute with
// a quoted target. Note that this overload returns the pattern itself, not a
// URL.
private string GetUrl()
{
    return @"(href|HREF)[ ]*=[ ]*[""'][^""'#>]+[""']";
}

/// <summary>
/// Removes HTML tags from a string and decodes the &amp;lt; and &amp;gt; entities.
/// </summary>
/// <param name="strHtml">The string to convert.</param>
/// <returns>The converted string.</returns>
private string stripHtml(string strHtml)
{
    // Non-greedy match of everything between '<' and the next '>', newlines included.
    Regex objRegExp = new Regex("<(.|\n)+?>");
    string strOutput = objRegExp.Replace(strHtml, "");
    // Decode the entities only after the tags are gone, so decoded angle
    // brackets are not mistaken for markup.
    strOutput = strOutput.Replace("&lt;", "<");
    strOutput = strOutput.Replace("&gt;", ">");
    return strOutput;
}

// Returns the text between <title> and </title> (matched case-insensitively),
// or an empty string when the page has no such element.
private string GetTitle(string strWebContent)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Multiline;
    Regex titlePattern = new Regex("<title>([^<]*)</title>", options);
    Match found = titlePattern.Match(strWebContent);
    // Group 1 is the title text; it is empty when nothing matched.
    return found.Groups[1].Value;
}

// Returns the content of the page's description meta tag (matched
// case-insensitively), or an empty string when the tag is not found.
private string GetDescription(string strWebContent)
{
    // The pattern must be a single-line literal: a regular C# string cannot
    // span source lines.
    Match Desc = Regex.Match(strWebContent,
        "<Meta name=\"DESCRIPTION\" content=\"([^<]*)\">",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);
    return Desc.Groups[1].Value;
}

// Downloads the page at the given URL and returns its HTML source decoded as
// GB2312. Returns an empty string when the download fails.
private string GetWebContent(string Url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Timeout = 30000; // connection timeout in milliseconds
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, stream and reader even when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            return streamReader.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // Best effort: callers get an empty page, but the reason is recorded
        // instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetWebContent failed for " + Url + ": " + ex.Message);
        return "";
    }
}

Java 基础：利用HttpClient 获取网页内容

HTTP 协议是目前互联网上最重要的协议，许多软件与服务都需要依赖HTTP 协议。

虽然java.net 这个package 中包含了对HTTP 的基本支持，但还有很多高级和复杂的功能无

法实现，这不能不说是一个遗憾。

HttpClient 作为Apache 的开源项目之一，为基于HTTP 协议的操作提供了强大的客户端

执行支持，最新的版本为3.0RC3。

__________下面通过一个例子简要展示HttpClient 的使用方法：

--------------------------------------------------------------------------------

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.*;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

/**

* @author steven

public class HttpClientExample {

//获得ConnectionManager，设置相关参数

private static MultiThreadedHttpConnectionManager manager =

new MultiThreadedHttpConnectionManager();

private static int connectionTimeOut = 20000;

private static int socketTimeOut = 10000;

private static int maxConnectionPerHost = 5;

private static int maxTotalConnections = 40;

//标志初始化是否完成的flag

private static boolean initialed = false;

//初始化ConnectionManger 的方法

public static void SetPara() {

manager.getParams().setConnectionTimeout(connectionTimeOut);

manager.getParams().setSoTimeout(socketTimeOut);

manager.getParams()

.setDefaultMaxConnectionsPerHost(maxConnectionPerHost);

manager.getParams().setMaxTotalConnections(maxTotalConnections);

initialed = true;

}

//通过get 方法获取网页内容

public static String getGetResponseWithHttpClient(String url, String encode) {

HttpClient client = new HttpClient(manager);

if (initialed) {

HttpClientExample.SetPara();

}

GetMethod get = new GetMethod(url);

get.setFollowRedirects(true);

String result = null;

StringBuffer resultBuffer = new StringBuffer();

try {

client.executeMethod(get);

//在目标页面情况未知的条件下，不推荐使用getResponseBodyAsString()方法

//String strGetResponseBody = post.getResponseBodyAsString();

BufferedReader in = new BufferedReader(

new InputStreamReader(

get.getResponseBodyAsStream(),

get.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

resultBuffer.append("\n");

}

in.close();

result = resultBuffer.toString();

//iso-8859-1 is the default reading encode

result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),

get.getResponseCharSet(),

encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

} finally {

get.releaseConnection();

return result;

}

public static String getPostResponseWithHttpClient(String url,

String encode) {

HttpClient client = new HttpClient(manager);

if (initialed) {

HttpClientExample.SetPara();

}

PostMethod post = new PostMethod(url);

post.setFollowRedirects(false);

StringBuffer resultBuffer = new StringBuffer();

String result = null;

try {

client.executeMethod(post);

BufferedReader in = new BufferedReader(

new InputStreamReader(

post.getResponseBodyAsStream(),

post.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

resultBuffer.append("\n");

}

in.close();

//iso-8859-1 is the default reading encode

result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),

post.getResponseCharSet(),

encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

} finally {

post.releaseConnection();

return result;

}

public static String getPostResponseWithHttpClient(String url,

String encode,

NameValuePair[] nameValuePair) {

HttpClient client = new HttpClient(manager);

if (initialed) {

HttpClientExample.SetPara();

}

PostMethod post = new PostMethod(url);

post.setRequestBody(nameValuePair);

post.setFollowRedirects(false);

String result = null;

StringBuffer resultBuffer = new StringBuffer();

try {

client.executeMethod(post);

BufferedReader in = new BufferedReader(

new InputStreamReader(

post.getResponseBodyAsStream(),

post.getResponseCharSet()));

String inputLine = null;

while ((inputLine = in.readLine()) != null) {

resultBuffer.append(inputLine);

resultBuffer.append("\n");

}

in.close();

//iso-8859-1 is the default reading encode

result = HttpClientExample.ConverterStringCode(resultBuffer.toString(),

post.getResponseCharSet(),

encode);

} catch (Exception e) {

e.printStackTrace();

result = "";

} finally {

post.releaseConnection();

return result;

}

private static String ConverterStringCode(String source, String srcEncode, String destEncode) {

if (src != null) {

try {

return new String(src.getBytes(srcEncode), destEncode);

} catch (UnsupportedEncodingException e) {

// TODO Auto-generated catch block

e.printStackTrace();

return "";

}

} else {

return "";

}

--------------------------------------------------------------------------------

之后，就可以通过下面的代码获得目标网页：

// The URL must include the scheme; a bare host name is not an absolute URL.
String source = HttpClientExample.getGetResponseWithHttpClient("http://www.sina.com.cn", "GBK");

注意，在默认情况下，HttpClient 的Request 的Head 中

User-Agent 的值是Jakarta Commons-HttpClient 3.0RC1，如果需要改变它（例如，变为

Mozilla/4.0），必须在调用之前运行如下语句：

System.getProperties().setProperty("httpclient.useragent", "Mozilla/4.0");

java 抓取网页乱码问题处理

String htmlContent = "";

// java.net.URL needs an explicit protocol, and the address must not carry
// trailing whitespace.
java.net.URL url = new java.net.URL("http://www.csdn.net");

java.net.HttpURLConnection connection = (java.net.HttpURLConnection)
        url.openConnection();

connection.connect();

java.io.InputStream inputStream = connection.getInputStream();

// Collect the whole body in a growable buffer so pages of any size fit.
java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();

try {
    byte[] chunk = new byte[8192];
    int count;
    while ((count = inputStream.read(chunk)) != -1) {
        buffer.write(chunk, 0, count);
    }
} finally {
    inputStream.close();
}

// Number of bytes read.
System.out.println(buffer.size());

// Decode only the bytes actually received, in one step, so multi-byte
// characters are never split and no padding is decoded.
htmlContent = buffer.toString("gb2312");

System.out.println(htmlContent);

lijuanabc

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java 抓取网站数据

假设你需要获取51job人才网上java 人才的需求数量，首先你需要分析51job网站的搜索这一块是怎么运作的，通过解析网页的源代码，我们发现了以下一些信息：1. 搜索时页面请求的URL是 http://search.51job.com/jobsearch/search_result.php2. 请求所用的方法为：POST3. 返回的页面的编码格式为：GBK4. 假...
复制链接

扫一扫