1.关键方法
/**
 * Sends an HTTP POST request to the given URL and returns the response body.
 *
 * <p>Any exception raised while connecting, writing or reading is caught and
 * reported on the console; whatever was read up to that point (possibly the
 * empty string) is returned.
 *
 * @param url
 *            the URL to send the request to
 * @param param
 *            the request body, in the form name1=value1&amp;name2=value2;
 *            {@code null} sends an empty body
 * @param encode
 *            the character encoding used to decode the response page
 * @param cookie
 *            the value of the Cookie request header; the header is omitted
 *            when this is {@code null} or empty
 * @return the response body with the line terminators removed
 */
public static String sendPost1(String url, String param, String encode, String cookie) {
    PrintWriter out = null;
    BufferedReader in = null;
    StringBuilder result = new StringBuilder();
    try {
        URL realUrl = new URL(url);
        // Open the connection to the URL.
        URLConnection conn = realUrl.openConnection();
        // Common request headers.
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
        conn.setRequestProperty("Cache-Control", "max-age=0");
        conn.setRequestProperty("connection", "Keep-Alive");
        // Only send a Cookie header when there is something to send.
        if (cookie != null && !cookie.isEmpty()) {
            conn.setRequestProperty("Cookie", cookie);
        }
        conn.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Both flags are required to send a POST body and read the reply.
        conn.setDoOutput(true);
        conn.setDoInput(true);
        // NOTE(review): PrintWriter(OutputStream) encodes the body with the
        // platform default charset, not with `encode` - confirm that is intended.
        out = new PrintWriter(conn.getOutputStream());
        // print(null) would write the four characters "null" as the body.
        if (param != null) {
            out.print(param);
        }
        out.flush();
        // Read the response, decoding it with the requested charset.
        in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), encode));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送 POST 请求出现异常!"+e);
        e.printStackTrace();
    } finally {
        // Always release both streams.
        if (out != null) {
            out.close();
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
    return result.toString();
}
/**
 * Obtains the cookies for a page that answers with HTTP redirects.
 *
 * <p>Automatic redirect handling is switched off so that the Set-Cookie
 * headers of each response can be collected. The Location header is then
 * followed by hand for at most two hops through {@code getCookie3}.
 *
 * @param url
 *            the URL to send the request to
 * @return the collected cookies in the form key=value;key=value;... ; the
 *         empty string when the connection could not be set up
 */
public static String getCookie2(String url) {
    HttpURLConnection conn = null;
    try {
        URL realUrl = new URL(url);
        conn = (HttpURLConnection) realUrl.openConnection();
        conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        conn.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch");
        conn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
        conn.setRequestProperty("Cache-Control", "max-age=0");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Redirects are followed automatically by default; turn that off so
        // the intermediate responses (and their cookies) stay visible.
        conn.setInstanceFollowRedirects(false);
        conn.setDoInput(true);
        conn.setDoOutput(true);
        conn.setRequestMethod("POST");
    } catch (Exception e) {
        e.printStackTrace();
        conn = null;
    }
    if (conn == null) {
        // The URL was malformed or the connection could not be configured.
        return "";
    }

    String sessionId;
    String location;
    try {
        // Collect the name=value part of every Set-Cookie header.
        // Index 0 is skipped, exactly as the header scan has always done.
        StringBuilder cookies = new StringBuilder();
        String key;
        for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
            if ("set-cookie".equalsIgnoreCase(key)) {
                String cookieVal = conn.getHeaderField(i);
                if (cookieVal == null) {
                    continue;
                }
                // A cookie without attributes has no ';' to cut at.
                int end = cookieVal.indexOf(';');
                if (end >= 0) {
                    cookieVal = cookieVal.substring(0, end);
                }
                cookies.append(cookieVal).append(";");
            }
        }
        sessionId = cookies.toString();
        location = conn.getHeaderField("Location");
    } finally {
        conn.disconnect();
    }

    // No redirect: the cookies of this response are all there is.
    if (location == null) {
        return sessionId;
    }
    // First hop: element 0 holds its cookies, element 1 its Location header.
    List<String> first = getCookie3(location, sessionId);
    String nextLocation = first.get(1);
    if (nextLocation == null) {
        return sessionId + first.get(0);
    }
    // Second hop, sending everything gathered so far.
    List<String> second = getCookie3(nextLocation, sessionId + first.get(0));
    // NOTE(review): the cookies set by the first redirect target are sent to
    // the second target but are not part of the returned value - confirm
    // that this is intended.
    return sessionId + second.get(0);
}
/**
 * Sends a POST request carrying the given cookies, without following
 * redirects, and reports the cookies and redirect target of the response.
 *
 * @param url
 *            the URL to send the request to
 * @param cookie
 *            the value of the Cookie request header
 * @return a two-element list: element 0 is the cookies set by the response
 *         in the form key=value;key=value;... (possibly empty), element 1 is
 *         the value of the Location header or {@code null} when there is
 *         none. When the connection could not be set up the list is
 *         {@code ["", null]}.
 */
public static List<String> getCookie3(String url, String cookie) {
    HttpURLConnection conn = null;
    try {
        URL realUrl = new URL(url);
        conn = (HttpURLConnection) realUrl.openConnection();
        conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        conn.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch");
        conn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
        conn.setRequestProperty("Cache-Control", "max-age=0");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        conn.setRequestProperty("Cookie", cookie);
        // Keep the redirect response visible instead of following it.
        conn.setInstanceFollowRedirects(false);
        conn.setDoInput(true);
        conn.setDoOutput(true);
        conn.setRequestMethod("POST");
    } catch (Exception e) {
        e.printStackTrace();
        conn = null;
    }

    String sessionId = "";
    String location = null;
    if (conn != null) {
        try {
            location = conn.getHeaderField("Location");
            // Collect the name=value part of every Set-Cookie header.
            StringBuilder cookies = new StringBuilder();
            String key;
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                if ("set-cookie".equalsIgnoreCase(key)) {
                    String cookieVal = conn.getHeaderField(i);
                    if (cookieVal == null) {
                        continue;
                    }
                    // A cookie without attributes has no ';' to cut at.
                    int end = cookieVal.indexOf(';');
                    if (end >= 0) {
                        cookieVal = cookieVal.substring(0, end);
                    }
                    cookies.append(cookieVal).append(";");
                }
            }
            sessionId = cookies.toString();
        } finally {
            conn.disconnect();
        }
    }

    List<String> list = new ArrayList<String>();
    list.add(sessionId); // cookies
    list.add(location);  // redirect target
    return list;
}
另附,最基本的get抓取、post抓取、获取cookie方法
public class HttpURLContent {
/**
 * Sends an HTTP GET request to the given URL and returns the response body.
 *
 * <p>Any exception raised while connecting or reading is caught and reported
 * on the console; whatever was read up to that point (possibly the empty
 * string) is returned.
 *
 * @param url
 *            the URL to send the request to
 * @param param
 *            the query string, in the form name1=value1&amp;name2=value2;
 *            when {@code null} or empty the URL is requested as given
 * @return the response body with the line terminators removed
 */
public static String sendGet(String url, String param) {
    StringBuilder result = new StringBuilder();
    BufferedReader in = null;
    try {
        String urlNameString = url;
        if (param != null && !param.isEmpty()) {
            // Extend an existing query string with '&', otherwise start one.
            String separator = url.indexOf('?') >= 0 ? "&" : "?";
            urlNameString = url + separator + param;
        }
        URL realUrl = new URL(urlNameString);
        // Open the connection to the URL.
        URLConnection connection = realUrl.openConnection();
        // Common request headers.
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Establish the actual connection.
        connection.connect();
        // NOTE(review): the response is decoded with the platform default
        // charset - confirm that matches the pages being fetched.
        in = new BufferedReader(new InputStreamReader(
                connection.getInputStream()));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送GET请求出现异常!" + e);
        e.printStackTrace();
    } finally {
        // Always release the input stream.
        try {
            if (in != null) {
                in.close();
            }
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return result.toString();
}
/**
 * Sends an HTTP POST request to the given URL and returns the response body.
 *
 * <p>Any exception raised while connecting, writing or reading is caught and
 * reported on the console; whatever was read up to that point (possibly the
 * empty string) is returned.
 *
 * @param url
 *            the URL to send the request to
 * @param param
 *            the request body, in the form name1=value1&amp;name2=value2;
 *            {@code null} sends an empty body
 * @return the response body with the line terminators removed
 */
public static String sendPost(String url, String param) {
    PrintWriter out = null;
    BufferedReader in = null;
    StringBuilder result = new StringBuilder();
    try {
        URL realUrl = new URL(url);
        // Open the connection to the URL.
        URLConnection conn = realUrl.openConnection();
        // Common request headers.
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Both flags are required to send a POST body and read the reply.
        conn.setDoOutput(true);
        conn.setDoInput(true);
        out = new PrintWriter(conn.getOutputStream());
        // print(null) would write the four characters "null" as the body.
        if (param != null) {
            out.print(param);
        }
        out.flush();
        // NOTE(review): both the body and the response use the platform
        // default charset - confirm that matches the target pages.
        in = new BufferedReader(
                new InputStreamReader(conn.getInputStream()));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送 POST 请求出现异常!"+e);
        e.printStackTrace();
    } finally {
        // Always release both streams.
        if (out != null) {
            out.close();
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
    return result.toString();
}
/**
 * Requests the given URL with POST and returns the cookies the response sets.
 *
 * @param url
 *            the URL to send the request to
 * @return the cookies in the form key=value;key=value;... ; the empty string
 *         when the response sets none or the connection could not be set up
 */
public static String getCookie(String url) {
    HttpURLConnection conn = null;
    try {
        URL realUrl = new URL(url);
        conn = (HttpURLConnection) realUrl.openConnection();
        conn.setDoInput(true);
        conn.setDoOutput(true);
        conn.setRequestMethod("POST");
    } catch (Exception e) {
        e.printStackTrace();
        conn = null;
    }
    if (conn == null) {
        // The URL was malformed or the connection could not be configured.
        return "";
    }

    StringBuilder sessionId = new StringBuilder();
    try {
        // Collect the name=value part of every Set-Cookie header.
        String key;
        for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
            if ("set-cookie".equalsIgnoreCase(key)) {
                String cookieVal = conn.getHeaderField(i);
                if (cookieVal == null) {
                    continue;
                }
                // A cookie without attributes has no ';' to cut at.
                int end = cookieVal.indexOf(';');
                if (end >= 0) {
                    cookieVal = cookieVal.substring(0, end);
                }
                sessionId.append(cookieVal).append(";");
            }
        }
    } finally {
        conn.disconnect();
    }
    return sessionId.toString();
}
2.问题总结
第一步:使用最基本方法,直接抓取,抓取到内容,恭喜你。
第二步:直接抓取页面无果时,通过设置cookie抓取,即conn.setRequestProperty("Cookie",cookie);
第三步:新的问题是,如何获取cookie,当第一次访问页面时会产生cookie。所以要先访问一次页面,拿到cookie。即getCookie(String url)方法
第四步:这里就比较复杂了。我接触的大部分页面抓取中,目标页面不存在重定向;如果遇到重定向,就需要使用getCookie2()和getCookie3()方法获取cookie。
这也是我目前遇到最麻烦的抓取,用了两天才解决。加油加油加油!!!
3.测试代码
/**
 * Export tax-rebate rate query.
 * Test url:
 * http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp?sotype=FULLNAME&sovalue=钢铁&PageIndex=1
 *
 * Fetches a cookie first, posts to the given url with it and scrapes the
 * returned HTML.
 *
 * @param url
 *            the query URL to fetch
 * @return a map that is empty when the page holds no result; otherwise it
 *         maps "totalPage_str" to the page-count text and "result" to one
 *         map per row found in the page
 */
public HashMap<String,Object> getCktsls(String url){
    // Text that precedes the page count on a page that has results.
    final String pageMarker = "<font color='#104194'>共";
    // Keys for the first five cells that follow the row number, in order.
    final String[] cellKeys = {"nsrmc", "type", "sdate", "edate", "sign"};
    final String cellEnd = "</span>";

    // A cookie is needed before the query page can be fetched.
    String sessionCookie = HttpURLContent.getCookie2("http://www.zjtax.gov.cn/wcm/xchaxun/tuishui.jsp");
    HashMap<String,Object> re = new HashMap<String,Object>();
    String html = HttpURLContent.sendPost1(url,null,"utf-8",sessionCookie);

    // Nothing found: hand back the empty map.
    if(!html.contains(pageMarker)){
        return re;
    }

    // Page count: the text between the marker and the first "页" after it.
    String afterMarker = html.split(pageMarker)[1];
    String totalPage_str = afterMarker.substring(0, afterMarker.indexOf("页")).trim();

    // Every chunk after the first starts with a row number.
    List<Map<String,String>> rows = new ArrayList<Map<String,String>>();
    String[] rowChunks = html.split("class=\"gs_cx4_sp7\">");
    for(int r=1;r<rowChunks.length;r++){
        String chunk = rowChunks[r];
        Map<String,String> row = new HashMap<String,String>();
        row.put("number", chunk.substring(0, chunk.indexOf(cellEnd)));

        // Only the first five cells are kept; later ones are ignored.
        String[] cells = chunk.split("\">");
        int lastCell = Math.min(cells.length - 1, cellKeys.length);
        for(int c=1;c<=lastCell;c++){
            String cell = cells[c];
            row.put(cellKeys[c-1], cell.substring(0, cell.indexOf(cellEnd)));
        }
        rows.add(row);
    }

    re.put("totalPage_str", totalPage_str);
    re.put("result", rows);
    return re;
}