// Java 抓取网站内容例子 2 (Java website-content scraping example 2)


import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;


/**
 * Small crawler example: downloads a dianping.com category listing, prints every
 * shop link found on the page, then follows the "NextPage" link until no such
 * link is present.
 *
 * <p>All HTTP traffic goes through one shared {@link HttpClient} instance that is
 * created in the constructor.
 */
public class h2 {

    /** Scheme and host prepended to the relative paths extracted from the pages. */
    private static final String BASE_URL = "http://www.dianping.com";

    /** Relative path of the first listing page to crawl. */
    private static final String START_PATH = "/search/category/1/10/g0r0";

    /**
     * Matches one shop anchor, e.g.
     * {@code <a href="/shop/3990805" class="BL" title="...">name</a>}.
     * Group 1 is the relative shop URL, group 2 is the anchor text.
     */
    private static final Pattern SHOP_LINK = Pattern.compile(
            "<a href=\"(\\/shop/\\d{1,10})\" class=\"BL\".+?>(.+?)</a>");

    /**
     * Matches the "next page" anchor, e.g.
     * {@code <a href="/search/category/1/10/g10p4/g10" class="NextPage" ...>...</a>}.
     * Group 1 is the relative URL of the next page.
     */
    private static final Pattern NEXT_PAGE_LINK = Pattern.compile(
            "<a href=\"(/search/category/\\d{1,3}/\\d{1,10}/g\\d{1,5}p\\d{1,3}/g\\d{1,3})\" class=\"NextPage\".+?>(.+?)</a>");

    /** HTTP client reused for every request made by this instance. */
    HttpClient httpClient;

    /**
     * Crawls the listing starting at {@link #START_PATH}. For every page it prints
     * the shop list, the relative URL of the next page (an empty line on the last
     * page) and the running page count.
     *
     * @param args unused
     * @throws HttpException if the HTTP layer reports a protocol error
     * @throws IOException if a page cannot be downloaded
     * @throws InterruptedException never thrown by the current code; kept so the
     *         signature stays compatible with existing callers
     */
    public static void main(String[] args) throws HttpException, IOException, InterruptedException
    {
        // Silence commons-logging output produced by HttpClient.
        System.setProperty( "org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog" );

        h2 crawler = new h2();
        String path = START_PATH;
        int pageCount = 0;

        // Fetch a page only while there is a path to fetch. The loop ends as soon as
        // getNext() reports "no next page", so the site root is never requested
        // with an empty path.
        while (!"".equals(path))
        {
            StringBuffer page = crawler.getData(BASE_URL + path);
            crawler.printShoplist(page);
            path = crawler.getNext(page);
            System.out.println(path);
            pageCount++;
            System.out.println("count:" + pageCount);
        }
        System.out.println("工作结束!");
    }

    /** Creates a crawler with its own {@link HttpClient}. */
    public h2()
    {
        httpClient = new HttpClient();
    }

    /**
     * Downloads the body of {@code url} with an HTTP GET.
     *
     * <p>A non-200 status is reported on {@code System.err}, but the body is still
     * returned. The connection is always released, whether or not the request
     * succeeded.
     *
     * @param url absolute URL to fetch
     * @return the response body; empty (never {@code null}) when the response
     *         carries no body
     * @throws IOException if the request fails; this includes {@link HttpException}
     */
    public StringBuffer getData(String url) throws IOException
    {
        GetMethod getMethod = new GetMethod(url);

        // Use the default retry policy provided by the library.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());

        getMethod.setRequestHeader( "User-Agent", "oprea");
        getMethod.setRequestHeader("REMOTE_ADDR", "114.80.166.161");

        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK)
            {
                System.err.println("Method failed: "+ getMethod.getStatusLine());
            }

            // getResponseBodyAsString() returns null when no body is available;
            // StringBuffer's String constructor would throw on null.
            String body = getMethod.getResponseBodyAsString();
            return new StringBuffer(body == null ? "" : body);
        } finally {
            // Hand the connection back to the client's connection manager.
            getMethod.releaseConnection();
        }
    }

    /**
     * Prints the number of capture groups of the shop pattern, followed by the
     * relative URL and the anchor text of every shop link found in {@code s}.
     *
     * @param s HTML of a listing page; {@code null} is treated as an empty page
     */
    public void printShoplist(StringBuffer s)
    {
        CharSequence html = (s == null) ? (CharSequence) "" : s;
        Matcher shops = SHOP_LINK.matcher(html);
        System.out.println(shops.groupCount());

        int index = 0;
        while (shops.find())
        {
            System.out.println(index+"-URL:"+shops.group(1));
            System.out.println(" "+shops.group(2));
            index++;
        }
    }

    /**
     * Extracts the relative URL of the next listing page.
     *
     * @param s HTML of a listing page; {@code null} is treated as an empty page
     * @return the relative URL from the first "NextPage" anchor, or {@code ""}
     *         when the page has none
     */
    public String getNext(StringBuffer s)
    {
        if (s == null)
        {
            return "";
        }
        Matcher next = NEXT_PAGE_LINK.matcher(s);
        return next.find() ? next.group(1) : "";
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值