httpclient4 网页抓取

最近在做全文检索,苦于没有合适的测试数据,只好去抓取一些行业新闻,于是用到了 HttpClient 4。

下面贴出代码,分享一下:

// Fetches the page at httpurl with HttpClient 4, decodes the response body as GBK and
// extracts the title, content and keyword fragments from the HTML.
// The model object is created here but not populated anywhere in this fragment.
TEbInformationModel model = new TEbInformationModel();

HttpClient httpclient = new DefaultHttpClient();

// Client-level charset settings.
// NOTE(review): only "http.protocol.content-charset" looks like an actual HttpClient parameter
// name; the HTTP.* constants used as keys below appear to be header names/values, so these
// calls probably have no effect -- confirm before removing them.
httpclient.getParams().setParameter("http.protocol.content-charset",HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.CONTENT_ENCODING, HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.CHARSET_PARAM, HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.DEFAULT_PROTOCOL_CHARSET,HTTP.UTF_8);
httpclient.getParams().setParameter(HTTP.CONTENT_TYPE, HTTP.UTF_8);

// The page is requested with POST (no request body is attached).
HttpPost httppost = new HttpPost(httpurl);

httppost.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
httppost.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");

// Same charset settings again, this time on the request (see the review note above).
httppost.getParams().setParameter("http.protocol.content-charset",HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.CONTENT_ENCODING, HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.CHARSET_PARAM, HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.DEFAULT_PROTOCOL_CHARSET, HTTP.UTF_8);
httppost.getParams().setParameter(HTTP.CONTENT_TYPE, HTTP.UTF_8);

HttpResponse response = httpclient.execute(httppost);

// NOTE(review): getEntity() is dereferenced without a null check -- confirm the target
// always answers with a body.
InputStream is = response.getEntity().getContent();
BufferedReader br = null;
StringBuffer sbf = new StringBuffer();
String line = null;
try {
    // The body is decoded as GBK regardless of the UTF-8 settings above.
    br = new BufferedReader(new InputStreamReader(is,"GBK"));
    // Lines are appended without their terminators, so the whole page ends up on one line.
    while ((line = br.readLine()) != null)
    {
        sbf.append(line);
    }
} finally {
    /** Release resources even when reading fails; closing the reader also closes the stream. */
    if (br != null) {
        br.close();
    } else {
        is.close();
    }
}

// Build the page string once instead of once per extracted field.
String pageHtml = sbf.toString();

String title = getSubTitle(getStringNoBlank(getTitle(pageHtml,"title")));
String context = getSubContext(getStringNoBlank(getTitle(pageHtml,"content")));
String key = getSubKey(getStringNoBlank(getTitle(pageHtml,"key")));

System.out.println("标题:"+title);
System.out.println("内容:"+context);
System.out.println("关键字:"+key);


正则匹配部分的代码如下:

/** Matches one or more consecutive whitespace characters; compiled once and reused. */
private static final Pattern BLANK_RUN = Pattern.compile("\\s+");

/**
 * Removes every whitespace character (spaces, tabs, line breaks, ...) from {@code str}.
 *
 * @param str the text to strip; may be {@code null}
 * @return {@code str} with all whitespace removed; a {@code null} or empty argument is
 *         returned unchanged
 */
private String getStringNoBlank(String str) {
    if (str == null || str.isEmpty()) {
        return str;
    }
    // "\\s+" removes exactly the characters the former "\\s*|\t|\r|\n" removed: \s already
    // covers tab, CR and LF, and the old "\\s*" additionally matched the empty string at
    // every position, which only produced no-op replacements.
    return BLANK_RUN.matcher(str).replaceAll("");
}



/**
 * Returns the text between the first {@code <h1>} and the last {@code </h1>} in {@code str}.
 *
 * @param str HTML fragment to search; may be {@code null}
 * @return the enclosed text, or the empty string when {@code str} is {@code null} or the
 *         two markers are missing or out of order
 */
public String getSubTitle(String str){
    if (str == null) {
        return "";
    }
    int open = str.indexOf("<h1>");
    int close = str.lastIndexOf("</h1>");
    // A fragment without both markers (e.g. the empty string produced when nothing matched)
    // would otherwise make substring throw StringIndexOutOfBoundsException.
    if (open < 0 || close < open + 4) {
        return "";
    }
    return str.substring(open + 4, close);
}
/**
 * Returns the text between the first {@code <P>} and the last {@code </P>} in {@code str}.
 * The tag match is case-sensitive (upper-case {@code P} only).
 *
 * @param str HTML fragment to search; may be {@code null}
 * @return the enclosed text (inner {@code </P><P>} pairs are kept), or the empty string
 *         when {@code str} is {@code null} or the two markers are missing or out of order
 */
public String getSubContext(String str){
    if (str == null) {
        return "";
    }
    int open = str.indexOf("<P>");
    int close = str.lastIndexOf("</P>");
    // Without both markers substring would throw StringIndexOutOfBoundsException.
    if (open < 0 || close < open + 3) {
        return "";
    }
    return str.substring(open + 3, close);
}
/**
 * Returns the text between the first {@code </b>} and the last {@code </p>} in {@code str},
 * i.e. whatever follows the bold label inside the keyword paragraph.
 *
 * @param str HTML fragment to search; may be {@code null}
 * @return the enclosed text, or the empty string when {@code str} is {@code null} or the
 *         two markers are missing or out of order
 */
public String getSubKey(String str){
    if (str == null) {
        return "";
    }
    int open = str.indexOf("</b>");
    int close = str.lastIndexOf("</p>");
    // Without both markers substring would throw StringIndexOutOfBoundsException.
    if (open < 0 || close < open + 4) {
        return "";
    }
    return str.substring(open + 4, close);
}

/**
 * Collects every fragment of {@code s} that matches the page-specific pattern selected by
 * {@code type} and returns the fragments concatenated in the order they occur.
 *
 * @param s    the HTML text to search; {@code null} is treated as "nothing to search"
 * @param type {@code "title"} selects the heading block, {@code "content"} the article body
 *             block; any other value (including {@code null}) selects the keyword paragraph
 * @return the concatenated matches, or the empty string when nothing matches
 */
private String getTitle( String s,String type)
{
    if (s == null) {
        return "";
    }

    final String regex;
    if("title".equals(type)){
        regex = "<div class=\"zz_leftneirong1\">.*?</h1>";
    }else if("content".equals(type)){
        regex = "<div class=\"zz_leftneirong4\" id=\"content\" name=\"content\">.*? </div>";
    }else{
        regex = " <p class=\"key\"><b>本文关键词:</b>.*?</p>";
    }

    // '.' does not match line terminators with these flags, so each fragment has to sit on
    // a single line of s.
    // NOTE(review): CANON_EQ is kept from the original; it is unclear why canonical
    // equivalence is needed here -- confirm before dropping it.
    final Matcher ma = Pattern.compile(regex, Pattern.CANON_EQ).matcher(s);

    // Append matches directly instead of collecting them in a list and re-concatenating strings.
    final StringBuilder fragments = new StringBuilder();
    while (ma.find())
    {
        fragments.append(ma.group());
    }
    return fragments.toString();
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值