package com.xiaofeng.picup;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

  /**
   * Regex-based helper for downloading a web page and extracting pieces of it:
   * the title, anchor elements, script blocks and style blocks.
   *
   * <p>Original author's note (translated): test code for grabbing article titles
   * and content from Yahoo Knowledge pages; the URL is supplied manually, and the
   * approach could be extended to crawl the whole site automatically.
   *
   * <p>All extraction is done with regular expressions, not an HTML parser, so it
   * only handles markup the patterns below recognise. None of the methods accept
   * {@code null} input.
   */
  public class WebContent {

    /** Flags shared by the element patterns: tag names match in any case, and '.' spans lines. */
    private static final int TAG_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    /** Matches a complete {@code <title>...</title>} element. */
    private static final Pattern TITLE_PATTERN =
        Pattern.compile("<title>.*?</title>", TAG_FLAGS);

    /**
     * Matches a complete anchor element whose href value is double-quoted,
     * single-quoted or unquoted.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
        "<a[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>", TAG_FLAGS);

    /** Matches a complete {@code <script ...>...</script>} element. */
    private static final Pattern SCRIPT_PATTERN =
        Pattern.compile("<script.*?</script>", TAG_FLAGS);

    /** Matches a complete {@code <style ...>...</style>} element. */
    private static final Pattern STYLE_PATTERN =
        Pattern.compile("<style.*?</style>", TAG_FLAGS);

    /** Matches any single tag; '.' does not match line terminators here. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the entire content of one web page.
     *
     * <p>The page is decoded as UTF-8 and read line by line; the lines are
     * concatenated without their line terminators, so the result is a single line.
     *
     * @param htmlurl address of the page to download
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(String htmlurl) throws IOException {
      StringBuilder html = new StringBuilder();
      try {
        URL url = new URL(htmlurl);
        BufferedReader in =
            new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
        try {
          String line;
          while ((line = in.readLine()) != null) {
            html.append(line);
          }
        } finally {
          // Close the stream even when reading fails, so the connection is not leaked.
          in.close();
        }
      } catch (MalformedURLException me) {
        // Console hint for the user (kept from the original), then propagate.
        System.out.println("你输入的URL格式有问题!请仔细输入");
        throw me;
      } catch (IOException e) {
        // Reported on stderr as in the original, then propagated to the caller.
        e.printStackTrace();
        throw e;
      }
      return html.toString();
    }

    /**
     * Extracts the page title.
     *
     * <p>Every {@code <title>...</title>} element found is concatenated, and the
     * markup is then stripped with {@link #outTag(String)}.
     *
     * @param s HTML text to search
     * @return the title text without tags, or an empty string if there is no title
     */
    public String getTitle(String s) {
      StringBuilder title = new StringBuilder();
      Matcher ma = TITLE_PATTERN.matcher(s);
      while (ma.find()) {
        title.append(ma.group());
      }
      return outTag(title.toString());
    }

    /**
     * Extracts the anchor elements.
     *
     * @param s HTML text to search
     * @return a list of {@code String}s, one per {@code <a ... href=...>...</a>}
     *     element, in document order; empty if none are found
     */
    public List getLink(String s) {
      return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts the script blocks.
     *
     * @param s HTML text to search
     * @return a list of {@code String}s, one per {@code <script>...</script>}
     *     element, in document order; empty if none are found
     */
    public List getScript(String s) {
      return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts the CSS blocks.
     *
     * @param s HTML text to search
     * @return a list of {@code String}s, one per {@code <style>...</style>}
     *     element, in document order; empty if none are found
     */
    public List getCSS(String s) {
      return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup tags.
     *
     * @param s text possibly containing tags
     * @return {@code s} with every {@code <...>} sequence that lies on one line removed
     */
    public String outTag(String s) {
      return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Collects the full text of every match of {@code pattern} in {@code s}, in order. */
    private static List<String> findAll(Pattern pattern, String s) {
      List<String> matches = new ArrayList<String>();
      Matcher ma = pattern.matcher(s);
      while (ma.find()) {
        matches.add(ma.group());
      }
      return matches;
    }
  }

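/*
 * NOTE: The text below is not part of the program. It is comment and payment
 * widget residue from the web page this snippet was copied from, preserved
 * here as a comment so that it cannot be mistaken for source code.
 *
 * 评论 1
 * 添加红包
 *
 * 请填写红包祝福语或标题
 *
 * 红包个数最小为10个
 *
 * 红包金额最低5元
 *
 * 当前余额3.43前往充值 >
 * 需支付:10.00
 * 成就一亿技术人!
 * 领取后你会自动成为博主和红包主的粉丝 规则
 * hope_wisdom
 * 发出的红包
 * 实付
 * 使用余额支付
 * 点击重新获取
 * 扫码支付
 * 钱包余额 0
 *
 * 抵扣说明:
 *
 * 1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
 * 2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。
 *
 * 余额充值
 */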