Java 抓取网页内容

直接上代码

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Small demo utility that downloads a web page and extracts pieces of it
     * with regular expressions.
     */
    public class TestHtml {

        /** Regex matching a complete {@code <title>...</title>} element. */
        private static final String TITLE_REGEX = "<title>.*?</title>";

        /**
         * Reads the whole content of one web page.
         *
         * <p>The response is decoded as GBK and the lines are concatenated
         * WITHOUT line separators, so a regex using {@code .} can match across
         * what were originally separate lines.
         *
         * @param htmlurl absolute URL of the page to fetch
         * @return the page content with line terminators removed
         * @throws MalformedURLException if {@code htmlurl} is not a valid URL
         *         (a hint is also printed to standard output)
         * @throws IOException if the page cannot be opened or read
         */
        public String getOneHtml(final String htmlurl) throws IOException
        {
            final URL url;
            try
            {
                url = new URL(htmlurl);
            }
            catch (final MalformedURLException me)
            {
                System.out.println("你输入的URL格式有问题!请仔细输入");
                throw me;
            }

            final StringBuilder sb = new StringBuilder();
            BufferedReader in = null;
            try
            {
                in = new BufferedReader(new InputStreamReader(url.openStream(), "gbk"));
                String line;
                while ((line = in.readLine()) != null)
                {
                    sb.append(line);
                }
            }
            finally
            {
                // Always release the network stream, even when reading fails midway.
                if (in != null)
                {
                    in.close();
                }
            }
            return sb.toString();
        }

        /**
         * Extracts the page title.
         *
         * @param s HTML text to search; {@code null} is treated as empty
         * @return the text of every {@code <title>} element concatenated, with
         *         the tags removed; empty string when there is none
         */
        public String getTitle(final String s)
        {
            return getByRegex(s, TITLE_REGEX);
        }

        /**
         * Collects every match of a regular expression and strips markup from it.
         *
         * @param s HTML text to search; {@code null} is treated as empty
         * @param regexarg regular expression selecting the fragments to extract
         * @return all matches concatenated in order of appearance, with anything
         *         that looks like a tag removed; empty string when nothing matches
         */
        public String getByRegex(final String s, String regexarg)
        {
            if (s == null)
            {
                return "";
            }
            final Pattern pa = Pattern.compile(regexarg, Pattern.CANON_EQ);
            final Matcher ma = pa.matcher(s);
            final StringBuilder matched = new StringBuilder();
            while (ma.find())
            {
                matched.append(ma.group());
            }
            return outTag(matched.toString());
        }

        /**
         * Removes everything that looks like a tag ({@code <...>}) from the text.
         *
         * <p>NOTE(review): the published listing called this helper without
         * defining it; this implementation is inferred from the name and the
         * callers - confirm against the original project if it is available.
         *
         * @param s text possibly containing tags
         * @return the text with every {@code <...>} run removed
         */
        public String outTag(final String s)
        {
            return s.replaceAll("<.*?>", "");
        }

        /**
         * Demo entry point: fetches a fixed product page and prints the content
         * of its price span.
         *
         * @param args unused
         */
        public static void main(final String args[])
        {
            String url = "http://detail.1688.com/offer/41797007099.html?tracelog=p4p";
            try
            {
                TestHtml testHtml = new TestHtml();
                String html = testHtml.getOneHtml(url);
                String Regex = "<span class=\"value price-length-5\">.*?</span>";
                String content = testHtml.getByRegex(html, Regex);

                System.out.println("contet is :" + content);
            }
            catch (final Exception e)
            {
                // Report the failure instead of silently discarding it.
                System.err.println("Failed to fetch or parse " + url + ": " + e);
            }
        }
    }

 

转载于:https://www.cnblogs.com/loklook123/p/4159776.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值