抓取远程网页并解析HTML



正则表达式 / HTML / Apache

 学习 Java 的正则表达式:抓取远程网页,并用正则解析 HTML 中的部分内容。

 

 
Java代码  收藏代码

    import java.util.regex.Matcher;  
    import java.util.regex.Pattern;  
    import org.apache.commons.httpclient.HttpClient;  
    import org.apache.commons.httpclient.HttpStatus;  
    import org.apache.commons.httpclient.methods.GetMethod;  
      
    public class HttpClientDemo {  
          
        /**
         *  
         * @param url
         * @return
         * @throws Exception
         */  
        public static String getHTML(String url) throws Exception {  
            HttpClient httpClient = new HttpClient();  
            GetMethod getMethod = new GetMethod(url);  
            int statusCode = httpClient.executeMethod(getMethod);  
            if (statusCode != HttpStatus.SC_OK) {  
                System.err.println("Method failed: " + getMethod.getStatusLine());  
                return null;  
            }  
            // 读取内容  
            byte[] responseBody = getMethod.getResponseBody();  
            getMethod.releaseConnection();  
            return new String(responseBody);  
      
        }  
        /**
         *  
         * @throws Exception
         */  
        public static void test(String url) throws Exception{  
              
            String html = getHTML(url);  
            Pattern p = null;  
            Matcher m = null;  
            StringBuffer sb0 = new StringBuffer();  
            // ul正则  
            String regex = "<ul class=\"d2_9\">([\\s\\S]*<li>)<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]</li>([\\s].*)";  
            // 链接正则  
            String regexa = "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]";  
            p = Pattern.compile(regex);  
            // m = p.matcher(sb.toString());  
            m = p.matcher(html);  
            int count = 0;  
            // ul字符串  
            while (m.find()) {  
                sb0.append(m.group());  
            }  
            //System.out.println(sb0.toString());  
            p = Pattern.compile(regexa);  
            m = p.matcher(sb0.toString());  
            // 链接地址和标题  
            while (m.find()) {  
                System.out.println("地址:" + m.group(1));  
                System.out.println("标题:" + m.group(2));  
                System.out.println("时间:" + m.group(3));  
                count++;  
            }  
              
            System.out.println("抓取条数:"+count);  
      
        }  
          
        public static void main(String[] args) throws Exception {  
            String url = "http://cpc.people.com.cn/GB/194302/194306/index.html";  
            test(url);  
              
        }  
    } 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值