抓取远程网页并解析HTML

最新推荐文章于 2019-05-16 14:07:09 发布

w_j_w2010

最新推荐文章于 2019-05-16 14:07:09 发布

阅读量413

点赞数

分类专栏：文本处理

文本处理专栏收录该内容

26 篇文章 0 订阅

订阅专栏

正则表达式 · HTML · Apache

学习 Java 的正则表达式：抓取远程网页，并解析其中的部分 HTML 内容。

Java代码收藏代码

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.HttpStatus;
    import org.apache.commons.httpclient.methods.GetMethod;

    /**
     * Demo that downloads a web page with Apache Commons HttpClient and uses
     * regular expressions to pull link entries (address, title, time) out of
     * the returned HTML.
     */
    public class HttpClientDemo {

        /**
         * Matches the {@code <ul class="d2_9">} region of the page together with
         * the link entries inside it. Compiled once because the expression never
         * changes between calls.
         */
        private static final Pattern LIST_PATTERN = Pattern.compile(
                "<ul class=\"d2_9\">([\\s\\S]*<li>)<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]</li>([\\s].*)");

        /**
         * Matches a single link entry: group 1 is the href value, group 2 the
         * link text, group 3 the bracketed text that follows the link.
         */
        private static final Pattern LINK_PATTERN = Pattern.compile(
                "<a.*href='(.*)'.*>(.+?)</a> \\[(.*)\\]");

        /**
         * Fetches the body of the given URL with an HTTP GET request.
         *
         * <p>The connection is always released, whether the request succeeds,
         * returns a non-200 status, or throws.
         *
         * @param url address of the page to download
         * @return the response body as a string, or {@code null} when the status
         *         is not 200 (the status line is printed to stderr) or when the
         *         client reports no response body
         * @throws Exception if executing the request or reading the body fails
         */
        public static String getHTML(String url) throws Exception {
            HttpClient httpClient = new HttpClient();
            GetMethod getMethod = new GetMethod(url);
            try {
                int statusCode = httpClient.executeMethod(getMethod);
                if (statusCode != HttpStatus.SC_OK) {
                    System.err.println("Method failed: " + getMethod.getStatusLine());
                    return null;
                }
                // Read the content.
                byte[] responseBody = getMethod.getResponseBody();
                if (responseBody == null) {
                    return null;
                }
                // NOTE(review): decoded with the platform default charset, as before.
                // The page's real encoding is not visible here; if titles come out
                // garbled, decode with the page's declared charset instead.
                return new String(responseBody);
            } finally {
                // Release on every path so failed requests do not leak the connection.
                getMethod.releaseConnection();
            }
        }

        /**
         * Downloads the page at {@code url}, collects the text matched by the
         * list pattern, then prints the address, title and time of every link
         * entry found in it, followed by the number of entries.
         *
         * <p>If nothing could be fetched, a message is written to stderr and the
         * method returns without parsing.
         *
         * @param url address of the page to fetch and parse
         * @throws Exception if fetching the page fails
         */
        public static void test(String url) throws Exception {
            String html = getHTML(url);
            if (html == null) {
                // getHTML returns null on failure; matching against null would throw.
                System.err.println("No content fetched from: " + url);
                return;
            }

            // Collect every list region matched in the page.
            StringBuilder listHtml = new StringBuilder();
            Matcher listMatcher = LIST_PATTERN.matcher(html);
            while (listMatcher.find()) {
                listHtml.append(listMatcher.group());
            }

            // Extract link address, title and time from the collected text.
            int count = 0;
            Matcher linkMatcher = LINK_PATTERN.matcher(listHtml.toString());
            while (linkMatcher.find()) {
                System.out.println("地址:" + linkMatcher.group(1));
                System.out.println("标题:" + linkMatcher.group(2));
                System.out.println("时间:" + linkMatcher.group(3));
                count++;
            }

            System.out.println("抓取条数："+count);
        }

        /**
         * Runs the demo against a fixed page.
         *
         * @param args unused
         * @throws Exception if fetching the page fails
         */
        public static void main(String[] args) throws Exception {
            String url = "http://cpc.people.com.cn/GB/194302/194306/index.html";
            test(url);
        }
    }