Java——正则表达式_初识网络爬虫

最新推荐文章于 2024-07-30 14:00:34 发布

yangzheng0515

最新推荐文章于 2024-07-30 14:00:34 发布

阅读量2.3k

点赞数 1

分类专栏： Java 文章标签： java 正则表达式 网络爬虫

本文链接：https://blog.csdn.net/yangzheng0515/article/details/53948712

版权

Java 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

目的

获取某网站中所有的链接地址

思路

（1）获取该网站的HTML源码
（2）利用正则表达式，获取其中的链接地址（href="http://……"）

代码

这里以www.163.com为例

package TestRegex;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A first look at web crawling: downloads the HTML of a page and uses a
 * regular expression to pull the {@code href} attribute values out of it.
 *
 * @author yangzheng
 */

public class Demo2 {
    /**
     * Downloads the text behind a URL and returns it as a single string.
     *
     * <p>The content is read line by line and the lines are concatenated
     * <em>without</em> their line terminators. This is best-effort: if the URL
     * is malformed or an I/O error occurs, the stack trace is printed and
     * whatever was read so far (possibly the empty string) is returned.
     *
     * @param urlStr  the URL to fetch, e.g. {@code "http://www.163.com"}
     * @param charset name of the charset used to decode the response bytes
     * @return the decoded content with line terminators removed; never {@code null}
     * @throws java.nio.charset.IllegalCharsetNameException if {@code charset} is not a legal name
     * @throws java.nio.charset.UnsupportedCharsetException if {@code charset} is not supported
     */
    public static String getURLContent(String urlStr, String charset){
        // Method-local accumulator: no need for StringBuffer's synchronization.
        StringBuilder res = new StringBuilder();
        try {
            URL url = new URL(urlStr);

            // The raw stream is its own resource so that it is closed even if
            // Charset.forName(...) throws before the reader is constructed.
            try (java.io.InputStream in = url.openStream();
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(in, Charset.forName(charset)))) {
                String line;
                while ((line = br.readLine()) != null) {
                    res.append(line);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so one handler covers both.
            e.printStackTrace();
        }

        return res.toString();
    }

    /**
     * Collects one substring for every match of {@code regexStr} in {@code destStr}.
     *
     * <p>If the pattern contains at least one capturing group, the text
     * captured by group 1 is collected for each match (this may be
     * {@code null} when group 1 did not participate in that match). If the
     * pattern has no capturing group, the whole match is collected instead.
     *
     * @param destStr  the text to search
     * @param regexStr the regular expression to apply
     * @return the collected substrings in match order; empty if nothing matches
     * @throws java.util.regex.PatternSyntaxException if {@code regexStr} is not a valid pattern
     */
    public static ArrayList<String> getMatherSubstrs(String destStr, String regexStr){
        Matcher matcher = Pattern.compile(regexStr).matcher(destStr);
        // group(1) throws IndexOutOfBoundsException when the pattern declares
        // no capturing group, so fall back to group 0 (the whole match).
        int groupIndex = matcher.groupCount() >= 1 ? 1 : 0;

        ArrayList<String> res = new ArrayList<>();
        while (matcher.find()) {
            res.add(matcher.group(groupIndex));
        }

        return res;
    }

    /**
     * Fetches the www.163.com front page (decoded as GBK) and prints every
     * {@code href="..."} value found in it, one per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");

        // Reluctant quantifier: stop at the first closing quote after href=".
        ArrayList<String> list = getMatherSubstrs(destStr, "href=\"(.+?)\"");

        for (String link : list) {
            System.out.println(link);
        }
    }
}