Java正则爬虫

从百姓网上爬取一些地名和对应的链接(转载)

利用正则表达式匹配地名和对应的url链接

package test0903;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.Buffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based scraping demo: downloads one page over HTTP, extracts every
 * anchor tag from it and prints the "link text = host" pair of each anchor
 * that has the expected place-link shape.
 *
 * <p>The marker lines printed from the try / catch / finally blocks of
 * {@link #getBuf(String)} are kept deliberately; they show the order in which
 * those blocks run.
 */
public class FinallyDemo {

    /**
     * Address fetched by {@link #getBuf()}. It is empty in this file, so
     * {@link #getBuf()} fails with a malformed-URL error and returns
     * {@code null} until a real http(s) address is filled in.
     */
    private static final String TARGET_URL = "";

    /** Connect timeout, in milliseconds, applied to the HTTP connection. */
    private static final int CONNECT_TIMEOUT_MILLIS = 3000;

    /**
     * Matches a whole anchor element whose opening tag carries an href
     * attribute (double-quoted, single-quoted or unquoted value).
     *
     * <p>The unquoted-value branch excludes whitespace and '>'. The original
     * literal escaped the backslash once too often, which excluded a literal
     * backslash and the letter s instead; the text matched by the whole
     * pattern is the same either way because of the trailing "[^>]*>".
     */
    private static final Pattern ANCHOR = Pattern.compile(
            "<a[^>]*href=(\\\"([^\\\"]*)\\\"|\\'([^\\']*)\\'|([^\\s>]*))[^>]*>(.*?)</a>");

    /**
     * Matches an anchor of the exact form {@code <a href='//HOST/'...>TEXT</a>}:
     * group 1 is the host, group 2 is the trailing run of CJK characters
     * immediately before the closing tag (possibly empty).
     */
    private static final Pattern PLACE_LINK = Pattern.compile(
            "^<a href='//(.*?)/'.*?([\\u4e00-\\u9fa5]*)</a>$");

    public static void main(String[] args) {
        String buf = getBuf();
        System.out.println("main-----------");
        // getRegex tolerates a null page, which is what getBuf returns on failure.
        getRegex(buf);
    }

    /**
     * Downloads the page at the built-in target address.
     *
     * @return the page body with its lines concatenated, or {@code null} when
     *         the request fails or the response code is not 200
     */
    public static String getBuf() {
        return getBuf(TARGET_URL);
    }

    /**
     * Downloads the page at {@code address} with an HTTP GET and decodes it as UTF-8.
     *
     * <p>Lines are concatenated without separators, exactly as read by
     * {@link BufferedReader#readLine()}.
     *
     * @param address an http or https URL
     * @return the page body with its lines concatenated, or {@code null} when
     *         the address is malformed, an I/O error occurs, or the response
     *         code is not 200
     */
    public static String getBuf(String address) {
        HttpURLConnection connect = null;
        try {
            URL url = new URL(address);
            connect = (HttpURLConnection) url.openConnection();
            connect.setRequestMethod("GET");
            connect.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connect.connect();
            if (connect.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // try-with-resources closes the reader (and the underlying
                // stream) on every exit path, including the early return.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connect.getInputStream(), StandardCharsets.UTF_8))) {
                    StringBuilder buffer = new StringBuilder();
                    String line;
                    while ((line = reader.readLine()) != null) {
                        buffer.append(line);
                    }
                    System.out.println("try--------------------");
                    return buffer.toString();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("catch---------------------");
        } finally {
            System.out.println("finally-------------------");
            // connect is still null when the URL itself could not be parsed.
            if (connect != null) {
                connect.disconnect();
            }
        }
        return null;
    }

    /**
     * Collects every anchor element found in {@code s} and prints
     * "text = host" for each one that matches the place-link shape.
     *
     * <p>Before scanning, it prints whether the whole input is a single anchor.
     *
     * @param s the page text to scan; may be {@code null}
     * @return the {@code toString()} form of the list of matched anchors, in
     *         order of appearance; {@code "[]"} when there are none or when
     *         {@code s} is {@code null}
     */
    public static String getRegex(String s) {
        List<String> list = new ArrayList<>();
        if (s == null) {
            return list.toString();
        }
        Matcher m = ANCHOR.matcher(s);
        System.out.println(m.matches());
        // A successful matches() leaves the matcher positioned at the end of
        // the input, so the find() loop would see nothing; start over.
        m.reset();
        while (m.find()) {
            String anchor = m.group();
            list.add(anchor);
            Matcher m1 = PLACE_LINK.matcher(anchor);
            if (m1.find()) {
                System.out.println(m1.group(2) + " = " + m1.group(1));
            }
        }
        return list.toString();
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值