爬取男生 女生姓名

 //  姓氏 https://hanyu.baidu.com/shici/detail?pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&from=kg0
    //  男   http://www.haoming8.cn/baobao/10881.html
    //   女     http://www.haoming8.cn/baobao/7641.html
    //1.定义变量网址
    String familyNameNet = "https://hanyu.baidu.com/shici/detail?pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d";
    String boyNameNet = " http://www.haoming8.cn/baobao/10881.html";
    String girlNameNet = "http://www.haoming8.cn/baobao/7641.html";
    //2.爬取数据 把网址上所有的数据拼接成字符串
    String familyNameSrt = webCrawler(familyNameNet);
    String boyNameSrt = webCrawler(boyNameNet);
    String girlNameSrt = webCrawler(girlNameNet);
    //3.通过正则表达式,把符合要求的传递过来
    ArrayList<String> familyNameTempList = getDate(familyNameSrt, "([\\u4E00-\\u9FA5]{4})(,|。)", 1);
    ArrayList<String> boyNameTempList = getDate(boyNameSrt, "([\\u4E00-\\u9FA5]{2})(、|。)", 1);
    ArrayList<String> girlNameTempList = getDate(girlNameSrt, "(.. ){4}..", 0);

    //4.处理数据
    //familyNameTempList(姓氏)
    ArrayList<String> familyNameList = new ArrayList<>();
    for (String str : familyNameTempList) {
        //str 就是   赵钱孙李   周吴郑王
        for (int i = 0; i < str.length(); i++) {
            familyNameList.add(str.charAt(i) + "");
        }
    }
    //boyNameTempList(男生)
    ArrayList<String> boyNameList = new ArrayList<>();
    for (String str : boyNameTempList) {
        if (!boyNameList.contains(str)) {
            boyNameList.add(str);
        }
    }
    //girlNameTempList(女生)
    ArrayList<String> girlNameList = new ArrayList<>();
    for (String str : girlNameTempList) {
        String[] arr = str.split(" ");
        for (String s : arr) {
            girlNameList.add(s);
        }
    }
    //5.生成数据
    ArrayList<String> list = getInfos(familyNameList, boyNameList, girlNameList, 70, 50);
    Collections.sort(list);
    //6.写出数据
    BufferedWriter bw = new BufferedWriter(new FileWriter("name.txt"));
    for (String str : list) {
        bw.write(str);
        bw.newLine();

    }
    bw.close();
}

/**
 * Collects one capture group from every match of a regular expression.
 *
 * @param str   text to search
 * @param regex regular expression applied repeatedly to {@code str}
 * @param index capture group to keep from each match: 0 keeps the whole
 *              match, 1 keeps the first parenthesised group
 * @return the selected group of each match, in the order the matches occur
 */
private static ArrayList<String> getDate(String str, String regex, int index) {
    ArrayList<String> matches = new ArrayList<>();
    Matcher scanner = Pattern.compile(regex).matcher(str);
    // Advance match by match until the input is exhausted.
    for (boolean found = scanner.find(); found; found = scanner.find()) {
        matches.add(scanner.group(index));
    }
    return matches;
}


/**
 * Downloads the resource at the given address and returns its entire body as text.
 *
 * <p>The body is decoded with the charset declared in the response's
 * Content-Type header when one is present and supported; otherwise the
 * platform default charset is used.
 *
 * @param net address of the resource to read
 * @return the full decoded content of the resource
 * @throws IOException if the address is malformed, the connection cannot be
 *                     opened, or reading fails
 */
public static String webCrawler(String net) throws IOException {
    StringBuilder sb = new StringBuilder();
    URLConnection conn = new URL(net).openConnection();
    // try-with-resources closes the stream even when a read fails part-way.
    // The stream is opened first so that connection errors surface as IOException.
    try (InputStreamReader isr = new InputStreamReader(conn.getInputStream(), responseCharset(conn))) {
        // Read in chunks instead of one character per call.
        char[] buffer = new char[8192];
        int read;
        while ((read = isr.read(buffer)) != -1) {
            sb.append(buffer, 0, read);
        }
    }
    return sb.toString();
}

/**
 * Picks the charset used to decode a response: the {@code charset=} parameter
 * of its Content-Type when that names a supported charset, else the platform default.
 */
private static java.nio.charset.Charset responseCharset(URLConnection conn) {
    String contentType = conn.getContentType();
    if (contentType != null) {
        String prefix = "charset=";
        for (String part : contentType.split(";")) {
            String token = part.trim();
            if (token.regionMatches(true, 0, prefix, 0, prefix.length())) {
                String name = token.substring(prefix.length()).replace("\"", "").trim();
                try {
                    return java.nio.charset.Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: fall back to the default below.
                    break;
                }
            }
        }
    }
    return java.nio.charset.Charset.defaultCharset();
}

public static ArrayList<String> getInfos(ArrayList<String> familyNameList, ArrayList<String> boyNameList, ArrayList<String> girlNameList, int boycount, int girlcount) {
    //1.生成不重复的名字
    //男生
    HashSet<String> boyhs = new HashSet<>();
    while (true) {
        if (boyhs.size() == boycount) {
            break;
        }
        Collections.shuffle(familyNameList);
        Collections.shuffle(boyNameList);
        //从0索引开始
        boyhs.add(familyNameList.get(0) + boyNameList.get(0));
    }
    //女生
    HashSet<String> girhs = new HashSet<>();
    while (true) {
        if (girhs.size() == girlcount) {
            break;
        }
        //随机
        Collections.shuffle(familyNameList);
        Collections.shuffle(girlNameList);
        //从0索引开始
        girhs.add(familyNameList.get(0) + girlNameList.get(0));

    }
    //3.张三-男-23
    ArrayList<String> list = new ArrayList<>();
    Random r = new Random();
    for (String boyName : boyhs) {
        int age = r.nextInt(10) + 18;
        list.add(boyName + "-男-" + age);
    }
    //4.生成女生信息加入集合
    for (String girName : girhs) {
        int age = r.nextInt(8) + 18;
        list.add(girName + "-女-" + age);
    }

    return list;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值