// 姓氏 https://hanyu.baidu.com/shici/detail?pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d&from=kg0 // 男 http://www.haoming8.cn/baobao/10881.html // 女 http://www.haoming8.cn/baoba0/7641.html //1.定义变量网址 String familyNameNet = "https://hanyu.baidu.com/shici/detail?pid=0b2f26d4c0ddb3ee693fdb1137ee1b0d"; String boyNameNet = " http://www.haoming8.cn/baobao/10881.html"; String girlNameNet = "http://www.haoming8.cn/baobao/7641.html"; //2.爬取数据 把网址上所有的数据拼接成字符串 String familyNameSrt = webCrawler(familyNameNet); String boyNameSrt = webCrawler(boyNameNet); String girlNameSrt = webCrawler(girlNameNet); //3.通过正则表达式,把符合要求的传递过来 ArrayList<String> familyNameTempList = getDate(familyNameSrt, "([\\u4E00-\\u9FA5]{4})(,|。)", 1); ArrayList<String> boyNameTempList = getDate(boyNameSrt, "([\\u4E00-\\u9FA5]{2})(、|。)", 1); ArrayList<String> girlNameTempList = getDate(girlNameSrt, "(.. ){4}..", 0); //4.处理数据 //familyNameTempList(姓氏) ArrayList<String> familyNameList = new ArrayList<>(); for (String str : familyNameTempList) { //str 就是 赵钱孙李 周吴郑王 for (int i = 0; i < str.length(); i++) { familyNameList.add(str.charAt(i) + ""); } } //boyNameTempList(男生) ArrayList<String> boyNameList = new ArrayList<>(); for (String str : boyNameTempList) { if (!boyNameList.contains(str)) { boyNameList.add(str); } } //girlNameTempList(女生) ArrayList<String> girlNameList = new ArrayList<>(); for (String str : girlNameTempList) { String[] arr = str.split(" "); for (String s : arr) { girlNameList.add(s); } } //5.生成数据 ArrayList<String> list = getInfos(familyNameList, boyNameList, girlNameList, 70, 50); Collections.sort(list); //6.写出数据 BufferedWriter bw = new BufferedWriter(new FileWriter("name.txt")); for (String str : list) { bw.write(str); bw.newLine(); } bw.close(); } /* 参数一:完整字符串 参数二:正则表达式 参数三:如果是0就是获得全部 如果是1就是前一部分 */ private static ArrayList<String> getDate(String str, String regex, int index) { //1.创建集合存储数据 ArrayList<String> list = new ArrayList<>(); //2.按照正则表达式的规则读取数据 Pattern pattern = Pattern.compile(regex); //按照pattern的规则,到str当中获取数据 Matcher matcher = pattern.matcher(str); while (matcher.find()) { list.add(matcher.group(index)); } return list; } //形参:网址 //返回值:所需数据 public static String webCrawler(String net) throws IOException { //1.定义StringBuilder去拼接数据 StringBuilder sb = new StringBuilder(); //2.创建url对象(网址) URL url = new URL(net); //3.链接网址 //细节:保证网络畅通 //网址可以使用 URLConnection conn = url.openConnection(); //4.读取数据 //字节流转换为字符流 //细节:有可能有中文所以转化为字符流 InputStreamReader isr = new InputStreamReader(conn.getInputStream()); //5.循环读取 int ch; while ((ch = isr.read()) != -1) { sb.append((char) ch); } //6.释放资源 isr.close(); return sb.toString(); } public static ArrayList<String> getInfos(ArrayList<String> familyNameList, ArrayList<String> boyNameList, ArrayList<String> girlNameList, int boycount, int girlcount) { //1.生成不重复的名字 //男生 HashSet<String> boyhs = new HashSet<>(); while (true) { if (boyhs.size() == boycount) { break; } Collections.shuffle(familyNameList); Collections.shuffle(boyNameList); //从0索引开始 boyhs.add(familyNameList.get(0) + boyNameList.get(0)); } //女生 HashSet<String> girhs = new HashSet<>(); while (true) { if (girhs.size() == girlcount) { break; } //随机 Collections.shuffle(familyNameList); Collections.shuffle(girlNameList); //从0索引开始 girhs.add(familyNameList.get(0) + girlNameList.get(0)); } //3.张三-男-23 ArrayList<String> list = new ArrayList<>(); Random r = new Random(); for (String boyName : boyhs) { int age = r.nextInt(10) + 18; list.add(boyName + "-男-" + age); } //4.生成女生信息加入集合 for (String girName : girhs) { int age = r.nextInt(8) + 18; list.add(girName + "-女-" + age); } return list;
爬取男生 女生姓名
最新推荐文章于 2025-05-20 22:23:34 发布