注意:下面的代码使用了 Hutool 工具包(cn.hutool.core.io.FileUtil,用于把结果写入文件),运行前需要先引入该依赖。
hutool: 🍬小而全的Java工具类库,使Java拥有函数式语言般的优雅,让Java语言也可以“甜甜的”。 (gitee.com)
package a_综合练习;
import cn.hutool.core.io.FileUtil;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Scrapes surnames and given names from three web pages, combines them into random full
 * names with a sex marker and an age, and writes the result to a text file.
 *
 * <p>Every output line has the form {@code <surname><given name>-<sex>-<age>}.
 */
public class a_爬取名字 {

    /** Charset of the output file; also the fallback when a page does not declare its own. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Destination file for the generated names. */
    private static final String OUTPUT_PATH =
            "C:\\IDEA代码\\javalean\\ab_IOlean\\src\\a_综合练习\\a.txt";

    /** Single shared random source for picking names and ages. */
    private static final Random RANDOM = new Random();

    /**
     * Downloads the three source pages, extracts the name pools, prints them and writes
     * 40 male and 20 female generated entries to {@link #OUTPUT_PATH}.
     *
     * @param args unused
     * @throws IOException if one of the pages cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        String nameFirst = "https://zhuanlan.zhihu.com/p/25056561";
        String boynames = "http://www.haoming8.cn/baobao/10881.html";
        String girlName = "http://www.haoming8.cn/baobao/7641.html";

        String nameFirstContent = fetchPage(nameFirst);
        String boyNameContent = fetchPage(boynames);
        String girlNameContent = fetchPage(girlName);

        // Eight single characters separated by single spaces.
        String nameFirstRegex = "(. ){7}.";
        // Two CJK characters followed by an enumeration comma or a full stop; group 1 is the name.
        String boyNameRegex = "([\\u4E00-\\u9FA5]{2})(、|。)";
        // Five two-character tokens separated by single spaces.
        String girlNameRegex = "(.. ){4}..";

        // Boy names: capture group 1 already is a single name, so only de-duplicate.
        Set<String> boyNameSet = new HashSet<>(extractMatches(boyNameContent, boyNameRegex, 1));
        System.out.println(boyNameSet);

        // Girl names: each match is a space-separated row of names.
        Set<String> girlNameSet = splitIntoSet(extractMatches(girlNameContent, girlNameRegex, 0));
        System.out.println(girlNameSet);

        // Surnames: each match is a space-separated row of surnames.
        Set<String> nameFirstSet = splitIntoSet(extractMatches(nameFirstContent, nameFirstRegex, 0));
        System.out.println(nameFirstSet);

        // Combine surnames and given names and write them out.
        writeNames(boyNameSet, girlNameSet, nameFirstSet, 40, 20);
    }

    /**
     * Generates the requested number of male and female entries and writes them, one per
     * line and males first, to {@link #OUTPUT_PATH} as UTF-8.
     *
     * @param boyNameSet  pool of male given names
     * @param girlNameSet pool of female given names
     * @param nameset     pool of surnames
     * @param boyNumber   number of male entries to generate
     * @param girlNumber  number of female entries to generate
     */
    private static void writeNames(Set<String> boyNameSet, Set<String> girlNameSet,
                                   Set<String> nameset, int boyNumber, int girlNumber) {
        int[] girlAge = {18, 28};
        int[] boyAge = {18, 30};

        List<String> entries = new ArrayList<>(boyNumber + girlNumber);
        entries.addAll(generateEntries(boyNameSet, nameset, boyNumber, boyAge, '男'));
        entries.addAll(generateEntries(girlNameSet, nameset, girlNumber, girlAge, '女'));

        FileUtil.writeLines(entries, OUTPUT_PATH, UTF_8);
    }

    /**
     * Builds {@code number} entries of the form {@code <surname><given name>-<sex>-<age>}.
     *
     * @param nameLast  pool of given names
     * @param nameFirst pool of surnames
     * @param number    how many entries to build
     * @param ages      two-element array holding the inclusive minimum and maximum age
     * @param sex       sex marker written into every entry
     * @return the generated entries, in generation order
     * @throws IllegalArgumentException if an entry is requested while a pool is empty
     */
    private static List<String> generateEntries(Set<String> nameLast, Set<String> nameFirst,
                                                int number, int[] ages, char sex) {
        // Copy the pools into lists once so each pick is a direct index lookup and every
        // element, including the last one, is equally likely to be chosen.
        List<String> surnames = new ArrayList<>(nameFirst);
        List<String> givenNames = new ArrayList<>(nameLast);

        List<String> entries = new ArrayList<>(Math.max(number, 0));
        for (int i = 0; i < number; i++) {
            String first = pickRandom(surnames, "surname");
            String last = pickRandom(givenNames, "given name");
            int age = RANDOM.nextInt(ages[1] - ages[0] + 1) + ages[0];
            entries.add(first + last + "-" + sex + "-" + age);
        }
        return entries;
    }

    /**
     * Returns a uniformly chosen element of {@code pool}.
     *
     * @param pool        candidates to choose from
     * @param description what the pool holds, used only in the error message
     * @return one element of {@code pool}
     * @throws IllegalArgumentException if {@code pool} is empty
     */
    private static String pickRandom(List<String> pool, String description) {
        if (pool.isEmpty()) {
            throw new IllegalArgumentException(
                    "No " + description + " available: the scraped pool is empty");
        }
        return pool.get(RANDOM.nextInt(pool.size()));
    }

    /**
     * Splits every line on single spaces and collects the distinct non-empty tokens.
     *
     * @param lines space-separated rows of names
     * @return the distinct tokens found in {@code lines}
     */
    private static Set<String> splitIntoSet(List<String> lines) {
        Set<String> tokens = new HashSet<>();
        for (String line : lines) {
            for (String token : line.split(" ")) {
                // Consecutive or leading spaces yield empty tokens, which are not names.
                if (!token.isEmpty()) {
                    tokens.add(token);
                }
            }
        }
        return tokens;
    }

    /**
     * Collects one capture group from every match of {@code regex} in {@code content}.
     *
     * @param content text to search
     * @param regex   regular expression to apply
     * @param dex     index of the capture group to collect ({@code 0} for the whole match)
     * @return the captured strings in order of appearance
     */
    private static List<String> extractMatches(String content, String regex, int dex) {
        List<String> matches = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(content);
        while (matcher.find()) {
            matches.add(matcher.group(dex));
        }
        return matches;
    }

    /**
     * Downloads the document at {@code website} and returns it as text.
     *
     * <p>The body is decoded with the charset named in the response's {@code Content-Type}
     * header, or with UTF-8 when the header names none, so the result does not depend on
     * the platform default encoding.
     *
     * @param website URL of the page to download
     * @return the decoded response body
     * @throws IOException if the URL is malformed or the download fails
     */
    private static String fetchPage(String website) throws IOException {
        URLConnection connection = new URL(website).openConnection();
        // Resolved before the stream is opened so the try block below owns the only resource.
        Charset charset = charsetOf(connection);

        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader even when a read fails midway.
        try (InputStreamReader reader =
                     new InputStreamReader(connection.getInputStream(), charset)) {
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                page.append(buffer, 0, read);
            }
        }
        return page.toString();
    }

    /**
     * Reads the {@code charset} parameter of the connection's {@code Content-Type} header.
     *
     * @param connection connection whose response header is inspected
     * @return the declared charset, or UTF-8 when none is declared or it is not usable
     */
    private static Charset charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return UTF_8;
        }
        String key = "charset=";
        for (String part : contentType.split(";")) {
            String parameter = part.trim();
            if (parameter.regionMatches(true, 0, key, 0, key.length())) {
                String charsetName = parameter.substring(key.length()).replace("\"", "").trim();
                try {
                    return Charset.forName(charsetName);
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: use the UTF-8 default instead.
                    return UTF_8;
                }
            }
        }
        return UTF_8;
    }
}
结果演示