1,爬取-在网址中爬取数据并返回字符串
/**
 * Small crawler demo: downloads the text behind a URL and extracts the parts of it that match a
 * regular expression.
 */
public class paquxing {

    /**
     * Downloads the hard-coded page and runs the extraction regex over its text.
     *
     * @param args unused
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String familyname = "https://so.gushiwen.cn/guwen/book_46653FD803893E4FE03CBAE75DE61AB8.aspx";
        String s = webCrawler(familyname);
        // NOTE(review): as written this matches "any character followed by a literal 4", then
        // either ", " or a space plus any character. It looks like a garbled form of
        // "four characters followed by a full-width comma or full stop" (".{4}" plus the two
        // punctuation marks) -- confirm the intended pattern before relying on the matches.
        String regex = "(.(4))(, | .)";
        // The list of matches is currently discarded; nothing is printed or stored.
        getData(s, regex, 0);
    }

    /**
     * Collects one capture group from every match of {@code regex} in {@code s}.
     *
     * @param s     text to search
     * @param regex regular expression to apply
     * @param i     index of the capture group to collect (0 is the whole match)
     * @return the selected group of each match, in the order the matches occur; empty if there
     *         are none
     */
    private static ArrayList<String> getData(String s, String regex, int i) {
        ArrayList<String> list = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(s);
        while (matcher.find()) {
            list.add(matcher.group(i));
        }
        return list;
    }

    /**
     * Reads everything the given URL serves and returns it as a string.
     *
     * <p>The bytes are decoded as UTF-8 (assumes the resource is UTF-8 encoded; the declared
     * content type is not inspected).
     *
     * @param net URL to read, in string form
     * @return the full decoded content of the resource
     * @throws IOException if the URL is malformed or the resource cannot be opened or read
     */
    public static String webCrawler(String net) throws IOException {
        StringBuilder sb = new StringBuilder();
        URL url = new URL(net);
        URLConnection uc = url.openConnection();
        // try-with-resources closes the reader (and the underlying stream) even if read() throws.
        try (InputStreamReader isr =
                new InputStreamReader(uc.getInputStream(), java.nio.charset.StandardCharsets.UTF_8)) {
            char[] buffer = new char[8192];
            int count;
            while ((count = isr.read(buffer)) != -1) {
                // Append the characters themselves; appending the int returned by read() would
                // add each character's decimal code instead of the character.
                sb.append(buffer, 0, count);
            }
        }
        return sb.toString();
    }
}