1.定义一个类,用来存储抓取到的结果
package Zhihu;
import java.util.ArrayList;
import java.util.Arrays;
/**
 * Plain data holder for one scraped entry: a question title, the link to
 * its page, and the list of answers collected for it.
 *
 * <p>All fields are public and mutable; a new instance starts with empty
 * strings and an empty answer list.
 */
public class Zhihu {
    /** Question title. */
    public String question = "";
    /** Link to the question's web page. */
    public String zhihuUrl = "";
    /** Answers collected for the question. */
    public ArrayList<String> answers = new ArrayList<String>();

    /** Creates an empty entry; every field keeps its initial empty value. */
    public Zhihu() {
    }

    /**
     * Renders the entry as three labelled lines (question, link, answers),
     * each terminated by a newline.
     *
     * @return the human-readable form of this entry
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("问题:").append(question);
        text.append("\n链接:").append(zhihuUrl);
        text.append("\n回答").append(answers);
        text.append("\n");
        return text.toString();
    }
}
2.爬虫
package Zhihu;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Small scraping helper: downloads a page with a GET request and pulls
 * question titles and links out of the returned markup with regular
 * expressions.
 */
public class Spider {
    /** Captures the text between the end of a "question_link" tag and the closing {@code </a>}. */
    private static final Pattern QUESTION_PATTERN =
            Pattern.compile("question_link.+?>(.+?)</a>");

    /** Captures the {@code href} attribute value that follows a "question_link" marker. */
    private static final Pattern URL_PATTERN =
            Pattern.compile("question_link.+?href=\"(.+?)\"");

    /**
     * Fetches the content behind {@code url} and returns it as one string.
     *
     * <p>The response is decoded as UTF-8 and read line by line; the lines
     * are concatenated without their line terminators, so the result
     * contains no line breaks.
     *
     * <p>Failures are not propagated: any exception is printed to standard
     * output together with its stack trace, and whatever was read up to
     * that point (possibly the empty string) is returned.
     *
     * @param url address of the page to download
     * @return the page content read so far, never {@code null}
     */
    static String sendGet(String url) {
        // A builder avoids re-copying the whole page on every appended line.
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            // The reader is still null when the failure happened before the
            // stream was opened (bad URL, refused connection, ...). Closing
            // it unguarded would throw a NullPointerException out of this
            // method after the real error was already handled.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result.toString();
    }

    /**
     * Extracts question entries from page markup.
     *
     * <p>Titles and links are located by two independent matchers that are
     * advanced in lockstep; the n-th title is paired with the n-th link.
     * Extraction stops as soon as either matcher finds nothing more.
     * NOTE(review): the pairing presumably relies on every "question_link"
     * anchor carrying both an href and a title - confirm against the page.
     *
     * @param content page markup to scan
     * @return the extracted entries in page order; empty when nothing matches
     */
    static ArrayList<Zhihu> GetZhihu(String content) {
        ArrayList<Zhihu> result = new ArrayList<Zhihu>();
        Matcher questionMatcher = QUESTION_PATTERN.matcher(content);
        Matcher urlMatcher = URL_PATTERN.matcher(content);
        // Both a title and a link must be found for an entry to be recorded.
        while (questionMatcher.find() && urlMatcher.find()) {
            Zhihu zhihuTemp = new Zhihu();
            zhihuTemp.question = questionMatcher.group(1);
            // The captured href is appended to the site root as-is.
            zhihuTemp.zhihuUrl = "http://www.zhihu.com" + urlMatcher.group(1);
            result.add(zhihuTemp);
        }
        return result;
    }
}
3.main函数
package Zhihu;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry point: downloads the recommendations page, extracts the question
 * entries from it and prints them.
 */
public class Main {
    /**
     * Runs the fetch, parse and print sequence once.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Page to scrape.
        final String exploreUrl = "https://www.zhihu.com/explore/recommendations";
        // Download the page, then turn its markup into question entries.
        final String pageMarkup = Spider.sendGet(exploreUrl);
        final ArrayList<Zhihu> questions = Spider.GetZhihu(pageMarkup);
        System.out.println(questions);
    }
}
输出: