1.定义一个对象
package Zhihu;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * One Zhihu question scraped from its question page.
 *
 * <p>Constructing an instance normalizes the given link to the question URL,
 * downloads that page through {@code Spider.sendGet} and extracts the title,
 * the description and the answer count with regular expressions. Any field
 * that cannot be extracted stays an empty string.
 */
public class Zhihu {
    /** Captures the text of the page's {@code <h1>} heading (the question title). */
    private static final Pattern QUESTION_PATTERN = Pattern.compile("<h1.+>(.+)</h1>");
    /** Captures the content of the "RichText ztext" span (the question description). */
    private static final Pattern DESCRIPTION_PATTERN = Pattern.compile("RichText ztext.+?>(.+?)</span>");
    /** Captures the span text inside an {@code <h4>} element (the answer count). */
    private static final Pattern ANSWERS_PATTERN = Pattern.compile("<h4.+?<span>(.+?)<!");
    /**
     * Captures the question id that follows "question/". The id ends at the next
     * '/', '?' or '#', or at the end of the string, so links without a trailing
     * path segment are accepted as well.
     */
    private static final Pattern QUESTION_ID_PATTERN = Pattern.compile("question/([^/?#]+)");

    /** Question title; empty when it could not be extracted. */
    public String question;
    /** Question description; empty when it could not be extracted. */
    public String questionDescription;
    /** Normalized question URL; empty when the constructor argument was not a question link. */
    public String zhihuUrl;
    /** Answer count as shown on the page; empty when it could not be extracted. */
    public String answers;

    /**
     * Builds the question from a link and immediately scrapes its page.
     *
     * @param url absolute or relative link containing {@code question/<id>};
     *            when it is null or contains no question id, every field stays empty
     *            and no request is sent
     */
    public Zhihu(String url) {
        question = "";
        questionDescription = "";
        zhihuUrl = "";
        answers = "";

        // Only question links are scraped.
        if (!getRealUrl(url)) {
            return;
        }
        System.out.println("正在抓取:"+ zhihuUrl);

        String content = Spider.sendGet(zhihuUrl);
        if (content == null || content.isEmpty()) {
            // Nothing was downloaded, so there is nothing to match against.
            return;
        }

        Matcher matcher = QUESTION_PATTERN.matcher(content);
        if (matcher.find()) {
            question = matcher.group(1);
        }

        matcher = DESCRIPTION_PATTERN.matcher(content);
        if (matcher.find()) {
            questionDescription = matcher.group(1);
        }

        // When the pattern matches several times, the last match wins.
        matcher = ANSWERS_PATTERN.matcher(content);
        while (matcher.find()) {
            answers = matcher.group(1);
        }
    }

    /**
     * Placeholder: performs no work and always reports success.
     *
     * @return always {@code true}
     */
    public boolean getAll() {
        return true;
    }

    /**
     * Reduces a link to the URL of its question and stores it in {@link #zhihuUrl}.
     *
     * <p>For example {@code https://www.zhihu.com/question/299145115/answer/513959527}
     * becomes {@code https://www.zhihu.com/question/299145115}. Links that stop
     * right after the id, or continue with a query string, are handled the same way.
     *
     * @param url the link to normalize; may be null
     * @return {@code true} when a question id was found and {@link #zhihuUrl} was set,
     *         {@code false} otherwise (in which case {@link #zhihuUrl} is left untouched)
     */
    boolean getRealUrl(String url) {
        if (url == null) {
            return false;
        }
        Matcher matcher = QUESTION_ID_PATTERN.matcher(url);
        if (!matcher.find()) {
            return false;
        }
        zhihuUrl = "https://www.zhihu.com/question/" + matcher.group(1);
        return true;
    }

    /**
     * Renders the question, description, link and answer count, one per line.
     */
    @Override
    public String toString() {
        return "问题:"+ question + "\n描述:"+questionDescription+"\n链接:" + zhihuUrl + "\n回答:"+answers+"个回答" + "\n";
    }
}
2.爬虫
package Zhihu;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal HTTP helper for the Zhihu crawler: downloads pages and extracts the
 * question links from a recommendations page.
 */
public class Spider {
    /** Captures the href value that follows a "question_link" marker. */
    private static final Pattern QUESTION_LINK_PATTERN =
            Pattern.compile("question_link.+?href=\"(.+?)\"");

    /**
     * Downloads the resource at {@code url} and returns its body decoded as UTF-8.
     *
     * <p>Lines are concatenated without line separators, so the result is a
     * single line of text. Failures are reported on standard output and never
     * thrown: the method then returns whatever was read before the failure,
     * which is an empty string when the connection could not be opened.
     *
     * @param url the address to fetch
     * @return the response body with line breaks removed; never null
     */
    static String sendGet(String url) {
        // StringBuilder avoids re-copying the whole page for every line read.
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // The reader is closed automatically, and only if it was actually opened;
            // closing an unopened reader in a finally block would raise a
            // NullPointerException whenever the connection fails.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Builds one {@link Zhihu} per question link found in {@code content}.
     *
     * <p>Each {@code Zhihu} is constructed from the captured href, and its
     * constructor fetches the question page, so this method performs one
     * request per recognised question link.
     *
     * @param content HTML of the recommendations page; may be null
     * @return the questions in the order their links appear; empty when
     *         {@code content} is null or contains no question link
     */
    static ArrayList<Zhihu> GetRecommendations(String content) {
        ArrayList<Zhihu> results = new ArrayList<Zhihu>();
        if (content == null) {
            return results;
        }
        Matcher urlMatcher = QUESTION_LINK_PATTERN.matcher(content);
        while (urlMatcher.find()) {
            results.add(new Zhihu(urlMatcher.group(1)));
        }
        return results;
    }
}
3.测试main函数
package Zhihu;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry point of the crawler demo.
 */
public class Main {
    /**
     * Fetches the Zhihu recommendations page, turns every recommended question
     * link into a {@code Zhihu} object and prints the resulting list.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final String recommendationsUrl = "https://www.zhihu.com/explore/recommendations";
        // Download the page, then scrape each recommended question from it.
        ArrayList<Zhihu> recommendedQuestions =
                Spider.GetRecommendations(Spider.sendGet(recommendationsUrl));
        System.out.println(recommendedQuestions);
    }
}
4.运行结果
5.抓取的内容存储到本地
package ex3;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
/**
 * File helpers for storing crawled content locally.
 *
 * <p>Both methods accept paths written with either '/' or '\' separators and
 * create missing parent directories before touching the file.
 */
public class FileReaderWriter {
    /**
     * Creates a new, empty file, creating its parent directories when needed.
     *
     * @param filePath path of the file to create; '\' separators are treated as '/'
     * @return {@code true} only when a new file was created; {@code false} when
     *         the file already exists, the path is null, or creation failed
     */
    public static boolean createNewFile(String filePath) {
        if (filePath == null) {
            return false;
        }
        String normalizedPath = normalizeSeparators(filePath);
        // The outcome is decided by the file creation below; a missing directory
        // surfaces there as an IOException.
        ensureParentDirectory(normalizedPath);
        try {
            return new File(normalizedPath).createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Writes {@code content} to a file, creating the file and its parent
     * directories when they do not exist yet.
     *
     * @param content  the text to write
     * @param filePath path of the target file; '\' separators are treated as '/'
     * @param isAppend {@code true} to append to existing content, {@code false} to replace it
     * @return {@code true} when the content was written and the file closed
     *         without error; {@code false} when an argument is null or an
     *         I/O error occurred
     */
    public static boolean writeIntoFile(String content, String filePath, boolean isAppend) {
        if (content == null || filePath == null) {
            return false;
        }
        String normalizedPath = normalizeSeparators(filePath);
        ensureParentDirectory(normalizedPath);
        // FileWriter creates the file itself when it is missing. The catch also
        // covers a failing close(), so such a failure is reported to the caller.
        try (FileWriter fileWriter = new FileWriter(new File(normalizedPath), isAppend)) {
            fileWriter.write(content);
            fileWriter.flush();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Converts every '\' in the path to '/'. */
    private static String normalizeSeparators(String filePath) {
        return filePath.replace('\\', '/');
    }

    /**
     * Creates the directory part of a '/'-separated path, if it has one.
     * A bare file name, or a file directly under the root, needs no directory.
     */
    private static void ensureParentDirectory(String normalizedPath) {
        int separatorIndex = normalizedPath.lastIndexOf('/');
        if (separatorIndex > 0) {
            new File(normalizedPath.substring(0, separatorIndex)).mkdirs();
        }
    }
}