Java Zhihu Crawler

I wrote a simple crawler to scrape the images under certain Zhihu questions. The original motivation, naturally, was to crawl pictures of girls. Without further ado, here's the code.
GitHub repo: https://github.com/kevindai007/springbootTest

package kevindai.pachong;


import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by kevindai on 2017/1/12.
 */
public class WebSpider {
    public static String sendUrl(String url){
        // Buffer that accumulates the page content
        StringBuffer sb = new StringBuffer();
        // Buffered character stream for reading the response
        BufferedReader in = null;
        try {
            // Turn the string into a URL object
            URL realUrl = new URL(url);
            // Open a connection to that URL
            URLConnection connection = realUrl.openConnection();
            // Establish the actual connection
            connection.connect();
            // Wrap the response stream in a BufferedReader
            in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
            // Holds each line of the response as it is read
            String line;
            while ((line = in.readLine()) != null) {
                // Append every fetched line to the result
                sb.append(line);
            }
        } catch (Exception e) {
            System.out.println("Exception while sending GET request: " + e);
            e.printStackTrace();
        }
        // Close the input stream in the finally block
        finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return sb.toString();
    }

    public static String regexStr(String targetStr,String regex){
        StringBuilder result = new StringBuilder();
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(targetStr);
        while (matcher.find()){
            result.append(matcher.group(1)).append(",");
        }

        return result.toString();
    }

    public static void main(String[] args) {
        String url = "https://www.zhihu.com/question/31839156";
        String result = sendUrl(url);

        // Extract every image URL from the answer HTML
        result = regexStr(result,"</noscript><img.+?src=\"(https.+?)\".+?");
        List<String> zhihuPicUrl = Arrays.asList(result.split(","));
        for (String s : zhihuPicUrl) {
            System.out.println(s);
        }
        //System.out.println(result);
    }
}

Once the image URLs are extracted, downloading them is straightforward, so I left the download code out of this version (a minimal sketch follows below).
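For completeness, here is a minimal sketch of what that download step could look like using only the JDK; the class name PicDownloader and the helper signature are my own placeholders for illustration, not part of the original project.

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class PicDownloader {
    // Saves one image URL into the given directory, naming the file
    // after the last path segment of the URL (a hypothetical helper).
    public static void download(String picUrl, String dir) throws Exception {
        String fileName = picUrl.substring(picUrl.lastIndexOf('/') + 1);
        Path target = Paths.get(dir, fileName);
        Files.createDirectories(target.getParent());
        try (InputStream in = new URL(picUrl).openStream()) {
            Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}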

All I want to say is four words: take care of yourself.


This crawler has one limitation, though: it can only fetch the first 10 answers under a question. I did find Zhihu's pagination mechanism, but I kept running into problems when sending the parameters; once I get it working I'll post an update.

Update: pagination is now working, and downloading is implemented as well.

package com.kevindai.webSpider;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by kevindai on 2017/1/12.
 */
public class WebSpider {
    public static String sendUrl(String url){
        // Buffer that accumulates the page content
        StringBuffer sb = new StringBuffer();
        // Buffered character stream for reading the response
        BufferedReader in = null;
        try {
            // Turn the string into a URL object
            URL realUrl = new URL(url);
            // Open a connection to that URL
            URLConnection connection = realUrl.openConnection();
            // Establish the actual connection
            connection.connect();
            // Wrap the response stream in a BufferedReader
            in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
            // Holds each line of the response as it is read
            String line;
            while ((line = in.readLine()) != null) {
                // Append every fetched line to the result
                sb.append(line);
            }
        } catch (Exception e) {
            System.out.println("Exception while sending GET request: " + e);
            e.printStackTrace();
        }
        // Close the input stream in the finally block
        finally {
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return sb.toString();
    }

    public static String regexStr(String targetStr,String regex){
        StringBuilder result = new StringBuilder();
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(targetStr);
        while (matcher.find()){
            result.append(matcher.group(1)).append(",");
        }

        return result.toString();
    }

    public static String httpClientSendUrl(String url) throws Exception{
        String result = null;
        HttpClient httpClient = HttpClients.createDefault();//create an HttpClient instance
        HttpGet httpGet = new HttpGet(url);//build a GET request
        HttpResponse response = httpClient.execute(httpGet);//execute the request
        HttpEntity entity = response.getEntity();//grab the response body
        result = EntityUtils.toString(entity);//convert the body to a string
        return result;
    }

    public  static String httpClientPostUrl(String url,int offset,int pageSize) throws Exception{
        String result = null;
        //create an HttpClient instance
        HttpClient httpClient = HttpClients.createDefault();
        //Zhihu's pagination endpoint; build a POST request against it
        HttpPost post = new HttpPost("https://www.zhihu.com/node/QuestionAnswerListV2");
        //set the header fields, copied from the browser console
        post.setHeader("Host","www.zhihu.com");
        post.setHeader("Origin","https://www.zhihu.com");
        post.setHeader("Referer",url);
        post.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36");
        post.setHeader("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");
        post.setHeader("Accept-Language","zh-CN,zh;q=0.8");
        //set the pagination parameters
        List<NameValuePair> list = new ArrayList<NameValuePair>();
        list.add(new BasicNameValuePair("method","next"));
        //the params value is a small JSON object: {"url_token":<question id>,"pagesize":<page size>,"offset":<offset>}
        list.add(new BasicNameValuePair("params","{\"url_token\":" + url.substring(url.lastIndexOf("/") + 1) + ",\"pagesize\":"+ pageSize +",\"offset\":"+ offset +"}"));
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(list);
        post.setEntity(formEntity);//attach the form parameters to the request

        //execute and read back the result
        HttpResponse response = httpClient.execute(post);
        result = EntityUtils.toString(response.getEntity());
        return  result.replace("\\","");//the paginated payload comes back escaped, so strip the backslashes
    }

    public static File downLoadFile(String URL,String filePath){
        File f = new File(filePath);
        try{
            URL httpurl = new URL(URL);
            //commons-io does the actual streaming to disk
            FileUtils.copyURLToFile(httpurl, f);
        }catch (Exception e){
            //don't abort the whole run over one bad image; just log it
            e.printStackTrace();
        }
        return f;
    }

    public static void main(String[] args) throws  Exception {
        String url = "https://www.zhihu.com/question/31839156";
        //fetch the question page to find out how many answers it has
        String content = httpClientSendUrl(url);
        //extract the total number of answers
        String total = regexStr(content,"id=\"zh-question-answer-num\">([0-9]{1,}.+?).+?");
        total = total.substring(0,total.indexOf(",")).trim();
        //extract the question title, used to name the download folder
        String title = regexStr(content,"class=\"zm-editable-content\">(.*).+?</span>");
        title = title.substring(0,title.indexOf(",")).trim();
        File dir = new File("D:" + File.separator + title);
        if(!dir.exists()){
            dir.mkdir();
        }


        //page through the answers under the question
        int pageSize = 10;
        int count = Integer.valueOf(total)/pageSize + 1;
        String result = null;
        for (int i = 0; i < count ; i++) {
            result = httpClientPostUrl(url,pageSize * i,pageSize);
            //pull the image URLs out of the returned markup
            result = regexStr(result,"</noscript><img.+?src=\"(https.+?)\".+?");
            List<String> zhihuPicUrls = Arrays.asList(result.split(","));
            for (String picUrl : zhihuPicUrls) {
                //download each image by its URL
                downLoadFile(picUrl,dir.getPath() + File.separator + picUrl.substring(picUrl.lastIndexOf("/") + 1));
            }
        }

        System.out.println("-----------------------------------------------");
        System.out.println("All done, just enjoy it!");
        System.out.println("Output directory: " + dir.getPath());
        System.out.println("-----------------------------------------------");
    }
}

It is still a bit slow, though; I'll improve it later with a thread pool.
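As a rough idea of that improvement, here is a minimal sketch that runs the download loop on a fixed thread pool; the pool size of 8 and the ParallelDownload class name are my own assumptions for illustration, not part of the project.

import java.io.File;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelDownload {
    //submits each image download as an independent task, then waits for all of them
    public static void downloadAll(List<String> picUrls, File dir) throws InterruptedException {
        //8 threads is an arbitrary choice for illustration
        ExecutorService pool = Executors.newFixedThreadPool(8);
        for (String picUrl : picUrls) {
            String path = dir.getPath() + File.separator + picUrl.substring(picUrl.lastIndexOf("/") + 1);
            //reuses the existing downLoadFile helper from WebSpider
            pool.submit(() -> WebSpider.downLoadFile(picUrl, path));
        }
        pool.shutdown();//stop accepting new tasks
        pool.awaitTermination(10, TimeUnit.MINUTES);//wait for in-flight downloads to finish
    }
}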
