微信公众号文章爬取 java

通过登录个人订阅号后台(mp.weixin.qq.com)、调用其后台接口的方式抓取,请求频率和次数都会受到限制。

以下代码只是初稿,是一个简单的测试类;投入实际使用前还需要进一步整理。

/**
 * Test harness that lists the articles of a WeChat official account by driving the
 * {@code mp.weixin.qq.com} back office of a logged-in account.
 *
 * <p>Flow: {@link #wechatLogin()} opens the login page in a browser and captures the session
 * cookies; {@link #getContent(String)} then derives the session token from the post-login
 * redirect, looks up the target account's {@code fakeid}, reads the article count and pages
 * through the article list.
 *
 * <p>NOTE(review): the account name and password are hard-coded placeholders; move them to
 * external configuration before real use.
 */
public class PersonSubscriptionTest {
    private static final String USER_NAME = "xxxx";
    private static final String PASSWORD = "xxxx";
    /** Number of articles requested per page of the article list. */
    private static final int PAGE_SIZE = 5;
    /** Pause between two consecutive page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 10000L;
    private static final String HOST = "mp.weixin.qq.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0";
    private static final String LOGIN_URL = "https://mp.weixin.qq.com/";
    private static final String SEARCH_URL = "https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&query=%s&lang=zh_CN&f=json&ajax=1&token=%s";
    private static final String REFERER = "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&lang=zh_CN&token=%s";
    private static final String APPMSG_URL = "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=%d&count=%d&fakeid=%s&type=9&query=&token=%s&lang=zh_CN&f=json&ajax=1";

    /** Session cookies captured by {@link #wechatLogin()}; {@code null} until a login has run. */
    private static Set<Cookie> cookies;

    /** Title and link of a single article returned by the article-list endpoint. */
    static class WeixinArticleList {
        private String link;
        private String title;

        public WeixinArticleList(String link, String title) {
            this.link = link;
            this.title = title;
        }

        public String getLink() {
            return link;
        }

        public void setLink(String link) {
            this.link = link;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }
    }

    /**
     * Opens the login page in a browser, fills in the account and password, waits for the
     * operator to tick "remember account" and scan the QR code, then stores the browser's
     * cookies for later HTTP requests.
     *
     * <p>The waits are fixed sleeps; the method does not verify that the login actually
     * succeeded before printing the success message.
     *
     * @throws InterruptedException if the thread is interrupted during one of the waits
     */
    public static void wechatLogin() throws InterruptedException {
        System.out.println("启动浏览器,打开微信公众号登录界面");
        ChromeDriverBean chromeDriverBean = SpringUtil.getBean(ChromeDriverBean.class);
        // NOTE(review): the driver is never handed back or closed here; if the pool expects
        // an explicit release, add it once the ChromeDriverBean/PooledWebDriver API is confirmed.
        PooledWebDriver driver = chromeDriverBean.get();
        driver.get(LOGIN_URL);
        Thread.sleep(5000);
        System.out.println("正在输入微信公众号登录账号和密码......");
        driver.findElement(By.xpath("//a[@class='login__type__container__select-type']")).click();
        // Clear each field before typing so previously entered content is not appended to.
        fillInput(driver, "./*//input[@name='account']", USER_NAME);
        fillInput(driver, "./*//input[@name='password']", PASSWORD);
        // After the password is typed, the operator has to tick "remember account" by hand.
        System.out.println("请在登录界面点击:记住账号");
        Thread.sleep(5000);
        // Click the login button automatically.
        driver.findElement(By.xpath("//a[@class='btn_login']")).click();
        // The operator now has to scan the QR code with a phone.
        System.out.println("请拿手机扫码二维码登录公众号");
        Thread.sleep(10000);
        System.out.println("登录成功");
        // Reload the login page; once logged in it shows the back-office home page, and the
        // session cookies are read from the browser at that point.
        driver.get(LOGIN_URL);
        cookies = driver.manage().getCookies();
    }

    /** Clears the input located by {@code xpath} and types {@code text} into it. */
    private static void fillInput(PooledWebDriver driver, String xpath, String text) {
        driver.findElement(By.xpath(xpath)).clear();
        driver.findElement(By.xpath(xpath)).sendKeys(text);
    }

    /**
     * Looks up the account matching {@code query} and prints the title and link of each of
     * its articles.
     *
     * @param query search text for the target account; it is URL-encoded before being sent
     * @throws IOException           declared for compatibility with existing callers
     * @throws IllegalStateException if no login cookies are available, if the session token
     *                               cannot be derived from the post-login redirect, or if no
     *                               account matches {@code query}
     */
    public static void getContent(String query) throws IOException {
        if (cookies == null || cookies.isEmpty()) {
            throw new IllegalStateException("No session cookies available; call wechatLogin() first");
        }
        String cookie = buildCookieHeader();
        String redirectUrl = getRedirectUrl(cookie);
        String token = extractToken(redirectUrl);
        if (token == null) {
            throw new IllegalStateException(
                    "No token parameter in redirect URL (login probably failed): " + redirectUrl);
        }
        System.out.println("token:" + token);
        String fakeId = getFakeId(cookie, query, token);
        if (fakeId == null) {
            throw new IllegalStateException("Could not resolve a fakeid for query: " + query);
        }
        System.out.println("fakeId:" + fakeId);
        int totalNum = getTotalNum(cookie, token, fakeId);
        System.out.println("totalNum:" + totalNum);
        if (totalNum <= 0) {
            return;
        }
        for (WeixinArticleList article : getWeixinArticleList(cookie, token, fakeId, totalNum)) {
            System.out.println("title:" + article.getTitle());
            System.out.println("link:" + article.getLink());
        }
    }

    /** Joins the captured cookies into a single {@code Cookie} request-header value. */
    private static String buildCookieHeader() {
        StringBuilder header = new StringBuilder();
        for (Cookie it : cookies) {
            if (header.length() > 0) {
                header.append("; ");
            }
            header.append(it.getName()).append('=').append(it.getValue());
        }
        return header.toString();
    }

    /**
     * Returns the value of the {@code token} query parameter of {@code redirectUrl}, or
     * {@code null} when the URL is {@code null} or carries no non-empty token.
     */
    private static String extractToken(String redirectUrl) {
        if (redirectUrl == null) {
            return null;
        }
        int queryStart = redirectUrl.indexOf('?');
        if (queryStart < 0) {
            return null;
        }
        String queryString = redirectUrl.substring(queryStart + 1);
        int fragmentStart = queryString.indexOf('#');
        if (fragmentStart >= 0) {
            queryString = queryString.substring(0, fragmentStart);
        }
        String prefix = "token=";
        for (String pair : queryString.split("&")) {
            if (pair.startsWith(prefix) && pair.length() > prefix.length()) {
                return pair.substring(prefix.length());
            }
        }
        return null;
    }

    /**
     * Requests the login URL with the given cookies and returns the final URL after any
     * redirects have been followed.
     *
     * @param cookies value for the {@code Cookie} request header
     * @return the final location as an ASCII string, or {@code null} if the request failed
     */
    public static String getRedirectUrl(String cookies) {
        HttpClientContext context = HttpClientContext.create();
        HttpGet httpget = new HttpGet(LOGIN_URL);
        httpget.addHeader("Cookie", cookies);
        // try-with-resources closes the response before the client, and never touches a
        // response that was not created because execute() failed.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget, context)) {
            HttpHost target = context.getTargetHost();
            List<URI> redirectLocations = context.getRedirectLocations();
            URI location = URIUtils.resolve(httpget.getURI(), target, redirectLocations);
            return location.toASCIIString();
        } catch (IOException | URISyntaxException e) {
            reportFailure("resolve the post-login redirect", e);
            return null;
        }
    }

    /**
     * Queries the account-search endpoint and returns the {@code fakeid} of the first hit.
     *
     * @param cookies value for the {@code Cookie} request header
     * @param query   search text; URL-encoded as UTF-8 before being placed in the request
     * @param token   session token taken from the post-login redirect
     * @return the {@code fakeid} of the first search result, or {@code null} if the request
     *         failed or the response contained no results
     * @throws IOException if the query cannot be URL-encoded
     */
    public static String getFakeId(String cookies, String query, String token) throws IOException {
        String encodedQuery = java.net.URLEncoder.encode(query, "UTF-8");
        String url = String.format(SEARCH_URL, encodedQuery, token);
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            JSONObject reJson = fetchJson(httpclient, url, cookies, token);
            JSONArray accounts = reJson == null ? null : reJson.getJSONArray("list");
            if (accounts == null || accounts.isEmpty()) {
                System.err.println("Account search returned no results; response: " + reJson);
                return null;
            }
            return accounts.getJSONObject(0).getString("fakeid");
        } catch (IOException e) {
            reportFailure("search for the account", e);
            return null;
        }
    }

    /**
     * Requests the first page of the article list and returns the {@code app_msg_cnt} value
     * reported by the server.
     *
     * @param cookies value for the {@code Cookie} request header
     * @param token   session token taken from the post-login redirect
     * @param fakeId  id of the target account as returned by {@link #getFakeId}
     * @return the reported article count, or {@code 0} if the request failed or the response
     *         did not contain a count
     */
    public static int getTotalNum(String cookies, String token, String fakeId) {
        String firstPageUrl = String.format(APPMSG_URL, 0, PAGE_SIZE, fakeId, token);
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            JSONObject reJson = fetchJson(httpclient, firstPageUrl, cookies, token);
            if (reJson == null || !reJson.containsKey("app_msg_cnt")) {
                System.err.println("Article count missing from response: " + reJson);
                return 0;
            }
            return reJson.getIntValue("app_msg_cnt");
        } catch (IOException e) {
            reportFailure("read the article count", e);
            return 0;
        }
    }

    /**
     * Pages through the article list, {@code PAGE_SIZE} entries at a time, pausing between
     * pages, and collects the title and link of every entry.
     *
     * <p>Crawling stops early if a request fails, if a response carries no
     * {@code app_msg_list}, or if the thread is interrupted; the articles collected up to
     * that point are returned.
     *
     * @param cookies  value for the {@code Cookie} request header
     * @param token    session token taken from the post-login redirect
     * @param fakeId   id of the target account as returned by {@link #getFakeId}
     * @param totalNum number of articles to fetch, as returned by {@link #getTotalNum}
     * @return the collected articles; empty when {@code totalNum} is not positive
     */
    public static List<WeixinArticleList> getWeixinArticleList(String cookies, String token, String fakeId, int totalNum) {
        List<WeixinArticleList> list = new ArrayList<>();
        if (totalNum <= 0) {
            return list;
        }
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            int page = 1;
            for (int pos = 0; pos < totalNum; pos += PAGE_SIZE, page++) {
                if (pos > 0) {
                    // Pause only between pages; no wait is needed after the last one.
                    Thread.sleep(PAGE_DELAY_MILLIS);
                }
                // The last page asks only for the remaining entries.
                int pageSize = Math.min(PAGE_SIZE, totalNum - pos);
                System.out.println("开始爬取第" + page + "页");
                String pageUrl = String.format(APPMSG_URL, pos, pageSize, fakeId, token);
                JSONObject reJson = fetchJson(httpclient, pageUrl, cookies, token);
                JSONArray articles = reJson == null ? null : reJson.getJSONArray("app_msg_list");
                if (articles == null) {
                    System.err.println("Article list missing from response, stopping: " + reJson);
                    break;
                }
                for (int i = 0; i < articles.size(); i++) {
                    JSONObject job = articles.getJSONObject(i);
                    list.add(new WeixinArticleList(job.getString("link"), job.getString("title")));
                }
            }
        } catch (IOException e) {
            reportFailure("fetch the article list", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            reportFailure("wait between article pages", e);
        }
        return list;
    }

    /**
     * Sends a GET request with the back-office headers and parses the response body as a
     * JSON object. The response is always closed before returning.
     */
    private static JSONObject fetchJson(CloseableHttpClient httpclient, String url, String cookies, String token)
            throws IOException {
        HttpGet httpget = new HttpGet(url);
        httpget.addHeader("Referer", String.format(REFERER, token));
        httpget.addHeader("Cookie", cookies);
        httpget.addHeader("HOST", HOST);
        httpget.addHeader("User-Agent", USER_AGENT);
        try (CloseableHttpResponse response = httpclient.execute(httpget)) {
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            return JSON.parseObject(result);
        }
    }

    /** Prints a one-line description of a failed step followed by the stack trace. */
    private static void reportFailure(String action, Exception cause) {
        System.err.println("Failed to " + action + ": " + cause);
        cause.printStackTrace();
    }

    @Test
    public void test() throws InterruptedException, IOException {
        wechatLogin();
        getContent("XXX");
    }
}

 

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值