微信公众号文章爬取(Java 实现)

本文通过登录个人订阅号后台、调用后台接口的方式抓取公众号文章,这种方式在请求频率和次数上都会受到限制。

注:以下代码只是初稿,是一个简单写成的测试类,投入实际使用之前还需要进一步整理。

/**
 * Test-style sketch that lists the articles of a WeChat official account by
 * driving the mp.weixin.qq.com admin back end.
 *
 * <p>Flow: {@link #wechatLogin()} opens the login page in a browser, fills in the
 * credentials and waits for a manual QR-code scan, then stores the session cookies.
 * {@link #getContent(String)} replays those cookies over plain HTTP to resolve the
 * session token, look up the target account's {@code fakeid}, and page through its
 * article list.
 *
 * <p>All state is static and unsynchronized; this class is meant to be run from a
 * single thread.
 */
public class PersonSubscriptionTest {
    /** Login account for the official-account back end (placeholder). */
    private static final String USER_NAME = "xxxx";
    /** Login password for the official-account back end (placeholder). */
    private static final String PASSWORD = "xxxx";
    /** Offset of the first article requested when asking for the total count. */
    private static final int BEGIN = 0;
    /** Number of articles requested per page. */
    private static final int COUNT = 5;
    /** Pause between two consecutive page requests, in milliseconds. */
    private static final long PAGE_INTERVAL_MILLIS = 10000L;
    private static final String HOST = "mp.weixin.qq.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0";
    private static final String LOGIN_URL = "https://mp.weixin.qq.com/";
    /** Account search endpoint; placeholders: query, token. */
    private static final String SEARCH_URL = "https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&query=%s&lang=zh_CN&f=json&ajax=1&token=%s";
    /** Referer sent with every API request; placeholder: token. */
    private static final String REFERER = "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&lang=zh_CN&token=%s";
    /** Article list endpoint; placeholders: begin, count, fakeid, token. */
    private static final String APPMSG_URL = "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=%d&count=%d&fakeid=%s&type=9&query=&token=%s&lang=zh_CN&f=json&ajax=1";
    private static final String TOKEN_PARAM = "token=";

    /** Session cookies captured by {@link #wechatLogin()}; {@code null} until login ran. */
    private static Set<Cookie> cookies;

    /** Title and link of one article as returned by the article list endpoint. */
    static class WeixinArticleList {
        private String link;
        private String title;

        public String getLink() {
            return link;
        }

        public void setLink(String link) {
            this.link = link;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public WeixinArticleList(String link, String title) {
            this.link = link;
            this.title = title;
        }
    }

    /**
     * Opens the login page in a browser, submits the configured credentials, waits
     * for the operator to scan the QR code, and stores the resulting cookies.
     *
     * <p>The waits are fixed sleeps; the method does not verify that the login
     * actually succeeded before reading the cookies.
     *
     * @throws InterruptedException if the thread is interrupted during one of the waits
     */
    public static void wechatLogin() throws InterruptedException {
        System.out.println("启动浏览器,打开微信公众号登录界面");
        ChromeDriverBean chromeDriverBean = SpringUtil.getBean(ChromeDriverBean.class);
        PooledWebDriver driver = chromeDriverBean.get();
        driver.get(LOGIN_URL);
        Thread.sleep(5000);
        System.out.println("正在输入微信公众号登录账号和密码......");
        driver.findElement(By.xpath("//a[@class='login__type__container__select-type']")).click();
        // Clear the account box, then type the login user name.
        fillInput(driver, "./*//input[@name='account']", USER_NAME);
        // Clear the password box, then type the login password.
        fillInput(driver, "./*//input[@name='password']", PASSWORD);
        // After the password is typed, "remember me" has to be ticked by hand.
        System.out.println("请在登录界面点击:记住账号");
        Thread.sleep(5000);
        // Click the login button.
        driver.findElement(By.xpath("//a[@class='btn_login']")).click();
        // The operator now has to scan the QR code with a phone.
        System.out.println("请拿手机扫码二维码登录公众号");
        Thread.sleep(10000);
        System.out.println("登录成功");
        // Reload the login page; once logged in it shows the back-end home page,
        // and the cookies are read from that session.
        driver.get(LOGIN_URL);
        cookies = driver.manage().getCookies();
    }

    /** Clears the input located by {@code xpath} and types {@code text} into it. */
    private static void fillInput(PooledWebDriver driver, String xpath, String text) {
        driver.findElement(By.xpath(xpath)).clear();
        driver.findElement(By.xpath(xpath)).sendKeys(text);
    }

    /**
     * Looks up the official account matching {@code query} and prints the title and
     * link of each of its articles.
     *
     * <p>Stops early with a console message when the session token or the account's
     * {@code fakeid} cannot be resolved.
     *
     * @param query search text used to find the official account
     * @throws IOException if the query cannot be URL-encoded
     * @throws IllegalStateException if {@link #wechatLogin()} has not stored cookies yet
     */
    public static void getContent(String query) throws IOException {
        if (cookies == null) {
            throw new IllegalStateException("No login cookies available; call wechatLogin() first");
        }
        StringBuilder cookieHeader = new StringBuilder();
        for (Cookie item : cookies) {
            cookieHeader.append(item.getName()).append("=").append(item.getValue()).append(";");
        }
        String cookie = cookieHeader.toString();

        // With a valid session the login page redirects to a URL carrying the token.
        String token = extractToken(getRedirectUrl(cookie));
        System.out.println("token:" + token);
        if (token == null) {
            System.out.println("未能获取token,请确认已登录成功");
            return;
        }
        String fakeId = getFakeId(cookie, query, token);
        System.out.println("fakeId:" + fakeId);
        if (fakeId == null) {
            System.out.println("未找到对应的公众号:" + query);
            return;
        }
        int totalNum = getTotalNum(cookie, token, fakeId);
        System.out.println("totalNum:" + totalNum);
        if (totalNum > 0) {
            for (WeixinArticleList article : getWeixinArticleList(cookie, token, fakeId, totalNum)) {
                System.out.println("title:" + article.getTitle());
                System.out.println("link:" + article.getLink());
            }
        }
    }

    /**
     * Returns the value of the {@code token} query parameter of {@code url}, or
     * {@code null} when the URL is {@code null} or carries no non-empty token.
     */
    private static String extractToken(String url) {
        if (url == null) {
            return null;
        }
        int queryStart = url.indexOf('?');
        if (queryStart < 0) {
            return null;
        }
        String queryPart = url.substring(queryStart + 1);
        int fragmentStart = queryPart.indexOf('#');
        if (fragmentStart >= 0) {
            queryPart = queryPart.substring(0, fragmentStart);
        }
        for (String pair : queryPart.split("&")) {
            if (pair.startsWith(TOKEN_PARAM) && pair.length() > TOKEN_PARAM.length()) {
                return pair.substring(TOKEN_PARAM.length());
            }
        }
        return null;
    }

    /**
     * Requests the login page with the given cookies and returns the final URL after
     * redirects have been followed.
     *
     * @param cookies value of the {@code Cookie} request header
     * @return the resolved final URL, or {@code null} if the request failed
     */
    public static String getRedirectUrl(String cookies) {
        HttpClientContext context = HttpClientContext.create();
        HttpGet httpget = new HttpGet(LOGIN_URL);
        httpget.addHeader("Cookie", cookies);
        // Resources are closed in reverse order: response first, then the client.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpget, context)) {
            HttpHost target = context.getTargetHost();
            List<URI> redirectLocations = context.getRedirectLocations();
            URI location = URIUtils.resolve(httpget.getURI(), target, redirectLocations);
            return location.toASCIIString();
        } catch (IOException | URISyntaxException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sends a GET request carrying the headers the back-end API expects and returns
     * the response body decoded as UTF-8. The response is always closed.
     */
    private static String fetchBody(CloseableHttpClient httpclient, String url, String cookies, String token)
            throws IOException {
        HttpGet httpget = new HttpGet(url);
        httpget.addHeader("Referer", String.format(REFERER, token));
        httpget.addHeader("Cookie", cookies);
        httpget.addHeader("HOST", HOST);
        httpget.addHeader("User-Agent", USER_AGENT);
        try (CloseableHttpResponse response = httpclient.execute(httpget)) {
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }

    /**
     * Searches for an official account and returns the {@code fakeid} of the first hit.
     *
     * @param cookies value of the {@code Cookie} request header
     * @param query   search text; it is URL-encoded before being placed in the request
     * @param token   session token
     * @return the first result's {@code fakeid}, or {@code null} if the request failed
     *         or the response contains no result
     * @throws IOException if the query cannot be URL-encoded
     * @throws IllegalArgumentException if {@code query} is {@code null}
     */
    public static String getFakeId(String cookies, String query, String token) throws IOException {
        if (query == null) {
            throw new IllegalArgumentException("query must not be null");
        }
        String encodedQuery = java.net.URLEncoder.encode(query, "UTF-8");
        String url = String.format(SEARCH_URL, encodedQuery, token);
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            JSONObject reJson = JSON.parseObject(fetchBody(httpclient, url, cookies, token));
            JSONArray accounts = reJson == null ? null : reJson.getJSONArray("list");
            if (accounts == null || accounts.isEmpty()) {
                return null;
            }
            return accounts.getJSONObject(0).getString("fakeid");
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Requests the first article page and returns the total article count reported
     * in its {@code app_msg_cnt} field.
     *
     * @param cookies value of the {@code Cookie} request header
     * @param token   session token
     * @param fakeId  identifier of the target account
     * @return the reported count, or {@code 0} if the request failed or the field is absent
     */
    public static int getTotalNum(String cookies, String token, String fakeId) {
        String firstPageUrl = String.format(APPMSG_URL, BEGIN, COUNT, fakeId, token);
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            JSONObject reJson = JSON.parseObject(fetchBody(httpclient, firstPageUrl, cookies, token));
            return reJson == null ? 0 : reJson.getIntValue("app_msg_cnt");
        } catch (IOException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Pages through the article list of an account and collects title and link of
     * each article.
     *
     * <p>Pages are requested {@code COUNT} articles at a time (the last page asks only
     * for the remainder), with a pause between consecutive requests. Crawling stops
     * at the first failed request, at the first response without an
     * {@code app_msg_list} field, or when the thread is interrupted; whatever was
     * collected up to that point is returned.
     *
     * @param cookies  value of the {@code Cookie} request header
     * @param token    session token
     * @param fakeId   identifier of the target account
     * @param totalNum total number of articles to fetch
     * @return the collected articles, never {@code null}
     */
    public static List<WeixinArticleList> getWeixinArticleList(String cookies, String token, String fakeId, int totalNum) {
        List<WeixinArticleList> list = new ArrayList<>();
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            int page = 1;
            for (int pos = 0; pos < totalNum; pos += COUNT, page++) {
                if (pos > 0) {
                    // Throttle between pages only; no need to wait after the last one.
                    Thread.sleep(PAGE_INTERVAL_MILLIS);
                }
                System.out.println("开始爬取第" + page + "页");
                int pageSize = Math.min(COUNT, totalNum - pos);
                String pageUrl = String.format(APPMSG_URL, pos, pageSize, fakeId, token);
                JSONObject reJson = JSON.parseObject(fetchBody(httpclient, pageUrl, cookies, token));
                JSONArray articles = reJson == null ? null : reJson.getJSONArray("app_msg_list");
                if (articles == null) {
                    System.out.println("第" + page + "页未返回文章列表,停止爬取");
                    break;
                }
                for (int i = 0; i < articles.size(); i++) {
                    JSONObject job = articles.getJSONObject(i);
                    list.add(new WeixinArticleList(job.getString("link"), job.getString("title")));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        return list;
    }

    @Test
    public void test() throws InterruptedException, IOException {
        wechatLogin();
        getContent("XXX");
    }
}

 

微信公众号开发源码Java是用Java语言编写的用于开发微信公众号的源代码。微信公众号开发是指通过开发者账号申请成为微信公众号的开发者,利用微信提供的开发接口和SDK来开发和管理公众号。 Java作为一种广泛应用于企业级开发的编程语言,在微信公众号开发中也得到了广泛应用。通过使用Java开发微信公众号,可以实现公众号的业务逻辑,包括用户管理、消息推送、菜单设置、素材管理等功能,以及与其他系统的对接、数据的处理和存储等。 对于开发微信公众号的源码来说,Java源码通常包括了处理微信服务器与开发者服务器之间的消息通信和交互的代码,以及各类功能模块的实现代码。开发者可以根据自己的需求和业务逻辑,使用Java语言编写各种业务逻辑代码,并通过开发工具集成微信提供的SDK库来实现与微信服务器之间的交互。 在Java源码的基础上,开发者还可以根据需要进行定制和扩展,以满足更具体的业务需求。可以添加自定义的功能模块或者对现有功能进行修改和优化,以适应不同的应用场景和业务要求。 总之,微信公众号开发源码Java是用于开发微信公众号的源代码,通过使用Java语言和相应的开发工具,开发者可以自定义和实现具体的业务功能,满足不同用户的需求,并与微信服务器进行消息交互,为用户提供更好的微信公众号服务。
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值