通过订阅号的方式抓取,频率和次数都会受到限制。
以下代码只是初稿,是一个简单编写的测试类;投入实际使用前还需要做一些整理(注)。
public class PersonSubscriptionTest {
/** Account name typed into the login form (placeholder value). */
private static final String userName = "xxxx";
/** Password typed into the login form (placeholder value). */
private static final String password = "xxxx";
/** Browser cookies captured by {@code wechatLogin()}; {@code null} until that method has run. */
private static Set<Cookie> cookies;
/** Value of the {@code begin} parameter used for the first article-list request. */
private static final int begin = 0;
/** Value of the {@code count} parameter (page size) used for article-list requests. */
private static final int count = 5;
/** Value sent in the {@code HOST} header of the JSON requests. */
private static final String host = "mp.weixin.qq.com";
/** Value sent in the {@code User-Agent} header of the JSON requests. */
private static final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0";
/** Page opened in the browser for login and requested again to discover the redirect URL. */
private static final String loginUrl = "https://mp.weixin.qq.com/";
/** Account search URL template; format arguments: query, token. */
private static final String searchUrl = "https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&query=%s&lang=zh_CN&f=json&ajax=1&token=%s";
/** {@code Referer} header template; format argument: token. */
private static final String referer = "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&lang=zh_CN&token=%s";
/** Article list URL template; format arguments: begin, count, fakeid, token. */
private static final String appmsgUrl = "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=%d&count=%d&fakeid=%s&type=9&query=&token=%s&lang=zh_CN&f=json&ajax=1";
/**
 * Mutable holder for a single article entry: its link and its title.
 */
static class WeixinArticleList {

    private String link;
    private String title;

    /**
     * Creates an entry.
     *
     * @param link  the article link
     * @param title the article title
     */
    public WeixinArticleList(String link, String title) {
        this.title = title;
        this.link = link;
    }

    /** @return the article title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the article link */
    public String getLink() {
        return this.link;
    }

    /** @param link the new article link */
    public void setLink(String link) {
        this.link = link;
    }
}
/**
 * Drives a browser through the login page at {@code loginUrl}: fills in the
 * configured account and password, clicks the login button, waits fixed
 * periods for the manual steps announced on the console, then reloads the
 * page and stores the browser cookies in the static {@code cookies} field.
 *
 * <p>No check is made that the login actually succeeded; the method only
 * waits and then reads whatever cookies the browser holds.
 *
 * @throws InterruptedException if the thread is interrupted during one of the waits
 */
public static void wechatLogin() throws InterruptedException {
    final long pageLoadPauseMillis = 5000;
    final long rememberMePauseMillis = 5000;
    final long qrScanPauseMillis = 10000;

    System.out.println("启动浏览器,打开微信公众号登录界面");
    PooledWebDriver browser = SpringUtil.getBean(ChromeDriverBean.class).get();
    browser.get(loginUrl);
    Thread.sleep(pageLoadPauseMillis);

    System.out.println("正在输入微信公众号登录账号和密码......");
    By loginTypeSwitch = By.xpath("//a[@class='login__type__container__select-type']");
    browser.findElement(loginTypeSwitch).click();

    // Empty the account box, then type the configured user name.
    clearAndType(browser, By.xpath("./*//input[@name='account']"), userName);
    // Empty the password box, then type the configured password.
    clearAndType(browser, By.xpath("./*//input[@name='password']"), password);

    // After the password is typed, "remember me" has to be ticked by hand.
    System.out.println("请在登录界面点击:记住账号");
    Thread.sleep(rememberMePauseMillis);

    // Press the login button.
    By loginButton = By.xpath("//a[@class='btn_login']");
    browser.findElement(loginButton).click();

    // The QR code has to be scanned with a phone during this wait.
    System.out.println("请拿手机扫码二维码登录公众号");
    Thread.sleep(qrScanPauseMillis);
    System.out.println("登录成功");

    // Reload the login page and keep the cookies the browser holds afterwards.
    browser.get(loginUrl);
    cookies = browser.manage().getCookies();
}

/** Looks the element up, clears it, looks it up again and types {@code text} into it. */
private static void clearAndType(PooledWebDriver browser, By locator, String text) {
    browser.findElement(locator).clear();
    browser.findElement(locator).sendKeys(text);
}
/**
 * Prints the title and link of every article returned for the account that
 * matches {@code query}.
 *
 * <p>Steps: build a {@code Cookie} header from the static {@code cookies}
 * set, obtain the redirect URL via {@code getRedirectUrl}, read the
 * {@code token} parameter from it, resolve the account's fakeid via
 * {@code getFakeId}, read the article count via {@code getTotalNum} and,
 * when it is positive, fetch and print the articles.
 *
 * @param query search keyword handed to {@code getFakeId}
 * @throws IOException           if no token can be read from the redirect URL, if no
 *                               fakeid is found, or if {@code getFakeId} throws
 * @throws IllegalStateException if no cookies have been stored yet
 */
public static void getContent(String query) throws IOException {
    if (cookies == null) {
        throw new IllegalStateException("No login cookies available; call wechatLogin() first");
    }

    // Same header layout as before: name=value pairs, each followed by ';'.
    StringBuilder cookieHeader = new StringBuilder();
    for (Cookie c : cookies) {
        cookieHeader.append(c.getName()).append("=").append(c.getValue()).append(";");
    }
    String cookie = cookieHeader.toString();

    String redirectUrl = getRedirectUrl(cookie);
    String token = extractToken(redirectUrl);
    if (token == null) {
        // getRedirectUrl returns null on failure, and a URL without a token
        // parameter cannot be used for the following requests.
        throw new IOException("No token found in redirect URL: " + redirectUrl);
    }
    System.out.println("token:" + token);

    String fakeId = getFakeId(cookie, query, token);
    if (fakeId == null) {
        throw new IOException("No fakeid found for query: " + query);
    }
    System.out.println("fakeId:" + fakeId);

    int totalNum = getTotalNum(cookie, token, fakeId);
    System.out.println("totalNum:" + totalNum);
    if (totalNum > 0) {
        for (WeixinArticleList article : getWeixinArticleList(cookie, token, fakeId, totalNum)) {
            System.out.println("title:" + article.getTitle());
            System.out.println("link:" + article.getLink());
        }
    }
}

/** Returns the value of the {@code token} query parameter of {@code url}, or null if absent or empty. */
private static String extractToken(String url) {
    if (url == null) {
        return null;
    }
    int keyAt = url.indexOf("?token=");
    if (keyAt < 0) {
        keyAt = url.indexOf("&token=");
    }
    if (keyAt < 0) {
        return null;
    }
    int start = keyAt + "?token=".length();
    int end = start;
    while (end < url.length() && url.charAt(end) != '&' && url.charAt(end) != '#') {
        end++;
    }
    return end > start ? url.substring(start, end) : null;
}
/**
 * Requests {@code loginUrl} with the given {@code Cookie} header and returns
 * the final location after redirects, as resolved by {@code URIUtils.resolve}
 * from the request URI, the target host and the redirect locations recorded
 * in the request context.
 *
 * @param cookies value for the {@code Cookie} request header
 * @return the resolved URL as an ASCII string, or {@code null} when the
 *         request or the URI resolution fails (the exception is printed)
 */
public static String getRedirectUrl(String cookies) {
    HttpClientContext context = HttpClientContext.create();
    HttpGet httpget = new HttpGet(loginUrl);
    httpget.addHeader("Cookie", cookies);

    String redirectUrl = null;
    // try-with-resources closes the response first, then the client, and
    // never touches a response that was not created because execute() failed.
    try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(httpget, context)) {
        HttpHost target = context.getTargetHost();
        List<URI> redirectLocations = context.getRedirectLocations();
        URI location = URIUtils.resolve(httpget.getURI(), target, redirectLocations);
        redirectUrl = location.toASCIIString();
    } catch (IOException | URISyntaxException e) {
        e.printStackTrace();
    }
    return redirectUrl;
}
/**
 * Searches for {@code query} through the {@code searchUrl} endpoint and
 * returns the {@code fakeid} of the first entry in the response's
 * {@code list} array.
 *
 * @param cookies value for the {@code Cookie} request header
 * @param query   search keyword; it is URL-encoded as UTF-8 before being
 *                placed in the request URL, so pass it unencoded
 * @param token   token inserted into the request URL and the {@code Referer} header
 * @return the fakeid, or {@code null} when the request fails (the exception
 *         is printed) or the response contains no entries
 * @throws IOException              if the query cannot be URL-encoded
 * @throws IllegalArgumentException if {@code query} is {@code null}
 */
public static String getFakeId(String cookies, String query, String token) throws IOException {
    if (query == null) {
        throw new IllegalArgumentException("query must not be null");
    }
    // Keywords may contain non-ASCII characters or spaces, which are not
    // valid in a raw request URL.
    String encodedQuery = java.net.URLEncoder.encode(query, "UTF-8");
    String url = String.format(searchUrl, encodedQuery, token);
    String refererUrl = String.format(referer, token);

    HttpGet httpGet = new HttpGet(url);
    httpGet.addHeader("Referer", refererUrl);
    httpGet.addHeader("Cookie", cookies);
    httpGet.addHeader("HOST", host);
    httpGet.addHeader("User-Agent", userAgent);

    String fakeId = null;
    // try-with-resources closes the response before the client and is safe
    // when execute() throws before a response exists.
    try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(httpGet)) {
        String result = EntityUtils.toString(response.getEntity(), "utf-8");
        JSONObject reJson = JSON.parseObject(result);
        JSONArray accounts = reJson == null ? null : reJson.getJSONArray("list");
        // A response without a "list" array, or with an empty one, yields null
        // instead of an exception.
        if (accounts != null && !accounts.isEmpty()) {
            fakeId = accounts.getJSONObject(0).getString("fakeid");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return fakeId;
}
/**
 * Requests the first article-list page (parameters {@code begin} and
 * {@code count}) for the given fakeid and returns the {@code app_msg_cnt}
 * value of the response.
 *
 * @param cookies value for the {@code Cookie} request header
 * @param token   token inserted into the request URL and the {@code Referer} header
 * @param fakeId  fakeid inserted into the request URL
 * @return the value of {@code app_msg_cnt}, or {@code 0} when the request
 *         fails (the exception is printed) or the field is absent
 */
public static int getTotalNum(String cookies, String token, String fakeId) {
    String firstPageUrl = String.format(appmsgUrl, begin, count, fakeId, token);
    String refererUrl = String.format(referer, token);

    HttpGet httpget = new HttpGet(firstPageUrl);
    httpget.addHeader("Referer", refererUrl);
    httpget.addHeader("Cookie", cookies);
    httpget.addHeader("HOST", host);
    httpget.addHeader("User-Agent", userAgent);

    int totalNum = 0;
    // try-with-resources closes the response before the client and is safe
    // when execute() throws before a response exists.
    try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(httpget)) {
        String result = EntityUtils.toString(response.getEntity(), "utf-8");
        JSONObject reJson = JSON.parseObject(result);
        if (reJson != null) {
            // getIntValue yields 0 when the key is missing, so a response
            // without the counter is reported as "no articles".
            totalNum = reJson.getIntValue("app_msg_cnt");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return totalNum;
}
/**
 * Fetches the article list page by page and collects title and link of each
 * entry in {@code app_msg_list}.
 *
 * <p>Pages are requested with {@code count} entries each; the last page asks
 * only for the remaining entries. The method sleeps 10 seconds between two
 * consecutive page requests. Paging stops early, returning what has been
 * collected so far, when a request fails, when a response carries no
 * {@code app_msg_list}, or when the thread is interrupted.
 *
 * @param cookies  value for the {@code Cookie} request header
 * @param token    token inserted into the request URLs and the {@code Referer} header
 * @param fakeId   fakeid inserted into the request URLs
 * @param totalNum total number of articles to fetch
 * @return the collected entries; empty when {@code totalNum} is not positive
 */
public static List<WeixinArticleList> getWeixinArticleList(String cookies, String token, String fakeId, int totalNum) {
    List<WeixinArticleList> list = new ArrayList<>();
    if (totalNum <= 0) {
        return list;
    }
    String refererUrl = String.format(referer, token);
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        int pageNo = 1;
        for (int pos = 0; pos < totalNum; pos += count, pageNo++) {
            if (pos > 0) {
                // Pause between consecutive page requests.
                Thread.sleep(10000);
            }
            System.out.println("开始爬取第" + pageNo + "页");
            // Full pages use count; the final page asks only for what is left.
            int pageSize = Math.min(count, totalNum - pos);
            String pageUrl = String.format(appmsgUrl, pos, pageSize, fakeId, token);
            JSONArray articles = fetchArticlePage(httpclient, pageUrl, refererUrl, cookies);
            if (articles == null) {
                System.err.println("No app_msg_list in response for " + pageUrl + "; stopping");
                break;
            }
            for (int i = 0; i < articles.size(); i++) {
                JSONObject job = articles.getJSONObject(i);
                list.add(new WeixinArticleList(job.getString("link"), job.getString("title")));
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
    }
    return list;
}

/** Requests one article-list page and returns its {@code app_msg_list} array, or null if absent. */
private static JSONArray fetchArticlePage(CloseableHttpClient httpclient, String pageUrl, String refererUrl, String cookies) throws IOException {
    HttpGet httpget = new HttpGet(pageUrl);
    httpget.addHeader("Referer", refererUrl);
    httpget.addHeader("Cookie", cookies);
    httpget.addHeader("HOST", host);
    httpget.addHeader("User-Agent", userAgent);
    // Each page's response is closed as soon as its body has been read.
    try (CloseableHttpResponse response = httpclient.execute(httpget)) {
        String result = EntityUtils.toString(response.getEntity(), "utf-8");
        JSONObject reJson = JSON.parseObject(result);
        return reJson == null ? null : reJson.getJSONArray("app_msg_list");
    }
}
/**
 * Runs the whole flow: performs the browser login, then prints the articles
 * found for the placeholder query {@code "XXX"}.
 *
 * @throws InterruptedException if the login waits are interrupted
 * @throws IOException          if fetching the content fails
 */
@Test
public void test() throws InterruptedException, IOException {
    final String accountQuery = "XXX";
    wechatLogin();
    getContent(accountQuery);
}
}