// Crawl the most recent articles. Feel free to get in touch with questions - I am new to web scraping!
/**
 * Browser session shared by the methods below. It starts out unset; {@code test()} creates a
 * ChromeDriver on first use and the same instance is used afterwards.
 */
static WebDriver driver;
/**
 * Opens the Sogou WeChat search page, runs an account search for the keyword, opens the first
 * account in the result list and passes the rendered page to {@code getDetail}.
 *
 * <p>The ChromeDriver session is created on the first call and kept in the static
 * {@code driver} field. Fixed two-second pauses are used between steps to let the page settle.
 *
 * @throws IllegalStateException if the thread is interrupted during one of the pauses
 */
public void test() {
    if (driver == null) {
        // One call is enough: System.setProperty writes to the same system properties table.
        System.setProperty("webdriver.chrome.driver", "d://chromedriver.exe");
        driver = new ChromeDriver();
    }
    final String firstAccountXpath = "//*[@uigs='account_name_0']";

    driver.get("https://weixin.sogou.com");
    driver.findElement(By.id("query")).sendKeys("公众号");
    driver.findElement(By.xpath("//*[@uigs='search_account']")).click();
    pauseFor(2000);

    WebElement el = driver.findElement(By.xpath(firstAccountXpath));
    pauseFor(2000);
    // Clear the link's target attribute before clicking; presumably this keeps the account page
    // in the window the driver is attached to, since the page is read from it right after.
    JavascriptExecutor js = (JavascriptExecutor) driver;
    js.executeScript("arguments[0].target=''", el);
    pauseFor(2000);

    // Look the link up again instead of reusing the earlier reference.
    driver.findElement(By.xpath(firstAccountXpath)).click();
    pauseFor(2000);

    WebElement webElement = driver.findElement(By.xpath("/html"));
    String responseContent = webElement.getAttribute("outerHTML");
    Document doc = Jsoup.parse(responseContent);
    this.getDetail(doc);
    pauseFor(2000);
}

/** Sleeps for the given number of milliseconds; on interruption restores the flag and aborts. */
private static void pauseFor(long millis) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        // Thread.sleep throws a checked exception; keep the interrupt visible to the caller's
        // thread and stop the crawl rather than continuing against a half-loaded page.
        Thread.currentThread().interrupt();
        throw new IllegalStateException("Interrupted while waiting for the page to load", e);
    }
}
/**
 * Reads the {@code msgList} JSON embedded in the given page, builds one record per article of
 * the first entry in its {@code list} array (the main article plus every entry of
 * {@code multi_app_msg_item_list}), then opens each article through the shared {@code driver}
 * and reads its title, source name and body from the rendered page.
 *
 * <p>Does nothing when {@code list} is empty. Missing article fields are recorded as empty
 * strings.
 *
 * <p>NOTE(review): neither the records nor the values read from the article pages are stored
 * or returned by the code visible here; a persistence step presumably belongs inside the loop.
 *
 * @param doc parsed HTML of the account page; its serialized form must contain a
 *            {@code msgList = {...};} assignment
 * @throws IllegalStateException if the {@code msgList} assignment cannot be located
 */
public void getDetail(Document doc) {
    JSONObject json = JSONObject.fromObject(extractMsgListJson(doc.html()));
    JSONArray messages = json.getJSONArray("list");
    if (messages.isEmpty()) {
        return;
    }

    // Parse the first list entry once instead of re-reading it for every field.
    JSONObject mainArticle = messages.getJSONObject(0).getJSONObject("app_msg_ext_info");

    List<Map<String, Object>> datalist = new ArrayList<Map<String, Object>>();
    datalist.add(toArticleRecord(mainArticle));

    JSONArray extraArticles = mainArticle.optJSONArray("multi_app_msg_item_list");
    if (extraArticles != null) {
        for (int i = 0; i < extraArticles.size(); i++) {
            datalist.add(toArticleRecord(extraArticles.getJSONObject(i)));
        }
    }

    for (int i = 0; i < datalist.size(); i++) {
        if (i > 0) {
            // Go back to the account page before opening the next article.
            driver.navigate().back();
        }
        Map<String, Object> datas = datalist.get(i);
        // Articles are opened by position: the i-th thumbnail is taken to match the i-th record.
        driver.findElements(By.xpath("//*[@class='weui_media_hd']")).get(i).click();

        WebElement webElement = driver.findElement(By.xpath("/html"));
        Document baseData = Jsoup.parse(webElement.getAttribute("outerHTML"));

        String title = baseData.getElementById("activity-name").text();
        // Use the "post-user" element when present, otherwise the nickname in the profile box.
        String sourcename = baseData.getElementById("post-user") == null
                ? baseData.getElementById("profileBt").getElementsByClass("profile_nickname").text()
                : baseData.getElementById("post-user").text();
        String content = baseData.getElementById("img-content").toString();
        String descs = datas.get("Desc").toString();
        String img = datas.get("img").toString();
    }
}

/** Cuts the JSON object assigned to {@code msgList} out of the page source. */
private static String extractMsgListJson(String html) {
    final String marker = "msgList = ";
    int markerAt = html.indexOf(marker);
    if (markerAt < 0) {
        throw new IllegalStateException("Page does not contain a 'msgList = ' assignment");
    }
    int start = markerAt + marker.length();
    int end = html.indexOf("};", start);
    if (end < 0) {
        throw new IllegalStateException("msgList assignment is not terminated by '};'");
    }
    // The search stops in front of the closing brace, so put it back.
    return html.substring(start, end) + "}";
}

/** Copies the article fields of one JSON entry into a record map. */
private static Map<String, Object> toArticleRecord(JSONObject article) {
    Map<String, Object> record = new HashMap<String, Object>();
    record.put("url", article.optString("content_url"));
    record.put("Desc", article.optString("digest"));
    record.put("title", article.optString("title"));
    record.put("author", article.optString("author"));
    record.put("img", article.optString("cover"));
    return record;
}