package com.iminer.crawlers.gsdata;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.UnexpectedPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.iminer.crawlers.CrawlerUtils4MusicAndMovie;
/**
 * Demo crawler for the gsdata.cn ranking pages, driven by HtmlUnit.
 *
 * <p>The program loads the ranking entrance page, reads the {@code data-gid}
 * attribute of every group list item, requests the JSON ranking list of each
 * group, and for every row in that list opens the detail page and prints the
 * text of its {@code li.li_2} elements to standard output.
 */
public class Test {

    /** Ranking-list URL template; {@code @GID@} is replaced with a group id before the request. */
    static String url = "http://www.gsdata.cn/index.php/rank/ranks?gid=@GID@&date=2015-09-22&page=1";

    /** Detail-page URL prefix; a row's {@code nickname_id} is appended to it. */
    static String detailurl = "http://www.gsdata.cn/index.php/rank/single?id=";

    /**
     * Runs the crawl and prints the results to standard output.
     *
     * @param args ignored
     * @throws Exception if a page cannot be fetched or the ranking response
     *         does not have the expected JSON structure
     */
    public static void main(String[] args) throws Exception {
        WebClient webClient = CrawlerUtils4MusicAndMovie.getClient();
        try {
            String entranceUrl = "http://www.gsdata.cn/index.php/rank/detail?gid=0";
            for (String gid : collectGroupIds(webClient, entranceUrl)) {
                crawlGroup(webClient, gid);
            }
        } finally {
            // Release the client's windows and background resources.
            // NOTE(review): close() requires HtmlUnit 2.16+; on older versions
            // the equivalent call is closeAllWindows() - confirm project version.
            webClient.close();
        }
    }

    /** Reads the {@code data-gid} attribute of every group item on the entrance page. */
    private static List<String> collectGroupIds(WebClient webClient, String entranceUrl) throws IOException {
        HtmlPage entrancePage = webClient.getPage(entranceUrl);
        List<String> gids = new ArrayList<String>();
        for (Object node : entrancePage.getByXPath("//ul[@class='group-items']/li")) {
            String gid = ((HtmlElement) node).getAttribute("data-gid");
            // Skip items that carry no group id: an empty value would only
            // produce a "gid=" request. (HtmlUnit is expected to return an
            // empty string for a missing attribute - TODO confirm.)
            if (gid != null && gid.length() > 0) {
                gids.add(gid);
            }
        }
        return gids;
    }

    /** Fetches the JSON ranking list of one group, prints its total and visits every row's detail page. */
    private static void crawlGroup(WebClient webClient, String gid) throws IOException {
        String listUrl = url.replace("@GID@", gid);

        // Typed as Page rather than a concrete page class: only the raw
        // response body is needed, so the page type HtmlUnit picks for the
        // response's content type does not matter.
        Page listPage;
        // The ranking endpoint is requested as an AJAX call. The header is
        // removed again right after the request so that it is not sent along
        // with the regular detail-page requests made through the same client.
        webClient.addRequestHeader("X-Requested-With", "XMLHttpRequest");
        try {
            listPage = webClient.getPage(listUrl);
        } finally {
            webClient.removeRequestHeader("X-Requested-With");
        }

        JSONObject data = JSONObject.fromObject(listPage.getWebResponse().getContentAsString())
                .getJSONObject("data");
        System.out.println(data.getString("total"));

        JSONArray rows = data.getJSONArray("rows");
        for (Object row : rows) {
            String id = ((JSONObject) row).getString("nickname_id");
            printDetail(webClient, detailurl + id);
        }
    }

    /** Prints the detail URL, then the text content of every {@code li.li_2} element on that page. */
    private static void printDetail(WebClient webClient, String detailPageUrl) throws IOException {
        System.out.println(detailPageUrl);
        // Visit the detail page.
        HtmlPage detailPage = webClient.getPage(detailPageUrl);
        for (Object node : detailPage.getByXPath("//li[@class='li_2']")) {
            System.out.println(((HtmlElement) node).getTextContent());
        }
        // Not implemented here: fetching the statistics data, e.g.
        // http://www.gsdata.cn/index.php/rank/singleStatistic?id=52
    }
}
// Java crawler demo
// Latest recommended article published 2022-07-15 14:42:59