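// The code below scrapes statistics data from the 51.la website; you can register an account on the site to test against it. Site: 51.la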
package com.cc.api.test.user;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.ig.common.utils.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
* Created by Administrator on 2015/8/24.
*/
public class JsoupApiTest {
@Test
public void testGame () {
String wd = "http: www.51.la";
String url = "http: www.51.la/login.asp";
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
String newDate = sdf.format(new Date());
Connection.Response res = null;
try {
res = Jsoup.connect(url).data("uname", "betlog", "upass", "Betlogbiwin").method(Connection.Method.POST).execute();
} catch (IOException e) {
e.printStackTrace();
}
这儿的SESSIONID需要根据要登录的目标网站设置的session Cookie名字而定
String sessionId = res.cookie("ajstat");
try {
Document objectDoc = Jsoup.connect("http: www.51.la/user/index.asp").cookie("ajstat", sessionId).get();
System.out.print("统计ID报表" + "\r\n");
Elements groups = objectDoc.getElementsByClass("bodys_zw").first().getElementsByTag("div").first().getElementsByClass("grplistlink");
String groupInfo = "";
for(Element gr : groups){
groupInfo = groupInfo+ gr.text()+" ";
}
System.out.print(groupInfo + "\r\n\n");
System.out.print("获取具体的平台的数据情况 : " + "\r\n");
Elements ptElements = objectDoc.getElementsByClass("idlist_box");
for (Element pt :ptElements){
String bcptName = pt.getElementsByClass("idlist_n").first().text();
System.out.print(bcptName + "\r\n");
}
Element gr = objectDoc.getElementsByClass("idlist_box").first();
String bcptName = gr.getElementsByClass("idlist_n").first().text();
System.out.print(bcptName + "\r\n");
Elements bcreports = gr.getElementsByClass("idlist_o").first().getElementsByTag("a");
String agentlinks = bcreports.first().attr("href").substring(bcreports.first().attr("href").indexOf("/"), bcreports.first().attr("href").length());
String agentId = agentlinks.substring(agentlinks.indexOf("id") + 3, agentlinks.length());
String accInfo = "http: www.51.la/report/3_last.asp?id="+agentId+"&slailu=&skey=&spage=&sip=&d1="+newDate+"&t=kuai"; 访问明细
String serchEngine = "http: www.51.la/report/3_SE.asp?id="+agentId+"&d1="+newDate+"&d2="+newDate; 搜索引擎
String keyWords = "http: www.51.la/report/3_keyword.asp?id="+agentId+"&s=&d1="+newDate+"&d2="+newDate+"&ord=k_ci"; 关键词
String route = "http: www.51.la/report/3_Lailu.asp?id="+agentId+"&d1="+newDate+"&d2="+newDate+"&s=&ord=l_ci"; 来路
String pageBrowser = "http: www.51.la/report/3_page.asp?id="+agentId+"&isdm=0&s=&d1="+newDate+"&d2="+newDate+"&ord=p_ci"; 页面浏览
String domain = "http: www.51.la/report/3_page.asp?id="+agentId+"&isdm=1&s=&d1="+newDate+"&d2="+newDate+"&ord=p_ci"; 域名
String returnedCust = "http: www.51.la/report/3_Client.asp?id="+agentId+"&t=huitou&d1="+newDate+"&d2="+newDate; 回头客
String browsingDepth = "http: www.51.la/report/3_Shendu.asp?id="+agentId+"&d1="+newDate+"&d2="+newDate; 浏览深度
String os = "http: www.51.la/report/3_Client.asp?id="+agentId+"&t=os&d1="+newDate+"&d2="+newDate; 操作系统
String browser = "http: www.51.la/report/3_Client.asp?id="+agentId+"&t=soft&d1="+newDate+"&d2="+newDate; 浏览器
System.out.print("IP 访问地址 上站时间 来路 入口网址\r\n\n");
Document pageaccInfoDom = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
int pageaccInfoSize = 1;
if(null!=pageaccInfoDom.getElementById("pageslink")){
pageaccInfoSize = Integer.valueOf(pageaccInfoDom.getElementById("pageslink").getElementsByTag("a").last().text());
}
System.out.print("来路网站:总页数"+pageaccInfoSize+" \r\n");
if(null!=pageaccInfoDom.getElementsByClass("bodys_zw").first().getElementsByTag("table")){
for (int i=0;i<pageaccInfoSize ;i++) {
String requestRoteUrl = accInfo + "&p=" + (i + 1);
Document pageaccInfoDoms = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
Elements accInfoTrs = pageaccInfoDoms.getElementsByClass("bodys_zw").first().getElementsByTag("table").first().getElementsByTag("tbody").first().getElementsByTag("tr");
int tri = 0;
for (Element trs : accInfoTrs){
++tri;
if (tri>2 && tri<accInfoTrs.size()){
Elements accInfoTds = trs.getElementsByTag("td");
String accInfotdText = "";
String ip = accInfoTds.get(0).text(); ip
String address = accInfoTds.get(1).text(); 地址
String upperStationTime = newDate +" "+ accInfoTds.get(2).text(); 上站时间
String routes = accInfoTds.get(3).text(); 来路
String entranceSite = accInfoTds.get(4).getElementsByTag("a").first().attr("href"); 入口网址
accInfotdText = accInfotdText + ip + " "+ address+" "+upperStationTime+" "+routes + " "+entranceSite;
System.out.print("\r\n"+accInfotdText);
}
}
}
}
String todayFlowSum = objectDoc.getElementsByClass("sitelist_n").get(1).text();
System.out.print("\r\n\n"+todayFlowSum + "\r\n");
} catch (Exception e) {
e.printStackTrace();
}
}
浏览器、操作系统、浏览深度、回头客
public String[] publicGrapMeht(String url, String sessionId)throws Exception{
System.out.print("浏览器 访问量 比例 \r\n");
Document publicGrapDom = Jsoup.connect(url).cookie("ajstat", sessionId).get();
Elements publicGrapTrs = publicGrapDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
String returnGrapInfo[] = new String[3];
int sertri = 0;
for (Element graptrs : publicGrapTrs){
++sertri;
if (sertri>2 && sertri<publicGrapTrs.size()){
Elements serTds = graptrs.getElementsByTag("td");
String se = serTds.get(0).text(); 浏览情况
String sumNum = serTds.get(1).text(); 总量
String proportion = serTds.get(2).text(); 比例
returnGrapInfo[0] = se;
returnGrapInfo[1] = sumNum;
returnGrapInfo[2] = proportion;
String serEngineText = ""+se+" "+sumNum + " "+proportion;
System.out.print("\r\n"+serEngineText);
}
}
return returnGrapInfo;
}
浏览器
public void browserMeht(String browser,String sessionId)throws Exception{
System.out.print("浏览器 访问量 比例 \r\n");
Document browserDom = Jsoup.connect(browser).cookie("ajstat", sessionId).get();
Elements browserTrs = browserDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
int sertri = 0;
for (Element sertrs : browserTrs){
++sertri;
if (sertri>2 && sertri<browserTrs.size()){
Elements serTds = sertrs.getElementsByTag("td");
String se = serTds.get(0).text(); 浏览情况
String sumNum = serTds.get(1).text(); 总量
String proportion = serTds.get(2).text(); 比例
String serEngineText = ""+se+" "+sumNum + " "+proportion;
System.out.print("\r\n"+serEngineText);
}
}
}
操作系统
public void osMeth(String os,String sessionId)throws Exception{
System.out.print("操作系统 访问量 比例 \r\n");
Document osDom = Jsoup.connect(os).cookie("ajstat", sessionId).get();
Elements osTrs = osDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
int sertri = 0;
for (Element sertrs : osTrs){
++sertri;
if (sertri>2 && sertri<osTrs.size()){
Elements serTds = sertrs.getElementsByTag("td");
String se = serTds.get(0).text(); 浏览情况
String sumNum = serTds.get(1).text(); 总量
String proportion = serTds.get(2).text(); 比例
String serEngineText = ""+se+" "+sumNum + " "+proportion;
System.out.print("\r\n"+serEngineText);
}
}
}
浏览深度
public void browsingDepthMeth(String browsingDepth,String sessionId)throws Exception{
System.out.print("浏览深度 访问量 比例 \r\n");
Document browsingDepthDom = Jsoup.connect(browsingDepth).cookie("ajstat", sessionId).get();
Elements browsingDeptTrs = browsingDepthDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
int sertri = 0;
for (Element sertrs : browsingDeptTrs){
++sertri;
if (sertri>2 && sertri<browsingDeptTrs.size()){
Elements serTds = sertrs.getElementsByTag("td");
String se = serTds.get(0).text(); 浏览情况
String sumNum = serTds.get(1).text(); 总量
String proportion = serTds.get(2).text(); 比例
String serEngineText = ""+se+" "+sumNum + " "+proportion;
System.out.print("\r\n"+serEngineText);
}
}
}
回头客
public void returnedCustMeth(String returnedCust,String sessionId)throws Exception{
System.out.print("回头率分析 访问量 比例 \r\n");
Document returnedCustDom = Jsoup.connect(returnedCust).cookie("ajstat", sessionId).get();
Elements returnedCustTrs = returnedCustDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
int sertri = 0;
for (Element sertrs : returnedCustTrs){
++sertri;
if (sertri>2 && sertri<returnedCustTrs.size()){
Elements serTds = sertrs.getElementsByTag("td");
String se = serTds.get(0).text(); 搜索引擎
String sumNum = serTds.get(1).text(); 总量
String proportion = serTds.get(2).text(); 比例
String serEngineText = ""+se+" "+sumNum + " "+proportion;
System.out.print("\r\n"+serEngineText);
}
}
}
域名
public void domainMeth(String domain,String sessionId)throws Exception{
System.out.print("被访问域名 IP入口 UV入口 新UV 浏览量 比例 \r\n");
Document domainDom = Jsoup.connect(domain).cookie("ajstat", sessionId).get();
if(null!=domainDom.getElementById("tablist")){
int domainPageSize = 1;
if(null!=domainDom.getElementById("pageslink")){
domainPageSize = Integer.valueOf(domainDom.getElementById("pageslink").getElementsByTag("a").last().text());
}
System.out.print("域名:总页数"+domainPageSize+" \r\n");
for (int i=0;i<domainPageSize ;i++){
String requestDomainUrl = domain + "&p="+(i+1);
Document pageDomiansDom = Jsoup.connect(requestDomainUrl).cookie("ajstat", sessionId).get();
Elements pageDomiansTrs = pageDomiansDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
int routei = 0;
for (Element rttr : pageDomiansTrs){
++routei;
if (routei>1){
Elements rtTds = rttr.getElementsByTag("td");
String routeText = "";
String textInfo = rtTds.get(0).text();
int msgLength = textInfo.length();
if(textInfo.indexOf("[细]")>0){
msgLength = textInfo.indexOf("[细]")-1;
}
String msginfo = textInfo.substring(textInfo.indexOf("[细]")+3,msgLength); 被访问的域名
String entranceIP = rtTds.get(1).text();
String entranceUV = rtTds.get(2).text();
String newUV = rtTds.get(3).text();
String views = rtTds.get(4).text();
String ratio = rtTds.get(5).text();
routeText = routeText + msginfo + " "+entranceIP+ " "+entranceUV+ " "+newUV+ " "+views+" "+ratio;
System.out.print("\r\n"+routeText);
}
}
}
}else {
System.out.print("来路抓取 无数据");
}
}
来路
public void routeMeth(String route,String sessionId)throws Exception{
System.out.print("来路网站 贡献IP 贡献率 贡献UV 新UV \r\n");
Document routeDom = Jsoup.connect(route).cookie("ajstat", sessionId).get();
if(null!=routeDom.getElementById("tablist")){
int routePageSize = 1;
if(null!=routeDom.getElementById("pageslink")){
routePageSize = Integer.valueOf(routeDom.getElementById("pageslink").getElementsByTag("a").last().text());
}
System.out.print("来路网站:总页数"+routePageSize+" \r\n");
for (int i=0;i<routePageSize ;i++){
String requestRoteUrl = route + "&p="+(i+1);
Document pageRouteDom = Jsoup.connect(requestRoteUrl).cookie("ajstat", sessionId).get();
Elements pageRouteTrs = pageRouteDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
int routei = 0;
for (Element rttr : pageRouteTrs){
++routei;
if (routei>1){
Elements rtTds = rttr.getElementsByTag("td");
String routeText = "";
String textInfo = rtTds.get(0).text();
int msgLength = textInfo.length();
if(textInfo.indexOf("[GO]")>0){
msgLength = textInfo.indexOf("[GO]")-1;
}
String msginfo = textInfo.substring(textInfo.indexOf("史]")+3,msgLength); 来路网站
String contributeIP = rtTds.get(1).text(); 贡献IP
String rateContribute = rtTds.get(2).text(); 贡献率
String contributionUV = rtTds.get(4).text(); 贡献UV
String newUV = rtTds.get(5).text(); 新UV
routeText = routeText + msginfo + " "+contributeIP+ " "+rateContribute+ " "+contributionUV+ " "+newUV;
System.out.print("\r\n"+routeText);
}
}
}
}else {
System.out.print("来路抓取 无数据");
}
}
关键词
public void keyWordsMeth(String keyWords,String sessionId)throws Exception{
System.out.print("关键词 1 ~ 2 ( 共 2 ) 贡献IP 比例 百度 好搜 谷歌 搜搜 搜狗 雅虎 其它 贡献UV 新UV \r\n");
Document keyWordsDom = Jsoup.connect(keyWords).cookie("ajstat", sessionId).get();
if(null!=keyWordsDom.getElementById("tablist")){
Elements keyWordsTrs = keyWordsDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
int kwi = 0;
for (Element kwts : keyWordsTrs){
++kwi;
if (kwi>1){
Elements serTds = kwts.getElementsByTag("td");
String keyWordText = "";
String textInfo = serTds.get(0).text();
int msgLength = textInfo.length();
if(textInfo.indexOf("[GO]")>0){
msgLength = textInfo.indexOf("[GO]")-1;
}
String msginfo = textInfo.substring(textInfo.indexOf("史]")+3,msgLength); 关键词
String contributionIP = serTds.get(1).text(); 贡献IP
String proportion = serTds.get(2).text(); 比例
String baidu = serTds.get(4).text(); 百度
String haosou = serTds.get(5).text(); 好搜
String google = serTds.get(6).text(); 谷歌
String soso = serTds.get(7).text(); 搜搜
String sogou = serTds.get(8).text(); 搜狗
String yahoo = serTds.get(9).text(); 雅虎
String other = serTds.get(10).text(); 其它
String contributionUV = serTds.get(11).text(); 贡献UV
String newUv = serTds.get(12).text(); 新UV
keyWordText = keyWordText + " "+msginfo+ " "+contributionIP+ " "+proportion+ " "+baidu+ " "+haosou+ " "+google
+ " "+soso+ " "+sogou+ " "+yahoo+ " "+other+ " "+contributionUV+" "+newUv;
System.out.print("\r\n"+keyWordText);
}
}
}else {
System.out.print("关键词搜索 无数据");
}
}
搜索引擎
public void serchEngineMeth(String serchEngine,String sessionId)throws Exception{
System.out.print("搜索引擎 总量 比例 \r\n");
Document serchEngineDom = Jsoup.connect(serchEngine).cookie("ajstat", sessionId).get();
Elements serchEngineTrs = serchEngineDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
int sertri = 0;
for (Element sertrs : serchEngineTrs){
++sertri;
if (sertri>2 && sertri<serchEngineTrs.size()){
Elements serTds = sertrs.getElementsByTag("td");
String se = serTds.get(0).text(); 搜索引擎
String sumNum = serTds.get(1).text(); 总量
String proportion = serTds.get(2).text(); 比例
String serEngineText = ""+se+" "+sumNum + " "+proportion;
System.out.print("\r\n"+serEngineText);
}
}
}
访问明细数据查询
public void pageaccInfoMeth(String accInfo, String sessionId,String newDate)throws Exception{
System.out.print("IP 访问地址 上站时间 来路 入口网址\r\n\n");
Document pageaccInfoDom = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
int pageaccInfoSize = 1;
if(null!=pageaccInfoDom.getElementById("pageslink")){
pageaccInfoSize = Integer.valueOf(pageaccInfoDom.getElementById("pageslink").getElementsByTag("a").last().text());
}
System.out.print("来路网站:总页数"+pageaccInfoSize+" \r\n");
if(null!=pageaccInfoDom.getElementsByClass("bodys_zw").first().getElementsByTag("table")){
for (int i=0;i<pageaccInfoSize ;i++) {
String requestRoteUrl = accInfo + "&p=" + (i + 1);
Document pageaccInfoDoms = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
Elements accInfoTrs = pageaccInfoDoms.getElementsByClass("bodys_zw").first().getElementsByTag("table").first().getElementsByTag("tbody").first().getElementsByTag("tr");
int tri = 0;
for (Element trs : accInfoTrs){
++tri;
if (tri>2 && tri<accInfoTrs.size()){
Elements accInfoTds = trs.getElementsByTag("td");
String accInfotdText = "";
String ip = accInfoTds.get(0).text(); ip
String address = accInfoTds.get(1).text(); 地址
String upperStationTime = newDate +" "+ accInfoTds.get(2).text(); 上站时间
String routes = accInfoTds.get(3).text(); 来路
String entranceSite = accInfoTds.get(4).getElementsByTag("a").first().attr("href"); 入口网址
accInfotdText = accInfotdText + ip + " "+ address+" "+upperStationTime+" "+routes + " "+entranceSite;
System.out.print("\r\n"+accInfotdText);
}
}
}
}
}
页面浏览
public void pageBrowserMeth(String pageBrowser, String sessionId)throws Exception{
System.out.print("页面浏览:页面地址 IP入口 UV入口 新UV 浏览量 比例 \r\n");
Document pageBrowserDom = Jsoup.connect(pageBrowser).cookie("ajstat", sessionId).get();
if(null!=pageBrowserDom.getElementById("tablist")){
int pageBrowserSize = 1;
if(null!=pageBrowserDom.getElementById("pageslink")){
pageBrowserSize = Integer.valueOf(pageBrowserDom.getElementById("pageslink").getElementsByTag("a").last().text());
}
System.out.print("页面浏览:总页数"+pageBrowserSize+" \r\n");
for (int i=0;i<pageBrowserSize ;i++){
String requestpageBrowserUrl = pageBrowser + "&p="+(i+1);
Document pageBrowsersDom = Jsoup.connect(requestpageBrowserUrl).cookie("ajstat", sessionId).get();
Elements pageBrowsersTrs = pageBrowsersDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
int pbi = 0;
for (Element pbtr : pageBrowsersTrs){
++pbi;
if (pbi>1){
Elements pbTds = pbtr.getElementsByTag("td");
String pageBrowserText = "";
String pblink = pbTds.get(0).getElementsByTag("a").last().attr("href"); 页面网址
String iprk = pbTds.get(1).text(); IP入口
String uvrk = pbTds.get(2).text(); UV
String newUv = pbTds.get(3).text(); 新UV
String lll = pbTds.get(4).text(); 浏览量
String ratio = pbTds.get(5).text(); 比例
pageBrowserText = pageBrowserText+ ""+pblink+ " "+iprk+ " "+uvrk+ " "+newUv+ " "+lll+ " "+ratio;
System.out.print("\r\n"+pageBrowserText);
}
}
}
}
}
}