记一次用 Java 写爬虫——爬取小米应用商店的所有应用

准备工作:

1. 需要下载 PhantomJS(无头浏览器,用来加载并执行页面中的 JavaScript)

将 phantomjs-2.1.1-windows 解压到本地磁盘,并保证其中 phantomjs.exe 的位置与代码里的 path 一致(本文使用 D:\phantomjs-2.1.1-windows\bin\phantomjs.exe)

2. Maven 依赖

<dependency>

<groupId>org.seleniumhq.selenium</groupId>

<artifactId>selenium-java</artifactId>

<version>3.5.2</version>

</dependency>

 

import com.demo13.pojo.Result;

import com.demo13.reptile.pojo.AContentsEntity;

import com.demo13.reptile.pojo.AppStore;

import com.demo13.service.AppService;

import com.demo13.service.AppStoreService;

import org.openqa.selenium.By;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.os.ExecutableFinder;

import org.openqa.selenium.phantomjs.PhantomJSDriver;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Controller;

import org.springframework.web.bind.annotation.RequestMapping;

import org.springframework.web.bind.annotation.RequestParam;

import org.springframework.web.bind.annotation.ResponseBody;

 

import java.util.ArrayList;

import java.util.HashSet;

import java.util.List;

import java.util.Set;

 

@Controller

@RequestMapping("/simple")

public class Simpl4jReptileController {

 

 

@Autowired

AppStoreService appStoreService;

 

static final String URL="http://app.mi.com/topList";

 

 

@ResponseBody

@RequestMapping("/reptile")

public Result addApp(@RequestParam(value = "a") Integer a){

try {

String path = "D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";

String page="#page=";

//初始化js

System.setProperty("phantomjs.binary.path", path);

//建立驱动

 

PhantomJSDriver webDriver = new PhantomJSDriver();

//获取种类集合从每个种类再去分页遍历

List<String> categoryList=getCategoryList(webDriver,URL);

//去重(防止category有重复的)

Set<String> set=new HashSet<>(categoryList);

List<String> distinctList=new ArrayList<>(set);

//循环调用获取a标签

List<AppStore> list;

//一个种类一个种类的去添加

for(String categoryUrl:distinctList){

list= getAList(webDriver, categoryUrl, page);

System.out.println(categoryUrl+"该cateGory已经爬取完成-------");

/*appStoreService.insertAppStore(list);*/

}

}catch (Exception e){

e.printStackTrace();

return Result.buildFailure("服务器异常");

}

return Result.buildSuccess("爬取完成");

}

 

 

/**
 * Crawls the paginated top list and stores every collected entry through
 * {@code appStoreService.insertAppStore}.
 *
 * <p>The PhantomJS driver is always shut down when the request finishes, whether the
 * crawl succeeded or failed.
 *
 * @param a required request parameter; it is not read by this method
 * @return a success result once the entries were handed to the service, or a failure
 *         result if any exception was thrown
 */
@ResponseBody
@RequestMapping("/hostReptile")
public Result addHostApp(@RequestParam(value = "a") Integer a) {
    String path = "D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe";
    String page = "?page=";
    PhantomJSDriver webDriver = null;
    try {
        // Tell the driver where the PhantomJS executable lives.
        System.setProperty("phantomjs.binary.path", path);
        webDriver = new PhantomJSDriver();

        List<AppStore> hostAppList = getHostAppList(webDriver, page);
        appStoreService.insertAppStore(hostAppList);
    } catch (Exception e) {
        e.printStackTrace();
        return Result.buildFailure("服务器异常");
    } finally {
        // Without quit() every request leaves a PhantomJS process running.
        if (webDriver != null) {
            try {
                webDriver.quit();
            } catch (RuntimeException quitFailure) {
                // A failed shutdown must not replace the response computed above.
                quitFailure.printStackTrace();
            }
        }
    }
    return Result.buildSuccess("爬取完成");
}

 

 

 

public static List<AppStore> getHostAppList(PhantomJSDriver webDriver,String page){

List<AppStore> aContentsEntities=new ArrayList<>();

AppStore aContentsEntity;

boolean lookNext=true;

int num=1;

try {

while(lookNext){

//从第1页开始

String pageUrl=URL+page+num;

System.out.println(pageUrl+"-*-*-*-*--*-*--*-*");

webDriver.get(pageUrl);

webDriver.executePhantomJS("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;");

Thread.sleep(2000);

 

List<WebElement> elements=webDriver.findElements(By.className("applist"));

for(WebElement el:elements){

List<WebElement> wel = el.findElements(By.tagName("li"));

if(wel!=null&&wel.size()>0){

for(WebElement al:wel){

aContentsEntity=getPacageAndAppName(al);

aContentsEntities.add(aContentsEntity);

}

++num;

}else{

//如果为false证明该category没有值了 就跳出循环返回flase;

lookNext=false;

}

}

}

}catch (Exception e){

e.printStackTrace();

}

return aContentsEntities;

}

/**
 * Pages through one category ({@code category + page + pageNumber}, starting at page 0)
 * and collects one {@link AppStore} per {@code <li>} found inside the element with the
 * id {@code all-applist}.
 *
 * <p>Paging stops when a page contains no {@code all-applist} element, or when that
 * element has no {@code <li>} children. If an exception occurs, the entries collected
 * so far are returned.
 *
 * @param webDriver driver used to load the pages
 * @param category  URL of the category page
 * @param page      URL fragment placed between the category URL and the page number
 * @return the collected entries; never {@code null}
 */
public static List<AppStore> getAList(PhantomJSDriver webDriver, String category, String page) {
    List<AppStore> apps = new ArrayList<>();
    boolean lookNext = true;
    int num = 0;
    try {
        while (lookNext) {
            String pageUrl = category + page + num;
            System.out.println(pageUrl + "-*-*-*-*--*-*--*-*");
            webDriver.get(pageUrl);
            // Wait before reading the page (presumably for scripts to finish rendering - TODO confirm).
            Thread.sleep(3000);

            List<WebElement> containers = webDriver.findElements(By.id("all-applist"));
            if (containers.isEmpty()) {
                // No list on the page at all: without this check the loop would never end.
                lookNext = false;
            }
            for (WebElement container : containers) {
                List<WebElement> items = container.findElements(By.tagName("li"));
                if (items == null || items.isEmpty()) {
                    // An empty list means the category has no further entries.
                    lookNext = false;
                    continue;
                }
                for (WebElement item : items) {
                    apps.add(getPacageAndAppName(item));
                }
            }
            // Advance exactly one page per request, however many lists the page holds.
            ++num;
        }
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller's thread and stop crawling.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        // Best effort: keep whatever was collected before the failure.
        e.printStackTrace();
    }
    return apps;
}

/**

* 获取报名和应用名称

*/

public static AppStore getPacageAndAppName(WebElement a){

AppStore aContentsEntity=new AppStore();

 

WebElement webElement = a.findElement(By.tagName("a"));

String hrefValue=webElement.getAttribute("href");

//获取包名

String packageName=getHrefValue(hrefValue);

aContentsEntity.setAppPackage(packageName);

//获取应用名称

WebElement img = a.findElement(By.tagName("img"));

String imgAltValue=img.getAttribute("alt");

aContentsEntity.setAppName(imgAltValue);

String src=img.getAttribute("src");

String targetSrc="http://resource.xiaomi.net/miuimarket/app/lazyload.gif";

if(src.length()==targetSrc.length()){

System.out.println(packageName);

System.out.println(src);

System.out.println(imgAltValue);

}

aContentsEntity.setAppIconAddress(src);

aContentsEntity.setAppStatus(1);

 

return aContentsEntity;

}

/**
 * Extracts the application package name from a link.
 *
 * <p>The package name is the value of the first query parameter, i.e. the text after
 * the first {@code =} that follows the {@code ?}. Any further parameters
 * ({@code &...}) or a fragment ({@code #...}) are cut off.
 *
 * @param href the link to read, for example {@code http://host/details?id=com.example}
 * @return the package name; {@code href} itself when it contains no {@code =};
 *         {@code null} when {@code href} is {@code null}
 */
public static String getHrefValue(String href) {
    if (href == null) {
        return null;
    }
    int queryStart = href.indexOf('?');
    int equalsAt = href.indexOf('=', queryStart + 1);
    if (equalsAt < 0) {
        // No parameter value present: hand the link back unchanged.
        return href;
    }
    int valueStart = equalsAt + 1;
    int valueEnd = href.length();
    // Stop at whichever comes first: the next parameter or the fragment.
    int nextParam = href.indexOf('&', valueStart);
    if (nextParam >= 0) {
        valueEnd = nextParam;
    }
    int fragment = href.indexOf('#', valueStart);
    if (fragment >= 0 && fragment < valueEnd) {
        valueEnd = fragment;
    }
    return href.substring(valueStart, valueEnd);
}

 

/**
 * Loads a page and gathers the category links of its quick-entry area.
 *
 * <p>For every element with the CSS class {@code category-list}, the {@code href} of
 * the first {@code <a>} inside each of its {@code <li>} descendants is collected, in
 * document order. Duplicates are not removed here.
 *
 * @param p   driver used to load the page
 * @param url address of the page that holds the category list
 * @return the collected {@code href} values
 */
public static List<String> getCategoryList(PhantomJSDriver p, String url) {
    p.get(url);

    List<String> hrefs = new ArrayList<>();
    for (WebElement container : p.findElements(By.className("category-list"))) {
        for (WebElement item : container.findElements(By.tagName("li"))) {
            WebElement link = item.findElement(By.tagName("a"));
            hrefs.add(link.getAttribute("href"));
        }
    }
    return hrefs;
}

}

---------------------------------------------------------------------------------------------

package com.demo13.reptile.pojo;

 

 

 

/**
 * App-store entity (backs the table that stores all known package names).
 *
 * <p>A plain mutable bean: every property starts out as {@code null} and is exposed
 * through a getter/setter pair.
 *
 * @date Created in 2019/6/4 15:04
 * @version 1.1$
 */
public class AppStore {

    /** Identifier of the entry. */
    private Integer appId;

    /** Display name of the application. */
    private String appName;

    /** Package name of the application. */
    private String appPackage;

    /** Id of the user who created the entry. */
    private Integer createUserId;

    /** Status flag of the entry. */
    private Integer appStatus;

    /** Address (URL) of the application's icon. */
    private String appIconAddress;

    /** @return the entry identifier, or {@code null} if unset */
    public Integer getAppId() {
        return this.appId;
    }

    /** @param appId the entry identifier */
    public void setAppId(Integer appId) {
        this.appId = appId;
    }

    /** @return the application name, or {@code null} if unset */
    public String getAppName() {
        return this.appName;
    }

    /** @param appName the application name */
    public void setAppName(String appName) {
        this.appName = appName;
    }

    /** @return the package name, or {@code null} if unset */
    public String getAppPackage() {
        return this.appPackage;
    }

    /** @param appPackage the package name */
    public void setAppPackage(String appPackage) {
        this.appPackage = appPackage;
    }

    /** @return the creating user's id, or {@code null} if unset */
    public Integer getCreateUserId() {
        return this.createUserId;
    }

    /** @param createUserId the creating user's id */
    public void setCreateUserId(Integer createUserId) {
        this.createUserId = createUserId;
    }

    /** @return the status flag, or {@code null} if unset */
    public Integer getAppStatus() {
        return this.appStatus;
    }

    /** @param appStatus the status flag */
    public void setAppStatus(Integer appStatus) {
        this.appStatus = appStatus;
    }

    /** @return the icon address, or {@code null} if unset */
    public String getAppIconAddress() {
        return this.appIconAddress;
    }

    /** @param appIconAddress the icon address */
    public void setAppIconAddress(String appIconAddress) {
        this.appIconAddress = appIconAddress;
    }
}

-------------------------------------------------------------------------------

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值