导入包:
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-firefox-driver -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-firefox-driver</artifactId>
    <version>3.141.59</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-chrome-driver -->
<!-- The controller below imports org.openqa.selenium.chrome.ChromeDriver / ChromeOptions,
     which are provided by this artifact rather than by selenium-firefox-driver. -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-chrome-driver</artifactId>
    <version>3.141.59</version>
</dependency>
爬取(Controller):
package com.qinpoint.controller;
import com.alibaba.fastjson.JSON;
import com.aliyun.oss.OSS;
import com.qinpoint.bo.XinTuBo;
import com.qinpoint.common.RestResponse;
import com.qinpoint.common.ResultUtil;
import com.qinpoint.constant.Constant;
import com.qinpoint.dto.ValuesDto;
import com.qinpoint.dto.XinTuDetails;
import com.qinpoint.dto.XinTuDto;
import com.qinpoint.service.IXinTuService;
import com.qinpoint.utils.AliYunOssClientUtil;
import com.qinpoint.utils.StringUtils;
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.*;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import javax.imageio.ImageIO;
import javax.servlet.http.HttpServletRequest;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static com.qinpoint.utils.AliYunOssClientUtil.getImgUrl;
import static com.qinpoint.utils.OssClientConstants.BACKET_NAME;
import static com.qinpoint.utils.OssClientConstants.FOLDER;
/**
 * Controller that drives a Chrome browser through Selenium to log in to
 * star.toutiao.com (Xingtu), collects the session cookies and hands them to
 * {@link IXinTuService} for crawling.
 *
 * @program: selenium
 * @description: Xingtu crawler
 * @author: wt
 * @create: 2020-08-10 16:58
 **/
@Slf4j
@RestController
@RequestMapping("/xintu/")
public class XinTuController {

    /**
     * Random integer in [1, 10], drawn once when this instance is created.
     * The multiplication is parenthesised on purpose: {@code (int) Math.random() * 10}
     * casts first and therefore always yields 0.
     */
    int a = (int) (Math.random() * 10) + 1;

    /**
     * Delay factor in minutes derived from {@link #a}: a value of 1 is bumped to 3,
     * every other value is used as is (so the result lies in [2, 10]).
     */
    private final int RANDOM = a < 2 ? a + 2 : a;

    /**
     * Wait applied after opening a detail page, in milliseconds ({@code RANDOM} minutes).
     */
    private final Long A_TIME = RANDOM * 60 * 1000L;

    /**
     * Wait applied before taking a screenshot, in milliseconds ({@code RANDOM - 2} minutes,
     * which is 0 when {@code RANDOM} is 2).
     */
    private final Long T_TIME = (RANDOM - 2) * 60 * 1000L;

    /**
     * Pause between two detail-page visits; only referenced by the disabled detail
     * crawl inside {@link #capture(HttpServletRequest)}.
     */
    private final Long FOR_DE_TIME = StringUtils.getRandomInterval(30, 60) * 1L;

    @Autowired
    private IXinTuService iXinTuService;

    /**
     * Logs in through a real browser, collects the cookies and starts the list crawl
     * (http://localhost:58001/xintu/capture).
     *
     * @param request the incoming HTTP request (not read by this method)
     * @return the success response built by {@code ResultUtil.success()}
     * @throws Exception if a wait is interrupted, or if Selenium or the service call fails
     */
    @RequestMapping("capture")
    public RestResponse capture(HttpServletRequest request) throws Exception {
        Thread.sleep(3000);
        // Keep the historical default path, but let -Dwebdriver.chrome.driver=... override it.
        if (System.getProperty("webdriver.chrome.driver") == null) {
            System.setProperty("webdriver.chrome.driver", "F:\\chromedriver.exe");
        }
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--test-type", "--no-check-certificate", "--ignore-certificate-errors", "--start-maximized", "--disable-extensions");
        WebDriver webDriver = new ChromeDriver(options);
        try {
            webDriver.manage().window().maximize();
            webDriver.manage().deleteAllCookies();
            // Element lookups poll for up to 10 seconds so the page has time to render.
            webDriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
            // Open the login page.
            webDriver.get("https://star.toutiao.com/login?roleType=user");
            // Enter account and password, then submit the login form.
            webDriver.findElement(By.xpath("/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[1]")).click();
            Thread.sleep(1000);
            webDriver.findElement(By.name("email")).sendKeys("******");
            webDriver.findElement(By.xpath("//*[@id=\"account-sdk\"]/section/div[3]/div[2]/div/div/input")).sendKeys("*****");
            webDriver.findElement(By.xpath("//*[@id=\"account-sdk\"]/section/div[6]/button")).click();
            // One-minute pause before choosing the system.
            // NOTE(review): presumably leaves time for a manual verification step - confirm.
            Thread.sleep(1 * 60 * 1000);
            // Select the Douyin creators entry.
            webDriver.findElement(By.xpath("/html/body/div[1]/div[1]/div[2]/div[1]/div[1]/div/div[2]/div[2]/div[1]/div[2]/div[1]/span")).click();
            Set<Cookie> cookies = webDriver.manage().getCookies();
            List<String> cookieList = new ArrayList<>();
            for (Cookie ck : cookies) {
                cookieList.add(ck.getName() + "=" + ck.getValue());
            }
            // The two locals below are only used by the disabled detail crawl that follows.
            int a = 1;
            XinTuBo xinTuBo = new XinTuBo();
            xinTuBo.setRedisKey("car");
            // List<XinTuDto> car = this.iXinTuService.findXinTuSortRedis(xinTuBo);
            // for (XinTuDto xinTuDto : car) {
            // log.error("car " + a++ );
            // Thread.sleep(FOR_DE_TIME);
            // getXinTuDetail(xinTuDto.getId(), Constant.CAR_REDIS,webDriver);
            // }
            // xinTuBo.setRedisKey("game");
            // List<XinTuDto> game = this.iXinTuService.findXinTuSortRedis(xinTuBo);
            // for (XinTuDto xinTuDto : game) {
            // Thread.sleep(FOR_DE_TIME);
            // log.error("game " + a++ );
            // getXinTuDetail(xinTuDto.getId(), Constant.GAME_REDIS,webDriver);
            // }
            // xinTuBo.setRedisKey("infant");
            // List<XinTuDto> infant = this.iXinTuService.findXinTuSortRedis(xinTuBo);
            // for (XinTuDto xinTuDto : infant) {
            // log.error("infant " + a++ );
            // Thread.sleep(FOR_DE_TIME);
            // getXinTuDetail(xinTuDto.getId(), Constant.INFANT_REDIS,webDriver);
            // }
            // xinTuBo.setRedisKey("plot");
            // List<XinTuDto> plot = this.iXinTuService.findXinTuSortRedis(xinTuBo);
            // log.error("plot " + plot.size());
            // for (XinTuDto xinTuDto : plot) {
            // log.error("plot " + a++ );
            // Thread.sleep(FOR_DE_TIME);
            // getXinTuDetail(xinTuDto.getId(), Constant.PLOT_REDIS,webDriver);
            // }
            // Crawl the list pages with the collected cookies.
            iXinTuService.addXinTuSortCookie(cookieList);
            return ResultUtil.success();
        } finally {
            // Always close the browser; otherwise every request leaves a Chrome process behind.
            webDriver.quit();
        }
    }

    /**
     * Opens the detail page of one author, reads the figures shown there, uploads two
     * chart screenshots and stores the result as JSON under {@code redisKey + ":" + id}.
     * Failures are logged and do not propagate.
     *
     * @param id        author id used to build the detail-page URL
     * @param redisKey  key prefix under which the details are stored
     * @param webDriver logged-in browser session
     * @return the details object; fields read after a failure are left unset
     */
    private XinTuDetails getXinTuDetail(String id, String redisKey, WebDriver webDriver) {
        XinTuDetails xinTuDetails = new XinTuDetails();
        try {
            String plot_url = "https://star.toutiao.com/ad#/author/douyin/" + id + "/1/?recommend=false";
            webDriver.get(plot_url);
            webDriver.navigate().refresh();
            webDriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
            log.info("id = {}", id);
            Thread.sleep(A_TIME);
            List<ValuesDto> values = new ArrayList<>();
            ValuesDto v1 = readNameValue(webDriver, "//div[@class=\"star-row\"][1]//div[@class=\"male gender-item\"]");
            log.info("v1.name {},v1.value {}", v1.getName(), v1.getValue());
            ValuesDto v2 = readNameValue(webDriver, "//div[@class=\"star-row\"][1]//div[@class=\"female gender-item\"]");
            log.info("v2.name {},v2.getValue() ={}", v2.getName(), v2.getValue());
            values.add(v1);
            values.add(v2);
            xinTuDetails.setMaxVideoPlay(webDriver.findElement(By.xpath("//*[@id=\"pane-play\"]/div[1]/span[2]/strong")).getText());
            xinTuDetails.setMinVideoPlay(webDriver.findElement(By.xpath("//*[@id=\"pane-play\"]/div[1]/span[1]/strong")).getText());
            xinTuDetails.setValuesDtos(values);
            WebElement bfa = webDriver.findElement(By.xpath("//div[@class=\"indicator-card\"][1]//div[@class=\"value\"]"));
            xinTuDetails.setVideoFinishPlay(bfa.getText());
            String bf_url = this.getBoFangImageUrl(webDriver);
            log.info("bf_url{}", bf_url);
            String fs_url = this.getFenSiImageUrl(webDriver);
            log.info("fs_url{}", fs_url);
            xinTuDetails.setBfUrl(bf_url);
            xinTuDetails.setFsUrl(fs_url);
            // xinTuDetails.setBfNum(Sample.getImageSite(fs_url));
            xinTuDetails.setDtUrl(plot_url);
            this.iXinTuService.addXinTuSortRedis(redisKey + ":" + id, JSON.toJSONString(xinTuDetails));
        } catch (InterruptedException e) {
            // Restore the flag so callers can see that the thread was asked to stop.
            Thread.currentThread().interrupt();
            log.error("Interrupted while crawling detail page, id = {}", id, e);
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is kept in the log.
            log.error("Failed to crawl detail page, id = {}", id, e);
        }
        return xinTuDetails;
    }

    /**
     * Reads the {@code name} and {@code value} child texts of the element matched by the XPath.
     *
     * @param webDriver browser session showing the detail page
     * @param xpath     XPath of the container element
     * @return a DTO holding both texts
     */
    private ValuesDto readNameValue(WebDriver webDriver, String xpath) {
        WebElement container = webDriver.findElement(By.xpath(xpath));
        ValuesDto dto = new ValuesDto();
        dto.setName(container.findElement(By.className("name")).getText());
        dto.setValue(container.findElement(By.className("value")).getText());
        return dto;
    }

    /**
     * Captures the follower section of the current page and uploads it to OSS.
     *
     * @param webDriver browser session showing the detail page
     * @return URL of the uploaded image, or {@code null} if any step failed
     */
    private String getFenSiImageUrl(WebDriver webDriver) {
        return captureSection(webDriver, "//div[@class=\"card-panel section-container\"]", 0, 600);
    }

    /**
     * Captures the play-count section of the current page and uploads it to OSS.
     * Example result:
     * http://qduploadimage.oss-cn-beijing.aliyuncs.com/seleniumImage/screenshot8062161595485560820.png
     *
     * @param webDriver browser session showing the detail page
     * @return URL of the uploaded image, or {@code null} if any step failed
     */
    private String getBoFangImageUrl(WebDriver webDriver) {
        return captureSection(webDriver, "//div[@class=\"content-wrapper data-trend-section\"]", 100, 500);
    }

    /**
     * Shared implementation of the two screenshot methods: waits {@code T_TIME}, scrolls the
     * element into view, takes a full screenshot, crops it around the element, overwrites the
     * screenshot file with the crop and uploads that file to OSS.
     *
     * @param webDriver browser session showing the detail page
     * @param xpath     XPath of the element to capture
     * @param yOffset   pixels added to the element's y coordinate for the top of the crop
     * @param height    height of the crop in pixels
     * @return URL of the uploaded image, or {@code null} if any step failed
     */
    private String captureSection(WebDriver webDriver, String xpath, int yOffset, int height) {
        String url = null;
        try {
            Thread.sleep(T_TIME);
            WebElement section = webDriver.findElement(By.xpath(xpath));
            log.info("scroll view element");
            JavascriptExecutor js = (JavascriptExecutor) webDriver;
            // scrollIntoView(true) aligns the top of the element with the top of the viewport.
            js.executeScript("arguments[0].scrollIntoView(true);", section);
            File screenshot = ((TakesScreenshot) webDriver).getScreenshotAs(OutputType.FILE);
            Point origin = section.getLocation();
            int width = section.getSize().getWidth();
            // getSubimage throws when the rectangle leaves the screenshot; that ends up in the
            // generic catch below and the method returns null.
            BufferedImage cropped = ImageIO.read(screenshot)
                    .getSubimage(origin.getX() + 100, origin.getY() + yOffset, width + 250, height);
            ImageIO.write(cropped, "png", screenshot);
            OSS ossClient = AliYunOssClientUtil.getOSSClient();
            AliYunOssClientUtil.uploadObject2OSS(ossClient, screenshot, BACKET_NAME, FOLDER);
            String fileName = screenshot.getName();
            // Keep only the part of the generated URL up to and including the file name.
            url = getImgUrl(fileName).split(fileName)[0] + fileName;
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while capturing screenshot for {}", xpath, ex);
        } catch (Exception ex) {
            log.error("Failed to capture screenshot for {}", xpath, ex);
        }
        return url;
    }
}
Service 接口:
package com.qinpoint.service;
import com.alibaba.fastjson.JSONObject;
import com.qinpoint.bo.XinTuBo;
import com.qinpoint.dto.XinTuDetails;
import com.qinpoint.dto.XinTuDto;
import java.util.List;
/**
 * Operations for crawling and caching Xingtu data.
 *
 * @author wt
 */
public interface IXinTuService {
/**
 * Adds data using the supplied cookies.
 *
 * @param cookieList cookies, one {@code name=value} string per entry
 * @return success flag
 */
boolean addXinTuSortCookie(List<String> cookieList);
/**
 * Adds the data of all categories.
 *
 * @param cookieList cookies to send with the request
 * @return success flag
 */
boolean addAll(List<String> cookieList);
/**
 * Adds the data of the plot / comedy category.
 *
 * @param cookieList cookies to send with the requests
 * @return success flag
 */
boolean addPlot(List<String> cookieList);
/**
 * Adds the data of the game category.
 *
 * @param cookieList cookies to send with the requests
 * @return success flag
 */
boolean addGame(List<String> cookieList);
/**
 * Adds the data of the mother-and-baby / parenting category.
 *
 * @param cookieList cookies to send with the requests
 * @return success flag
 */
boolean addInfant(List<String> cookieList);
/**
 * Adds the data of the car category.
 *
 * @param cookieList cookies to send with the requests
 * @return success flag
 */
boolean addCar(List<String> cookieList);
/**
 * Requests the given address with the supplied cookies.
 *
 * @param url address to request
 * @param cookieList cookies to send with the request
 * @return the response as a JSON object
 */
JSONObject addXinTuSortDate(String url, List<String> cookieList);
/**
 * Gets the cached Xingtu data selected by the given query object.
 *
 * @param xinTuBo query object; the implementation in this file reads its Redis key
 * @return the cached entries; the implementation in this file returns {@code null}
 *         when nothing is cached
 */
List<XinTuDto> findXinTuSortRedis(XinTuBo xinTuBo);
/**
 * Adds a value to the cache.
 *
 * @param key cache key
 * @param value value to store
 * @return success flag
 */
boolean addXinTuSortRedis(String key,String value);
/**
 * Gets the cached details for the given key.
 *
 * @param key cache key suffix
 * @return the cached details; the implementation in this file returns {@code null}
 *         when nothing is cached
 */
XinTuDetails getXinTuDetails(String key);
}
实现类:
package com.qinpoint.service.impl;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.qinpoint.bo.XinTuBo;
import com.qinpoint.common.CalculateUtils;
import com.qinpoint.constant.Constant;
import com.qinpoint.dto.XinTuDetails;
import com.qinpoint.dto.XinTuDto;
import com.qinpoint.dto.XinTuPriceDto;
import com.qinpoint.service.IXinTuService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import org.springframework.web.client.RestTemplate;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the Xingtu (star.toutiao.com) author lists over HTTP with the cookies of a
 * logged-in session and caches the parsed results in Redis.
 *
 * @program: selenium
 * @description: Xingtu data crawling
 * @author: wt
 * @create: 2020-08-12 09:15
 **/
@Slf4j
@Service
public class XinTuServiceImpl implements IXinTuService {

    /** Endpoint of the author-list API; query parameters are appended per request. */
    private static final String AUTHOR_LIST_URL = "https://star.toutiao.com/v/api/demand/author_list/";

    /**
     * Random integer in [1, 10], drawn once when this instance is created.
     * The multiplication is parenthesised on purpose: {@code (int) Math.random() * 10}
     * casts first and therefore always yields 0.
     */
    int a = (int) (Math.random() * 10) + 1;

    /**
     * Delay factor in minutes derived from {@link #a}: a value of 1 is bumped to 3,
     * every other value is used as is (so the result lies in [2, 10]).
     */
    private final int RANDOM = a < 2 ? a + 2 : a;

    /**
     * Pause between two list-page requests, in milliseconds ({@code RANDOM} minutes).
     */
    private final Long FOR_LIST = RANDOM * 60 * 1000L;

    @Autowired
    private StringRedisTemplate stringRedisTemplate;

    @Autowired
    private RestTemplate restTemplate;

    /**
     * Entry point used by the controller. Only the game category is enabled; the other
     * categories are kept as commented-out calls.
     *
     * @param cookieList cookies of the logged-in session, one {@code name=value} per entry
     * @return the result of the enabled crawl
     */
    @Override
    public boolean addXinTuSortCookie(List<String> cookieList) {
        // this.addPlot(cookieList);
        boolean added = this.addGame(cookieList);
        // this.addInfant(cookieList);
        // this.addCar(cookieList);
        // this.addAll(cookieList);
        return added;
    }

    /**
     * Fetches the first page (30 entries) of the unfiltered author list and stores the raw
     * JSON under {@code Constant.XIN_TU_SORT + "a_url"}.
     *
     * @param cookieList cookies to send with the request
     * @return {@code true} once the response has been stored
     * @throws IllegalStateException if the response has no body
     */
    @Override
    public boolean addAll(List<String> cookieList) {
        String plot_url = AUTHOR_LIST_URL + "?limit=" + 30 + "&need_detail=true&page=1&platform_source=1&task_category=1&order_by=score&disable_replace_keyword=false&is_author_plan=false&is_filter=true";
        JSONObject json = addXinTuSortDate(plot_url, cookieList);
        if (json == null) {
            throw new IllegalStateException("Empty response for the unfiltered author list");
        }
        this.addXinTuSortRedis(Constant.XIN_TU_SORT + "a_url", json.toJSONString());
        // Report success like the other add* methods do.
        return true;
    }

    /**
     * Crawls every page of the plot / comedy category (tag 97, 30 per page) and stores
     * the authors under {@code Constant.PLOT_REDIS}.
     *
     * @param cookieList cookies to send with the requests
     * @return {@code true} when stored, {@code false} if the crawl was interrupted
     */
    @Override
    public boolean addPlot(List<String> cookieList) {
        return crawlAuthorList("plot", 97, 30, Constant.PLOT_REDIS, cookieList);
    }

    /**
     * Crawls every page of the game category (tag 23, 20 per page) and stores the
     * authors under {@code Constant.GAME_REDIS}.
     *
     * @param cookieList cookies to send with the requests
     * @return {@code true} when stored, {@code false} if the crawl was interrupted
     */
    @Override
    public boolean addGame(List<String> cookieList) {
        return crawlAuthorList("game", 23, 20, Constant.GAME_REDIS, cookieList);
    }

    /**
     * Crawls every page of the mother-and-baby / parenting category (tag 55, 20 per page)
     * and stores the authors under {@code Constant.INFANT_REDIS}.
     *
     * @param cookieList cookies to send with the requests
     * @return {@code true} when stored, {@code false} if the crawl was interrupted
     */
    @Override
    public boolean addInfant(List<String> cookieList) {
        return crawlAuthorList("infant", 55, 20, Constant.INFANT_REDIS, cookieList);
    }

    /**
     * Crawls every page of the car category (tag 31, 20 per page) and stores the
     * authors under {@code Constant.CAR_REDIS}.
     *
     * @param cookieList cookies to send with the requests
     * @return {@code true} when stored, {@code false} if the crawl was interrupted
     */
    @Override
    public boolean addCar(List<String> cookieList) {
        return crawlAuthorList("car", 31, 20, Constant.CAR_REDIS, cookieList);
    }

    /**
     * Sends a GET request to {@code url} with the cookies attached as {@code Cookie} header
     * values.
     *
     * @param url        address to request
     * @param cookieList cookies to send with the request
     * @return the response body, which may be {@code null}
     */
    @Override
    public JSONObject addXinTuSortDate(String url, List<String> cookieList) {
        HttpHeaders requestHeaders = new HttpHeaders();
        requestHeaders.put("Cookie", cookieList);
        HttpEntity<String> requestEntity = new HttpEntity<>(null, requestHeaders);
        ResponseEntity<JSONObject> response = restTemplate.exchange(url, HttpMethod.GET, requestEntity, JSONObject.class);
        return response.getBody();
    }

    /**
     * Reads the author list stored under {@code Constant.XIN_TU_SORT + xinTuBo.getRedisKey()}.
     *
     * @param xinTuBo query object carrying the Redis key suffix
     * @return the parsed list, or {@code null} when the key is missing or empty
     */
    @Override
    public List<XinTuDto> findXinTuSortRedis(XinTuBo xinTuBo) {
        String redisValue = stringRedisTemplate.opsForValue().get(Constant.XIN_TU_SORT + xinTuBo.getRedisKey());
        if (redisValue == null || redisValue.isEmpty()) {
            return null;
        }
        return JSONObject.parseArray(redisValue, XinTuDto.class);
    }

    /**
     * Stores {@code value} under {@code key}, replacing any previous value.
     *
     * @param key   Redis key
     * @param value value to store
     * @return always {@code true}
     */
    @Override
    public boolean addXinTuSortRedis(String key, String value) {
        // SET overwrites an existing key, so no prior GET/DELETE round trips are needed.
        stringRedisTemplate.opsForValue().set(key, value);
        return true;
    }

    /**
     * Reads the details stored under {@code Constant.XIN_TU_SORT + key}.
     *
     * @param key Redis key suffix
     * @return the parsed details, or {@code null} when the key is missing or empty
     */
    @Override
    public XinTuDetails getXinTuDetails(String key) {
        String redisValue = stringRedisTemplate.opsForValue().get(Constant.XIN_TU_SORT + key);
        if (redisValue == null || redisValue.isEmpty()) {
            return null;
        }
        return JSONObject.parseObject(redisValue, XinTuDetails.class);
    }

    /**
     * Crawls all pages of one category and stores the collected authors as JSON.
     * Every page, including pages after the first, is requested with the same tag and
     * filters. A pause of {@code FOR_LIST} milliseconds precedes each page after the first.
     *
     * @param label      category name used in log messages
     * @param tag        value of the {@code tag} query parameter
     * @param pageSize   value of the {@code limit} query parameter
     * @param redisKey   Redis key that receives the JSON list
     * @param cookieList cookies to send with the requests
     * @return {@code true} when the list was stored, {@code false} if the thread was
     *         interrupted while waiting (nothing is stored in that case)
     * @throws IllegalStateException if the first page has no body
     */
    private boolean crawlAuthorList(String label, int tag, int pageSize, String redisKey, List<String> cookieList) {
        JSONObject firstPage = addXinTuSortDate(buildAuthorListUrl(pageSize, 1, tag), cookieList);
        if (firstPage == null) {
            throw new IllegalStateException("Empty response for the first " + label + " author-list page");
        }
        List<XinTuDto> all = new ArrayList<>(analysisXtJSON(firstPage));
        Long total = totalCount(firstPage);
        Integer num = CalculateUtils.getCountTotalPage(total, pageSize);
        log.info("num{}", num);
        int lastPage = num == null ? 0 : num;
        // NOTE(review): assumes getCountTotalPage returns the number of pages, so the last
        // page is included here - confirm against CalculateUtils. A page past the end only
        // yields an empty list.
        for (int page = 2; page <= lastPage; page++) {
            log.info("{} i = {}", label, page);
            try {
                Thread.sleep(FOR_LIST);
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of hammering the API without pauses.
                Thread.currentThread().interrupt();
                log.warn("Interrupted while crawling {} list before page {}", label, page, e);
                return false;
            }
            JSONObject object = addXinTuSortDate(buildAuthorListUrl(pageSize, page, tag), cookieList);
            if (object == null) {
                log.warn("{} page {} returned no body, skipping", label, page);
                continue;
            }
            log.info("{} object {}", label, object.isEmpty());
            all.addAll(analysisXtJSON(object));
        }
        this.addXinTuSortRedis(redisKey, JSON.toJSONString(all));
        return true;
    }

    /**
     * Builds the author-list URL for one page of one category.
     *
     * @param limit page size
     * @param page  1-based page number
     * @param tag   category tag
     * @return the complete request URL
     */
    private String buildAuthorListUrl(int limit, int page, int tag) {
        return AUTHOR_LIST_URL + "?limit=" + limit
                + "&need_detail=true&page=" + page
                + "&platform_source=1&task_category=1&tag=" + tag
                + "&order_by=score&disable_replace_keyword=false&is_author_plan=false&expected_cpm__le=50&is_filter=true";
    }

    /**
     * Converts the {@code data.authors} array of an author-list response into DTOs.
     *
     * @param jsonObject response body; may be {@code null}
     * @return the parsed authors, empty when the response carries none
     */
    private List<XinTuDto> analysisXtJSON(JSONObject jsonObject) {
        List<XinTuDto> result = new ArrayList<>();
        JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
        JSONArray authors = data == null ? null : data.getJSONArray("authors");
        if (authors == null) {
            return result;
        }
        for (Object author : authors) {
            XinTuDto dto = new XinTuDto();
            JSONObject user = JSONObject.parseObject(JSON.toJSONString(author));
            dto.setNick_name(user.getString("nick_name"));
            dto.setAvatar_uri(user.getString("avatar_uri"));
            dto.setFollower(user.getInteger("follower"));
            dto.setExpected_play_num(user.getInteger("expected_play_num"));
            dto.setExpected_cpm(user.getDouble("expected_cpm"));
            dto.setId(user.getString("id"));
            dto.setShort_id(String.valueOf(user.get("short_id")));
            List<XinTuPriceDto> priceDtoList = new ArrayList<XinTuPriceDto>();
            JSONArray priceInfoArray = user.getJSONArray("price_info");
            // An author without price_info simply gets an empty price list.
            if (priceInfoArray != null) {
                for (Object o : priceInfoArray) {
                    XinTuPriceDto xinTuPriceDto = new XinTuPriceDto();
                    JSONObject priceInfo = JSONObject.parseObject(JSON.toJSONString(o));
                    xinTuPriceDto.setDesc(priceInfo.getString("desc"));
                    xinTuPriceDto.setPrice(priceInfo.getInteger("price"));
                    xinTuPriceDto.setSettlement_desc(priceInfo.getString("settlement_desc"));
                    priceDtoList.add(xinTuPriceDto);
                }
            }
            dto.setXinTuPriceDtos(priceDtoList);
            result.add(dto);
        }
        return result;
    }

    /**
     * Reads {@code data.pagination.total_count} from an author-list response.
     *
     * @param jsonObject response body; may be {@code null}
     * @return the total number of entries, or 0 when the field is absent
     */
    private Long totalCount(JSONObject jsonObject) {
        JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
        JSONObject pagination = data == null ? null : data.getJSONObject("pagination");
        Long total = pagination == null ? null : pagination.getLong("total_count");
        return total == null ? Long.valueOf(0L) : total;
    }
}
实体类与图片上传工具类可以根据自己的需求自行实现。