大家好,我是烤鸭:
某音竟然有pc版了,不过搜索的数据有限,会限制条数,亲测只能搜索400条数据,简单分析下过程。
工具使用
java + chromedriver + fiddler
使用 Java + Selenium 驱动 Chrome 自动打开网页。搜索需要登录:可以先手动登录一次,之后通过复用同一个 Chrome 用户数据目录(--user-data-dir)来共享 cookie。
/**
 * Opens the Douyin web search page for a fixed keyword in Chrome, scrolls to the bottom of the
 * page once, and clicks the page body.
 *
 * <p>Chrome is started with a dedicated user-data directory; per the original author's notes this
 * is how the cookies of an earlier login are shared with the automated session. The browser is
 * left open when the method returns. Any exception is printed and swallowed.
 */
@Test
public void testXyin() {
    String keyword = "旅游";
    try {
        // Point Selenium at the local chromedriver executable.
        System.setProperty("webdriver.chrome.driver", "D:\\dev\\env\\chromedriver\\chromedriver.exe");

        // Reuse a fixed Chrome profile directory so its stored cookies are available.
        ChromeOptions options = new ChromeOptions();
        options.addArguments(
                "--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data-Cookie");
        ChromeDriver browser = new ChromeDriver(options);

        // Maximize the window before loading the search page.
        browser.manage().window().maximize();
        String searchUrl =
                "https://www.douyin.com/search/"
                        + keyword
                        + "?publish_time=0&sort_type=0&source=normal_search&type=general";
        browser.get(searchUrl);

        // Scroll to the bottom of the document, then give the page a second to react.
        browser.executeScript("window.scrollTo(0, document.body.scrollHeight);");
        Thread.sleep(1000);

        // Let element lookups wait up to 3 seconds before failing.
        browser.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);

        // Sometimes a click is required before scrolling takes effect
        // (observed on some sites; the original author did not find the cause).
        WebElement body = browser.findElement(By.cssSelector("body"));
        body.click();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
fiddler 脚本
在 Fiddler 中打开 Rules -> Customize Rules,改写 OnBeforeResponse 方法,把搜索接口的请求体和响应体追加写入日志文件:
// Fiddler response hook. Keeps the stock "hide 304" behavior and additionally appends the
// request body and response body of every Douyin general-search call to a log file.
//
// Each log record has the form:
//   [<local date/time>] <request body>\r\n<response body>\r\n
static function OnBeforeResponse(oSession: Session) {
    if (m_Hide304s && oSession.responseCode == 304) {
        oSession["ui-hide"] = "true";
    }
    // Added at the end of the method.
    if (oSession.HostnameIs("www.douyin.com") && oSession.uriContains("https://www.douyin.com/aweme/v1/web/general/search/single")) {
        var filename = "D:\\data\\dy\\fiddler-token.log";
        var curDate = new Date();
        var logContent = "[" + curDate.toLocaleString() + "] " + oSession.GetRequestBodyAsString() + "\r\n" + oSession.GetResponseBodyAsString() + "\r\n";
        // File.AppendText creates the file when it does not exist yet, so no separate
        // Exists/CreateText branch is needed.
        var sw : System.IO.StreamWriter = System.IO.File.AppendText(filename);
        try {
            sw.Write(logContent);
        } finally {
            // Always release the file handle, even if the write fails; otherwise later
            // responses could not reopen the log.
            sw.Close();
        }
    }
}
解析数据
读取文件解析:
/**
 * Reads the captured log lines through {@code ReaderTxt.InitTxt()} and converts every search
 * result found in them into a {@code DyScrapVideo}.
 *
 * <p>Only lines starting with "{" are parsed as JSON. For each such line the "data" array is
 * walked and every element that carries an "aweme_info" object is converted. Missing optional
 * parts ("data", "author", "avatar_thumb", "video", "download_addr", "statistics", empty
 * "url_list") are tolerated and simply leave the corresponding fields unset.
 */
public void readText() {
    ReaderTxt rt = new ReaderTxt();
    for (String txt : rt.InitTxt()) {
        if (!txt.startsWith("{")) {
            continue;
        }
        JSONObject jrs = JSONObject.parseObject(txt);
        JSONArray array = jrs == null ? null : jrs.getJSONArray("data");
        if (array == null) {
            continue;
        }
        for (Object obs : array) {
            if (!(obs instanceof JSONObject)) {
                continue;
            }
            JSONObject awemeInfo = ((JSONObject) obs).getJSONObject("aweme_info");
            if (awemeInfo == null) {
                continue;
            }
            DyScrapVideo scrapVideo = toScrapVideo(awemeInfo);
            // NOTE(review): the populated object is neither stored nor returned in this method;
            // persist or collect scrapVideo here as needed.
        }
    }
}

/** Maps one "aweme_info" JSON object onto a new {@code DyScrapVideo}. */
private DyScrapVideo toScrapVideo(JSONObject awemeInfo) {
    DyScrapVideo scrapVideo = new DyScrapVideo();
    // Detail page URL: https://www.douyin.com/video/ + aweme_id
    scrapVideo.setAwemeId(awemeInfo.getString("aweme_id"));
    scrapVideo.setVideoDesc(awemeInfo.getString("desc"));
    Long publishTime = awemeInfo.getLong("create_time");
    if (publishTime != null) {
        // Guarded so a missing timestamp is not passed on as the literal text "null".
        scrapVideo.setVideoPublishTime(UnixUtil.TimeStamp2Date(publishTime + ""));
    }
    fillAuthor(scrapVideo, awemeInfo.getJSONObject("author"));
    fillVideo(scrapVideo, awemeInfo.getJSONObject("video"));
    fillStatistics(scrapVideo, awemeInfo.getJSONObject("statistics"));
    scrapVideo.setCreateDate(new Date());
    // NOTE(review): the search keyword is hard-coded here — confirm it matches the search
    // that produced the log.
    scrapVideo.setSearchKeyword("北京旅游");
    return scrapVideo;
}

/** Copies the "author" fields onto the target; does nothing when the author object is absent. */
private void fillAuthor(DyScrapVideo scrapVideo, JSONObject author) {
    if (author == null) {
        return;
    }
    Long uid = author.getLong("uid");
    if (uid != null) {
        scrapVideo.setAuthorUid(uid + "");
    }
    scrapVideo.setAuthorNickname(author.getString("nickname"));
    scrapVideo.setAuthorSignature(author.getString("signature"));
    String avatar = firstUrl(author.getJSONObject("avatar_thumb"));
    if (avatar != null) {
        scrapVideo.setAuthorAvatarThumb(avatar);
    }
    Long followerCount = author.getLong("follower_count");
    scrapVideo.setFollowerCount(followerCount != null ? followerCount.intValue() : 0);
    scrapVideo.setCustomVerify(author.getString("custom_verify"));
}

/** Copies download address and duration from the "video" object, if present. */
private void fillVideo(DyScrapVideo scrapVideo, JSONObject video) {
    if (video == null) {
        return;
    }
    String downloadUrl = firstUrl(video.getJSONObject("download_addr"));
    if (downloadUrl != null) {
        scrapVideo.setVideoDownloadAddr(UnicodeUtil.unicodeToCN(downloadUrl));
    }
    scrapVideo.setVideoDuration(video.getInteger("duration"));
}

/** Copies the counters from the "statistics" object, if present. */
private void fillStatistics(DyScrapVideo scrapVideo, JSONObject statistics) {
    if (statistics == null) {
        return;
    }
    scrapVideo.setCommentCount(statistics.getInteger("comment_count"));
    scrapVideo.setDiggCount(statistics.getInteger("digg_count"));
    scrapVideo.setDownloadCount(statistics.getInteger("download_count"));
    scrapVideo.setPlayCount(statistics.getInteger("play_count"));
    scrapVideo.setShareCount(statistics.getInteger("share_count"));
    scrapVideo.setCollectCount(statistics.getInteger("collect_count"));
}

/**
 * Returns the first entry of the holder's "url_list" array as a string, or {@code null} when the
 * holder, the array, or the first entry is missing.
 */
private String firstUrl(JSONObject holder) {
    if (holder == null) {
        return null;
    }
    JSONArray urls = holder.getJSONArray("url_list");
    if (urls == null || urls.isEmpty()) {
        return null;
    }
    Object first = urls.get(0);
    return first == null ? null : first.toString();
}
/**
 * Reads the Fiddler capture log as UTF-8 text and returns its lines in file order.
 *
 * <p>The path is the one the Fiddler {@code OnBeforeResponse} rule in this article appends to
 * ({@code D:\data\dy\fiddler-token.log}); previously this method read from a different directory
 * than the rule wrote to.
 *
 * @return every line of the log file; an empty list when the file cannot be opened or read (the
 *     exception is printed, not rethrown)
 */
public ArrayList<String> InitTxt() {
    ArrayList<String> list = new ArrayList<String>();
    // Absolute path; a relative path would work as well.
    String pathname = "D:\\data\\dy\\fiddler-token.log";
    // try-with-resources closes the reader and the underlying stream on every exit path.
    try (FileInputStream in = new FileInputStream(new File(pathname));
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
        String line;
        while ((line = br.readLine()) != null) {
            list.add(line);
        }
    } catch (Exception e) {
        // Best effort: report the failure and return whatever was read so far.
        e.printStackTrace();
    }
    return list;
}
实体对象:
package com.machu.picchu.crawler.dto;
import java.util.Date;
/**
 * Mutable data holder for one scraped video record.
 *
 * <p>All properties are nullable wrapper or reference types and default to {@code null}. The
 * accessors store and return the given references as-is, without validation or copying.
 */
public class DyScrapVideo {

    // Record identity.
    private Integer id;
    private String awemeId;

    // Video fields.
    private String videoDesc;
    private Date videoPublishTime;
    private String videoDownloadAddr;
    private Integer videoDuration;

    // Counters.
    private Integer commentCount;
    private Integer diggCount;
    private Integer playCount;
    private Integer downloadCount;
    private Integer shareCount;
    private Integer collectCount;

    // Author fields.
    private String authorUid;
    private String authorNickname;
    private String authorSignature;
    private String authorAvatarThumb;
    private Integer followerCount;
    private String customVerify;

    // Bookkeeping fields.
    private Date createDate;
    private Date publishDate;
    private String searchKeyword;
    private String memo;
    private Integer status;

    /** @return the record id, or {@code null} if unset */
    public Integer getId() {
        return this.id;
    }

    /** @param id the record id */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the aweme id, or {@code null} if unset */
    public String getAwemeId() {
        return this.awemeId;
    }

    /** @param awemeId the aweme id */
    public void setAwemeId(String awemeId) {
        this.awemeId = awemeId;
    }

    /** @return the video description, or {@code null} if unset */
    public String getVideoDesc() {
        return this.videoDesc;
    }

    /** @param videoDesc the video description */
    public void setVideoDesc(String videoDesc) {
        this.videoDesc = videoDesc;
    }

    /** @return the video publish time, or {@code null} if unset */
    public Date getVideoPublishTime() {
        return this.videoPublishTime;
    }

    /** @param videoPublishTime the video publish time */
    public void setVideoPublishTime(Date videoPublishTime) {
        this.videoPublishTime = videoPublishTime;
    }

    /** @return the video download address, or {@code null} if unset */
    public String getVideoDownloadAddr() {
        return this.videoDownloadAddr;
    }

    /** @param videoDownloadAddr the video download address */
    public void setVideoDownloadAddr(String videoDownloadAddr) {
        this.videoDownloadAddr = videoDownloadAddr;
    }

    /** @return the video duration, or {@code null} if unset */
    public Integer getVideoDuration() {
        return this.videoDuration;
    }

    /** @param videoDuration the video duration */
    public void setVideoDuration(Integer videoDuration) {
        this.videoDuration = videoDuration;
    }

    /** @return the comment count, or {@code null} if unset */
    public Integer getCommentCount() {
        return this.commentCount;
    }

    /** @param commentCount the comment count */
    public void setCommentCount(Integer commentCount) {
        this.commentCount = commentCount;
    }

    /** @return the digg count, or {@code null} if unset */
    public Integer getDiggCount() {
        return this.diggCount;
    }

    /** @param diggCount the digg count */
    public void setDiggCount(Integer diggCount) {
        this.diggCount = diggCount;
    }

    /** @return the play count, or {@code null} if unset */
    public Integer getPlayCount() {
        return this.playCount;
    }

    /** @param playCount the play count */
    public void setPlayCount(Integer playCount) {
        this.playCount = playCount;
    }

    /** @return the download count, or {@code null} if unset */
    public Integer getDownloadCount() {
        return this.downloadCount;
    }

    /** @param downloadCount the download count */
    public void setDownloadCount(Integer downloadCount) {
        this.downloadCount = downloadCount;
    }

    /** @return the share count, or {@code null} if unset */
    public Integer getShareCount() {
        return this.shareCount;
    }

    /** @param shareCount the share count */
    public void setShareCount(Integer shareCount) {
        this.shareCount = shareCount;
    }

    /** @return the collect count, or {@code null} if unset */
    public Integer getCollectCount() {
        return this.collectCount;
    }

    /** @param collectCount the collect count */
    public void setCollectCount(Integer collectCount) {
        this.collectCount = collectCount;
    }

    /** @return the author uid, or {@code null} if unset */
    public String getAuthorUid() {
        return this.authorUid;
    }

    /** @param authorUid the author uid */
    public void setAuthorUid(String authorUid) {
        this.authorUid = authorUid;
    }

    /** @return the author nickname, or {@code null} if unset */
    public String getAuthorNickname() {
        return this.authorNickname;
    }

    /** @param authorNickname the author nickname */
    public void setAuthorNickname(String authorNickname) {
        this.authorNickname = authorNickname;
    }

    /** @return the author signature, or {@code null} if unset */
    public String getAuthorSignature() {
        return this.authorSignature;
    }

    /** @param authorSignature the author signature */
    public void setAuthorSignature(String authorSignature) {
        this.authorSignature = authorSignature;
    }

    /** @return the author avatar thumbnail value, or {@code null} if unset */
    public String getAuthorAvatarThumb() {
        return this.authorAvatarThumb;
    }

    /** @param authorAvatarThumb the author avatar thumbnail value */
    public void setAuthorAvatarThumb(String authorAvatarThumb) {
        this.authorAvatarThumb = authorAvatarThumb;
    }

    /** @return the follower count, or {@code null} if unset */
    public Integer getFollowerCount() {
        return this.followerCount;
    }

    /** @param followerCount the follower count */
    public void setFollowerCount(Integer followerCount) {
        this.followerCount = followerCount;
    }

    /** @return the custom-verify text, or {@code null} if unset */
    public String getCustomVerify() {
        return this.customVerify;
    }

    /** @param customVerify the custom-verify text */
    public void setCustomVerify(String customVerify) {
        this.customVerify = customVerify;
    }

    /** @return the create date, or {@code null} if unset */
    public Date getCreateDate() {
        return this.createDate;
    }

    /** @param createDate the create date */
    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }

    /** @return the publish date, or {@code null} if unset */
    public Date getPublishDate() {
        return this.publishDate;
    }

    /** @param publishDate the publish date */
    public void setPublishDate(Date publishDate) {
        this.publishDate = publishDate;
    }

    /** @return the search keyword, or {@code null} if unset */
    public String getSearchKeyword() {
        return this.searchKeyword;
    }

    /** @param searchKeyword the search keyword */
    public void setSearchKeyword(String searchKeyword) {
        this.searchKeyword = searchKeyword;
    }

    /** @return the memo, or {@code null} if unset */
    public String getMemo() {
        return this.memo;
    }

    /** @param memo the memo */
    public void setMemo(String memo) {
        this.memo = memo;
    }

    /** @return the status, or {@code null} if unset */
    public Integer getStatus() {
        return this.status;
    }

    /** @param status the status */
    public void setStatus(Integer status) {
        this.status = status;
    }
}