某音数据分析

大家好,我是烤鸭:

    某音竟然有 PC 版了,不过搜索结果的数量有限制,亲测最多只能搜索到 400 条数据,下面简单分析一下抓取过程。

工具使用

java + chromedriver + fiddler

Java + Selenium 实现网页自动化;搜索需要登录,可以先手动登录一次,之后通过 Chrome 的 user-data-dir 复用登录 cookie

/**
 * Opens the Douyin web search page for a fixed keyword in Chrome, scrolls to the bottom of the
 * page once and clicks the page body.
 *
 * <p>Chrome is started with a dedicated {@code --user-data-dir} profile so that a login performed
 * once in that profile is reused on later runs. Any exception is printed and otherwise ignored.
 * The browser is not closed by this method.
 */
@Test
public void testXyin() {
  String keyWord = "旅游";
  try {
    // Point Selenium at the local chromedriver binary.
    System.setProperty("webdriver.chrome.driver", "D:\\dev\\env\\chromedriver\\chromedriver.exe");

    // Reuse a persistent Chrome profile so the login cookies are shared between runs.
    ChromeOptions chromeOptions = new ChromeOptions();
    chromeOptions.addArguments(
        "--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data-Cookie");
    ChromeDriver browser = new ChromeDriver(chromeOptions);

    // Maximize the window before loading the search page.
    browser.manage().window().maximize();
    String searchUrl =
        "https://www.douyin.com/search/"
            + keyWord
            + "?publish_time=0&sort_type=0&source=normal_search&type=general";
    browser.get(searchUrl);

    // Scroll to the bottom of the page, then give it a second to react.
    browser.executeScript("window.scrollTo(0, document.body.scrollHeight);");
    Thread.sleep(1000);

    // Allow up to 3 seconds for element lookups from here on.
    browser.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);
    WebElement pageBody = browser.findElement(By.cssSelector("body"));
    // Some sites only react to scrolling after the page has been clicked once (cause unknown).
    pageBody.click();
  } catch (Exception e) {
    e.printStackTrace();
  }
}

fiddler 脚本

在 Fiddler 中打开 Rules -> Customize Rules,改写 OnBeforeResponse 方法,把搜索接口的请求体和响应体追加写入日志文件:

	// Fiddler response hook. Besides the stock "hide 304" handling, it appends the request body
	// and response body of every Douyin general-search API call to a local log file.
	static function OnBeforeResponse(oSession: Session) {
		if (m_Hide304s && oSession.responseCode == 304) {
			oSession["ui-hide"] = "true";
		}
		// Added at the end of the method: capture the search API traffic.
		if (oSession.HostnameIs("www.douyin.com") && oSession.uriContains("https://www.douyin.com/aweme/v1/web/general/search/single")){
			var filename = "D:\\data\\dy\\fiddler-token.log";
			var curDate = new Date();
			// One record = "[timestamp] <request body>" on one line, the response body on the next.
			var logContent =  "[" + curDate.toLocaleString() + "] " + oSession.GetRequestBodyAsString() + "\r\n"+oSession.GetResponseBodyAsString()+"\r\n";
			// AppendAllText creates the file when it is missing and always closes it again,
			// so no Exists() branching or manual StreamWriter clean-up is needed.
			System.IO.File.AppendAllText(filename, logContent);
		}

	}

解析数据

读取文件解析:

/**
 * Parses the lines returned by {@code ReaderTxt.InitTxt()} and maps every search result that
 * carries an {@code aweme_info} object onto a {@link DyScrapVideo}.
 *
 * <p>Only lines starting with <code>{</code> are treated as JSON; all other lines are skipped.
 * Missing or empty JSON nodes ({@code data}, {@code author}, {@code avatar_thumb},
 * {@code url_list}, {@code video}, {@code statistics}, {@code create_time}) are tolerated: the
 * corresponding fields are simply left unset instead of raising an exception.
 *
 * <p>NOTE(review): each populated {@code DyScrapVideo} is discarded at the end of its loop
 * iteration; nothing in this method stores or returns it. Persisting it is left to the caller's
 * adaptation of this snippet.
 */
public void readText() {
  ReaderTxt rt = new ReaderTxt();
  ArrayList<String> list = rt.InitTxt();
  for (String txt : list) {
    // Skip everything that does not look like a JSON object.
    if (!txt.startsWith("{")) {
      continue;
    }
    JSONObject jrs = JSONObject.parseObject(txt);
    JSONArray array = jrs == null ? null : jrs.getJSONArray("data");
    if (array == null) {
      // Response without a "data" array: nothing to extract from this line.
      continue;
    }
    for (Object obs : array) {
      if (!(obs instanceof JSONObject)) {
        continue;
      }
      JSONObject json = (JSONObject) obs;
      // aweme_info
      JSONObject awemeInfo = json.getJSONObject("aweme_info");
      if (awemeInfo == null) {
        continue;
      }
      DyScrapVideo scrapVideo = new DyScrapVideo();

      // Detail page is https://www.douyin.com/video/ + aweme_id
      scrapVideo.setAwemeId(awemeInfo.getString("aweme_id"));
      scrapVideo.setVideoDesc(awemeInfo.getString("desc"));
      Long publishTime = awemeInfo.getLong("create_time");
      if (publishTime != null) {
        // Previously a missing create_time was passed on as the literal string "null".
        scrapVideo.setVideoPublishTime(UnixUtil.TimeStamp2Date(publishTime + ""));
      }

      // author
      JSONObject author = awemeInfo.getJSONObject("author");
      if (author != null) {
        // getString keeps a missing uid as null instead of the text "null".
        scrapVideo.setAuthorUid(author.getString("uid"));
        scrapVideo.setAuthorNickname(author.getString("nickname"));
        scrapVideo.setAuthorSignature(author.getString("signature"));
        JSONObject avatarThumb = author.getJSONObject("avatar_thumb");
        JSONArray avatarUrls = avatarThumb == null ? null : avatarThumb.getJSONArray("url_list");
        if (avatarUrls != null && !avatarUrls.isEmpty() && avatarUrls.get(0) != null) {
          scrapVideo.setAuthorAvatarThumb(avatarUrls.get(0).toString());
        }
        Long followerCount = author.getLong("follower_count");
        scrapVideo.setFollowerCount(followerCount != null ? followerCount.intValue() : 0);
        scrapVideo.setCustomVerify(author.getString("custom_verify"));
      }

      // video
      JSONObject video = awemeInfo.getJSONObject("video");
      if (video != null) {
        JSONObject downloadAddr = video.getJSONObject("download_addr");
        JSONArray downloadUrls = downloadAddr == null ? null : downloadAddr.getJSONArray("url_list");
        if (downloadUrls != null && !downloadUrls.isEmpty() && downloadUrls.get(0) != null) {
          scrapVideo.setVideoDownloadAddr(UnicodeUtil.unicodeToCN(downloadUrls.get(0).toString()));
        }
        scrapVideo.setVideoDuration(video.getInteger("duration"));
      }

      // statistics
      JSONObject statistics = awemeInfo.getJSONObject("statistics");
      if (statistics != null) {
        scrapVideo.setCommentCount(statistics.getInteger("comment_count"));
        scrapVideo.setDiggCount(statistics.getInteger("digg_count"));
        scrapVideo.setDownloadCount(statistics.getInteger("download_count"));
        scrapVideo.setPlayCount(statistics.getInteger("play_count"));
        scrapVideo.setShareCount(statistics.getInteger("share_count"));
        scrapVideo.setCollectCount(statistics.getInteger("collect_count"));
      }

      scrapVideo.setCreateDate(new Date());
      // Search keyword is hard-coded here; adjust it to the keyword that was actually searched.
      scrapVideo.setSearchKeyword("北京旅游");
    }
  }
}

/**
 * Reads the Fiddler capture log line by line.
 *
 * <p>The file is decoded as UTF-8. If the file cannot be opened or read, the stack trace is
 * printed and the lines read so far (possibly none) are returned; no exception reaches the
 * caller.
 *
 * @return the lines of the log file in file order, never {@code null}
 */
public ArrayList<String> InitTxt() {
  ArrayList<String> list = new ArrayList<String>();
  // Absolute path; must be the same file the Fiddler rule appends to
  // (the rule writes to D:\data\dy\fiddler-token.log).
  String pathname = "D:\\data\\dy\\fiddler-token.log";
  File filename = new File(pathname);
  // try-with-resources closes the stream and both readers even when reading fails.
  try (FileInputStream in = new FileInputStream(filename);
      InputStreamReader reader = new InputStreamReader(in, "utf-8");
      BufferedReader br = new BufferedReader(reader)) {
    String line;
    while ((line = br.readLine()) != null) {
      list.add(line);
    }
  } catch (Exception e) {
    // Best effort, as before: report the failure and return whatever was read.
    e.printStackTrace();
  }
  return list;
}

实体对象:

package com.machu.picchu.crawler.dto;

import java.util.Date;

/**
 * Plain mutable data holder for one scraped Douyin search result: the video itself, its
 * interaction counters, its author, and bookkeeping fields for the scrape.
 *
 * <p>All properties start out as {@code null} and are exposed through simple getters and setters
 * without validation or copying.
 */
public class DyScrapVideo {

    // ---- identity ----
    private Integer id;
    private String awemeId;

    // ---- video ----
    private String videoDesc;
    private Date videoPublishTime;
    private String videoDownloadAddr;
    private Integer videoDuration;

    // ---- interaction counters ----
    private Integer commentCount;
    private Integer diggCount;
    private Integer playCount;
    private Integer downloadCount;
    private Integer shareCount;
    private Integer collectCount;

    // ---- author ----
    private String authorUid;
    private String authorNickname;
    private String authorSignature;
    private String authorAvatarThumb;
    private Integer followerCount;
    private String customVerify;

    // ---- scrape bookkeeping ----
    private Date createDate;
    private Date publishDate;
    private String searchKeyword;
    private String memo;
    private Integer status;

    /** @return the record id, or {@code null} if unset */
    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the video's {@code aweme_id}, or {@code null} if unset */
    public String getAwemeId() {
        return this.awemeId;
    }

    public void setAwemeId(String awemeId) {
        this.awemeId = awemeId;
    }

    /** @return the video description text, or {@code null} if unset */
    public String getVideoDesc() {
        return this.videoDesc;
    }

    public void setVideoDesc(String videoDesc) {
        this.videoDesc = videoDesc;
    }

    /** @return the time the video was published, or {@code null} if unset */
    public Date getVideoPublishTime() {
        return this.videoPublishTime;
    }

    public void setVideoPublishTime(Date videoPublishTime) {
        this.videoPublishTime = videoPublishTime;
    }

    /** @return the video download address, or {@code null} if unset */
    public String getVideoDownloadAddr() {
        return this.videoDownloadAddr;
    }

    public void setVideoDownloadAddr(String videoDownloadAddr) {
        this.videoDownloadAddr = videoDownloadAddr;
    }

    /** @return the video duration as reported by the source, or {@code null} if unset */
    public Integer getVideoDuration() {
        return this.videoDuration;
    }

    public void setVideoDuration(Integer videoDuration) {
        this.videoDuration = videoDuration;
    }

    /** @return the comment count, or {@code null} if unset */
    public Integer getCommentCount() {
        return this.commentCount;
    }

    public void setCommentCount(Integer commentCount) {
        this.commentCount = commentCount;
    }

    /** @return the digg (like) count, or {@code null} if unset */
    public Integer getDiggCount() {
        return this.diggCount;
    }

    public void setDiggCount(Integer diggCount) {
        this.diggCount = diggCount;
    }

    /** @return the play count, or {@code null} if unset */
    public Integer getPlayCount() {
        return this.playCount;
    }

    public void setPlayCount(Integer playCount) {
        this.playCount = playCount;
    }

    /** @return the download count, or {@code null} if unset */
    public Integer getDownloadCount() {
        return this.downloadCount;
    }

    public void setDownloadCount(Integer downloadCount) {
        this.downloadCount = downloadCount;
    }

    /** @return the share count, or {@code null} if unset */
    public Integer getShareCount() {
        return this.shareCount;
    }

    public void setShareCount(Integer shareCount) {
        this.shareCount = shareCount;
    }

    /** @return the collect (favourite) count, or {@code null} if unset */
    public Integer getCollectCount() {
        return this.collectCount;
    }

    public void setCollectCount(Integer collectCount) {
        this.collectCount = collectCount;
    }

    /** @return the author's uid as text, or {@code null} if unset */
    public String getAuthorUid() {
        return this.authorUid;
    }

    public void setAuthorUid(String authorUid) {
        this.authorUid = authorUid;
    }

    /** @return the author's nickname, or {@code null} if unset */
    public String getAuthorNickname() {
        return this.authorNickname;
    }

    public void setAuthorNickname(String authorNickname) {
        this.authorNickname = authorNickname;
    }

    /** @return the author's signature text, or {@code null} if unset */
    public String getAuthorSignature() {
        return this.authorSignature;
    }

    public void setAuthorSignature(String authorSignature) {
        this.authorSignature = authorSignature;
    }

    /** @return the author's avatar thumbnail URL, or {@code null} if unset */
    public String getAuthorAvatarThumb() {
        return this.authorAvatarThumb;
    }

    public void setAuthorAvatarThumb(String authorAvatarThumb) {
        this.authorAvatarThumb = authorAvatarThumb;
    }

    /** @return the author's follower count, or {@code null} if unset */
    public Integer getFollowerCount() {
        return this.followerCount;
    }

    public void setFollowerCount(Integer followerCount) {
        this.followerCount = followerCount;
    }

    /** @return the author's {@code custom_verify} text, or {@code null} if unset */
    public String getCustomVerify() {
        return this.customVerify;
    }

    public void setCustomVerify(String customVerify) {
        this.customVerify = customVerify;
    }

    /** @return the time this record was created, or {@code null} if unset */
    public Date getCreateDate() {
        return this.createDate;
    }

    public void setCreateDate(Date createDate) {
        this.createDate = createDate;
    }

    /** @return the publish date property, or {@code null} if unset */
    public Date getPublishDate() {
        return this.publishDate;
    }

    public void setPublishDate(Date publishDate) {
        this.publishDate = publishDate;
    }

    /** @return the search keyword this record was found with, or {@code null} if unset */
    public String getSearchKeyword() {
        return this.searchKeyword;
    }

    public void setSearchKeyword(String searchKeyword) {
        this.searchKeyword = searchKeyword;
    }

    /** @return the free-form memo, or {@code null} if unset */
    public String getMemo() {
        return this.memo;
    }

    public void setMemo(String memo) {
        this.memo = memo;
    }

    /** @return the status value, or {@code null} if unset */
    public Integer getStatus() {
        return this.status;
    }

    public void setStatus(Integer status) {
        this.status = status;
    }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

烤鸭的世界我们不懂

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值