python爬虫短视频数据实战练习

胡说八道家

已于 2023-03-21 19:39:40 修改

阅读量282

点赞数

文章标签：爬虫 Powered by 金山文档

于 2023-03-10 08:19:35 首次发布

本文链接：https://blog.csdn.net/m0_59088506/article/details/129434974

版权

某音太难还没解决，有大佬麻烦提示我一下 hhhhhh 互相学习

代码中的 3xwrbnfsfzpq83k 为博主 id（爬漂亮的东西更有动力😈）

初步代码默认爬博主前三个视频并且爬第二个的全部评论，先记录一下

import re
import time
import requests
from numpy import random


class TikTok():
    """Small client for the Kuaishou web GraphQL endpoint.

    Despite the class name, every request goes to
    ``https://www.kuaishou.com/graphql``.  Each public method sends one
    GraphQL operation and extracts a few fields from the JSON answer.
    """

    # Strips parenthesised tokens of the form "(O3x" + 13 lowercase
    # alphanumerics + ")" from comment text.
    # NOTE(review): these look like user ids appended to @mentions - confirm.
    _MENTION_ID = re.compile(r'[(]O3x[a-z0-9]{13}[)]')

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the caller indefinitely.
    _TIMEOUT = 10

    def __init__(self, session=None):
        """Set up the endpoint URL, the request headers and the HTTP client.

        Args:
            session: optional object exposing a ``requests``-compatible
                ``post(url=..., headers=..., json=..., timeout=...)`` method
                (for example a ``requests.Session``).  When omitted, the
                ``requests`` module itself is used, as before.
        """
        self.url = "https://www.kuaishou.com/graphql"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57",
            # Left empty on purpose; presumably a logged-in browser cookie has
            # to be pasted here for the site to answer - TODO confirm.
            "Cookie": ""}
        self._http = session if session is not None else requests

    def _query(self, operation, variables, query, root):
        """POST one GraphQL operation and return ``response['data'][root]``.

        Raises:
            KeyError: if the answer carries no ``data`` object or ``root`` is
                missing/null inside it.
        """
        body = {
            "operationName": operation,
            "variables": variables,
            "query": query,
        }
        response = self._http.post(
            url=self.url, headers=self.header, json=body,
            timeout=self._TIMEOUT)
        response.raise_for_status()
        # Parse the body exactly once and reuse the result.
        payload = response.json()
        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict) or data.get(root) is None:
            raise KeyError(
                f"response for {operation} has no data[{root!r}]")
        return data[root]

    def get_information(self, userId):
        """Fetch the public profile of ``userId``.

        Returns:
            dict with the keys ``fan``, ``follow``, ``photo_public``,
            ``user_name``, ``user_id`` and ``user_text`` copied from the
            ``userProfile`` part of the answer.
        """
        query = "query visionProfile($userId: String) {\n  visionProfile(userId: $userId) {\n    result\n    hostName\n    userProfile {\n      ownerCount {\n        fan\n        photo\n        follow\n        photo_public\n        __typename\n      }\n      profile {\n        gender\n        user_name\n        user_id\n        headurl\n        user_text\n        user_profile_bg_url\n        __typename\n      }\n      isFollowing\n      __typename\n    }\n    __typename\n  }\n}\n"
        user_profile = self._query(
            "visionProfile", {"userId": userId}, query,
            "visionProfile")['userProfile']
        counts = user_profile['ownerCount']
        profile = user_profile['profile']
        return {
            'fan': counts['fan'],
            'follow': counts['follow'],
            'photo_public': counts['photo_public'],
            'user_name': profile['user_name'],
            'user_id': profile['user_id'],
            'user_text': profile['user_text'],
        }

    def get_vide(self, userId, count=3):
        """Fetch the first ``count`` entries of the profile's video list.

        Args:
            userId: id of the profile to list.
            count: maximum number of feeds to read (default 3).

        Returns:
            A flat dict.  For the i-th feed (0-based) it holds ``caption{i}``,
            ``likeCount{i}``, ``viewCount{i}``, ``realLikeCount{i}``,
            ``photoUrl{i}`` and ``id{i}``.  When the answer contains fewer
            than ``count`` feeds only the available ones are included instead
            of raising ``IndexError``.
        """
        query = "fragment photoContent on PhotoEntity {\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n  __typename\n}\n\nfragment feedContent on Feed {\n  type\n  author {\n    id\n    name\n    headerUrl\n    following\n    headerUrls {\n      url\n      __typename\n    }\n    __typename\n  }\n  photo {\n    ...photoContent\n    __typename\n  }\n  canAddComment\n  llsid\n  status\n  currentPcursor\n  tags {\n    type\n    name\n    __typename\n  }\n  __typename\n}\n\nquery visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      ...feedContent\n      __typename\n    }\n    hostName\n    pcursor\n    __typename\n  }\n}\n"
        variables = {
            "userId": userId,
            "pcursor": "",
            "page": "profile",
        }
        result = self._query(
            "visionProfilePhotoList", variables, query,
            "visionProfilePhotoList")
        feeds = result.get("feeds") or []
        wanted = ("caption", "likeCount", "viewCount", "realLikeCount",
                  "photoUrl", "id")
        datas = {}
        for i, feed in enumerate(feeds[:count]):
            photo = feed["photo"]
            for field in wanted:
                datas[f'{field}{i}'] = photo[field]
        return datas

    def get_comment(self, photoId, pcursor):
        """Print one page of root comments and return the next page cursor.

        Each comment is printed as ``authorName:content`` with the
        parenthesised id tokens removed from the content.

        Args:
            photoId: id of the video whose comments are requested.
            pcursor: cursor of the page to fetch; ``''`` asks for the first
                page.

        Returns:
            The cursor reported by the server.  A missing or empty cursor is
            reported as ``"no_more"``: handing ``''``/``None`` back to the
            caller would make a ``while pcursor != "no_more"`` loop request
            the first page again forever.
        """
        query = "query commentListQuery($photoId: String, $pcursor: String) {\n  visionCommentList(photoId: $photoId, pcursor: $pcursor) {\n    commentCount\n    pcursor\n    rootComments {\n      commentId\n      authorId\n      authorName\n      content\n      headurl\n      timestamp\n      likedCount\n      realLikedCount\n      liked\n      status\n      authorLiked\n      subCommentCount\n      subCommentsPcursor\n      subComments {\n        commentId\n        authorId\n        authorName\n        content\n        headurl\n        timestamp\n        likedCount\n        realLikedCount\n        liked\n        status\n        authorLiked\n        replyToUserName\n        replyTo\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"
        variables = {
            "photoId": photoId,
            "pcursor": pcursor,
        }
        result = self._query(
            "commentListQuery", variables, query, "visionCommentList")
        # rootComments may be null on an empty page; treat it as no comments.
        for comment in result.get("rootComments") or []:
            name = comment["authorName"]
            content = self._MENTION_ID.sub('', comment['content'])
            print("{0}:{1}".format(name, content))
        return result.get('pcursor') or "no_more"

def main(user_id="3xwrbnfsfzpq83k"):
    """Print every root comment of the second video listed for ``user_id``.

    Fetches the profile's video list, picks the entry stored under ``'id1'``
    (the second one) and pages through its comments until the cursor signals
    the end.
    """
    tik = TikTok()
    videos = tik.get_vide(user_id)
    photo_id = videos.get('id1')
    if photo_id is None:
        raise SystemExit("fewer than two videos were returned for this user")

    pcursor = ''
    while True:
        pcursor = tik.get_comment(photo_id, pcursor)
        # An empty/None cursor would restart from the first page on the next
        # request, so it is treated as the end just like "no_more".
        if not pcursor or pcursor == "no_more":
            break
        # Short randomised pause between pages to avoid hammering the server.
        time.sleep(random.uniform(1.0, 2.0))


if __name__ == '__main__':
    main()