【YouTube】Web crawler data collection example


Example

The script below collects channel metadata, the channel's video list, per-video details, and comment counts from YouTube's web endpoints, and stores the results in MongoDB.

# -*- coding:utf-8 -*-
# @software: PyCharm
# desc: collect YouTube channel / video / comment data and store it in MongoDB

import datetime
import json
from loguru import logger as logging
import re
from json import load, dumps
from os import path
from re import findall

import pymongo
import requests
import scrapy
from pymongo.errors import DuplicateKeyError

cwd = path.dirname(path.abspath(__file__))
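# fetch_and_save_video_info reads a tube_dl_config.json file placed next to this
# script; it is expected to hold the values sent as the x-youtube-client-version /
# x-youtube-client-name headers, e.g. (illustrative values, not from the original
# post): {"cver": "2.20240123.01.00", "cname": "1"}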


class VideoError(Exception):
    def __init__(self, vid):
        self.message = f'Invalid video ID. Are you sure "{vid}" is a valid URL?'
        super().__init__(self.message)


class PlaylistError(Exception):
    def __init__(self, pid):
        self.message = f'Invalid Playlist ID. Are you sure "{pid}" is a valid URL and available?'
        super().__init__(self.message)


def fetch_and_save_video_info():
    gMongoClient = pymongo.MongoClient()
    gMongoDb = gMongoClient['crawlers']
    gMongoCollection = gMongoDb['youtube']

    docs = gMongoCollection.find({})
    for doc in docs:
        videoId = doc['videoId']
        url = f"https://www.youtube.com/watch?v={videoId}"
        title = doc['title']
        shortViewCountText = doc['shortViewCountText']
        channelTitle = doc['channelTitle']
        channelId = doc['channelId']
        canonicalBaseUrl = doc['canonicalBaseUrl']
        subscriberCount = doc['subscriberCount']
        videosCount = doc['videosCount']
        headers = {
            'user-agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'),
            'referer': 'https://youtube.com'}

        vid = "".join([i for i in findall(r"v=(.*?)&|youtu.be\/(.*?)&", url + "&")[0]])
        logging.info(vid)
        with open(path.join(cwd, "tube_dl_config.json"), "rb") as fp:
            json_file = load(fp)

        headers["x-youtube-client-version"] = json_file['cver']
        headers["x-youtube-client-name"] = json_file['cname']
        # &pbj=1 makes the watch page return JSON payloads instead of HTML;
        # one of the returned objects carries playerResponse
        y_data = requests.get(url=f"https://youtube.com/watch?v={vid}&pbj=1", headers=headers).json()
        yt_data = [i for i in y_data if "playerResponse" in i.keys()][0]["playerResponse"]
        if yt_data["playabilityStatus"]["status"] == "ERROR":
            raise VideoError(vid)

        shortDescription = yt_data['videoDetails']['shortDescription']
        logging.info(dumps(y_data))
        # logging.info(shortDescription)

        # "label": "102,293 likes"
        likes = re.findall('"defaultText": \{"accessibility": \{"accessibilityData": \{"label": "(.*?) likes"\}\}',
                           dumps(y_data))[0]
        try:
            if likes:
                # 格式化成数字
                likes = int(likes.replace(',', ''))
        except Exception as e:
            logging.info(e)
            likes = 0
        publishDate = yt_data['microformat']['playerMicroformatRenderer']['publishDate']  # 2023-11-04T16:00:11-07:00
        publishDateDay = publishDate.split('T')[0]  # 2023-11-04

        viewCount = re.findall('"allowRatings": true, "viewCount": "(.*?)",', dumps(y_data))[0]

        if '万 个视频' in videosCount:
            videosCount = videosCount.replace('万 个视频', '')
            videosCount = float(videosCount) * 10000
        if '万位订阅者' in subscriberCount:
            subscriberCount = subscriberCount.replace('万位订阅者', '')
            subscriberCount = float(subscriberCount) * 10000

        item = {
            'videoId': videoId,
            'title': title,
            'viewCount': viewCount,
            'channelTitle': channelTitle,
            'channelId': channelId,
            'canonicalBaseUrl': canonicalBaseUrl,
            'subscriberCount': subscriberCount,
            'videosCount': videosCount,
            'shortDescription': shortDescription,
            'likes': likes,
            'createTime': datetime.datetime.now(),
            'day': datetime.datetime.now().strftime('%Y-%m-%d'),
            "publishDate": publishDate,
            "publishDateDay": publishDateDay,
        }
        gMongoDb.get_collection('youtube_detail_info').insert_one(item)

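# The '万 个视频' / '万位订阅者' strings handled above come from YouTube's
# Chinese-locale UI, where '万' means 10,000. A minimal helper sketch that could
# replace the inline handling (parse_wan_count is not part of the original script):
def parse_wan_count(text):
    # "1.2万 个视频" -> 12000.0; plain digit strings are returned as floats
    m = re.search(r'([\d.]+)\s*万', text)
    if m:
        return float(m.group(1)) * 10000
    digits = re.sub(r'[^\d.]', '', text)
    return float(digits) if digits else 0.0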

def fetch_and_save_comments_info():
    gMongoClient = pymongo.MongoClient()
    gMongoDb = gMongoClient['crawlers']
    docs = gMongoDb.get_collection('youtube_detail_info').find({"commentsCount": {"$exists": False}})
    for doc in docs:
        url = f'https://www.youtube.com/watch?v={doc["videoId"]}'

        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'x-client-data': '自定义',  # placeholder: copy the x-client-data value from your own browser
            'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
            'Cookie': f"{cookie}",
            'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }
        resp = requests.get(url=url, headers=headers)

        response = scrapy.Selector(text=resp.text)
        token = re.findall(r'"continuationCommand":{"token":"(.*?)"', resp.text)[0]

        apiKey = re.findall(r'"innertubeApiKey":"(.*?)"', resp.text)[0]

        # comment counts come from the Innertube /next endpoint, driven by the continuation token
        url = f'https://www.youtube.com/youtubei/v1/next?key={apiKey}&prettyPrint=false'

        payload = {
            "context": {"client": {"deviceMake": "Apple", "deviceModel": "", "visitorData": "", "clientName": "WEB",
                                   "clientVersion": "2.20240123.01.00", "configInfo": {"appInstallData": ""},
                                   "mainAppWebInfo": {"graftUrl": ""}}, "user": {"lockedSafetyMode": False},
                        "request": {"useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": []},
                        "clickTracking": {"clickTrackingParams": ""}, "adSignalsInfo": {"params": [], "bid": ""}},
            "continuation": f"{token}"}
        videoResp = requests.post(url=url, headers=headers, json=payload)
        videoRespJson = json.loads(videoResp.text)
        try:
            commentsCount = re.findall(r'"commentsCount":\{"runs":\[\{"text":"(.*?)"\}\]\}', videoResp.text)[0]
        except Exception as e:
            logging.info(e)
            commentsCount = 0

        gMongoDb.get_collection('youtube_detail_info').update_one({'_id': doc['_id']},
                                                                  {'$set': {'commentsCount': commentsCount}})


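# Pagination: the channel's /videos page embeds a "continuationCommand" token in
# ytInitialData; POSTing that token to the Innertube /youtubei/v1/browse endpoint
# returns the next batch of videos, and the last continuationItems entry carries
# the token for the following page.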
def get_video_list(token, channelTitle, channelId, canonicalBaseUrl, subscriberCount, videosCount, gMongoDb):
    # the apiKey scraped in fetch_and_save_account_info could be used here instead of the hard-coded key
    videoListUrl = 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false'
    payload = {"context": {"client": {"deviceMake": "Apple", "deviceModel": "", "visitorData": "", "clientName": "WEB",
                                      "clientVersion": "2.20240123.01.00", "configInfo": {"appInstallData": ""},
                                      "mainAppWebInfo": {"graftUrl": ""}}, "user": {"lockedSafetyMode": False},
                           "request": {"useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": []},
                           "clickTracking": {"clickTrackingParams": ""}, "adSignalsInfo": {"params": [], "bid": ""}},
               "continuation": f"{token}"}

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'x-client-data': '自定义',  # placeholder: copy the x-client-data value from your own browser
        'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
        'Cookie': f"{cookie}",
        'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    videoResp = requests.post(url=videoListUrl, headers=headers, json=payload)
    videoRespJson = json.loads(videoResp.text)
    logging.info(videoResp.text)
    # .onResponseReceivedActions[0].appendContinuationItemsAction.continuationItems
    videoList = videoRespJson['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
    for eVideo in videoList[:-1]:
        videoId = eVideo['richItemRenderer']['content']['videoRenderer']['videoId']
        title = eVideo['richItemRenderer']['content']['videoRenderer']['title']['runs'][0]['text']
        # richItemRenderer.content.videoRenderer.shortViewCountText.simpleText
        shortViewCountText = eVideo['richItemRenderer']['content']['videoRenderer']['shortViewCountText']['simpleText']
        item = {
            'videoId': videoId,
            'title': title,
            'shortViewCountText': shortViewCountText,
            'channelTitle': channelTitle,
            'channelId': channelId,
            'canonicalBaseUrl': canonicalBaseUrl,
            'subscriberCount': subscriberCount,
            'videosCount': videosCount,
        }
        logging.info(item)
        try:
            gMongoDb['youtube'].insert_one(item)
        except DuplicateKeyError:
            logging.info("duplicate videoId, skipping")
    try:
        continuationCommand = videoList[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']
        token = continuationCommand['token']
        # recurse with the next-page token, passing the channel metadata through
        return get_video_list(token, channelTitle, channelId, canonicalBaseUrl,
                              subscriberCount, videosCount, gMongoDb)
    except Exception as e:
        logging.info(e)
        logging.info('no more pages')

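# The DuplicateKeyError handling in get_video_list assumes a unique index on
# videoId in the 'youtube' collection. A minimal sketch for creating it
# (ensure_video_index is an assumed helper, not part of the original script):
def ensure_video_index(gMongoDb):
    # create_index is idempotent, so calling it on every run is safe
    gMongoDb['youtube'].create_index([('videoId', pymongo.ASCENDING)], unique=True)
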

def fetch_and_save_list_info():
    gMongoClient = pymongo.MongoClient()
    gMongoDb = gMongoClient['crawlers']
    url = 'https://www.youtube.com/@ganfutong/videos'

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'x-client-data': '自定义',  # placeholder: copy the x-client-data value from your own browser
        'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
        'Cookie': f"{cookie}",
        'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    resp = requests.get(url=url, headers=headers)

    response = scrapy.Selector(text=resp.text)
    token = re.findall(r'"continuationCommand":{"token":"(.*?)"', resp.text)[0]
    context = response.xpath("//script[contains(string(),'var ytInitialData')]/text()").extract_first()
    contextJson = json.loads(context.replace('var ytInitialData = ', '')[:-1])
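    # ytInitialData is embedded in the page as a JS assignment
    # ("var ytInitialData = {...};"), so the replace()/[:-1] above strips the
    # prefix and the trailing semicolon before json.loads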

    subscriberCount = contextJson['header']['c4TabbedHeaderRenderer']['subscriberCountText']['simpleText']
    videosCountTexts = contextJson['header']['c4TabbedHeaderRenderer']['videosCountText']['runs']
    channelTitle = contextJson['header']['c4TabbedHeaderRenderer']['title']
    channelId = contextJson['header']['c4TabbedHeaderRenderer']['channelId']
    canonicalBaseUrl = contextJson['header']['c4TabbedHeaderRenderer']['navigationEndpoint']['browseEndpoint'][
        'canonicalBaseUrl']
    videosCount = ''
    for i in videosCountTexts:
        videosCount += i['text']

    logging.info(token)
    get_video_list(token, channelTitle, channelId, canonicalBaseUrl, subscriberCount, videosCount, gMongoDb)


def fetch_and_save_account_info():
    gMongoClient = pymongo.MongoClient()
    gMongoDb = gMongoClient['crawlers']
    url = 'https://www.youtube.com/@ganfutong'

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'x-client-data': 'CIq2yQEIpbbJAQipncoBCLXsygEIk6HLAQia/swBCIagzQEIj+HNAQiE4s0BCN/rzQEI5uzNAQjB7s0BCIrvzQEIg/DNAQiG8M0BCL7xzQEIjPLNARj2yc0BGKfqzQEY+fLNAQ==',
        'Cookie': f"{cookie}",
        'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    resp = requests.get(url=url, headers=headers)

    response = scrapy.Selector(text=resp.text)
    token = re.findall(r'"continuationCommand":{"token":"(.*?)"', resp.text)[0]
    context = response.xpath("//script[contains(string(),'var ytInitialData')]/text()").extract_first()
    contextJson = json.loads(context.replace('var ytInitialData = ', '')[:-1])

    # subscriber count: .header.c4TabbedHeaderRenderer.subscriberCountText
    subscriberCount = contextJson['header']['c4TabbedHeaderRenderer']['subscriberCountText']['simpleText']
    # video count: .header.c4TabbedHeaderRenderer.videosCountText
    videosCountTexts = contextJson['header']['c4TabbedHeaderRenderer']['videosCountText']['runs']
    channelTitle = contextJson['header']['c4TabbedHeaderRenderer']['title']
    channelId = contextJson['header']['c4TabbedHeaderRenderer']['channelId']
    apiKey = re.findall(r'"innertubeApiKey":"(.*?)"', resp.text)[0]

    canonicalBaseUrl = contextJson['header']['c4TabbedHeaderRenderer']['navigationEndpoint']['browseEndpoint'][
        'canonicalBaseUrl']
    videosCount = ''
    for i in videosCountTexts:
        videosCount += i['text']
    logging.info(json.dumps(contextJson))
    gMongoDb.get_collection('youtube_account_info').insert_one(item := {
        'channelTitle': channelTitle,
        'channelId': channelId,
        'canonicalBaseUrl': canonicalBaseUrl,
        'subscriberCount': subscriberCount,
        'videosCount': videosCount,
        'apiKey': apiKey,
        "createTime": datetime.datetime.now(),
    })


if __name__ == '__main__':
    cookie = ''  # fill in a Cookie header value copied from your browser before running
    fetch_and_save_comments_info()
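    # Suggested run order (inferred from the collections each step reads and writes):
    # 1. fetch_and_save_account_info()   # channel metadata + innertubeApiKey -> youtube_account_info
    # 2. fetch_and_save_list_info()      # paginated video list -> youtube
    # 3. fetch_and_save_video_info()     # per-video details (views, likes, publish date) -> youtube_detail_info
    # 4. fetch_and_save_comments_info()  # comment counts merged into youtube_detail_info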
