Web Scraper: Douyu (dy) Live-Stream Data for Each Category

The full code is attached below.

Live-stream homepage data

import requests

from lxml import etree


def getInfoUtils(info):
    # Return the first match of an XPath result, or '' when the list is empty.
    if info:
        return info[0]
    else:
        return ''


def getHtml(url):
    # Fetch a page and return its HTML text; the request itself sits inside the
    # try block so that network errors are caught, and failures yield ''.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return ''
    except Exception as e:
        print(e)
        return ''


def htmlToTree(html):
    # Parse an HTML string into an lxml element tree.
    tree = etree.HTML(html)
    return tree


def parseLi(li):
    # Parse one <li> card into a dict with the room link, category, title,
    # popularity ("hot") value and streamer name.
    li_tree = htmlToTree(li)
    href = "https://www.douyu.com" + li_tree.xpath(
        '//div[@class="DyListCover HeaderCell is-href"]/a[1]/@href')[0]
    a_element = li_tree.xpath('//div[@class="DyListCover HeaderCell is-href"]/a[1]')[0]
    a_tag = etree.tostring(a_element, pretty_print=True, encoding='utf-8').decode('utf-8')
    a_tree = htmlToTree(a_tag)
    # The cover holds two DyListCover-info blocks: the first carries the
    # category and title, the second carries the hot value and streamer name.
    dy_cover = a_tree.xpath(
        '//div[@class="DyListCover-content"]/div[@class="DyListCover-info"]')
    content1 = etree.tostring(dy_cover[0], pretty_print=True, encoding='utf-8').decode(
        'utf-8')
    live_type = getInfoUtils(htmlToTree(content1).xpath('//span/text()'))
    live_title = getInfoUtils(htmlToTree(content1).xpath('//h3/text()'))
    content2 = etree.tostring(dy_cover[1], pretty_print=True, encoding='utf-8').decode(
        'utf-8')
    live_hot = getInfoUtils(htmlToTree(content2).xpath('//span/text()'))
    live_name = getInfoUtils(htmlToTree(content2).xpath('//h2/div/text()'))
    content_dict = {
        'live_link': href,
        'live_type': live_type,
        'live_title': live_title,
        'live_hot': live_hot,
        'live_name': live_name
    }
    return content_dict


def parseHtml(html):
    # Select every live-room <li> in the homepage list and parse each one.
    res_data = []
    tree = htmlToTree(html)
    path = '//main[@class="layout-Main"]//div[@class="layout-Module-container layout-Cover ' \
           'ListContent"]/ul/li'
    data = tree.xpath(path)
    for item in data:
        item = etree.tostring(item, pretty_print=True, encoding='utf-8').decode('utf-8')
        res = parseLi(item)
        res_data.append(res)
    return res_data


def liveMain(url):
    # Fetch a directory page and return the parsed list of live-room dicts.
    return parseHtml(getHtml(url))


if __name__ == '__main__':
    url = 'https://www.douyu.com/directory/all'
    data = liveMain(url)
    print(data)
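
liveMain returns a list of dicts shaped like the one built in parseLi (keys live_link, live_type, live_title, live_hot, live_name). As a minimal sketch of keeping those results instead of only printing them, the snippet below dumps them to a JSON file; it assumes the script above is saved as douyuLive.py (matching the import used in the next section), and the output file name is just an illustrative choice.

import json

from douyuLive import liveMain


if __name__ == '__main__':
    # Each entry looks like:
    # {'live_link': ..., 'live_type': ..., 'live_title': ..., 'live_hot': ..., 'live_name': ...}
    data = liveMain('https://www.douyu.com/directory/all')
    with open('douyu_live.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)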

Live-stream data under each category

import requests

from lxml import etree

from douyuLive import liveMain


def getHtml(url):
    # Fetch a page and return its HTML text; the request itself sits inside the
    # try block so that network errors are caught, and failures yield ''.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return ''
    except Exception as e:
        print(e)
        return ''


def htmlToTree(html):
    # Parse an HTML string into an lxml element tree.
    tree = etree.HTML(html)
    return tree


def parseCate(category):
    # Parse one category block into its section title plus a list of
    # sub-categories (name, link and total hot value).
    category_dict = {}
    category_list = []
    category_tree = htmlToTree(category)
    category_title = category_tree.xpath('//div[@class="categoryBox-head"]/h4/text()')
    category_hrefs = category_tree.xpath('//ul[@class="layout-Classify-list"]/li/a/@href')
    category_items = category_tree.xpath('//ul[@class="layout-Classify-list"]/li/a/strong/text()')
    category_totalhots = category_tree.xpath(
        '//ul[@class="layout-Classify-list"]/li/a/div/span/text()')
    # Only zip the three lists together when they line up one-to-one.
    if len(category_items) == len(category_totalhots) == len(category_hrefs):
        for i in range(len(category_items)):
            info = {
                'category_item': category_items[i],
                'category_href': "https://www.douyu.com" + category_hrefs[i],
                'category_totalhot': category_totalhots[i]
            }
            category_list.append(info)
    category_dict["category_title"] = category_title[0] if category_title else ''
    category_dict["category_list"] = category_list
    return category_dict


def parseHtml(html):
    # Walk every category block on the directory page; iteration starts at
    # index 2 to skip the leading divs that are not category lists.
    data_info = []
    tree = htmlToTree(html)
    all_categories = tree.xpath('//main[@id="allCate"]/section[@class="layout-Module"]/div')
    for i in range(2, len(all_categories)):
        category = etree.tostring(all_categories[i], pretty_print=True,
                                  encoding='utf-8').decode('utf-8')
        data = parseCate(category)
        data_info.append(data)
    return data_info


def categoryMain(url):
    html = getHtml(url=url)
    data = parseHtml(html)
    return data


def main(data):
    # For every sub-category link, scrape its live-room list with liveMain().
    for item in data:
        category_list = item["category_list"]
        for hrefs in category_list:
            category_href = hrefs["category_href"]
            info = liveMain(category_href)
            print("*******************", hrefs["category_item"])
            print(info)


if __name__ == '__main__':
    url = "https://www.douyu.com/directory"
    data = categoryMain(url)
    print(data)
    main(data)
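
Both scripts fetch pages anonymously with bare requests.get calls, and main() fires one request per sub-category. Below is a minimal sketch of a more defensive fetch, assuming Douyu serves the same server-rendered markup to a browser-like request; it keeps the getHtml name and return contract so it could replace the function defined in either script. The User-Agent string and the 10-second timeout are illustrative choices, not values from the original code.

import requests

# Illustrative browser-like header; adjust as needed.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}


def getHtml(url):
    # Drop-in replacement: same name and return contract as getHtml above.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code == 200:
            return response.text
        return ''
    except Exception as e:
        print(e)
        return ''

Adding a short time.sleep() between the liveMain() calls in main() would likewise space out the one-request-per-category loop.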
