from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re
import pymysql
import pandas
url = "https://live.bilibili.com/p/eden/area-tags?parentAreaId=1&areaId=0&visit_id=55we5sdlb9o0"
temp_check_height = 0
getInfoList = []
getInfoDict = {}
'''
Open the page and scroll it down to the bottom so all lazy-loaded rooms are rendered
'''
def toBottom(spider_url):
    global temp_check_height
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(spider_url)
    while True:
        # Scroll down, wait for lazy-loaded content, then re-measure the page height.
        browser.execute_script("window.scrollBy(0,5000)")
        time.sleep(1)
        check_height = browser.execute_script(
            "return document.body.scrollHeight;")
        print(str(temp_check_height) + '**************' + str(check_height))
        # Stop once the height no longer grows, i.e. we have reached the bottom.
        if check_height == temp_check_height:
            break
        temp_check_height = check_height
    html = browser.page_source
    browser.quit()
    return html
'''
Parse the full page, extract the useful fields, and return a list of records
'''
def getPageInfo(html):
    global getInfoDict, getInfoList
    soup = BeautifulSoup(html, 'lxml')
    # The class names and the data-v attribute below match bilibili's live-list markup
    # at the time of writing and may need updating if the page changes.
    for a in soup.find_all(class_='dp-block room-card-ctnr p-relative w-100'):
        getInfo = getInfoDict.copy()
        getInfo['title'] = a.find(class_='dp-block room-title card-text').text
        getInfo['name'] = a.find(class_='room-anchor card-text p-relative').find(name='span').text
        getInfo['type'] = a.find(class_='area-name w-100 p-absolute border-box t-left v-bottom').text
        getInfo['count'] = a.find(name='span', attrs={'data-v-191d6a08': '', 'class': 'v-middle'}).text
        getInfoList.append(getInfo)
    return getInfoList
'''
Sort the records by popularity (viewer count)
'''
def sort(items):
    items = sorted(items, key=sort_seed, reverse=True)
    return items

def sort_seed(item):
    # Extract the leading number from counts such as '1.2万' or '9548';
    # the pattern keeps the decimal part, and '万' scales the value by 10000.
    r = re.findall(r'\d+(?:\.\d+)?', item['count'])
    number = float(r[0]) if r else 0.0
    if '万' in item['count']:
        number *= 10000
    return number
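'''
Illustration (hypothetical values, not real scraped data): with the pattern above,
a count string such as '1.2万' converts to 12000.0 and '9548' to 9548.0, e.g.
    sort_seed({'count': '1.2万'})  # -> 12000.0
    sort_seed({'count': '9548'})   # -> 9548.0
'''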
def show(items):
    for rank in range(0, len(items)):
        print('rank' + str(rank + 1) + ':'
              + items[rank]['type'] + '----' + items[rank]['title'] + '----' + items[rank]['name'] + '----'
              + items[rank]['count'].replace(" ", ""))
'''
Write the ranking to a CSV file
'''
def outFile(items):
    # utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
    with open('b站直播.csv', 'w', encoding='utf-8-sig') as xlsFile:
        xlsFile.write('排名,分区,标题,up主昵称,观看人数\n')
        for rank in range(0, len(items)):
            xlsFile.write('第' + str(rank + 1) + '名')
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['type']))
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['title']))
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['name']))
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['count'].replace(" ", "")))
            xlsFile.write("\n")
    # No explicit close() is needed: the with-block closes the file.
'''
Print a pandas table and write it to CSV
'''
def printTable(items):
    a = []
    b = []
    c = []
    d = []
    for rank in range(0, len(items)):
        a.append(items[rank]['type'])
        b.append(items[rank]['title'])
        c.append(items[rank]['name'])
        d.append(items[rank]['count'])
    column_1 = pandas.Series(a)
    column_2 = pandas.Series(b)
    column_3 = pandas.Series(c)
    column_4 = pandas.Series(d)
    table = pandas.DataFrame([column_1, column_2, column_3, column_4], index=['分区', '标题', 'up主昵称', '人气']).T
    print(table)
    table.to_csv('rank.csv', index=False, encoding='utf-8-sig')
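# Design note: printTable builds the table from four Series plus a transpose; an
# equivalent, more direct construction is a column dict, e.g.
#     pandas.DataFrame({'分区': a, '标题': b, 'up主昵称': c, '人气': d})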
'''
Store the results in a MySQL database
'''
def outToDatabase(items):
    db = pymysql.connect(
        host='localhost',
        user='root',
        password='6274426',
        port=3306,
        db='spider'
    )
    cursor = db.cursor()
    sql = "INSERT INTO spiderbilibili(type, title, name, count) VALUES (%s, %s, %s, %s)"
    for item in items:
        try:
            cursor.execute(sql, (item['type'], item['title'], item['name'], item['count']))
            db.commit()
        except Exception:
            # Roll back the failed insert and continue with the next record.
            db.rollback()
    print('finish')
    db.close()
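'''
outToDatabase assumes the `spider` database already contains a `spiderbilibili` table;
the script never creates it. Below is a minimal one-off setup sketch (createTable is a
hypothetical helper added for illustration; the VARCHAR lengths are assumptions):
'''
def createTable():
    db = pymysql.connect(host='localhost', user='root', password='6274426', port=3306, db='spider')
    cursor = db.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS spiderbilibili ("
        "`type` VARCHAR(64), `title` VARCHAR(255), `name` VARCHAR(255), `count` VARCHAR(64))"
    )
    db.commit()
    db.close()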
def main():
    html = toBottom(url)
    pageInfoList = getPageInfo(html)
    pageInfoList = sort(pageInfoList)
    outFile(pageInfoList)
    printTable(pageInfoList)
    outToDatabase(pageInfoList)

if __name__ == '__main__':
    main()
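'''
To run this script you need Google Chrome plus a matching ChromeDriver available to
Selenium (recent Selenium releases can download one automatically), and the Python
packages imported above, e.g.:
    pip install selenium beautifulsoup4 lxml pymysql pandas
'''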