from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re
import pymysql
import pandas
url = "https://live.bilibili.com/p/eden/area-tags?parentAreaId=1&areaId=0&visit_id=55we5sdlb9o0"
temp_check_height = 0
getInfoList = []
getInfoDict = {}
'''
Open the page and scroll it down to the bottom so all lazy-loaded rooms are rendered
'''
def toBottom(spider_url):
    global temp_check_height
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    browser.get(spider_url)
    while True:
        # Scroll down, wait for lazy-loaded content, then re-measure the page height.
        browser.execute_script("window.scrollBy(0,5000)")
        time.sleep(1)
        check_height = browser.execute_script(
            "return document.body.scrollHeight;")
        print(str(temp_check_height) + '**************' + str(check_height))
        # Stop once the height no longer grows, i.e. we have reached the bottom.
        if check_height == temp_check_height:
            break
        temp_check_height = check_height
    html = browser.page_source
    browser.quit()
    return html
'''
Parse the full page, extract the useful fields, and return a list of records
'''
def getPageInfo(html):
    global getInfoDict, getInfoList
    soup = BeautifulSoup(html, 'lxml')
    # The class names and the data-v attribute below match bilibili's live-list markup
    # at the time of writing and may need updating if the page changes.
    for a in soup.find_all(class_='dp-block room-card-ctnr p-relative w-100'):
        getInfo = getInfoDict.copy()
        getInfo['title'] = a.find(class_='dp-block room-title card-text').text
        getInfo['name'] = a.find(class_='room-anchor card-text p-relative').find(name='span').text
        getInfo['type'] = a.find(class_='area-name w-100 p-absolute border-box t-left v-bottom').text
        getInfo['count'] = a.find(name='span', attrs={'data-v-191d6a08': '', 'class': 'v-middle'}).text
        getInfoList.append(getInfo)
    return getInfoList
'''
Sort the records by popularity (viewer count)
'''
def sort(items):
    items = sorted(items, key=sort_seed, reverse=True)
    return items

def sort_seed(item):
    # Extract the leading number from counts such as '1.2万' or '9548';
    # the pattern keeps the decimal part, and '万' scales the value by 10000.
    r = re.findall(r'\d+(?:\.\d+)?', item['count'])
    number = float(r[0]) if r else 0.0
    if '万' in item['count']:
        number *= 10000
    return number
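'''
Illustration (hypothetical values, not real scraped data): with the pattern above,
a count string such as '1.2万' converts to 12000.0 and '9548' to 9548.0, e.g.
    sort_seed({'count': '1.2万'})  # -> 12000.0
    sort_seed({'count': '9548'})   # -> 9548.0
'''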
def show(items):
    for rank in range(0, len(items)):
        print('rank' + str(rank + 1) + ':'
              + items[rank]['type'] + '----' + items[rank]['title'] + '----' + items[rank]['name'] + '----'
              + items[rank]['count'].replace(" ", ""))
'''
Write the ranking to a CSV file
'''
def outFile(items):
    # utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
    with open('b站直播.csv', 'w', encoding='utf-8-sig') as xlsFile:
        xlsFile.write('排名,分区,标题,up主昵称,观看人数\n')
        for rank in range(0, len(items)):
            xlsFile.write('第' + str(rank + 1) + '名')
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['type']))
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['title']))
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['name']))
            xlsFile.write(",")
            xlsFile.write(str(items[rank]['count'].replace(" ", "")))
            xlsFile.write("\n")
    # No explicit close() is needed: the with-block closes the file.
'''
Print a pandas table and write it to CSV
'''
def printTable(items):
    a = []
    b = []
    c = []
    d = []
    for rank in range(0, len(items)):
        a.append(items[rank]['type'])
        b.append(items[rank]['title'])
        c.append(items[rank]['name'])
        d.append(items[rank]['count'])
    column_1 = pandas.Series(a)
    column_2 = pandas.Series(b)
    column_3 = pandas.Series(c)
    column_4 = pandas.Series(d)
    table = pandas.DataFrame([column_1, column_2, column_3, column_4], index=['分区', '标题', 'up主昵称', '人气']).T
    print(table)
    table.to_csv('rank.csv', index=False, encoding='utf-8-sig')
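# Design note: printTable builds the table from four Series plus a transpose; an
# equivalent, more direct construction is a column dict, e.g.
#     pandas.DataFrame({'分区': a, '标题': b, 'up主昵称': c, '人气': d})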
'''
Store the results in a MySQL database
'''
def outToDatabase(items):
    db = pymysql.connect(
        host='localhost',
        user='root',
        password='6274426',
        port=3306,
        db='spider'
    )
    cursor = db.cursor()
    sql = "INSERT INTO spiderbilibili(type, title, name, count) VALUES (%s, %s, %s, %s)"
    for item in items:
        try:
            cursor.execute(sql, (item['type'], item['title'], item['name'], item['count']))
            db.commit()
        except Exception:
            # Roll back the failed insert and continue with the next record.
            db.rollback()
    print('finish')
    db.close()
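'''
outToDatabase assumes the `spider` database already contains a `spiderbilibili` table;
the script never creates it. Below is a minimal one-off setup sketch (createTable is a
hypothetical helper added for illustration; the VARCHAR lengths are assumptions):
'''
def createTable():
    db = pymysql.connect(host='localhost', user='root', password='6274426', port=3306, db='spider')
    cursor = db.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS spiderbilibili ("
        "`type` VARCHAR(64), `title` VARCHAR(255), `name` VARCHAR(255), `count` VARCHAR(64))"
    )
    db.commit()
    db.close()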
def main():
    html = toBottom(url)
    pageInfoList = getPageInfo(html)
    pageInfoList = sort(pageInfoList)
    outFile(pageInfoList)
    printTable(pageInfoList)
    outToDatabase(pageInfoList)

if __name__ == '__main__':
    main()
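'''
To run this script you need Google Chrome plus a matching ChromeDriver available to
Selenium (recent Selenium releases can download one automatically), and the Python
packages imported above, e.g.:
    pip install selenium beautifulsoup4 lxml pymysql pandas
'''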