python b站日排行榜_python爬取b站排行榜

import requests

import re

from bs4 import BeautifulSoup

import pymysql

import traceback

class Spider:
    """Small convenience wrapper: fetch one URL and query it with BeautifulSoup.

    Typical use is ``Spider(url)``, then ``setSoup()``, then one of the
    ``find*`` helpers. The page is only downloaded when ``setSoup()`` (or
    ``getHTML()``) is called.
    """

    def __init__(self, url):
        """Store the target URL; no network access happens here."""
        self.url = url
        # Defined up front so the attribute always exists; filled by setSoup().
        self.soup = None

    def getHTML(self):
        """Download ``self.url`` and return the response body as text.

        Returns:
            The decoded page text, or the literal string "网页访问失败" when
            the request fails (connection error, timeout, non-2xx status).
            Callers receive a string either way; no exception is raised for
            request failures.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36'}
        try:
            response = requests.get(url=self.url, headers=headers, timeout=20)
            response.raise_for_status()
            # Let requests guess the charset from the body rather than trusting
            # the HTTP header.
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            # Only request-level failures are turned into the sentinel string;
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return "网页访问失败"

    def setSoup(self):
        """Fetch the page and store the parsed tree in ``self.soup``."""
        html = self.getHTML()
        self.soup = BeautifulSoup(html, 'html.parser')

    def findTag(self, tagName):
        """Return every tag named ``tagName`` in the parsed page.

        ``setSoup()`` must have been called first.
        """
        return self.soup.find_all(tagName)

    def findTagByAttrs(self, tagName, attrs):
        """Return tags named ``tagName`` filtered by ``attrs``.

        ``attrs`` is forwarded unchanged as the second positional argument of
        ``find_all``; the caller in this file passes a class-name string such
        as ``'rank-item'``. ``setSoup()`` must have been called first.
        """
        return self.soup.find_all(tagName, attrs)

    def getBeautifyHTML(self):
        """Return the parsed page re-serialised by ``prettify()``."""
        return self.soup.prettify()

def getPage(url):
    """Scrape one ranking page and return its rows.

    For every ``<li class="rank-item">`` on the page a list is built from:
    the ``.string`` of each ``<a class="title">``, then one text fragment
    from each ``<span class="data-box">``, then the ``.div.string`` of the
    first ``<div class="pts">``. The caller in this file treats a row as
    ``[title, playnum, commentnum, author, score]``.

    Args:
        url: Address of the ranking page to fetch.

    Returns:
        A list of rows (lists); empty when no ``rank-item`` element is found.

    Raises:
        IndexError: if a ``data-box`` span yields no text fragment, or an
            item has no ``div.pts`` element.
    """
    # NOTE(review): the original pattern literal was truncated in this file
    # (it ended at `r">([^`). It is reconstructed here as "the text between a
    # '>' and the next '<'" -- confirm against the real page markup.
    textPattern = re.compile(r">([^<]+)<")

    spider = Spider(url)
    spider.setSoup()

    pageContentList = []
    for item in spider.findTagByAttrs('li', 'rank-item'):
        pageContentItem = [title.string for title in item.find_all('a', 'title')]
        for dataBox in item.find_all('span', 'data-box'):
            # Work on the serialised tag and keep its first text fragment.
            pageContentItem.append(textPattern.findall(str(dataBox))[0])
        pageContentItem.append(item.find_all('div', 'pts')[0].div.string)
        pageContentList.append(pageContentItem)
    return pageContentList

def getURLFormBilibili():
    """Build the URL of every ranking-page combination.

    Each URL has the form
    ``https://www.bilibili.com/ranking/<ranktype>/<area>/<submit>/<date>``
    and is keyed by a title made of the four matching labels joined with
    underscores, e.g. ``'全站_全站_全部投稿_日排行'``.

    Returns:
        dict mapping title -> URL, one entry per combination, inserted in
        rank type / area / submit / date nesting order.
    """
    # Ranking window: URL segment -> label.
    date = {
        1: '日排行',
        3: '三日排行',
        7: '周排行',
        30: '月排行',
    }
    # Site area: URL segment -> label.
    areatype = {
        0: '全站',
        1: '动画',
        168: '国漫相关',
        3: '音乐',
        129: '舞蹈',
        4: '游戏',
        36: '科技',
        188: '数码',
        160: '生活',
        119: '鬼畜',
        155: '时尚',
        5: '娱乐',
        181: '影视',
    }
    # Ranking kind: URL segment -> label.
    ranktype = {
        'all': '全站',
        'origin': '原创',
    }
    # Submission filter: URL segment -> label.
    submit = {
        '0': '全部投稿',
        '1': '近期投稿',
    }

    urlDict = {}  # title -> URL
    for rankKey, rankLabel in ranktype.items():
        for areaKey, areaLabel in areatype.items():
            for submitKey, submitLabel in submit.items():
                for dateKey, dateLabel in date.items():
                    title = '_'.join((rankLabel, areaLabel, submitLabel, dateLabel))
                    urlDict[title] = 'https://www.bilibili.com/ranking/{}/{}/{}/{}'.format(
                        rankKey, areaKey, submitKey, dateKey
                    )
    return urlDict

class MysqlConnect:
    """Database helper: opens a fresh pymysql connection per operation and
    builds/runs the SQL statements this script needs.

    Every method that talks to the database opens its own connection and
    closes it before returning.
    """

    def __init__(self):
        pass

    @staticmethod
    def _quoteIdentifier(name):
        """Return ``name`` wrapped in backticks, with embedded backticks doubled."""
        return '`{}`'.format(str(name).replace('`', '``'))

    def getConnect(self):
        """Open and return a new connection to the ``bilibilirank`` database."""
        # NOTE(review): credentials are hard-coded in source; consider loading
        # them from configuration or the environment instead.
        return pymysql.connect(
            host='localhost',
            user='root',
            passwd='19990614kang',
            port=3306,  # must be an int
            db='bilibilirank',
            charset='utf8',  # must be spelled 'utf8', not 'utf-8'
        )

    def insertInfo(self, sql):
        """Execute one write statement and commit it.

        Prints "sucessed..." after a successful commit. On a database error
        it prints "failed...", writes the traceback to stderr and rolls back;
        the error is not re-raised. The connection is always closed.
        """
        db = self.getConnect()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql)
                db.commit()
                print("sucessed...")
            except pymysql.MySQLError:
                print("failed...")
                # Same diagnostics as queryOutCome, so the cause is visible.
                traceback.print_exc()
                db.rollback()
        finally:
            db.close()

    def queryOutCome(self, sql):
        """Run a query and return its first row.

        Returns:
            The result of ``cursor.fetchone()``, or ``None`` when the query
            raised a database error (the traceback is printed and the
            transaction rolled back).
        """
        # Bound before the try so the return below is safe after a failure.
        result = None
        db = self.getConnect()
        try:
            cursor = db.cursor()
            cursor.execute(sql)
            result = cursor.fetchone()
        except pymysql.MySQLError:
            traceback.print_exc()
            db.rollback()
        finally:
            db.close()
        return result

    def getCreateTableSql(self, tableName):
        """Return the CREATE TABLE statement for a ranking table.

        The table has an auto-increment ``id`` plus ``title`` (unique),
        ``playnum``, ``commentnum``, ``author`` and ``score`` columns, all
        ``char(100) NOT NULL``. ``tableName`` is backtick-quoted.
        """
        sql = '''
create table {} (
    id int(11) auto_increment primary key,
    title char(100) NOT NULL UNIQUE,
    playnum char(100) NOT NULL,
    commentnum char(100) NOT NULL,
    author char(100) NOT NULL,
    score char(100) NOT NULL
)ENGINE=innodb DEFAULT CHARSET=utf8;
'''.format(self._quoteIdentifier(tableName))
        return sql

    def getInsertToTableSql(self, tableName, title, playnum, commentnum, author, score):
        """Return an INSERT statement for one row of ``tableName``.

        The five values are converted with ``str()`` and escaped before being
        placed in single quotes, because they come from scraped page text and
        may contain quotes or backslashes. ``id`` is inserted as ``null`` so
        the auto-increment column assigns it.
        """
        escape = pymysql.converters.escape_string
        quotedValues = ','.join(
            "'{}'".format(escape(str(value)))
            for value in (title, playnum, commentnum, author, score)
        )
        sql = '''
insert into {} values(null,{});
'''.format(self._quoteIdentifier(tableName), quotedValues)
        return sql

    def createTable(self, tableName, sql):
        """Drop ``tableName`` if it exists, then run ``sql`` to create it.

        Args:
            tableName: Table to drop; backtick-quoted here to match the
                quoting used by ``getCreateTableSql``.
            sql: The CREATE TABLE statement to execute afterwards.

        Database errors propagate to the caller; the connection is closed
        either way.
        """
        db = self.getConnect()
        try:
            cursor = db.cursor()
            cursor.execute("drop table if exists {}".format(self._quoteIdentifier(tableName)))
            cursor.execute(sql)
        finally:
            db.close()

if __name__ == "__main__":
    # Crawl every ranking page and store each page in a table named after it.
    urlDict = getURLFormBilibili()  # title -> URL for every ranking page
    mysqlconnect = MysqlConnect()   # database helper

    for urlName, url in urlDict.items():
        print("正在处理"+urlName+"页面...")

        # The page title doubles as the table name; the table is recreated
        # before the page is fetched.
        tableName = urlName
        createsql = mysqlconnect.getCreateTableSql(tableName)
        mysqlconnect.createTable(tableName, createsql)

        for contentItem in getPage(url):
            # A row needs five fields (title, playnum, commentnum, author,
            # score); skip short rows instead of failing with IndexError.
            if len(contentItem) < 5:
                print("skipped incomplete row: {!r}".format(contentItem))
                continue
            insertsql = mysqlconnect.getInsertToTableSql(
                tableName,
                contentItem[0],
                contentItem[1],
                contentItem[2],
                contentItem[3],
                contentItem[4],
            )
            print(insertsql)
            mysqlconnect.insertInfo(insertsql)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值