# @Time: 2021/7/14 17:49
# @File : main.py
import datetime
import time
import requests
from lxml import etree
import pymysql
from pymysql.err import Error
# Module-level setup: derive the table name for the current month, open the
# MySQL connection, and make sure that month's table exists.

# Take one clock snapshot so the year and the month always come from the same
# instant (two separate now() calls could straddle a year boundary).
_now = datetime.datetime.now()
time_today = time.strftime("%Y-%m-%d", time.localtime())  # local date as YYYY-MM-DD
month = _now.month  # current month number
year_month = str(_now.year) + '-' + str(month) + '月'  # e.g. "2021-7月"
# Table name: the year/month label with '-' replaced by '_', plus a fixed
# suffix, e.g. "2021_7月b站热播视频排行数据".
table_name = year_month.replace('-', '_') + 'b站热播视频排行数据'

# NOTE(review): connection settings (including the password) are hard-coded.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='jundb')

# Shared cursor; ``main.save_to_mysql`` reads this module-level name.
cursor = db.cursor()

# Create the current month's table if it does not exist yet. The table is keyed
# on the ranking position (排名). A 12-step loop that only acted on the current
# month is unnecessary: the statement is simply executed once.
sql = (
    'CREATE TABLE IF NOT EXISTS %s (视频标题 VARCHAR(255) NOT NULL , 视频链接 CHAR(255) NOT NULL ,UP VARCHAR(255) NOT NULL , '
    '评论数 VARCHAR(25) NOT NULL , 播放量 VARCHAR(25) NOT NULL ,排名 int(10) NOT NULL, PRIMARY KEY(排名))'
) % table_name
cursor.execute(sql)
class main():
    """Crawler for the bilibili "all" popular-ranking page.

    ``load_page`` downloads the ranking page, builds one record per list entry
    and passes each record to ``save_file``. ``save_file`` appends the record to
    a local file and then forwards it to ``save_to_mysql``.

    ``save_to_mysql`` relies on the module-level ``db``, ``cursor`` and
    ``table_name`` objects.
    """

    RANK_URL = 'https://www.bilibili.com/v/popular/rank/all'
    OUTPUT_PATH = 'D:\\bilibili.csv'
    REQUEST_TIMEOUT = 10  # seconds; without a timeout the request could block forever

    @staticmethod
    def _format_record(items):
        """Return the one-line text form of a record.

        The record is converted with ``repr(str(items))``; then every space and
        every escaped-newline sequence (backslash, backslash, ``n``) is removed.
        This is the exact text that is written to the file and printed.
        """
        return repr(str(items)).replace(" ", "").replace("\\\\n", "")

    @staticmethod
    def _build_upsert_sql(table, columns):
        """Build the ``INSERT ... ON DUPLICATE KEY UPDATE`` statement.

        Args:
            table: Name of the target table.
            columns: Column names in insertion order (a dict works too; its keys
                are used).

        Returns:
            SQL text with one ``%s`` placeholder per column in the VALUES part
            and one more per column in the UPDATE part.
        """
        keys = ','.join(columns)
        values = ','.join(['%s'] * len(columns))
        assignments = ','.join(" {key} = %s ".format(key=key) for key in columns)
        insert = 'INSERT INTO {table}({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE '.format(
            table=table, keys=keys, values=values)
        return insert + assignments

    def load_page(self):
        """Fetch the ranking page and save every entry found on it.

        For each ``li`` node matched on the page a record is built, handed to
        ``save_file`` and then printed in its one-line text form.

        Raises:
            requests.exceptions.RequestException: if the HTTP request fails or
                exceeds ``REQUEST_TIMEOUT``.
        """
        headers = {
            'user-agent': ''
        }
        response = requests.get(self.RANK_URL, headers=headers, timeout=self.REQUEST_TIMEOUT)
        html = response.content.decode('utf-8')
        # Replaces the literal two-character sequence backslash + "n" in the page
        # source (not real newline characters) with a space.
        html = html.replace("\\n", " ")
        tree = etree.HTML(html)
        node_list = tree.xpath('/html/body/div[3]/div[2]/div[2]/ul/li')
        for node in node_list:
            # Each value is whatever the XPath query returns for this entry;
            # the key order defines the column order used for the database.
            items = {
                "视频标题": node.xpath('./div[2]/div[2]/a/text()'),
                "视频链接": node.xpath('./div[2]/div[2]/a/@href'),
                "UP": node.xpath('./div[2]/div[2]/div[1]/a/span/text()'),
                "评论数": node.xpath('./div[2]/div[2]/div[1]/span[2]/text()'),
                "播放量": node.xpath('./div[2]/div[2]/div[1]/span[1]/text()'),
                "排名": node.xpath('./div[1]/text()'),
            }
            self.save_file(items)
            print(self._format_record(items))

    def save_file(self, items):
        """Append one record to the local file, then store it in MySQL.

        Args:
            items: Mapping of column name to extracted value(s) for one entry.
        """
        print("[INFO]开始保存到本地")
        # The context manager closes the file even if the write fails.
        with open(self.OUTPUT_PATH, 'a', encoding='UTF-8') as file:
            file.write(self._format_record(items) + "\n")
        print("[INFO]保存到本地成功")
        self.save_to_mysql(items)

    def save_to_mysql(self, items):
        """Insert one record into the current table, updating it on a key clash.

        Args:
            items: Mapping of column name to value(s); keys become the column
                list of the statement.

        Database errors (``pymysql.err.Error``) are printed and the transaction
        is rolled back; they are not re-raised.
        """
        print("[INFO]开始保存到MySQL")
        sql1 = self._build_upsert_sql(table_name, items)
        # The values are supplied twice: once for the VALUES placeholders and
        # once for the ON DUPLICATE KEY UPDATE placeholders.
        params = tuple(items.values()) * 2
        try:
            # A falsy result from execute() skips both the success message and
            # the commit.
            if cursor.execute(sql1, params):
                print('[INFO]插入MySQL成功')
                db.commit()
        except Error as e:
            print(e)
            print('Failed')
            db.rollback()
# Script entry point: run a single crawl when the file is executed directly.
if __name__ == "__main__":
    main().load_page()
# 备注：
# 保存到数据库时遇到了单位为中文的情况；才疏学浅，为此查了许多资料。
# json.loads() 报错问题：原因暂未知悉；
# 改用 from lxml import etree，再执行 html = etree.HTML(html) 后得以解决。
# 可以用
#   UPDATE tb_name set `fb`=REPLACE(`fb`,'.','');
# 进行替换。