一、思路
利用 requests 模块发送 HTTP 请求，然后用 XPath 从响应内容中提取出想要的数据，最后将数据保存到 MySQL 数据库。
1、数据提取
利用 XPath 从网页的 HTML 中提取数据
def parse(text):
    """Extract one record per commodity from a listing page and pass them to ``save``.

    Every ``div.item`` under ``#J_waterfallWrapper`` yields a dict with the keys
    ``commodity_url``, ``commodity_title``, ``price``, ``shop_name``, ``truth``,
    ``service`` and ``speed``. All values are strings; a field whose XPath
    matches nothing is the empty string.

    Args:
        text: HTML source of the listing page.
    """
    html = etree.HTML(text)
    div_list = html.xpath('//div[@id="J_waterfallWrapper"]/div[@class="item"]')
    result = []
    for div in div_list:
        commodity_url = ''.join(div.xpath('./a/@href'))
        commodity_title = ''.join(div.xpath('./a/div[@class="info"]/span/@title'))
        price = ''.join(div.xpath('./a/div[@class="info"]/p[@class="price"]/span/strong/text()'))
        shop_name = ''.join(div.xpath('./a/div[@class="info"]/p[@class="shopName"]/span[1]/text()'))
        li_list = div.xpath('./a/div[@class="info"]/div[@class="moreInfo"]/div[@class="dsr-info"]/ul/li')
        # The three scores are positional (truth, service, speed). Pad with ''
        # so an item with fewer than three entries does not raise IndexError.
        scores = [''.join(li.xpath('./span[@class="morethan"]/b/text()')) for li in li_list[:3]]
        scores += [''] * (3 - len(scores))
        truth, service, speed = scores
        commodity = {
            'commodity_url': commodity_url,
            'commodity_title': commodity_title,
            'price': price,
            'shop_name': shop_name,
            'truth': truth,
            'service': service,
            'speed': speed,
        }
        result.append(commodity)
    save(result)
2、数据保存
将数据保存到MySQL数据库中
def save(self):
    """Insert every record in ``self.result`` into ``video_rank``, then close the connection.

    Each record must provide the keys ``number``, ``title``, ``video_url``,
    ``watch_number``, ``bullet_number``, ``up_name`` and ``score``. The commit
    happens once, after all inserts succeeded. The connection is closed even
    when an insert raises, so the object cannot be reused for a second save.
    """
    sql = "insert into video_rank (id,number,title,video_url,watch_number,bullet_number,up_name,score) values (null,%s,%s,%s,%s,%s,%s,%s)"
    try:
        # The with-block closes the cursor on both the success and error paths.
        with self.connect.cursor() as cursor:
            for item in self.result:
                cursor.execute(sql, (item['number'], item['title'], item['video_url'], item['watch_number'],
                                     item['bullet_number'], item['up_name'], item['score']))
                print(item)
        self.connect.commit()
    finally:
        self.connect.close()
二、结果
三、源代码
import pymysql
import requests
from lxml import etree
class Rank:
    """Scrape the bilibili popular-rank page and store every entry in MySQL.

    Usage is one-shot: ``Rank().run()`` fetches the page, parses it and writes
    the rows. ``save`` closes the database connection when it finishes.
    """

    def __init__(self, timeout=10):
        """Prepare the target URL, the request headers and the MySQL connection.

        Args:
            timeout: Seconds to wait for the HTTP response before giving up.
        """
        self.result = []  # rows collected by parse() and written by save()
        self.timeout = timeout
        self.url = "https://www.bilibili.com/v/popular/rank/all"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.58"}
        self.connect = pymysql.connect(host="127.0.0.1", user="root", password="root", database="bilibili",
                                       charset="utf8mb4")

    def run(self):
        """Fetch the rank page, parse it and persist the rows.

        Raises:
            requests.RequestException: On a timeout, a connection failure or a
                non-success HTTP status.
        """
        try:
            response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout)
            # Fail loudly on an error page instead of silently storing nothing.
            response.raise_for_status()
            self.parse(response.text)
        except Exception:
            # save() normally closes the connection; it will not run on this
            # path, so release the connection before propagating the error.
            self.connect.close()
            raise
        self.save()

    @staticmethod
    def _joined(node, path):
        """Return the concatenation of every string the XPath ``path`` yields on ``node``."""
        return ''.join(node.xpath(path))

    @staticmethod
    def _strip_protocol_relative(href):
        """Drop the leading ``//`` of a protocol-relative href; leave any other href untouched."""
        return href[2:] if href.startswith("//") else href

    def parse(self, text):
        """Append one dict per ``ul.rank-list > li`` element to ``self.result``.

        Each dict has the keys ``number``, ``title``, ``video_url``,
        ``watch_number``, ``bullet_number``, ``up_name`` and ``score``. All
        values are strings; a field whose XPath matches nothing is ''.

        Args:
            text: HTML source of the rank page.
        """
        html = etree.HTML(text)
        for li in html.xpath('//ul[@class="rank-list"]/li'):
            href = self._joined(li, './div[@class="content"]/div[@class="info"]/a/@href')
            video = {
                'number': self._joined(li, './div[@class="num"]/text()'),
                'title': self._joined(li, './div[@class="content"]/div[@class="info"]/a/text()'),
                # Strip only the leading '//' so an absolute 'https://...' href stays valid.
                'video_url': self._strip_protocol_relative(href),
                'watch_number': self._joined(
                    li, './div[@class="content"]/div[@class="info"]/div[@class="detail"]/span[1]/text()').strip(),
                'bullet_number': self._joined(
                    li, './div[@class="content"]/div[@class="info"]/div[@class="detail"]/span[2]/text()').strip(),
                'up_name': self._joined(
                    li, './div[@class="content"]/div[@class="info"]/div[@class="detail"]/a/span/text()').strip(),
                'score': self._joined(li, './div[@class="content"]/div[@class="info"]/div[@class="pts"]/div/text()'),
            }
            self.result.append(video)

    def save(self):
        """Insert every row of ``self.result`` into ``video_rank``, then close the connection.

        The commit happens once, after all inserts succeeded. The connection is
        closed even when an insert raises.
        """
        sql = "insert into video_rank (id,number,title,video_url,watch_number,bullet_number,up_name,score) values (null,%s,%s,%s,%s,%s,%s,%s)"
        try:
            # The with-block closes the cursor on both the success and error paths.
            with self.connect.cursor() as cursor:
                for item in self.result:
                    cursor.execute(sql, (item['number'], item['title'], item['video_url'], item['watch_number'],
                                         item['bullet_number'], item['up_name'], item['score']))
                    print(item)
            self.connect.commit()
        finally:
            self.connect.close()
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    Rank().run()