1. 实现功能
爬取网站中的“店铺名称”“描述”“价格”“销量”,并存储到表格中。
主要是练习使用 workbook
2. 目标网站
3. 代码实现
# -*- coding: utf-8 -*-
#@Time : 2021-05-07 08:06
#@Author : Guo
#@File : main.py
#@Software : PyCharm
import requests
import random
from lxml import etree
import re
from time import sleep
from openpyxl import workbook # 写入Excel表所用
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

# --- Scraper configuration (values unchanged from the original script) ---
PAGE_COUNT = 15            # number of result pages to request
REQUEST_TIMEOUT = 10       # seconds; without a timeout requests can block forever
PROXIES = {"https": "103.99.77.254:3128"}
FIRST_PAGE_URL = "https://liaoning.zbj.com/search/f/?type=new&kw=%E6%95%B0%E6%8D%AE%E9%87%87%E9%9B%86&fr=newpdy.it.20.8.04"
# The follow-up pages use a differently encoded keyword plus a `start` offset.
NEXT_PAGE_URL = "https://liaoning.zbj.com/search/f/?type=new&kw=%25E6%2595%25B0%25E6%258D%25AE%25E9%2587%2587%25E9%259B%2586&fr=newpdy.it.20.8.04&needHtml=1&start={}"
LISTING_XPATH = "/html/body/div[6]/div/div/div[2]/div[6]/div[1]/div"
OUTPUT_FILE = '猪八戒.xlsx'


def build_page_url(page_index: int) -> str:
    """Return the search URL for the zero-based ``page_index``.

    Page 0 uses the plain search URL; every later page uses the paginated
    URL with ``start = -1 + page_index * 70`` (offset formula kept as-is
    from the original script).
    """
    if page_index == 0:
        return FIRST_PAGE_URL
    return NEXT_PAGE_URL.format(-1 + page_index * 70)


def clean_price(texts) -> str:
    """Return the first price text without the currency sign.

    ``texts`` is the list of text nodes found for the price (e.g.
    ``['¥1000']``).  An empty list is reported as ``'0'``.
    """
    raw = texts[0] if texts else '0'
    return raw.strip("¥")


def extract_sale_count(texts) -> str:
    """Return the first run of digits found in ``texts`` as a string.

    ``texts`` is the list of text nodes found for the sales figure (e.g.
    ``['近半年成交:23笔']`` -> ``'23'``).  ``'0'`` is returned when the list
    is empty or none of the texts contains a digit.

    Each text is searched directly.  Searching ``str(texts)`` instead would
    scan the list's repr, where non-printable characters are shown as
    escapes such as ``\\xa0`` / ``\\u3000`` whose digits would be matched
    before the real number.
    """
    for text in texts:
        match = re.search(r"\d+", text)
        if match:
            return match.group()
    return '0'


def parse_listing(div):
    """Extract ``(shop name, description, sales count, price)`` from one listing node."""
    # Price, e.g. "¥1000" -> "1000"
    price = clean_price(div.xpath('./div/div/a[1]/div[2]/div[1]/span[1]/text()'))
    # Sales figure, e.g. "近半年成交:23笔" -> "23"
    sale_num = extract_sale_count(div.xpath('./div/div/a/div[2]/div[1]/span[2]/text()'))
    # Description: all text fragments of the paragraph glued together
    desc = "".join(div.xpath('./div/div/a/div[2]/div[2]/p//text()'))
    # Shop name, with the original placeholder when it is missing
    name_texts = div.xpath('./div/div/a[2]/div[1]/p/text()')
    name = name_texts[0] if name_texts else 'NUll'
    return name, desc, sale_num, price


def scrape_page(page_index: int) -> list:
    """Download one result page and return a list of row tuples for its listings.

    Network errors (including the timeout) are raised by ``requests`` and
    are not caught here.
    """
    page_text = requests.get(
        url=build_page_url(page_index),
        headers=headers,
        proxies=PROXIES,
        timeout=REQUEST_TIMEOUT,
    ).text
    tree = etree.HTML(page_text)
    return [parse_listing(div) for div in tree.xpath(LISTING_XPATH)]


def save_rows(rows, filename: str) -> None:
    """Write the header line and all ``rows`` to a new Excel workbook at ``filename``."""
    wb = workbook.Workbook()   # new Excel workbook
    ws = wb.active             # the sheet that is active by default
    # Header row first, then one row per listing.
    ws.append(['店铺', '描述', '销量', '价格'])
    for name, desc, sale_num, price in rows:
        ws.append([name, desc, sale_num, price])
    wb.save(filename)


def main() -> None:
    """Scrape every configured page, pausing between pages, then save the workbook."""
    rows = []
    for page_index in range(PAGE_COUNT):
        rows.extend(scrape_page(page_index))
        print("第{}页爬取完成.....".format(page_index + 1))
        # Random pause of 0-5 seconds between pages.
        sleep(random.randint(0, 5))
    save_rows(rows, OUTPUT_FILE)


# Running the file as a script behaves as before; importing it no longer
# starts a scrape.
if __name__ == "__main__":
    main()