目标:爬取华为商城下的商品信息
- 按主页的左边手机,笔记本&平板,智能穿戴……分类
- 每一个分类下的小分类
- 规格参数
- 写入excel
- 设置好excel数据表,分析数据
代码如下(scrapy):
import os
import re
import urllib.request
from copy import deepcopy
import scrapy
import xlrd
import xlwt
from ..items import HuaweiItem
class HuaWei(scrapy.Spider):
    """Spider for the Huawei Vmall store (www.vmall.com).

    Walks the home-page category tree (major category -> sub-category),
    follows each sub-category's paginated product list, and scrapes every
    product detail page into a ``HuaweiItem``. Already-scraped products are
    skipped by checking their product code against column 0 of the local
    workbook ``华为商城.xls`` (created by :meth:`new_xls`; rows are appended
    by the item pipeline — not visible in this file).
    """

    name = 'huawei'
    allowed_domains = ['vmall.com', 'vmallres.com']
    start_urls = ['http://vmall.com/']

    def parse(self, response):
        """Parse the home page.

        Ensures the Excel workbook exists, then yields one ``parse_A``
        request per (major category, sub-category) list page.
        """
        self.new_xls()
        print("分割线-----------------------主页------------------------分割线")
        classify_list_A = response.xpath('//div[@id="category-block"]/div/ol/li')
        print("大分类长度:", len(classify_list_A))
        # NOTE: the original code reused the loop variable ``i`` for both
        # levels; distinct names avoid the shadowing.
        for major in classify_list_A:
            item = HuaweiItem()
            item['classify_A'] = major.xpath('.//input[2]/@value').extract_first()
            # Sub-categories, excluding the "expand" button entry.
            classify_list = major.xpath('.//div[2]//li[not(@class="subcate-btn")]')
            for sub in classify_list:
                item['classify_B'] = sub.xpath('.//input[1]/@value').extract_first()
                # '-1-3-0' = page 1 with the site's default sort/filter.
                href = "https://www.vmall.com" + str(sub.xpath('.//a/@href').extract_first()) + '-1-3-0'
                yield scrapy.Request(
                    href,
                    callback=self.parse_A,
                    meta={"item": deepcopy(item)}
                )
        rb = xlrd.open_workbook('华为商城.xls')
        rs = rb.sheet_by_index(0)
        # Row 0 is the header, hence the -1.
        print("已爬取的商品数量:", rs.nrows - 1)

    def parse_A(self, response):
        """Parse one product-list page.

        Yields a ``parse_B`` request for every product whose code is not yet
        recorded in the workbook, then follows the next pagination link.
        """
        print("分割线-----------------------中间页------------------------分割线")
        li_list = response.xpath(
            '//div[@class="layout"]/div[@class="channel-list"]/div[@class="pro-list clearfix"]/ul/li')
        if li_list:
            print("正在爬取页面链接:", response.request.url)
            print("此页面商品数量:", len(li_list))
            # Load the already-scraped product codes once per page (the
            # original re-opened the workbook inside the loop).
            rb = xlrd.open_workbook('华为商城.xls')
            rs = rb.sheet_by_index(0)
            cods = rs.col_values(0, start_rowx=0, end_rowx=None)
            for li in li_list:
                item = response.meta["item"]
                item['title'] = li.xpath('./div[1]/p[2]/a/span[1]/text()').extract_first()
                price_node = li.xpath('./div[1]/p[3]/b/text()')
                # Price text looks like "¥1999"; fall back to 0 when absent.
                item['price'] = round(
                    float(price_node.extract_first().split("¥")[1]) if price_node else 0, 2)
                # "N人" comment counter; default to "0人" when missing.
                comments_text = li.xpath(
                    './div[1]/div[@class="p-button clearfix"]//label//text()').extract_first() or "0人"
                item['comments'] = int(comments_text.split("人")[0])
                item['img'] = li.xpath('./div[1]/p[1]/a/img/@src').extract_first()
                item['href'] = "https://www.vmall.com" + li.xpath('./div[1]/p[1]/a/@href').extract_first()
                # onclick carries the product code as ('...'); keep the
                # findall list to preserve the stored field's shape.
                item['coding'] = re.findall(r"[(]'(.*?)'[)]",
                                            li.xpath('./div[1]/p[1]/a/@onclick').extract_first() or "")
                # Skip products already present in the workbook.
                if item['coding'] and item['coding'][0] not in cods:
                    yield scrapy.Request(
                        item['href'],
                        callback=self.parse_B,
                        meta={"item": deepcopy(item)}
                    )
            # Pagination: URL ends in "-<page>-3-0"; advance while more
            # page anchors exist than the current page number.
            next_url_len = len(response.xpath('//ul[@id="page_ul"]/a'))
            url_parts = response.request.url.split("-")
            if int(url_parts[2]) < next_url_len:
                href = url_parts[0] + "-" + url_parts[1] + "-" + str(int(url_parts[2]) + 1) + '-3-0'
                print("next_href:", href)
                yield scrapy.Request(
                    href,
                    callback=self.parse_A,
                    meta={"item": deepcopy(item)}
                )

    def parse_B(self, response):
        """Parse a product detail page.

        Rich pages (with a ``product-property-recommand`` block) keep part of
        the promotion text in an external "ec"-namespace script, so a
        follow-up request to :meth:`get_cu_1` finishes the item; plain pages
        yield the item directly.
        """
        print("分割线-----------------------详情页------------------------分割线")
        item = response.meta["item"]
        print("现在位置%s/%s" % (item["classify_A"], item["classify_B"]))
        print("正在爬取:", item['title'])
        content = response.xpath('//div[@id="product-property-recommand"]')
        if content:
            item['promotion'] = self.get_cx(response)
            item['coding'] = content.xpath(
                './div[@class="product-description clearfix"]/div[@class="fl"]/text()').extract_first().strip()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            server_explain = content.xpath(
                './/div[@id="product-pulldown1"]/div[1]/div[@class="product-description-list clearfix"]/ul/li')
            item['server_explain'] = self.get_cm(server_explain)
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            # The second ec-namespace script holds the remaining promotion
            # text; fetched with dont_filter because the URL repeats.
            cu_1 = re.findall(r'<script src="(.*?)" namespace="ec"></script>', response.text)[1]
            yield scrapy.Request(
                cu_1,
                callback=self.get_cu_1,
                meta={"item": deepcopy(item)},
                dont_filter=True
            )
        else:
            content = response.xpath('//div[@class="pro-meta-area"]')
            item['content'] = content.xpath('.//h1[@id="pro-name"]/text()').extract_first()
            item['explain'] = content.xpath('.//div[@id="skuPromWord"]//span/text()').extract_first()
            item['server_explain'] = content.xpath('.//div[@class="pro-service"]/text()').extract_first()
            item['promotion'] = "暂无活动"
            yield item

    def get_cx(self, response):
        """获取促销数据 — extract promotion text from the page's inline JS.

        Fragments alternate name/value, so the result is joined as
        ``"name:value;name:value;…"``; returns ``''`` when nothing matches.
        (Renamed the local that shadowed the builtin ``str``.)
        """
        print("获取促销")
        result = ""
        cu = re.findall(
            r'_groupPhotoList.push[(]{name:.*?}[)]; (_promotionsList+.*?); _prolongLst.push',
            response.text)
        if cu:
            try:
                cs = re.findall(r'"(.*?)"', cu[1])
            except IndexError:
                # Only one promotions list present on this page.
                cs = re.findall(r'"(.*?)"', cu[0])
            print(cu)
            print(len(cu))
            decoded = []
            for fragment in cs:
                # JS escapes '/' as '\/'; undo it. (The original called the
                # no-op replace('/', '/'); this restores the evident intent.)
                fragment = fragment.replace('\\/', '/')
                if fragment.find('&#x') == -1:
                    # Fragments without HTML-escaped text are dropped
                    # (the original collected their indices and popped them).
                    continue
                fragment = (fragment.replace("&#x", "\\u")
                            .replace(";", "")
                            .replace("\n", "")
                            .replace("\t", "")
                            .replace(" ", ""))
                # Turn the \uXXXX escapes into real characters.
                decoded.append(fragment.encode().decode('unicode-escape'))
            for ins, fragment in enumerate(decoded, start=1):
                result += fragment
                # BUGFIX: was ``ins % 2 is 0`` — identity check on an int.
                result += ";" if ins % 2 == 0 else ":"
        return result

    def get_cu_1(self, response):
        """Parse the external ec script for the trailing promotion entry and
        yield the finished item."""
        print("进入GET_CU_1")
        item = response.meta["item"]
        print(item)
        cu1 = re.findall(r' \\x3e\'[)],a.push[(](.*?")[)],', response.text)[0]
        # \x3e / \x3c are '>' / '<'; decode the hex-escaped label and value.
        label = re.findall(r'\\x3e(.*?)\\x3c', cu1)[0].encode().decode('unicode-escape')
        value = re.findall(r'a.push[(]"(.*?)"', cu1)[0].encode().decode('unicode-escape')
        print("--------------------------str----------------------------------")
        item['promotion'] += label + ":" + value + ";"
        # BUGFIX: was ``item['promotion'] is ''`` — identity check on a str.
        if item['promotion'] == '':
            item['promotion'] = "暂无活动"
        yield item

    def new_xls(self):
        """创建表格 — create 华为商城.xls with a header row if absent."""
        if not os.path.exists("华为商城.xls"):
            print("正在创建。。。")
            wb = xlwt.Workbook(encoding='utf-8')
            ws = wb.add_sheet('商品数据')
            headers = ['商品编码', '祖分类', '父分类', '标题', '图片', '链接',
                       '价格', '评价数量', '内容', '说明', '服务说明', '促销']
            for col, header in enumerate(headers):
                ws.write(0, col, label=header)
            wb.save('华为商城.xls')

    def get_cm(self, server_explain):
        """Flatten the service-explanation ``<li>`` nodes into one string.

        Single-text nodes are joined with ';'; nodes wrapping a ``<span>``
        have the span text spliced in after the first ``data='…'`` fragment.
        """
        cm = ""
        for node in server_explain:
            text = node.xpath('./text()')
            if len(text) > 1:
                span_text = node.xpath('./span/text()')
                if span_text:
                    mm = ""
                    fragments = re.findall(r'data=\'(.+?)\'>', str(text))
                    # BUGFIX: was ``fragments.index(k) == 0`` which misfires
                    # on duplicate fragments; enumerate gives the real index.
                    for pos, fragment in enumerate(fragments):
                        mm += fragment
                        if pos == 0:
                            mm += span_text.extract_first()
                    cm += mm
            else:
                cm += str(text.extract_first()) + ';'
        return cm
git地址:(链接待补充)