目标
亚马逊公司(Amazon),是美国最大的一家网络电子商务公司,位于华盛顿州的西雅图,是网络上最早开始经营电子商务的公司之一,现在已成为全球商品品种最多的网上零售商和全球第二大互联网企业。
本次目标是爬取亚马逊所有家具种类销售排行榜前100名的商品排名信息。
首先,把所有家具种类及其对应的种类页面链接爬取出来。
代码
import requests
from lxml import etree
import pandas as pd
import time
import re
from pandas import DataFrame
def gethtml(url0, head, max_attempts=5, timeout=(10, 20)):
    """Fetch ``url0`` with an HTTP GET, retrying on failures.

    Two retry layers are used:

    * A request that raises ``requests.exceptions.RequestException``
      (timeouts, connection errors, ...) is retried after a 1 second pause,
      up to ``max_attempts`` times in total.
    * A response whose status code is not 200 is re-requested after a
      growing pause (20 s, 25 s, ... capped at 45 s) until a 200 response
      arrives. This inner loop has no upper bound on the number of retries.

    Args:
        url0: URL to request.
        head: Mapping of HTTP headers passed to ``requests.get``.
        max_attempts: How many times a request that raises may be tried.
        timeout: ``(connect, read)`` timeout passed to ``requests.get``.

    Returns:
        The ``requests`` response object whose status code is 200.

    Raises:
        Exception: If every attempt ended in a ``RequestException``.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            html = requests.get(url=url0, headers=head, timeout=timeout)
            repeat = 0
            while html.status_code != 200:  # retry on an error status code
                print('error: ', html.status_code)
                # Back off a little longer after each bad response.
                time.sleep(20 + repeat * 5)
                # Only the back-off counter is capped; the request itself is
                # always re-issued, otherwise the loop would spin forever on
                # the same stale response once the cap is reached.
                if repeat < 5:
                    repeat += 1
                html = requests.get(url=url0, headers=head, timeout=timeout)
            return html
        except requests.exceptions.RequestException:
            print('超时重试次数: ', attempt)
            time.sleep(1)
    raise Exception(
        'request failed after {} attempts: {}'.format(max_attempts, url0)
    )
def get_link(url, hea):
req = gethtml(url, hea)
html = etree.HTML(req.text)
type_link0 = html.xpath('//span[@class="zg_selected"]/../following-sibling::ul//a/@href') # 排除上级
type_text = html.xpath('//*[@id="zg_browseRoot"]//span/text()')
end_link0 = html.xpath('//span[@class="zg_selected"]/../following-sibling::li[1]') # 兄弟节点(之后)
end_link1 = html.xpath(