# Amazon scraper: fetches product images and product details, writes them to an Excel workbook and embeds the images.
import os,random
import re
import time
from PIL import Image
import parsel
import requests
import xlsxwriter
import time,json
from retrying import retry
class amazon_request():
    """Crawl an amazon.de search listing and record each product in an Excel sheet.

    ``get_page`` walks the result pages, ``get_product_page`` scrapes a single
    product and fills its spreadsheet row, and ``get_product_img`` downloads
    one product image and embeds it in that row.

    NOTE: the workbook, the worksheet and the bookkeeping values below are
    *class* attributes created when the class body executes, so all instances
    share one workbook and one ``all_asin`` list.
    """

    # Site root prepended to the relative hrefs found in the listing.
    BASE_URL = 'https://www.amazon.de'
    # Seconds after which an HTTP request is abandoned.
    REQUEST_TIMEOUT = 300

    def __init__(self):
        self.proxies = {}

    # Excel (1-based) number of the last row in use; row 1 holds the headers.
    row_nums = 1
    # ASINs of the products that were scraped successfully.
    all_asin = []
    current_time = time.strftime('%Y-%m-%d')
    book = xlsxwriter.Workbook(r'top fill humidifier_germany.xlsx')
    # The worksheet is named after the date of the run.
    sheet = book.add_worksheet(current_time)
    excel_headers = ['产品首图', '产品名称', '尺寸', '品牌', '价格', '重量', '容量', '产品图片']
    # Header row: one caption per column A..H.
    for col_letter, caption in zip('ABCDEFGH', excel_headers):
        sheet.write(col_letter + '1', caption)
    img_width = 120
    img_height = 160
    cell_width = 20
    cell_height = 160
    sheet.set_column('H:H', cell_width * 20)
    sheet.set_column('A:A', 20)

    @staticmethod
    def _extract_asin(url):
        """Return the path segment that follows ``/dp/`` in *url*, or None."""
        # Escapes of the form %<digits><letter> (e.g. %2F) become '/', so a
        # percent-encoded product path is matched as well.
        normalised = re.sub(r'\%[0-9]+[A-z]', '/', url)
        match = re.search(r'/dp/([^/?]+)', normalised)
        return match.group(1) if match else None

    @staticmethod
    def _page_number(text, default):
        """Return the first ``page=<n>`` number in *text*, or *default*."""
        match = re.search(r'page=([0-9]+)', text)
        return int(match.group(1)) if match else default

    @staticmethod
    def _clean_texts(nodes):
        """Return the stripped, non-blank text fragments below *nodes*."""
        return [text.strip() for text in nodes.css('::text').extract() if text.strip()]

    @staticmethod
    def _pick_user_agent():
        """Return a one-entry header dict from a random line of USER_AGENT.txt.

        Only lines of the form ``User-Agent=<value>`` are considered.

        Raises:
            ValueError: if the file contains no such line.
        """
        with open(os.path.join(os.getcwd(), 'USER_AGENT.txt'), 'r') as f:
            lines = f.read().strip().split('\n')
        candidates = [ln for ln in lines if 'user-agent' in ln.lower() and '=' in ln]
        if not candidates:
            raise ValueError('USER_AGENT.txt contains no "User-Agent=..." line')
        # Split once only: the value itself may contain '='.
        name, value = random.choice(candidates).split('=', 1)
        return {name.lower().strip(): value.strip()}

    def get_page(self, url):
        """Scrape the listing at *url* and every following result page.

        Pages are followed until the listing offers no page with a higher
        number; the workbook is then closed. Errors while fetching a listing
        page propagate to the caller and leave the workbook open.
        """
        while True:
            next_page_url, current_page_num, next_page_num = self._scrape_listing_page(url)
            if next_page_url == 'not' or current_page_num >= next_page_num:
                break
            url = next_page_url
        print('完成......')
        self.book.close()

    def _scrape_listing_page(self, url):
        """Scrape every new product linked from one listing page.

        Returns:
            ``(next_page_url, current_page_num, next_page_num)``;
            ``next_page_url`` is ``'not'`` and ``next_page_num`` is 0 when the
            page has no pagination link.
        """
        from html import unescape

        self.headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
        }
        res = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        res.encoding = res.apparent_encoding
        selector = parsel.Selector(res.text)
        product_links = selector.xpath('//a[@class="a-link-normal a-text-normal"]/@href').extract()

        # Pagination hrefs are taken from the raw markup, where '&' is written
        # as '&amp;'; unescape them so the follow-up request has a real query.
        page_links = []
        for raw_href in re.findall(r'a href="(.*?)"', res.text):
            href = unescape(raw_href).strip()
            if re.search(r'page=[0-9]+', href) and href not in page_links:
                print(href)
                page_links.append(href)

        if page_links:
            # The last pagination link on the page is treated as "next".
            next_page_url = self.BASE_URL + page_links[-1]
            next_page_num = self._page_number(next_page_url, default=0)
        else:
            next_page_url = 'not'
            next_page_num = 0
        current_page_num = self._page_number(url, default=1)

        for link in product_links:
            print(len(product_links), link)
            link = link.strip()
            p_nurl = link if self.BASE_URL in link else self.BASE_URL + link
            asin = self._extract_asin(p_nurl)
            if asin is None:
                print('no ASIN in link, skipped:', p_nurl)
                continue
            if asin in self.all_asin:
                continue
            # set_row takes a 0-based index, so this sizes the next free row
            # before row_nums is advanced to that row's Excel number.
            self.sheet.set_row(self.row_nums, self.cell_height)
            self.row_nums += 1
            print(asin, self.row_nums)
            print('='*20+'\n'*3+p_nurl+'\n'*3+next_page_url+'\n'*3, current_page_num, next_page_num, '\n'*2)
            try:
                self.get_product_page(url=p_nurl)
            except Exception as exc:
                # One broken product must not abort the crawl; its row is left as is.
                print(f'product skipped after repeated failures: {p_nurl} ({exc!r})')
            else:
                self.all_asin.append(asin)
            time.sleep(3)

        return next_page_url, current_page_num, next_page_num

    # retrying interprets the wait_* values as milliseconds. The attempt cap
    # keeps a permanently failing product from being requested forever.
    @retry(wait_incrementing_start=10, wait_incrementing_increment=20,
           wait_exponential_max=5000, stop_max_attempt_number=5)
    def get_product_page(self, url):
        """Scrape one product page into spreadsheet row ``self.row_nums``.

        Writes title (as a link to *url*), price, brand and the size, weight
        and capacity specs when present, then downloads and embeds the images
        found in the page's image block. The call is retried up to five times;
        the last exception is re-raised when all attempts fail.

        Raises:
            ValueError: if *url* contains no ASIN or no user agent is available.
            RuntimeError: if the page has no product title.
        """
        product_all_content = {}
        self.headers1 = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        self.headers1.update(self._pick_user_agent())
        print(self.headers1)
        print(url)

        # The ASIN doubles as the name of the product's image directory.
        dir_name = self._extract_asin(url)
        if dir_name is None:
            raise ValueError('no ASIN found in product URL: ' + url)

        res = requests.get(url=url, headers=self.headers1, timeout=self.REQUEST_TIMEOUT)
        selector = parsel.Selector(res.text)

        p_price = selector.xpath('//*[@class="a-offscreen"]/text()').extract_first()
        # A page without a price is recorded with an empty price cell.
        price = p_price.strip() if p_price else ''

        pro_title = self._clean_texts(selector.xpath('//*[@id="productTitle"]'))
        if not pro_title:
            print('第二次获取产品名称')
            pro_title = self._clean_texts(selector.xpath('//*[@id="title"]'))
        if not pro_title:
            print('没有产品名称')
            raise RuntimeError('product title not found: ' + url)
        title = pro_title[0]

        detail_texts = selector.css('#productDetails_techSpec_section_1').css('::text').extract()
        print(detail_texts)
        if not detail_texts:
            detail_texts = selector.css('#detailBulletsWrapper_feature_div').css('::text').extract()

        brand_nodes = selector.xpath('//*[@class="a-normal a-spacing-micro"]')
        brand_texts = [text for text in brand_nodes.css('::text').extract() if text.strip()]
        # The brand value is the text fragment right after a "Marke" label.
        brand = [value for label, value in zip(brand_texts, brand_texts[1:])
                 if 'marke' in label.lower()]

        print(title, '\n', price)
        product_all_content.update({'产品名称': title})
        product_all_content.update({'价格': price})
        product_all_content.update({'品牌': str(brand)})

        p_detials2 = []
        for fragment in detail_texts:
            if 'P.when' in fragment or fragment.strip() == '' or 'a:hover' in fragment:
                continue
            if 'Support' in fragment:
                break
            p_detials2.append(fragment.strip())
        print('产品明细:', p_detials2)

        # Each spec label is followed by its value in the flattened text list.
        spec_columns = (
            ('Produktabmessungen', '尺寸'),
            ('Artikelgewicht', '重量'),
            ('Fassungsvermögen', '容量'),
        )
        for label, value in zip(p_detials2, p_detials2[1:]):
            for marker, column in spec_columns:
                if marker in label:
                    product_all_content.update({column: value})
                    break
        print(product_all_content)

        for col_letter, header in zip('ABCDEFGH', self.excel_headers):
            if header not in product_all_content:
                continue
            cell = f'{col_letter}{self.row_nums}'
            value = product_all_content[header]
            if header == '产品名称':
                status = self.sheet.write_url(cell, url, string=value)
                # Non-zero status: the link was rejected; keep the plain title.
                if status:
                    self.sheet.write(cell, value)
            else:
                self.sheet.write(cell, value)

        image_block = selector.css('#imageBlock_feature_div').css('::text').extract()
        img_url_list = re.findall(r'"main":{"https:(.*?)\"', str(image_block))
        for img_nums, raw_url in enumerate(img_url_list):
            if '.jpg' not in raw_url:
                continue
            print(raw_url)
            # The first 11 characters of the file name are used as the image id.
            img_id = raw_url.split('/')[-1][0:11]
            img_rurl = 'https://m.media-amazon.com/images/I/' + img_id + '._AC_SL1500_.jpg'
            print(img_rurl)
            self.get_product_img(url=img_rurl, dir_name=dir_name, img_name=img_id, img_nums=img_nums)
        print('\n')

    def get_product_img(self, url, dir_name, img_name, img_nums):
        """Download one product image (unless cached) and embed it in the sheet.

        The file is stored as ``amazon_img/<dir_name>/<img_name>.jpg`` below
        the working directory. Image 0 goes into column A; later images are
        laid out side by side in column H of row ``self.row_nums``.

        Raises:
            requests.HTTPError: if the download returns an error status.
            OSError: if the stored file cannot be opened as an image.
        """
        dir_path = os.path.join(os.getcwd(), 'amazon_img', dir_name)
        # Also creates the 'amazon_img' parent on the first run.
        os.makedirs(dir_path, exist_ok=True)
        img_path = os.path.join(dir_path, img_name + '.jpg')

        if not os.path.exists(img_path):
            res = requests.get(url=url, headers=self.headers1, timeout=self.REQUEST_TIMEOUT)
            res.raise_for_status()
            with open(img_path, 'wb') as f:
                f.write(res.content)
        else:
            print('已存在')

        # Check that the file opens as an image before embedding it; drop an
        # unreadable file so a later attempt downloads it again.
        try:
            with Image.open(img_path):
                pass
        except OSError:
            os.remove(img_path)
            raise

        if img_nums == 0:
            self.sheet.insert_image(f'A{self.row_nums}', img_path, {"x_scale": 0.1, "y_scale": 0.1, "x_offset": 20, "y_offset": 20})
        else:
            self.sheet.insert_image(f'H{self.row_nums}', img_path, {"x_scale": 0.1, "y_scale": 0.1, "x_offset": (img_nums-1)*150+10, "y_offset": 20})
# Entry point: crawl the search results for "top fill humidifier" on amazon.de.
url = 'https://www.amazon.de/s?k=top+fill+humidifier&__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&ref=nb_sb_noss_2'
a = amazon_request()
try:
    a.get_page(url=url)
except Exception as exc:
    # get_page closes the workbook itself when it finishes normally; after a
    # failure, close it here so the rows collected so far are written out.
    print(exc)
    a.book.close()