亚马逊爬虫，需要科学上网

最新推荐文章于 2024-05-21 20:55:00 发布
撞到头了
最新推荐文章于 2024-05-21 20:55:00 发布
阅读量1.1k
点赞数
文章标签：爬虫 python 开发语言
本文链接：https://blog.csdn.net/weixin_38485865/article/details/121927078
版权
Amazon 爬虫：抓取商品图片和产品信息，写入 Excel 并在表格中插入图片。

import os,random
import re
import time
from PIL import Image
import parsel
import requests
import xlsxwriter
import time,json
from retrying import retry



def _is_transient_error(exception):
    """Tell the ``retry`` decorator whether a failed product fetch should be retried.

    Network-level failures (``requests.RequestException``) and the
    ``RuntimeError`` raised when a page carries no product title are retried.
    Any other exception (for example an unreadable USER_AGENT.txt or a URL
    without an ASIN) fails the same way on every attempt; retrying it would
    loop forever because the decorator has no stop condition.
    """
    return isinstance(exception,(requests.RequestException,RuntimeError))


class amazon_request():
    """Crawl amazon.de search results and export each product to an Excel sheet.

    ``get_page`` walks the paginated result list, ``get_product_page`` scrapes
    one product and fills its worksheet row, and ``get_product_img`` downloads
    the product images and embeds them in that row.
    """

    def __init__(self):
        self.proxies = {}

    # NOTE: everything below is class-level state. It is created when the class
    # body runs and is shared by every instance (workbook, sheet, seen ASINs).

    # Excel row (1-based) of the product currently being written; row 1 holds
    # the headers. Before the increment in get_page the same number is the
    # 0-based index of the next free row, which is what set_row expects.
    row_nums = 1
    # ASINs already exported, used to skip duplicate search results.
    all_asin = []
    current_time = time.strftime('%Y-%m-%d')
    book = xlsxwriter.Workbook(r'top fill humidifier_germany.xlsx')
    sheet = book.add_worksheet(current_time)
    excel_headers = ['产品首图','产品名称','尺寸','品牌','价格','重量','容量','产品图片']
    # Header row: one column letter (A..H) per entry of excel_headers.
    for i,z in zip(range(ord('A'),ord('H')+1),excel_headers):
        sheet.write(chr(i)+'1',z)

    # Layout constants. img_width, img_height and cell_width*20 are only used
    # for the column/row sizing below within this class.
    img_width = 120
    img_height = 160
    cell_width = 20
    cell_height = 160
    sheet.set_column('H:H',cell_width*20)
    sheet.set_column('A:A',20)

    @staticmethod
    def _extract_asin(url):
        """Return the path segment following ``/dp/`` in *url*, or None if absent."""
        # Search-result links may embed a percent-encoded product path;
        # turning the escapes into '/' exposes the '/dp/<asin>/' segment.
        url_reset = re.sub(r'\%[0-9]+[A-z]','/',url)
        # The last pattern also covers URLs that end right after the ASIN.
        for pattern in (r'/dp/(.*?)/',r'/dp/(.*?)\?',r'/dp/([^/?#]+)'):
            found = re.findall(pattern,url_reset)
            if found and found[0]:
                return found[0]
        return None

    def _pick_user_agent_header(self):
        """Pick a random ``user-agent=<value>`` line from USER_AGENT.txt.

        Returns:
            A ``(header_name, header_value)`` tuple; the name is lower-cased.

        Raises:
            OSError: USER_AGENT.txt cannot be opened in the working directory.
            ValueError: the file has no line of the form ``user-agent=<value>``.
        """
        with open(os.path.join(os.getcwd(),'USER_AGENT.txt'),'r')as f:
            agent_list = f.read().strip().split('\n')
        # Filter first instead of re-drawing until a match turns up, which
        # never terminates when the file has no usable line.
        candidates = [line for line in agent_list if 'user-agent' in line.lower() and '=' in line]
        if not candidates:
            raise ValueError('USER_AGENT.txt 中没有可用的 user-agent 行')
        # Split on the first '=' only so a value containing '=' is kept whole.
        name,_,value = random.choice(candidates).partition('=')
        return name.lower().strip(),value.strip()

    def get_page(self,url):
        """Crawl the search-result page at *url* and every following page.

        Each product link whose ASIN has not been exported yet gets a
        worksheet row and is handed to ``get_product_page``. The workbook is
        closed once the current page number is not below the number found in
        the last ``page=`` link of the page.

        Args:
            url: URL of the first search-result page to crawl.
        """
        self.headers = {
            'Accept':'*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
        }
        base_url = 'https://www.amazon.de'
        # Iterate instead of recursing once per result page.
        while True:
            res = requests.get(url=url,headers=self.headers,timeout=300)
            res.encoding = res.apparent_encoding
            selector = parsel.Selector(res.text)
            p_url = selector.xpath('//a[@class="a-link-normal a-text-normal"]/@href').extract()

            # Collect every distinct link that carries a page number.
            page_links = []
            for href in re.findall(r'a href="(.*?)"',res.text):
                # The href comes from raw HTML, so '&' may still be written as
                # '&amp;'; requesting that verbatim would corrupt the query.
                href = href.strip().replace('&amp;','&')
                if 'page=' in href and href not in page_links:
                    print(href)
                    page_links.append(href)

            next_page_url = 'not'
            next_page_num = 0
            if page_links:
                candidate = page_links[-1]
                if not candidate.startswith('http'):
                    candidate = base_url+candidate
                page_match = re.search(r'page=([0-9]+)',candidate)
                if page_match:
                    next_page_url = candidate
                    next_page_num = int(page_match.group(1))

            current_match = re.search(r'page=([0-9]+)',url)
            current_page_num = int(current_match.group(1)) if current_match else 1

            for link in p_url:
                print(len(p_url),link)
                link = link.strip()
                if base_url not in link:
                    p_nurl = base_url+link
                else:
                    p_nurl = link
                p_asin = self._extract_asin(p_nurl)
                if p_asin is None:
                    print('无法解析 ASIN，跳过：',p_nurl)
                    continue
                # Compare the ASIN string itself; comparing the findall() list
                # never matched, so duplicates were exported again.
                if p_asin in self.all_asin:
                    continue
                self.sheet.set_row(self.row_nums,self.cell_height)
                self.row_nums+=1
                print(p_asin,self.row_nums)
                print('='*20+'\n'*3+p_nurl+'\n'*3+next_page_url+'\n'*3,current_page_num,next_page_num,'\n'*2)
                self.get_product_page(url=p_nurl)
                self.all_asin.append(p_asin)
                time.sleep(3)

            # Without a next link next_page_num is 0, so this also ends the crawl.
            if current_page_num >= next_page_num:
                print('完成......')
                self.book.close()
                return
            url = next_page_url

    # NOTE(review): there is still no stop condition, so a page that never
    # yields a title is retried indefinitely.
    @retry(retry_on_exception=_is_transient_error,wait_incrementing_start=10,wait_incrementing_increment=20,wait_exponential_max=5000)
    def get_product_page(self,url):
        """Scrape one product page and write it into row ``self.row_nums``.

        Title, price, brand and selected technical details are written to
        their header columns; every product image is passed to
        ``get_product_img``.

        Args:
            url: absolute URL of the product page (must contain ``/dp/<asin>``).

        Raises:
            ValueError: no ASIN in *url*, or no usable line in USER_AGENT.txt.
            OSError: USER_AGENT.txt cannot be read.
            RuntimeError: the page has no product title (retried by the decorator).
        """
        product_all_content = {}
        self.headers1 = {
            'Accept':'*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        header_name,header_value = self._pick_user_agent_header()
        self.headers1[header_name] = header_value
        print(self.headers1)
        print(url)
        # The ASIN doubles as the name of the image directory.
        dir_name = self._extract_asin(url)
        if dir_name is None:
            raise ValueError('URL 中没有 /dp/<ASIN>：'+url)
        res = requests.get(url=url,headers=self.headers1,timeout=300)

        html = parsel.Selector(res.text)
        # A page may carry no price element; export an empty price instead of
        # failing on None.strip().
        p_price = (html.xpath('//*[@class="a-offscreen"]/text()').extract_first() or '').strip()

        pro_title = [t.strip() for t in html.xpath('//*[@id="productTitle"]').css('::text').extract() if t.strip()]
        if not pro_title:
            print('第二次获取产品名称')
            pro_title = [t.strip() for t in html.xpath('//*[@id="title"]').css('::text').extract() if t.strip()]
            if not pro_title:
                print('没有产品名称')
                # Explicit exception instead of a bare `raise`, which has no
                # active exception to re-raise here.
                raise RuntimeError('没有产品名称')

        p_detials1 = html.css('#productDetails_techSpec_section_1').css('::text').extract()
        print(p_detials1)
        if p_detials1 == []:
            p_detials1 = html.css('#detailBulletsWrapper_feature_div').css('::text').extract()

        r_brand = [t for t in html.xpath('//*[@class="a-normal a-spacing-micro"]').css('::text').extract() if t.strip() != '']
        # The text node after a 'Marke' label is taken as the brand; pairing
        # with zip cannot run past the end of the list.
        brand = [value for label,value in zip(r_brand,r_brand[1:]) if 'marke' in label.lower()]

        print(pro_title[0],'\n',p_price)
        product_all_content.update({'产品名称':pro_title[0]})
        product_all_content.update({'价格':p_price})
        product_all_content.update({'品牌':str(brand)})

        p_detials2 = []
        for x in p_detials1:
            if 'P.when' in x or x.strip() == ''or 'a:hover' in x:
                continue
            if 'Support' in x:
                break
            p_detials2.append(x.strip())
        print('产品明细：',p_detials2)
        # German detail label -> worksheet header; the value is the next entry.
        detail_columns = (
            ('Produktabmessungen','尺寸'),
            ('Artikelgewicht','重量'),
            ('Fassungsvermögen','容量'),
        )
        for label,value in zip(p_detials2,p_detials2[1:]):
            for keyword,column in detail_columns:
                if keyword in label:
                    product_all_content[column] = value
                    break
        print(product_all_content)

        for col_code,header in zip(range(ord('A'),ord('H')+1),self.excel_headers):
            if header not in product_all_content:
                continue
            cell = chr(col_code)+str(self.row_nums)
            value = product_all_content[header]
            # The title cell is a hyperlink to the product page. Fall back to
            # plain text only when write_url reports a non-zero status, so
            # the name is never lost and the link is not overwritten.
            if header == '产品名称' and self.sheet.write_url(cell,url,string=value) == 0:
                continue
            self.sheet.write(cell,value)

        p_img_url_list = html.css('#imageBlock_feature_div').css('::text').extract()
        img_url_list = re.findall(r'"main":{"https:(.*?)\"',str(p_img_url_list))

        for i,img_url in enumerate(img_url_list):
            if '.jpg'not in img_url:
                continue
            print(img_url)
            # The first 11 characters of the file name are used as the image id.
            x = img_url.split('/')[-1][0:11]
            img_rurl = 'https://m.media-amazon.com/images/I/'+x+'._AC_SL1500_.jpg'
            print(img_rurl)
            self.get_product_img(url=img_rurl,dir_name=dir_name,img_name=x,img_nums=i)
        print('\n')

    def get_product_img(self,url,dir_name,img_name,img_nums):
        """Download one product image (unless cached) and embed it in the sheet.

        The file is stored as ``amazon_img/<dir_name>/<img_name>.jpg`` below
        the working directory. Image 0 goes into column A of the current row,
        the others are laid out side by side in column H. An image that cannot
        be downloaded or opened is skipped.

        Args:
            url: download URL of the image.
            dir_name: sub-directory name (the product ASIN).
            img_name: file name without extension.
            img_nums: index of the image within the product's image list.
        """
        dir_path = os.path.join(os.getcwd(),'amazon_img',dir_name)
        # makedirs also creates the parent 'amazon_img' directory on first use.
        os.makedirs(dir_path,exist_ok=True)
        img_path = os.path.join(dir_path,img_name+'.jpg')
        if not os.path.exists(img_path):
            try:
                res = requests.get(url=url,headers=getattr(self,'headers1',None),timeout=300)
            except requests.RequestException as exc:
                print('图片下载失败，跳过：',url,exc)
                return
            if not res.ok:
                # Do not store an HTTP error body under a .jpg name.
                print('图片下载失败，跳过：',url,res.status_code)
                return
            with open(img_path,'wb')as f:
                f.write(res.content)
        else:
            print('已存在')
        # Make sure the file opens as an image before referencing it from the
        # workbook; the context manager closes the file handle again.
        try:
            with Image.open(img_path):
                pass
        except OSError as exc:
            print('图片文件无效，跳过：',img_path,exc)
            os.remove(img_path)
            return
        if img_nums == 0:
            self.sheet.insert_image(f'A{self.row_nums}',img_path,{"x_scale":0.1,"y_scale":0.1,"x_offset":20,"y_offset":20})
        else:
            self.sheet.insert_image(f'H{self.row_nums}',img_path,{"x_scale":0.1,"y_scale":0.1,"x_offset":(img_nums-1)*150+10,"y_offset":20})
        

# Search-result page the crawl starts from (amazon.de, query "top fill humidifier").
url = 'https://www.amazon.de/s?k=top+fill+humidifier&__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&ref=nb_sb_noss_2'

a = amazon_request()
try:
    a.get_page(url=url)
except Exception as e:
    # get_page only closes the workbook when the crawl finishes normally;
    # close it here so the rows collected before the failure are saved.
    print(e)
    a.book.close()
except KeyboardInterrupt:
    # Ctrl-C is not an Exception subclass: without this branch an interrupted
    # crawl would exit without ever closing (i.e. writing) the workbook.
    a.book.close()
    raise