ebay_展示图片抓取

第一步: 保存图片url

import logging
import random
import threading
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import  re
import csv
import json
import pandas as pd
import os
class Spider():
    """Fetches an eBay listing page and saves its gallery image URLs to Excel."""

    def randHeader(self):
        """Build an HTTP request header with a randomly chosen User-Agent.

        Returns a dict with Connection / Accept / Accept-Language / User-Agent;
        only the User-Agent varies between calls.
        """
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            # random.choice is the idiomatic form of randrange(0, len(...)) indexing
            'User-Agent': random.choice(head_user_agent)
        }
        return header

    def getPicture(self, ebayno):
        """Download the listing page for *ebayno* and save the image URLs
        found on it to image_url/ebay/<ebayno>.xlsx.

        Skips silently when the output workbook already exists, so the job
        can be restarted without re-fetching.
        """
        ebayno = str(ebayno)
        out_path = "image_url//ebay//" + ebayno + ".xlsx"
        if os.path.exists(out_path):
            return
        url = 'http://www.ebay.com/itm/' + ebayno
        req = urllib.request.Request(url=url, headers=self.randHeader())
        # Close the response explicitly; the original leaked the connection.
        with urllib.request.urlopen(req) as webpage:
            html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Gallery <img> tags on the listing page carry a numeric "index"
        # attribute; raw string avoids the invalid-escape warning of "\d+".
        src = soup.find_all("img", index=re.compile(r"\d+"))
        urls = [[ebayno, img["src"]] for img in src]
        imagedf = pd.DataFrame(urls, columns=["ebayno", "url"])
        imagedf = imagedf.drop_duplicates()
        imagedf["id"] = imagedf.index
        imagedf.to_excel(out_path, index=False)

class ThreadCrawl(threading.Thread):
    """Worker thread: pulls eBay item numbers from a shared queue and scrapes
    each one with its own Spider instance."""

    def __init__(self, queue):
        # NOTE(review): basicConfig is invoked once per worker; only the first
        # call takes effect, so this is redundant but harmless.
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue      # shared work queue of ebay item numbers
        self.spider = Spider()  # each worker owns its own Spider

    def run(self):
        """Consume items forever; each item is an ebayno handed to getPicture."""
        while True:
            item = self.queue.get()
            # NOTE(review): any exception here kills the worker silently; the
            # original retry logic was commented out and is preserved as-is.
            self.spider.getPicture(item)
            # Lazy %-args so the message is only formatted when emitted.
            logging.info("now queue size is: %d", self.queue.qsize())
            self.queue.task_done()

class SpiderJob():
    """Fans a list of eBay item numbers out to a pool of ThreadCrawl workers."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of ebay item numbers to crawl

    def work(self):
        """Start the workers, enqueue every item, block until all are done."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # daemon=True so workers die with the main thread;
            # setDaemon() is deprecated since Python 3.10.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until every item has been task_done()



if __name__ == '__main__':
    # Read the workbook listing the eBay item numbers to crawl.
    frame = pd.read_excel("后视镜eBay平台图片抓取ebayno.xlsx")
    print(frame.info())
    item_numbers = frame["ebayno"].values
    print(len(item_numbers))
    # Fan the items out to 8 crawler threads and wait for completion.
    job = SpiderJob(8, item_numbers)
    job.work()






第二步:更改图片大小url

"""
在抓取ebay的图片的时候 结尾是64.jpg的是小图,应替换成500.jpg
"""
import pandas as pd
import os
import re

# dataframe = []
# for root , dirs, files in os.walk("image_url\\ebay"): #files返回combine中存在csv的文件名
#     print(len(files))
#     for name in files:
#         name = "\\".join([root, name])
#         print(str(name))
#         temp = pd.read_excel(str(name))
#         dataframe.append(temp)
# print(len(dataframe))
# result = pd.concat(dataframe)
# print(result.info())
# # exit()
# result.to_excel("ebay_mirror_picture_url.xlsx",index = False)

def enlarge_url(url):
    """Return *url* with the thumbnail suffix 64.jpg / 64.png replaced by
    the large-image suffix 500.jpg / 500.png; other URLs pass through."""
    # Raw strings avoid the invalid-escape warning of "(64\.jpg)".
    url = re.sub(r"64\.jpg", "500.jpg", url)
    return re.sub(r"64\.png", "500.png", url)


if __name__ == '__main__':
    df = pd.read_excel("ebay_mirror_picture_url.xlsx")
    print(df.info())
    # Vectorized map replaces the original per-row df.loc loop (much faster,
    # identical output column).
    df["large_url"] = df["url"].map(enlarge_url)
    df.to_excel("ebay_mirror_picture_url_large.xlsx", index=False)


第三步:图片url下载到本地

import random
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import csv
import numpy as np
import re
import xlrd
from queue import Queue
import time
import urllib
import os
import random
import threading
import logging
import pandas as pd
from my_feedback_ebayno import Feedback_ebayno

class EbaySpider(object):
    """Downloads eBay listing images to picture_ebay/ over a requests session."""

    def __init__(self):
        # Session with a cookie jar so eBay's locale/shipping cookies persist
        # across requests.
        self.SESSION = requests.session()
        self.SESSION.cookies = CookieJar()
        self.HEAD = self.randHeader()

    def randHeader(self):
        """Build an HTTP request header with a randomly chosen User-Agent."""
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            # random.choice is the idiomatic form of randrange(0, len(...)) indexing
            'User-Agent': random.choice(head_user_agent)
        }
        return header

    def getBeautifulSoup(self, query_rl):
        """GET *query_rl* with the session and return the parsed page."""
        r = self.SESSION.get(url=query_rl, headers=self.HEAD)
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup

    def getRates(self):
        """Prime the session: run one search, take the first listing's item id
        and hit the getrates endpoint so eBay stores the US shipping-country
        cookie on this session."""
        query_rl = "https://www.ebay.com/sch/i.html?_from=R40&_sacat=0&_ipg=100&rt=nc&_nkw=window regulator&_pgn=1&_skc=0"
        r = self.SESSION.get(url=query_rl, headers=self.HEAD)
        soup = BeautifulSoup(r.text, 'html.parser')
        content = soup.find("span", "rcnt")
        itemSize = int(str(content.string).replace(",", ""))  # total hit count
        # The first listing's item id is carried in its "iid" attribute.
        itm = soup.find("div", "lvpic pic img left")['iid']
        getrates_url = "http://www.ebay.com/itm/getrates?item=" + itm + "&country=1&co=0&cb=jQuery1705349737076189762_1501724760425"
        r = self.SESSION.get(url=getrates_url, headers=self.HEAD)  # response cookie is kept by the session

    def search(self, item):
        """Download one image row to picture_ebay/<sku>_<id>.jpg.

        *item* is a row of the URL workbook: item[0]=sku, item[3]=id,
        item[4]=large image url. Existing files are skipped so the job
        can be restarted.
        """
        # The standalone script only does `import urllib`, which does NOT load
        # the request submodule — import it explicitly here.
        import urllib.request
        sku = item[0]
        url = item[4]
        id = item[3]
        print(sku, url, id)
        target = "picture_ebay//" + str(sku) + "_" + str(id) + ".jpg"
        if os.path.exists(target):
            return
        urllib.request.urlretrieve(url, target)




class ThreadCrawl(threading.Thread):
    """Worker thread: pulls image rows from a shared queue and downloads each
    one with its own EbaySpider instance."""

    def __init__(self, queue):
        # NOTE(review): basicConfig is invoked once per worker; only the first
        # call takes effect, so this is redundant but harmless.
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue            # shared work queue of image rows
        self.spider = EbaySpider()    # each worker owns its own spider

    def run(self):
        """Consume rows forever; each row is handed to EbaySpider.search."""
        while True:
            item = self.queue.get()
            # NOTE(review): any exception here kills the worker silently; the
            # original retry logic was commented out and is preserved as-is.
            self.spider.search(item)
            # Lazy %-args so the message is only formatted when emitted.
            logging.info("now queue size is: %d", self.queue.qsize())
            self.queue.task_done()

class EbaySpiderJob():
    """Fans image-URL rows out to a pool of ThreadCrawl download workers."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of image rows (sku, ..., id, url)

    def work(self):
        """Start the workers, enqueue every row, block until all are done."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # daemon=True so workers die with the main thread;
            # setDaemon() is deprecated since Python 3.10.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until every row has been task_done()


if __name__ == '__main__':
    # The earlier step that merged the per-item workbooks under image_url/
    # into a single sheet has already been run; load the merged result here.
    url_frame = pd.read_excel("ebay_mirror_picture_url_large.xlsx")
    print(url_frame.info())
    # Hand every row to 8 download workers and wait for completion.
    download_job = EbaySpiderJob(8, url_frame.values)
    download_job.work()




评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值