# 第一步: 保存图片url (Step 1: save the image URLs)
import logging
import random
import threading
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import re
import csv
import json
import pandas as pd
import os
class Spider:
    """Scrape the gallery image URLs of a single eBay listing and save them
    to an Excel sheet under image_url//ebay//<ebayno>.xlsx."""

    def randHeader(self):
        """Return an HTTP request-header dict with a randomly chosen User-Agent.

        Connection / Accept / Accept-Language are fixed; only the User-Agent
        rotates, which helps avoid simplistic bot detection.
        """
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        header = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': random.choice(head_user_agent)   # idiomatic random pick
        }
        return header

    def getPicture(self, ebayno):
        """Fetch the listing page for *ebayno* and save all indexed image URLs.

        Skips work if the output workbook already exists, so re-runs resume
        where they stopped.
        """
        ebayno = str(ebayno)
        out_path = "image_url//ebay//" + ebayno + ".xlsx"   # build the path once
        if os.path.exists(out_path):
            return  # already scraped
        url = 'http://www.ebay.com/itm/' + ebayno
        req = urllib.request.Request(url=url, headers=self.randHeader())
        # Context manager closes the HTTP response instead of leaking the socket.
        with urllib.request.urlopen(req) as webpage:
            html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')  # parse the listing page
        # Gallery <img> tags carry a numeric "index" attribute; raw string keeps
        # the \d escape valid (non-raw "\d" is a deprecated escape sequence).
        src = soup.find_all("img", index=re.compile(r"\d+"))
        urls = [[ebayno, img["src"]] for img in src]
        imagedf = pd.DataFrame(urls, columns=["ebayno", "url"])
        imagedf = imagedf.drop_duplicates()
        imagedf["id"] = imagedf.index
        imagedf.to_excel(out_path, index=False)
# print(src)
class ThreadCrawl(threading.Thread):
    """Worker thread: pulls eBay item numbers off a shared queue and scrapes
    their image URLs with a private Spider instance."""

    def __init__(self, queue):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue          # shared work queue of ebayno items
        self.spider = Spider()      # each worker owns its own Spider

    def run(self):
        """Consume items forever; the thread is expected to run as a daemon."""
        while True:
            item = self.queue.get()
            try:
                self.spider.getPicture(item)
            except Exception:
                # One bad item must not kill the worker; log it and move on.
                logging.exception("failed to fetch pictures for %s", item)
            finally:
                logging.info("now queue size is: %d" % self.queue.qsize())
                # Always signal completion, otherwise Queue.join() in the
                # dispatcher would block forever after a failed item.
                self.queue.task_done()
class SpiderJob:
    """Fan a list of work items out to *size* daemon crawler threads."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of items to enqueue

    def work(self):
        """Start the workers, enqueue every item, block until all are done."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # Daemon threads die with the main thread; setDaemon() is deprecated.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until every item has been task_done()'d
if __name__ == '__main__':
    # Load the workbook listing the eBay item numbers to scrape.
    ebayno_df = pd.read_excel("后视镜eBay平台图片抓取ebayno.xlsx")
    print(ebayno_df.info())
    item_numbers = ebayno_df["ebayno"].values
    # item_numbers = ["361991704099"]  # single-item debugging hook
    print(len(item_numbers))
    # Eight concurrent workers drain the whole list.
    SpiderJob(8, item_numbers).work()
# 第二步: 更改图片大小url (Step 2: rewrite the image-size URLs)
"""
在抓取ebay的图片的时候 结尾是64.jpg的是小图,应替换成500.jpg
"""
import pandas as pd
import os
import re
# dataframe = []
# for root , dirs, files in os.walk("image_url\\ebay"): #files返回combine中存在csv的文件名
# print(len(files))
# for name in files:
# name = "\\".join([root, name])
# print(str(name))
# temp = pd.read_excel(str(name))
# dataframe.append(temp)
# print(len(dataframe))
# result = pd.concat(dataframe)
# print(result.info())
# # exit()
# result.to_excel("ebay_mirror_picture_url.xlsx",index = False)
# Rewrite each thumbnail URL (…64.jpg / …64.png) into its 500-px variant and
# save the result with a new "large_url" column.
df = pd.read_excel("ebay_mirror_picture_url.xlsx")
print(df.info())
for i in df.index:
    url = df.loc[i, "url"]
    # Raw strings keep the regex escapes valid: a non-raw "\." is a
    # deprecated invalid escape sequence in Python 3.
    url = re.sub(r"(64\.jpg)", "500.jpg", url)
    url = re.sub(r"(64\.png)", "500.png", url)
    df.loc[i, "large_url"] = url
df.to_excel("ebay_mirror_picture_url_large.xlsx", index=False)
# 第三步: 图片url下载到本地 (Step 3: download the image URLs to local files)
import random
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import csv
import numpy as np
import re
import xlrd
from queue import Queue
import time
import urllib
import os
import random
import threading
import logging
import pandas as pd
from my_feedback_ebayno import Feedback_ebayno
class EbaySpider(object):
    """Downloads eBay listing images to local files through a cookie-aware
    requests session."""

    def __init__(self):
        self.SESSION = requests.session()
        self.SESSION.cookies = CookieJar()
        self.HEAD = self.randHeader()  # one random header reused per session

    def randHeader(self):
        """Return an HTTP request-header dict with a randomly chosen User-Agent."""
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        header = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': random.choice(head_user_agent)   # idiomatic random pick
        }
        return header

    def getBeautifulSoup(self, query_rl):
        """GET *query_rl* through the session and return the parsed soup."""
        r = self.SESSION.get(url=query_rl, headers=self.HEAD)
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup

    def getRates(self):
        """Prime the session cookies: run a fixed search, then hit the getrates
        endpoint for the first result so later requests see a US (country=1)
        shipping context."""
        query_rl = "https://www.ebay.com/sch/i.html?_from=R40&_sacat=0&_ipg=100&rt=nc&_nkw=window regulator&_pgn=1&_skc=0"
        soup = self.getBeautifulSoup(query_rl)  # reuse the shared fetch helper
        content = soup.find("span", "rcnt")
        itemSize = int(str(content.string).replace(",", ""))  # result count; kept as a sanity parse
        # First listing id on the page; its getrates call sets the address cookie.
        itm = soup.find("div", "lvpic pic img left")['iid']
        getrates_url = "http://www.ebay.com/itm/getrates?item=" + itm + "&country=1&co=0&cb=jQuery1705349737076189762_1501724760425"
        self.SESSION.get(url=getrates_url, headers=self.HEAD)  # request only for its cookie side effect

    def search(self, item):
        """Download one image to picture_ebay//<sku>_<id>.jpg.

        *item* is a row array: sku at [0], row id at [3], image URL at [4].
        Existing files are skipped so the job is resumable.
        """
        # 'import urllib' alone does not guarantee the request submodule is
        # loaded; import it explicitly where it is used.
        import urllib.request
        sku = item[0]
        url = item[4]
        row_id = item[3]  # avoid shadowing the builtin id()
        print(sku, url, row_id)
        dest = "picture_ebay//" + str(sku) + "_" + str(row_id) + ".jpg"  # build the path once
        if os.path.exists(dest):
            return
        urllib.request.urlretrieve(url, dest)
class ThreadCrawl(threading.Thread):
    """Worker thread: pulls image-row items off a shared queue and downloads
    them with a private EbaySpider instance."""

    def __init__(self, queue):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue              # shared work queue of DataFrame rows
        self.spider = EbaySpider()      # each worker owns its own spider

    def run(self):
        """Consume items forever; the thread is expected to run as a daemon."""
        while True:
            item = self.queue.get()
            try:
                self.spider.search(item)
            except Exception:
                # One bad download must not kill the worker; log it and move on.
                logging.exception("failed to download %s", item)
            finally:
                logging.info("now queue size is: %d" % self.queue.qsize())
                # Always signal completion, otherwise Queue.join() in the
                # dispatcher would block forever after a failed item.
                self.queue.task_done()
class EbaySpiderJob:
    """Fan a list of image rows out to *size* daemon crawler threads."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of items to enqueue

    def work(self):
        """Start the workers, enqueue every item, block until all are done."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # Daemon threads die with the main thread; setDaemon() is deprecated.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until every item has been task_done()'d
if __name__ == '__main__':
    # (A one-off block that concatenated the per-item url sheets into
    # syppo_mirror_picture.xlsx used to live here; it has already been run.)
    url_df = pd.read_excel("ebay_mirror_picture_url_large.xlsx")
    print(url_df.info())
    # Eight concurrent workers; each row of the sheet is one download job.
    job = EbaySpiderJob(8, url_df.values)
    job.work()