目标是一个汽配图片网站,需要抓取某一品类下的商品数据和图片。
步骤:
第一步: 品类网址下的所有item收集 title url
第二步: 根据item的url网址下 收集该item相关的title description fitment 图片url
第三步: 根据图片url下载图片并保存
代码实现:
第一步:品类网址下的所有item收集 title url
import re
import json
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import random
from pandas.io.json import json_normalize
class IPProxy():
    """Fetch a random HTTP proxy from a local proxy-pool service and build
    randomized request headers for crawling."""

    # Hoisted to a class constant so the list is not rebuilt on every
    # get_headers() call.
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
    ]

    def __init__(self, count=50):
        # Number of proxy candidates to request from the local pool service.
        self.count = count

    def get_IPProxies(self):
        """Ask the local proxy pool (http://127.0.0.1:8000) for candidates and
        return a requests-style proxies dict built from one random choice."""
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
        return proxies

    def get_headers(self):
        """Return crawling headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

    def get_html_content(self, url):
        """GET *url* through a fresh random proxy and return the page text,
        or None after all attempts fail.

        The original duplicated the request body for the first attempt and
        the retry loop; this is the same behavior (1 try + 3 retries)
        expressed once.  A response is treated as failed when the status is
        not OK or the body is shorter than 500 bytes (typically a proxy
        error page).
        """
        for _attempt in range(4):
            try:
                proxies = self.get_IPProxies()
                headers = self.get_headers()
                r = requests.get(url=url, headers=headers, timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("连接错误")
                return r.text
            except Exception:
                continue
        return None
class image_structs():
    """Mutable record for a single product image: the DOM id of the
    <img> tag and the image source URL, both filled in by the scraper."""

    def __init__(self):
        self.picture_url = dict(image_id='', picture_url='')
class data_structs():
    """Mutable record for one catalog item.

    Keys mirror the output columns: title, item_url, id, picture_url
    (list of image_structs.picture_url dicts), std_desc, description,
    information, fitment.
    """

    def __init__(self):
        self.info = dict(
            title='', item_url='', id=0, picture_url=[],
            std_desc='', description='', information='', fitment='')
# "https://waldoch.com/store/catalogsearch/result/index/?cat=0&limit=200&p=1&q=nerf+bar"
# https://waldoch.com/store/new-oem-ford-f-150-f150-5-running-boards-nerf-bar-crew-cab-2015-w-brackets-fl34-16451-ge5fm6.html
def soup_parser(outfile="item_urls.xlsx", pages=9):
    """Parse locally saved category pages ``1.txt`` .. ``<pages>.txt`` and
    collect every product link's (title, item_url).

    The de-duplicated result, with an ``id`` column taken from the frame
    index, is written to *outfile* as Excel.

    Args:
        outfile: destination Excel file.
        pages: number of saved page files to parse (was hard-coded to 9;
            kept as the default for backward compatibility).
    """
    result = []
    for page in range(1, pages + 1):
        with open("%d.txt" % page, "r", encoding="utf-8") as fp:
            soup = BeautifulSoup(fp.read(), "html.parser")
        # Product links on the category page carry class "product-image".
        for a in soup.find_all("a", class_="product-image"):
            result.append([a["title"], a["href"]])
    df = pd.DataFrame(result, columns=["title", "item_url"])
    print(len(df))
    df = df.drop_duplicates()
    print(len(df))
    df["id"] = df.index
    df.to_excel(outfile, index=False)
def content_parser(soup):
    """Extract (title, item_url) pairs from a category-page HTML string.

    Args:
        soup: raw HTML text, or None when the download failed
            (IPProxy.get_html_content returns None after exhausting retries).
    Returns:
        list of [title, item_url] lists; empty when *soup* is None.
    """
    # Fix: the original crashed in BeautifulSoup(None, ...) on failed
    # downloads, and its print(soup) dumped the entire page to stdout.
    if soup is None:
        return []
    soup = BeautifulSoup(soup, "html.parser")
    result = []
    for a in soup.find_all("a", class_="product-image"):
        result.append([a["title"], a["href"]])
    return result
def get_item_list(outfile):
    """Crawl the paginated 4x4sidesteps side-steps category, collect every
    product's (title, item_url) and write the de-duplicated list, with an
    ``id`` column, to *outfile* as Excel."""
    # First URL has no page suffix; pages 1..6 add "&p=N".
    base = 'http://4x4sidesteps.co.uk/side-steps.html?limit=15%s'
    suffixes = [''] + ["&p=%d" % n for n in range(1, 7)]
    urls = [base % suffix for suffix in suffixes]
    print(urls)
    ips = IPProxy()
    result = []
    for url in urls:
        page_html = ips.get_html_content(url)
        items = content_parser(page_html)
        print(items)
        result.extend(items)
    df = pd.DataFrame(result, columns=["title", "item_url"])
    df = df.drop_duplicates()
    df["id"] = df.index
    df.to_excel(outfile, index=False)
def get_item_info_4x4(file, outfile=""):
    """For each item listed in *file* (Excel with title/item_url/id columns),
    download its page and scrape the gallery image ids/urls.

    NOTE(review): this looks like a debugging version — it writes only the
    FIRST item to "test.xlsx" and then calls exit(), so the loop never
    completes; the trailing df.to_excel(outfile) with the default
    outfile="" would fail if it were ever reached.
    """
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    for i in df.index:
        id = df.loc[i, "id"]
        item_url = df.loc[i, "item_url"]
        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = id
        data.info["item_url"] = item_url
        # if os.path.exists(str(int(id)) + ".xlsx"):
        #     continue
        ips = IPProxy()
        soup = ips.get_html_content(item_url)
        print(soup)
        # Images: gallery <img> tags hold the picture DOM id and src URL.
        try:
            soup = BeautifulSoup(soup, "html.parser")
            imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
            for a in imglink:
                image = image_structs()
                image.picture_url["image_id"] = a["id"]
                image.picture_url["picture_url"] = a["src"]
                print(image.picture_url)
                data.info["picture_url"].append(image.picture_url)
        except:
            # Any parse failure (including soup being None) leaves the
            # picture_url field as an empty string.
            data.info["picture_url"] = DEFAULT_FALSE
        print(data.info)
        print(data.info.keys())
        # Flatten: one output row per picture_url entry, carrying title/id.
        singledf = json_normalize(data.info, "picture_url", ['title', 'id'])
        singledf.to_excel("test.xlsx", index=False)
        exit()
        # print(df.ix[i])
    df.to_excel(outfile, index=False)
def get_item_info(file, outfile):
    """For each item in *file*, download its page directly (no proxy) and
    scrape images, std_desc, description, information and fitment.

    NOTE(review): like get_item_info_4x4 this appears to be a debugging
    version — it writes the first unprocessed item to "test.xlsx" and then
    calls exit(), so *outfile* is never actually produced.
    """
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    for i in df.index:
        id = df.loc[i, "id"]
        # Resume support: skip items whose per-id Excel file already exists.
        if os.path.exists(str(int(id)) + ".xlsx"):
            continue
        item_url = df.loc[i, "item_url"]
        url = item_url
        web = requests.get(url)
        soup = BeautifulSoup(web.text, "html.parser")
        # Images: gallery <img> tags hold the picture DOM id and src URL.
        imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = id
        data.info["item_url"] = item_url
        for a in imglink:
            image = image_structs()
            image.picture_url["image_id"] = a["id"]
            image.picture_url["picture_url"] = a["src"]
            print(image.picture_url)
            data.info["picture_url"].append(image.picture_url)
        print(data.info)
        # std_desc: the text of the itemprop="description" div, one line
        # per stripped string.  If the div is missing, soup.find returns
        # None and .stripped_strings raises, caught below.
        std_desc = soup.find("div", itemprop="description")
        try:
            strings_desc = []
            for ii in std_desc.stripped_strings:
                strings_desc.append(ii)
            strings_desc = "\n".join(strings_desc)
        except:
            strings_desc = DEFAULT_FALSE
        # description: the element following the <h2>Description</h2>
        # heading.  A missing heading makes find() return None and
        # find_next() raise AttributeError, caught below.
        try:
            desc = soup.find('h2', text="Description")
            desc = desc.find_next()
        except:
            desc = DEFAULT_FALSE
        description = desc
        # information: same pattern for the <h2>Information</h2> heading.
        try:
            information = soup.find("h2", text='Information')
            desc = information
            desc = desc.find_next()
        except:
            desc = DEFAULT_FALSE
        information = desc
        # fitment: same pattern for the <h2>Fitment</h2> heading.
        try:
            fitment = soup.find('h2', text='Fitment')
            desc = fitment
            desc = desc.find_next()
        except:
            desc = DEFAULT_FALSE
        fitment = desc
        data.info["std_desc"] = strings_desc
        data.info["description"] = str(description)
        data.info["information"] = str(information)
        data.info["fitment"] = str(fitment)
        print(data.info.keys())
        # Flatten: one row per picture, repeating the item-level columns.
        singledf = json_normalize(data.info, "picture_url", ['title', 'item_url', 'id', 'std_desc', 'description', 'information', 'fitment'])
        singledf.to_excel("test.xlsx", index=False)
        exit()
        # print(df.ix[i])
    df.to_excel(outfile, index=False)
# get_item_list("item_urls.xlsx")
# Entry point actually executed for step 1: scrape image info for the items
# listed in item_urls.xlsx.  NOTE(review): get_item_info_4x4 exits after the
# first item and relies on its unusable default outfile="" — see the
# function's docstring.
get_item_info_4x4("item_urls.xlsx")
# soup_parser()
# get_item_info("item_urls.xlsx","item_urls_info.xlsx")
第二步: 根据item的url网址下 收集该item相关的title description fitment 图片url
import random
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
from queue import Queue
import time
import os
import random
import threading
import json
import logging
import pandas as pd
from pandas.io.json import json_normalize
class IPProxy():
    """Fetch a random HTTP proxy from a local proxy-pool service and build
    randomized request headers for crawling."""

    # Hoisted to a class constant so the list is not rebuilt on every
    # get_headers() call.
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
    ]

    def __init__(self, count=50):
        # Number of proxy candidates to request from the local pool service.
        self.count = count

    def get_IPProxies(self):
        """Ask the local proxy pool (http://127.0.0.1:8000) for candidates and
        return a requests-style proxies dict built from one random choice."""
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
        return proxies

    def get_headers(self):
        """Return crawling headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }
class image_structs():
    """Mutable record for a single product image: the DOM id of the
    <img> tag and the image source URL, both filled in by the scraper."""

    def __init__(self):
        self.picture_url = dict(image_id='', picture_url='')
class data_structs():
    """Mutable record for one catalog item.

    Keys mirror the output columns: title, item_url, id, picture_url
    (list of image_structs.picture_url dicts), std_desc, description,
    information, fitment.
    """

    def __init__(self):
        self.info = dict(
            title='', item_url='', id=0, picture_url=[],
            std_desc='', description='', information='', fitment='')
class EbaySpider(object):
    """Scrapes one item page at a time and writes the item's gallery-image
    records to <file>/<id>.xlsx."""

    def __init__(self, file=""):
        # Output directory for the per-item Excel files.
        self.file = file

    def get_html_content(self, url):
        """GET *url* through a fresh random proxy and return the page text,
        or None after all attempts fail.

        The original duplicated the request body for the first attempt and
        the retry loop; this is the same behavior (1 try + 3 retries)
        expressed once.  A response is rejected when the status is not OK
        or the body is under 500 bytes (typically a proxy error page).
        """
        ips = IPProxy()
        for _attempt in range(4):
            try:
                proxies = ips.get_IPProxies()
                r = requests.get(url=url, headers=ips.get_headers(), timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("连接错误")
                return r.text
            except Exception:
                continue
        return None

    def get_item_info(self, item):
        """Scrape the gallery images of one item.

        Args:
            item: sequence of (title, item_url, id) — one row of the
                step-1 item list.
        Writes a flattened DataFrame (one row per image, carrying
        title/id) to <self.file>/<id>.xlsx.
        """
        DEFAULT_FALSE = ""
        item_id = item[2]  # renamed from `id` to avoid shadowing the builtin
        item_url = item[1]
        data = data_structs()
        data.info["title"] = item[0]
        data.info["id"] = item_id
        data.info["item_url"] = item_url
        soup = self.get_html_content(item_url)
        # Gallery <img> tags hold the picture DOM id and src URL.
        try:
            soup = BeautifulSoup(soup, "html.parser")
            imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
            for a in imglink:
                image = image_structs()
                image.picture_url["image_id"] = a["id"]
                image.picture_url["picture_url"] = a["src"]
                print(image.picture_url)
                data.info["picture_url"].append(image.picture_url)
        except:
            # Any parse failure (including soup being None) leaves the
            # picture_url field as an empty string.
            data.info["picture_url"] = DEFAULT_FALSE
        singledf = json_normalize(data.info, "picture_url", ['title', 'id'])
        singledf.to_excel(self.file + "/" + str(int(item_id)) + ".xlsx", index=False)
class ThreadCrawl(threading.Thread):
    """Worker thread: pulls items off *queue* and feeds them to an
    EbaySpider until the process exits (runs as a daemon)."""

    def __init__(self, queue, file):
        # basicConfig is a no-op after the first call, so configuring it
        # here is harmless even when several workers are created.
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.file = file
        self.spider = EbaySpider(self.file)

    def run(self):
        while True:
            item = self.queue.get()  # blocks until an item is available
            try:
                self.spider.get_item_info(item)
            except Exception:
                # Fix: in the original, any exception killed the worker
                # without calling task_done(), so queue.join() hung forever.
                logging.exception("failed to process item: %r", item)
            finally:
                logging.info("now queue size is: %d" % self.queue.qsize())
                self.queue.task_done()
class EbaySpiderJob():
    """Fans crawling of *qs* items out over *size* daemon worker threads."""

    def __init__(self, size, qs, file):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of item rows to crawl
        self.file = file  # output directory handed to every spider

    def work(self):
        """Start the workers, enqueue every item, and block until all
        queued items have been processed."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue, self.file)
            # daemon=True so stuck workers don't block interpreter exit;
            # assignment replaces the deprecated setDaemon() call.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until task_done() was called per item
def combine_data(combine_file, outfile):
    """Merge every per-item Excel file under *combine_file* into *outfile*.

    Args:
        combine_file: directory containing the per-item .xlsx files.
        outfile: path of the merged Excel file.
    Returns:
        True when at least one file was merged, False when the directory
        is empty or does not exist.  (The original crashed in pd.concat
        with an empty list when the directory was missing.)
    """
    frames = []
    for root, dirs, files in os.walk(combine_file):
        for file in files:
            # os.path.join instead of the original "//".join path hack.
            frames.append(pd.read_excel(os.path.join(root, file)))
    if not frames:
        return False
    pd.concat(frames).to_excel(outfile, index=False)
    return True
def main():
    """Step-2 driver: read item_urls.xlsx and scrape every item's image
    info into per-item Excel files under ./info, merging and retrying
    until every item id appears in the combined output."""
    # Create the output folder for the per-item files.
    file = "info"
    outfile = "item_infos.xlsx"
    if not os.path.exists(file):
        os.makedirs(file)
    # Load the item list produced by step 1.
    df = pd.read_excel("item_urls.xlsx")
    combinefile = combine_data(file, outfile)
    existitem = []
    if combinefile:
        # Resume support: ids already scraped in a previous run.
        existdf = pd.read_excel(outfile)
        existitem = existdf["id"].unique()
    # Crawl the not-yet-scraped rows until every id is covered.
    # NOTE(review): assumes ids are unique per row; this loops forever if
    # some item can never be scraped successfully.
    while len(existitem) != len(df):
        temp = df[~df["id"].isin(existitem)]  # rows whose id is not yet scraped
        amazonJob = EbaySpiderJob(8, temp.values, file)
        amazonJob.work()
        combine_data(file, outfile)
        existdf = pd.read_excel(outfile)
        existitem = existdf["id"].unique()
def single_test():
    """Ad-hoc manual test helper (only invoked from the commented-out
    call in the __main__ block).

    NOTE(review): EbaySpider.get_item_info requires an *item* argument,
    so this call raises TypeError as written — presumably leftover from
    an earlier signature.
    """
    spider = EbaySpider()
    spider.get_item_info()
if __name__ == '__main__':
    # file = "info"
    # single_test()
    main()  # run the full step-2 crawl
第三步: 根据图片url下载图片并保存
import random
import urllib
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
from queue import Queue
import time
import os
import random
import threading
import json
import logging
import pandas as pd
from pandas.io.json import json_normalize
class IPProxy():
    """Fetch a random HTTP proxy from a local proxy-pool service and build
    randomized request headers for crawling."""

    # Hoisted to a class constant so the list is not rebuilt on every
    # get_headers() call.
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
    ]

    def __init__(self, count=50):
        # Number of proxy candidates to request from the local pool service.
        self.count = count

    def get_IPProxies(self):
        """Ask the local proxy pool (http://127.0.0.1:8000) for candidates and
        return a requests-style proxies dict built from one random choice."""
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
        return proxies

    def get_headers(self):
        """Return crawling headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }
class image_structs():
    """Mutable record for a single product image: the DOM id of the
    <img> tag and the image source URL, both filled in by the scraper."""

    def __init__(self):
        self.picture_url = dict(image_id='', picture_url='')
class data_structs():
    """Mutable record for one catalog item.

    Keys mirror the output columns: title, item_url, id, picture_url
    (list of image_structs.picture_url dicts), std_desc, description,
    information, fitment.
    """

    def __init__(self):
        self.info = dict(
            title='', item_url='', id=0, picture_url=[],
            std_desc='', description='', information='', fitment='')
class EbaySpider(object):
    """Image downloader: each queue item is (picture_id, picture_url) and
    the body is saved to <file>//<picture_id>.jpg."""

    def __init__(self, file=""):
        # Output directory for the downloaded images.
        self.file = file

    def get_html_response(self, url):
        """GET *url* through a fresh random proxy and return the Response,
        or None after all attempts fail.

        The original duplicated the request body for the first attempt and
        the retry loop; this is the same behavior (1 try + 3 retries)
        expressed once.  A response is rejected when the status is not OK
        or the body is under 500 bytes (typically a proxy error page).
        """
        ips = IPProxy()
        for _attempt in range(4):
            try:
                proxies = ips.get_IPProxies()
                r = requests.get(url=url, headers=ips.get_headers(), timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("连接错误")
                return r
            except Exception:
                continue
        return None

    def get_html_content(self, url):
        """Like get_html_response() but returns the decoded page text (or
        None).  Delegates to get_html_response instead of duplicating the
        whole retry block as the original did."""
        r = self.get_html_response(url)
        return None if r is None else r.text

    def get_item_info(self, item):
        """Download one picture.

        Args:
            item: sequence of (picture_id, picture_url).
        Writes the raw bytes to <self.file>//<picture_id>.jpg; silently
        skips the item when every download attempt failed.
        """
        file = self.file
        picture_id = item[0]
        url = item[1]
        save_file = file + "//" + str(picture_id) + ".jpg"
        response = self.get_html_response(url)
        if response:
            with open(save_file, 'wb') as f:
                f.write(response.content)
                f.flush()
class ThreadCrawl(threading.Thread):
    """Worker thread: pulls (picture_id, url) items off *queue* and feeds
    them to an EbaySpider until the process exits (runs as a daemon)."""

    def __init__(self, queue, file):
        # basicConfig is a no-op after the first call, so configuring it
        # here is harmless even when several workers are created.
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.file = file
        self.spider = EbaySpider(self.file)

    def run(self):
        while True:
            item = self.queue.get()  # blocks until an item is available
            try:
                self.spider.get_item_info(item)
            except Exception:
                # Fix: in the original, any exception killed the worker
                # without calling task_done(), so queue.join() hung forever.
                logging.exception("failed to process item: %r", item)
            finally:
                logging.info("now queue size is: %d" % self.queue.qsize())
                self.queue.task_done()
class EbaySpiderJob():
    """Fans download of *qs* picture rows out over *size* daemon worker
    threads."""

    def __init__(self, size, qs, file):
        self.size = size  # number of worker threads to start
        self.qs = qs      # iterable of (picture_id, picture_url) rows
        self.file = file  # download directory handed to every spider

    def work(self):
        """Start the workers, enqueue every item, and block until all
        queued items have been processed."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue, self.file)
            # daemon=True so stuck workers don't block interpreter exit;
            # assignment replaces the deprecated setDaemon() call.
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # wait until task_done() was called per item
def combine_file_name(combine_file):
    """Return the extension-less names of all files under *combine_file*.

    Used to determine which picture_ids are already downloaded.  Returns
    [] for an empty or missing directory.  Fix: the original returned from
    inside the os.walk loop, truncating the result after the first
    directory visited; files in sub-directories are now collected too.
    """
    result = []
    for root, dirs, files in os.walk(combine_file):
        for file in files:
            stem, _ext = os.path.splitext(file)
            result.append(stem)
    return result
def main():
    """Step-3 driver: read item_infos.xlsx, build a per-image picture_id
    ("<item id>_<image id>") and download every picture into ./picture,
    retrying until a file exists for each id."""
    # Create the download folder.
    file = "picture"
    if not os.path.exists(file):
        os.makedirs(file)
    # Load step-2 output (one row per image).
    df = pd.read_excel("item_infos.xlsx")
    print(df.info())
    for i in df.index:
        # Unique file name per image: "<item id>_<image id>".
        df.loc[i, "picture_id"] = str(df.loc[i, "id"]) + "_" + str(df.loc[i, "image_id"])
    df = df[["picture_id", "picture_url"]]
    existitem = combine_file_name(file)
    # NOTE(review): loops until the number of downloaded files equals the
    # number of rows — loops forever if any URL can never be fetched.
    while len(existitem) != len(df):
        temp = df[~df["picture_id"].isin(existitem)]  # rows not yet downloaded
        # Each queued item is a (picture_id, picture_url) row.
        amazonJob = EbaySpiderJob(8, temp.values, file)
        amazonJob.work()
        existitem = combine_file_name(file)
def single_test():
    """Ad-hoc manual test helper (only invoked from the commented-out
    call in the __main__ block).

    NOTE(review): EbaySpider.get_item_info requires an *item* argument,
    so this call raises TypeError as written — presumably leftover from
    an earlier signature.
    """
    spider = EbaySpider()
    spider.get_item_info()
if __name__ == '__main__':
    # file = "info"
    # single_test()
    main()  # run the full step-3 download