方式1:从 JSONP 响应中提取括号内的 JSON 字符串时,将代码里相应的正则表达式替换为
`s = re.findall("\((.*)\)", s)[0]`(取第一个匹配的括号内容)。
import logging
import random
import threading
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import re
import csv
import json
class Spider():
    """Fetch eBay fitment (vehicle compatibility) data for one listing and
    write it to a per-item CSV under result\\."""

    def randHeader(self):
        """Return an HTTP header dict with a randomly chosen User-Agent.

        Connection/Accept/Accept-Language are fixed; only the UA rotates,
        to make repeated requests look less uniform.
        """
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            # random.choice is the idiomatic form of randrange-indexing
            'User-Agent': random.choice(head_user_agent)
        }
        return header

    def getBeautifulSoup(self, ebayno):
        """Request the GetFitmentData JSONP endpoint for *ebayno* and return
        the response body wrapped in BeautifulSoup."""
        url_1 = 'http://frame.ebay.com/ws/eBayISAPI.dll?GetFitmentData&rand=147945705603&site=100&vs=0&req=2&cid=33706&item=' + str(
            ebayno).strip() + '&ct=20&pn=FitmentComments%7CYear%7CMake%7CModel%7CTrim%7CEngine&page=1000&cb=jQuery1709105713909136433_1479456959821&_=1479457056036'
        req = urllib.request.Request(url=url_1, headers=self.randHeader())
        # context manager closes the HTTP response (original leaked it)
        with urllib.request.urlopen(req) as webpage:
            html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    def getFitment(self, ebayno):
        """Write one CSV row per fitment entry of *ebayno* to result\\<ebayno>.csv.

        Missing Trim / FitmentComments / Engine fields default to "All".
        Items whose "data" field is null are appended to result\\fitment_empty.txt.
        The header-only CSV is still created in that case (the merge step in
        the driver script relies on the file existing).
        """
        print(ebayno)
        # 'with' guarantees the CSV is closed even if parsing raises
        # (the original never closed it).
        with open("result\\" + str(ebayno) + ".csv", "w", newline="", encoding="utf-8") as out:
            csv_writer = csv.writer(out)
            csv_writer.writerow(['fitmentcomment', 'year', 'make', 'model', 'trim', 'engine', 'ebayno'])
            soup = self.getBeautifulSoup(ebayno)
            # Response is JSONP: jQuery...(<json>); extract the JSON between
            # the outermost parentheses.
            s = re.findall("\((.*)\)", str(soup))[0]
            sjson = json.loads(s)
            data = sjson["data"]
            if data is None:
                with open("result\\fitment_empty.txt", "a") as f:
                    # str() so integer ids do not raise TypeError (original bug)
                    f.write(str(ebayno) + "\n")
                return
            for row in data:
                Trim = row["Trim"][0] if "Trim" in row else "All"
                FitmentComments = row["FitmentComments"][0] if "FitmentComments" in row else "All"
                Engine = row["Engine"][0] if "Engine" in row else "All"
                csv_writer.writerow([FitmentComments, row["Year"][0], row["Make"][0],
                                     row["Model"][0], Trim, Engine, ebayno])
class ThreadCrawl(threading.Thread):
    """Worker thread: pulls eBay item ids off a shared queue and crawls
    their fitment data; failed items are re-queued for retry."""

    def __init__(self, queue):
        # queue: shared Queue of ebayno work items
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = Spider()  # each worker owns its own Spider instance

    def run(self):
        """Consume items forever (thread is started as a daemon by the job)."""
        while True:
            item = self.queue.get()
            try:
                self.spider.getFitment(item)
            except Exception:
                # Narrowed from a bare 'except:' so KeyboardInterrupt /
                # SystemExit can still stop the worker; the failure is
                # logged instead of silently swallowed, and the item is
                # put back for another attempt.
                logging.exception("getFitment failed for %s, re-queueing", item)
                self.queue.put(item)
            logging.info("now queue size is: %d" % self.queue.qsize())
            self.queue.task_done()
class SpiderJob():
    """Spin up *size* daemon crawler threads and feed them the ids in *qs*."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs      # iterable of ebay item numbers to crawl

    def work(self):
        """Start the workers, enqueue every item, block until the queue drains."""
        toSpiderQueue = Queue()
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # .daemon attribute replaces the deprecated setDaemon() call
            t.daemon = True
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)
        toSpiderQueue.join()  # returns once every item has been task_done()
文件 2:驱动脚本 —— 从 fitment_ebay 导入 SpiderJob,抓取后把每个 ebayno 的 CSV 合并为 together.csv。
from fitment_ebay import SpiderJob #从一个模块中导入类
import csv
if __name__ == '__main__':
    # Read the item ids to crawl, one per line.
    qs = []
    with open("test.txt") as fp:
        for line in fp:
            qs.append(line.strip())
    # BUG FIX: the original crawled only qs[0:100] but then merged
    # len(qs) per-item CSVs, raising FileNotFoundError for every
    # uncrawled id. Limit the list once and use it for both phases.
    qs = qs[0:100]
    Job = SpiderJob(8, qs)
    Job.work()
    # Merge the per-item CSVs produced by the crawl into one file;
    # 'with' closes together.csv (the original leaked the handle).
    with open("result\\together.csv", "w", newline="", encoding="utf-8") as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(['fitmentcomment', 'year', 'make', 'model', 'trim', 'engine', 'ebayno'])
        for ebayno in qs:
            with open("result\\" + str(ebayno) + ".csv", newline="", encoding="utf-8") as f:
                reader = csv.reader(f)
                for k, row1 in enumerate(reader):
                    if k == 0:          # skip each file's header row
                        continue
                    if len(row1) == 0:  # stop at a trailing blank row
                        break
                    csv_writer.writerow(row1)
方式二:
"""
使用须知:
代码中数据表名 fitment ,需要更改该数据表名称的注意更改 fitment
"""
import random
import urllib
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import csv
import numpy as np
import xlrd
import os
import json
from queue import Queue
import time
import random
import threading
import logging
import pandas as pd
import pymysql
class Database():
    """Thin pymysql wrapper around the `fitment` table.

    NOTE(review): credentials are hard-coded; move them to config/env
    before sharing this script.
    """

    def __init__(self):
        self.tablename = "fitment"  # change here if the table is renamed
        self.host = "localhost"
        self.user = "root"
        self.password = "123456"
        self.database = "ebay"
        self.charset = "utf8"
        self.connect = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                       database=self.database, charset=self.charset)
        self.cursor = self.connect.cursor()

    def dropTables(self):
        """Drop the fitment table if it exists."""
        sql = 'drop table if exists ' + self.tablename
        self.cursor.execute(sql)
        print("删表")

    def createTables(self):
        """Create the fitment table (id autoincrement + 7 data columns)."""
        sql = 'create table if not exists ' + self.tablename + '''
        (
        id int(11) primary key auto_increment,
        ebayno varchar(100) not null,
        note varchar(5000) ,
        year varchar(100),
        make varchar(100),
        model varchar(100),
        trim varchar(1000),
        engine varchar(1000)
        )
        '''
        self.cursor.execute(sql)
        print("建表")

    def is_exists_ebayno(self, ebayno):
        """Return True when *ebayno* already has at least one row saved."""
        sql = 'select * from ' + self.tablename + ' where ebayno = %s'
        # Parameters passed as a tuple -- the documented pymysql form
        # (the original passed a bare scalar, which pymysql tolerates
        # but which breaks for multi-parameter queries).
        self.cursor.execute(sql, (ebayno,))
        return self.cursor.fetchone() is not None

    def save(self, ebayno, note, year, make, model, trim, engine):
        """Insert one fitment row and commit immediately."""
        sql = ('insert into ' + self.tablename +
               ' (ebayno,note,year,make,model,trim,engine) values (%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (ebayno, note, year, make, model, trim, engine))
        self.connect.commit()
class EbaySpider(object):
    """Crawl item compatibility (fitment) data via the eBay Shopping API
    GetSingleItem call and persist each entry through Database.save()."""

    def __init__(self):
        self.db = Database()
        self.SESSION = requests.session()
        self.SESSION.cookies = CookieJar()
        self.HEAD = self.randHeader()

    def randHeader(self):
        """Return an HTTP header dict with a randomly chosen User-Agent."""
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            'User-Agent': random.choice(head_user_agent)
        }
        return header

    def get_fitment(self, ebayno):
        """Fetch GetSingleItem compatibility data for *ebayno* and save each
        entry; ids already in the database are skipped. When the item has no
        compatibility section, a single all-None marker row is saved so the
        id is not retried.
        """
        ebayno = str(ebayno)
        if self.db.is_exists_ebayno(ebayno):
            return
        # http://developer.ebay.com/devzone/shopping/docs/callref/getsingleitem.html
        # Compatibility Description Details ItemSpecifics ShippingCosts TextDescription Variations
        url = "http://open.api.ebay.com/shopping?callname=GetSingleItem&responseencoding=JSON&" \
              "appid=yourappid&siteid=100&version=967&" \
              "IncludeSelector=Compatibility&" \
              "ItemID=" + ebayno
        r = requests.get(url=url, headers=self.HEAD)
        items = json.loads(r.text)
        item = {}
        try:
            items["Item"]["ItemCompatibilityCount"]
        except (KeyError, TypeError):
            # No compatibility section in the response: record a marker row.
            print("fitmentcount异常")  # end
            self.db.save(ebayno, item.get("note"), item.get("Year"), item.get("Make"),
                         item.get("Model"), item.get("Trim"), item.get("Engine"))
            return
        # BUG FIX: the original round-tripped the already-parsed structure
        # through repr() + .replace("'", '"') + json.loads, which corrupts
        # any value containing a quote character. The parsed dict can be
        # walked directly.
        compatibility_list = items["Item"]["ItemCompatibilityList"]
        for entry in compatibility_list["Compatibility"]:
            item = {"note": entry["CompatibilityNotes"]}
            # Inner loop variable renamed (the original shadowed the outer 'i').
            for pair in entry["NameValueList"]:
                try:
                    item[pair["Name"]] = pair["Value"][0]
                except (KeyError, IndexError, TypeError):
                    continue
            self.db.save(ebayno, item.get("note"), item.get("Year"), item.get("Make"),
                         item.get("Model"), item.get("Trim"), item.get("Engine"))
class ThreadCrawl(threading.Thread):
    """Daemon worker thread: takes one ebayno at a time from the shared
    queue and hands it to an EbaySpider instance."""

    def __init__(self, queue):
        # queue: shared Queue of ebayno work items
        fmt = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=fmt)
        super().__init__()
        self.queue = queue
        self.spider = EbaySpider()  # one spider per worker

    def run(self):
        """Loop forever: crawl each queued id and mark it done."""
        while True:
            task = self.queue.get()
            self.spider.get_fitment(task)
            logging.info("now queue size is: %d" % self.queue.qsize())
            self.queue.task_done()
class EbaySpiderJob():
    """Run *size* ThreadCrawl workers over the ids in *qs* and wait for
    the whole queue to be processed."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs      # iterable of ebay item numbers

    def work(self):
        """Start the workers, enqueue everything, block until drained."""
        work_queue = Queue()
        for _ in range(self.size):
            worker = ThreadCrawl(work_queue)
            worker.setDaemon(True)  # daemon so the process can exit after join()
            worker.start()
        for task in self.qs:
            work_queue.put(task)
        work_queue.join()
if __name__ == '__main__':
    ### Quick single-item smoke test (uncomment to run):
    # spider = EbaySpider()
    # spider.get_fitment("371809250769")
    # exit()
    # #### One-time database setup; keep commented out when resuming a
    # #### stopped crawl so existing data is not dropped:
    # db = Database()
    # db.dropTables()
    # db.createTables()
    # exit()
    #### database setup end
    #
    # Read the ebayno column from the spreadsheet and crawl with 16 workers.
    df = pd.read_excel("ebaynos_new.xlsx")
    qs = df["ebayno"].values
    amazonJob = EbaySpiderJob(16, qs)
    amazonJob.work()
方式三:在请求 URL 中通过 sfp 参数指定 Make、Model,只抓取对应车型的 Year 范围(起止年份)。
"""
2017-11-01
1 :在抓取fitment时,抓取api地址为
url_1 = 'https://frame.ebay.com/ebaymotors/ws/eBayISAPI.dll?GetFitmentData&rand=1509417472792&site=100&vs=0&req=2&item=' + str(ebayno) + '&ct=100&pn=&page=200'
&ct每页展示100
&page第几页,当页码大于最大页时,可展示全部
如果只想抓取部分 Year Make Model 可通过&sfp=设置
url_1 = 'https://frame.ebay.com/ebaymotors/ws/eBayISAPI.dll?GetFitmentData&rand=1509417472792&site=100&vs=0&req=2&sfp=Make%253A' + make + '%257CModel%253A' + model + '&item=' + str(ebayno) + '&ct=100&pn=&page=200'
2: 通过网址,响应的状态可能分三种情况:
"status":{"id":1,"name":"SUCCESS"} 该状态为正常SUCCESS,显示json格式的数据 ,data = sjson["data"]
"status":{"id":0,"name":"FAILURE"}, 非正常Failure,"data":[],
"status":null 非正常null "data":null
3:对于某些特殊的ebayno 271595213456 191749891947 当非正常网页出现3次,就停止对该ebayno的抓取
"""
import logging
import random
import threading
import urllib.parse
import urllib.parse
import requests
from queue import Queue
import pymysql
from bs4 import BeautifulSoup
import time
import re
import csv
import json
import webbrowser
import pandas as pd
import os
from http.cookiejar import CookieJar
class Spider():
    # For each (make, model, ebayno) work item, fetch the fitment list
    # filtered to that make/model and write the min/max Year to a one-row
    # xlsx under ebaynos\. Bad responses deliberately raise so the worker
    # thread re-queues the item (see ThreadCrawl.run).

    def __init__(self):
        self.SESSION = requests.session()
        self.SESSION.cookies = CookieJar()
        self.HEAD = self.randHeader()
        # ebayno -> attempt count; items are abandoned after 3 attempts
        self.Count = {}

    def randHeader(self):
        """Return an HTTP header dict with a randomly chosen User-Agent."""
        head_connection = ['Keep-Alive', 'close']
        head_accept = ['text/html, application/xhtml+xml, */*']
        head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
        head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                           'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                           'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                           'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                           'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                           'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                           'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                           'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        header = {
            'Connection': head_connection[0],
            'Accept': head_accept[0],
            'Accept-Language': head_accept_language[1],
            'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
        }
        return header

    def getFitment(self, item):
        """item: (make, model, ebayno) triple.

        Writes ebaynos\\"<make> <model> <ebayno>.xlsx" with the min/max
        fitment Year. Already-written items (file exists) are skipped.
        NOTE(review): a null/empty "data" field makes len(None)/min([])
        raise on purpose — the worker catches it and re-queues the item,
        and Count caps that at 3 attempts per ebayno.
        """
        make = str(item[0])
        model_original = str(item[1])
        ebayno = str(item[2])
        restr = " ".join([make, model_original, ebayno])
        if os.path.exists("ebaynos\\"+restr+".xlsx"):
            return
        if ebayno not in self.Count.keys():
            self.Count[ebayno]=1
        else:
            if self.Count[ebayno] == 3:  # stop after 3 failed attempts for this ebayno
                return
            self.Count[ebayno]+=1
        print(ebayno)
        # '&' must be escaped inside the sfp filter value
        model = str(model_original).replace("&","%26")
        url_1 = 'https://frame.ebay.com/ebaymotors/ws/eBayISAPI.dll?GetFitmentData&rand=1509417472792&site=100&vs=0&req=2&sfp=Make%253A' + make + '%257CModel%253A' + model + '&item=' + str(ebayno) + '&ct=100&pn=&page=200'
        # webbrowser.open(url_1)
        # webbrowser.open("https://www.ebay.com/itm/"+str(ebayno))
        # print(url_1)
        r = self.SESSION.get(url=url_1, headers=self.HEAD)
        # r = requests.get(url_1,headers = self.randHeader())
        soup = BeautifulSoup(r.text , "html.parser")
        s = str(soup)
        sjson = json.loads(s)
        data = sjson["data"]
        # Intentional: raises TypeError when "data" is null ("status": null
        # response) so the worker re-queues the item.
        alen = len(data)
        Years = []
        for i in range(len(data)):
            row = data[i]
            Year = row["Year"][0]
            Years.append(Year)
        # Intentional: min([]) raises ValueError when "data" is [] (FAILURE
        # status) so the worker re-queues the item.
        y_start = min(Years)
        y_end = max(Years)
        temp = pd.DataFrame({"make":[make],"model":[model_original],"ebayno":[ebayno],"ebay_start":[y_start],"ebay_end":[y_end]}, columns=["make","model","ebayno","ebay_start","ebay_end"])
        restr = " ".join([make,model_original,ebayno])
        temp.to_excel("ebaynos\\"+restr+".xlsx", index=False)
class ThreadCrawl(threading.Thread):
    """Worker thread: pulls (make, model, ebayno) work items off the queue
    and crawls their fitment year range; failed items are re-queued (the
    Spider itself caps retries per ebayno)."""

    def __init__(self, queue):
        # queue: shared Queue of (make, model, ebayno) tuples
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[Spider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue
        self.spider = Spider()  # each worker owns its own Spider instance

    def run(self):
        """Consume items forever (thread is started as a daemon by the job)."""
        while True:
            item = self.queue.get()
            try:
                self.spider.getFitment(item)
            except Exception:
                # Narrowed from a bare 'except:' so KeyboardInterrupt /
                # SystemExit can still stop the worker; Spider relies on
                # this handler for its retry-on-bad-response design, so
                # the item goes back on the queue.
                logging.exception("getFitment failed for %s, re-queueing", item)
                self.queue.put(item)
            logging.info("now queue size is: %d" % self.queue.qsize())
            self.queue.task_done()
class SpiderJob():
    """Drive *size* ThreadCrawl workers over the work items in *qs*."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs      # iterable of work items

    def work(self):
        """Launch the workers, enqueue all items, wait for completion."""
        pending = Queue()
        for _ in range(self.size):
            crawler = ThreadCrawl(pending)
            crawler.setDaemon(True)  # daemon so join() ending lets the process exit
            crawler.start()
        for entry in self.qs:
            pending.put(entry)
        pending.join()