单线程爬虫:
# coding = utf-8
import requests
from lxml import etree
import time
class bdjSpider():
    """Single-threaded crawler: fetches 50 pages of jokes from
    budejie.com sequentially and prints every joke text."""

    def __init__(self):
        # Base listing URL; the page number is appended to form each page URL.
        self.start_url = 'http://www.budejie.com/text/'
        # Browser-like User-Agent so the site does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/62.0.3202.62 Safari/537.36'
        }

    def get_url(self):
        """Return the list of the first 50 page URLs."""
        # range-based comprehension replaces the manual while-loop counter
        return [self.start_url + str(page_num) for page_num in range(1, 51)]

    def parse(self, url):
        """Fetch *url* and return the raw response body as bytes."""
        # timeout prevents the crawler from hanging forever on a dead server
        ret = requests.get(url, headers=self.headers, timeout=10)
        return ret.content

    def get_content(self, response):
        """Extract the joke texts (list of str) from one page of HTML."""
        ret = etree.HTML(response)
        return ret.xpath('//div[@class="j-r-list-c-desc"]/a/text()')

    def run(self):
        """Main loop: crawl every page in order and print each joke."""
        for url in self.get_url():
            response = self.parse(url)
            for joke in self.get_content(response):
                print(joke)
if __name__ == '__main__':
    # Time the whole crawl so it can be compared with the
    # multi-threaded version of this spider.
    started_at = time.time()
    spider = bdjSpider()
    spider.run()
    print(time.time() - started_at)
多线程爬虫
# coding = utf-8
import requests
from lxml import etree
import time
from queue import Queue
import threading
class bdjSpider():
    """Multi-threaded crawler: fetcher and parser workers communicate
    through two queues (URLs -> raw pages -> printed jokes)."""

    def __init__(self):
        # Base listing URL; the page number is appended to form each page URL.
        self.start_url = 'http://www.budejie.com/text/'
        # Browser-like User-Agent so the site does not reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/62.0.3202.62 Safari/537.36'
        }
        # Pipeline: get_url -> url_list_queue -> parse -> response_queue -> get_content
        self.url_list_queue = Queue()
        self.response_queue = Queue()

    def get_url(self):
        """Producer: put the first 50 page URLs on the URL queue."""
        for page_num in range(1, 51):
            self.url_list_queue.put(self.start_url + str(page_num))

    def parse(self):
        """Fetcher worker: pull URLs, download the page, enqueue the body."""
        while True:
            url = self.url_list_queue.get()
            # timeout prevents a worker from hanging forever on a dead server
            ret = requests.get(url, headers=self.headers, timeout=10)
            # put the result BEFORE task_done() so url_list_queue.join()
            # cannot return while a response is still in flight
            self.response_queue.put(ret.content)
            self.url_list_queue.task_done()

    def get_content(self):
        """Parser worker: extract the joke texts from a page and print them."""
        while True:
            response = self.response_queue.get()
            ret = etree.HTML(response)
            content_list = ret.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
            for content in content_list:
                print(content)
            self.response_queue.task_done()

    def run(self):
        """Start the workers, fill the URL queue, wait for all work to finish."""
        worker_threads = []
        for _ in range(3):
            worker_threads.append(threading.Thread(target=self.parse))
            worker_threads.append(threading.Thread(target=self.get_content))
        for t in worker_threads:
            # daemon threads die with the main thread once the joins return;
            # t.daemon = True replaces setDaemon(), deprecated since Python 3.10
            t.daemon = True
            t.start()
        # Fill the URL queue from the main thread. Running the producer as a
        # daemon thread (the original design) raced with the joins below:
        # join() on a still-empty queue returns immediately, so the program
        # could exit before any URL had even been enqueued.
        self.get_url()
        for q in (self.url_list_queue, self.response_queue):
            q.join()
        print('运行结束')
if __name__ == '__main__':
    # Time the whole crawl so it can be compared with the
    # single-threaded version of this spider.
    started_at = time.time()
    spider = bdjSpider()
    spider.run()
    print(time.time() - started_at)
对比
多线程运行速度快于单线程(废话)
此处采用队列和线程Queue 以及threading
在类中初始化队列容器
例如:
self.response_queue = Queue()
在每个方法中在队列中get数据
并向队列put数据
采用task_done() 标记队列中的一项任务已完成(配合join()使用)
def parse(self): # 主请求
while True:
url = self.url_list_queue.get()
ret = requests.get(url, headers=self.headers)
response = ret.content
self.response_queue.put(response)
self.url_list_queue.task_done()
在逻辑运行run函数中
建立线程 threading.Thread(target=函数名)
把所有线程加入到线程列表thread_list
for t in thread_list:
#设置守护线程
t.setDaemon(True)
# 运行线程
t.start()
按照以上写法,在主线程结束后,守护子线程就结束,这不是我所要的
所以采用
for q in [self.url_list_queue, self.response_queue]:
q.join()
当队列中的任务全部完成(每项都已task_done),join()返回,主线程结束,守护线程随之销毁