多线程抓取页面
- 导入标准库threading模块,并创建一个链接队列
- 将从一级页面抓取到的链接放入队列
- 启动若干个线程,自定义用几个线程来执行函数
- 每个线程运行download函数,从队列取链接,执行二级页面抓取与解析
- 队列处理完毕后放入None哨兵,退出线程
import time
import threading
from queue import Queue
import lxml.etree
import requests
# Entry page: the 2018 USNEWS world-university-ranking index table.
START_URL= 'http://qianmu.iguye.com/2018USNEWS世界大学排名'
# Number of downloader worker threads (despite the name "LINE", the main
# block uses this value as the thread count — TODO confirm intent).
DOWNLOAD_LINE = 10
# Work queue of detail-page URLs; a None sentinel tells a worker to exit.
link_queue = Queue()
# Worker Thread objects, kept so the main block can join() them at the end.
threads = []
# Running count of pages fetched so far (incremented inside fetch()).
download_pages = 0
def fetch(url, raise_err=True):
    """Download *url* and return its body decoded as UTF-8 text.

    On any request failure the exception is printed and None is returned,
    so callers must be prepared for a None result.

    :param url: page URL to download
    :param raise_err: when True, treat HTTP error statuses (4xx/5xx) as
        failures via Response.raise_for_status()
    :return: page text, or None when the request failed
    """
    global download_pages
    try:
        r = requests.get(url)
        # BUG FIX: the original placed raise_for_status() in an `else`
        # clause after `except`, which could never execute because the
        # `try` body always returned first — HTTP error pages were
        # silently treated as success. Check the status before returning.
        if raise_err:
            r.raise_for_status()
        r.encoding = 'utf-8'
        # NOTE: `+=` on a global is not guaranteed atomic across threads;
        # the counter may drift slightly under contention. Good enough
        # for a progress statistic.
        download_pages += 1
        return r.text
    except requests.RequestException as e:
        # Narrowed from a bare `except Exception` to request-related
        # errors only; preserve the original best-effort behavior of
        # printing and returning None.
        print(e)
        return None
def parse_university(html):
    """Print the name and all info-box fields of one university wiki page.

    :param html: full HTML text of a university detail page
    """
    doc = lxml.etree.HTML(html)
    # The page heading carries the university name.
    print(doc.xpath('//*[@id="wikiContent"]/h1/text()')[0])
    box = doc.xpath('//div[@class="infobox"]')[0]
    # First table column holds the field labels, second the values.
    labels = box.xpath('./table//tr/td[1]/p//text()')
    cells = box.xpath('./table//tr/td[2]')
    for label, cell in zip(labels, cells):
        # A value cell may contain nested markup; flatten it to text.
        value = ''.join(cell.xpath('.//text()'))
        print('%s:%s' % (label, value))
    print('-' * 30)
def download():
    """Worker loop: pull links off link_queue until a None sentinel arrives.

    Each link is fetched and parsed. task_done() is called in a `finally`
    block so that link_queue.join() in the main block cannot deadlock.
    """
    while True:
        link = link_queue.get()
        # None is the shutdown sentinel pushed by the main block.
        if link is None:
            break
        try:
            html = fetch(link)
            # BUG FIX: fetch() returns None on failure; the original
            # passed that straight to parse_university(), raising inside
            # the thread, killing it before task_done() was called and
            # hanging link_queue.join() forever.
            if html is not None:
                parse_university(html)
        except Exception as e:
            # A single bad page must not take the worker down.
            print(e)
        finally:
            link_queue.task_done()
        print('remaining queue: %s' % link_queue.qsize())
if __name__ == '__main__':
    start_time = time.time()
    # Fetch the ranking index page and queue every university link.
    index = lxml.etree.HTML(fetch(START_URL))
    for href in index.xpath('//*[@id="content"]/table/tbody/tr/td/a/@href'):
        # Resolve relative hrefs against the site root.
        if not href.startswith('http://'):
            href = 'http://qianmu.iguye.com/%s' % href
        link_queue.put(href)
    # Spin up the worker threads.
    for _ in range(DOWNLOAD_LINE):
        worker = threading.Thread(target=download)
        worker.start()
        threads.append(worker)
    # Block until every queued link has been marked done...
    link_queue.join()
    # ...then push one None sentinel per worker so each loop exits.
    for _ in range(DOWNLOAD_LINE):
        link_queue.put(None)
    for worker in threads:
        worker.join()
    cost_seconds = time.time() - start_time
    print('download %s pages in %.2f seconds' % (download_pages, cost_seconds))