普通版本(借用队列)
import requests
from lxml import etree
import os
from queue import Queue
import threading
import time
# Entry page: the ranking list whose rows link to the detail pages.
start_url = "http://www.qianmu.org/ranking/1528.htm"
# Work queue feeding detail-page links to the worker threads.
link_queue = Queue()
# Size of the worker-thread pool.
threads_num = 10
# Handles of started worker threads, joined at shutdown.
threads = []
# Total pages fetched; incremented by the workers (not lock-protected).
download_pages = 0
def fetch(url):
    """Fetch *url* and return its body text with tab characters stripped.

    Raises requests.HTTPError for 4xx/5xx responses and
    requests.Timeout if the server does not answer in time.
    """
    # A timeout prevents a dead/slow server from hanging a worker thread forever.
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        resp.raise_for_status()
    return resp.text.replace("\t", "")
def parse_university(link):
    """Fetch a university detail page and extract its infobox table.

    Returns a dict with key "name" (the page's h1 heading) plus one
    entry per infobox row, or None when the page has no infobox table
    or the rows are malformed (more keys than values).
    """
    html = fetch(link)
    selector = etree.HTML(html)
    data = {}
    # School name from the wiki-content heading.
    data["name"] = selector.xpath("//div[@id='wikiContent']/h1/text()")[0]
    try:
        table = selector.xpath("//div[@class='infobox']//table")[0]
    except IndexError:
        # Page carries no infobox; only the name is available.
        print("无表格信息")
        return None
    keys = table.xpath(".//td[1]/p/text()")
    values = table.xpath(".//td[2]/p//text()")
    print(len(keys), len(values))
    # A key without a matching value would mis-align the zip below.
    if len(keys) > len(values):
        return None
    data.update(zip(keys, values))
    return data
def download():
    """Worker loop: pop links off link_queue until a None sentinel arrives.

    task_done() is guaranteed via try/finally so that link_queue.join()
    in the main thread cannot deadlock if parsing a page raises.
    """
    global download_pages
    while True:
        # Blocks until a message is available in the queue.
        link = link_queue.get()
        if link is None:
            # Sentinel pushed by the main thread after join(): exit worker.
            break
        try:
            # Extract the detail-page information.
            data = parse_university(link)
            # NOTE(review): += on a shared int is not atomic; the count may
            # be slightly off under contention — acceptable for a statistic.
            download_pages += 1
            if data:
                print(data)
        finally:
            # Always acknowledge the item, otherwise join() blocks forever.
            link_queue.task_done()
            print(f'remaining queue is {link_queue.qsize()}')
if __name__ == "__main__":
    # Start timing the whole crawl.
    start_time = time.time()
    # Request the entry (ranking) page with a browser User-Agent.
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    headers = {"User-Agent": ua}
    resp = requests.get(start_url, headers=headers)
    selector = etree.HTML(resp.text)
    # Extract the detail-page links from the ranking table.
    links = selector.xpath("//div[@class='rankItem']//td[2]/a/@href")
    for link in links:
        # Relative links need the site prefix.
        if not link.startswith("http://www.qianmu.org"):
            link = "http://www.qianmu.org/" + link
        link_queue.put(link)
    # Spawn the worker pool.
    for i in range(threads_num):
        t = threading.Thread(target=download)
        t.start()
        threads.append(t)
    # Block until every queued link has been acknowledged via task_done().
    link_queue.join()
    # One None sentinel per worker so that every loop exits.
    for i in range(threads_num):
        link_queue.put(None)
    # BUG FIX: the original looped `for i in threads: t.join()`, joining the
    # last-created thread repeatedly; join every worker thread instead.
    for worker in threads:
        worker.join()
    finished_time = time.time()
    cost_seconds = finished_time - start_time
    print(f"download finished!!!耗时:{cost_seconds}s,抓取界面:{download_pages}个")
升级版本(借用redis)
import requests
from lxml import etree
import os
import signal
from queue import Queue
import threading
import time
import redis
# Entry page: the ranking list whose rows link to the detail pages.
start_url = "http://www.qianmu.org/ranking/1528.htm"
# NOTE(review): leftover from the queue-based version — the redis list
# "qianmu.queue" is the real work queue in this script.
link_queue = Queue()
# Size of the worker-thread pool.
threads_num = 10
# Handles of started threads (workers + exit watcher), joined at shutdown.
threads = []
# Cooperative shutdown flag polled by every worker thread.
thread_on = True
# Total pages fetched; incremented by the workers (not lock-protected).
download_pages = 0
# NOTE(review): "host"/"password" are placeholders — fill in real credentials.
my_redis = redis.Redis(host="host", password="password")
def fetch(url):
    """Fetch *url* and return its body text with tab characters stripped.

    Raises requests.HTTPError for 4xx/5xx responses and
    requests.Timeout if the server does not answer in time.
    """
    # A timeout prevents a dead/slow server from hanging a worker thread forever.
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        resp.raise_for_status()
    return resp.text.replace("\t", "")
def parse_university(link):
    """Fetch a university detail page and extract its infobox table.

    Returns a dict with key "name" (the page's h1 heading) plus one
    entry per infobox row, or None when the page has no infobox table
    or the rows are malformed (more keys than values).
    """
    html = fetch(link)
    selector = etree.HTML(html)
    data = {}
    # School name from the wiki-content heading.
    data["name"] = selector.xpath("//div[@id='wikiContent']/h1/text()")[0]
    try:
        table = selector.xpath("//div[@class='infobox']//table")[0]
    except IndexError:
        # Page carries no infobox; only the name is available.
        print("无表格信息")
        return None
    keys = table.xpath(".//td[1]/p/text()")
    values = table.xpath(".//td[2]/p//text()")
    print(len(keys), len(values))
    # A key without a matching value would mis-align the zip below.
    if len(keys) > len(values):
        return None
    data.update(zip(keys, values))
    return data
def download(i):
    """Worker loop: pop links from the redis list until thread_on is cleared.

    i: worker index, used only in the exit log line.
    """
    global download_pages
    while thread_on:
        # Non-blocking pop; returns None when the list is empty.
        link = my_redis.lpop("qianmu.queue")
        if link:
            # BUG FIX: redis returns bytes — decode before using it as a URL.
            if isinstance(link, bytes):
                link = link.decode("utf-8")
            # Extract the detail-page information.
            data = parse_university(link)
            # NOTE(review): += on a shared int is not atomic; the count may
            # be slightly off under contention — acceptable for a statistic.
            download_pages += 1
            if data:
                print(data)
            print(f'remaining queue is {my_redis.llen("qianmu.queue")}')
        else:
            # Queue empty: back off instead of busy-spinning on redis.
            time.sleep(0.2)
    print(f"Thread-{i} exit now")
def signal_handler(signum, frame):
    """SIGINT handler: ask every worker thread to stop at its next poll."""
    global thread_on
    print("received Ctrl+C, wait for exit gracefully")
    # Workers check this flag at the top of each loop iteration.
    thread_on = False
def exit_handler(i):
    """Console watchdog: typing "exit" on stdin clears thread_on.

    Needed on Windows, where the SIGINT handler is unreliable.
    i: thread index, used only in the exit log line.
    """
    global thread_on
    while thread_on:
        command = input("")
        if command == "exit":
            thread_on = False
    print(f"Thread-{i} exit now")
if __name__ == "__main__":
    # Start timing the whole crawl.
    start_time = time.time()
    # Request the entry (ranking) page with a browser User-Agent.
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    headers = {"User-Agent": ua}
    resp = requests.get(start_url, headers=headers)
    selector = etree.HTML(resp.text)
    # Extract the detail-page links from the ranking table.
    links = selector.xpath("//div[@class='rankItem']//td[2]/a/@href")
    for link in links:
        # Relative links need the site prefix.
        if not link.startswith("http://www.qianmu.org"):
            link = "http://www.qianmu.org/" + link
        # Dedupe via the redis seen-set; enqueue only links never seen before.
        if my_redis.sadd("qianmu.seen", link):
            my_redis.rpush("qianmu.queue", link)
    # Spawn the worker pool.
    for i in range(threads_num):
        t = threading.Thread(target=download, args=(i + 1,))
        t.start()
        threads.append(t)
    # signal has no effect on Windows; the exit_handler thread covers that case.
    signal.signal(signal.SIGINT, signal_handler)
    # Console watchdog so the crawl can be stopped by typing "exit".
    t = threading.Thread(target=exit_handler, args=(threads_num + 1,))
    t.start()
    threads.append(t)
    # BUG FIX: the original called link_queue.join() — a no-op, since nothing
    # is ever put on the stdlib queue in this version — and then looped
    # `for i in threads: t.join()`, joining only the last-created thread.
    # Join every thread in the pool instead.
    for worker in threads:
        worker.join()
    finished_time = time.time()
    cost_seconds = finished_time - start_time
    print(f"download finished!!!耗时:{cost_seconds}s,抓取界面:{download_pages}个")
这里注意一下:signal 在 Windows 下不起作用,所以我多加了一个线程用于监听标准输入;只有当输入内容为"exit"时,才会置位退出标志,让各线程优雅地退出程序。