一: 所要用到的包和常量
import urllib.request
from queue import Queue
import time
import threading
from lxml import etree
# Number of downloader threads started by the entry point.
DOWNLOADER_NUM = 10

# Search-result (first-level) page that lists the job postings to crawl.
url = "http://sz.ganji.com/site/s/_python%20%E7%88%AC%E8%99%AB/"

# Request headers sent with the first-level page request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.117 Safari/537.36"
    ),
}

# Work queue shared between the producer code and the worker threads.
queue = Queue()

# Handles of the started worker threads, kept so they can be joined later.
threads = []
二: 一级页面的处理
# urljoin resolves each href against the site root, so root-relative
# ("/x"), relative ("x") and absolute hrefs all yield a valid URL;
# plain string concatenation does not.
from urllib.parse import urljoin

# Download the first-level (listing) page. The `with` block closes the
# HTTP response as soon as the body has been read.
req = urllib.request.Request(url=url, headers=headers)
with urllib.request.urlopen(req) as res:
    tree = etree.HTML(res.read())

# Collect the link of every job posting on the listing page.
url_list = tree.xpath('//div[@class="job-wanted"]/dl/dt/a/@href')

# Enqueue one absolute detail-page URL per posting. A dedicated loop
# variable is used so the module-level `url` keeps pointing at the
# listing page instead of being overwritten.
for href in url_list:
    queue.put(urljoin("http://sz.ganji.com/", href))
三: 二级页面的处理
def gan_spiders(url1):
    """Download one second-level (detail) page and print its salary text.

    Args:
        url1: URL of the detail page to fetch.

    Returns:
        None. The list of text nodes matched by the salary XPath is
        printed to stdout.

    Errors raised by ``urlopen`` are not handled here and propagate to
    the caller.
    """
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/66.0.3359.117 Safari/537.36"
        ),
    }
    req = urllib.request.Request(url=url1, headers=request_headers)

    # Close the response explicitly: this function is called once per
    # queued URL, and an unclosed response keeps its socket open until
    # garbage collection.
    with urllib.request.urlopen(req) as res:
        page = res.read()

    tree = etree.HTML(page)
    salary = tree.xpath('//div[@class="salary-line"]/b/text()')
    print(salary)
四: 函数的调用
def main():
    """Worker loop: process URLs from the shared queue until a None arrives.

    Each item taken from ``queue`` is passed to ``gan_spiders``. A
    ``None`` item is the shutdown sentinel and ends the loop.

    Every ``queue.get()`` is matched by exactly one ``queue.task_done()``;
    without that, ``queue.join()`` never returns.
    """
    while True:
        url1 = queue.get()
        try:
            if url1 is None:
                break
            gan_spiders(url1)
        except Exception as exc:
            # Worker-loop boundary: one failing page must not kill the
            # thread, otherwise the remaining URLs may never be consumed.
            print("failed to process {}: {!r}".format(url1, exc))
        finally:
            # Runs on success, failure and the sentinel `break` alike.
            queue.task_done()
五: 线程的启动和关闭
if __name__ == '__main__':
    # Wall-clock start of the crawl.
    start_time = time.time()

    # Start the fixed-size pool of workers; each one runs main().
    for _ in range(DOWNLOADER_NUM):
        worker = threading.Thread(target=main)
        worker.start()
        threads.append(worker)

    # Queue one None sentinel per worker. The queue is FIFO and every URL
    # was put on it earlier in the script, so all URLs are taken before any
    # worker sees a sentinel. This drains the queue without depending on
    # task_done() bookkeeping in the workers, which Queue.join() would
    # require (it blocks until task_done() has been called for every item).
    for _ in range(DOWNLOADER_NUM):
        queue.put(None)

    # Wait until every worker has finished.
    for worker in threads:
        worker.join()

    # Total elapsed time of the crawl, in seconds.
    cost_seconds = time.time() - start_time
六: 整体代码示例
import urllib.request
from queue import Queue
import time
import threading
from lxml import etree
# urljoin resolves each href against the site root, so root-relative
# ("/x"), relative ("x") and absolute hrefs all yield a valid URL;
# plain string concatenation does not.
from urllib.parse import urljoin

# Work queue shared between the producer code below and the worker threads.
queue = Queue()
# Number of downloader threads started by the entry point.
DOWNLOADER_NUM = 10
# Handles of the started worker threads, kept so they can be joined later.
threads = []
# Search-result (first-level) page that lists the job postings to crawl.
url = "http://sz.ganji.com/site/s/_python%20%E7%88%AC%E8%99%AB/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.117 Safari/537.36"
    ),
}

# Download the first-level (listing) page. The `with` block closes the
# HTTP response as soon as the body has been read.
req = urllib.request.Request(url=url, headers=headers)
with urllib.request.urlopen(req) as res:
    tree = etree.HTML(res.read())

# Collect the link of every job posting on the listing page.
url_list = tree.xpath('//div[@class="job-wanted"]/dl/dt/a/@href')

# Enqueue one absolute detail-page URL per posting. A dedicated loop
# variable is used so the module-level `url` keeps pointing at the
# listing page instead of being overwritten.
for href in url_list:
    queue.put(urljoin("http://sz.ganji.com/", href))
def gan_spiders(url1):
    """Download one second-level (detail) page and print its salary text.

    Args:
        url1: URL of the detail page to fetch.

    Returns:
        None. The list of text nodes matched by the salary XPath is
        printed to stdout.

    Errors raised by ``urlopen`` are not handled here and propagate to
    the caller.
    """
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/66.0.3359.117 Safari/537.36"
        ),
    }
    req = urllib.request.Request(url=url1, headers=request_headers)

    # Close the response explicitly: this function is called once per
    # queued URL, and an unclosed response keeps its socket open until
    # garbage collection.
    with urllib.request.urlopen(req) as res:
        page = res.read()

    tree = etree.HTML(page)
    salary = tree.xpath('//div[@class="salary-line"]/b/text()')
    print(salary)
def main():
    """Worker loop: process URLs from the shared queue until a None arrives.

    Each item taken from ``queue`` is passed to ``gan_spiders``. A
    ``None`` item is the shutdown sentinel and ends the loop.

    Every ``queue.get()`` is matched by exactly one ``queue.task_done()``;
    without that, ``queue.join()`` never returns.
    """
    while True:
        url1 = queue.get()
        try:
            if url1 is None:
                break
            gan_spiders(url1)
        except Exception as exc:
            # Worker-loop boundary: one failing page must not kill the
            # thread, otherwise the remaining URLs may never be consumed.
            print("failed to process {}: {!r}".format(url1, exc))
        finally:
            # Runs on success, failure and the sentinel `break` alike.
            queue.task_done()
if __name__ == '__main__':
    # Wall-clock start of the crawl.
    start_time = time.time()

    # Start the fixed-size pool of workers; each one runs main().
    for _ in range(DOWNLOADER_NUM):
        worker = threading.Thread(target=main)
        worker.start()
        threads.append(worker)

    # Queue one None sentinel per worker. The queue is FIFO and every URL
    # was put on it earlier in the script, so all URLs are taken before any
    # worker sees a sentinel. This drains the queue without depending on
    # task_done() bookkeeping in the workers, which Queue.join() would
    # require (it blocks until task_done() has been called for every item).
    for _ in range(DOWNLOADER_NUM):
        queue.put(None)

    # Wait until every worker has finished.
    for worker in threads:
        worker.join()

    # Total elapsed time of the crawl, in seconds.
    cost_seconds = time.time() - start_time