Multithreading Exercise [Ganji]

1:    Packages and constants used

import urllib.request
from queue import Queue
import time
import threading
from lxml import etree

queue = Queue()        # URL queue shared by the worker threads
DOWNLOADER_NUM = 10    # number of downloader (worker) threads
threads = []           # keeps references to the started threads
url = "http://sz.ganji.com/site/s/_python%20%E7%88%AC%E8%99%AB/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}

2:    Processing the first-level (listing) page

req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
tree = etree.HTML(res.read())
# Collect the links to the detail (second-level) pages.
url_list = tree.xpath('//div[@class="job-wanted"]/dl/dt/a/@href')
# print(url_list)
for i in url_list:
    url = "http://sz.ganji.com/" + i
    queue.put(url)    # enqueue each detail-page URL for the workers
    # print(url)
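
Note: joining the links with plain string concatenation assumes every href in url_list is relative. A slightly more robust sketch uses urljoin from the standard library (the loop below mirrors the one above and is only an illustration):

from urllib.parse import urljoin

for href in url_list:
    # urljoin handles absolute, root-relative and relative hrefs alike
    queue.put(urljoin("http://sz.ganji.com/", href))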

3:    Processing the second-level (detail) pages

def gan_spiders(url1):
    """Fetch one detail page and print the salary information."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
    req = urllib.request.Request(url=url1, headers=headers)
    res = urllib.request.urlopen(req)
    tree = etree.HTML(res.read())
    daiyu = tree.xpath('//div[@class="salary-line"]/b/text()')
    print(daiyu)
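
Since several worker threads will call this function at the same time, their print() output can interleave. A common remedy, shown here only as a sketch (print_lock is an added name, not part of the original code; the module-level headers is reused), is to guard the output with a threading.Lock:

print_lock = threading.Lock()

def gan_spiders(url1):
    req = urllib.request.Request(url=url1, headers=headers)
    res = urllib.request.urlopen(req)
    tree = etree.HTML(res.read())
    daiyu = tree.xpath('//div[@class="salary-line"]/b/text()')
    with print_lock:  # serialize output across the worker threads
        print(daiyu)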

4:    Calling the function from the worker loop

def main():
    # Worker loop: keep taking URLs from the queue until a None sentinel arrives.
    while True:
        url1 = queue.get()
        if url1 is None:
            break
        gan_spiders(url1)
        queue.task_done()    # required so that queue.join() can return
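
The queue.task_done() call matters because queue.join() (used in the next section) only returns once task_done() has been called for every item that was put(); without it the main thread would block forever. A minimal standalone illustration of this pairing:

from queue import Queue

q = Queue()
q.put("some-url")
item = q.get()
q.task_done()  # without this call, q.join() below would never return
q.join()       # returns immediately because every item is accounted for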

5:    Starting and shutting down the threads

if __name__ == '__main__':
    start_time = time.time()
    # Start the worker threads.
    for i in range(DOWNLOADER_NUM):
        t = threading.Thread(target=main)
        t.start()
        threads.append(t)
    # Wait until every queued URL has been processed (relies on task_done()).
    queue.join()
    # Put one None sentinel per worker so every worker loop can exit.
    for i in range(DOWNLOADER_NUM):
        queue.put(None)
    for t in threads:
        t.join()
    cost_seconds = time.time() - start_time
    print("Crawl finished in %.2f seconds" % cost_seconds)
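
For comparison, the same fan-out can be written with concurrent.futures.ThreadPoolExecutor from the standard library, which manages the worker threads itself and removes the need for the queue, the None sentinels, and the manual join() calls. This is only a sketch and assumes gan_spiders() and url_list from the sections above:

from concurrent.futures import ThreadPoolExecutor

detail_urls = ["http://sz.ganji.com/" + i for i in url_list]
with ThreadPoolExecutor(max_workers=DOWNLOADER_NUM) as pool:
    # leaving the with-block waits until every submitted URL has been processed
    pool.map(gan_spiders, detail_urls)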

6:    Complete code example

import urllib.request
from queue import Queue
import time
import threading
from lxml import etree
queue = Queue()
DOWNLOADER_NUM = 10
threads = []
url = "http://sz.ganji.com/site/s/_python%20%E7%88%AC%E8%99%AB/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
req = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(req)
tree = etree.HTML(res.read())
url_list = tree.xpath('//div[@class="job-wanted"]/dl/dt/a/@href')
# print(url_list)
for i in url_list:
    url = "http://sz.ganji.com/" + i
    queue.put(url)
    # print(url)

def gan_spiders(url1):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
    req = urllib.request.Request(url=url1, headers=headers)
    res = urllib.request.urlopen(req)
    tree = etree.HTML(res.read())
    daiyu = tree.xpath('//div[@class="salary-line"]/b/text()')
    print(daiyu)

def main():
    while True:
        url1 = queue.get()
        if url1 is None:
            break
        gan_spiders(url1)
        queue.task_done()    # required so that queue.join() can return


if __name__ == '__main__':
    start_time = time.time()
    for i in range(DOWNLOADER_NUM):
        t = threading.Thread(target=main)
        t.start()
        threads.append(t)
    queue.join()
    for i in range(DOWNLOADER_NUM):
        queue.put(None)
    for t in threads:
        t.join()
    cost_seconds = time.time() - start_time
    print("Crawl finished in %.2f seconds" % cost_seconds)


