Implementing a Multithreaded Crawler in Python

1. Python Multithreaded Crawler

When crawling data in batches, fetching one page at a time is often quite slow; this is where multithreading helps.
Python supports multithreading, mainly through the threading module (and the lower-level _thread module, which was called thread in Python 2).
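
Before looking at the crawler itself, here is a minimal sketch of the usual threading pattern: create Thread objects, start them, and join them so the main thread waits for all workers to finish. The worker function and its argument below are made up purely for illustration.

import threading

def worker(n):
    # Hypothetical worker, only for illustration
    print("worker", n, "running in", threading.current_thread().name)

threads = []
for n in range(3):
    t = threading.Thread(target=worker, args=(n,))
    threads.append(t)
    t.start()

for t in threads:
    t.join()  # wait for every worker before the main thread continues
print("all workers finished")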

A single-threaded crawler is noticeably slower by comparison. For example:

import requests
from bs4 import BeautifulSoup
import time

start_time = time.time()

def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    s = requests.session()
    # Crawl the first 5 search result pages one after another
    for i in range(1, 6):
        url = "https://so.csdn.net/so/search/s.do?p=" + str(i) + "&q=python"
        html = s.get(url, headers=headers)
        html.encoding = "utf-8"
        soup = BeautifulSoup(html.text, "html.parser")
        # Each search result sits in a div with class "limit_width"
        for div in soup.find_all('div', class_='limit_width'):
            for link in div.find_all('a'):
                text = link.get_text()
                href = link["href"]
                if "CSDN" not in text:
                    print(text)
                    print(href)

main()
end = time.time()
print(end - start_time)

#Run result:
#......
#Time-Cost:2.061112642288208

Next, we try a multithreaded approach on the same crawling task, as shown below:

# coding=utf-8
import threading, queue, time
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
baseUrl = "https://so.csdn.net/so/search/s.do?p="

urlQueue = queue.Queue()

# Put the URLs of the first 5 search result pages into the queue
for i in range(1, 6):
    url = baseUrl + str(i) + "&q=python"
    urlQueue.put(url)
    #print(url)

def fetchUrl(urlQueue):
    while True:
        try:
            # Read from the queue without blocking; an empty queue raises queue.Empty
            url = urlQueue.get_nowait()
            i = urlQueue.qsize()
            #print(url, threading.current_thread().name)
        except queue.Empty:
            break
        #print('Current Thread Name %s, Url: %s ' % (threading.current_thread().name, url))
        try:
            s = requests.session()
            html = s.get(url, headers=headers)
            html.encoding = "utf-8"
            soup = BeautifulSoup(html.text, "html.parser")
            # Each search result sits in a div with class "limit_width"
            for div in soup.find_all('div', class_='limit_width'):
                for link in div.find_all('a'):
                    text = link.get_text()
                    href = link["href"]
                    if "CSDN" not in text:
                        print(text)
                        print(href)
            print("Page crawled!")
        except Exception:
            # Swallow per-page errors so one bad page does not kill the thread
            pass
        # Further processing of the crawled content could go here.
        # To make the timing difference more visible, add a delay:
        #time.sleep(1)
        #print(html)

if __name__ == '__main__':
    startTime = time.time()
    print("Main thread:", threading.current_thread().name)
    threads = []
    # Adjust the number of threads to control the crawl speed
    threadNum = 5
    for i in range(0, threadNum):
        # Create a worker thread
        t = threading.Thread(target=fetchUrl, args=(urlQueue,))
        threads.append(t)
        #print(threads)
    for t in threads:
        t.start()
    for t in threads:
        # Join each thread in turn so the main thread exits last,
        # without the worker threads blocking one another
        t.join()
    endTime = time.time()
    print("Main thread finished:", threading.current_thread().name)
    print('Done, Time cost: %s ' % (endTime - startTime))

#Run result:
#Main thread: MainThread
#Python游戏开发入门
#https://edu.csdn.net/course/detail/5690
#Python, Python, Python
#https://blog.csdn.net/ww_great/article/details/3057071
#......
#Page crawled!
#Main thread finished: MainThread
#Time cost: 0.7241780757904053 

If we set threadNum = 2, i.e. use only two worker threads to consume the queue, the crawl becomes noticeably slower.

Running it again, we get Time cost: 1.3654978275299072
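
For comparison, the same crawl can also be written with concurrent.futures.ThreadPoolExecutor from the standard library, where max_workers plays the role of threadNum and the executor takes care of queueing the URLs and joining the workers. This is only a sketch: the fetch_page function below is not part of the original code, it simply wraps the same requests + BeautifulSoup logic shown above.

import time
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

def fetch_page(url):
    # Same per-page logic as fetchUrl above, wrapped for use with a thread pool
    html = requests.get(url, headers=headers)
    html.encoding = "utf-8"
    soup = BeautifulSoup(html.text, "html.parser")
    for div in soup.find_all('div', class_='limit_width'):
        for link in div.find_all('a'):
            if "CSDN" not in link.get_text():
                print(link.get_text())
                print(link["href"])

urls = ["https://so.csdn.net/so/search/s.do?p=%d&q=python" % i for i in range(1, 6)]

start = time.time()
with ThreadPoolExecutor(max_workers=5) as pool:  # max_workers corresponds to threadNum
    pool.map(fetch_page, urls)
print("Time cost:", time.time() - start)

Lowering max_workers from 5 to 2 should show roughly the same slowdown that we observed with threadNum above.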
