"""python3 多线程爬虫的使用 demo

本文介绍如何在 Python3 中使用多线程进行网页爬虫开发,通过实例演示如何提高爬取效率,
同时讲解线程池的运用以及遇到的常见问题与解决方案。
"""
import datetime
import io
import json
import os
import sys
import threading
import time
from queue import Empty, Queue

import jsonpath
import requests
from pymongo import MongoClient



# Crawl worker thread
class ThreadCrawl(threading.Thread):
    """Crawler thread: pulls city names off ``cityQueue``, POSTs the hotel-list
    query for each city, and pushes any response text containing hotel data
    onto ``dataQueue``. Exits when the module-level ``CRAWL_EXIT`` flag is set.
    """

    def __init__(self, threadName, cityQueue, dataQueue):
        """threadName: label used in log output.
        cityQueue:  Queue of city-name strings to crawl.
        dataQueue:  Queue receiving raw JSON response text for the parsers.
        """
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.cityQueue = cityQueue
        self.dataQueue = dataQueue

        # Headers mimicking the WeChat mini-program client.
        # Fix: the original also hard-coded 'Content-Length': '415'; removed,
        # because requests computes the real body length itself and a stale
        # hard-coded value can corrupt the request.
        self.headers = {
            'charset':'utf-8',
            'Accept-Encoding':'gzip',
            'referer':'https://servicewechat.com/wx4a68a5b1b2d89fea/32/page-frame.html',
            'content-type':'application/json',
            'User-Agent':'Mozilla/5.0 (Linux; Android 5.1.1; Redmi 3 Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 MicroMessenger/7.0.6.1460(0x27000634) Process/appbrand2 NetType/WIFI Language/zh_CN',
            'Host':'sjz.ihotels.cc',
            'Connection':'Keep-Alive',
        }
        # Query payload template; 'cityName' is injected per city in run().
        self.json = {"brandNameList":["AA连锁酒店"],
                     "serviceNameList":[],
                     "distance":"",
                     "commentLeave":"",
                     "beginDate":"2019-08-26",
                     "endDate":"2019-08-27",
                     "areaList":[],
                     "keyword":"",
                     "openType":1,
                     "pageIndex":1,
                     "pageSize":50,
                     "sortOpt":"",
                     "sortType":"",
                     "memberId":"",
                     "channel":17,
                     "deviceType":"4",
                     "tagVersion":"5.0.0",
                     "deviceName":"Redmi 3",
                     "code":"023NrRa70BYVxF1c7ra70LBGa70NrRan"
                     }

    def run(self):
        print('启动'+self.threadName+'...')
        while not CRAWL_EXIT:
            try:
                # Non-blocking get raises queue.Empty instead of waiting, so
                # the CRAWL_EXIT flag is re-checked promptly.
                city = self.cityQueue.get(False)
            except Empty:
                continue
            print(city+'-----------')
            self.json['cityName'] = city
            listUrl = "https://sjz.ihotels.cc//ethank-sjz-web/rest/hotelResource/v2.1/queryHotelList"
            try:
                content = requests.post(listUrl, headers=self.headers, json=self.json)
            except requests.RequestException:
                # Best-effort like the original bare `except: pass`, but now
                # only network failures are swallowed; this city is dropped.
                continue
            if 'hotelName' in content.text:
                self.dataQueue.put(content.text)
        print("结束 " + self.threadName)

# Shutdown flags polled by the worker threads; mian() flips each to True
# once the corresponding queue has been drained.
CRAWL_EXIT = False
PARSE_EXIT = False


class ThreadParse(threading.Thread):
    """Parser thread: pulls raw JSON responses off ``dataQueue``, extracts one
    record per hotel and accumulates them in ``self.strings``. Exits when the
    module-level ``PARSE_EXIT`` flag is set.
    """

    def __init__(self, threadName, dataQueue, lock):
        """threadName: label used in log output.
        dataQueue:  Queue of raw JSON response strings from the crawlers.
        lock:       held for interface compatibility; parse() does not use it.
        """
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.lock = lock
        self.strings = []  # accumulated hotel record dicts

    def run(self):
        print('启动' + self.threadName + '...')
        # Bug fix: the original tested CRAWL_EXIT here, so parsers could exit
        # while dataQueue still held items and mian()'s drain-wait would spin
        # forever; PARSE_EXIT is the flag mian() actually sets for parsers.
        while not PARSE_EXIT:
            try:
                text = self.dataQueue.get(False)
            except Empty:
                continue
            try:
                self.parse(text)
            except Exception:
                # Skip malformed payloads (the original bare except silently
                # swallowed these too).
                continue
        print(self.strings)
        print("结束"+self.threadName)

    def parse(self, text):
        """Extract one record per entry in the response's hotelList."""
        # Removed the original per-call setup_io(): re-wrapping sys.stdout from
        # multiple threads is racy, and mian() already configures it once.
        # Also dropped json.loads(..., encoding=...): that keyword raises
        # TypeError on Python 3.9+.
        textjson = json.loads(text)
        jsonList = jsonpath.jsonpath(textjson, '$..hotelList')[0]
        for hotel in jsonList:  # renamed from 'list' (shadowed the builtin)
            poi_id = jsonpath.jsonpath(hotel, '$..hotelId')[0]
            poi_name = jsonpath.jsonpath(hotel, '$..hotelName')[0]
            city = jsonpath.jsonpath(hotel, '$..areaName')[0]
            # NOTE(review): 'address' also reads areaName, duplicating 'city';
            # this probably meant a street-address field — confirm against the
            # API response. Kept as-is to preserve output.
            address = jsonpath.jsonpath(hotel, '$..areaName')[0]
            score = jsonpath.jsonpath(hotel, '$..score')[0]
            crawl_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            strings = {'task_id': os.environ.get('CRAWLAB_TASK_ID'),
                       'poi_id': poi_id,
                       'poi_name': poi_name,
                       'city': city,
                       'address': address,
                       'score': score,
                       'crawl_time': crawl_time,
                       'source': "尚美荟",
                       'keyword': 'AA连锁'}
            print(strings)
            self.strings.append(strings)

# Workaround for console encoding errors when printing Chinese text.
def setup_io():
    """Rebind stdout/stderr as UTF-8, line-buffered text wrappers."""
    for name in ("stdout", "stderr"):
        raw = getattr(sys, name).detach()
        wrapped = io.TextIOWrapper(raw, encoding='utf-8', line_buffering=True)
        setattr(sys, name, wrapped)
        setattr(sys, "__%s__" % name, wrapped)
#程序主函数
def mian(citylist):
    setup_io()#编码转换错误解决方式
    cityjson = json.loads(citylist, encoding='utf-8')#把列表转为json字符串
    citys = jsonpath.jsonpath(cityjson, '$..city')#获取所有的city
    print(citys)
    #创建城市队列
    cityQueue = Queue()
    for i in citys:
        cityQueue.put(i)
    #创建数据队列
    dataQueue = Queue()
    #创建锁
    lock = threading.local
    #创建采集线程
    crawlList = ['采集线程1号','采集线程2号','采集线程3号']
    threadcrawl = [];
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, cityQueue,dataQueue)
        thread.start()
        threadcrawl.append(thread)

    #创建解析线程
    parseList = ['解析线程1号','解析线程2号','解析线程3号']
    threadparse = [];
    for threadName in parseList:
        thread = ThreadParse(threadName,dataQueue,lock)
        thread.start()
        threadparse.append(thread)
    while not cityQueue.empty():
        pass
    # 如果cityQueue为空,采集线程退出循环
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("cityQueue为空")
    # 让抓取主线程进入阻塞状态,等待子线程执行完毕再退出
    for thread in threadcrawl:
        thread.join()
        print("1")
    # 如果dataQueue为空,采集线程退出循环
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    #让数据主线程进入阻塞状态,等待子线程执行完毕再退出
    for thread in threadparse:
        thread.join()
        print("2")
    print("谢谢使用!")

#程序入口
if __name__ == '__main__':
    cityjson=[ {
            "city": "成都",
            "initial": "c"
            }, {
            "city": "上海",
            "initial": "s"
            }, {
            "city": "中卫",
            "initial": "Z"
            },{
                "city": "重庆",
                "initial": "C"
            }, {
                "city": "承德",
                "initial": "C"
            }, {
                "city": "沧州",
                "initial": "C"
            }, {
                "city": "长治",
                "initial": "C"
            }, {
                "city": "赤峰",
                "initial": "C"
            }, {
                "city": "朝阳",
                "initial": "C"
            }, {
                "city": "长春",
                "initial": "C"
            }, {
                "city": "常州",
                "initial": "C"
            }, {
                "city": "滁州",
                "initial": "C"
            }, {
                "city": "巢湖",
                "initial": "C"
            }, {
                "city": "池州",
                "initial": "C"
            }, {
                "city": "长沙",
                "initial": "C"
            }
    ]
    json_str = json.dumps(cityjson, ensure_ascii=False) #[{"city": "阿拉善盟", "initial": "A"}, {"city": "鞍山", "initial": "A"}, {"city": "安庆", "initial": "A"}, {"city": "中卫", "initial": "Z"}]
    mian(json_str)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值