Python Crawler: Multithreading and Multiprocessing

First, the multithreaded version, built on threading plus queue.Queue:

import requests
import json
from lxml import etree
import time
from retrying import retry
import threading
from queue import Queue

class QiushiSpider:
    def __init__(self):
        self.url = "https://www.qiushibaike.com/hot/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        # self.url_list = [self.url.format(i) for i in range(1, 14)]
        self.url_queue = Queue()
        for i in range(1, 14):
            self.url_queue.put(self.url.format(i))
        self.html_str_queue = Queue()
        self.content_list_queue = Queue()

    # in your IDE, Ctrl+click @retry to jump to its definition
    @retry(stop_max_attempt_number=5)
    def _get_html(self, url):
        response = requests.get(url, headers=self.headers)
        print(response.status_code)
        # assert raises AssertionError when the condition is false,
        # which makes @retry re-run this method (up to 5 attempts)
        assert response.status_code == 200
        return response.content.decode()


    def get_html(self):
        while True:
            url = self.url_queue.get()
            try:
                html_str = self._get_html(url)
            except Exception:
                html_str = None
            # return html_str
            self.html_str_queue.put(html_str)
            self.url_queue.task_done()

    def get_content_list(self):
        while True:
            html_str = self.html_str_queue.get()
            if html_str is None:
                # the request failed even after retries; skip this page
                self.html_str_queue.task_done()
                continue
            html = etree.HTML(html_str)
            # contains(@id,'qiushi_tag'): the id attribute contains 'qiushi_tag'
            div_list = html.xpath("//div[contains(@id,'qiushi_tag')]")
            content_list = []
            for div in div_list:
                cdict = {}
                # XPath indices start at 1; the branch after | matches anonymous
                # users (same path, just with a[1] replaced by span[1])
                cdict["header_img"] = div.xpath(".//div[@class='author clearfix']/a[1]/img/@src|.//div[@class='author clearfix']/span[1]/img/@src")
                # xpath() returns a list, so take the first element (index 0)
                cdict["header_img"] = "http:" + cdict["header_img"][0]
                cdict["username"] = div.xpath(".//div[@class='author clearfix']/a[1]/img/@alt|.//div[@class='author clearfix']/span[1]/img/@alt")[0]
                # gender and age share the same div; the class name encodes the gender
                cdict["sex"] = div.xpath(".//div[@class='author clearfix']/div/@class")
                cdict["sex"] = cdict["sex"][0].split(" ")[-1].replace("Icon", "") if len(cdict["sex"]) > 0 else None
                content_list.append(cdict)
            self.content_list_queue.put(content_list)
            self.html_str_queue.task_done()

    def save_data(self):
        while True:
            content_list = self.content_list_queue.get()
            for i in content_list:
                with open("1_qiushibk.txt", "a", encoding="utf-8") as f:
                    f.write(json.dumps(i, ensure_ascii=False, indent=2))
                    f.write("\n")  # newline between records so the file stays readable
                print(i)
            self.content_list_queue.task_done()

    def run(self):
        # single-threaded version, kept for comparison:
        # for i in self.url_list:
        #     html_str = self.get_html(i)
        #     if html_str is not None:
        #         self.get_content_list(html_str)
        #     else:
        #         self.url_list.append(i)
        thread_list = []
        for i in range(3):
            # pass the method itself via target=, with no () after get_html;
            # adding () would call it immediately (same for the threads below)
            t_get_html = threading.Thread(target=self.get_html)
            thread_list.append(t_get_html)
        for i in range(2):
            t_content_list = threading.Thread(target=self.get_content_list)
            thread_list.append(t_content_list)

        t_save_data = threading.Thread(target=self.save_data)
        thread_list.append(t_save_data)

        for t in thread_list:
            t.daemon = True  # daemon threads die with the main thread (setDaemon() is deprecated)
            t.start()

        for q in [self.url_queue, self.html_str_queue, self.content_list_queue]:
            q.join()

if __name__ == '__main__':
    t1 = time.time()
    guoke = QiushiSpider()
    guoke.run()
    # format() substitutes its arguments into the preceding {} placeholders
    print("The program ran for {} seconds".format(time.time() - t1))
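The shutdown logic above deserves a closer look: every worker loops forever, so the program ends not when the threads return but when every queue drains. q.join() blocks until task_done() has been called once per put(), and because the workers are daemon threads they are killed automatically once the main thread falls off the end of run(). Here is a minimal sketch of that handshake (the names work_queue and worker are illustrative, not taken from the spider):

import threading
from queue import Queue

work_queue = Queue()
for n in range(5):
    work_queue.put(n)

def worker():
    while True:                      # loop forever, like the spider's workers
        n = work_queue.get()
        print("processed", n)
        work_queue.task_done()       # exactly one task_done() per get()

t = threading.Thread(target=worker)
t.daemon = True                      # killed when the main thread exits
t.start()

work_queue.join()                    # unblocks once all 5 items are marked done
print("all items processed; the main thread exits and the daemon dies with it")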

Next, the multiprocessing version. multiprocessing.JoinableQueue is imported under the name Queue so the rest of the code can stay almost identical: unlike queue.Queue it can be shared between processes, and unlike the plain multiprocessing.Queue it still provides the task_done()/join() handshake.

import requests
from lxml import etree
import time
from retrying import retry
import json
from multiprocessing import Process
from multiprocessing import JoinableQueue as Queue

class QiushiSpider:
    def __init__(self):
        '''state shared by all workers'''
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        self.url = "https://www.qiushibaike.com/hot/page/{}/"
        # self.url_list = [self.url.format(i) for i in range(1, 14)]
        self.url_list_queue = Queue()
        for i in range(1, 14):
            self.url_list_queue.put(self.url.format(i))
        self.html_str_queue = Queue()
        self.content_list_queue = Queue()

    @retry(stop_max_attempt_number=5)
    def _get_html(self, url):
        '''send the request and return the decoded page'''
        print(url)
        # time.sleep(5)
        response = requests.get(url, headers=self.headers)
        # a failed assert raises AssertionError, which triggers a retry
        assert response.status_code == 200
        return response.content.decode()

    def get_html(self):
        while True:
            url = self.url_list_queue.get()
            try:
                html_str = self._get_html(url)
            except Exception:
                html_str = None

            self.html_str_queue.put(html_str)
            self.url_list_queue.task_done()

    def xpath_cl(self):
        '''parse the page and extract the fields we need'''
        while True:
            html_str = self.html_str_queue.get()
            if html_str is None:
                # the request failed even after retries; skip this page
                self.html_str_queue.task_done()
                continue
            elem = etree.HTML(html_str)
            div_list = elem.xpath('//div[contains(@id,"qiushi_tag")]')
            content_list = []
            for div in div_list:
                # one dict per post
                item = {}
                # avatar
                item["head_img"] = div.xpath('./div[@class="author clearfix"]/a[1]/img/@src|./div[@class="author clearfix"]/span[1]/img/@src')
                item["head_img"] = "https:" + item["head_img"][0]
                # username
                item["name"] = div.xpath('./div[@class="author clearfix"]/a[2]/h2/text()|./div[@class="author clearfix"]/span[2]/h2/text()')[0]
                # gender
                item["sex"] = div.xpath('.//div[contains(@class,"articleGender")]/@class')
                item["sex"] = item["sex"][0].split(" ")[-1].replace("Icon", "") if len(item["sex"]) > 0 else None
                # age
                item["age"] = div.xpath('.//div[contains(@class,"articleGender")]/text()')
                item["age"] = item["age"][0] if len(item["age"]) > 0 else None
                # post text
                item["content"] = div.xpath('.//div[@class="content"]/span/text()')
                item["content"] = [i.replace("\n", "") for i in item["content"]]
                item["content"] = "".join(item["content"])
                # link to the post
                item["url"] = div.xpath('./a[1]/@href')
                item["url"] = "https://www.qiushibaike.com" + item["url"][0]
                # attached image, if any
                item["img"] = div.xpath('.//div[@class="thumb"]/a/img/@src')
                item["img"] = "http:" + item["img"][0] if len(item["img"]) > 0 else None
                # "funny" vote count
                item["vote"] = div.xpath('.//span[@class="stats-vote"]/i/text()')[0]
                # comment count
                item["comments"] = div.xpath('.//span[@class="stats-comments"]/a/i/text()')[0]
                # author of the top comment
                item["cmt_name"] = div.xpath('.//div[@class="cmtMain"]/span[2]/text()')
                item["cmt_name"] = item["cmt_name"][0].replace(":", "") if len(item["cmt_name"]) > 0 else None
                # text of the top comment
                item["cmt_text"] = div.xpath('.//div[@class="cmtMain"]/div/text()')
                item["cmt_text"] = item["cmt_text"][0] if len(item["cmt_text"]) > 0 else None
                # likes on the top comment
                item["likenum"] = div.xpath('.//div[@class="cmtMain"]//div[@class="likenum"]/text()')
                item["likenum"] = item["likenum"][-1].replace("\n", "") if len(item["likenum"]) > 0 else None
                content_list.append(item)

            self.content_list_queue.put(content_list)
            self.html_str_queue.task_done()


    def save_data(self):
        '''write the extracted posts to disk'''
        while True:
            content_list = self.content_list_queue.get()
            for content in content_list:
                with open("qiushinew.txt", "a", encoding="utf-8") as f:
                    f.write(json.dumps(content, ensure_ascii=False, indent=2))
                    f.write("\n")  # newline between records so the file stays readable

            self.content_list_queue.task_done()

    def run(self):
        # single-process version, kept for comparison:
        # for url in self.url_list:
        #     # 1. send the request for each url
        #     html_str = self.get_html(url)
        #     # 2. take the response body
        #     # 3. extract the data with xpath
        #     if html_str is not None:
        #         content_list = self.xpath_cl(html_str)
        #         # 4. save the data
        #         self.save_data(content_list)
        #     else:
        #         self.url_list.append(url)
        process_list = []
        for i in range(10):
            p_get_html = Process(target=self.get_html)
            process_list.append(p_get_html)
        for i in range(6):
            p_xpath_cl = Process(target=self.xpath_cl)
            process_list.append(p_xpath_cl)

        p_save_data = Process(target=self.save_data)
        process_list.append(p_save_data)

        for p in process_list:
            p.daemon = True  # daemon processes are terminated when the main process exits
            p.start()

        for q in [self.url_list_queue, self.html_str_queue, self.content_list_queue]:
            q.join()

if __name__ == '__main__':
    t1 = time.time()
    qiushi = QiushiSpider()
    qiushi.run()
    print("Crawling took {} seconds".format(time.time() - t1))
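Both versions lean on the retrying library for fault tolerance: the assert inside _get_html raises AssertionError on any non-200 response, @retry catches the exception and re-runs the method up to stop_max_attempt_number times, and only if every attempt fails does the exception escape to get_html, which turns it into None. A minimal sketch of that behavior (flaky() is a stand-in function of mine, not part of the spider):

import random
from retrying import retry

@retry(stop_max_attempt_number=5)
def flaky():
    # fails about 2 times in 3; @retry re-runs it on each exception
    assert random.random() < 0.33, "simulated bad status code"
    return "ok"

print(flaky())  # usually prints "ok"; AssertionError propagates only if all 5 attempts fail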
