Python concurrency and parallelism

  • Parallelism means two or more events happen at exactly the same instant.

  • Concurrency means two or more events happen within the same time interval.

  1. Parallelism involves multiple CPUs (cores); concurrency can take place on a single CPU.
  2. The goal of parallelism is to make full use of every core of the processor and reach the highest throughput.

Thread example code:

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 09:31                
# +--------------------------
# Multithreaded programming
# 1. Instantiate Thread directly
# 2. Subclass the Thread class

import time
from threading import Thread


def sleep_task1():
    print("sleep 2 second start!")
    time.sleep(2)
    print("sleep2 seconds end!")


def sleep_task2():
    print("sleep 3 second start!")
    time.sleep(3)
    print("sleep3 seconds end!")


if __name__ == "__main__":
    t1 = Thread(target=sleep_task1)
    t1.start()

    t2 = Thread(target=sleep_task2)
    t2.start()

Improved version:

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 09:31                
# +--------------------------
# Multithreaded programming
# 1. Instantiate Thread directly
# 2. Subclass the Thread class

import time
from threading import Thread


def sleep_task(sleep_time):
    print("sleep {} second start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))


if __name__ == "__main__":
    t1 = Thread(target=sleep_task, args=(2,))
    t1.start()

    t2 = Thread(target=sleep_task, args=(3,))
    t2.start()

Further improvement: add timing statistics.

join() makes the code wait for the child thread above to finish before the lines after the join call are executed.

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 09:31                
# +--------------------------
# Multithreaded programming
# 1. Instantiate Thread directly
# 2. Subclass the Thread class

import time
from threading import Thread


def sleep_task(sleep_time):
    print("sleep {} second start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))


if __name__ == "__main__":
    start_time = time.time()
    t1 = Thread(target=sleep_task, args=(2,))
    t1.start()

    t2 = Thread(target=sleep_task, args=(3,))
    t2.start()

    # with join(), the main thread waits until t1 and t2 have finished before continuing
    t1.join()
    t2.join()
    end_time = time.time()
    print("last_time:{}".format(end_time - start_time))

    # 1. when a program starts, a main thread is created by default
    # 2. use join() if the main thread should continue only after the other threads have finished

New requirement: the main thread gives t1 and t2 one second to finish; otherwise they are shut down, using setDaemon(True).

    t1.setDaemon(True)  # daemon thread: when the main thread exits, the child thread exits too (t1.daemon = True is equivalent)
 

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 09:31                
# +--------------------------
# Multithreaded programming
# 1. Instantiate Thread directly
# 2. Subclass the Thread class

import time
from threading import Thread


def sleep_task(sleep_time):
    print("sleep {} second start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))


if __name__ == "__main__":
    start_time = time.time()
    t1 = Thread(target=sleep_task, args=(2,))
    t1.setDaemon(True)  # daemon thread: when the main thread exits, the child thread exits too
    t1.start()

    t2 = Thread(target=sleep_task, args=(3,))
    t2.setDaemon(True)  # daemon thread: when the main thread exits, the child thread exits too
    t2.start()

    # with join(), the main thread would wait until t1 and t2 finished before continuing
    # t1.join()
    # t2.join()
    time.sleep(1)

    # new requirement: give t1 and t2 one second to finish; as daemon threads they are killed when the main thread exits


    end_time = time.time()
    print("last_time:{}".format(end_time - start_time))

    # 1. when a program starts, a main thread is created by default
    # 2. use join() if the main thread should continue only after the other threads have finished

With the daemon flag set, both child threads are stopped after one second, as soon as the main thread exits.


Creating threads by subclassing the Thread class; join() and setDaemon() work the same way here (see the short sketch after the class below).

# thread implemented by subclassing Thread
class SleepThread(Thread):
    def __init__(self, sleep_time):
        self.sleep_time = sleep_time
        super().__init__()

    def run(self):
        print("sleep {} seconds start!".format(self.sleep_time))
        time.sleep(self.sleep_time)
        print("sleep {} seconds end!".format(self.sleep_time))


if __name__ == "__main__":
    t1 = SleepThread(2)
    t2 = SleepThread(3)
    t1.start()
    t2.start()
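
As noted above, join() and the daemon flag work the same way with the Thread subclass; a minimal sketch (the timing code is added here only for illustration):

if __name__ == "__main__":
    start_time = time.time()
    t1 = SleepThread(2)
    t2 = SleepThread(3)
    t1.start()
    t2.start()
    t1.join()  # wait for both threads before measuring the elapsed time
    t2.join()
    print("last_time:{}".format(time.time() - start_time))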

What is the GIL?

  • GIL stands for Global Interpreter Lock. It goes back to decisions made early in Python's design for the sake of data safety: within one interpreter process only one thread can execute Python bytecode at a time, so multithreading on a single-core CPU is only concurrency, never parallelism.
  • Before a thread can run it must first acquire the GIL; think of the GIL as a "pass", and there is only one per Python process. A thread that does not hold the pass is not allowed onto the CPU. The GIL is released in two situations (see the sketch below):
  1. on a time slice - it is released after a fixed interval
  2. when the thread performs I/O
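
A minimal sketch of both points: sys.getswitchinterval() shows the time-slice interval, and even with the GIL an expression like total += 1 is not atomic, because the thread switch can happen between its bytecode steps (this is exactly the problem the lock in section 8-4 below solves). The printed result is illustrative and changes from run to run:

import sys
from threading import Thread

print(sys.getswitchinterval())  # default time slice between thread switches, typically 0.005 seconds

total = 0


def add():
    global total
    for i in range(1000000):
        total += 1  # read-modify-write: the switch can happen between these steps


def desc():
    global total
    for i in range(1000000):
        total -= 1


t1 = Thread(target=add)
t2 = Thread(target=desc)
t1.start()
t2.start()
t1.join()
t2.join()
print(total)  # rarely 0 without a lock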


8-4 Thread locks

# one lock: after a thread acquires it, the next thread can only run once the lock has been released; until then it cannot acquire it
 

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 13:51                
# +--------------------------

from threading import Thread
# thread synchronization
from threading import Lock

total = 0
total_lock = Lock()  # thread lock


# one lock: after a thread acquires it, the next thread can only run once the lock has been released


def add():
    total_lock.acquire()  # acquire the lock
    global total
    for i in range(1000000):
        total += 1
    total_lock.release()  # release the lock


def desc():
    total_lock.acquire()  # acquire the lock
    global total
    for i in range(1000000):
        total -= 1
    total_lock.release()  # release the lock


if __name__ == '__main__':
    add_thread = Thread(target=add)
    desc_thread = Thread(target=desc)

    add_thread.start()
    desc_thread.start()

    add_thread.join()
    desc_thread.join()

    print(total)
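
Lock also works as a context manager, so the acquire/release pair can be written with a with statement that releases the lock even if an exception occurs inside the block; a small variation on add() above:

def add():
    global total
    with total_lock:  # acquires the lock; it is released automatically when the block exits
        for i in range(1000000):
            total += 1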

8-6 Refactoring the CSDN spider with multithreading - not ideal; the thread message-queue version below improves on it

Threads share global variables, which is what this multithreaded version relies on. Code:

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 14:07                
# +--------------------------
from threading import Thread
import re
import ast  # used to parse a string into a Python list
import requests
from scrapy import Selector
from datetime import datetime  # used to convert strings into datetime objects
from urllib import parse
import time

from csdn_spider.models import *

topic_list_urls = []
topic_list = []
author_list = []
# the CSDN domain to crawl
domain = 'https://bbs.csdn.net/'


# fetch the JS file, then extract the list data with a regex
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace('null', 'None')  # replace the JS null with Python's None
        nodes_list = ast.literal_eval(nodes_str)  # parse the string into a Python list
        return nodes_list
    return []


# list of collected urls
url_list = []


# process the nodes: extract every url into url_list
def process_nodes_list(nodes_list):
    # walk the json-like structure and collect the urls into the list
    for item in nodes_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if 'children' in item:
                process_nodes_list(item['children'])


# collect the level-1 urls; they are removed later because each one is only an aggregation of its children
def get_level1_list(nodes_list):
    level1_url = []
    for item in nodes_list:
        if 'url' in item and item['url']:
            level1_url.append(item['url'])

    return level1_url


# build the final list of urls that need to be crawled
def get_last_urls():
    # get the nodes data
    nodes_list = get_nodes_json()
    # process the nodes data to get the list of urls
    process_nodes_list(nodes_list)
    # get the level-1 urls
    level1_url = get_level1_list(nodes_list)
    # urls that remain after filtering
    last_urls = []
    # keep a url if it is in url_list but not a level-1 url
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    # the default path lists unsolved topics, /recommend is featured posts and /closed is solved ones; we crawl all three, so build each variant
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        all_urls.append(parse.urljoin(domain, url + '/closed'))

    return all_urls


class ParseTopicAuthorThread(Thread):
    pass


class ParseTopicDetailThread(Thread):
    def run(self):
        while 1:
            try:
                url = topic_list.pop()
            except IndexError as e:
                time.sleep(1)
                continue
            print('start fetching topic: {} '.format(url))

            # fetch the topic detail and its replies
            topic_id = url.split('/')[-1]
            res_text = requests.get(url).text
            sel = Selector(text=res_text)
            all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")  # xpath starts-with(): match ids that begin with "post-"
            topic_item = all_divs[0]
            content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # content
            praised_nums = topic_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # number of likes
            jtl = 0
            if topic_item.xpath(".//div[@class='close_topic']/text()").extract():
                jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]  # topic close rate
                jtl_match = re.search("(\d+\.?\d+)%", jtl_str)
                if jtl_match:
                    jtl = jtl_match.group(1)

            existed_topics = Topic.select().where(Topic.id == topic_id)
            # update the topic with the few fields that the list page does not provide
            if existed_topics:
                topic = existed_topics[0]
                topic.content = content
                topic.jtl = jtl
                topic.praised_nums = praised_nums
                topic.save()

            for answer_item in all_divs[1:]:
                answer = Answer()
                answer.topic_id = topic_id  # id of this topic

                author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
                answer.author = author_info.split('/')[-1]  # id of the reply's author

                create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
                create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
                answer.create_time = create_time  # reply time

                content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # reply content
                answer.content = content

                praised_nums = answer_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # number of likes
                answer.praised_nums = int(praised_nums)

                answer.save()

            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next-page url so it gets parsed as well
                topic_list.append(next_url)


class ParseTopicListThread(Thread):
    def run(self):
        while 1:
            try:
                url = topic_list_urls.pop()
            except IndexError as e:
                time.sleep(1)
                continue
            print('start fetching topic list page: {} '.format(url))

            res_text = requests.get(url).text  # fetch the page content
            sel = Selector(text=res_text)  # build the Selector object
            # the information we need sits in the tr rows under the table
            all_trs = sel.xpath("//table[@class='forums_tab_table']/tbody//tr")
            # all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  # this version also works
            for tr in all_trs:
                topic = Topic()

                if tr.xpath(".//td[1]/span/text()").extract():
                    status = tr.xpath(".//td[1]/span/text()").extract()[0]  # status: "未结" (unsolved), "已结" (solved), "满意" (satisfied)
                    topic.status = status

                if tr.xpath(".//td[2]/em/text()").extract():
                    score = tr.xpath(".//td[2]/em/text()").extract()[0]  # bounty points
                    topic.score = int(score)

                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract():
                    topic_url = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract()[0]  # title link (relative url)
                    topic.id = int(topic_url.split('/')[-1])  # topic id
                    topic_url = parse.urljoin(domain, topic_url)  # convert the relative url to an absolute one for later crawling

                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract():
                    topic_title = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract()[0]  # title text
                    topic.title = topic_title

                if tr.xpath(".//td[4]/a/text()").extract():
                    author_url = tr.xpath(".//td[4]/a/@href").extract()[0]  # author link (relative url)
                    author_id = author_url.split('/')[-1]  # author id
                    author_url = parse.urljoin(domain, author_url)  # author link converted to an absolute url
                    topic.author = author_id
                    # parse the author's profile page
                    # parse_author(author_url)

                if tr.xpath(".//td[4]/em/text()").extract():
                    create_time = tr.xpath(".//td[4]/em/text()").extract()[0]  # creation time as a string
                    create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')  # convert the creation time to datetime
                    topic.create_time = create_time

                if tr.xpath(".//td[5]/span/text()").extract():
                    answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]  # "replies/views" counts
                    answer_nums = answer_info.split('/')[0]  # number of replies
                    click_nums = answer_info.split('/')[1]  # number of views
                    topic.click_nums = int(click_nums)
                    topic.answer_nums = int(answer_nums)

                if tr.xpath(".//td[6]/em/text()").extract():
                    last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]  # last reply time
                    last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')  # convert the string to datetime
                    topic.last_answer_time = last_time

                try:
                    existed_topics = Topic.select().where(Topic.id == topic.id)
                    if existed_topics:
                        topic.save()
                    else:
                        topic.save(force_insert=True)
                except Exception as e:
                    pass

                # parse the topic detail page
                # parse_topic(topic_url)
                topic_list.append(topic_url)

            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            # if a next page exists, take its link and store it in next_url

            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next-page url so it gets parsed as well
                topic_list_urls.append(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        topic_list_urls.append(url)

    topic_list_thread = ParseTopicListThread()
    topic_detail_thread = ParseTopicDetailThread()

    topic_list_thread.start()
    topic_detail_thread.start()

8-7 Refactoring the CSDN spider with multithreading and Queue (a thread-safe message queue)

from queue import Queue

With a message queue there is no need for shared global variables.

1. Adding data to the queue: put()

  1. put() adds an item to the queue
  2. With Queue(maxsize=2) the queue holds at most 2 items; putting a third one blocks, and the code after it never runs
  3. If that third put() is blocked, passing timeout=3 makes it give up after 3 seconds (raising queue.Full)
  4. Wrapping the call in try/except queue.Full is the cleaner way to handle this
from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)  # holds at most 2 items
    # put data into the queue; any object can be stored
    message_queue.put("bobby")
    message_queue.put("bobby2")
    print("start put bobby3")
    try:
        message_queue.put("bobby3", timeout=3)  # put() blocks; with timeout=3 it raises queue.Full after 3 seconds
    except queue.Full as e:
        pass
    print("end")

Without a timeout, put() blocks forever. There is also put_nowait(), which returns immediately and raises queue.Full instead of waiting.
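
A minimal sketch of put_nowait() on a full queue:

from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=1)
    message_queue.put("bobby")
    try:
        message_queue.put_nowait("bobby2")  # the queue is already full, so queue.Full is raised immediately
    except queue.Full:
        print("queue is full")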

2. Getting data from the queue: get() and get_nowait()

Called without arguments, get() takes one item from the head of the queue and blocks if the queue is empty.

from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)  # holds at most 2 items
    # put data into the queue; any object can be stored
    message_queue.put("bobby")
    message_queue.put("bobby2")
    message = message_queue.get()
    print(message)  # prints bobby

    message = message_queue.get()
    print(message)  # prints bobby2
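
Like put(), get() blocks on an empty queue; get(timeout=...) raises queue.Empty once the timeout expires, and get_nowait() raises it immediately. A minimal sketch:

from queue import Queue
import queue

if __name__ == "__main__":
    message_queue = Queue(maxsize=2)
    try:
        message_queue.get(timeout=1)  # the queue is empty: block for 1 second, then raise queue.Empty
    except queue.Empty:
        print("queue is empty")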

The CSDN spider improved with Queue:

# +--------------------------
# | User: zq                -
# | Version: python3.7      -
# | Time: 2020-03-12 14:07
# +--------------------------
from threading import Thread
import re
import ast  # used to parse a string into a Python list
import requests
from scrapy import Selector
from datetime import datetime  # used to convert strings into datetime objects
from urllib import parse
import time
from queue import Queue

from csdn_spider.models import *

topic_list_queue = Queue()
topic_queue = Queue()
author_queue = Queue()
# the CSDN domain to crawl
domain = 'https://bbs.csdn.net/'


# fetch the JS file, then extract the list data with a regex
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace('null', 'None')  # replace the JS null with Python's None
        nodes_list = ast.literal_eval(nodes_str)  # parse the string into a Python list
        return nodes_list
    return []


# list of collected urls
url_list = []


# process the nodes: extract every url into url_list
def process_nodes_list(nodes_list):
    # walk the json-like structure and collect the urls into the list
    for item in nodes_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if 'children' in item:
                process_nodes_list(item['children'])


# collect the level-1 urls; they are removed later because each one is only an aggregation of its children
def get_level1_list(nodes_list):
    level1_url = []
    for item in nodes_list:
        if 'url' in item and item['url']:
            level1_url.append(item['url'])

    return level1_url


# build the final list of urls that need to be crawled
def get_last_urls():
    # get the nodes data
    nodes_list = get_nodes_json()
    # process the nodes data to get the list of urls
    process_nodes_list(nodes_list)
    # get the level-1 urls
    level1_url = get_level1_list(nodes_list)
    # urls that remain after filtering
    last_urls = []
    # keep a url if it is in url_list but not a level-1 url
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    # the default path lists unsolved topics, /recommend is featured posts and /closed is solved ones; we crawl all three, so build each variant
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        all_urls.append(parse.urljoin(domain, url + '/closed'))

    return all_urls


class ParseTopicAuthorThread(Thread):
    pass


class ParseTopicDetailThread(Thread):
    def run(self):
        while 1:
            url = topic_queue.get()  # get() blocks when the queue is empty, which is exactly what we need here
            print('start fetching topic: {} '.format(url))

            # fetch the topic detail and its replies
            topic_id = url.split('/')[-1]
            res_text = requests.get(url).text
            sel = Selector(text=res_text)
            all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")  # xpath starts-with(): match ids that begin with "post-"
            topic_item = all_divs[0]
            content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # content
            praised_nums = topic_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # number of likes
            jtl = 0
            if topic_item.xpath(".//div[@class='close_topic']/text()").extract():
                jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]  # topic close rate
                jtl_match = re.search("(\d+\.?\d+)%", jtl_str)
                if jtl_match:
                    jtl = jtl_match.group(1)

            existed_topics = Topic.select().where(Topic.id == topic_id)
            # update the topic with the few fields that the list page does not provide
            if existed_topics:
                topic = existed_topics[0]
                topic.content = content
                topic.jtl = jtl
                topic.praised_nums = praised_nums
                topic.save()

            for answer_item in all_divs[1:]:
                answer = Answer()
                answer.topic_id = topic_id  # id of this topic

                author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
                answer.author = author_info.split('/')[-1]  # id of the reply's author

                create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
                create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
                answer.create_time = create_time  # reply time

                content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # reply content
                answer.content = content

                praised_nums = answer_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # number of likes
                answer.praised_nums = int(praised_nums)

                answer.save()

            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next-page url so it gets parsed as well
                topic_queue.put(next_url)


class ParseTopicListThread(Thread):
    def run(self):
        while 1:
            url = topic_list_queue.get()
            print('start fetching topic list page: {} '.format(url))

            res_text = requests.get(url).text  # fetch the page content
            sel = Selector(text=res_text)  # build the Selector object
            # the information we need sits in the tr rows under the table
            all_trs = sel.xpath("//table[@class='forums_tab_table']/tbody//tr")
            # all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  # this version also works
            for tr in all_trs:
                topic = Topic()

                if tr.xpath(".//td[1]/span/text()").extract():
                    status = tr.xpath(".//td[1]/span/text()").extract()[0]  # status: "未结" (unsolved), "已结" (solved), "满意" (satisfied)
                    topic.status = status

                if tr.xpath(".//td[2]/em/text()").extract():
                    score = tr.xpath(".//td[2]/em/text()").extract()[0]  # bounty points
                    topic.score = int(score)

                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract():
                    topic_url = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract()[0]  # title link (relative url)
                    topic.id = int(topic_url.split('/')[-1])  # topic id
                    topic_url = parse.urljoin(domain, topic_url)  # convert the relative url to an absolute one for later crawling

                if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract():
                    topic_title = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract()[0]  # title text
                    topic.title = topic_title

                if tr.xpath(".//td[4]/a/text()").extract():
                    author_url = tr.xpath(".//td[4]/a/@href").extract()[0]  # author link (relative url)
                    author_id = author_url.split('/')[-1]  # author id
                    author_url = parse.urljoin(domain, author_url)  # author link converted to an absolute url
                    topic.author = author_id
                    # parse the author's profile page
                    # parse_author(author_url)

                if tr.xpath(".//td[4]/em/text()").extract():
                    create_time = tr.xpath(".//td[4]/em/text()").extract()[0]  # creation time as a string
                    create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')  # convert the creation time to datetime
                    topic.create_time = create_time

                if tr.xpath(".//td[5]/span/text()").extract():
                    answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]  # "replies/views" counts
                    answer_nums = answer_info.split('/')[0]  # number of replies
                    click_nums = answer_info.split('/')[1]  # number of views
                    topic.click_nums = int(click_nums)
                    topic.answer_nums = int(answer_nums)

                if tr.xpath(".//td[6]/em/text()").extract():
                    last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]  # last reply time
                    last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')  # convert the string to datetime
                    topic.last_answer_time = last_time

                try:
                    existed_topics = Topic.select().where(Topic.id == topic.id)
                    if existed_topics:
                        topic.save()
                    else:
                        topic.save(force_insert=True)
                except Exception as e:
                    pass

                # parse the topic detail page
                # parse_topic(topic_url)
                topic_queue.put(topic_url)

            # parse the next page
            next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
            # if a next page exists, take its link and store it in next_url

            if next_page:
                next_url = parse.urljoin(domain, next_page[0])
                # queue the next-page url so it gets parsed as well
                topic_list_queue.put(next_url)


if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        topic_list_queue.put(url)

    topic_list_thread = ParseTopicListThread()
    topic_detail_thread = ParseTopicDetailThread()

    topic_list_thread.start()
    topic_detail_thread.start()
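
One possible refinement, not used in the code above: Queue also provides task_done() and join(), which let a producer wait until every queued item has been processed. A minimal, self-contained sketch of the pattern:

from queue import Queue
from threading import Thread

q = Queue()


def worker():
    while True:
        item = q.get()
        print("processing", item)
        q.task_done()  # mark this item as finished


if __name__ == "__main__":
    t = Thread(target=worker)
    t.setDaemon(True)  # let the worker die when the main thread exits
    t.start()
    for i in range(3):
        q.put(i)
    q.join()  # block until every item put into the queue has been marked done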

Thread pools
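
A minimal sketch of the standard-library thread pool, concurrent.futures.ThreadPoolExecutor, reusing the sleep_task function from earlier (max_workers=2 is an illustrative choice):

import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def sleep_task(sleep_time):
    print("sleep {} seconds start!".format(sleep_time))
    time.sleep(sleep_time)
    print("sleep {} seconds end!".format(sleep_time))
    return sleep_time


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=2) as executor:
        # submit() schedules the function and returns a Future right away
        tasks = [executor.submit(sleep_task, t) for t in (2, 3)]
        for future in as_completed(tasks):  # yields each Future as it completes
            print("slept {} seconds".format(future.result()))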
