Crawling the Douban Top 250 with Python Multiprocessing

The crawler splits the work across three processes that share a multiprocessing.Queue: process 1 builds the page URLs and puts them into the queue, process 2 takes URLs off the queue, parses each page, and appends the movie info to a TXT file, and process 3 monitors the other two.

import requests
from bs4 import BeautifulSoup
import multiprocessing
import time

# Process 1: build the page URLs and put them into the queue
class geturl(multiprocessing.Process):
    def __init__(self, urlqueue, count, url):
        multiprocessing.Process.__init__(self)
        self.urlqueue = urlqueue
        self.url = url
        self.count = count

    def run(self):
        # The Top 250 spans ten pages: start=0, 25, ..., 225
        while 0 <= self.count <= 225:
            page_url = self.url + '?start=' + str(self.count) + '&filter='
            self.urlqueue.put(page_url)
            self.count += 25
        # Sentinel telling the consumer that no more URLs are coming
        self.urlqueue.put(None)
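For reference, the producer enumerates exactly ten page URLs, since the Top 250 shows 25 movies per page. A standalone sketch of the same pagination:

base = 'https://movie.douban.com/top250'
for start in range(0, 250, 25):
    # Prints https://movie.douban.com/top250?start=0&filter=
    # through https://movie.douban.com/top250?start=225&filter=
    print(base + '?start=' + str(start) + '&filter=')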

# Process 2: fetch each page, parse the movie info, and append it to a TXT file
class getcontent(multiprocessing.Process):
    def __init__(self, urlqueue):
        multiprocessing.Process.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        header = {'Referer': 'https://www.douban.com/',
                  'User-Agent':
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        while True:
            url = self.urlqueue.get()
            if url is None:  # sentinel from the producer: all pages queued
                break
            res = requests.get(url, headers=header)
            soup = BeautifulSoup(res.text, 'html.parser')
            for contents in soup.select('.info'):
                # Default to empty strings so a missing field (e.g. a movie
                # without a quote) neither raises a NameError nor reuses the
                # previous movie's value
                titles = name = addrs = message = ''
                if contents.select('.hd'):
                    titles = ''.join(contents.select('.hd')[0].text.split())
                if contents.select('.bd p'):
                    peoples = contents.select('.bd p')[0]
                    name = peoples.contents[0].strip()   # director / cast line
                    addrs = peoples.contents[2].strip()  # year / country / genre line
                score = contents.select('.bd .star .rating_num')[0].text
                numbers = contents.select('.bd .star span')[3].text  # number of ratings
                if contents.select('.bd .quote .inq'):
                    message = contents.select('.bd .quote .inq')[0].text

                content = [titles, name, addrs, score, numbers, message]

                with open('C:\\Users\\dell\\Desktop\\douban.txt', 'a', encoding='utf-8') as file:
                    for each in content:
                        file.write(each)
                        file.write('\n')
                    # Blank lines separate one movie's record from the next
                    file.write('\n')
                    file.write('\n')

            time.sleep(1)  # be polite: pause between pages
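If the selectors ever stop matching (Douban adjusts its markup from time to time), a minimal single-page sketch like this one is handy for debugging them outside the multiprocessing pipeline; it reuses the same User-Agent as above and only fetches the first page:

import requests
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
res = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=header)
soup = BeautifulSoup(res.text, 'html.parser')
first = soup.select('.info')[0]
print(''.join(first.select('.hd')[0].text.split()))   # title of the top-ranked movie
print(first.select('.bd .star .rating_num')[0].text)  # its score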


# Process 3: monitor processes 1 and 2 via the shared queue
class contrl(multiprocessing.Process):
    def __init__(self, urlqueue):
        multiprocessing.Process.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("Crawler running...")
            time.sleep(60)
            if self.urlqueue.empty():
                print("Crawl finished!")
                break

if __name__ == '__main__':
    url = 'https://movie.douban.com/top250'
    count = 0
    urlqueue = multiprocessing.Queue()

    t1 = geturl(urlqueue, count, url)
    t1.start()

    t2 = getcontent(urlqueue)
    t2.start()

    t3 = contrl(urlqueue)
    t3.start()

    # Wait for all three processes before the main process exits
    t1.join()
    t2.join()
    t3.join()
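When the run completes, each movie ends up in douban.txt as six lines (title, crew, year/country/genre, score, rating count, quote), with blank lines separating it from the next record.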

Note: you cannot use the standard-library queue.Queue here; you have to use the Queue from multiprocessing, or the program raises an error. A queue.Queue lives in a single process's memory (its internal locks cannot even be pickled to hand to a child process), while multiprocessing.Queue is backed by an OS pipe and is safe to share between processes.
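A minimal sketch illustrating the point (the producer function here is just for demonstration):

import multiprocessing

def producer(q):
    q.put('hello from the child process')

if __name__ == '__main__':
    q = multiprocessing.Queue()  # swapping in queue.Queue here fails across processes
    p = multiprocessing.Process(target=producer, args=(q,))
    p.start()
    print(q.get())  # prints: hello from the child process
    p.join()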
