A scraper for http://top.chinaz.com/all/index.html built with nothing more than the basic concepts of threads, queues, and locks.
For the details, just run the code below.
import requests
from lxml import etree
from threading import Thread, Lock
from queue import Queue, Empty
import re
import csv
import time
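# Overall flow: Productor threads pull (url, retry_count) tasks from q,
# parse each page, and push the extracted records onto the work queue;
# Worker threads drain the work queue and append rows to the CSV under a lock.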
class Productor(Thread):
    def __init__(self, q, w):
        Thread.__init__(self)
        self.q = q
        self.w = w
        self.s = requests.Session()
        # Important: without closing connections after each request, later
        # requests start to fail. (requests has no Session.keep_alive switch,
        # so send a Connection: close header instead.)
        self.s.headers['Connection'] = 'close'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 5.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2634.33 Safari/537.36',
        }

    def run(self):
        while True:
            # get_nowait() avoids the race between checking empty() and get()
            try:
                key = self.q.get_nowait()
            except Empty:
                break
            url = key[0]
            if key[1] < 3:
                try:
                    req = self.s.get(url, headers=self.headers, timeout=2)
                    time.sleep(0.1)
                except Exception as e:
                    self.q.put((url, key[1] + 1))
                    print(url, e)
                else:
                    # the site itself is encoded as utf-8
                    req.encoding = 'utf-8'
                    self.get_info(req.text)
            else:
                # Request each link at most 3 times; after 3 failures give up
                # and print the link. Without this cap, a link that keeps
                # timing out would loop forever.
                print(key)
        print('{} thread finished'.format(self.name))
    def get_info(self, html):
        tree = etree.HTML(html)
        sites = tree.xpath('//ul[@class="listCentent"]/li')
        for site in sites:
            info = dict()
            name = site.xpath('.//h3[@class="rightTxtHead"]/a/text()')
            # one site has no name, so check whether the lookup came back empty
            name = name[0] if name else ''
            url = site.xpath('.//h3[@class="rightTxtHead"]/span/text()')[0]
            alexa = site.xpath('.//p[@class="RtCData"][1]/a/text()')[0]
            baidu_s = site.xpath('.//p[@class="RtCData"][2]/a/img/@src')[0]
            baidu_pr = site.xpath('.//p[@class="RtCData"][3]/a/img/@src')[0]
            baidu_fl = site.xpath('.//p[@class="RtCData"][4]/a/text()')[0]
            rank = site.xpath('.//div[@class="RtCRateCent"]/strong/text()')[0]
            score = site.xpath('.//div[@class="RtCRateCent"]/span/text()')[0]
            # some names contain characters gbk cannot encode, so filter them out
            info['name'] = name.encode('gbk', 'ignore').decode('gbk')
            info['url'] = url
            info['alexa'] = alexa
            info['baidu_s'] = self.get_s(baidu_s)
            info['baidu_pr'] = self.get_s(baidu_pr)
            info['baidu_fl'] = baidu_fl
            info['rank'] = rank
            # strip the '得分:' ("score:") prefix from the scraped text
            info['score'] = score.replace('得分:', '')
            self.w.put(info)

    def get_s(self, url):
        '''Extract the weight value from the Baidu weight / PR badge image URL.'''
        s = re.findall(r'(\d+)\.gif', url)[0]
        return s
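# For example, a badge image URL ending in '.../5.gif' (hypothetical path)
# makes get_s() return '5'.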
class Worker(Thread):
    def __init__(self, w, file, l):
        Thread.__init__(self)
        self.w = w
        self.file = file
        self.l = l

    def run(self):
        while True:
            info = self.w.get()
            if info is None:
                break
            try:
                # the CSV file is a shared resource, so guard writes with the
                # lock; the with-statement releases it even if writeinfo() raises
                with self.l:
                    self.writeinfo(info)
            except Exception as e:
                print(info, e)
            self.w.task_done()
        print('{} worker is done!'.format(self.name))

    def writeinfo(self, data):
        headers = ['name', 'url', 'alexa', 'baidu_s', 'baidu_pr', 'baidu_fl', 'rank', 'score']
        with open(self.file, 'a', newline='', encoding='gbk') as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writerow(data)
def get_csv(filename):
    '''Create a fresh csv file and write the header row.'''
    headers = ['name', 'url', 'alexa', 'baidu_s', 'baidu_pr', 'baidu_fl', 'rank', 'score']
    with open(filename, 'w', newline='', encoding='gbk') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
def main(file):
    get_csv(file)
    l = Lock()
    q = Queue()
    work = Queue()
    # each queue entry is a (url, retry_count) tuple starting at 0,
    # so we can tell later how many times a link has been requested
    q.put(('http://top.chinaz.com/all/index.html', 0))
    baseurl = 'http://top.chinaz.com/all/index_{}.html'
    for i in range(2, 1892):
        q.put((baseurl.format(i), 0))
    pl = [Productor(q, work) for i in range(10)]
    for each in pl:
        each.start()
    wl = [Worker(work, file, l) for i in range(5)]
    for each in wl:
        each.start()
    for each in pl:
        each.join()
    # block until every queued record has been written out
    work.join()
    # one None sentinel per worker tells it to exit
    for each in wl:
        work.put(None)
    for each in wl:
        each.join()
    print('game over!')

if __name__ == '__main__':
    main('info.csv')
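The shutdown sequence is the standard Queue handshake: every put() into work is matched by a task_done() in a Worker, work.join() returns only once all records are processed, and one None sentinel per worker then tells each one to exit. A minimal sketch of that pattern on its own (names here are illustrative, not taken from the scraper):

from threading import Thread
from queue import Queue

def consumer(q):
    while True:
        item = q.get()
        if item is None:  # sentinel: time to exit
            break
        print('handled', item)
        q.task_done()     # pairs with q.join() below

q = Queue()
threads = [Thread(target=consumer, args=(q,)) for _ in range(2)]
for t in threads:
    t.start()
for i in range(5):
    q.put(i)
q.join()                  # blocks until every item got a task_done()
for t in threads:
    q.put(None)           # one sentinel per consumer
for t in threads:
    t.join()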
After it runs, an info.csv file appears in the same folder; open it to browse the scraped data.
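To spot-check the result, the file can be read back with csv.DictReader (a minimal sketch; the column names match the headers written by get_csv):

import csv

with open('info.csv', encoding='gbk') as f:
    for row in csv.DictReader(f):
        print(row['rank'], row['name'], row['url'])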