Python crawler for 中国国际招标有限公司

Crawling bid announcements with Python and storing the results in Redis.
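
The listing below imports two project-local modules that the post doesn't show: config (connection settings) and items.sql (a small MySQL wrapper). Here is a minimal sketch of what they might contain so the script runs standalone; the values and the stub are assumptions, not the author's actual files.

# config.py -- hypothetical settings module; adjust to your environment
REDIS_IP = '127.0.0.1'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PASSWORD = None

# items/sql.py -- hypothetical stub matching how __init__ uses MySQL()
class MySQL:
    def connect(self):
        # the real project presumably opens a pymysql connection here
        pass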

# -*- coding: utf-8 -*-
# Crawler for 中国国际招标有限公司 (stores bid announcements in Redis)
import re
from datetime import datetime

import redis
import requests
from lxml import etree
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD
from items.sql import MySQL


class Cntcitc:
    def __init__(self):
        # decode_responses=True makes redis-py return str instead of bytes;
        # the deprecated charset/encoding kwargs are redundant with it and dropped here
        self.redis = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PASSWORD,
                                 decode_responses=True)
        self.db = MySQL()
        self.db.connect()  # MySQL handle is opened here but not used elsewhere in this script
        self.name = '中国国际招标有限公司'
        self.url = 'https://www.cntcitc.com.cn/searchPage.html'
        self.api_url = 'https://www.cntcitc.com.cn/search.html'
        self.today = datetime.today().strftime('%Y-%m-%d')
        # per-day counter of how many records were stored today
        self.counter_key = f"cntcitc:counter:{self.today}"
        # flips to True once the incremental crawl reaches the last link seen on a previous run
        self.overall_cycle = False
        self.headers = {
            "referer": self.url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }

    def get_data(self, key, page=1):
        payload = {
            'channelId': '-1',
            'key': key,
            'startTime': "2024-06-18",  # fixed lower bound for the search window
            'endTime': '',
            'currentPage': page
        }
        con = requests.post(url=self.api_url, headers=self.headers, data=payload, timeout=30).content.decode('utf8')
        html = etree.HTML(con)
        content_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/ul/text()'))
        content = content_text.strip()
        print(f"key: {key}, result text: {content}")
        # the site renders the literal string "未查询到相关内容" ("no matching results") for empty searches
        if content == "未查询到相关内容":
            return None
        return html

    # Get the total page count from text like "共x页" ("x pages in total")
    def get_page(self, key):
        html = self.get_data(key)
        if html is None:
            return None
        pageText = ''.join(html.xpath('/html/body/div/div/form/div[2]/div/span[2]/text()'))
        # tolerate whitespace around the digits, e.g. "共 12 页"
        match = re.search(r"共\s*(\d+)\s*页", pageText)
        return match.group(1) if match else None

    def spider(self, key):
        pages = self.get_page(key)
        if pages is not None:
            self.overall_cycle = False
            # Incremental crawl: stop once we hit the newest link stored by the previous run
            last_page_key = f"cntcitc:last_link:{key}"
            last_page_link = str(self.redis.get(last_page_key) or "")
            new_checkpoint = ""
            try:
                for page in range(1, int(pages) + 1):
                    if self.overall_cycle:
                        break
                    html = self.get_data(key, page)
                    if html is not None:
                        # each result page lists at most 15 items
                        for i in range(1, 16):
                            title = ''.join(
                                html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/text()')).strip()
                            if title == "":
                                break
                            suffix_link = ''.join(
                                html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/@href'))
                            link = f"https://www.cntcitc.com.cn/{suffix_link}"
                            # the first link seen this run is the newest; it becomes the next run's checkpoint
                            if new_checkpoint == "":
                                new_checkpoint = link
                            if last_page_link == link:
                                self.overall_cycle = True
                                break
                            publish_time_text = ''.join(
                                html.xpath(f'/html/body/div/div/form/div[2]/ul/li[{i}]/text()'))
                            # extract a YYYY-MM-DD publication date from the list item text
                            match = re.search(r'\d{4}-\d{2}-\d{2}', publish_time_text)
                            publish_time = match.group() if match else ""
                            self.store_to_redis(link, title, publish_time, key)
                # advance the checkpoint only after a clean run, so nothing is skipped on failure
                if new_checkpoint:
                    self.redis.set(last_page_key, new_checkpoint)
            except Exception as e:
                print(f"Spider for 中国国际招标有限公司 raised an exception: {e}")
                # clear the checkpoint so the next run recrawls from scratch
                self.redis.set(last_page_key, "")

    def store_to_redis(self, link, title, show_times, key):
        if self.redis.exists(link):
            # already stored: merge in the new search keyword and flag the record for re-sync
            existing_keys = (self.redis.hget(link, 'keys') or '').split(',')
            if key not in existing_keys:
                existing_keys.append(key)
                self.redis.hset(link, 'keys', ','.join(existing_keys))
                self.redis.hset(link, 'is_synced', 'false')
        else:
            self.redis.hset(link, mapping={
                'title': title,
                'show_times': show_times,
                'keys': key,
                'is_synced': 'false'
            })
            # expire after 28 days (2419200 seconds)
            self.redis.expire(link, 2419200)
        self.redis.incr(self.counter_key)

    def get_today_crawl_count(self):
        return int(self.redis.get(self.counter_key) or 0)

    def process(self):
        # search keywords: 动漫 (animation), 引流 (lead generation), 银行 (bank), 业务 (business)
        key_list = ['动漫', '引流', '银行', '业务']
        for key in key_list:
            self.spider(key)
        print(f"Records stored today for 中国国际招标有限公司: {self.get_today_crawl_count()}")


if __name__ == '__main__':
    bank_cntcitc = Cntcitc()
    bank_cntcitc.process()
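
Each announcement ends up as a Redis hash keyed by its full URL, with the fields title, show_times, keys (comma-separated search keywords) and is_synced. Below is a sketch of how a downstream job might read back unsynced records; this consumer is an assumption built on the same hypothetical config module, not part of the original project.

# Hypothetical consumer: scan stored announcements and process unsynced ones
import redis
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD

r = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB,
                password=REDIS_PASSWORD, decode_responses=True)

# announcement hashes are keyed by their full URL
for url in r.scan_iter(match='https://www.cntcitc.com.cn/*'):
    record = r.hgetall(url)  # {'title': ..., 'show_times': ..., 'keys': ..., 'is_synced': ...}
    if record.get('is_synced') == 'false':
        print(url, record['title'], record['show_times'], record['keys'])
        # ... write the record to MySQL here, then mark it synced ...
        r.hset(url, 'is_synced', 'true')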
