"""为方便公司内部销售员工更快捷地获取招标信息,开发了招标大师邮件提醒服务;本项目目前已收集 10+ 个招标系统,稳定运行一年,有需要的自取。"""
import requests
from bs4 import BeautifulSoup
import json
import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
from jinja2 import Template
from dateutil import parser
import pytz # 导入 pytz 模块
# from datetime import strptime
import sys
def get_dongfang(days):
    """Fetch tender announcements from the Dongfang Securities supplier portal.

    Args:
        days: Look-back window; only announcements created within the last
            ``days`` days are returned.

    Returns:
        dict: Mapping of announcement title -> detail-page URL. Empty dict
        when the HTTP request fails.
    """
    url = "https://supplier.dfzq.com.cn/suppliers/announcement/getAnnouncement"
    # Headers copied from a real browser session; the portal appears to gate
    # access on sessionId/tgticketUrl/license -- TODO confirm they stay valid.
    headers = {
        "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        "sec-ch-ua-mobile": "?0",
        "Authorization": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "isOut": "supplier",
        "sessionId": "dfef71f6-24da-1e85-a952-d005332e4e20",
        "tgticketUrl": "aHR0cHM6Ly9zdXBwbGllci5kZnpxLmNvbS5jbi9zdXBwbGllcnMtcG9ydGFsL3BhZ2Uvc3VwcGxpZXIvYW5ub3VuY2VtZW50L2JpZGRpbmctbGlzdC5odG1s",
        "license": "telehot-inner",
        "sec-ch-ua-platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://supplier.dfzq.com.cn/suppliers-portal/page/supplier/announcement/bidding-list.html",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "isload=true"
    }
    # Timeout prevents the whole digest run from hanging on one slow site.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    data = response.json()
    urls = {}
    now = datetime.datetime.now()  # hoisted: one reference time for all rows
    for result in data["result"]["announcementList"]["content"]:
        # 'createTime' format varies, so let dateutil sniff it.
        publish_time = parser.parse(result['createTime'])
        if (now - publish_time).days <= days:
            urls[result['title']] = (
                'https://supplier.dfzq.com.cn/announcementView?id=' + result['id']
            )
    return urls
#获取海通官网的招标文件
def get_haitong_urls(days):
    """Fetch procurement notices from the Haitong Securities ITMS API.

    Args:
        days: Look-back window; only notices published within the last
            ``days`` days are returned.

    Returns:
        dict: Mapping of notice title -> detail-page URL. Empty dict when
        the HTTP request fails.
    """
    url = 'https://itms.haitong.com/api/index/notice'
    params = {
        'currentPage': '1',
        'pageSize': '30',
        'title': '',
        'noticeFrom': '',
        'noticeType': '01'
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Referer': 'https://itms.haitong.com/',
        'Connection': 'close',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'Authorization': 'null',
        'sec-ch-ua-platform': '"Windows"',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    response = requests.get(url, params=params, headers=headers, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    data = response.json()
    urls = {}
    now = datetime.datetime.now()
    # Iterate the items directly instead of index loops (and avoid shadowing
    # the builtin ``id`` as the original did).
    for item in data['data']['list']:
        publish_time = parser.parse(item['publishTime'])
        if (now - publish_time).days <= days:
            urls[item['title']] = (
                'https://itms.haitong.com/#/procurnotice?type=&id=' + str(item['id'])
            )
    return urls
#获取国泰官网的招标文件
def get_guangwang_urls(days):
    """Fetch purchase bulletins from the Guotai Junan Futures site.

    The page embeds its data as Next.js ``__NEXT_DATA__`` JSON; bulletins
    whose ``catalog`` is "采购公告" are kept.

    Args:
        days: Look-back window; only bulletins updated within the last
            ``days`` days are returned.

    Returns:
        dict: Mapping of bulletin title -> detail-page URL. Empty dict on
        any failure (bad status code or missing script tag).
    """
    url = "https://www.gtjaqh.com/pc/bulletin"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    }
    # The site's certificate chain fails local verification, so verification
    # is disabled and the resulting urllib3 warning suppressed.
    requests.packages.urllib3.disable_warnings()
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        return {}
    soup = BeautifulSoup(response.text, "html.parser")
    script_tag = soup.find("script", id="__NEXT_DATA__")
    if not script_tag:
        # BUG FIX: the original fell off the end and returned None here,
        # which broke callers that merge the result via dict unpacking.
        print("Script tag not found.")
        return {}
    data = json.loads(script_tag.string)
    urls = {}
    now = datetime.datetime.now()
    for bulletin in data["props"]["pageProps"]["BULLETIN"]:
        if bulletin["catalog"] != "采购公告":
            continue
        publish_time = datetime.datetime.strptime(bulletin["update_time"], "%Y-%m-%d %H:%M:%S")
        if (now - publish_time).days <= days:
            urls[bulletin["title"]] = f'https://www.gtjaqh.com/pc/a/{bulletin["id"]}'
    return urls
#获取国泰官网的招标文件采购平台2024/05/22 新增
def get_guangwang_caigou_urls(days):
    """Scrape supplier-procurement announcements from the Guotai Junan site.

    The page exposes no JSON API: each ``<li>`` carries an ``<h3>`` title and
    a ``<span>`` of the form "发布时间:2024年05月22日".

    Args:
        days: Look-back window; only announcements published within the last
            ``days`` days are kept.

    Returns:
        dict: Mapping of announcement title -> the listing-page URL (the page
        has no per-item links). Empty dict when the HTTP request fails.
    """
    # Local import deliberately shadows the module-level ``datetime`` module
    # only inside this function.
    from datetime import datetime, timedelta
    url = 'http://www.gtja.com/content/info-open/supplier.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    response = requests.get(url, headers=headers, timeout=30)
    # BUG FIX: the original parsed the body without checking the status code.
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    urls = {}
    soup = BeautifulSoup(response.text, 'html.parser')
    now = datetime.now()
    for li in soup.find_all('li'):
        span_tag = li.find('span')
        if not span_tag:
            continue
        try:
            span_date = datetime.strptime(span_tag.get_text().strip(), "发布时间:%Y年%m月%d日")
        except ValueError:
            # Spans that are not publish dates are skipped silently.
            continue
        if now - span_date <= timedelta(days):
            h3_tag = li.find('h3')
            if h3_tag:
                urls[h3_tag.get_text().strip()] = url
    return urls
#获取上海证券官网的招标文件采购平台2024/08/12 新增
def get_shanghaizhengquan_urls(days):
    """Fetch tender announcements from the Shanghai Securities site.

    Args:
        days: Look-back window; only announcements published within the last
            ``days`` days are returned.

    Returns:
        dict: Mapping of announcement title -> detail-page URL. Empty dict
        when the HTTP request fails.
    """
    url = 'https://www.shzq.com/bwsp/common'
    headers = {
        'Host': 'www.shzq.com',
        'Connection': 'close',
        'Content-Length': '247',
        'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'sec-ch-ua-platform': '"Windows"',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Origin': 'https://www.shzq.com',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://www.shzq.com/about/news.html?type=zbxx',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    # The site proxies an internal articles API through /bwsp/common.
    payload = {
        'req_url': '/bbizp/articles',
        'req_http': 'get',
        'web_categoryCodes': 'uda_web_zjsz_zbxx',
        'web_pageIndex': '1',
        'web_pageSize': '10',
        'web_authAppToken': '258b2d917e97b9e5',
        'web_publishDateBegin': '2010-01-01',
        'web_currentPage': 'https://www.shzq.com/about/news.html?type=zbxx'
    }
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    data = response.json()
    urls = {}
    now = datetime.datetime.now()
    for item in data['dataList']:
        publish_time = parser.parse(item['publishDate'])
        if (now - publish_time).days <= days:
            urls[item['title']] = (
                'https://www.shzq.com/about/newsContent.html?pageId=' + str(item['id'])
            )
    return urls
#获取国泰etu365
def get_guotai_urls(days):
    """Fetch Guotai Junan tender notices from the etu365 aggregator.

    Args:
        days: Look-back window; only notices whose ``ggrq_begin`` date is
            within the last ``days`` days are returned.

    Returns:
        dict: Mapping of project name (``bdmc``) -> detail-page URL. Empty
        dict when the HTTP request fails.
    """
    url = "https://www.etu365.com/api/v1/ztb/listzbggscreen_new"
    params = {
        "iszb": "1",
        "zbr": "",
        "zbdljg": "",
        "time": "",
        "keyword": "国泰君安",
        "gglx": "0",
        "pageSize": "10",
        "page": "1"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Referer": "https://www.etu365.com/notice?id=1&zbr=&zbdljg=&time=&keyword=%E5%9B%BD%E6%B3%B0%E5%90%9B%E5%AE%89",
        "Cookie": "HWWAFSESID=706c57305724d5cff8; HWWAFSESTIME=1698720360529",
        "token": "null"
    }
    response = requests.get(url, params=params, headers=headers, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    data = response.json()
    urls = {}
    now = datetime.datetime.now()
    for item in data['data']['data']:
        # 'ggrq_begin' is the announcement's publish date string.
        publish_time = parser.parse(item['ggrq_begin'])
        if (now - publish_time).days <= days:
            urls[item['bdmc']] = (
                'https://webapp.etu365.com/ProjectMgr/ZBGG?ggid=' + str(item['id'])
            )
    return urls
def get_nongyin(days):
    """Fetch ABC-CA Fund (农银汇理) procurement notices from cfcpn.com.

    Args:
        days: Look-back window; only notices published within the last
            ``days`` days are returned.

    Returns:
        dict: Mapping of notice title -> detail-page URL. Empty dict when
        the HTTP request fails.
    """
    url = "http://www.cfcpn.com/jcw/noticeinfo/noticeInfo/dataNoticeList"
    payload = {
        "noticeType": "",
        "pageNo": "",
        "noticeState": "1",
        "isValid": "1",
        "orderBy": "publish_time desc",
        "noticeContent": "",
        "briefContent": "农银汇理",
        "noticeTitle": "",
        "purchaseName": "",
        "purchaseId": "",
        "categoryLabName": "",
        "beginPublishTime": "",
        "endPublishTime": "",
        "areaProvince": "",
        "labelAllId": ""
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
        "Referer": "http://www.cfcpn.com/jcw/sys/index/goUrl?url=modules/sys/login/list&column=cggg",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "close",
        "Cookie": "pageNo=0; pageSize=10"
    }
    response = requests.post(url, data=payload, headers=headers, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    data = response.json()
    urls = {}
    now = datetime.datetime.now()
    for row in data['rows']:
        # publishTime carries fractional seconds, e.g. "2024-01-01 08:00:00.0".
        publish_time = datetime.datetime.strptime(row['publishTime'], "%Y-%m-%d %H:%M:%S.%f")
        if (now - publish_time).days <= days:
            urls[row['noticeTitle']] = (
                'http://www.cfcpn.com/jcw/sys/index/goUrl?url=modules/sys/login/detail&column=undefined&searchVal=' + row['id']
            )
    return urls
# 获取yilian网招标信息 2024/9/2
def get_yilianwang(days, keyword='IT'):
    """Fetch tender notices from the onelinkplus (亿联) trade platform.

    Args:
        days: Look-back window; only notices released within the last
            ``days`` days are returned.
        keyword: Substring that must appear in the notice's ``categoryName``
            (defaults to 'IT', matching the previously hard-coded filter).

    Returns:
        dict: Mapping of business name -> detail-page URL. Empty dict when
        the HTTP request fails.
    """
    url = "https://onelinkplus.com/api/saas-portal/noauth/trans/trade/pageEs"
    headers = {
        "Host": "onelinkplus.com",
        "Connection": "close",
        "Content-Length": "390",
        "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
        "Accept": "application/json, text/plain, */*",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-ch-ua-mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "Content-Type": "application/json;charset=UTF-8",
        "Origin": "https://onelinkplus.com",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://onelinkplus.com/",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "HWWAFSESID=ed0e7738ede363f223; HWWAFSESTIME=1725257712325; TODAY_NO_MORE_REMINDER=0; noticeLength=1; sajssdk_2015_cross_new_user=1; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=zh_CN; org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE2=zh_CN; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22undefined%22%2C%22first_id%22%3A%22191b16c3c15581-0d8992149c7a0d-26001151-1764000-191b16c3c16888%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22191b16c3c15581-0d8992149c7a0d-26001151-1764000-191b16c3c16888%22%7D; COMPANY_COMPLETE_STATUS_undefined=0; checkIsWanbangGroup=1; checkIsFosunGroup=1; isExperience=; companyName=undefined; regeisterType=; isCompanyManager=; isRemember=100; rememberInfo=BImp5GES3WFc9q6gw7YWMcJVVISmDL4SrC26vcd/M5yQMDCIAQxJGWPWTYJgzohnoLvs/DFs8e3z8nZN3UlCXRxaTZdIoLchqaKci76OoLI="
    }
    payload = {
        "dataSource": "",
        "pageNum": 1,
        "pageSize": 20,
        "tradePattern": "882296043327582208",
        "noticeType": "1,2",
        "businessName": "",
        "tendererName": "",
        "bidderName": "",
        "releaseEndTime": "",
        "releaseStartTime": None,
        "purchaseProjectType": "",
        "purchaseMode": "",
        "platformId": "882289088363483137",
        "industry": "",
        "budgetRange": "",
        "categoryName": "",
        "tendererChooseNames": "",
        "classifySize": 0,
        "fieldName": "",
        "keySort": []
    }
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:" + str(response.status_code))
        return {}
    data = response.json()
    urls = {}
    # Compare in UTC so both datetimes are timezone-aware.
    now = datetime.datetime.now(pytz.UTC)
    for item in data['data']['list']:
        # releaseTime is an epoch timestamp in milliseconds.
        publish_time = datetime.datetime.fromtimestamp(item['releaseTime'] / 1000, tz=pytz.UTC)
        if (now - publish_time).days <= days and keyword in str(item['categoryName']):
            # BUG FIX: the original URL literal contained "¬iceType="
            # (mojibake of "&noticeType=" where "&not" was entity-decoded),
            # producing broken links in the e-mail.
            urls[item['businessName']] = (
                'https://onelinkplus.com/#/trade-info-detail?id=' + str(item['id'])
                + "&noticeType=" + str(item['noticeType'])
                + "&publishStatus=" + str(item['publishStatus'])
            )
    return urls
#####################################################################
def send_email(urls, days):
    """Render the collected tender URLs into an HTML digest and e-mail it.

    Args:
        urls: Mapping of announcement title -> URL. When empty, no message
            is built or sent.
        days: Look-back window in days; used only to compute the date range
            shown in the subject line.

    Returns:
        None. Success/failure is reported via ``print``.
    """
    # SMTP endpoint (Aliyun enterprise mail, implicit TLS on 465).
    smtp_server = 'smtp.qiye.aliyun.com'
    smtp_port = 465
    # Credentials and the comma-separated recipient list are intentionally
    # blank here; fill them in before deploying.
    sender_email = ''
    sender_password = ''
    receiver_email = ''
    # Bail out early so no message is built when there is nothing to report
    # (the original built the whole MIME message first, then discarded it).
    if not urls:
        print("最近没有招标文件哦,请耐心等待!")
        return
    message = MIMEMultipart("alternative")
    message['From'] = '招标大师'
    # Subject shows the covered window: (now - days) .. now.
    current_datetime = datetime.datetime.now()
    previous_datetime = current_datetime - datetime.timedelta(days)
    formatted_current_datetime = current_datetime.strftime('%H:%M:%S, %Y-%m-%d')
    formatted_previous_datetime = previous_datetime.strftime('%H:%M:%S, %Y-%m-%d')
    message['Subject'] = f'【Urgent】招标文件提醒,别让机会溜走!({formatted_previous_datetime}至{formatted_current_datetime})'
    # Jinja2 HTML body: one paragraph/link per collected announcement.
    html_template = """
    <html>
    <head>
    <style>
    body {
    font-family: 'Comic Sans MS', cursive, sans-serif;
    font-size: 16px;
    color: #333;
    }
    h1 {
    font-family: 'Arial', sans-serif;
    font-size: 24px;
    color: #ff00ff;
    text-decoration: underline;
    }
    p {
    margin-bottom: 10px;
    }
    a {
    color: #0066cc;
    text-decoration: none;
    font-weight: bold;
    }
    </style>
    </head>
    <body>
    <p>亲爱的小伙伴们,</p>
    <p>我是你们的机会捕捉者,特地为你们带来了一些好消息!</p>
    <p>请查收以下信息:</p>
    {% for key, value in urls.items() %}
    <p>请点击<a href='{{ value }}'>{{ key }}</a>查看更多内容。</p>
    {% endfor %}
    <p>祝你们好运!</p>
    <p>机会捕捉者 敬上</p>
    </body>
    </html>
    """
    html = Template(html_template).render(urls=urls)
    message.attach(MIMEText(html, "html"))
    try:
        # BUG FIX: the context manager closes the connection even when
        # login/sendmail raises; the original only called quit() on success,
        # leaking the socket on failure.
        with smtplib.SMTP_SSL(smtp_server, smtp_port) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, receiver_email.split(','), message.as_string())
        print("邮件发送成功")
    except Exception as e:
        print("邮件发送失败:", str(e))
def main():
    """Collect announcements from every source and e-mail the digest."""
    days = 3
    # Merge order matters: later collectors override duplicate titles,
    # matching the original dict-unpacking merge.
    collectors = (
        get_guangwang_urls,
        get_guotai_urls,
        get_nongyin,
        get_dongfang,
        get_guangwang_caigou_urls,
        get_haitong_urls,
        get_shanghaizhengquan_urls,
        get_yilianwang,
    )
    merged = {}
    for collect in collectors:
        merged.update(collect(days))
    send_email(merged, days)


if __name__ == "__main__":
    main()