Creating isn't easy, and I do it purely out of love!!
I love sharing, so let's grow together!
"Your encouragement is what keeps me going"
Has your procurement colleague XX ever come to you complaining that the xinchuang (IT application innovation) products on a certain government procurement site are too numerous and too messy to filter, and that clicking into every single detail page is a huge waste of time? And so...
!!! The following is for study use only, to help you quickly filter for the products you need !!! No offence intended !!!
Scraping xinchuang products with a Python crawler
I generated an Excel file and handed it over, and procurement XX was blown away.
Bookmarking this as an illustrated walkthrough.
```python
# -*- coding: utf-8 -*-
"""For study use only, to quickly filter for the products you need!!! No offence intended!!!"""
import json

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

request_url = '...'
headersstr = '''
POST /front/index/search/search HTTP/1.1
Accept: application/json, text/plain, */*
...
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36
X-Requested-With: XMLHttpRequest
sec-ch-ua: "Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"
'''


def headers_str2dict(headersstr):
    """Convert a raw request-headers string (copied from DevTools) into a dict."""
    return dict(line.split(': ', 1) for line in headersstr.split('\n') if ': ' in line)


def data_str2dict(post_data_c, page=1):
    """Parse the POST payload string and set the requested page number."""
    post_data = json.loads(post_data_c)
    post_data['pageNo'] = page
    return post_data


class CInit:
    # Configuration
    fristurl = '...'
    title_after_dl = '...'
    # Global request headers (dict)
    header = headers_str2dict(headersstr)
    # Browser options: hide the usual automation fingerprints
    options = ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    driver = None

    def get_header(self, url):
        """Refresh Cookie and Referer from the browser after each page visit."""
        self.header['Cookie'] = '; '.join(
            c['name'] + '=' + c['value'] for c in self.driver.get_cookies())
        self.header['Referer'] = url
        # A stale Content-Length copied from DevTools would corrupt new requests
        self.header.pop('Content-Length', None)
        return self.header

    def end(self):
        print('=========================game over==========================')
        self.driver.get('about:blank')
        # self.driver.close()


def GetWebdriver(ini):
    """Start the browser and open the home page."""
    if ini.driver is None:
        print('***************** starting browser *********************')
        ini.driver = webdriver.Chrome(options=ini.options)
        ini.driver.get(ini.fristurl)
        try:
            WebDriverWait(ini.driver, 9).until(ec.title_contains(ini.title_after_dl))
        except Exception:
            print('failed to load the home page')
        print('***************** browser ready *********************')


def get_excel(ini, url_c, pages, post_data_c, filename):
    # Visit the listing page in the browser to pick up fresh cookies
    ini.driver.get(url_c)
    # Refresh the headers used by requests
    ini.get_header(url_c)
    css = []
    for page in range(1, pages):
        # Rebuild the POST payload for each page and fetch it
        post_data = data_str2dict(post_data_c, page)
        response = requests.post(request_url, headers=ini.header, data=json.dumps(post_data))
        res = response.json()
        # Flatten each product row into a dict
        for row in res['result']['searchWithAggs']['entities']['data']:
            # Keep only well-formed "key:value" attribute strings
            list_att = [r for r in row['attributes'] if r.count(':') == 1 and "'" not in r]
            tempdict = dict(att.split(':', 1) for att in list_att)
            tempdict['N品牌名'] = row['brandName']
            tempdict['N名称'] = row['originName']
            tempdict['N价格'] = row['price']
            tempdict['N规格'] = row['specification']
            tempdict['N后端分类'] = row['backCategoryName']
            tempdict['N分类'] = row['categoryName']
            tempdict['N网址链接'] = ini.fristurl + 'items/' + str(row['id']) + '?searchType=1'
            css.append(tempdict)
    # Dump the rows to Excel for easy visual filtering
    pd.DataFrame(css).to_excel(filename)


if __name__ == '__main__':
    ini = CInit()
    GetWebdriver(ini)
    url_c = '...'
    pages = 44 + 1
    post_data_c = '''
    {"pageNo":1,"pageSize":50,"matchDirectPurchase":false,"fcids":"1445395","hasStock":true,"deliveryCode":450302,"excludedIds":[],"tagSet":[],"sort":"0_0_0_0","normal":6}
    '''
    filename = r'D:\test服务器.xlsx'
    get_excel(ini, url_c, pages, post_data_c, filename)

    # Debugging scratchpad: fetch a single page directly and inspect the parsed rows
    # data = {"pageNo": 1, "pageSize": 50, "matchDirectPurchase": False, "fcids": "1445402",
    #         "hasStock": True, "deliveryCode": 450302, "excludedIds": [], "tagSet": [],
    #         "sort": "0_0_0_0", "normal": 6}
    # response = requests.post(request_url, headers=ini.get_header(url_c), data=json.dumps(data))
    # res = response.json()["result"]["searchWithAggs"]["entities"]["data"]
```
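The attribute cleanup inside `get_excel()` can be exercised on its own. A minimal sketch, where the attribute strings below are invented sample data, not real API output:

```python
# Invented sample of the "attributes" strings the product API returns
attributes = ["CPU型号:飞腾FT-2000", "内存容量:32GB", "malformed:a:b", "has'quote:1"]

# Keep only well-formed single-colon pairs without quotes, as get_excel() does,
# then split each one into a key/value pair
clean = [a for a in attributes if a.count(':') == 1 and "'" not in a]
record = dict(a.split(':', 1) for a in clean)
print(record)  # {'CPU型号': '飞腾FT-2000', '内存容量': '32GB'}
```

Entries with multiple colons or embedded quotes are simply dropped, which keeps one bad attribute from breaking the whole row.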
Comments and pointers welcome!
The right way to shop for xinchuang products: a Python crawler
1. Open the site and go into DevTools
2. Find the content you need
3. Prepare the request URL, headers, payload, and other details
4. Run the script, then open the resulting Excel file
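Step 3 is mostly about turning the raw header blob copied from DevTools into a dict that `requests` can send. A minimal sketch (the headers here are a shortened, made-up example):

```python
# A raw request-headers blob as copied from the DevTools Network tab
headersstr = '''
POST /front/index/search/search HTTP/1.1
Accept: application/json, text/plain, */*
X-Requested-With: XMLHttpRequest
'''

# Each "Name: value" line becomes one dict entry; the request line
# ("POST ... HTTP/1.1") has no ": " separator, so it is skipped
headers = dict(line.split(': ', 1) for line in headersstr.split('\n') if ': ' in line)
print(headers)
# {'Accept': 'application/json, text/plain, */*', 'X-Requested-With': 'XMLHttpRequest'}
```

Splitting on `': '` with `maxsplit=1` keeps header values that themselves contain colons (e.g. URLs) intact.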
end
**Hi there! I'm an engineer in the healthcare-IT industry. I love learning, love tinkering with hardware, love fiddling with all sorts of things, and I'll keep sharing. If you like my work, follow me!**
Previous highlights:
Health-code project notes: building an API inside Python's Flask framework (part 1)
Health-code project notes: building an API inside Python's Flask framework (part 2)
Health-code project notes: building an API inside Python's Flask framework (part 3)
Author | Healthcare-IT engineer's notes | Carltiger_github
Images | self-made | contact me for removal
Follow me, and let's grow together
"Your encouragement is what keeps me sharing"