天工矩阵爬虫

难点:js加密分析、请求头参数构造

# -*- coding: utf-8 -*-

"""====================================================================================================================
function :  http://www.titanmatrix.com/tgxx
            按照品牌和系列可以看到选型参数
            不同参数的选择可以得到不同的订货号
===================================================================================================================="""

import os
import csv
import json
import time

import execjs
import random
import hashlib
import requests
import pandas as pd
from loguru import logger
from bs4 import BeautifulSoup
from datetime import datetime
from itertools import product

from utils.request import Request


class TG:
    """Scraper for the product-selection API behind www.titanmatrix.com/tgxx.

    For one product series it builds every combination of the selectable
    parameter options, posts a signed ``getState`` request for each
    combination and stores name, order number, list price, version and the
    selected option names both in a per-series CSV file and in one sheet of
    the ``天工数据.xlsx`` workbook.
    """

    # Fixed values shared by the signing string and the request body.
    _APP_ID = '201010'
    _SERVICE_ID = 'a4966e02741c4cc091fe1834d00f149c'
    _SIGN_KEY = '72933362EAA649B893699E6191BC898F'
    _CLIENT_SYSTEM = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    )
    _CLIENT_VERSION = '1.0.0'
    _STATE_URL = 'https://macafe.titanmatrix.com/macafe/getState'
    _SIGN_SCRIPT = 'get_sign.js'
    _MAX_ATTEMPTS = 5

    def __init__(self):
        # Project-local HTTP helper; this class only calls its
        # ``requests_post`` method.
        self.rq = Request()

        # Headers sent with every getState request.
        self.headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'apiVersion': '1.0',
            'Content-Type': 'application/json;charset=UTF-8',
            'Host': 'macafe.titanmatrix.com',
            'Origin': 'https://www.titanmatrix.com',
            'Referer': 'https://www.titanmatrix.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
        }
        # Not referenced by any method of this class. NOTE(review): the fixed
        # Content-Length would not match other bodies if this is ever used.
        self.detail_headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'apiVersion': '1.0',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Length': '390',
            'Content-Type': 'application/json;charset=UTF-8',
            'Host': 'macafe.titanmatrix.com',
            'Origin': 'https://www.titanmatrix.com',
            'Referer': 'https://www.titanmatrix.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
        }
        # One workbook for the whole run; every series becomes a sheet.
        self.writer = pd.ExcelWriter('天工数据.xlsx')

        # Not referenced by any method of this class; kept for compatibility.
        self.treated_combines = []

        # Compiled signing script, created lazily on first use.
        self._sign_context = None

    # ------------------------------------------------------------------
    # Signing and payload construction
    # ------------------------------------------------------------------
    def _client(self):
        """Return the ``client`` object used in the body and in the signature."""
        return {"system": self._CLIENT_SYSTEM, "version": self._CLIENT_VERSION}

    def _series_param(self, seriesid, pkey="", action=None):
        """Build the ``param`` object; key order is part of the signed text."""
        param = {"serviceId": self._SERVICE_ID, "sid": seriesid, "pkey": pkey}
        if action is not None:
            param["action"] = action
        param["withParam"] = True
        param["noWaterMark"] = True
        return param

    @staticmethod
    def _select_action(pid, id):
        """Build the SELECT_PROP action; ``pid`` must come before ``id``."""
        return {"type": "SELECT_PROP", "payload": {"pid": pid, "id": id}}

    def _envelope(self, param, rank, u, sign):
        """Wrap ``param`` into the complete request body."""
        return {
            "appid": self._APP_ID,
            "client": self._client(),
            "param": param,
            "timestamp": u,
            "rank": rank,
            "sign": sign,
        }

    def _get_sign_context(self):
        """Compile ``get_sign.js`` once and reuse the compiled context."""
        context = getattr(self, '_sign_context', None)
        if context is None:
            with open(self._SIGN_SCRIPT, 'r', encoding='UTF-8') as file:
                context = execjs.compile(file.read())
            self._sign_context = context
        return context

    def _sign(self, u, rank, param):
        """Serialise the request fields and hash them with JS function ``c``."""
        # Compact separators produce JSON without any padding spaces while
        # leaving spaces inside string values untouched.
        client_json = json.dumps(self._client(), separators=(',', ':'))
        param_json = json.dumps(param, separators=(',', ':'))
        text = (
            f"appid={self._APP_ID}&client={client_json}&param={param_json}"
            f"&rank={rank}&timestamp={u}&token=&key={self._SIGN_KEY}"
        )
        return self._get_sign_context().call("c", text)

    def gen_sign(self, u, rank, seriesid):
        """Return the signature for the initial (no selection) request.

        ``seriesid`` has to be obtained manually or via selenium.

        Args:
            u: Timestamp string that is also sent as ``timestamp``.
            rank: Random digit string that is also sent as ``rank``.
            seriesid: Identifier of the product series.
        """
        return self._sign(u, rank, self._series_param(seriesid))

    def gen_sign_more(self, u, rank, seriesid, pkey, pid, id):
        """Return the signature for a request that selects one option.

        Args:
            u: Timestamp string that is also sent as ``timestamp``.
            rank: Random digit string that is also sent as ``rank``.
            seriesid: Identifier of the product series.
            pkey: Option ids of the current combination joined by ``_``.
            pid: Id sent as ``action.payload.pid``.
            id: Id sent as ``action.payload.id``.
        """
        param = self._series_param(seriesid, pkey, self._select_action(pid, id))
        return self._sign(u, rank, param)

    def gen_payload(self, seriesid, rank, u, sign):
        """Return the request body for the initial state of a series."""
        return self._envelope(self._series_param(seriesid), rank, u, sign)

    def gen_more_payload(self, seriesid, rank, u, sign, pkey, pid, id):
        """Return the request body that applies a SELECT_PROP action."""
        param = self._series_param(seriesid, pkey, self._select_action(pid, id))
        return self._envelope(param, rank, u, sign)

    @staticmethod
    def _stamp():
        """Return a fresh ``(timestamp, rank)`` pair for one request."""
        u = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        rank = str(random.randint(1000000000000000, 9999999999999999))
        return u, rank

    # ------------------------------------------------------------------
    # HTTP
    # ------------------------------------------------------------------
    def request(self, payload):
        """POST ``payload`` to the getState endpoint, retrying on errors.

        Returns:
            Whatever ``self.rq.requests_post`` returns.

        Raises:
            RuntimeError: If all attempts fail; the last error is chained as
                ``__cause__``.
        """
        body = json.dumps(payload)
        last_error = None
        for attempt in range(1, self._MAX_ATTEMPTS + 1):
            try:
                return self.rq.requests_post(self._STATE_URL, data=body, headers=self.headers)
            except Exception as err:  # retry boundary; Ctrl-C still propagates
                last_error = err
                print(f'try {attempt} times')
        raise RuntimeError(
            f'POST {self._STATE_URL} failed after {self._MAX_ATTEMPTS} attempts'
        ) from last_error

    def _fetch_state(self, payload):
        """Send ``payload`` and return the decoded JSON response."""
        return json.loads(self.request(payload).text)

    # ------------------------------------------------------------------
    # Combination handling
    # ------------------------------------------------------------------
    def get_all_combines(self, all_para):
        """Return the cartesian product of all option-id lists as tuples."""
        return list(product(*all_para.values()))

    def get_replace_combine(self, replace_para, now_value, combine):
        """Return ``combine`` with the pivot option swapped for a sibling.

        Args:
            replace_para: Dict with ``ids`` (all option ids of the pivot
                parameter) and optionally ``index`` (its position in
                ``combine``).
            now_value: Option id currently chosen for the pivot parameter.
            combine: Sequence of option ids, one per parameter.

        Returns:
            A new list. When ``index`` points at ``now_value`` only that
            position is replaced; otherwise every element equal to
            ``now_value`` is replaced.

        Raises:
            RuntimeError: If ``ids`` holds no id other than ``now_value``.
        """
        for candidate in replace_para['ids']:
            if candidate != now_value:
                replace_id = candidate
                break
        else:
            raise RuntimeError(
                f'no alternative option to {now_value!r} in {replace_para["ids"]!r}'
            )

        index = replace_para.get('index')
        if index is not None and 0 <= index < len(combine) and combine[index] == now_value:
            # Only the pivot slot may change; an equal id in another slot
            # belongs to a different parameter.
            new_combine = list(combine)
            new_combine[index] = replace_id
            return new_combine
        return [replace_id if value == now_value else value for value in combine]

    def get_datas(self, seriesid, all_para, replace_para, csv_f, treated_models):
        """Query every option combination and collect one row per combination.

        Each request carries a ``pkey`` in which the pivot slot holds a
        different option, together with a SELECT_PROP action for the wanted
        option of that slot.

        Args:
            seriesid: Identifier of the product series.
            all_para: Mapping of parameter id to the list of its option ids.
            replace_para: Pivot description with ``index``, ``ids``, ``pid``.
            csv_f: ``csv.writer`` that receives the new rows.
            treated_models: Names already present in the CSV; rows whose name
                is in this collection are not written again.

        Returns:
            All rows, including those that were not written to the CSV.

        Raises:
            KeyError: If ``replace_para`` lacks ``index`` or ``pid`` (e.g. it
                is empty) or the response lacks an expected field.
        """
        index = replace_para['index']
        pid = replace_para['pid']
        already_written = set(treated_models)  # O(1) membership per row

        all_combines = self.get_all_combines(all_para)
        print('all_combines', len(all_combines))
        datas = []
        for i, combine in enumerate(all_combines):
            logger.info(f'{i}, {combine}')
            time.sleep(0.1)
            now_value = combine[index]
            new_combine = self.get_replace_combine(replace_para, now_value, combine)
            u, rank = self._stamp()
            pkey = '_'.join(str(item) for item in new_combine)
            sign = self.gen_sign_more(u, rank, seriesid, pkey, pid, now_value)
            payload = self.gen_more_payload(seriesid, rank, u, sign, pkey, pid, now_value)
            entity = self._fetch_state(payload)['entity']
            name = entity['name']
            mcode = entity['mcode']
            price = entity['price']
            date = entity['version']
            # Names of the options flagged as selected, in response order.
            para_values = [
                para_item['name']
                for para in entity['props']
                for para_item in para['items']
                if para_item['selected']
            ]
            data = [name, mcode, price, date] + para_values
            print(data)
            if name not in already_written:
                csv_f.writerow(data)
            datas.append(data)
        return datas

    def treat_seriesid(self, seriesid):
        """Scrape one series and export it to CSV and to the workbook.

        Args:
            seriesid: ``(company_name, series_name, series_id)`` tuple.
        """
        company_name, series_name, seriesid = seriesid[0], seriesid[1], seriesid[2]
        csv_path = f'{company_name}_{series_name}.csv'

        # First column of the rows left by an earlier run (header included).
        treated_models = []
        if os.path.exists(csv_path):
            with open(csv_path, encoding='utf-8', newline='') as existing:
                treated_models = [row[0] for row in csv.reader(existing) if row]

        u, rank = self._stamp()
        sign = self.gen_sign(u, rank, seriesid)
        result = self._fetch_state(self.gen_payload(seriesid, rank, u, sign))

        paras = result['entity']['props']
        para_names = []    # column titles of the parameters
        all_para = {}      # parameter id -> option ids
        replace_para = {}  # first parameter that has more than one option
        for index, para in enumerate(paras):
            para_id = para['id']
            para_items = para['items']
            para_names.append(para['name'].replace(':', ''))
            item_ids = [para_item['id'] for para_item in para_items]
            if not replace_para and len(para_items) > 1:
                replace_para = {'index': index, 'ids': tuple(item_ids), 'pid': para_id}
            all_para[para_id] = item_ids

        # Columns: part name, order number, list price, date, then parameters.
        columns = ['部件名称', '订货号', '表价', '日期'] + para_names
        with open(csv_path, 'a', encoding='utf-8', newline='') as out_file:
            csv_f = csv.writer(out_file)
            if not treated_models:
                # Write the header only once so re-runs do not duplicate it.
                csv_f.writerow(columns)
            # TODO: assumes the parameter set does not change with the selection.
            datas = self.get_datas(seriesid, all_para, replace_para, csv_f, treated_models)

        df = pd.DataFrame(data=datas, columns=columns)
        # ``encoding`` is no longer accepted by ``to_excel`` in pandas 2.x.
        df.to_excel(self.writer, index=False, sheet_name=f'{company_name}_{series_name}')

    def main(self):
        """Scrape the configured series and write the workbook to disk."""
        series_ids = [('常熟开关制造有限公司', 'CH3N-63系列小型断路器', 24031),
                      ('北元电器有限公司', 'BB1L-63系列小型漏电断路器', 22234)]
        for series_id in series_ids:
            logger.info(series_id)
            self.treat_seriesid(series_id)
            # NOTE(review): only the first series is processed because of this
            # break; it looks like a debugging leftover - confirm and remove.
            break
        # ``close`` writes the file; ``save`` no longer exists in pandas 2.x.
        self.writer.close()


if __name__ == '__main__':
    # Script entry point: build the scraper and run the export.
    TG().main()

注:本文仅可用于技术交流,请勿用于非法用途,欢迎咨询(q 1461124250)。

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值