Protocol Buffers笔记

看见我书了吗

已于 2023-01-24 21:48:14 修改

阅读量539

点赞数

分类专栏：爬虫 文章标签：python

于 2022-03-02 16:04:26 首次发布

本文链接：https://blog.csdn.net/qq_43572758/article/details/123232775

版权

爬虫专栏收录该内容

19 篇文章 0 订阅

订阅专栏

这篇博客介绍了如何使用protoc命令生成Python接口文件和描述文件，涉及tbclient目录下的多个.proto文件。同时，展示了如何从protobuf响应中解析数据，包括将八进制转义的UTF-8字节和Unicode转义（\uXXXX）转换为中文。还提供了一个根据Java文件反向生成.proto文件的简单方法，并给出了一个利用protoc --decode_raw解析protobuf响应并转化为proto文件结构的示例。

摘要由CSDN通过智能技术生成

生成python文件

protoc -I=./tbclient --python_out=./out ./tbclient/*.proto
protoc -I=./tbclient/Personalized --python_out=./out/Personalized ./tbclient/Personalized/*.proto

protoc -I=./tbclient/ --python_out=./out/PbPage ./tbclient/PbPage/*.proto

生成desc文件

protoc -I=./tbclient --descriptor_set_out=./out/res.desc ./tbclient/Personalized/*.proto ./tbclient/*.proto

将desc文件导入Charles后即可解码查看(View → Viewer Mappings)

protobuf的文本输出中，非ASCII内容以8进制转义表示的UTF-8字节给出

b"\344\275\240\345\245\275".decode()
'你好'

根据Java文件粗略反向生成.proto文件:

复制java文件中ProtoField那段代码然后用正则匹配.

import re

def java_fields_to_proto(java_source):
    """Convert Java ``ProtoField`` declarations into rough .proto field lines.

    The text is split on ``@`` so that each piece holds at most one
    annotation together with the ``public final`` field that follows it.

    Args:
        java_source: Java source text containing ``ProtoField(...)``
            annotations followed by ``public final <Type> <name>;`` fields.

    Returns:
        A list of strings shaped like ``"<type> <name> = <tag>;"``. When the
        annotation carries no ``Message.Datatype.X`` the type is the
        placeholder ``@@@`` so it can be filled in by hand.
    """
    proto_lines = []
    for chunk in java_source.split('@'):
        name_match = re.search(r'public final .*? (.*?);', chunk)
        tag_match = re.search(r'tag = (\d+)', chunk)
        # Pieces without both a field and a tag (text before the first
        # annotation, unrelated annotations) carry nothing to emit.
        if name_match is None or tag_match is None:
            continue
        # \w+ stops at the enum name, so extra annotation arguments after
        # the datatype are not swallowed into the type.
        type_match = re.search(r'Message\.Datatype\.(\w+)', chunk)
        field_type = type_match.group(1).lower() if type_match else '@@@'
        proto_lines.append(
            f"{field_type} {name_match.group(1)} = {tag_match.group(1)};"
        )
    return proto_lines


s = """ProtoField(tag = 68, type = Message.Datatype.STRING)
    public final String ad_context_list;
    。。。。。
    @ProtoField(tag = 48, type = Message.Datatype.STRING)
    public final String yuelaou_params;"""

for line in java_fields_to_proto(s):
    print(line)

输出如下:
在这里插入图片描述

// Minimal proto3 file. Presumably the skeleton into which the generated
// field lines are pasted -- TODO confirm against the original post.
syntax = "proto3";
package tttt;
option java_package = "tttt";

// Example message declaring a single int32 field with tag 1.
message CommonReq {
  int32 _client_type = 1;
}

解析response

import os
import re
import subprocess

def decode_octal_escapes(text):
    """Replace runs of ``\\ooo`` octal escapes with the UTF-8 text they encode.

    Args:
        text: Text containing backslash + three-octal-digit escapes.

    Returns:
        ``text`` with every run that forms valid UTF-8 replaced by the decoded
        characters. Runs that are not valid UTF-8 (for example raw binary
        fields) are left exactly as they were instead of raising.
    """
    def _replace(match):
        # "\344\275\240" -> ['', '344', '275', '240'] -> b'\xe4\xbd\xa0'
        octets = bytes(int(code, 8) for code in match.group().split('\\')[1:])
        try:
            return octets.decode('utf-8')
        except UnicodeDecodeError:
            return match.group()

    # [0-3][0-7]{2} restricts each escape to a value that fits in one byte.
    # re.sub scans the input once, so decoded text is never re-interpreted.
    return re.sub(r'(?:\\[0-3][0-7]{2})+', _replace, text)


def decode_unicode_escapes(text):
    """Replace runs of ``\\uXXXX`` escapes with the characters they encode.

    A single extra backslash directly in front of a run (``\\\\uXXXX``) is
    removed together with the run.

    Args:
        text: Text containing ``\\u`` + four-hex-digit escapes.

    Returns:
        ``text`` with the escapes decoded. Surrogate pairs are combined into
        one character; a run containing an unpaired surrogate is left as is.
    """
    def _replace(match):
        units = re.findall(r'\\u([0-9a-fA-F]{4})', match.group(1))
        utf16 = b''.join(int(unit, 16).to_bytes(2, 'big') for unit in units)
        try:
            # Decoding as UTF-16 joins high/low surrogate pairs correctly.
            return utf16.decode('utf-16-be')
        except UnicodeDecodeError:
            return match.group()

    # Only real hex digits qualify, so text such as "\users" is not mistaken
    # for an escape.
    return re.sub(r'\\?((?:\\u[0-9a-fA-F]{4})+)', _replace, text)


if __name__ == '__main__':
    # Absolute path of the "temp" file that sits next to this script.
    file = os.path.join(os.path.dirname(__file__), 'temp')

    # Feed the file to `protoc --decode_raw` on stdin. Passing an argument
    # list (no shell) keeps paths with spaces or shell metacharacters intact.
    with open(file, 'rb') as payload:
        completed = subprocess.run(
            ['protoc', '--decode_raw'],
            stdin=payload,
            stdout=subprocess.PIPE,
        )
    result = completed.stdout.decode()

    # Turn octal-escaped UTF-8 bytes into readable text.
    result = decode_octal_escapes(result)

    # Turn \uXXXX escapes into readable text.
    result = decode_unicode_escapes(result)

    # Final converted output.
    print(result)

在这里插入图片描述

使用protoc --decode_raw 一键生成大致的proto文件

import subprocess
import re
from pprint import pprint
from collections import OrderedDict


# Location of the binary protobuf response that will be decoded.
file = '/Users/wiliam/Desktop/QQMusic/response_search'

# Let the shell redirect the file into `protoc --decode_raw` and capture the
# text dump from stdout.
p = subprocess.Popen(
    f'protoc --decode_raw < {file}',
    shell=True,
    stdout=subprocess.PIPE,
)
raw, err = p.communicate()  # stderr is not piped, so err is None
raw = raw.decode()


# def protoc_text_to_dict():
#     text = re.sub('\n', ',\n', raw)
#     part = re.search('(\d) {,', text)
#     while part:
#         start, end = part.span()
#         text = text[:start] + part.group(1) + ': {' + text[end:]
#         part = re.search('(\d) {,', text)
#     text = '{\n' + text + '}'
#     re.findall('^ +\d+: ', text)
#     return eval(text)


class ProtoDict(OrderedDict):
    """Ordered mapping that collects repeated assignments to a key in a list.

    The first assignment stores the value as is. A second assignment to the
    same key turns the entry into ``[first, second]``; later assignments
    append to that list.
    """

    def __setitem__(self, key, value):
        # Test for presence, not truthiness: an earlier 0, "" or empty nested
        # message is still a value and must not be overwritten.
        if key not in self:
            super().__setitem__(key, value)
            return
        existing = self[key]
        if isinstance(existing, list):
            existing.append(value)
        else:
            super().__setitem__(key, [existing, value])


def protoc_text_to_dict(text=None):
    """Parse the text printed by ``protoc --decode_raw`` into a ``ProtoDict``.

    Args:
        text: The decoded text to parse. When omitted, the module-level
            ``raw`` string is used.

    Returns:
        A ``ProtoDict`` keyed by field tag (as a string). Scalar lines become
        Python ints or strs, ``N { ... }`` groups become nested ``ProtoDict``
        objects, and tags that occur more than once become lists.
    """
    import ast

    if text is None:
        text = raw

    proto_dict = ProtoDict()
    stack = []
    current = proto_dict

    for line in text.split('\n'):
        opened = re.match(r'^ *(\d+) \{', line)
        if opened:
            # Keep a direct reference to the new child: once the tag repeats,
            # current[tag] is a list rather than the message just added.
            child = ProtoDict()
            current[opened.group(1)] = child
            stack.append(current)
            current = child
            continue
        if re.match(r'^ *\}', line):
            # Ignore an unmatched closing brace instead of popping an empty stack.
            if stack:
                current = stack.pop()
            continue
        scalar = re.match(r'^ *(\d+): (.*)', line)
        if scalar:
            literal = scalar.group(2).strip()
            try:
                # literal_eval accepts only literals (ints, hex ints, quoted
                # strings), so the external text is never executed as code.
                value = ast.literal_eval(literal)
            except (ValueError, SyntaxError):
                # Not a Python literal: keep the text so the field survives.
                value = literal
            current[scalar.group(1)] = value
    return proto_dict


def parse_protobuf_dict(protobuf_dict, index=1):
    """Print guessed proto field declarations for a parsed message.

    Args:
        protobuf_dict: Mapping of tag -> value, where a value is a str, an
            int, a nested dict, or a list of those for a repeated tag.
        index: Nesting depth; each level indents the output by four spaces.

    Raises:
        TypeError: If a value is not a str, int, dict or list of those.
    """
    space = '    ' * index
    for tag, value in protobuf_dict.items():
        is_repeated = isinstance(value, list)
        samples = value if is_repeated else [value]
        if not samples:
            continue
        # The first sample decides the kind of field.
        first = samples[0]
        if isinstance(first, dict):
            # Pass the original value through so the message printer can
            # still tell whether the field is repeated.
            parse_protobuf_message(tag, value, index)
            continue
        if isinstance(first, str):
            text = f'string field{tag} = {tag};'
        elif isinstance(first, int):
            # Size the integer type for the largest sample, not just the first.
            widest = max(item for item in samples if isinstance(item, int))
            if widest >= pow(2, 63):
                text = f'uint64 field{tag} = {tag};'
            elif widest >= pow(2, 31):
                text = f'int64 field{tag} = {tag};'
            else:
                text = f'int32 field{tag} = {tag};'
        else:
            raise TypeError("解析类型错误")
        repeated = 'repeated ' if is_repeated else ''
        print(space + repeated + text)


def parse_protobuf_message(tag, value, index):
    """Print a nested message definition followed by the field that uses it.

    Args:
        tag: Field tag; used to name both the message and the field.
        value: The nested dict, or a list of dicts for a repeated field. For
            a list, only the first element is used to infer the layout.
        index: Nesting depth; each level indents the output by four spaces.
    """
    space = '    ' * index
    if isinstance(value, list):
        repeated = 'repeated '
        body = value[0]
    else:
        repeated = ''
        body = value
    print(space + 'message message' + str(tag) + '{')
    parse_protobuf_dict(body, index + 1)
    print(space + '}')
    # The label belongs on the field line only, not on the message definition.
    print(space + repeated + f"message{tag} field{tag} = {tag};")


if __name__ == '__main__':
    # Emit a proto3 header, then wrap every inferred field in one top-level
    # message named Test.
    print('syntax = "proto3";\n', 'message Test {', sep='\n')
    parse_protobuf_dict(protoc_text_to_dict())
    print('}')