python protobuf_关于头条人群包protobuf格式的php(python)解决方案

虽然 PHP 能序列化和反序列化,但是奈何头条不认,最后只好改用 Python 脚本来做序列化。但问题很快就暴露出来:速度太慢!几万个设备号要序列化 2 小时以上。当然,主要原因在于当时赶时间,是一个个设备号分别调用脚本序列化的,大量的时间花在了 Python 的上下文切换(反复启动进程)上。上文里的脚本能用,但是不适合量稍微大一点的场景,故而用三脚猫的功夫写了一个新的 Python 脚本:接受一个文件,吐出序列化后的新文件,速度大大提升,实测大概每秒能处理 1000 个设备号。

from __future__ import print_function

import DmpDataProtoV2_pb2

import os,sys

import time

import base64

# Suffix appended to the input file's base name to form the output file name.
OUTPUT_SUFFIX = '-ProtoBuf.txt'


def parse_line(line):
    """Split one ``TYPE|device_id`` input line into its two fields.

    Args:
        line: A raw line read from the input file (may end with a newline).

    Returns:
        A ``(type_name, device_id)`` tuple with the device id lower-cased,
        or ``None`` when the line is blank or does not consist of exactly
        two ``|``-separated fields.
    """
    # Also drop '\r' so files with Windows line endings do not leak a
    # carriage return into the device id.
    line = line.rstrip('\r\n')
    if not line.strip():
        return None
    parts = line.split('|')
    if len(parts) != 2:
        return None
    return parts[0], parts[1].lower()


def target_path_for(source_path):
    """Return the output path: the input path minus extension plus the suffix."""
    return os.path.splitext(source_path)[0] + OUTPUT_SUFFIX


def encode_record(type_name, type_number, device_id):
    """Build a one-item ``DmpData`` message and return it base64-encoded.

    Args:
        type_name: Enum value name from the input line; also stored as the
            item's single tag.
        type_number: Numeric ``IdItem.DataType`` value for ``type_name``.
        device_id: Device identifier stored in the item's ``id`` field.

    Returns:
        The serialized message, base64-encoded, as bytes.
    """
    dmp_data = DmpDataProtoV2_pb2.DmpData()
    item = dmp_data.idList.add()
    item.dataType = type_number
    item.id = device_id
    item.tags.append(type_name)
    # The optional ``timestamp`` field is intentionally left unset.
    return base64.b64encode(dmp_data.SerializeToString())


def main(argv):
    """Convert a ``TYPE|device_id`` file into base64 protobuf lines.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the input file path.

    Returns:
        Process exit status: 0 on success, 1 when the argument is missing,
        blank, or names a file that does not exist.
    """
    if len(argv) <= 1:
        print('ag is null')
        return 1

    source_path = argv[1]
    if not source_path.strip():
        print('files is null')
        return 1
    if not os.path.exists(source_path):
        print('files is not exists')
        return 1

    target_path = target_path_for(source_path)
    # The output path is the only thing written to stdout on success; the
    # PHP caller captures stdout to learn where the result file is.
    print(target_path)

    # Name -> EnumValueDescriptor map, looked up once outside the loop.
    item_descriptor = DmpDataProtoV2_pb2.IdItem.DESCRIPTOR
    known_types = item_descriptor.enum_types_by_name['DataType'].values_by_name

    # Binary mode for the output: b64encode returns bytes on Python 3, and
    # bytes + b'\n' works unchanged on Python 2 as well.
    with open(source_path) as source, open(target_path, 'wb') as target:
        # Iterating the file (instead of a manual readline loop) guarantees
        # that skipped lines advance the cursor and that an empty line in
        # the middle of the file does not end processing early.
        for line_no, line in enumerate(source, 1):
            parsed = parse_line(line)
            if parsed is None:
                continue
            type_name, device_id = parsed
            if type_name not in known_types:
                # Diagnostics go to stderr so stdout stays clean.
                sys.stderr.write(
                    'skip line %d: unknown data type %r\n'
                    % (line_no, type_name))
                continue
            type_number = known_types[type_name].number
            target.write(
                encode_record(type_name, type_number, device_id) + b'\n')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

PHP调用部分//从py重写

// Run the converter script and capture its stdout (the generated file's path,
// still carrying the trailing newline the script prints). Both the script path
// and the input path are shell-escaped so that spaces or shell metacharacters
// in them cannot break the command or inject extra commands.
$protobuf_path = shell_exec("python ".escapeshellarg(base_path()."/scripts/python/base64DmpItemByFile.py")." ".escapeshellarg($file_path));

Done!

DmpDataProtoV2_pb2.py:

# Generated by the protocol buffer compiler. DO NOT EDIT!

# source: DmpDataProtoV2.proto

import sys

# Byte-literal helper: on Python 2 it returns its argument unchanged; on
# Python 3 it encodes the str argument to bytes using latin1.
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))

from google.protobuf import descriptor as _descriptor

from google.protobuf import message as _message

from google.protobuf import reflection as _reflection

from google.protobuf import symbol_database as _symbol_database

from google.protobuf import descriptor_pb2

# @@protoc_insertion_point(imports)

# Default symbol database; every descriptor and message class created in this
# module is registered with it.
_sym_db = _symbol_database.Default()

# File-level descriptor for DmpDataProtoV2.proto (package toutiao.dmp), built
# from the serialized descriptor bytes passed as serialized_pb.
DESCRIPTOR = _descriptor.FileDescriptor(

name='DmpDataProtoV2.proto',

package='toutiao.dmp',

serialized_pb=_b('\n\x14\x44mpDataProtoV2.proto\x12\x0btoutiao.dmp\".\n\x07\x44mpData\x12#\n\x06idList\x18\x01 \x03(\x0b\x32\x13.toutiao.dmp.IdItem\"\xda\x01\n\x06IdItem\x12\x11\n\ttimestamp\x18\x01 \x01(\r\x12.\n\x08\x64\x61taType\x18\x02 \x02(\x0e\x32\x1c.toutiao.dmp.IdItem.DataType\x12\n\n\x02id\x18\x03 \x02(\t\x12\x0c\n\x04tags\x18\x04 \x03(\t\"s\n\x08\x44\x61taType\x12\x08\n\x04IMEI\x10\x00\x12\x08\n\x04IDFA\x10\x01\x12\x07\n\x03UID\x10\x02\x12\x0c\n\x08IMEI_MD5\x10\x04\x12\x0c\n\x08IDFA_MD5\x10\x05\x12\x16\n\x12MOBILE_HASH_SHA256\x10\x06\x12\x08\n\x04OAID\x10\x07\x12\x0c\n\x08OAID_MD5\x10\x08\x42\x0e\x42\x0c\x44mpDataProto')

)

_sym_db.RegisterFileDescriptor(DESCRIPTOR)

# Enum descriptor for toutiao.dmp.IdItem.DataType.
# Declared values: IMEI=0, IDFA=1, UID=2, IMEI_MD5=4, IDFA_MD5=5,
# MOBILE_HASH_SHA256=6, OAID=7, OAID_MD5=8.  Number 3 is not assigned, so
# `index` (position in this list) and `number` diverge from IMEI_MD5 onward.
# containing_type is None here and is set to _IDITEM further down the module.
_IDITEM_DATATYPE = _descriptor.EnumDescriptor(

name='DataType',

full_name='toutiao.dmp.IdItem.DataType',

filename=None,

file=DESCRIPTOR,

values=[

_descriptor.EnumValueDescriptor(

name='IMEI', index=0, number=0,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='IDFA', index=1, number=1,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='UID', index=2, number=2,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='IMEI_MD5', index=3, number=4,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='IDFA_MD5', index=4, number=5,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='MOBILE_HASH_SHA256', index=5, number=6,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='OAID', index=6, number=7,

options=None,

type=None),

_descriptor.EnumValueDescriptor(

name='OAID_MD5', index=7, number=8,

options=None,

type=None),

],

containing_type=None,

options=None,

serialized_start=189,

serialized_end=304,

)

_sym_db.RegisterEnumDescriptor(_IDITEM_DATATYPE)

# Message descriptor for toutiao.dmp.DmpData.
# It declares a single field, idList (field number 1).  Per the
# google.protobuf FieldDescriptor constants, type=11 is a message field and
# label=3 is "repeated".  message_type is None here and is set to _IDITEM
# further down the module.
_DMPDATA = _descriptor.Descriptor(

name='DmpData',

full_name='toutiao.dmp.DmpData',

filename=None,

file=DESCRIPTOR,

containing_type=None,

fields=[

_descriptor.FieldDescriptor(

name='idList', full_name='toutiao.dmp.DmpData.idList', index=0,

number=1, type=11, cpp_type=10, label=3,

has_default_value=False, default_value=[],

message_type=None, enum_type=None, containing_type=None,

is_extension=False, extension_scope=None,

options=None),

],

extensions=[

],

nested_types=[],

enum_types=[

],

options=None,

is_extendable=False,

extension_ranges=[],

oneofs=[

],

serialized_start=37,

serialized_end=83,

)

# Message descriptor for toutiao.dmp.IdItem.
# Fields (type/label meanings per the google.protobuf FieldDescriptor
# constants):
#   timestamp  number 1, type=13 (uint32), label=1 (optional)
#   dataType   number 2, type=14 (enum),   label=2 (required)
#   id         number 3, type=9  (string), label=2 (required)
#   tags       number 4, type=9  (string), label=3 (repeated)
# The nested DataType enum is attached through enum_types; dataType's
# enum_type is None here and is set to _IDITEM_DATATYPE further down.
_IDITEM = _descriptor.Descriptor(

name='IdItem',

full_name='toutiao.dmp.IdItem',

filename=None,

file=DESCRIPTOR,

containing_type=None,

fields=[

_descriptor.FieldDescriptor(

name='timestamp', full_name='toutiao.dmp.IdItem.timestamp', index=0,

number=1, type=13, cpp_type=3, label=1,

has_default_value=False, default_value=0,

message_type=None, enum_type=None, containing_type=None,

is_extension=False, extension_scope=None,

options=None),

_descriptor.FieldDescriptor(

name='dataType', full_name='toutiao.dmp.IdItem.dataType', index=1,

number=2, type=14, cpp_type=8, label=2,

has_default_value=False, default_value=0,

message_type=None, enum_type=None, containing_type=None,

is_extension=False, extension_scope=None,

options=None),

_descriptor.FieldDescriptor(

name='id', full_name='toutiao.dmp.IdItem.id', index=2,

number=3, type=9, cpp_type=9, label=2,

has_default_value=False, default_value=_b("").decode('utf-8'),

message_type=None, enum_type=None, containing_type=None,

is_extension=False, extension_scope=None,

options=None),

_descriptor.FieldDescriptor(

name='tags', full_name='toutiao.dmp.IdItem.tags', index=3,

number=4, type=9, cpp_type=9, label=3,

has_default_value=False, default_value=[],

message_type=None, enum_type=None, containing_type=None,

is_extension=False, extension_scope=None,

options=None),

],

extensions=[

],

nested_types=[],

enum_types=[

_IDITEM_DATATYPE,

],

options=None,

is_extendable=False,

extension_ranges=[],

oneofs=[

],

serialized_start=86,

serialized_end=304,

)

# Resolve the cross-references that were left as None when the descriptors
# were constructed: idList's element message type, dataType's enum type, and
# the enum's containing message.
_DMPDATA.fields_by_name['idList'].message_type = _IDITEM

_IDITEM.fields_by_name['dataType'].enum_type = _IDITEM_DATATYPE

_IDITEM_DATATYPE.containing_type = _IDITEM

# Expose both message descriptors on the file descriptor by name.
DESCRIPTOR.message_types_by_name['DmpData'] = _DMPDATA

DESCRIPTOR.message_types_by_name['IdItem'] = _IDITEM

# Concrete message class for toutiao.dmp.DmpData, created by the reflection
# metaclass from the _DMPDATA descriptor and registered with the symbol
# database.
DmpData = _reflection.GeneratedProtocolMessageType('DmpData', (_message.Message,), dict(

DESCRIPTOR = _DMPDATA,

__module__ = 'DmpDataProtoV2_pb2'

# @@protoc_insertion_point(class_scope:toutiao.dmp.DmpData)

))

_sym_db.RegisterMessage(DmpData)

# Concrete message class for toutiao.dmp.IdItem, created by the reflection
# metaclass from the _IDITEM descriptor and registered with the symbol
# database.
IdItem = _reflection.GeneratedProtocolMessageType('IdItem', (_message.Message,), dict(

DESCRIPTOR = _IDITEM,

__module__ = 'DmpDataProtoV2_pb2'

# @@protoc_insertion_point(class_scope:toutiao.dmp.IdItem)

))

_sym_db.RegisterMessage(IdItem)

# Attach file-level options to DESCRIPTOR, parsed from their serialized form
# into a FileOptions message.
# NOTE(review): the bytes appear to encode the java_outer_classname option
# with value "DmpDataProto" — confirm against the original .proto file.
DESCRIPTOR.has_options = True

DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('B\014DmpDataProto'))

# @@protoc_insertion_point(module_scope)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值