很久之前研究的,做个记录吧
1.文件拖到010editor
可以看到这个dex文件的各个区域
依次为:
dex_header, dex头部文件,主要是各个区域的位移,dex的校验等等
dex_string_ids, dex的字符串区域
dex_type_ids, dex的类型区域
dex_proto_ids, dex 的方法原型(签名)区域,描述每个方法的返回类型和参数类型列表
dex_field_ids, dex类里的field区域的内容
dex_method_ids, dex类里的函数
dex_class_defs, dex类的一些解析
可以看到010editor里有位移,大小
那么我们怎么去写一个dex解析器呢
2.先解析dex头, 同时校验文件是否是dex文件
def is_dex(f):
    """Validate the DEX magic and return the version string (e.g. '035').

    Raises:
        ValueError: if the first four bytes are not the DEX magic.
    """
    # Compare raw bytes before decoding: decoding arbitrary binary input
    # first could raise UnicodeDecodeError instead of the intended ValueError.
    magic = f.read(4)
    if magic != b'dex\n':
        raise ValueError("not a valid dex file")
    print(magic.decode())
    # The next 4 bytes are three version digits plus a trailing NUL.
    dex_version = f.read(4)[:3].decode()
    return dex_version
def parse_dex_header(f):
    """Parse the dex header and return a dict of the size/offset fields."""
    version = is_dex(f)
    print("dex version", version)
    # Adler-32 checksum covering the rest of the file.
    print("check sum", hex(struct.unpack("<I", f.read(4))[0]))
    # SHA-1 digest of everything after this field.
    print("signature", f.read(20).hex())
    print("file size", struct.unpack("<I", f.read(4))[0])
    print("header size", struct.unpack("<I", f.read(4))[0])
    # The constant 0x12345678 marks a little-endian file.
    print("end tag", hex(struct.unpack("<I", f.read(4))[0]))
    field_names = (
        'link_size', 'link_off', 'map_off', 'string_ids_size', 'string_ids_off',
        'type_ids_size', 'type_ids_off', 'proto_ids_size', 'proto_ids_off',
        'field_ids_size', 'field_ids_off', 'method_ids_size', 'method_ids_off',
        'class_defs_size', 'class_defs_off', 'data_size', 'data_off',
    )
    # Each remaining header field is a little-endian uint, read in order.
    size_off_map = {name: struct.unpack('<I', f.read(4))[0] for name in field_names}
    print(size_off_map)
    return size_off_map
3.整个流程
#!/usr/bin/env python
# @Time : 2019-12-26 20:03
import struct
def read_uleb128(f):
    """Read one unsigned LEB128-encoded integer from the stream."""
    result, shift = 0, 0
    while True:
        byte = f.read(1)[0]
        # Low 7 bits carry the payload, high bit flags a continuation.
        result |= (byte & 0x7f) << shift
        if not (byte & 0x80):
            return result
        shift += 7
def is_dex(f):
    """Validate the DEX magic and return the version string (e.g. '035').

    Raises:
        ValueError: if the first four bytes are not the DEX magic.
    """
    # Compare raw bytes before decoding: decoding arbitrary binary input
    # first could raise UnicodeDecodeError instead of the intended ValueError.
    magic = f.read(4)
    if magic != b'dex\n':
        raise ValueError("not a valid dex file")
    print(magic.decode())
    # The next 4 bytes are three version digits plus a trailing NUL.
    dex_version = f.read(4)[:3].decode()
    return dex_version
def parse_dex_header(f):
    """Parse the dex header and return a dict of the size/offset fields."""
    version = is_dex(f)
    print("dex version", version)
    # Adler-32 checksum covering the rest of the file.
    print("check sum", hex(struct.unpack("<I", f.read(4))[0]))
    # SHA-1 digest of everything after this field.
    print("signature", f.read(20).hex())
    print("file size", struct.unpack("<I", f.read(4))[0])
    print("header size", struct.unpack("<I", f.read(4))[0])
    # The constant 0x12345678 marks a little-endian file.
    print("end tag", hex(struct.unpack("<I", f.read(4))[0]))
    field_names = (
        'link_size', 'link_off', 'map_off', 'string_ids_size', 'string_ids_off',
        'type_ids_size', 'type_ids_off', 'proto_ids_size', 'proto_ids_off',
        'field_ids_size', 'field_ids_off', 'method_ids_size', 'method_ids_off',
        'class_defs_size', 'class_defs_off', 'data_size', 'data_off',
    )
    # Each remaining header field is a little-endian uint, read in order.
    size_off_map = {name: struct.unpack('<I', f.read(4))[0] for name in field_names}
    print(size_off_map)
    return size_off_map
def read_dex_str(f, off, size):
    """Read `size` entries from the string_ids table at file offset `off`.

    Returns a list of dicts holding the decoded text, its data offset and
    its declared utf16 length.
    """
    text_info = []
    for idx in range(size):
        # Each string_id_item is one uint pointing at the string data.
        f.seek(off + idx * 4)
        str_offset = struct.unpack("<I", f.read(4))[0]
        # Jump to the string data itself.
        f.seek(str_offset)
        # ULEB128-encoded length, counted in UTF-16 code units.
        utf16_size = read_uleb128(f)
        # NOTE(review): dex string data is MUTF-8; reading utf16_size raw
        # bytes and utf-8 decoding is only safe for ASCII content — confirm
        # against real inputs.
        string_data = f.read(utf16_size).decode()
        text_info.append({
            "text": string_data,
            "offset": str_offset,
            "text_size": utf16_size,
        })
    print(text_info)
    return text_info
def parse_dex_types_ids(f, offset, size):
    """Read the type_ids table: `size` uints, each an index into string_ids.

    Returns the list of indices.
    """
    type_ids = []
    # BUG FIX: the original had the loop duplicated and nested, producing
    # size * size repeated entries instead of size entries.
    for i in range(size):
        f.seek(offset + i * 4)
        type_ids.append(struct.unpack("<I", f.read(4))[0])
    print(type_ids)
    return type_ids
def parse_proto_id(f, offset, size):
    """Read the proto_ids table (method prototypes).

    Each proto_id_item is three uints: shorty string index, return type
    index and the offset of an optional parameter type_list.

    Returns the parsed list (BUG FIX: the original never returned it).
    """
    proto_ids = []
    for i in range(size):
        f.seek(offset + i * 4 * 3)
        idx, type_idx, params_off = struct.unpack("<III", f.read(12))
        idx_item = {'idx': idx, 'type_idx': type_idx, 'params_off': params_off}
        # params_off == 0 means the method takes no parameters.
        if params_off != 0:
            f.seek(params_off)
            params_size = struct.unpack("<I", f.read(4))[0]
            # The type_list payload is a run of ushort type indices.
            # (Renamed the loop variable: it shadowed the outer `i`.)
            for _ in range(params_size):
                info = struct.unpack("<H", f.read(2))[0]
                idx_item.setdefault("params", []).append(info)
        proto_ids.append(idx_item)
    print(proto_ids)
    return proto_ids
def parse_dex_field_ids(f, offset, size):
    """Read the field_ids table.

    Each field_id_item is 8 bytes: class_idx (ushort), type_idx (ushort)
    and name_idx (uint).  Returns the parsed list.
    """
    field_indxs = []
    for i in range(size):
        # BUG FIX: the seek previously used `size` instead of `i`, so every
        # iteration re-read the same (out-of-range) entry.
        f.seek(offset + 8 * i)
        class_idx = struct.unpack("<H", f.read(2))[0]
        type_indx = struct.unpack("<H", f.read(2))[0]
        # BUG FIX: name_idx is a full uint in the dex format (this also
        # matches the 8-byte stride and sibling parse_method_idx).
        name_indx = struct.unpack("<I", f.read(4))[0]
        field_indxs.append({
            'class_idx': class_idx,
            "type_indx": type_indx,
            "name_indx": name_indx,
        })
    print(field_indxs)
    return field_indxs
def parse_method_idx(f, offset, size):
    """Read the method_ids table.

    Each method_id_item is 8 bytes: class_idx (ushort), proto_idx (ushort)
    and name_idx (uint).

    Returns the parsed list (BUG FIX: the original never returned it).
    """
    method_idxs = []
    for i in range(size):
        f.seek(offset + 4 * 2 * i)
        class_idx = struct.unpack("<H", f.read(2))[0]
        proto_idx = struct.unpack("<H", f.read(2))[0]
        name_idx = struct.unpack("<I", f.read(4))[0]
        method_idxs.append({
            "class_idx": class_idx,
            "proto_idx": proto_idx,
            "name_idx": name_idx,
        })
    print(method_idxs)
    return method_idxs
def read_annotations(f, off):
    """Placeholder for annotations_directory parsing — not implemented yet.

    Seeks to `off` and returns an empty dict.
    """
    f.seek(off)
    return {}
def read_class_data(f, off):
    """Parse a class_data_item at `off`.

    The item is four ULEB128 counts followed by the static fields,
    instance fields, direct methods and virtual methods.  Methods with a
    non-zero code_off also get their code_item parsed; the stream position
    is restored afterwards so the next encoded entry reads correctly.
    """
    f.seek(off)
    static_fields_size = read_uleb128(f)
    instance_fields_size = read_uleb128(f)
    direct_methods_size = read_uleb128(f)
    virtual_methods_size = read_uleb128(f)

    def _read_fields(count):
        # encoded_field: uleb128 field_idx_diff, uleb128 access_flags.
        return [
            {'field_ifx_diff': read_uleb128(f), 'access_flags': read_uleb128(f)}
            for _ in range(count)
        ]

    def _read_methods(count):
        # encoded_method: uleb128 method_idx_diff, access_flags, code_off.
        methods = []
        for _ in range(count):
            entry = {
                'method_idx_diff': read_uleb128(f),
                'access_flags': read_uleb128(f),
                'code_off': read_uleb128(f),
            }
            resume_at = f.tell()
            # code_off == 0 marks a method with no code item.
            if entry['code_off'] != 0:
                entry['code'] = read_code(f, entry['code_off'])
                f.seek(resume_at)
            methods.append(entry)
        return methods

    static_fields = _read_fields(static_fields_size)
    instance_fields = _read_fields(instance_fields_size)
    direct_methods = _read_methods(direct_methods_size)
    virtual_methods = _read_methods(virtual_methods_size)
    return {
        'static_fields_size': static_fields_size,
        'instance_fields_size': instance_fields_size,
        'direct_methods_size': direct_methods_size,
        'virtual_methods_size': virtual_methods_size,
        'static_fields': static_fields,
        'instance_fields': instance_fields,
        'direct_methods': direct_methods,
        'virtual_methods': virtual_methods,
    }
def read_static_values(f, off):
    """Placeholder for encoded_array static-values parsing — not implemented.

    Seeks to `off` and returns an empty dict.
    """
    f.seek(off)
    return {}
def read_code(f, off):
    """Parse a code_item at `off`: register counts, instructions and a
    small slice of the debug info.

    Returns a dict; leaves the stream position wherever parsing ended
    (the caller is expected to restore its own position).
    """
    f.seek(off)
    item = {
        'registers_size': struct.unpack('<H', f.read(2))[0],
        'ins_size': struct.unpack('<H', f.read(2))[0],
        'outs_size': struct.unpack('<H', f.read(2))[0],
        'tries_size': struct.unpack('<H', f.read(2))[0],
        'debug_info_off': struct.unpack('<I', f.read(4))[0],
        'debug_info': dict(),
        'insns_size': struct.unpack('<I', f.read(4))[0],
        'insns': list()
    }
    # Instructions are 16-bit code units.
    for i in range(item['insns_size']):
        item['insns'].append(struct.unpack('<H', f.read(2))[0])
    # BUG FIX: debug_info_off == 0 means "no debug info"; the original
    # unconditionally seeked to offset 0 (the file header) and decoded
    # garbage as debug state-machine data.
    if item['debug_info_off'] != 0:
        f.seek(item['debug_info_off'])
        item['debug_info'] = {
            'line_start': read_uleb128(f),
            'parameters_size': read_uleb128(f),
            'opcode': list(),
        }
        # Peek at only the first few debug state-machine opcodes.
        for i in range(3):
            item['debug_info']['opcode'].append(f.read(1)[0])
    return item
def parse_class_def(f):
    """Parse one 32-byte class_def_item at the current stream position.

    Follows the annotations, class_data and static_values offsets when
    they are non-zero (0 means "absent" in the dex format).
    """
    names = (
        'class_idx', 'access_flags', 'superclass_idx', 'interfaces_off',
        'source_file_idx', 'annotations_off', 'class_data_off',
        'static_values_off',
    )
    # Eight consecutive little-endian uints.
    item = dict(zip(names, struct.unpack('<8I', f.read(32))))
    if item['annotations_off'] != 0:
        item['annotations'] = read_annotations(f, item['annotations_off'])
    if item['class_data_off'] != 0:
        item['class_data'] = read_class_data(f, item['class_data_off'])
    if item['static_values_off'] != 0:
        item['static_values'] = read_static_values(f, item['static_values_off'])
    return item
def parse_map(f, offset):
    """Parse the map_list at `offset` and return its items.

    Layout: a leading uint count, then `count` 12-byte map_item entries
    (ushort type, ushort unused, uint size, uint offset).
    """
    map_data = []
    f.seek(offset)
    size = struct.unpack("<I", f.read(4))[0]
    print(size)
    for i in range(size):
        # BUG FIX: entries start *after* the leading size uint (offset + 4),
        # and the original seeked relative to the global `map_offset`
        # instead of the `offset` parameter (NameError when imported).
        f.seek(offset + 4 + 4 * 3 * i)
        map_data.append({
            'type': struct.unpack("<H", f.read(2))[0],
            'unused': struct.unpack("<H", f.read(2))[0],
            'size': struct.unpack("<I", f.read(4))[0],
            'offset': struct.unpack("<I", f.read(4))[0],
        })
    print(map_data)
    return map_data
if __name__ == '__main__':
    # Drive the full parse over a local classes.dex.
    with open("./classes.dex", 'rb') as f:
        parse_dex_headers_map = parse_dex_header(f)
        # BUG FIX: the string count previously came from 'type_ids_size'.
        text_off = parse_dex_headers_map['string_ids_off']
        text_size = parse_dex_headers_map['string_ids_size']
        read_dex_str(f, text_off, text_size)
        type_ids_off, type_ids_size = parse_dex_headers_map['type_ids_off'], parse_dex_headers_map['type_ids_size']
        parse_dex_types_ids(f, type_ids_off, type_ids_size)
        proto_ids_off, proto_ids_size = parse_dex_headers_map['proto_ids_off'], parse_dex_headers_map['proto_ids_size']
        parse_proto_id(f, proto_ids_off, proto_ids_size)
        field_ids_off, field_ids_size = parse_dex_headers_map['field_ids_off'], parse_dex_headers_map['field_ids_size']
        parse_dex_field_ids(f, field_ids_off, field_ids_size)
        method_ids_off, method_ids_size = parse_dex_headers_map['method_ids_off'], parse_dex_headers_map['method_ids_size']
        parse_method_idx(f, method_ids_off, method_ids_size)
        class_defs_off, class_defs_size = parse_dex_headers_map['class_defs_off'], parse_dex_headers_map['class_defs_size']
        # BUG FIX: parse_class_def was called once from an arbitrary stream
        # position; seek to each 32-byte class_def_item explicitly.
        for i in range(class_defs_size):
            f.seek(class_defs_off + 32 * i)
            parse_class_def(f)
        map_offset = parse_dex_headers_map['map_off']
        parse_map(f, map_offset)