#!/usr/bin/python
''' Extracts some basic features from PE files. Many of the features
implemented have been used in previously published works. For more information,
check out the following resources:
* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
It may be useful to do feature selection to reduce this set of features to a meaningful set
for your modeling problem.
'''
import re
import lief
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher
class FeatureType(object):
    ''' Base class from which each feature type may inherit.

    Subclasses set ``name`` and ``dim`` and override ``raw_features`` and
    ``process_raw_features``.
    '''

    name = ''  # short identifier, used by __repr__
    dim = 0  # declared length of the processed feature vector

    def __repr__(self):
        return '{}({})'.format(self.name, self.dim)

    def raw_features(self, bytez, lief_binary):
        ''' Generate a JSON-able representation of the file '''
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a TypeError, so raise the real exception class instead.
        raise NotImplementedError

    def process_raw_features(self, raw_obj):
        ''' Generate a feature vector from the raw features '''
        raise NotImplementedError

    def feature_vector(self, bytez, lief_binary):
        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
        if there are significant speedups to be gained from combining the two functions. '''
        return self.process_raw_features(self.raw_features(bytez, lief_binary))
class ByteHistogram(FeatureType):
    ''' Byte histogram over the entire binary file.

    The raw features are the 256 per-byte-value counts; the processed vector
    is those counts divided by their total.
    '''

    name = 'histogram'
    dim = 256

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(ByteHistogram, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a list of 256 counts, one per byte value. lief_binary is unused. '''
        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return counts.tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the counts to sum to 1 and return them as float32. '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return zeros instead of a 0/0 NaN vector
            return counts
        return counts / total
class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
    This roughly approximates the joint probability of byte value and local entropy.
    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.

    The raw features are a flattened 16x16 table of counts indexed by
    (entropy bin, coarse byte bin); the processed vector is that table
    divided by its total.
    '''

    name = 'byteentropy'
    dim = 256

    def __init__(self, step=1024, window=2048):
        # name this class (not the base) so the base initializer is not skipped
        super(ByteEntropyHistogram, self).__init__()
        self.window = window  # number of bytes per sliding window
        self.step = step  # distance in bytes between successive windows

    def _entropy_bin_counts(self, block):
        ''' Return (entropy bin index in 0..15, 16-bin coarse byte histogram) for one block. '''
        # coarse histogram, 16 bytes per bin
        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
        # The divisor is always the configured window length, including on the
        # short-file path where the block holds fewer than `window` bytes.
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]
        # * x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)
        H = np.sum(-p[wh] * np.log2(p[wh])) * 2

        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
        if Hbin == 16:  # handle entropy = 8.0 bits
            Hbin = 15

        return Hbin, c

    def raw_features(self, bytez, lief_binary):
        ''' Return the flattened 16x16 count table as a list. lief_binary is unused. '''
        # builtin int: the np.int alias was removed from NumPy
        output = np.zeros((16, 16), dtype=int)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            # file shorter than one window: treat the whole file as a single block
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the count table to sum to 1 and return it as float32. '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return zeros instead of a 0/0 NaN vector
            return counts
        return counts / total
class SectionInfo(FeatureType):
    ''' Information about section names, sizes and entropy. Uses hashing trick
    to summarize all this section info into a feature vector.
    '''

    name = 'section'
    # 5 general counts + five 50-wide hashed groups
    dim = 5 + 50 + 50 + 50 + 50 + 50

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(SectionInfo, self).__init__()

    @staticmethod
    def _properties(s):
        ''' Return the section's characteristic flags as bare names (text after the last '.'). '''
        return [str(c).split('.')[-1] for c in s.characteristics_lists]

    def raw_features(self, bytez, lief_binary):
        ''' Return {"entry": <section name>, "sections": [<per-section dict>, ...]}.

        When lief_binary is None, the entry name is empty and the section list is empty.
        '''
        if lief_binary is None:
            return {"entry": "", "sections": []}

        # properties of entry point, or if invalid, the first executable section
        try:
            entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry_section = ""
            for s in lief_binary.sections:
                if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                    entry_section = s.name
                    break

        raw_obj = {"entry": entry_section}
        raw_obj["sections"] = [{
            'name': s.name,
            'size': s.size,
            'entropy': s.entropy,
            'vsize': s.virtual_size,
            'props': self._properties(s)
        } for s in lief_binary.sections]
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Turn the raw section description into a float32 vector of length ``dim``. '''
        sections = raw_obj['sections']
        entry = raw_obj['entry']
        general = [
            len(sections),  # total number of sections
            # number of sections with zero size (the test below is `== 0`)
            sum(1 for s in sections if s['size'] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s['name'] == ""),
            # number of RX
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of W
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
        ]
        # gross characteristics of each section
        section_sizes = [(s['name'], s['size']) for s in sections]
        section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
        section_entropy = [(s['name'], s['entropy']) for s in sections]
        section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
        section_vsize = [(s['name'], s['vsize']) for s in sections]
        section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
        # Each sample must be an iterable of strings. Passing the bare name made
        # the hasher treat every character as a token (and recent scikit-learn
        # rejects a plain string sample), so wrap the name in a list.
        # NOTE: this changes the hashed entry-name values relative to vectors
        # produced by the previous per-character behaviour.
        entry_name_hashed = FeatureHasher(50, input_type="string").transform([[entry]]).toarray()[0]
        # flags of every section whose name matches the entry section name
        characteristics = [p for s in sections for p in s['props'] if s['name'] == entry]
        characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]

        return np.hstack([
            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
            characteristics_hashed
        ]).astype(np.float32)
class ImportsInfo(FeatureType):
    ''' Information about imported libraries and functions from the
    import address table.  Note that the total number of imported
    functions is contained in GeneralFileInfo.
    '''

    name = 'imports'
    dim = 1280  # 256 hashed library names + 1024 hashed "library:function" names

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(ImportsInfo, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict mapping each imported library name to a list of imported entry names.

        Returns an empty dict when lief_binary is None.
        '''
        imports = {}
        if lief_binary is None:
            return imports

        for lib in lief_binary.imports:
            # libraries can be duplicated in listing, extend instead of overwrite
            names = imports.setdefault(lib.name, [])

            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
            #  beyond the first 10000 characters, and this will help limit the dataset size
            names.extend(entry.name[:10000] for entry in lib.entries)

        return imports

    def process_raw_features(self, raw_obj):
        ''' Hash library names and qualified function names into a float32 vector of length ``dim``. '''
        # unique libraries (case-insensitive)
        libraries = list({lib.lower() for lib in raw_obj})
        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]

        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]

        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
class ExportsInfo(FeatureType):
    ''' Information about exported functions. Note that the total number of exported
    functions is contained in GeneralFileInfo.
    '''

    name = 'exports'
    dim = 128

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(ExportsInfo, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return the list of exported function names, each clipped to 10000 characters.

        Returns an empty list when lief_binary is None.
        '''
        if lief_binary is None:
            return []

        # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
        #  the first 10000 characters, and this will help limit the dataset size
        # NOTE(review): slicing assumes each exported_functions item is a string — confirm for the lief version in use
        return [export[:10000] for export in lief_binary.exported_functions]

    def process_raw_features(self, raw_obj):
        ''' Hash the export names into a float32 vector of length ``dim``. '''
        exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
        return exports_hashed.astype(np.float32)
class GeneralFileInfo(FeatureType):
    ''' General information about the file '''

    name = 'general'
    dim = 10

    # Keys of the raw feature dict, in the order they appear in the feature vector.
    _FIELDS = (
        'size', 'vsize', 'has_debug', 'exports', 'imports', 'has_relocations', 'has_resources', 'has_signature',
        'has_tls', 'symbols'
    )

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(GeneralFileInfo, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict of file-level sizes, counts and 0/1 flags.

        When lief_binary is None, every value is 0 except 'size', which is len(bytez).
        '''
        if lief_binary is None:
            raw_obj = dict.fromkeys(self._FIELDS, 0)
            raw_obj['size'] = len(bytez)
            return raw_obj

        return {
            'size': len(bytez),
            'vsize': lief_binary.virtual_size,
            'has_debug': int(lief_binary.has_debug),
            'exports': len(lief_binary.exported_functions),
            'imports': len(lief_binary.imported_functions),
            'has_relocations': int(lief_binary.has_relocations),
            'has_resources': int(lief_binary.has_resources),
            'has_signature': int(lief_binary.has_signature),
            'has_tls': int(lief_binary.has_tls),
            'symbols': len(lief_binary.symbols),
        }

    def process_raw_features(self, raw_obj):
        ''' Return the raw values as a float32 vector in ``_FIELDS`` order. '''
        return np.asarray([raw_obj[key] for key in self._FIELDS], dtype=np.float32)
class HeaderFileInfo(FeatureType):
    ''' Machine, architecture, OS, linker and other information extracted from header '''

    name = 'header'
    dim = 62  # 1 timestamp + five 10-wide hashed groups + 11 numeric optional-header fields

    # Numeric optional-header fields. Each name is both the raw-feature key and
    # the attribute read from lief's optional header; the order here is the
    # order in the feature vector.
    _NUMERIC_OPTIONAL_FIELDS = (
        'major_image_version',
        'minor_image_version',
        'major_linker_version',
        'minor_linker_version',
        'major_operating_system_version',
        'minor_operating_system_version',
        'major_subsystem_version',
        'minor_subsystem_version',
        'sizeof_code',
        'sizeof_headers',
        'sizeof_heap_commit',
    )

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(HeaderFileInfo, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return {'coff': {...}, 'optional': {...}} describing the headers.

        When lief_binary is None, the defaults are returned: empty strings,
        empty lists and zeros.
        '''
        raw_obj = {}
        raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
        raw_obj['optional'] = {'subsystem': "", 'dll_characteristics': [], 'magic': ""}
        for field in self._NUMERIC_OPTIONAL_FIELDS:
            raw_obj['optional'][field] = 0
        if lief_binary is None:
            return raw_obj

        header = lief_binary.header
        optional_header = lief_binary.optional_header

        # enum values are stored as their bare name (text after the last '.')
        raw_obj['coff']['timestamp'] = header.time_date_stamps
        raw_obj['coff']['machine'] = str(header.machine).split('.')[-1]
        raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in header.characteristics_list]
        raw_obj['optional']['subsystem'] = str(optional_header.subsystem).split('.')[-1]
        raw_obj['optional']['dll_characteristics'] = [
            str(c).split('.')[-1] for c in optional_header.dll_characteristics_lists
        ]
        raw_obj['optional']['magic'] = str(optional_header.magic).split('.')[-1]
        for field in self._NUMERIC_OPTIONAL_FIELDS:
            raw_obj['optional'][field] = getattr(optional_header, field)
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Turn the raw header description into a float32 vector of length ``dim``. '''
        coff = raw_obj['coff']
        optional = raw_obj['optional']
        parts = [
            coff['timestamp'],
            # single-valued strings are wrapped in a list so they hash as one token
            FeatureHasher(10, input_type="string").transform([[coff['machine']]]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([coff['characteristics']]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([[optional['subsystem']]]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([optional['dll_characteristics']]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([[optional['magic']]]).toarray()[0],
        ]
        parts.extend(optional[field] for field in self._NUMERIC_OPTIONAL_FIELDS)
        return np.hstack(parts).astype(np.float32)
class StringExtractor(FeatureType):
    ''' Extracts strings from raw byte stream '''

    name = 'strings'
    # numstrings, avlength, printables, 96-bin character distribution, entropy, paths, urls, registry, MZ
    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1

    def __init__(self):
        # name this class (not the base) so the base initializer is not skipped
        super(StringExtractor, self).__init__()
        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # occurrences of the string 'C:\'.  Not actually extracting the path
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # occurrences of http:// or https://.  Not actually extracting the URLs
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # occurrences of the string prefix HKEY_.  Not actually extracting registry names
        self._registry = re.compile(b'HKEY_')
        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
        self._mz = re.compile(b'MZ')

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict of string statistics and pattern counts. lief_binary is unused. '''
        allstrings = self._allstrings.findall(bytez)
        if allstrings:
            # statistics about strings:
            string_lengths = [len(s) for s in allstrings]
            avlength = sum(string_lengths) / len(string_lengths)
            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive;
            # every matched byte is >= 0x20, so the uint8 subtraction cannot wrap
            as_shifted_string = np.frombuffer(b''.join(allstrings), dtype=np.uint8) - ord(b'\x20')
            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
            # distribution of characters in printable strings
            csum = c.sum()
            p = c.astype(np.float32) / csum
            wh = np.where(c)[0]
            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
        else:
            avlength = 0
            c = np.zeros((96,), dtype=np.float32)
            H = 0
            csum = 0

        return {
            'numstrings': len(allstrings),
            'avlength': avlength,
            'printabledist': c.tolist(),  # store non-normalized histogram
            'printables': int(csum),
            'entropy': float(H),
            'paths': len(self._paths.findall(bytez)),
            'urls': len(self._urls.findall(bytez)),
            'registry': len(self._registry.findall(bytez)),
            'MZ': len(self._mz.findall(bytez))
        }

    def process_raw_features(self, raw_obj):
        ''' Turn the raw string statistics into a float32 vector of length ``dim``. '''
        # avoid dividing by zero when no printable strings were found
        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
        return np.hstack([
            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
            raw_obj['registry'], raw_obj['MZ']
        ]).astype(np.float32)
class PEFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''

    # One shared instance of each feature type; the order here is the order of
    # the concatenated feature vector.
    features = [
        ByteHistogram(), ByteEntropyHistogram(), StringExtractor(), GeneralFileInfo(), HeaderFileInfo(), SectionInfo(),
        ImportsInfo(), ExportsInfo()
    ]
    dim = sum([fe.dim for fe in features])

    def raw_features(self, bytez):
        ''' Return a dict with the file's sha256 plus each feature type's raw features, keyed by feature name.

        If lief fails with one of the parse errors listed below, the error is
        printed and every feature type receives None as the parsed binary.
        Any other exception propagates to the caller.
        '''
        try:
            lief_binary = lief.PE.parse(list(bytez))
        # NOTE(review): these lief exception classes may not exist in every lief release — confirm for the version in use
        except (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, RuntimeError) as e:
            print("lief error: ", str(e))
            lief_binary = None

        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
        return features

    def process_raw_features(self, raw_obj):
        ''' Concatenate every feature type's processed vector into one float32 vector. '''
        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
        return np.hstack(feature_vectors).astype(np.float32)

    def feature_vector(self, bytez):
        ''' Compute the full feature vector directly from the file bytes. '''
        return self.process_raw_features(self.raw_features(bytez))