From c7045b03ad33b50259dc783fc2cc577c1d3eb747 Mon Sep 17 00:00:00 2001 From: zangobot Date: Tue, 26 Sep 2023 12:49:12 +0200 Subject: [PATCH 1/5] Version of EMBER working with LIEF versions higher than 0.9 (LIEF broke the compatibility by refactoring errors) --- ember/features.py | 61 +++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/ember/features.py b/ember/features.py index bbaa138..d38276d 100644 --- a/ember/features.py +++ b/ember/features.py @@ -12,16 +12,17 @@ for your modeling problem. ''' +import hashlib +import json +import os import re + import lief -import hashlib import numpy as np -import os -import json from sklearn.feature_extraction import FeatureHasher LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.') -LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 ) +LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10) LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11) @@ -147,17 +148,17 @@ def raw_features(self, bytez, lief_binary): if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12): section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase) if section is None: - raise lief.not_found + raise lief.lief_errors.not_found entry_section = section.name - else: # lief < 0.12 + else: # lief < 0.12 entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name - except lief.not_found: - # bad entry point, let's find the first executable section - entry_section = "" - for s in lief_binary.sections: - if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: - entry_section = s.name - break + except lief.lief_errors.not_found: + # bad entry point, let's find the first executable section + entry_section = "" + for s in lief_binary.sections: + if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: + entry_section = s.name + break raw_obj = {"entry": entry_section} raw_obj["sections"] = [{ @@ -267,7 +268,6 @@ def raw_features(self, bytez, lief_binary): # export is a string (LIEF 0.9.0 and earlier) clipped_exports = [export[:10000] for export in lief_binary.exported_functions] - return clipped_exports def process_raw_features(self, raw_obj): @@ -318,7 +318,7 @@ def process_raw_features(self, raw_obj): raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'], raw_obj['symbols'] ], - dtype=np.float32) + dtype=np.float32) class HeaderFileInfo(FeatureType): @@ -499,15 +499,15 @@ class PEFeatureExtractor(object): def __init__(self, feature_version=2, print_feature_warning=True, features_file=''): self.features = [] features = { - 'ByteHistogram': ByteHistogram(), - 'ByteEntropyHistogram': ByteEntropyHistogram(), - 'StringExtractor': StringExtractor(), - 'GeneralFileInfo': GeneralFileInfo(), - 'HeaderFileInfo': HeaderFileInfo(), - 'SectionInfo': SectionInfo(), - 'ImportsInfo': ImportsInfo(), - 'ExportsInfo': ExportsInfo() - } + 'ByteHistogram': ByteHistogram(), + 'ByteEntropyHistogram': ByteEntropyHistogram(), + 'StringExtractor': StringExtractor(), + 'GeneralFileInfo': GeneralFileInfo(), + 'HeaderFileInfo': HeaderFileInfo(), + 'SectionInfo': SectionInfo(), + 'ImportsInfo': ImportsInfo(), + 'ExportsInfo': ExportsInfo() + } if os.path.exists(features_file): with open(features_file, encoding='utf8') as f: @@ -520,22 +520,27 @@ def __init__(self, feature_version=2, print_feature_warning=True, features_file= if not lief.__version__.startswith("0.8.3"): if print_feature_warning: print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75") - print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") + print( + f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") print(f"WARNING: in the feature calculations.") elif feature_version == 2: self.features.append(DataDirectories()) if not lief.__version__.startswith("0.9.0"): if print_feature_warning: print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-") - print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") + print( + f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") print(f"WARNING: in the feature calculations.") else: raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}") self.dim = sum([fe.dim for fe in self.features]) def raw_features(self, bytez): - lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, - RuntimeError) + lief_errors = ( + lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error, + lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound, + RuntimeError) + # lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound try: lief_binary = lief.PE.parse(list(bytez)) except lief_errors as e: From 442c3eda48660ca2414d84b2c0d3f5a9c1e9bf0f Mon Sep 17 00:00:00 2001 From: zangobot Date: Tue, 26 Sep 2023 12:52:09 +0200 Subject: [PATCH 2/5] Adding compatibility with v0.9 as well --- ember/features.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ember/features.py b/ember/features.py index d38276d..7b66507 100644 --- a/ember/features.py +++ b/ember/features.py @@ -143,16 +143,17 @@ def raw_features(self, bytez, lief_binary): return {"entry": "", "sections": []} # properties of entry point, or if invalid, the first executable section - + not_found_error_class = lief.lief_errors.not_found if not lief.__version__.startswith("0.9.0") else lief.not_found try: if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12): section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase) + if section is None: - raise lief.lief_errors.not_found + raise not_found_error_class entry_section = section.name else: # lief < 0.12 entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name - except lief.lief_errors.not_found: + except not_found_error_class: # bad entry point, let's find the first executable section entry_section = "" for s in lief_binary.sections: @@ -536,11 +537,15 @@ def __init__(self, feature_version=2, print_feature_warning=True, features_file= self.dim = sum([fe.dim for fe in self.features]) def raw_features(self, bytez): - lief_errors = ( - lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error, - lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound, - RuntimeError) - # lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound + if lief.__version__.startswith("0.9.0"): + lief_errors = ( + lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError) + else: + lief_errors = ( + lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error, + lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound, + RuntimeError) + try: lief_binary = lief.PE.parse(list(bytez)) except lief_errors as e: From 1d2eaec00da74e888b87764a9a5486991c121eb3 Mon Sep 17 00:00:00 2001 From: zangobot Date: Tue, 26 Sep 2023 14:08:16 +0200 Subject: [PATCH 3/5] Removed np.int, which has been deprecated --- ember/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ember/features.py b/ember/features.py index 7b66507..49107f4 100644 --- a/ember/features.py +++ b/ember/features.py @@ -98,7 +98,7 @@ def _entropy_bin_counts(self, block): return Hbin, c def raw_features(self, bytez, lief_binary): - output = np.zeros((16, 16), dtype=np.int) + output = np.zeros((16, 16), dtype=int) a = np.frombuffer(bytez, dtype=np.uint8) if a.shape[0] < self.window: Hbin, c = self._entropy_bin_counts(a) From cc2010cf461780b37611e457f398feeffe181c07 Mon Sep 17 00:00:00 2001 From: zangobot Date: Tue, 26 Sep 2023 14:47:15 +0200 Subject: [PATCH 4/5] Feature hashing of entry (which is single string) was crashing the feature hasher. --- ember/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ember/features.py b/ember/features.py index 49107f4..839fdf7 100644 --- a/ember/features.py +++ b/ember/features.py @@ -191,7 +191,7 @@ def process_raw_features(self, raw_obj): section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0] section_vsize = [(s['name'], s['vsize']) for s in sections] section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0] - entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0] + entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0] characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] From d1d64e8ba28e062643f17c764d27995669d2ebb7 Mon Sep 17 00:00:00 2001 From: zangobot Date: Tue, 16 Jul 2024 11:32:52 +0200 Subject: [PATCH 5/5] Fixed bug introduced in LIEF 0.14 on ERROR HANDLING (done without exceptions) --- ember/features.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ember/features.py b/ember/features.py index 839fdf7..bc1ccb0 100644 --- a/ember/features.py +++ b/ember/features.py @@ -143,7 +143,7 @@ def raw_features(self, bytez, lief_binary): return {"entry": "", "sections": []} # properties of entry point, or if invalid, the first executable section - not_found_error_class = lief.lief_errors.not_found if not lief.__version__.startswith("0.9.0") else lief.not_found + not_found_error_class = RuntimeError if not lief.__version__.startswith("0.9.0") else lief.not_found try: if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12): section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase) @@ -156,8 +156,9 @@ def raw_features(self, bytez, lief_binary): except not_found_error_class: # bad entry point, let's find the first executable section entry_section = "" + mem_execute_characteristics = lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE if lief.__version__.startswith("0.9.0") else lief.PE.Section.CHARACTERISTICS.MEM_EXECUTE for s in lief_binary.sections: - if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: + if mem_execute_characteristics in s.characteristics_lists: entry_section = s.name break