Source code for osaca.parser.parser_x86att

#!/usr/bin/env python3

import string

import pyparsing as pp

from osaca.parser import AttrDict, BaseParser


class ParserX86ATT(BaseParser):
    """Parser for x86 assembly lines written in AT&T syntax.

    Registers are introduced by ``%`` and immediates by ``$``; see
    :meth:`construct_parser` for the complete grammar.
    """

    # Families of legacy general purpose registers that alias the same
    # physical register. Names are stored upper case; look-ups upper-case the
    # register name first, so matching is case-insensitive.
    _BASIC_GPR_GROUPS = (
        frozenset(['RAX', 'EAX', 'AX', 'AH', 'AL']),
        frozenset(['RBX', 'EBX', 'BX', 'BH', 'BL']),
        frozenset(['RCX', 'ECX', 'CX', 'CH', 'CL']),
        frozenset(['RDX', 'EDX', 'DX', 'DH', 'DL']),
        frozenset(['RSP', 'ESP', 'SP', 'SPL']),
        # The base pointer family was missing from the original table, so
        # e.g. ``rbp`` and ``ebp`` were reported as independent.
        frozenset(['RBP', 'EBP', 'BP', 'BPL']),
        frozenset(['RSI', 'ESI', 'SI', 'SIL']),
        frozenset(['RDI', 'EDI', 'DI', 'DIL']),
    )

    def __init__(self):
        super().__init__()
        self.isa = 'x86'

    def construct_parser(self):
        """Create parser for x86 AT&T ISA.

        Builds the pyparsing grammars and stores them on the instance as
        ``self.comment``, ``self.label``, ``self.register``,
        ``self.directive`` and ``self.instruction_parser``.
        """
        decimal_number = pp.Combine(
            pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
        ).setResultsName('value')
        hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
        # Comment - either '#' or '//' (icc)
        self.comment = (pp.Literal('#') | pp.Literal('//')) + pp.Group(
            pp.ZeroOrMore(pp.Word(pp.printables))
        ).setResultsName(self.COMMENT_ID)
        # Define x86 assembly identifier
        relocation = pp.Combine(pp.Literal('@') + pp.Word(pp.alphas))
        id_offset = pp.Word(pp.nums) + pp.Suppress(pp.Literal('+'))
        first = pp.Word(pp.alphas + '_.', exact=1)
        rest = pp.Word(pp.alphanums + '$_.+-')
        identifier = pp.Group(
            pp.Optional(id_offset).setResultsName('offset')
            + pp.Combine(first + pp.Optional(rest)).setResultsName('name')
            + pp.Optional(relocation).setResultsName('relocation')
        ).setResultsName('identifier')
        # Label
        self.label = pp.Group(
            identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
        ).setResultsName(self.LABEL_ID)
        # Register: pp.Regex('^%[0-9a-zA-Z]+{}{z},?')
        self.register = pp.Group(
            pp.Literal('%')
            + pp.Word(pp.alphanums).setResultsName('name')
            + pp.Optional(pp.Literal('(') + pp.Word(pp.nums) + pp.Literal(')'))
            + pp.Optional(
                pp.Literal('{')
                + pp.Literal('%')
                + pp.Word(pp.alphanums).setResultsName('mask')
                + pp.Literal('}')
                + pp.Optional(
                    pp.Suppress(pp.Literal('{'))
                    + pp.Literal('z').setResultsName('zeroing')
                    + pp.Suppress(pp.Literal('}'))
                )
            )
        ).setResultsName(self.REGISTER_ID)
        # Immediate: pp.Regex('^\$(-?[0-9]+)|(0x[0-9a-fA-F]+),?')
        symbol_immediate = '$'
        immediate = pp.Group(
            pp.Literal(symbol_immediate) + (hex_number | decimal_number | identifier)
        ).setResultsName(self.IMMEDIATE_ID)

        # Memory preparations
        offset = pp.Group(identifier | hex_number | decimal_number).setResultsName(
            self.IMMEDIATE_ID
        )
        scale = pp.Word('1248', exact=1)
        # Segment register extension
        segment_extension = (
            hex_number
            ^ pp.Word(pp.nums)
            ^ pp.Group(
                pp.Optional(offset.setResultsName('offset'))
                + pp.Literal('(')
                + pp.Optional(self.register.setResultsName('base'))
                + pp.Optional(pp.Suppress(pp.Literal(',')))
                + pp.Optional(self.register.setResultsName('index'))
                + pp.Optional(pp.Suppress(pp.Literal(',')))
                + pp.Optional(scale.setResultsName('scale'))
                + pp.Literal(')')
            )
        )
        memory_segmentation = (
            self.register.setResultsName('base')
            + pp.Literal(':')
            + segment_extension.setResultsName(self.SEGMENT_EXT_ID)
        )
        # Memory: offset | seg:seg_ext | offset(base, index, scale){mask}
        memory = pp.Group(
            (
                pp.Optional(pp.Suppress(pp.Literal('*')))
                + pp.Optional(offset.setResultsName('offset'))
                + pp.Literal('(')
                + pp.Optional(self.register.setResultsName('base'))
                + pp.Optional(pp.Suppress(pp.Literal(',')))
                + pp.Optional(self.register.setResultsName('index'))
                + pp.Optional(pp.Suppress(pp.Literal(',')))
                + pp.Optional(scale.setResultsName('scale'))
                + pp.Literal(')')
                + pp.Optional(
                    pp.Literal('{')
                    + pp.Literal('%')
                    + pp.Word(pp.alphanums).setResultsName('mask')
                    + pp.Literal('}')
                )
            )
            | memory_segmentation
            | (hex_number | pp.Word(pp.nums)).setResultsName('offset')
        ).setResultsName(self.MEMORY_ID)

        # Directive
        directive_option = pp.Combine(
            pp.Word('#@.', exact=1) + pp.Word(pp.printables, excludeChars=',')
        )
        directive_parameter = (
            pp.quotedString
            ^ directive_option
            ^ identifier
            ^ hex_number
            ^ decimal_number
            ^ self.register
            ^ pp.Group(pp.Word(pp.alphanums + '_').setResultsName('name'))
        )
        commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
        self.directive = pp.Group(
            pp.Literal('.')
            + pp.Word(pp.alphanums + '_').setResultsName('name')
            + commaSeparatedList.setResultsName('parameters')
            + pp.Optional(self.comment)
        ).setResultsName(self.DIRECTIVE_ID)

        # Instructions
        # Mnemonic
        mnemonic = pp.ZeroOrMore(pp.Literal('data16') | pp.Literal('data32')) + pp.Word(
            pp.alphanums
        ).setResultsName('mnemonic')
        # Combine to instruction form
        # Only the first operand may be a bare identifier (e.g. a jump target).
        operand_first = pp.Group(self.register ^ immediate ^ memory ^ identifier)
        operand_rest = pp.Group(self.register ^ immediate ^ memory)
        self.instruction_parser = (
            mnemonic
            + pp.Optional(operand_first.setResultsName('operand1'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(operand_rest.setResultsName('operand2'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(operand_rest.setResultsName('operand3'))
            + pp.Optional(pp.Suppress(pp.Literal(',')))
            + pp.Optional(operand_rest.setResultsName('operand4'))
            + pp.Optional(self.comment)
        )

    def parse_register(self, register_string):
        """
        Parse a register string.

        :param str register_string: register in AT&T notation, e.g. ``%rax``
        :returns: post-processed parse result, or ``None`` if the string is not a register
        """
        try:
            return self.process_operand(
                self.register.parseString(register_string, parseAll=True).asDict()
            )
        except pp.ParseException:
            return None

    def parse_line(self, line, line_number=None):
        """
        Parse line and return instruction form.

        The line is tried, in this order, as a pure comment, a label, a directive and
        finally an instruction; the first grammar that matches the whole line wins.

        :param str line: line of assembly code
        :param line_number: default None, identifier of instruction form
        :type line_number: int, optional
        :return: ``dict`` -- parsed asm line (comment, label, directive or instruction form)
        :raises ValueError: if the line matches none of the four grammars
        """
        instruction_form = AttrDict(
            {
                self.INSTRUCTION_ID: None,
                self.OPERANDS_ID: [],
                self.DIRECTIVE_ID: None,
                self.COMMENT_ID: None,
                self.LABEL_ID: None,
                'line': line.strip(),
                'line_number': line_number,
            }
        )
        # ``result`` stays None until one of the grammars below matches.
        result = None

        # 1. Parse comment
        try:
            result = self.process_operand(self.comment.parseString(line, parseAll=True).asDict())
            result = AttrDict.convert_dict(result)
            instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
        except pp.ParseException:
            pass

        # 2. Parse label
        if result is None:
            try:
                result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
                result = AttrDict.convert_dict(result)
                instruction_form[self.LABEL_ID] = result[self.LABEL_ID]['name']
                if self.COMMENT_ID in result[self.LABEL_ID]:
                    instruction_form[self.COMMENT_ID] = ' '.join(
                        result[self.LABEL_ID][self.COMMENT_ID]
                    )
            except pp.ParseException:
                pass

        # 3. Parse directive
        if result is None:
            try:
                result = self.process_operand(
                    self.directive.parseString(line, parseAll=True).asDict()
                )
                result = AttrDict.convert_dict(result)
                instruction_form[self.DIRECTIVE_ID] = AttrDict(
                    {
                        'name': result[self.DIRECTIVE_ID]['name'],
                        'parameters': result[self.DIRECTIVE_ID]['parameters'],
                    }
                )
                if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
                    instruction_form[self.COMMENT_ID] = ' '.join(
                        result[self.DIRECTIVE_ID][self.COMMENT_ID]
                    )
            except pp.ParseException:
                pass

        # 4. Parse instruction
        if result is None:
            try:
                result = self.parse_instruction(line)
            except pp.ParseException as err:
                # Last resort failed as well: report the offending line and keep the
                # pyparsing error as the explicit cause.
                raise ValueError(
                    'Could not parse instruction on line {}: {!r}'.format(line_number, line)
                ) from err
            instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
            instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
            instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]

        return instruction_form

    def parse_instruction(self, instruction):
        """
        Parse instruction in asm line.

        :param str instruction: Assembly line string.
        :returns: `dict` -- parsed instruction form
        :raises pyparsing.ParseException: if the line is not a valid instruction
        """
        result = self.instruction_parser.parseString(instruction, parseAll=True).asDict()
        result = AttrDict.convert_dict(result)
        # Collect the operands that are present, in source order.
        operands = [
            self.process_operand(result[key])
            for key in ('operand1', 'operand2', 'operand3', 'operand4')
            if key in result
        ]
        return_dict = AttrDict(
            {
                self.INSTRUCTION_ID: result['mnemonic'],
                self.OPERANDS_ID: operands,
                self.COMMENT_ID: ' '.join(result[self.COMMENT_ID])
                if self.COMMENT_ID in result
                else None,
            }
        )
        return return_dict

    def process_operand(self, operand):
        """
        Post-process operand.

        Memory, immediate and label entries are handed to their dedicated ``process_*``
        method; any other operand is returned unchanged.
        """
        if self.MEMORY_ID in operand:
            return self.process_memory_address(operand[self.MEMORY_ID])
        if self.IMMEDIATE_ID in operand:
            return self.process_immediate(operand[self.IMMEDIATE_ID])
        if self.LABEL_ID in operand:
            return self.process_label(operand[self.LABEL_ID])
        return operand

    def process_memory_address(self, memory_address):
        """
        Post-process memory address operand.

        :returns: ``AttrDict`` with the keys ``offset``, ``base``, ``index`` and ``scale``
                  (plus the segment extension, if present) stored under the memory ID
        """
        # Remove unnecessarily created dictionary entries during memory address parsing
        offset = memory_address['offset'] if 'offset' in memory_address else None
        base = memory_address['base'] if 'base' in memory_address else None
        index = memory_address['index'] if 'index' in memory_address else None
        # A missing scale factor means 1
        scale = int(memory_address['scale']) if 'scale' in memory_address else 1
        if isinstance(offset, str) and base is None and index is None:
            # Bare numeric address: wrap it so it has the same shape as a parsed offset
            offset = {'value': offset}
        new_dict = AttrDict({'offset': offset, 'base': base, 'index': index, 'scale': scale})
        # Add segmentation extension if existing
        if self.SEGMENT_EXT_ID in memory_address:
            new_dict[self.SEGMENT_EXT_ID] = memory_address[self.SEGMENT_EXT_ID]
        return AttrDict({self.MEMORY_ID: new_dict})

    def process_label(self, label):
        """Post-process label asm line (modifies ``label`` in place)."""
        # remove duplicated 'name' level due to identifier
        label['name'] = label['name']['name']
        return AttrDict({self.LABEL_ID: label})

    def process_immediate(self, immediate):
        """Post-process immediate operand."""
        if 'identifier' in immediate:
            # actually an identifier, change declaration
            return immediate
        # otherwise nothing to do
        return AttrDict({self.IMMEDIATE_ID: immediate})

    def get_full_reg_name(self, register):
        """Return one register name string including all attributes."""
        # nothing to do
        return register['name']

    def normalize_imd(self, imd):
        """
        Normalize immediate to decimal based representation.

        :returns: ``int`` if ``imd`` carries a ``'value'`` entry, otherwise ``imd`` itself
        """
        if 'value' in imd:
            if imd['value'].lower().startswith('0x'):
                # hex, return decimal
                return int(imd['value'], 16)
            return int(imd['value'], 10)
        # identifier
        return imd

    def is_flag_dependend_of(self, flag_a, flag_b):
        """Check if ``flag_a`` is dependent on ``flag_b``."""
        # we assume flags are independent of each other, e.g., CF can be read while ZF gets written
        # TODO validate this assumption
        return flag_a.name == flag_b.name

    def is_reg_dependend_of(self, reg_a, reg_b):
        """
        Check if ``reg_a`` is dependent on ``reg_b``.

        Register names are compared case-insensitively.

        :returns: ``True`` if both registers have the same name, are vector registers with
                  the same name after the first character, belong to the same basic GPR
                  family or are numbered GPRs with the same id; ``False`` otherwise
        """
        name_a = reg_a['name']
        name_b = reg_b['name']
        # Check if they are the same registers
        if name_a.lower() == name_b.lower():
            return True
        # Check vector registers first
        if self.is_vector_register(reg_a):
            # Dropping the first character maps xmmN/ymmN/zmmN onto the same 'mmN'
            return (
                self.is_vector_register(reg_b) and name_a[1:].lower() == name_b[1:].lower()
            )
        # Check basic GPRs
        if self.is_basic_gpr(reg_a):
            if not self.is_basic_gpr(reg_b):
                return False
            upper_a = name_a.upper()
            upper_b = name_b.upper()
            return any(
                upper_a in group and upper_b in group for group in self._BASIC_GPR_GROUPS
            )
        # Check other GPRs
        id_a = self._numbered_gpr_id(name_a)
        if id_a is None:
            return False
        # No dependencies unless both are numbered GPRs with the same id
        return id_a == self._numbered_gpr_id(name_b)

    @staticmethod
    def _numbered_gpr_id(name):
        """Return the id string of an ``r<N>[d|w|b]`` register name, else ``None``."""
        gpr_parser = (
            pp.CaselessLiteral('R')
            + pp.Word(pp.nums).setResultsName('id')
            + pp.Optional(pp.Word('dwbDWB', exact=1))
        )
        try:
            return gpr_parser.parseString(name, parseAll=True).asDict()['id']
        except pp.ParseException:
            return None

    def is_basic_gpr(self, register):
        """Check if register is a basic general purpose register (ebx, rax, ...)."""
        # Every register name without a digit is treated as a basic GPR
        return not any(char.isdigit() for char in register['name'])

    def is_gpr(self, register):
        """Check if register is a general purpose register."""
        if register is None:
            return False
        if self.is_basic_gpr(register):
            return True
        return self._numbered_gpr_id(register['name']) is not None

    def is_vector_register(self, register):
        """Check if register is a vector register (mm, xmm, ymm or zmm)."""
        if register is None:
            return False
        return register['name'].rstrip(string.digits).lower() in ['mm', 'xmm', 'ymm', 'zmm']

    def get_reg_type(self, register):
        """
        Get register type.

        :returns: ``False`` if ``register`` is ``None``, ``'gpr'`` for general purpose
                  registers, otherwise the lower-case vector prefix (e.g. ``'xmm'``)
        :raises ValueError: if the register is neither a GPR nor a vector register
        """
        if register is None:
            return False
        if self.is_gpr(register):
            return 'gpr'
        elif self.is_vector_register(register):
            return register['name'].rstrip(string.digits).lower()
        raise ValueError('Unknown register type: {!r}'.format(register))