#!/usr/bin/env python3
import string
import pyparsing as pp
from osaca.parser import AttrDict, BaseParser
[docs]class ParserX86ATT(BaseParser):
def __init__(self):
super().__init__()
self.isa = 'x86'
[docs] def construct_parser(self):
"""Create parser for ARM AArch64 ISA."""
decimal_number = pp.Combine(
pp.Optional(pp.Literal('-')) + pp.Word(pp.nums)
).setResultsName('value')
hex_number = pp.Combine(pp.Literal('0x') + pp.Word(pp.hexnums)).setResultsName('value')
# Comment - either '#' or '//' (icc)
self.comment = (pp.Literal('#') | pp.Literal('//')) + pp.Group(
pp.ZeroOrMore(pp.Word(pp.printables))
).setResultsName(self.COMMENT_ID)
# Define x86 assembly identifier
relocation = pp.Combine(pp.Literal('@') + pp.Word(pp.alphas))
id_offset = pp.Word(pp.nums) + pp.Suppress(pp.Literal('+'))
first = pp.Word(pp.alphas + '_.', exact=1)
rest = pp.Word(pp.alphanums + '$_.+-')
identifier = pp.Group(
pp.Optional(id_offset).setResultsName('offset')
+ pp.Combine(first + pp.Optional(rest)).setResultsName('name')
+ pp.Optional(relocation).setResultsName('relocation')
).setResultsName('identifier')
# Label
self.label = pp.Group(
identifier.setResultsName('name') + pp.Literal(':') + pp.Optional(self.comment)
).setResultsName(self.LABEL_ID)
# Register: pp.Regex('^%[0-9a-zA-Z]+{}{z},?')
self.register = pp.Group(
pp.Literal('%')
+ pp.Word(pp.alphanums).setResultsName('name')
+ pp.Optional(pp.Literal('(') + pp.Word(pp.nums) + pp.Literal(')'))
+ pp.Optional(
pp.Literal('{')
+ pp.Literal('%')
+ pp.Word(pp.alphanums).setResultsName('mask')
+ pp.Literal('}')
+ pp.Optional(
pp.Suppress(pp.Literal('{'))
+ pp.Literal('z').setResultsName('zeroing')
+ pp.Suppress(pp.Literal('}'))
)
)
).setResultsName(self.REGISTER_ID)
# Immediate: pp.Regex('^\$(-?[0-9]+)|(0x[0-9a-fA-F]+),?')
symbol_immediate = '$'
immediate = pp.Group(
pp.Literal(symbol_immediate) + (hex_number | decimal_number | identifier)
).setResultsName(self.IMMEDIATE_ID)
# Memory preparations
offset = pp.Group(identifier | hex_number | decimal_number).setResultsName(
self.IMMEDIATE_ID
)
scale = pp.Word('1248', exact=1)
# Segment register extension
segment_extension = (
hex_number
^ pp.Word(pp.nums)
^ pp.Group(
pp.Optional(offset.setResultsName('offset'))
+ pp.Literal('(')
+ pp.Optional(self.register.setResultsName('base'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(self.register.setResultsName('index'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(scale.setResultsName('scale'))
+ pp.Literal(')')
)
)
memory_segmentation = (
self.register.setResultsName('base')
+ pp.Literal(':')
+ segment_extension.setResultsName(self.SEGMENT_EXT_ID)
)
# Memory: offset | seg:seg_ext | offset(base, index, scale){mask}
memory = pp.Group(
(
pp.Optional(pp.Suppress(pp.Literal('*')))
+ pp.Optional(offset.setResultsName('offset'))
+ pp.Literal('(')
+ pp.Optional(self.register.setResultsName('base'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(self.register.setResultsName('index'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(scale.setResultsName('scale'))
+ pp.Literal(')')
+ pp.Optional(
pp.Literal('{')
+ pp.Literal('%')
+ pp.Word(pp.alphanums).setResultsName('mask')
+ pp.Literal('}')
)
)
| memory_segmentation
| (hex_number | pp.Word(pp.nums)).setResultsName('offset')
).setResultsName(self.MEMORY_ID)
# Directive
directive_option = pp.Combine(
pp.Word('#@.', exact=1) + pp.Word(pp.printables, excludeChars=',')
)
directive_parameter = (
pp.quotedString
^ directive_option
^ identifier
^ hex_number
^ decimal_number
^ self.register
^ pp.Group(pp.Word(pp.alphanums + '_').setResultsName('name'))
)
commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=',')
self.directive = pp.Group(
pp.Literal('.')
+ pp.Word(pp.alphanums + '_').setResultsName('name')
+ commaSeparatedList.setResultsName('parameters')
+ pp.Optional(self.comment)
).setResultsName(self.DIRECTIVE_ID)
# Instructions
# Mnemonic
mnemonic = pp.ZeroOrMore(pp.Literal('data16') | pp.Literal('data32')) + pp.Word(
pp.alphanums
).setResultsName('mnemonic')
# Combine to instruction form
operand_first = pp.Group(self.register ^ immediate ^ memory ^ identifier)
operand_rest = pp.Group(self.register ^ immediate ^ memory)
self.instruction_parser = (
mnemonic
+ pp.Optional(operand_first.setResultsName('operand1'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(operand_rest.setResultsName('operand2'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(operand_rest.setResultsName('operand3'))
+ pp.Optional(pp.Suppress(pp.Literal(',')))
+ pp.Optional(operand_rest.setResultsName('operand4'))
+ pp.Optional(self.comment)
)
[docs] def parse_register(self, register_string):
"""Parse register string"""
try:
return self.process_operand(
self.register.parseString(register_string, parseAll=True).asDict()
)
except pp.ParseException:
return None
[docs] def parse_line(self, line, line_number=None):
"""
Parse line and return instruction form.
:param str line: line of assembly code
:param line_number: default None, identifier of instruction form
:type line_number: int, optional
:return: ``dict`` -- parsed asm line (comment, label, directive or instruction form)
"""
instruction_form = AttrDict(
{
self.INSTRUCTION_ID: None,
self.OPERANDS_ID: [],
self.DIRECTIVE_ID: None,
self.COMMENT_ID: None,
self.LABEL_ID: None,
'line': line.strip(),
'line_number': line_number,
}
)
result = None
# 1. Parse comment
try:
result = self.process_operand(self.comment.parseString(line, parseAll=True).asDict())
result = AttrDict.convert_dict(result)
instruction_form[self.COMMENT_ID] = ' '.join(result[self.COMMENT_ID])
except pp.ParseException:
pass
# 2. Parse label
if result is None:
try:
result = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
result = AttrDict.convert_dict(result)
instruction_form[self.LABEL_ID] = result[self.LABEL_ID]['name']
if self.COMMENT_ID in result[self.LABEL_ID]:
instruction_form[self.COMMENT_ID] = ' '.join(
result[self.LABEL_ID][self.COMMENT_ID]
)
except pp.ParseException:
pass
# 3. Parse directive
if result is None:
try:
result = self.process_operand(
self.directive.parseString(line, parseAll=True).asDict()
)
result = AttrDict.convert_dict(result)
instruction_form[self.DIRECTIVE_ID] = AttrDict(
{
'name': result[self.DIRECTIVE_ID]['name'],
'parameters': result[self.DIRECTIVE_ID]['parameters'],
}
)
if self.COMMENT_ID in result[self.DIRECTIVE_ID]:
instruction_form[self.COMMENT_ID] = ' '.join(
result[self.DIRECTIVE_ID][self.COMMENT_ID]
)
except pp.ParseException:
pass
# 4. Parse instruction
if result is None:
try:
result = self.parse_instruction(line)
except pp.ParseException:
raise ValueError(
'Could not parse instruction on line {}: {!r}'.format(line_number, line)
)
instruction_form[self.INSTRUCTION_ID] = result[self.INSTRUCTION_ID]
instruction_form[self.OPERANDS_ID] = result[self.OPERANDS_ID]
instruction_form[self.COMMENT_ID] = result[self.COMMENT_ID]
return instruction_form
[docs] def parse_instruction(self, instruction):
"""
Parse instruction in asm line.
:param str instruction: Assembly line string.
:returns: `dict` -- parsed instruction form
"""
result = self.instruction_parser.parseString(instruction, parseAll=True).asDict()
result = AttrDict.convert_dict(result)
operands = []
# Add operands to list
# Check first operand
if 'operand1' in result:
operands.append(self.process_operand(result['operand1']))
# Check second operand
if 'operand2' in result:
operands.append(self.process_operand(result['operand2']))
# Check third operand
if 'operand3' in result:
operands.append(self.process_operand(result['operand3']))
# Check fourth operand
if 'operand4' in result:
operands.append(self.process_operand(result['operand4']))
return_dict = AttrDict(
{
self.INSTRUCTION_ID: result['mnemonic'],
self.OPERANDS_ID: operands,
self.COMMENT_ID: ' '.join(result[self.COMMENT_ID])
if self.COMMENT_ID in result
else None,
}
)
return return_dict
[docs] def process_operand(self, operand):
"""Post-process operand"""
# For the moment, only used to structure memory addresses
if self.MEMORY_ID in operand:
return self.process_memory_address(operand[self.MEMORY_ID])
if self.IMMEDIATE_ID in operand:
return self.process_immediate(operand[self.IMMEDIATE_ID])
if self.LABEL_ID in operand:
return self.process_label(operand[self.LABEL_ID])
return operand
[docs] def process_memory_address(self, memory_address):
"""Post-process memory address operand"""
# Remove unecessarily created dictionary entries during memory address parsing
offset = None if 'offset' not in memory_address else memory_address['offset']
base = None if 'base' not in memory_address else memory_address['base']
index = None if 'index' not in memory_address else memory_address['index']
scale = 1 if 'scale' not in memory_address else int(memory_address['scale'])
if isinstance(offset, str) and base is None and index is None:
offset = {'value': offset}
new_dict = AttrDict({'offset': offset, 'base': base, 'index': index, 'scale': scale})
# Add segmentation extension if existing
if self.SEGMENT_EXT_ID in memory_address:
new_dict[self.SEGMENT_EXT_ID] = memory_address[self.SEGMENT_EXT_ID]
return AttrDict({self.MEMORY_ID: new_dict})
[docs] def process_label(self, label):
"""Post-process label asm line"""
# remove duplicated 'name' level due to identifier
label['name'] = label['name']['name']
return AttrDict({self.LABEL_ID: label})
[docs] def get_full_reg_name(self, register):
"""Return one register name string including all attributes"""
# nothing to do
return register['name']
[docs] def normalize_imd(self, imd):
"""Normalize immediate to decimal based representation"""
if 'value' in imd:
if imd['value'].lower().startswith('0x'):
# hex, return decimal
return int(imd['value'], 16)
return int(imd['value'], 10)
# identifier
return imd
[docs] def is_flag_dependend_of(self, flag_a, flag_b):
"""Check if ``flag_a`` is dependent on ``flag_b``"""
# we assume flags are independent of each other, e.g., CF can be read while ZF gets written
# TODO validate this assumption
if flag_a.name == flag_b.name:
return True
return False
[docs] def is_reg_dependend_of(self, reg_a, reg_b):
"""Check if ``reg_a`` is dependent on ``reg_b``"""
# Check if they are the same registers
if reg_a.name == reg_b.name:
return True
# Check vector registers first
if self.is_vector_register(reg_a):
if self.is_vector_register(reg_b):
if reg_a.name[1:] == reg_b.name[1:]:
# Registers in the same vector space
return True
return False
# Check basic GPRs
a_dep = ['RAX', 'EAX', 'AX', 'AH', 'AL']
b_dep = ['RBX', 'EBX', 'BX', 'BH', 'BL']
c_dep = ['RCX', 'ECX', 'CX', 'CH', 'CL']
d_dep = ['RDX', 'EDX', 'DX', 'DH', 'DL']
sp_dep = ['RSP', 'ESP', 'SP', 'SPL']
src_dep = ['RSI', 'ESI', 'SI', 'SIL']
dst_dep = ['RDI', 'EDI', 'DI', 'DIL']
basic_gprs = [a_dep, b_dep, c_dep, d_dep, sp_dep, src_dep, dst_dep]
if self.is_basic_gpr(reg_a):
if self.is_basic_gpr(reg_b):
for dep_group in basic_gprs:
if reg_a['name'].upper() in dep_group:
if reg_b['name'].upper() in dep_group:
return True
return False
# Check other GPRs
gpr_parser = (
pp.CaselessLiteral('R')
+ pp.Word(pp.nums).setResultsName('id')
+ pp.Optional(pp.Word('dwbDWB', exact=1))
)
try:
id_a = gpr_parser.parseString(reg_a['name'], parseAll=True).asDict()['id']
id_b = gpr_parser.parseString(reg_b['name'], parseAll=True).asDict()['id']
if id_a == id_b:
return True
except pp.ParseException:
return False
# No dependencies
return False
[docs] def is_basic_gpr(self, register):
"""Check if register is a basic general purpose register (ebi, rax, ...)"""
if any(char.isdigit() for char in register['name']):
return False
return True
[docs] def is_gpr(self, register):
"""Check if register is a general purpose register"""
if register is None:
return False
gpr_parser = (
pp.CaselessLiteral('R')
+ pp.Word(pp.nums).setResultsName('id')
+ pp.Optional(pp.Word('dwbDWB', exact=1))
)
if self.is_basic_gpr(register):
return True
else:
try:
gpr_parser.parseString(register['name'], parseAll=True)
return True
except pp.ParseException:
return False
[docs] def is_vector_register(self, register):
"""Check if register is a vector register"""
if register is None:
return False
if register['name'].rstrip(string.digits).lower() in ['mm', 'xmm', 'ymm', 'zmm']:
return True
return False
[docs] def get_reg_type(self, register):
"""Ger register type"""
if register is None:
return False
if self.is_gpr(register):
return 'gpr'
elif self.is_vector_register(register):
return register['name'].rstrip(string.digits).lower()
raise ValueError