#!/usr/bin/env python3
from copy import deepcopy
import pyparsing as pp
from osaca.parser import AttrDict, BaseParser
[docs]class ParserAArch64(BaseParser):
_instance = None
# Singleton pattern, as this is created very many times
def __new__(cls):
    """Return the shared parser instance, allocating it on the first call."""
    shared = cls._instance
    if shared is None:
        # First request: allocate the object and remember it on the class.
        shared = super(ParserAArch64, cls).__new__(cls)
        cls._instance = shared
    return shared
def __init__(self):
    """Run the base-class initialiser, then record ``"aarch64"`` as this parser's ISA."""
    super().__init__()
    self.isa = "aarch64"
def construct_parser(self):
    """
    Create parser for ARM AArch64 ISA.

    Builds the pyparsing grammars for one line of AArch64 assembly and stores
    them on the instance: ``comment``, ``label``, ``directive``,
    ``llvm_markers``, ``list_element``, ``condition``, ``instruction_parser``
    and, for testing, ``predicate``, ``vector`` and ``register``.

    In the expressions below ``^`` (pyparsing ``Or``) picks the longest
    matching alternative, while ``|`` (``MatchFirst``) picks the first
    alternative that matches.
    """
    # Comment: "//" followed by any number of whitespace-separated words
    symbol_comment = "//"
    self.comment = pp.Literal(symbol_comment) + pp.Group(
        pp.ZeroOrMore(pp.Word(pp.printables))
    ).setResultsName(self.COMMENT_ID)
    # Define ARM assembly identifier
    decimal_number = pp.Combine(
        pp.Optional(pp.Literal("-")) + pp.Word(pp.nums)
    ).setResultsName("value")
    hex_number = pp.Combine(
        pp.Optional(pp.Literal("-")) + pp.Literal("0x") + pp.Word(pp.hexnums)
    ).setResultsName("value")
    # Relocation specifier of the form ":name:"
    relocation = pp.Combine(pp.Literal(":") + pp.Word(pp.alphanums + "_") + pp.Literal(":"))
    first = pp.Word(pp.alphas + "_.", exact=1)
    rest = pp.Word(pp.alphanums + "_.")
    # [relocation] name [+ offset]
    identifier = pp.Group(
        pp.Optional(relocation).setResultsName("relocation")
        + pp.Combine(first + pp.Optional(rest)).setResultsName("name")
        + pp.Optional(
            pp.Suppress(pp.Literal("+"))
            + (hex_number | decimal_number).setResultsName("offset")
        )
    ).setResultsName(self.IDENTIFIER_ID)
    # Label: identifier followed by ":" and an optional trailing comment
    self.label = pp.Group(
        identifier.setResultsName("name") + pp.Literal(":") + pp.Optional(self.comment)
    ).setResultsName(self.LABEL_ID)
    # Directive: ".name" followed by space- or comma-separated parameters
    directive_option = pp.Combine(
        pp.Word(pp.alphas + "#@.%", exact=1)
        + pp.Optional(pp.Word(pp.printables + " ", excludeChars=","))
    )
    directive_parameter = (
        pp.quotedString | directive_option | identifier | hex_number | decimal_number
    )
    commaSeparatedList = pp.delimitedList(pp.Optional(directive_parameter), delim=",")
    self.directive = pp.Group(
        pp.Literal(".")
        + pp.Word(pp.alphanums + "_").setResultsName("name")
        + (pp.OneOrMore(directive_parameter) ^ commaSeparatedList).setResultsName("parameters")
        + pp.Optional(self.comment)
    ).setResultsName(self.DIRECTIVE_ID)
    # LLVM-MCA markers ("# LLVM-MCA-BEGIN" / "# LLVM-MCA-END"), reported under the comment ID
    self.llvm_markers = pp.Group(
        pp.Literal("#")
        + pp.Combine(
            pp.CaselessLiteral("LLVM-MCA-")
            + (pp.CaselessLiteral("BEGIN") | pp.CaselessLiteral("END"))
        )
        + pp.Optional(self.comment)
    ).setResultsName(self.COMMENT_ID)
    ##############################
    # Instructions
    # Mnemonic
    # (?P<instr>[a-zA-Z][a-zA-Z0-9]*)(?P<setflg>S?)(P?<CC>.[a-zA-Z]{2})
    mnemonic = pp.Word(pp.alphanums + ".").setResultsName("mnemonic")
    # Immediate:
    # int: ^-?[0-9]+ | hex: ^0x[0-9a-fA-F]+ | fp: ^[0-9]{1}.[0-9]+[eE]{1}[\+-]{1}[0-9]+[fF]?
    symbol_immediate = "#"
    mantissa = pp.Combine(
        pp.Optional(pp.Literal("-")) + pp.Word(pp.nums) + pp.Literal(".") + pp.Word(pp.nums)
    ).setResultsName("mantissa")
    exponent = (
        pp.CaselessLiteral("e")
        + pp.Word("+-").setResultsName("e_sign")
        + pp.Word(pp.nums).setResultsName("exponent")
    )
    # The exponent part is optional for both float and double literals
    float_ = pp.Group(
        mantissa + pp.Optional(exponent) + pp.CaselessLiteral("f")
    ).setResultsName("float")
    double_ = pp.Group(mantissa + pp.Optional(exponent)).setResultsName("double")
    # Either an (optionally "#"-prefixed) number or an (optionally "#"-prefixed) identifier
    immediate = pp.Group(
        pp.Optional(pp.Literal(symbol_immediate))
        + (hex_number ^ decimal_number ^ float_ ^ double_)
        | (pp.Optional(pp.Literal(symbol_immediate)) + identifier)
    ).setResultsName(self.IMMEDIATE_ID)
    shift_op = (
        pp.CaselessLiteral("lsl")
        ^ pp.CaselessLiteral("lsr")
        ^ pp.CaselessLiteral("asr")
        ^ pp.CaselessLiteral("ror")
        ^ pp.CaselessLiteral("sxtw")
        ^ pp.CaselessLiteral("uxtw")
        ^ pp.CaselessLiteral("uxtb")
        ^ pp.CaselessLiteral("mul vl")
    )
    # Immediate followed by ", <shift_op> [amount]"
    arith_immediate = pp.Group(
        immediate.setResultsName("base_immediate")
        + pp.Suppress(pp.Literal(","))
        + shift_op.setResultsName("shift_op")
        + pp.Optional(immediate).setResultsName("shift")
    ).setResultsName(self.IMMEDIATE_ID)
    # Register:
    # scalar: [XWBHSDQ][0-9]{1,2} | vector: [VZ][0-9]{1,2}(\.[12468]{1,2}[BHSD])?
    # | predicate: P[0-9]{1,2}(/[ZM])?
    # ignore vector len control ZCR_EL[123] for now
    # define SP, ZR register aliases as regex, due to pyparsing does not support
    # proper lookahead
    alias_r31_sp = pp.Regex("(?P<prefix>[a-zA-Z])?(?P<name>(sp|SP))")
    alias_r31_zr = pp.Regex("(?P<prefix>[a-zA-Z])?(?P<name>(zr|ZR))")
    scalar = pp.Word("xwbhsdqXWBHSDQ", exact=1).setResultsName("prefix") + pp.Word(
        pp.nums
    ).setResultsName("name")
    # Lane index in square brackets, e.g. "[1]"
    index = pp.Literal("[") + pp.Word(pp.nums).setResultsName("index") + pp.Literal("]")
    vector = (
        pp.oneOf("v z", caseless=True).setResultsName("prefix")
        + pp.Word(pp.nums).setResultsName("name")
        + pp.Optional(
            pp.Literal(".")
            + pp.Optional(pp.Word("12468")).setResultsName("lanes")
            + pp.Word(pp.alphas, exact=1).setResultsName("shape")
        )
        + pp.Optional(index)
    )
    predicate = (
        pp.CaselessLiteral("p").setResultsName("prefix")
        + pp.Word(pp.nums).setResultsName("name")
        + pp.Optional(
            (
                pp.Suppress(pp.Literal("/"))
                + pp.oneOf("z m", caseless=True).setResultsName("predication")
            )
            | (
                pp.Literal(".")
                + pp.Optional(pp.Word("12468")).setResultsName("lanes")
                + pp.Word(pp.alphas, exact=1).setResultsName("shape")
            )
        )
    )
    self.list_element = vector ^ scalar
    # "{a, b, c}" (list) or "{a-b}" (range), optionally followed by a lane index.
    # Elements are combined into plain strings here and re-parsed later with
    # self.list_element.
    register_list = (
        pp.Literal("{")
        + (
            pp.delimitedList(pp.Combine(self.list_element), delim=",").setResultsName("list")
            ^ pp.delimitedList(pp.Combine(self.list_element), delim="-").setResultsName(
                "range"
            )
        )
        + pp.Literal("}")
        + pp.Optional(index)
    )
    register = pp.Group(
        (alias_r31_sp | alias_r31_zr | vector | scalar | predicate | register_list)
        + pp.Optional(
            pp.Suppress(pp.Literal(","))
            + shift_op.setResultsName("shift_op")
            + pp.Optional(immediate).setResultsName("shift")
        )
    ).setResultsName(self.REGISTER_ID)
    # Memory: "[base, index|offset]" with optional "!" (pre-indexed) or
    # ", imm" (post-indexed) suffix
    register_index = register.setResultsName("index") + pp.Optional(
        pp.Literal(",") + pp.Word(pp.alphas) + immediate.setResultsName("scale")
    )
    memory = pp.Group(
        pp.Literal("[")
        + pp.Optional(register.setResultsName("base"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(register_index ^ (immediate ^ arith_immediate).setResultsName("offset"))
        + pp.Literal("]")
        + pp.Optional(
            pp.Literal("!").setResultsName("pre_indexed")
            | (pp.Suppress(pp.Literal(",")) + immediate.setResultsName("post_indexed"))
        )
    ).setResultsName(self.MEMORY_ID)
    # Prefetch operation: <type><target><policy>, e.g. PLDL1KEEP
    prefetch_op = pp.Group(
        pp.Group(pp.CaselessLiteral("PLD") ^ pp.CaselessLiteral("PST")).setResultsName("type")
        + pp.Group(
            pp.CaselessLiteral("L1") ^ pp.CaselessLiteral("L2") ^ pp.CaselessLiteral("L3")
        ).setResultsName("target")
        + pp.Group(pp.CaselessLiteral("KEEP") ^ pp.CaselessLiteral("STRM")).setResultsName(
            "policy"
        )
    ).setResultsName("prfop")
    # Condition codes, based on http://tiny.cc/armcc
    condition = (
        pp.CaselessLiteral("EQ")  # z set
        ^ pp.CaselessLiteral("NE")  # z clear
        ^ pp.CaselessLiteral("CS")  # c set
        ^ pp.CaselessLiteral("HS")  # c set
        ^ pp.CaselessLiteral("CC")  # c clear
        ^ pp.CaselessLiteral("LO")  # c clear
        ^ pp.CaselessLiteral("MI")  # n set
        ^ pp.CaselessLiteral("PL")  # n clear
        ^ pp.CaselessLiteral("VS")  # v set
        ^ pp.CaselessLiteral("VC")  # v clear
        ^ pp.CaselessLiteral("HI")  # c set and z clear
        ^ pp.CaselessLiteral("LS")  # c clear or z set
        ^ pp.CaselessLiteral("GE")  # n and v the same
        ^ pp.CaselessLiteral("LT")  # n and v different
        ^ pp.CaselessLiteral("GT")  # z clear, and n and v the same
        ^ pp.CaselessLiteral("LE")  # z set, or n and v different
        ^ pp.CaselessLiteral("AL")  # any
    ).setResultsName("condition")
    self.condition = condition
    # Combine to instruction form: mnemonic plus up to five operands with
    # optional separating commas and an optional trailing comment
    operand_first = pp.Group(
        register ^ (prefetch_op | immediate) ^ memory ^ arith_immediate ^ identifier
    )
    operand_rest = pp.Group(
        (register ^ condition ^ immediate ^ memory ^ arith_immediate) | identifier
    )
    self.instruction_parser = (
        mnemonic
        + pp.Optional(operand_first.setResultsName("operand1"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand_rest.setResultsName("operand2"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand_rest.setResultsName("operand3"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand_rest.setResultsName("operand4"))
        + pp.Optional(pp.Suppress(pp.Literal(",")))
        + pp.Optional(operand_rest.setResultsName("operand5"))
        + pp.Optional(self.comment)
    )
    # for testing
    self.predicate = predicate
    self.vector = vector
    self.register = register
def parse_line(self, line, line_number=None):
    """
    Parse one line of assembly and return its instruction form.

    The line is tried as a comment, an LLVM-MCA marker, a label, a directive
    and finally as an instruction; the first kind that matches fills the
    returned form.

    :param str line: line of assembly code
    :param line_number: identifier of instruction form, defaults to None
    :type line_number: int, optional
    :return: `dict` -- parsed asm line (comment, label, directive or instruction form)
    :raises ValueError: if the line cannot be parsed as an instruction either
    """
    instruction_form = AttrDict(
        {
            self.INSTRUCTION_ID: None,
            self.OPERANDS_ID: [],
            self.DIRECTIVE_ID: None,
            self.COMMENT_ID: None,
            self.LABEL_ID: None,
            "line": line,
            "line_number": line_number,
        }
    )
    parsed = None

    # 1. Comment line, then 1.2 LLVM-MCA marker. Both are always attempted.
    for grammar in (self.comment, self.llvm_markers):
        try:
            parsed = self.process_operand(grammar.parseString(line, parseAll=True).asDict())
            parsed = AttrDict.convert_dict(parsed)
            instruction_form[self.COMMENT_ID] = " ".join(parsed[self.COMMENT_ID])
        except pp.ParseException:
            pass
    if parsed is not None:
        return instruction_form

    # 2. Label (optionally with a trailing comment)
    try:
        parsed = self.process_operand(self.label.parseString(line, parseAll=True).asDict())
        parsed = AttrDict.convert_dict(parsed)
        label = parsed[self.LABEL_ID]
        instruction_form[self.LABEL_ID] = label.name
        if self.COMMENT_ID in label:
            instruction_form[self.COMMENT_ID] = " ".join(label[self.COMMENT_ID])
    except pp.ParseException:
        pass
    if parsed is not None:
        return instruction_form

    # 3. Directive (optionally with a trailing comment)
    try:
        parsed = self.process_operand(
            self.directive.parseString(line, parseAll=True).asDict()
        )
        parsed = AttrDict.convert_dict(parsed)
        directive = parsed[self.DIRECTIVE_ID]
        instruction_form[self.DIRECTIVE_ID] = AttrDict(
            {
                "name": directive.name,
                "parameters": directive.parameters,
            }
        )
        if self.COMMENT_ID in directive:
            instruction_form[self.COMMENT_ID] = " ".join(directive[self.COMMENT_ID])
    except pp.ParseException:
        pass
    if parsed is not None:
        return instruction_form

    # 4. Instruction: the last resort, so a failure here is an error
    try:
        parsed = self.parse_instruction(line)
    except (pp.ParseException, KeyError) as e:
        raise ValueError(
            "Unable to parse {!r} on line {}".format(line, line_number)
        ) from e
    instruction_form[self.INSTRUCTION_ID] = parsed[self.INSTRUCTION_ID]
    instruction_form[self.OPERANDS_ID] = parsed[self.OPERANDS_ID]
    instruction_form[self.COMMENT_ID] = parsed[self.COMMENT_ID]
    return instruction_form
def parse_instruction(self, instruction):
    """
    Parse an instruction line into mnemonic, operand list and comment.

    :param str instruction: Assembly line string.
    :returns: `dict` -- parsed instruction form
    """
    parsed = AttrDict.convert_dict(
        self.instruction_parser.parseString(instruction, parseAll=True).asDict()
    )
    operands = []
    # Visit the operand slots in order; absent slots are skipped.
    for slot in ("operand1", "operand2", "operand3", "operand4", "operand5"):
        if slot not in parsed:
            continue
        processed = self.process_operand(parsed[slot])
        # A register list/range expands into several operands.
        if isinstance(processed, list):
            operands.extend(processed)
        else:
            operands.append(processed)
    mnemonic = parsed.mnemonic
    comment = None
    if self.COMMENT_ID in parsed:
        comment = " ".join(parsed[self.COMMENT_ID])
    return AttrDict(
        {
            self.INSTRUCTION_ID: mnemonic,
            self.OPERANDS_ID: operands,
            self.COMMENT_ID: comment,
        }
    )
def process_operand(self, operand):
    """
    Post-process a parsed operand by dispatching on the kind of entry it holds.

    Checked in this order: memory address, register list/range, stack-pointer
    register, immediate, label, identifier. An operand matching none of these
    is returned unchanged.

    :param dict operand: parsed operand (or parsed line) dictionary
    :returns: the post-processed operand; a list of registers for register
        lists and ranges
    """
    # structure memory addresses
    if self.MEMORY_ID in operand:
        return self.process_memory_address(operand[self.MEMORY_ID])
    if self.REGISTER_ID in operand:
        register = operand[self.REGISTER_ID]
        # structure register lists: resolve ranges and lists
        if "list" in register or "range" in register:
            return self.resolve_range_list(self.process_register_list(register))
        # The grammar's SP alias accepts both "sp" and "SP", so compare
        # case-insensitively; otherwise an upper-case stack pointer would
        # never receive its "x" prefix.
        name = register["name"]
        if isinstance(name, str) and name.lower() == "sp":
            return self.process_sp_register(register)
    # add value attribute to floating point immediates without exponent
    if self.IMMEDIATE_ID in operand:
        return self.process_immediate(operand[self.IMMEDIATE_ID])
    if self.LABEL_ID in operand:
        return self.process_label(operand[self.LABEL_ID])
    if self.IDENTIFIER_ID in operand:
        return self.process_identifier(operand[self.IDENTIFIER_ID])
    return operand
def process_memory_address(self, memory_address):
    """
    Post-process memory address operand.

    Produces a dictionary with the keys ``offset``, ``base``, ``index`` and
    ``scale``, plus ``pre_indexed`` / ``post_indexed`` when the parsed address
    carries them, wrapped under the memory ID.

    :param dict memory_address: parsed memory operand
    :returns: `AttrDict` -- ``{MEMORY_ID: {...}}``
    """
    # Remove unnecessarily created dictionary entries during parsing
    offset = memory_address.get("offset", None)
    if isinstance(offset, list) and len(offset) == 1:
        offset = offset[0]
    if offset is not None and "value" in offset:
        # base 0 accepts both decimal and 0x-prefixed values
        offset["value"] = int(offset["value"], 0)
    base = memory_address.get("base", None)
    index = memory_address.get("index", None)
    # The stack pointer is always treated as an x register. The grammar's SP
    # alias accepts "sp" and "SP", so the comparison is case-insensitive.
    for reg in (base, index):
        if reg is not None and "name" in reg and str(reg["name"]).lower() == "sp":
            reg["prefix"] = "x"
    scale = 1
    valid_shift_ops = ["lsl", "uxtw", "uxtb", "sxtw"]
    if index is not None and "shift" in index:
        if index["shift_op"].lower() in valid_shift_ops:
            # a shift by n scales the index by 2**n
            scale = 2 ** int(index["shift"][0]["value"])
    new_dict = AttrDict({"offset": offset, "base": base, "index": index, "scale": scale})
    if "pre_indexed" in memory_address:
        new_dict["pre_indexed"] = True
    if "post_indexed" in memory_address:
        post_indexed = memory_address["post_indexed"]
        if "value" in post_indexed:
            new_dict["post_indexed"] = {"value": int(post_indexed["value"], 0)}
        else:
            # non-numeric post-index is passed through as parsed
            new_dict["post_indexed"] = post_indexed
    return AttrDict({self.MEMORY_ID: new_dict})
def process_sp_register(self, register):
    """Set the stack pointer register's prefix to ``x`` (in place) and wrap it under the register ID."""
    register["prefix"] = "x"
    return AttrDict({self.REGISTER_ID: register})
def resolve_range_list(self, operand):
    """
    Expand a register list or register range operand into individual registers.

    For a list every member is copied; for a range one register is generated
    per number from the first to the second member (inclusive), based on a copy
    of the first. A lane index on the list/range is converted to ``int`` and
    attached to every generated register.

    :returns: list of register operands, or ``operand`` itself when it is
        neither a list nor a range
    """
    if "register" not in operand:
        # neither register list nor range, return unmodified
        return operand
    if "list" in operand.register:
        lane_index = operand.register.get("index")
        expanded = []
        for member in operand.register.list:
            clone = deepcopy(member)
            if lane_index is not None:
                clone["index"] = int(lane_index, 0)
            expanded.append(AttrDict({self.REGISTER_ID: clone}))
        return expanded
    if "range" in operand.register:
        template = operand.register.range[0]
        lane_index = operand.register.get("index")
        expanded = []
        first_name = template.name
        last_name = operand.register.range[1].name
        for number in range(int(first_name), int(last_name) + 1):
            clone = deepcopy(template)
            if lane_index is not None:
                clone["index"] = int(lane_index, 0)
            clone["name"] = str(number)
            expanded.append(AttrDict({self.REGISTER_ID: clone}))
        return expanded
    # a plain register: return unmodified
    return operand
def process_register_list(self, register_list):
    """
    Post-process register lists (e.g., {r0,r3,r5}) and register ranges (e.g., {r0-r7}).

    Each element string is re-parsed with ``self.list_element``. A list or
    range with a single element collapses to that one register.
    """
    # "range" takes precedence when both keys are present
    dict_name = ""
    for key in ("list", "range"):
        if key in register_list:
            dict_name = key
    # Remove unnecessarily created dictionary entries during parsing
    rlist = [
        AttrDict.convert_dict(self.list_element.parseString(text, parseAll=True).asDict())
        for text in register_list[dict_name]
    ]
    index = register_list.get("index", None)
    new_dict = AttrDict({dict_name: rlist, "index": index})
    members = new_dict[dict_name]
    if len(members) == 1:
        return AttrDict({self.REGISTER_ID: members[0]})
    return AttrDict({self.REGISTER_ID: new_dict})
def process_label(self, label):
    """Flatten the label's nested identifier so that ``name`` holds the plain name, then wrap it under the label ID."""
    # the identifier grammar adds an extra 'name' level; keep only the inner one
    nested = label["name"]
    label["name"] = nested["name"]
    return AttrDict({self.LABEL_ID: label})
def process_identifier(self, identifier):
    """Drop a top-level ``value`` entry from the identifier (in place) and wrap it under the identifier ID."""
    # remove value if it consists of symbol+offset
    identifier.pop("value", None)
    return AttrDict({self.IDENTIFIER_ID: identifier})
def get_full_reg_name(self, register):
    """
    Return one register name string including all attributes.

    The result is ``<prefix><name>``, followed by ``.<lanes><shape>`` when the
    register has a shape and ``[<index>]`` when it has a lane index.

    :param dict register: register dictionary with at least ``prefix`` and ``name``
    :returns: `str` -- full register name
    """
    name = register["prefix"] + str(register["name"])
    if "shape" in register:
        name += "." + str(register.get("lanes", "")) + register["shape"]
    # The index may be a string (as parsed) or an int (as stored by
    # resolve_range_list), so convert explicitly; a None index is skipped.
    index = register.get("index")
    if index is not None:
        name += "[" + str(index) + "]"
    return name
def normalize_imd(self, imd):
    """
    Normalize immediate to decimal based representation.

    String values are converted with ``int(value, 0)``, other values are
    returned as they are, ``float``/``double`` entries go through
    ``ieee_to_float`` and anything else (an identifier) is returned unchanged.
    """
    if "value" in imd:
        raw = imd["value"]
        # strings may be hex or bin, return decimal
        return int(raw, 0) if isinstance(raw, str) else raw
    for kind in ("float", "double"):
        if kind in imd:
            return self.ieee_to_float(imd[kind])
    # identifier
    return imd
def ieee_to_float(self, ieee_val):
    """
    Convert IEEE representation to python float.

    :param dict ieee_val: dictionary with a ``mantissa`` string and, optionally,
        ``exponent`` (decimal digits) and ``e_sign`` (``"-"`` for a negative
        exponent)
    :returns: `float` -- ``mantissa * 10**exponent``
    """
    mantissa = float(ieee_val["mantissa"])
    # The exponent part is optional in the float/double grammar, so a
    # mantissa-only value must not raise.
    exponent_digits = ieee_val.get("exponent")
    if exponent_digits is None:
        return mantissa
    exponent = int(exponent_digits, 10)
    if ieee_val.get("e_sign") == "-":
        exponent *= -1
    return mantissa * (10**exponent)
def parse_register(self, register_string):
    """Not implemented for this parser; always raises ``NotImplementedError``."""
    raise NotImplementedError()
def is_gpr(self, register):
    """
    Check if register is a general purpose register.

    A register counts as general purpose when its prefix is ``w`` or ``x``,
    compared case-insensitively because the scalar grammar accepts upper-case
    prefixes as well.

    :param dict register: register dictionary with a ``prefix`` entry
    :returns: `bool`
    """
    prefix = register["prefix"]
    # Exact membership instead of a substring test, so "" or "wx" do not match.
    return isinstance(prefix, str) and prefix.lower() in ("w", "x")
def is_vector_register(self, register):
    """
    Check if register is a vector register.

    A register counts as a vector register when its prefix is one of
    ``b h s d q v z``, compared case-insensitively because the scalar grammar
    accepts upper-case prefixes as well.

    :param dict register: register dictionary with a ``prefix`` entry
    :returns: `bool`
    """
    prefix = register["prefix"]
    # Exact membership instead of a substring test, so "" or "vz" do not match.
    return isinstance(prefix, str) and prefix.lower() in ("b", "h", "s", "d", "q", "v", "z")
def is_flag_dependend_of(self, flag_a, flag_b):
    """Check if ``flag_a`` is dependent on ``flag_b``, i.e. whether both carry the same name."""
    # we assume flags are independent of each other, e.g., CF can be read while ZF gets written
    # TODO validate this assumption
    same_flag = flag_a["name"] == flag_b["name"]
    return True if same_flag else False
def is_reg_dependend_of(self, reg_a, reg_b):
    """
    Check if ``reg_a`` is dependent on ``reg_b``.

    Two registers are dependent when they share the same name and both
    prefixes (compared in lower case) belong to the same family: general
    purpose (``w``/``x``) or vector (``b h s d q v z``).
    """
    if reg_a["name"] != reg_b["name"]:
        return False
    # general purpose prefixes first, then vector prefixes
    for family in ("wx", "bhsdqvz"):
        if reg_a["prefix"].lower() in family and reg_b["prefix"].lower() in family:
            return True
    return False
def get_reg_type(self, register):
    """Get register type, which is the register's ``prefix`` entry."""
    reg_type = register["prefix"]
    return reg_type