build/android/gyp/extract_unwind_tables.py - cobalt - Git at Google

 #!/usr/bin/env python3
 # Copyright 2018 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Extracts the unwind tables in from breakpad symbol files

 Runs dump_syms on the given binary file and extracts the CFI data into the
 given output file.
 The output file is a binary file containing CFI rows ordered based on function
 address. The output file only contains rows that match the most popular rule
 type in CFI table, to reduce the output size and specify data in compact format.
 See doc https://github.com/google/breakpad/blob/master/docs/symbol_files.md.
 1. The CFA rules should be of postfix form "SP <val> +".
 2. The RA rules should be of postfix form "CFA <val> + ^".
 Note: breakpad represents dereferencing address with '^' operator.

 The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI
 format. The first table contains function addresses and an index into the
 UNW_DATA table. The second table contains one or more rows for the function
 unwind information.

 The output file starts with 4 bytes counting the number of entries in UNW_INDEX.
 Then UNW_INDEX table and UNW_DATA table.

 UNW_INDEX contains two columns of N rows each, where N is the number of
 functions.
   1. First column 4 byte rows of all the function start address as offset from
      start of the binary, in sorted order.
   2. For each function addr, the second column contains 2 byte indices in order.
      The indices are offsets (in count of 2 bytes) of the CFI data from start of
      UNW_DATA.
 The last entry in the table always contains CANT_UNWIND index to specify the
 end address of the last function.

 UNW_DATA contains data of all the functions. Each function data contains N rows.
 The data found at the address pointed from UNW_INDEX will be:
   2 bytes: N - number of rows that belong to current function.
   N * 4 bytes: N rows of data. 16 bits : Address offset from function start.
                                14 bits : CFA offset / 4.
                                 2 bits : RA offset / 4.

 The function is not added to the unwind table in following conditions:
 C1. If length of the function code (number of instructions) is greater than
     0xFFFF (2 byte address span). This is because we use 16 bits to refer to
     offset of instruction from start of the address.
 C2. If the function moves the SP by more than 0xFFFF bytes. This is because we
     use 14 bits to denote CFA offset (last 2 bits are 0).
 C3. If the Return Address is stored at an offset >= 16 from the CFA. Some
     functions which have variable arguments can have offset upto 16.
     TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since
     we never have 0.
 C4: Some functions do not have unwind information defined in dwarf info. These
     functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table.


 Usage:
   extract_unwind_tables.py --input_path [root path to unstripped chrome.so]
       --output_path [output path] --dump_syms_path [path to dump_syms binary]
 """

 import argparse
 import re
 import struct
 import subprocess
 import sys
 import tempfile


 _CFA_REG = '.cfa'
 _RA_REG = '.ra'

 _ADDR_ENTRY = 0
 _LENGTH_ENTRY = 1

 _CANT_UNWIND = 0xFFFF


 def _Write4Bytes(output_file, val):
   """Writes a 32 bit unsigned integer to the given output file."""
   output_file.write(struct.pack('<L', val));


 def _Write2Bytes(output_file, val):
   """Writes a 16 bit unsigned integer to the given output file."""
   output_file.write(struct.pack('<H', val));


 def _FindRuleForRegister(cfi_row, reg):
   """Returns the postfix expression as string for a given register.

   Breakpad CFI row format specifies rules for unwinding each register in postfix
   expression form separated by space. Each rule starts with register name and a
   colon. Eg: "CFI R1: <rule> R2: <rule>".
   """
   out = []
   found_register = False
   for part in cfi_row:
     if found_register:
       if part[-1] == ':':
         break
       out.append(part)
     elif part == reg + ':':
       found_register = True
   return ' '.join(out)


 def _GetCfaAndRaOffset(cfi_row):
   """Returns a tuple with 2 numbers (cfa_offset, ra_offset).

   Returns right values if rule matches the predefined criteria. Returns (0, 0)
   otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule
   is postfix form "CFA -<val> + ^".
   """
   cfa_offset = 0
   ra_offset = 0
   cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG)
   ra_rule = _FindRuleForRegister(cfi_row, _RA_REG)
   if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule):
     cfa_offset = int(cfa_rule.split()[1], 10)
   if ra_rule:
     if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule):
       return (0, 0)
     ra_offset = -1 * int(ra_rule.split()[1], 10)
   return (cfa_offset, ra_offset)


 def _GetAllCfiRows(symbol_file):
   """Returns parsed CFI data from given symbol_file.

   Each entry in the cfi data dictionary returned is a map from function start
   address to array of function rows, starting with FUNCTION type, followed by
   one or more CFI rows.
   """
   cfi_data = {}
   current_func = []
   for line in symbol_file:
     line = line.decode('utf8')
     if 'STACK CFI' not in line:
       continue

     parts = line.split()
     data = {}
     if parts[2] == 'INIT':
       # Add the previous function to the output
       if len(current_func) > 1:
         cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
       current_func = []

       # The function line is of format "STACK CFI INIT <addr> <length> ..."
       data[_ADDR_ENTRY] = int(parts[3], 16)
       data[_LENGTH_ENTRY] = int(parts[4], 16)

       # Condition C1: Skip if length is large.
       if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff:
         continue  # Skip the current function.
     else:
       # The current function is skipped.
       if len(current_func) == 0:
         continue

       # The CFI row is of format "STACK CFI <addr> .cfa: <expr> .ra: <expr> ..."
       data[_ADDR_ENTRY] = int(parts[2], 16)
       (data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts)

       # Condition C2 and C3: Skip based on limits on offsets.
       if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff:
         current_func = []
         continue
       assert data[_CFA_REG] % 4 == 0
       # Since we skipped functions with code size larger than 0xffff, we should
       # have no function offset larger than the same value.
       assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff

     if data[_ADDR_ENTRY] == 0:
       # Skip current function, delete all previous entries.
       current_func = []
       continue
     assert data[_ADDR_ENTRY] % 2 == 0
     current_func.append(data)

   # Condition C4: Skip function without CFI rows.
   if len(current_func) > 1:
     cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
   return cfi_data


 def _WriteCfiData(cfi_data, out_file):
   """Writes the CFI data in defined format to out_file."""
   # Stores the final data that will be written to UNW_DATA table, in order
   # with 2 byte items.
   unw_data = []

   # Represent all the CFI data of functions as set of numbers and map them to an
   # index in the |unw_data|. This index is later written to the UNW_INDEX table
   # for each function. This map is used to find index of the data for functions.
   data_to_index = {}
   # Store mapping between the functions to the index.
   func_addr_to_index = {}
   previous_func_end = 0
   for addr, function in sorted(cfi_data.items()):
     # Add an empty function entry when functions CFIs are missing between 2
     # functions.
     if previous_func_end != 0 and addr - previous_func_end  > 4:
       func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
     previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY]

     assert len(function) > 1
     func_data_arr = []
     func_data = 0
     # The first row contains the function address and length. The rest of the
     # rows have CFI data. Create function data array as given in the format.
     for row in function[1:]:
       addr_offset = row[_ADDR_ENTRY] - addr
       cfa_offset = (row[_CFA_REG]) | (row[_RA_REG] // 4)

       func_data_arr.append(addr_offset)
       func_data_arr.append(cfa_offset)

     # Consider all the rows in the data as one large integer and add it as a key
     # to the |data_to_index|.
     for data in func_data_arr:
       func_data = (func_data << 16) | data

     row_count = len(func_data_arr) // 2
     if func_data not in data_to_index:
       # When data is not found, create a new index = len(unw_data), and write
       # the data to |unw_data|.
       index = len(unw_data)
       data_to_index[func_data] = index
       unw_data.append(row_count)
       for row in func_data_arr:
         unw_data.append(row)
     else:
       # If the data was found, then use the same index for the function.
       index = data_to_index[func_data]
       assert row_count == unw_data[index]
     func_addr_to_index[addr] = data_to_index[func_data]

   # Mark the end end of last function entry.
   func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND

   # Write the size of UNW_INDEX file in bytes.
   _Write4Bytes(out_file, len(func_addr_to_index))

   # Write the UNW_INDEX table. First list of addresses and then indices.
   sorted_unw_index = sorted(func_addr_to_index.items())
   for addr, index in sorted_unw_index:
     _Write4Bytes(out_file, addr)
   for addr, index in sorted_unw_index:
     _Write2Bytes(out_file, index)

   # Write the UNW_DATA table.
   for data in unw_data:
     _Write2Bytes(out_file, data)


 def _ParseCfiData(sym_stream, output_path):
   cfi_data = _GetAllCfiRows(sym_stream)
   with open(output_path, 'wb') as out_file:
     _WriteCfiData(cfi_data, out_file)


 def main():
   parser = argparse.ArgumentParser()
   parser.add_argument(
       '--input_path', required=True,
       help='The input path of the unstripped binary')
   parser.add_argument(
       '--output_path', required=True,
       help='The path of the output file')
   parser.add_argument(
       '--dump_syms_path', required=True,
       help='The path of the dump_syms binary')

   args = parser.parse_args()
   cmd = ['./' + args.dump_syms_path, args.input_path]
   proc = subprocess.Popen(cmd, bufsize=-1, stdout=subprocess.PIPE)
   _ParseCfiData(proc.stdout, args.output_path)
   assert proc.wait() == 0

   return 0

 if __name__ == '__main__':
   sys.exit(main())
	#!/usr/bin/env python3
	# Copyright 2018 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Extracts the unwind tables in from breakpad symbol files

	Runs dump_syms on the given binary file and extracts the CFI data into the
	given output file.
	The output file is a binary file containing CFI rows ordered based on function
	address. The output file only contains rows that match the most popular rule
	type in CFI table, to reduce the output size and specify data in compact format.
	See doc https://github.com/google/breakpad/blob/master/docs/symbol_files.md.
	1. The CFA rules should be of postfix form "SP <val> +".
	2. The RA rules should be of postfix form "CFA <val> + ^".
	Note: breakpad represents dereferencing address with '^' operator.

	The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI
	format. The first table contains function addresses and an index into the
	UNW_DATA table. The second table contains one or more rows for the function
	unwind information.

	The output file starts with 4 bytes counting the number of entries in UNW_INDEX.
	Then UNW_INDEX table and UNW_DATA table.

	UNW_INDEX contains two columns of N rows each, where N is the number of
	functions.
	1. First column 4 byte rows of all the function start address as offset from
	start of the binary, in sorted order.
	2. For each function addr, the second column contains 2 byte indices in order.
	The indices are offsets (in count of 2 bytes) of the CFI data from start of
	UNW_DATA.
	The last entry in the table always contains CANT_UNWIND index to specify the
	end address of the last function.

	UNW_DATA contains data of all the functions. Each function data contains N rows.
	The data found at the address pointed from UNW_INDEX will be:
	2 bytes: N - number of rows that belong to current function.
	N * 4 bytes: N rows of data. 16 bits : Address offset from function start.
	14 bits : CFA offset / 4.
	2 bits : RA offset / 4.

	The function is not added to the unwind table in following conditions:
	C1. If length of the function code (number of instructions) is greater than
	0xFFFF (2 byte address span). This is because we use 16 bits to refer to
	offset of instruction from start of the address.
	C2. If the function moves the SP by more than 0xFFFF bytes. This is because we
	use 14 bits to denote CFA offset (last 2 bits are 0).
	C3. If the Return Address is stored at an offset >= 16 from the CFA. Some
	functions which have variable arguments can have offset upto 16.
	TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since
	we never have 0.
	C4: Some functions do not have unwind information defined in dwarf info. These
	functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table.


	Usage:
	extract_unwind_tables.py --input_path [root path to unstripped chrome.so]
	--output_path [output path] --dump_syms_path [path to dump_syms binary]
	"""

	import argparse
	import re
	import struct
	import subprocess
	import sys
	import tempfile


	_CFA_REG = '.cfa'
	_RA_REG = '.ra'

	_ADDR_ENTRY = 0
	_LENGTH_ENTRY = 1

	_CANT_UNWIND = 0xFFFF


	def _Write4Bytes(output_file, val):
	"""Writes a 32 bit unsigned integer to the given output file."""
	output_file.write(struct.pack('<L', val));


	def _Write2Bytes(output_file, val):
	"""Writes a 16 bit unsigned integer to the given output file."""
	output_file.write(struct.pack('<H', val));


	def _FindRuleForRegister(cfi_row, reg):
	"""Returns the postfix expression as string for a given register.

	Breakpad CFI row format specifies rules for unwinding each register in postfix
	expression form separated by space. Each rule starts with register name and a
	colon. Eg: "CFI R1: <rule> R2: <rule>".
	"""
	out = []
	found_register = False
	for part in cfi_row:
	if found_register:
	if part[-1] == ':':
	break
	out.append(part)
	elif part == reg + ':':
	found_register = True
	return ' '.join(out)


	def _GetCfaAndRaOffset(cfi_row):
	"""Returns a tuple with 2 numbers (cfa_offset, ra_offset).

	Returns right values if rule matches the predefined criteria. Returns (0, 0)
	otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule
	is postfix form "CFA -<val> + ^".
	"""
	cfa_offset = 0
	ra_offset = 0
	cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG)
	ra_rule = _FindRuleForRegister(cfi_row, _RA_REG)
	if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule):
	cfa_offset = int(cfa_rule.split()[1], 10)
	if ra_rule:
	if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule):
	return (0, 0)
	ra_offset = -1 * int(ra_rule.split()[1], 10)
	return (cfa_offset, ra_offset)


	def _GetAllCfiRows(symbol_file):
	"""Returns parsed CFI data from given symbol_file.

	Each entry in the cfi data dictionary returned is a map from function start
	address to array of function rows, starting with FUNCTION type, followed by
	one or more CFI rows.
	"""
	cfi_data = {}
	current_func = []
	for line in symbol_file:
	line = line.decode('utf8')
	if 'STACK CFI' not in line:
	continue

	parts = line.split()
	data = {}
	if parts[2] == 'INIT':
	# Add the previous function to the output
	if len(current_func) > 1:
	cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
	current_func = []

	# The function line is of format "STACK CFI INIT <addr> <length> ..."
	data[_ADDR_ENTRY] = int(parts[3], 16)
	data[_LENGTH_ENTRY] = int(parts[4], 16)

	# Condition C1: Skip if length is large.
	if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff:
	continue # Skip the current function.
	else:
	# The current function is skipped.
	if len(current_func) == 0:
	continue

	# The CFI row is of format "STACK CFI <addr> .cfa: <expr> .ra: <expr> ..."
	data[_ADDR_ENTRY] = int(parts[2], 16)
	(data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts)

	# Condition C2 and C3: Skip based on limits on offsets.
	if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff:
	current_func = []
	continue
	assert data[_CFA_REG] % 4 == 0
	# Since we skipped functions with code size larger than 0xffff, we should
	# have no function offset larger than the same value.
	assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff

	if data[_ADDR_ENTRY] == 0:
	# Skip current function, delete all previous entries.
	current_func = []
	continue
	assert data[_ADDR_ENTRY] % 2 == 0
	current_func.append(data)

	# Condition C4: Skip function without CFI rows.
	if len(current_func) > 1:
	cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
	return cfi_data


	def _WriteCfiData(cfi_data, out_file):
	"""Writes the CFI data in defined format to out_file."""
	# Stores the final data that will be written to UNW_DATA table, in order
	# with 2 byte items.
	unw_data = []

	# Represent all the CFI data of functions as set of numbers and map them to an
	# index in the \|unw_data\|. This index is later written to the UNW_INDEX table
	# for each function. This map is used to find index of the data for functions.
	data_to_index = {}
	# Store mapping between the functions to the index.
	func_addr_to_index = {}
	previous_func_end = 0
	for addr, function in sorted(cfi_data.items()):
	# Add an empty function entry when functions CFIs are missing between 2
	# functions.
	if previous_func_end != 0 and addr - previous_func_end > 4:
	func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
	previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY]

	assert len(function) > 1
	func_data_arr = []
	func_data = 0
	# The first row contains the function address and length. The rest of the
	# rows have CFI data. Create function data array as given in the format.
	for row in function[1:]:
	addr_offset = row[_ADDR_ENTRY] - addr
	cfa_offset = (row[_CFA_REG]) \| (row[_RA_REG] // 4)

	func_data_arr.append(addr_offset)
	func_data_arr.append(cfa_offset)

	# Consider all the rows in the data as one large integer and add it as a key
	# to the \|data_to_index\|.
	for data in func_data_arr:
	func_data = (func_data << 16) \| data

	row_count = len(func_data_arr) // 2
	if func_data not in data_to_index:
	# When data is not found, create a new index = len(unw_data), and write
	# the data to \|unw_data\|.
	index = len(unw_data)
	data_to_index[func_data] = index
	unw_data.append(row_count)
	for row in func_data_arr:
	unw_data.append(row)
	else:
	# If the data was found, then use the same index for the function.
	index = data_to_index[func_data]
	assert row_count == unw_data[index]
	func_addr_to_index[addr] = data_to_index[func_data]

	# Mark the end end of last function entry.
	func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND

	# Write the size of UNW_INDEX file in bytes.
	_Write4Bytes(out_file, len(func_addr_to_index))

	# Write the UNW_INDEX table. First list of addresses and then indices.
	sorted_unw_index = sorted(func_addr_to_index.items())
	for addr, index in sorted_unw_index:
	_Write4Bytes(out_file, addr)
	for addr, index in sorted_unw_index:
	_Write2Bytes(out_file, index)

	# Write the UNW_DATA table.
	for data in unw_data:
	_Write2Bytes(out_file, data)


	def _ParseCfiData(sym_stream, output_path):
	cfi_data = _GetAllCfiRows(sym_stream)
	with open(output_path, 'wb') as out_file:
	_WriteCfiData(cfi_data, out_file)


	def main():
	parser = argparse.ArgumentParser()
	parser.add_argument(
	'--input_path', required=True,
	help='The input path of the unstripped binary')
	parser.add_argument(
	'--output_path', required=True,
	help='The path of the output file')
	parser.add_argument(
	'--dump_syms_path', required=True,
	help='The path of the dump_syms binary')

	args = parser.parse_args()
	cmd = ['./' + args.dump_syms_path, args.input_path]
	proc = subprocess.Popen(cmd, bufsize=-1, stdout=subprocess.PIPE)
	_ParseCfiData(proc.stdout, args.output_path)
	assert proc.wait() == 0

	return 0

	if __name__ == '__main__':
	sys.exit(main())