blob: 7dcefc011ae81b155604c2e78432e75231cbb4ae [file] [log] [blame]
# coding=utf8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utility functions to handle patches."""
import posixpath
import os
import re
class UnsupportedPatchFormat(Exception):
  """Error raised when a patch for a file can't be processed.

  Attributes:
    filename: name of the file the patch applies to.
    status: optional explanation appended to the message; may be falsy.
  """

  def __init__(self, filename, status):
    self.filename = filename
    self.status = status
    # Forward both values so they also show up in self.args.
    super(UnsupportedPatchFormat, self).__init__(filename, status)

  def __str__(self):
    message = 'Can\'t process patch for file %s.' % self.filename
    if not self.status:
      return message
    # The explanation goes on its own line below the generic message.
    return message + '\n%s' % self.status
class FilePatchBase(object):
  """Base description of one file touched by a patch.

  File names always use '/' as the separator, never os.sep. This class must be
  subclassed; instantiating it directly trips an assert.
  """
  is_delete = False
  is_binary = False
  is_new = False

  def __init__(self, filename):
    assert self.__class__ is not FilePatchBase
    self.filename = self._process_filename(filename)
    # Only set for a copy or a move.
    self.source_filename = None

  @property
  def filename_utf8(self):
    """self.filename encoded as UTF-8."""
    return self.filename.encode('utf-8')

  @property
  def source_filename_utf8(self):
    """self.source_filename encoded as UTF-8, or None when it is None."""
    if self.source_filename is None:
      return None
    return self.source_filename.encode('utf-8')

  @staticmethod
  def _process_filename(filename):
    """Normalizes backslashes to '/' and rejects unsupported file names.

    Raises:
      UnsupportedPatchFormat: if the name contains a blacklisted token or
          starts with a rejected prefix.
    """
    normalized = filename.replace('\\', '/')
    # A handful of tokens are refused outright to keep things simple.
    forbidden_tokens = ('$', '..', '\'', '"', '<', '>', ':', '|', '?', '*')
    for token in forbidden_tokens:
      if token in normalized:
        raise UnsupportedPatchFormat(
            normalized, 'Can\'t use \'%s\' in filename.' % token)
    forbidden_prefixes = ('/', 'CON', 'COM')
    for prefix in forbidden_prefixes:
      if normalized.startswith(prefix):
        raise UnsupportedPatchFormat(
            normalized, 'Filename can\'t start with \'%s\'.' % prefix)
    return normalized

  def set_relpath(self, relpath):
    """Prepends relpath to filename and, when set, to source_filename.

    A falsy relpath is a no-op. A relpath starting with '/' is rejected.
    """
    if not relpath:
      return
    base = relpath.replace('\\', '/')
    if base.startswith('/'):
      self._fail('Relative path starts with %s' % base[0])
    joined = posixpath.join(base, self.filename)
    self.filename = self._process_filename(joined)
    if self.source_filename:
      joined_source = posixpath.join(base, self.source_filename)
      self.source_filename = self._process_filename(joined_source)

  def _fail(self, msg):
    """Raises UnsupportedPatchFormat for this file with msg as the status."""
    raise UnsupportedPatchFormat(self.filename, msg)

  def __str__(self):
    # One column per flag, like a status board: Binary, Delete, New, Rename.
    flags = ''.join([
        'B' if self.is_binary else ' ',
        'D' if self.is_delete else ' ',
        'N' if self.is_new else ' ',
        'R' if self.source_filename else ' ',
    ])
    out = flags + ' '
    if self.source_filename:
      out += '%s->' % self.source_filename_utf8
    return out + self.filename_utf8

  def dump(self):
    """Returns a verbose description of the patch, to help diagnosing."""
    return str(self)
class FilePatchDelete(FilePatchBase):
  """Patch entry that removes a file."""
  # Every instance of this class is a deletion.
  is_delete = True

  def __init__(self, filename, is_binary):
    """Records the deleted file's name and whether it is binary."""
    super(FilePatchDelete, self).__init__(filename)
    # Overrides the class-level default with the caller-provided value.
    self.is_binary = is_binary
class FilePatchBinary(FilePatchBase):
  """Content of a new binary file.

  Attributes:
    data: the file content as given by the caller.
    svn_properties: list of properties; an empty list when none were given.
    is_new: caller-provided flag telling whether the file is being added.
  """
  is_binary = True

  def __init__(self, filename, data, svn_properties, is_new):
    super(FilePatchBinary, self).__init__(filename)
    self.data = data
    # Normalize None (or any falsy value) to a fresh empty list.
    self.svn_properties = svn_properties or []
    self.is_new = is_new

  def get(self):
    """Returns the file content."""
    return self.data

  def __str__(self):
    # Call the parent's __str__ explicitly: str(super(...)) would format the
    # super proxy object itself ('<super: ...>') because implicit special
    # method lookups bypass the proxy.
    base = super(FilePatchBinary, self).__str__()
    return base + ' %d bytes' % len(self.data)
class Hunk(object):
  """Container for the data parsed out of a single hunk.

  Attributes:
    start_src, lines_src: start line and line count on the source side.
    start_dst, lines_dst: start line and line count on the destination side.
    variation: lines_dst - lines_src, computed once at construction.
    text: list the parser appends the hunk's lines to; starts empty.
  """

  def __init__(self, start_src, lines_src, start_dst, lines_dst):
    self.start_src = start_src
    self.lines_src = lines_src
    self.start_dst = start_dst
    self.lines_dst = lines_dst
    self.variation = lines_dst - lines_src
    self.text = []

  def __repr__(self):
    source = (self.start_src, self.lines_src)
    destination = (self.start_dst, self.lines_dst)
    values = (self.__class__.__name__,) + source + destination
    return '%s<(%d, %d) to (%d, %d)>' % values
class FilePatchDiff(FilePatchBase):
  """Patch for a single file.

  Attributes:
    diff_header: header part of the diff, as a single string.
    diff_hunks: hunk part of the diff, as a single string.
    svn_properties: list of (name, value) properties; git file modes may add
        an 'svn:executable' entry to it.
    is_git_diff: True when the header contains a 'diff --git' line.
    patchlevel: number of leading path components mangle() strips.
    hunks: list of Hunk objects parsed out of diff_hunks.
  """

  def __init__(self, filename, diff, svn_properties):
    """Parses and sanity checks diff.

    Raises:
      UnsupportedPatchFormat: if the diff is empty or fails verification.
    """
    super(FilePatchDiff, self).__init__(filename)
    if not diff:
      self._fail('File doesn\'t have a diff.')
    self.diff_header, self.diff_hunks = self._split_header(diff)
    self.svn_properties = svn_properties or []
    self.is_git_diff = self._is_git_diff_header(self.diff_header)
    self.patchlevel = 0
    if self.is_git_diff:
      self._verify_git_header()
    else:
      self._verify_svn_header()
    self.hunks = self._split_hunks()
    if self.source_filename and not self.is_new:
      self._fail('If source_filename is set, is_new must be also be set')

  def get(self, for_git):
    """Returns the diff text (header followed by hunks).

    When for_git is false and the patch has a source file name, every
    occurrence of the source file name in the header is replaced by the
    destination file name.
    """
    if for_git or not self.source_filename:
      return self.diff_header + self.diff_hunks
    else:
      # patch is stupid. It patches the source_filename instead so get rid of
      # any source_filename reference if needed.
      return (
          self.diff_header.replace(
              self.source_filename_utf8, self.filename_utf8) +
          self.diff_hunks)

  def set_relpath(self, relpath):
    """Offsets the file names by relpath and rewrites the header to match."""
    old_filename = self.filename_utf8
    old_source_filename = self.source_filename_utf8 or self.filename_utf8
    super(FilePatchDiff, self).set_relpath(relpath)
    # Update the header too.
    filename = self.filename_utf8
    source_filename = self.source_filename_utf8 or self.filename_utf8
    lines = self.diff_header.splitlines(True)
    for i, line in enumerate(lines):
      if line.startswith('diff --git'):
        # Note that the 'a/' and 'b/' prefixes are part of what gets replaced,
        # so they are not present in the rewritten line.
        lines[i] = line.replace(
            'a/' + old_source_filename, source_filename).replace(
                'b/' + old_filename, filename)
      elif re.match(r'^\w+ from .+$', line) or line.startswith('---'):
        lines[i] = line.replace(old_source_filename, source_filename)
      elif re.match(r'^\w+ to .+$', line) or line.startswith('+++'):
        lines[i] = line.replace(old_filename, filename)
    self.diff_header = ''.join(lines)

  def _split_header(self, diff):
    """Splits a diff in two: the header and the hunks.

    Returns:
      A (header, hunks) tuple of strings.

    Raises:
      UnsupportedPatchFormat: if '--- ' isn't followed by '+++ ', or if the
          first line after the header doesn't start with '@@ '.
    """
    header = []
    hunks = diff.splitlines(True)
    # Everything up to and including the first '--- ' line is header. Some
    # diffs may not have a ---/+++ set, like a git rename with no change or a
    # svn diff with only property change; then every line ends up in header.
    while hunks:
      header.append(hunks.pop(0))
      if header[-1].startswith('--- '):
        break

    if hunks:
      if not hunks[0].startswith('+++ '):
        self._fail('Inconsistent header')
      header.append(hunks.pop(0))
      if hunks:
        if not hunks[0].startswith('@@ '):
          self._fail('Inconsistent hunk header')

    # Mangle any \\ in the header to /.
    header_lines = ('Index:', 'diff', 'copy', 'rename', '+++', '---')
    basename = os.path.basename(self.filename_utf8)
    for i, line in enumerate(header):
      if line.split(' ', 1)[0] in header_lines or line.endswith(basename):
        header[i] = line.replace('\\', '/')
    return ''.join(header), ''.join(hunks)

  @staticmethod
  def _is_git_diff_header(diff_header):
    """Returns True if the diff for a single files was generated with git."""
    # Delete: http://codereview.chromium.org/download/issue6368055_22_29.diff
    # Rename partial change:
    # http://codereview.chromium.org/download/issue6250123_3013_6010.diff
    # Rename no change:
    # http://codereview.chromium.org/download/issue6287022_3001_4010.diff
    return any(l.startswith('diff --git') for l in diff_header.splitlines())

  def _parse_hunk_range(self, value):
    """Parses one side of a hunk header range, e.g. '12' or '12,3'.

    Arguments:
      value: text made only of digits and commas, as captured from the hunk
          header line.

    Returns:
      A (start, line_count) tuple of ints; line_count is 1 when omitted.

    Raises:
      UnsupportedPatchFormat: if value isn't of the form 'N' or 'N,M'.
    """
    parts = value.split(',')
    # An empty part (',' or '1,') would make int() raise ValueError, so reject
    # it here along with ranges that have more than one comma.
    if len(parts) > 2 or not all(parts):
      self._fail('Hunk header is malformed')
    if len(parts) == 1:
      return int(parts[0]), 1
    return int(parts[0]), int(parts[1])

  def _split_hunks(self):
    """Splits the hunks and does verification.

    May set self.is_new or self.is_delete based on the hunk header. The start
    lines stored in the returned Hunk objects are decremented by one compared
    to the header values.

    Returns:
      The list of Hunk objects.

    Raises:
      UnsupportedPatchFormat: if a hunk header is unparsable, malformed, out
          of order or inconsistent with the hunk content.
    """
    hunks = []
    for line in self.diff_hunks.splitlines(True):
      if line.startswith('@@'):
        match = re.match(r'^@@ -([\d,]+) \+([\d,]+) @@.*$', line)
        # File add will result in "-0,0 +1" but file deletion will result in
        # "-1,N +0,0" where N is the number of lines deleted. That's from diff
        # and svn diff. git diff doesn't exhibit this behavior.
        # svn diff for a single line file rewrite "@@ -1 +1 @@". Fun.
        # "@@ -1 +1,N @@" is also valid where N is the length of the new file.
        if not match:
          self._fail('Hunk header is unparsable')
        start_src, lines_src = self._parse_hunk_range(match.group(1))
        start_dst, lines_dst = self._parse_hunk_range(match.group(2))
        new_hunk = Hunk(start_src, lines_src, start_dst, lines_dst)
        if hunks:
          if new_hunk.start_src <= hunks[-1].start_src:
            self._fail('Hunks source lines are not ordered')
          if new_hunk.start_dst <= hunks[-1].start_dst:
            self._fail('Hunks destination lines are not ordered')
        hunks.append(new_hunk)
        continue
      # _split_header() verified that the first line starts with '@@ ', so
      # there is always a current hunk here.
      hunks[-1].text.append(line)

    if len(hunks) == 1:
      if hunks[0].start_src == 0 and hunks[0].lines_src == 0:
        self.is_new = True
      if hunks[0].start_dst == 0 and hunks[0].lines_dst == 0:
        self.is_delete = True

    if self.is_new and self.is_delete:
      self._fail('Hunk header is all 0')

    if not self.is_new and not self.is_delete:
      for hunk in hunks:
        # The number of added minus removed lines must match the header.
        variation = (
            len([1 for i in hunk.text if i.startswith('+')]) -
            len([1 for i in hunk.text if i.startswith('-')]))
        if variation != hunk.variation:
          self._fail(
              'Hunk header is incorrect: %d vs %d; %r' % (
                variation, hunk.variation, hunk))
        if not hunk.start_src:
          self._fail(
              'Hunk header start line is incorrect: %d' % hunk.start_src)
        if not hunk.start_dst:
          self._fail(
              'Hunk header start line is incorrect: %d' % hunk.start_dst)
        hunk.start_src -= 1
        hunk.start_dst -= 1
    if self.is_new and hunks:
      hunks[0].start_dst -= 1
    if self.is_delete and hunks:
      hunks[0].start_src -= 1
    return hunks

  def mangle(self, string):
    """Mangle a file path.

    Converts backslashes to '/' and drops the first self.patchlevel path
    components.
    """
    return '/'.join(string.replace('\\', '/').split('/')[self.patchlevel:])

  def _verify_git_header(self):
    """Sanity checks the header.

    Expects the following format:

    <garbage>
    diff --git (|a/)<filename> (|b/)<filename>
    <similarity>
    <filemode changes>
    <index>
    <copy|rename from>
    <copy|rename to>
    --- <filename>
    +++ <filename>

    Everything is optional except the diff --git line.

    Raises:
      UnsupportedPatchFormat: if the header doesn't match expectations.
    """
    lines = self.diff_header.splitlines()

    # Verify the diff --git line.
    old = None
    new = None
    while lines:
      match = re.match(r'^diff \-\-git (.*?) (.*)$', lines.pop(0))
      if not match:
        continue
      if match.group(1).startswith('a/') and match.group(2).startswith('b/'):
        self.patchlevel = 1
      old = self.mangle(match.group(1))
      new = self.mangle(match.group(2))

      # The rename is about the new file so the old file can be anything.
      if new not in (self.filename_utf8, 'dev/null'):
        self._fail('Unexpected git diff output name %s.' % new)
      if old == 'dev/null' and new == 'dev/null':
        self._fail('Unexpected /dev/null git diff.')
      break

    if not old or not new:
      self._fail('Unexpected git diff; couldn\'t find git header.')

    if old not in (self.filename_utf8, 'dev/null'):
      # Copy or rename.
      self.source_filename = old.decode('utf-8')
      self.is_new = True

    # Process the lines following the diff --git line one by one.
    last_line = ''
    while lines:
      line = lines.pop(0)
      self._verify_git_header_process_line(lines, line, last_line)
      last_line = line

    # Cheap check to make sure the file name is at least mentioned in the
    # 'diff' header. That the only remaining invariant.
    if self.filename_utf8 not in self.diff_header:
      self._fail('Diff seems corrupted.')

  def _verify_git_header_process_line(self, lines, line, last_line):
    """Processes a single line of the header.

    Arguments:
      lines: header lines remaining after line.
      line: the header line to process.
      last_line: the header line processed just before, '' for the first one.

    Updates is_new, is_delete and svn_properties as a side effect and returns
    nothing. Lines that match no known pattern are ignored.

    Format is described to
    http://www.kernel.org/pub/software/scm/git/docs/git-diff.html

    Raises:
      UnsupportedPatchFormat: if the line is inconsistent with the patch.
    """
    match = re.match(r'^(rename|copy) from (.+)$', line)
    old = self.source_filename_utf8 or self.filename_utf8
    if match:
      if old != match.group(2):
        self._fail('Unexpected git diff input name for line %s.' % line)
      if not lines or not lines[0].startswith('%s to ' % match.group(1)):
        self._fail(
            'Confused %s from/to git diff for line %s.' %
                (match.group(1), line))
      return

    match = re.match(r'^(rename|copy) to (.+)$', line)
    if match:
      if self.filename_utf8 != match.group(2):
        self._fail('Unexpected git diff output name for line %s.' % line)
      if not last_line.startswith('%s from ' % match.group(1)):
        self._fail(
            'Confused %s from/to git diff for line %s.' %
                (match.group(1), line))
      return

    match = re.match(r'^deleted file mode (\d{6})$', line)
    if match:
      # It is necessary to parse it because there may be no hunk, like when the
      # file was empty.
      self.is_delete = True
      return

    match = re.match(r'^new(| file) mode (\d{6})$', line)
    if match:
      mode = match.group(2)
      # Only look at owner ACL for executable. In a 6-digit mode such as
      # 100755 the owner permission digit is at index 3.
      if bool(int(mode[3]) & 1):
        self.svn_properties.append(('svn:executable', '.'))
      elif not self.source_filename and self.is_new:
        # It's a new file, not from a rename/copy, then there's no property to
        # delete.
        self.svn_properties.append(('svn:executable', None))
      return

    match = re.match(r'^--- (.*)$', line)
    if match:
      if last_line[:3] in ('---', '+++'):
        self._fail('--- and +++ are reversed')
      if match.group(1) == '/dev/null':
        self.is_new = True
      elif self.mangle(match.group(1)) != old:
        # git patches are always well formatted, do not allow random filenames.
        self._fail('Unexpected git diff: %s != %s.' % (old, match.group(1)))
      if not lines or not lines[0].startswith('+++'):
        self._fail('Missing git diff output name.')
      return

    match = re.match(r'^\+\+\+ (.*)$', line)
    if match:
      if not last_line.startswith('---'):
        self._fail('Unexpected git diff: --- not following +++.')
      if '/dev/null' == match.group(1):
        self.is_delete = True
      elif self.filename_utf8 != self.mangle(match.group(1)):
        self._fail(
            'Unexpected git diff: %s != %s.' % (self.filename, match.group(1)))
      if lines:
        self._fail('Crap after +++')
      # We're done.
      return

  def _verify_svn_header(self):
    """Sanity checks the header.

    A svn diff can contain only property changes, in that case there will be no
    proper header. To make things worse, this property change header is
    localized.

    Raises:
      UnsupportedPatchFormat: if the header doesn't match expectations.
    """
    lines = self.diff_header.splitlines()
    last_line = ''

    while lines:
      line = lines.pop(0)
      self._verify_svn_header_process_line(lines, line, last_line)
      last_line = line

    # Cheap check to make sure the file name is at least mentioned in the
    # 'diff' header. That the only remaining invariant.
    if self.filename_utf8 not in self.diff_header:
      self._fail('Diff seems corrupted.')

  def _verify_svn_header_process_line(self, lines, line, last_line):
    """Processes a single line of the header.

    Arguments:
      lines: header lines remaining after line.
      line: the header line to process.
      last_line: the header line processed just before, '' for the first one.

    Updates is_new, is_delete and source_filename as a side effect and
    returns nothing. Lines other than '--- ' and '+++ ' are ignored.

    Raises:
      UnsupportedPatchFormat: if the line is inconsistent with the patch.
    """
    # The file name stops at the first tab; whatever follows is ignored.
    match = re.match(r'^--- ([^\t]+).*$', line)
    if match:
      if last_line[:3] in ('---', '+++'):
        self._fail('--- and +++ are reversed')
      if match.group(1) == '/dev/null':
        self.is_new = True
      elif self.mangle(match.group(1)) != self.filename_utf8:
        # guess the source filename.
        self.source_filename = match.group(1).decode('utf-8')
        self.is_new = True
      if not lines or not lines[0].startswith('+++'):
        self._fail('Nothing after header.')
      return

    match = re.match(r'^\+\+\+ ([^\t]+).*$', line)
    if match:
      if not last_line.startswith('---'):
        self._fail('Unexpected diff: --- not following +++.')
      if match.group(1) == '/dev/null':
        self.is_delete = True
      elif self.mangle(match.group(1)) != self.filename_utf8:
        self._fail('Unexpected diff: %s.' % match.group(1))
      if lines:
        self._fail('Crap after +++')
      # We're done.
      return

  def dump(self):
    """Dumps itself in a verbose way to help diagnosing."""
    return str(self) + '\n' + self.get(True)
class PatchSet(object):
  """An ordered collection of FilePatch* objects.

  Attributes:
    patches: the patches, sorted in the order they should be applied.
  """

  def __init__(self, patches):
    for candidate in patches:
      assert isinstance(candidate, FilePatchBase)
    ordered = list(patches)
    ordered.sort(key=self._application_order)
    self.patches = ordered

  @staticmethod
  def _application_order(patch):
    """Returns the sort key that orders patches by order of application.

    File moves come first.
    Deletes come last.
    """
    # Files *with* a source file must come first. None sorts before a string,
    # which is the reverse of what is needed, hence the explicit boolean:
    # False sorts before True.
    has_no_source = not patch.source_filename
    return (
        patch.is_delete,
        has_no_source,
        patch.source_filename_utf8,
        patch.filename_utf8)

  def set_relpath(self, relpath):
    """Offsets every patch into the subdirectory relpath."""
    for entry in self.patches:
      entry.set_relpath(relpath)

  def __iter__(self):
    for entry in self.patches:
      yield entry

  def __getitem__(self, key):
    return self.patches[key]

  @property
  def filenames(self):
    """List of the file name of each patch, in application order."""
    return [entry.filename for entry in self.patches]