Simplify mixed-line-ending hook
diff --git a/pre_commit_hooks/mixed_line_ending.py b/pre_commit_hooks/mixed_line_ending.py index 76512a7..301c654 100644 --- a/pre_commit_hooks/mixed_line_ending.py +++ b/pre_commit_hooks/mixed_line_ending.py
@@ -1,212 +1,83 @@ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + import argparse -import re -import sys - -from enum import Enum +import collections -class LineEnding(Enum): - CR = b'\r', 'cr', re.compile(b'\r(?!\n)', re.DOTALL) - CRLF = b'\r\n', 'crlf', re.compile(b'\r\n', re.DOTALL) - LF = b'\n', 'lf', re.compile(b'(?<!\r)\n', re.DOTALL) - - def __init__(self, string, opt_name, regex): - self.string = string - self.str_print = repr(string) - self.opt_name = opt_name - self.regex = regex +CRLF = b'\r\n' +LF = b'\n' +CR = b'\r' +# Prefer LF to CRLF to CR, but detect CRLF before LF +ALL_ENDINGS = (CR, CRLF, LF) +FIX_TO_LINE_ENDING = {'cr': CR, 'crlf': CRLF, 'lf': LF} -class MixedLineEndingOption(Enum): - AUTO = 'auto', None - NO = 'no', None - CRLF = LineEnding.CRLF.opt_name, LineEnding.CRLF - LF = LineEnding.LF.opt_name, LineEnding.LF - - def __init__(self, opt_name, line_ending_enum): - self.opt_name = opt_name - self.line_ending_enum = line_ending_enum +def _fix(filename, contents, ending): + new_contents = b''.join( + line.rstrip(b'\r\n') + ending for line in contents.splitlines(True) + ) + with open(filename, 'wb') as f: + f.write(new_contents) -class MixedLineDetection(Enum): - NOT_MIXED = 1, False, None - UNKNOWN = 2, False, None - MIXED_MOSTLY_CRLF = 3, True, LineEnding.CRLF - MIXED_MOSTLY_LF = 4, True, LineEnding.LF - MIXED_MOSTLY_CR = 5, True, LineEnding.CR +def fix_filename(filename, fix): + with open(filename, 'rb') as f: + contents = f.read() - def __init__(self, index, mle_found, line_ending_enum): - # TODO hack to prevent enum overriding - self.index = index - self.mle_found = mle_found - self.line_ending_enum = line_ending_enum + counts = collections.defaultdict(int) + for line in contents.splitlines(True): + for ending in ALL_ENDINGS: + if line.endswith(ending): + counts[ending] += 1 + break -ANY_LINE_ENDING_PATTERN = re.compile( - b'(' + LineEnding.CRLF.regex.pattern + - b'|' + LineEnding.LF.regex.pattern + - b'|' + LineEnding.CR.regex.pattern + b')', -) + # Some amount of mixed line endings + mixed = sum(bool(x) for x in counts.values()) > 1 + if fix == 'no' or (fix == 'auto' and not mixed): + return mixed -def mixed_line_ending(argv=None): - options = _parse_arguments(argv) + if fix == 'auto': + max_ending = LF + max_lines = 0 + # ordering is important here such that lf > crlf > cr + for ending_type in ALL_ENDINGS: + # also important, using >= to find a max that prefers the last + if counts[ending_type] >= max_lines: + max_ending = ending_type + max_lines = counts[ending_type] - filenames = options['filenames'] - fix_option = options['fix'] - - if fix_option == MixedLineEndingOption.NO: - return _process_no_fix(filenames) - elif fix_option == MixedLineEndingOption.AUTO: - return _process_fix_auto(filenames) - # when a line ending character is forced with --fix option + _fix(filename, contents, max_ending) + return 1 else: - return _process_fix_force(filenames, fix_option.line_ending_enum) + target_ending = FIX_TO_LINE_ENDING[fix] + # find if there are lines with *other* endings + del counts[target_ending] + other_endings = bool(sum(counts.values())) + if other_endings: + _fix(filename, contents, target_ending) + return other_endings -def _parse_arguments(argv=None): +def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( - '-f', - '--fix', - choices=[m.opt_name for m in MixedLineEndingOption], - default=MixedLineEndingOption.AUTO.opt_name, + '-f', '--fix', + choices=('auto', 'no') + tuple(FIX_TO_LINE_ENDING), + default='auto', help='Replace line ending with the specified. Default is "auto"', ) parser.add_argument('filenames', nargs='*', help='Filenames to fix') args = parser.parse_args(argv) - fix, = ( - member for name, member - in MixedLineEndingOption.__members__.items() - if member.opt_name == args.fix - ) - - options = { - 'fix': fix, 'filenames': args.filenames, - } - - return options - - -def _detect_line_ending(filename): - with open(filename, 'rb') as f: - buf = f.read() - - le_counts = {} - - for le_enum in LineEnding: - le_counts[le_enum] = len(le_enum.regex.findall(buf)) - - mixed = False - le_found_previously = False - most_le = None - max_le_count = 0 - - for le, le_count in le_counts.items(): - le_found_cur = le_count > 0 - - mixed |= le_found_previously and le_found_cur - le_found_previously |= le_found_cur - - if le_count == max_le_count: - most_le = None - elif le_count > max_le_count: - max_le_count = le_count - most_le = le - - if not mixed: - return MixedLineDetection.NOT_MIXED - - for mld in MixedLineDetection: - if ( - mld.line_ending_enum is not None and - mld.line_ending_enum == most_le - ): - return mld - - return MixedLineDetection.UNKNOWN - - -def _process_no_fix(filenames): - print('Checking if the files have mixed line ending.') - - mle_filenames = [] - for filename in filenames: - detect_result = _detect_line_ending(filename) - - if detect_result.mle_found: - mle_filenames.append(filename) - - mle_found = len(mle_filenames) > 0 - - if mle_found: - print( - 'The following files have mixed line endings:\n\t%s', - '\n\t'.join(mle_filenames), - ) - - return 1 if mle_found else 0 - - -def _process_fix_auto(filenames): - mle_found = False - - for filename in filenames: - detect_result = _detect_line_ending(filename) - - if detect_result == MixedLineDetection.NOT_MIXED: - print('The file %s has no mixed line ending', filename) - elif detect_result == MixedLineDetection.UNKNOWN: - print( - 'Could not define most frequent line ending in ' - 'file %s. File skiped.', filename, - ) - - mle_found = True - else: - le_enum = detect_result.line_ending_enum - - print( - 'The file %s has mixed line ending with a ' - 'majority of %s. Converting...', filename, le_enum.str_print, - ) - - _convert_line_ending(filename, le_enum.string) - mle_found = True - - print( - 'The file %s has been converted to %s line ending.', - filename, le_enum.str_print, - ) - - return 1 if mle_found else 0 - - -def _process_fix_force(filenames, line_ending_enum): - for filename in filenames: - _convert_line_ending(filename, line_ending_enum.string) - - print( - 'The file %s has been forced to %s line ending.', - filename, line_ending_enum.str_print, - ) - - return 1 - - -def _convert_line_ending(filename, line_ending): - with open(filename, 'rb+') as f: - bufin = f.read() - - # convert line ending - bufout = ANY_LINE_ENDING_PATTERN.sub(line_ending, bufin) - - # write the result in the file replacing the existing content - f.seek(0) - f.write(bufout) - f.truncate() + retv = 0 + for filename in args.filenames: + retv |= fix_filename(filename, args.fix) + return retv if __name__ == '__main__': - sys.exit(mixed_line_ending()) + exit(main())
diff --git a/setup.py b/setup.py index 432c19a..2563095 100644 --- a/setup.py +++ b/setup.py
@@ -31,7 +31,6 @@ 'simplejson', 'six', ], - extras_require={':python_version=="2.7"': ['enum34']}, entry_points={ 'console_scripts': [ 'autopep8-wrapper = pre_commit_hooks.autopep8_wrapper:main',
diff --git a/tests/mixed_line_ending_test.py b/tests/mixed_line_ending_test.py index 20fd22c..808295b 100644 --- a/tests/mixed_line_ending_test.py +++ b/tests/mixed_line_ending_test.py
@@ -1,154 +1,103 @@ +from __future__ import absolute_import +from __future__ import unicode_literals + import pytest -from pre_commit_hooks.mixed_line_ending import mixed_line_ending - -# Input, expected return value, expected output -TESTS_FIX_AUTO = ( - # only 'LF' - (b'foo\nbar\nbaz\n', 0, b'foo\nbar\nbaz\n'), - # only 'CRLF' - (b'foo\r\nbar\r\nbaz\r\n', 0, b'foo\r\nbar\r\nbaz\r\n'), - # only 'CR' - (b'foo\rbar\rbaz\r', 0, b'foo\rbar\rbaz\r'), - # mixed with majority of 'LF' - (b'foo\r\nbar\nbaz\n', 1, b'foo\nbar\nbaz\n'), - # mixed with majority of 'CRLF' - (b'foo\r\nbar\nbaz\r\n', 1, b'foo\r\nbar\r\nbaz\r\n'), - # mixed with majority of 'CR' - (b'foo\rbar\nbaz\r', 1, b'foo\rbar\rbaz\r'), - # mixed with as much 'LF' as 'CRLF' - (b'foo\r\nbar\nbaz', 1, b'foo\r\nbar\nbaz'), - # mixed with as much 'LF' as 'CR' - (b'foo\rbar\nbaz', 1, b'foo\rbar\nbaz'), - # mixed with as much 'CRLF' as 'CR' - (b'foo\r\nbar\nbaz', 1, b'foo\r\nbar\nbaz'), - # mixed with as much 'CRLF' as 'LF' as 'CR' - (b'foo\r\nbar\nbaz\r', 1, b'foo\r\nbar\nbaz\r'), -) +from pre_commit_hooks.mixed_line_ending import main @pytest.mark.parametrize( - ('input_s', 'expected_retval', 'output'), - TESTS_FIX_AUTO, + ('input_s', 'output'), + ( + # mixed with majority of 'LF' + (b'foo\r\nbar\nbaz\n', b'foo\nbar\nbaz\n'), + # mixed with majority of 'CRLF' + (b'foo\r\nbar\nbaz\r\n', b'foo\r\nbar\r\nbaz\r\n'), + # mixed with majority of 'CR' + (b'foo\rbar\nbaz\r', b'foo\rbar\rbaz\r'), + # mixed with as much 'LF' as 'CRLF' + (b'foo\r\nbar\n', b'foo\nbar\n'), + # mixed with as much 'LF' as 'CR' + (b'foo\rbar\n', b'foo\nbar\n'), + # mixed with as much 'CRLF' as 'CR' + (b'foo\r\nbar\r', b'foo\r\nbar\r\n'), + # mixed with as much 'CRLF' as 'LF' as 'CR' + (b'foo\r\nbar\nbaz\r', b'foo\nbar\nbaz\n'), + ), ) -def test_mixed_line_ending_fix_auto(input_s, expected_retval, output, tmpdir): +def test_mixed_line_ending_fixes_auto(input_s, output, tmpdir): path = tmpdir.join('file.txt') path.write_binary(input_s) - ret = mixed_line_ending(('--fix=auto', path.strpath)) + ret = main((path.strpath,)) - assert ret == expected_retval + assert ret == 1 assert path.read_binary() == output -# Input, expected return value, expected output -TESTS_NO_FIX = ( - # only 'LF' - (b'foo\nbar\nbaz\n', 0, b'foo\nbar\nbaz\n'), - # only 'CRLF' - (b'foo\r\nbar\r\nbaz\r\n', 0, b'foo\r\nbar\r\nbaz\r\n'), - # only 'CR' - (b'foo\rbar\rbaz\r', 0, b'foo\rbar\rbaz\r'), - # mixed with majority of 'LF' - (b'foo\r\nbar\nbaz\n', 1, b'foo\r\nbar\nbaz\n'), - # mixed with majority of 'CRLF' - (b'foo\r\nbar\nbaz\r\n', 1, b'foo\r\nbar\nbaz\r\n'), - # mixed with majority of 'CR' - (b'foo\rbar\nbaz\r', 1, b'foo\rbar\nbaz\r'), - # mixed with as much 'LF' as 'CR' - (b'foo\rbar\nbaz', 0, b'foo\rbar\nbaz'), - # mixed with as much 'CRLF' as 'CR' - (b'foo\r\nbar\nbaz', 0, b'foo\r\nbar\nbaz'), - # mixed with as much 'CRLF' as 'LF' as 'CR' - (b'foo\r\nbar\nbaz\r', 0, b'foo\r\nbar\nbaz\r'), -) +def test_non_mixed_no_newline_end_of_file(tmpdir): + path = tmpdir.join('f.txt') + path.write_binary(b'foo\nbar\nbaz') + assert not main((path.strpath,)) + # the hook *could* fix the end of the file, but leaves it alone + # this is mostly to document the current behaviour + assert path.read_binary() == b'foo\nbar\nbaz' + + +def test_mixed_no_newline_end_of_file(tmpdir): + path = tmpdir.join('f.txt') + path.write_binary(b'foo\r\nbar\nbaz') + assert main((path.strpath,)) + # the hook rewrites the end of the file, this is slightly inconsistent + # with the non-mixed case but I think this is the better behaviour + # this is mostly to document the current behaviour + assert path.read_binary() == b'foo\nbar\nbaz\n' @pytest.mark.parametrize( - ('input_s', 'expected_retval', 'output'), - TESTS_NO_FIX, + ('fix_option', 'input_s'), + ( + # All --fix=auto with uniform line endings should be ok + ('--fix=auto', b'foo\r\nbar\r\nbaz\r\n'), + ('--fix=auto', b'foo\rbar\rbaz\r'), + ('--fix=auto', b'foo\nbar\nbaz\n'), + # --fix=crlf with crlf endings + ('--fix=crlf', b'foo\r\nbar\r\nbaz\r\n'), + # --fix=lf with lf endings + ('--fix=lf', b'foo\nbar\nbaz\n'), + ), ) -def test_detect_mixed_line_ending(input_s, expected_retval, output, tmpdir): - path = tmpdir.join('file.txt') +def test_line_endings_ok(fix_option, input_s, tmpdir): + path = tmpdir.join('input.txt') path.write_binary(input_s) - ret = mixed_line_ending(('--fix=no', path.strpath)) + ret = main((fix_option, path.strpath)) - assert ret == expected_retval - assert path.read_binary() == output + assert ret == 0 + assert path.read_binary() == input_s -# Input, expected return value, expected output -TESTS_FIX_FORCE_LF = ( - # only 'LF' - (b'foo\nbar\nbaz\n', 1, b'foo\nbar\nbaz\n'), - # only 'CRLF' - (b'foo\r\nbar\r\nbaz\r\n', 1, b'foo\nbar\nbaz\n'), - # only 'CR' - (b'foo\rbar\rbaz\r', 1, b'foo\nbar\nbaz\n'), - # mixed with majority of 'LF' - (b'foo\r\nbar\nbaz\n', 1, b'foo\nbar\nbaz\n'), - # mixed with majority of 'CRLF' - (b'foo\r\nbar\nbaz\r\n', 1, b'foo\nbar\nbaz\n'), - # mixed with majority of 'CR' - (b'foo\rbar\nbaz\r', 1, b'foo\nbar\nbaz\n'), - # mixed with as much 'LF' as 'CR' - (b'foo\rbar\nbaz', 1, b'foo\nbar\nbaz'), - # mixed with as much 'CRLF' as 'CR' - (b'foo\r\nbar\nbaz', 1, b'foo\nbar\nbaz'), - # mixed with as much 'CRLF' as 'LF' as 'CR' - (b'foo\r\nbar\nbaz\r', 1, b'foo\nbar\nbaz\n'), -) +def test_no_fix_does_not_modify(tmpdir): + path = tmpdir.join('input.txt') + contents = b'foo\r\nbar\rbaz\nwomp\n' + path.write_binary(contents) + ret = main(('--fix=no', path.strpath)) + + assert ret == 1 + assert path.read_binary() == contents -@pytest.mark.parametrize( - ('input_s', 'expected_retval', 'output'), - TESTS_FIX_FORCE_LF, -) -def test_mixed_line_ending_fix_force_lf( - input_s, expected_retval, output, - tmpdir, -): - path = tmpdir.join('file.txt') - path.write_binary(input_s) - ret = mixed_line_ending(('--fix=lf', path.strpath)) +def test_fix_lf(tmpdir): + path = tmpdir.join('input.txt') + path.write_binary(b'foo\r\nbar\rbaz\n') + ret = main(('--fix=lf', path.strpath)) - assert ret == expected_retval - assert path.read_binary() == output + assert ret == 1 + assert path.read_binary() == b'foo\nbar\nbaz\n' -# Input, expected return value, expected output -TESTS_FIX_FORCE_CRLF = ( - # only 'LF' - (b'foo\nbar\nbaz\n', 1, b'foo\r\nbar\r\nbaz\r\n'), - # only 'CRLF' - (b'foo\r\nbar\r\nbaz\r\n', 1, b'foo\r\nbar\r\nbaz\r\n'), - # only 'CR' - (b'foo\rbar\rbaz\r', 1, b'foo\r\nbar\r\nbaz\r\n'), - # mixed with majority of 'LF' - (b'foo\r\nbar\nbaz\n', 1, b'foo\r\nbar\r\nbaz\r\n'), - # mixed with majority of 'CRLF' - (b'foo\r\nbar\nbaz\r\n', 1, b'foo\r\nbar\r\nbaz\r\n'), - # mixed with majority of 'CR' - (b'foo\rbar\nbaz\r', 1, b'foo\r\nbar\r\nbaz\r\n'), - # mixed with as much 'LF' as 'CR' - (b'foo\rbar\nbaz', 1, b'foo\r\nbar\r\nbaz'), - # mixed with as much 'CRLF' as 'CR' - (b'foo\r\nbar\nbaz', 1, b'foo\r\nbar\r\nbaz'), - # mixed with as much 'CRLF' as 'LF' as 'CR' - (b'foo\r\nbar\nbaz\r', 1, b'foo\r\nbar\r\nbaz\r\n'), -) +def test_fix_crlf(tmpdir): + path = tmpdir.join('input.txt') + path.write_binary(b'foo\r\nbar\rbaz\n') + ret = main(('--fix=crlf', path.strpath)) - -@pytest.mark.parametrize( - ('input_s', 'expected_retval', 'output'), - TESTS_FIX_FORCE_CRLF, -) -def test_mixed_line_ending_fix_force_crlf( - input_s, expected_retval, output, - tmpdir, -): - path = tmpdir.join('file.txt') - path.write_binary(input_s) - ret = mixed_line_ending(('--fix=crlf', path.strpath)) - - assert ret == expected_retval - assert path.read_binary() == output + assert ret == 1 + assert path.read_binary() == b'foo\r\nbar\r\nbaz\r\n'