Fix parsing of git output with unusual characters On Windows, all files are "executable". Therefore, to know whether a file is supposed to be executed, we check how its attributes were recorded by git: we run a `git ls-files` command in a subprocess. By default, this command outputs information on multiple lines (files and their data, separated by newlines). When a filename contains an unusual character, the character is escaped as an octal escape sequence (such as `\303\261`), and git wraps the whole filename in double-quotes because of the backslashes. This breaks the current code because we try to open the filename including the double-quotes: it doesn't exist, of course. Instead of trying to fix this special case by removing the double-quotes, and breaking other cases (a double-quote is a valid filename character on Linux), we tell git to separate each item with the null character `\0` instead of a newline `\n`, with the option `-z`. With this option, git neither quotes filenames nor escapes unusual characters as octal sequences, so the output is fixed, and we parse it by splitting on `\0` instead of `\n`. Fixes #508.
diff --git a/pre_commit_hooks/check_executables_have_shebangs.py b/pre_commit_hooks/check_executables_have_shebangs.py index 1c50ea0..a02d2a9 100644 --- a/pre_commit_hooks/check_executables_have_shebangs.py +++ b/pre_commit_hooks/check_executables_have_shebangs.py
@@ -12,6 +12,14 @@ EXECUTABLE_VALUES = frozenset(('1', '3', '5', '7')) +def zsplit(s: str) -> List[str]: + s = s.strip('\0') + if s: + return s.split('\0') + else: + return [] + + def check_executables(paths: List[str]) -> int: if sys.platform == 'win32': # pragma: win32 cover return _check_git_filemode(paths) @@ -26,9 +34,9 @@ def _check_git_filemode(paths: Sequence[str]) -> int: - outs = cmd_output('git', 'ls-files', '--stage', '--', *paths) + outs = cmd_output('git', 'ls-files', '-z', '--stage', '--', *paths) seen: Set[str] = set() - for out in outs.splitlines(): + for out in zsplit(outs): metadata, path = out.split('\t') tagmode = metadata.split(' ', 1)[0]
diff --git a/tests/check_executables_have_shebangs_test.py b/tests/check_executables_have_shebangs_test.py index 5895a2a..7046081 100644 --- a/tests/check_executables_have_shebangs_test.py +++ b/tests/check_executables_have_shebangs_test.py
@@ -73,6 +73,21 @@ assert check_executables_have_shebangs._check_git_filemode(files) == 0 +def test_check_git_filemode_passing_unusual_characters(tmpdir): + with tmpdir.as_cwd(): + cmd_output('git', 'init', '.') + + f = tmpdir.join('mañana.txt') + f.write('#!/usr/bin/env bash') + f_path = str(f) + cmd_output('chmod', '+x', f_path) + cmd_output('git', 'add', f_path) + cmd_output('git', 'update-index', '--chmod=+x', f_path) + + files = (f_path,) + assert check_executables_have_shebangs._check_git_filemode(files) == 0 + + def test_check_git_filemode_failing(tmpdir): with tmpdir.as_cwd(): cmd_output('git', 'init', '.') @@ -87,6 +102,16 @@ assert check_executables_have_shebangs._check_git_filemode(files) == 1 +@pytest.mark.parametrize('out', ('\0f1\0f2\0', '\0f1\0f2', 'f1\0f2\0')) +def test_check_zsplits_correctly(out): + assert check_executables_have_shebangs.zsplit(out) == ['f1', 'f2'] + + +@pytest.mark.parametrize('out', ('\0\0', '\0', '')) +def test_check_zsplit_returns_empty(out): + assert check_executables_have_shebangs.zsplit(out) == [] + + @pytest.mark.parametrize( ('content', 'mode', 'expected'), (