Merge pull request #233 from pre-commit/mixed-line-ending

Add mixed-line-ending hook
diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml
index 69e316a..454f258 100644
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@@ -191,6 +191,15 @@
     # for backward compatibility
     files: ''
     minimum_pre_commit_version: 0.15.0
+-   id: mixed-line-ending
+    name: Mixed line ending
+    description: Replaces or checks mixed line ending
+    entry: mixed-line-ending
+    language: python
+    types: [text]
+    # for backward compatibility
+    files: ''
+    minimum_pre_commit_version: 0.15.0
 -   id: name-tests-test
     name: Tests should end in _test.py
     description: This verifies that test files are named correctly
diff --git a/README.md b/README.md
index 7b4c486..8efd74b 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,11 @@
 - `file-contents-sorter` - Sort the lines in specified files (defaults to alphabetical). You must provide list of target files as input to it. Note that this hook WILL remove blank lines and does NOT respect any comments.
 - `flake8` - Run flake8 on your python files.
 - `forbid-new-submodules` - Prevent addition of new git submodules.
+- `mixed-line-ending` - Replaces or checks mixed line endings.
+    - `--fix={auto,crlf,lf,no}`
+        - `auto` - Automatically replaces line endings with the most frequent one. This is the default argument.
+        - `crlf`, `lf` - Forces all line endings to be replaced with CRLF or LF, respectively.
+        - `no` - Checks for mixed line endings without modifying any file.
 - `name-tests-test` - Assert that files in tests/ end in `_test.py`.
     - Use `args: ['--django']` to match `test*.py` instead.
 - `no-commit-to-branch` - Protect specific branches from direct checkins.
diff --git a/hooks.yaml b/hooks.yaml
index 5278bf5..59cc320 100644
--- a/hooks.yaml
+++ b/hooks.yaml
@@ -130,6 +130,12 @@
     entry: upgrade-your-pre-commit-version
     files: ''
     minimum_pre_commit_version: 0.15.0
+-   id: mixed-line-ending
+    language: system
+    name: upgrade-your-pre-commit-version
+    entry: upgrade-your-pre-commit-version
+    files: ''
+    minimum_pre_commit_version: 0.15.0
 -   id: name-tests-test
     language: system
     name: upgrade-your-pre-commit-version
diff --git a/pre_commit_hooks/mixed_line_ending.py b/pre_commit_hooks/mixed_line_ending.py
new file mode 100644
index 0000000..301c654
--- /dev/null
+++ b/pre_commit_hooks/mixed_line_ending.py
@@ -0,0 +1,83 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import collections
+
+
+CRLF = b'\r\n'
+LF = b'\n'
+CR = b'\r'
+# Prefer LF to CRLF to CR, but detect CRLF before LF
+ALL_ENDINGS = (CR, CRLF, LF)
+FIX_TO_LINE_ENDING = {'cr': CR, 'crlf': CRLF, 'lf': LF}
+
+
+def _fix(filename, contents, ending):
+    # normalize every line to `ending`; an unterminated last line gains one
+    lines = contents.splitlines(True)
+    fixed = [line.rstrip(b'\r\n') + ending for line in lines]
+    with open(filename, 'wb') as f:
+        f.write(b''.join(fixed))
+
+
+def fix_filename(filename, fix):
+    """Check line endings, rewriting per `fix`; truthy if any were bad."""
+    with open(filename, 'rb') as f:
+        contents = f.read()
+
+    counts = collections.defaultdict(int)
+    for line in contents.splitlines(True):
+        for ending in ALL_ENDINGS:
+            if line.endswith(ending):
+                counts[ending] += 1
+                break
+
+    # Some amount of mixed line endings
+    mixed = sum(bool(x) for x in counts.values()) > 1
+
+    if fix == 'no' or (fix == 'auto' and not mixed):
+        return mixed
+
+    if fix == 'auto':
+        max_ending = LF
+        max_lines = 0
+        # ordering is important here such that lf > crlf > cr
+        for ending_type in ALL_ENDINGS:
+            # also important, using >= to find a max that prefers the last
+            if counts[ending_type] >= max_lines:
+                max_ending = ending_type
+                max_lines = counts[ending_type]
+
+        _fix(filename, contents, max_ending)
+        return 1
+    else:
+        target_ending = FIX_TO_LINE_ENDING[fix]
+        # the target may be absent from counts; what remains is *other* endings
+        counts.pop(target_ending, None)
+        other_endings = bool(sum(counts.values()))
+        if other_endings:
+            _fix(filename, contents, target_ending)
+        return other_endings
+
+
+def main(argv=None):
+    """Run the hook on each filename; nonzero if any had bad endings."""
+    parser = argparse.ArgumentParser()
+    fix_choices = ('auto', 'no') + tuple(FIX_TO_LINE_ENDING)
+    parser.add_argument(
+        '-f', '--fix', choices=fix_choices, default='auto',
+        help='Replace line ending with the specified. Default is "auto"',
+    )
+    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        retv |= fix_filename(filename, args.fix)
+    return retv
+
+
+if __name__ == '__main__':
+    exit(main())
diff --git a/setup.py b/setup.py
index 4c8c148..2563095 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@
             'file-contents-sorter = pre_commit_hooks.file_contents_sorter:main',
             'fix-encoding-pragma = pre_commit_hooks.fix_encoding_pragma:main',
             'forbid-new-submodules = pre_commit_hooks.forbid_new_submodules:main',
+            'mixed-line-ending = pre_commit_hooks.mixed_line_ending:main',
             'name-tests-test = pre_commit_hooks.tests_should_end_in_test:validate_files',
             'no-commit-to-branch = pre_commit_hooks.no_commit_to_branch:main',
             'pretty-format-json = pre_commit_hooks.pretty_format_json:pretty_format_json',
diff --git a/tests/mixed_line_ending_test.py b/tests/mixed_line_ending_test.py
new file mode 100644
index 0000000..808295b
--- /dev/null
+++ b/tests/mixed_line_ending_test.py
@@ -0,0 +1,103 @@
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+import pytest
+
+from pre_commit_hooks.mixed_line_ending import main
+
+
+@pytest.mark.parametrize(
+    ('input_s', 'output'),
+    (
+        # mixed with majority of 'LF'
+        (b'foo\r\nbar\nbaz\n', b'foo\nbar\nbaz\n'),
+        # mixed with majority of 'CRLF'
+        (b'foo\r\nbar\nbaz\r\n', b'foo\r\nbar\r\nbaz\r\n'),
+        # mixed with majority of 'CR'
+        (b'foo\rbar\nbaz\r', b'foo\rbar\rbaz\r'),
+        # mixed with as much 'LF' as 'CRLF'
+        (b'foo\r\nbar\n', b'foo\nbar\n'),
+        # mixed with as much 'LF' as 'CR'
+        (b'foo\rbar\n', b'foo\nbar\n'),
+        # mixed with as much 'CRLF' as 'CR'
+        (b'foo\r\nbar\r', b'foo\r\nbar\r\n'),
+        # mixed with as much 'CRLF' as 'LF' as 'CR'
+        (b'foo\r\nbar\nbaz\r', b'foo\nbar\nbaz\n'),
+    ),
+)
+def test_mixed_line_ending_fixes_auto(input_s, output, tmpdir):
+    """Default (auto) mode rewrites a mixed file to its dominant ending."""
+    target = tmpdir.join('file.txt')
+    target.write_binary(input_s)
+    status = main((target.strpath,))
+    assert status == 1
+    assert target.read_binary() == output
+
+
+def test_non_mixed_no_newline_end_of_file(tmpdir):
+    contents = b'foo\nbar\nbaz'
+    target = tmpdir.join('f.txt')
+    target.write_binary(contents)
+    assert not main((target.strpath,))
+    # documents current behaviour: the missing final newline is left as-is
+    assert target.read_binary() == contents
+
+
+def test_mixed_no_newline_end_of_file(tmpdir):
+    target = tmpdir.join('f.txt')
+    target.write_binary(b'foo\r\nbar\nbaz')
+    assert main((target.strpath,))
+    # documents current behaviour: unlike the non-mixed case, the rewrite
+    # also terminates the final line, which seems the better outcome
+    expected = b'foo\nbar\nbaz\n'
+    assert target.read_binary() == expected
+
+
+@pytest.mark.parametrize(
+    ('fix_option', 'input_s'),
+    (
+        # All --fix=auto with uniform line endings should be ok
+        ('--fix=auto', b'foo\r\nbar\r\nbaz\r\n'),
+        ('--fix=auto', b'foo\rbar\rbaz\r'),
+        ('--fix=auto', b'foo\nbar\nbaz\n'),
+        # --fix=crlf with crlf endings
+        ('--fix=crlf', b'foo\r\nbar\r\nbaz\r\n'),
+        # --fix=lf with lf endings
+        ('--fix=lf', b'foo\nbar\nbaz\n'),
+    ),
+)
+def test_line_endings_ok(fix_option, input_s, tmpdir):
+    """Files already uniform (or matching --fix) pass and stay untouched."""
+    target = tmpdir.join('input.txt')
+    target.write_binary(input_s)
+    status = main((fix_option, target.strpath))
+    assert status == 0
+    assert target.read_binary() == input_s
+
+
+def test_no_fix_does_not_modify(tmpdir):
+    """--fix=no reports the mixed endings but leaves the file as is."""
+    contents = b'foo\r\nbar\rbaz\nwomp\n'
+    target = tmpdir.join('input.txt')
+    target.write_binary(contents)
+    status = main(('--fix=no', target.strpath))
+    assert status == 1
+    assert target.read_binary() == contents
+
+
+def test_fix_lf(tmpdir):
+    """--fix=lf converts the crlf and cr endings to lf and returns 1."""
+    target = tmpdir.join('input.txt')
+    target.write_binary(b'foo\r\nbar\rbaz\n')
+    status = main(('--fix=lf', target.strpath))
+    assert status == 1
+    assert target.read_binary() == b'foo\nbar\nbaz\n'
+
+
+def test_fix_crlf(tmpdir):
+    """--fix=crlf converts the cr and lf endings to crlf and returns 1."""
+    target = tmpdir.join('input.txt')
+    target.write_binary(b'foo\r\nbar\rbaz\n')
+    status = main(('--fix=crlf', target.strpath))
+    assert status == 1
+    assert target.read_binary() == b'foo\r\nbar\r\nbaz\r\n'