Add encoding pragma hook. Resolves pre-commit/pre-commit#15

commit: aa2ba6f94fba94c91740bba3894b1f9e0977f8dc [log] [tgz]
author: Anthony Sottile <asottile@umich.edu> Fri Nov 13 12:34:37 2015 -0800
committer: Anthony Sottile <asottile@umich.edu> Fri Nov 13 12:39:43 2015 -0800
tree: 24dea1e121bfb4532f8918cb984c63c1c2a1f9a1
parent: 29bf11d13689a0a9a895c41eb3591c7e942d377d [diff]
diff --git a/README.md b/README.md
index 8148cee..9229f8f 100644
--- a/README.md
+++ b/README.md

@@ -43,6 +43,7 @@
 - `double-quote-string-fixer` - This hook replaces double quoted strings
   with single quoted strings.
 - `end-of-file-fixer` - Makes sure files end in a newline and only a newline.
+- `fix-encoding-pragma` - Add `# -*- coding: utf-8 -*-` to the top of Python files.
 - `flake8` - Run flake8 on your python files.
 - `name-tests-test` - Assert that files in tests/ end in `_test.py`.
     - Use `args: ['--django']` to match `test*.py` instead.

diff --git a/hooks.yaml b/hooks.yaml
index 7817d1b..d4ef521 100644
--- a/hooks.yaml
+++ b/hooks.yaml

@@ -92,6 +92,12 @@
     entry: end-of-file-fixer
     language: python
     files: \.(c|cpp|html|erb|slim|haml|ejs|jade|js|coffee|json|rb|md|py|css|scss|less|sh|tmpl|txt|yaml|yml|pp)$
+-   id: fix-encoding-pragma
+    name: Fix python encoding pragma
+    language: python
+    entry: fix-encoding-pragma
+    description: 'Add # -*- coding: utf-8 -*- to the top of python files'
+    files: \.py$
 -   id: flake8
     name: Flake8
     description: This hook runs flake8.

diff --git a/pre_commit_hooks/fix_encoding_pragma.py b/pre_commit_hooks/fix_encoding_pragma.py
new file mode 100644
index 0000000..48fc9c7
--- /dev/null
+++ b/pre_commit_hooks/fix_encoding_pragma.py

@@ -0,0 +1,75 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io
+
+# The exact bytes (trailing newline included) that fix_encoding_pragma
+# compares the first/second line against and writes when they are missing.
+expected_pragma = b'# -*- coding: utf-8 -*-\n'
+
+
+def has_coding(line):
+    """Guess whether ``line`` (bytes) is an encoding-declaration comment.
+
+    The line must start with ``#`` once leading whitespace is ignored and
+    must contain one of ``unicode``, ``encoding``, ``coding:`` or
+    ``coding=`` anywhere.  This is a loose heuristic: an ordinary comment
+    that merely mentions one of those words also counts.
+    """
+    if line.lstrip()[0:1] != b'#':
+        # Covers blank lines too: the one-byte slice is then empty.
+        return False
+    markers = (b'unicode', b'encoding', b'coding:', b'coding=')
+    return any(marker in line for marker in markers)
+
+
+def fix_encoding_pragma(f):
+    """Make sure the file object ``f`` declares utf-8 via the expected pragma.
+
+    ``f`` must be a seekable binary stream opened for reading and writing;
+    it is read completely and, if needed, rewritten in place.  The pragma
+    belongs on line 1, or on line 2 when line 1 is a ``#!`` shebang.  A line
+    in that slot which ``has_coding`` accepts is replaced; anything else is
+    kept and the pragma is inserted in front of it.
+
+    Returns 0 when ``f`` is blank or already correct (it is left untouched)
+    and 1 when it was rewritten.
+    """
+    first_line = f.readline()
+    second_line = f.readline()
+    rest = f.read()
+    f.seek(0)
+
+    # Ok case: the file is empty (or whitespace only)
+    if not (first_line + second_line + rest).strip():
+        return 0
+
+    # The pragma slot is line 2 behind a shebang, line 1 otherwise
+    if first_line.startswith(b'#!'):
+        shebang, slot, tail = first_line, second_line, rest
+        # A lone shebang without a line ending must not get the pragma
+        # glued onto its end.
+        if not shebang.endswith(b'\n'):
+            shebang += b'\n'
+    else:
+        shebang, slot, tail = b'', first_line, second_line + rest
+
+    # Ok case: the pragma already sits in its slot
+    if slot == expected_pragma:
+        return 0
+
+    # Otherwise we need to rewrite stuff!  A stale coding line is dropped,
+    # any other line in the slot is kept below the pragma.
+    if not has_coding(slot):
+        tail = slot + tail
+    f.write(shebang + expected_pragma + tail)
+    # The new contents can be shorter than the old ones (e.g. a long coding
+    # comment was replaced); cut off the leftover bytes after the write.
+    f.truncate()
+
+    return 1
+
+
+def main(argv=None):
+    """Fix the encoding pragma of every file named in ``argv``.
+
+    ``argv`` is the argument list to parse; ``None`` lets argparse fall back
+    to ``sys.argv[1:]``.  Returns 1 if at least one file was rewritten and 0
+    otherwise.
+    """
+    # Pass the text as ``description``: the first positional parameter of
+    # ArgumentParser is ``prog``, which would put it into the usage line.
+    parser = argparse.ArgumentParser(
+        description='Fixes the encoding pragma of python files',
+    )
+    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    args = parser.parse_args(argv)
+
+    retv = 0
+
+    for filename in args.filenames:
+        with io.open(filename, 'r+b') as f:
+            file_ret = fix_encoding_pragma(f)
+        retv |= file_ret
+        if file_ret:
+            # Decode before formatting so python 3 prints the pragma text
+            # itself instead of its bytes repr (b'...').
+            print('Added `{0}` to {1}'.format(
+                expected_pragma.decode('UTF-8').strip(), filename,
+            ))
+
+    return retv
+
+if __name__ == "__main__":
+    # ``exit`` is injected by the ``site`` module and is absent under
+    # ``python -S``; raising SystemExit needs no import and always works.
+    raise SystemExit(main())

diff --git a/setup.py b/setup.py
index 4fefeaa..7779089 100644
--- a/setup.py
+++ b/setup.py

@@ -50,6 +50,7 @@
             'detect-private-key = pre_commit_hooks.detect_private_key:detect_private_key',
             'double-quote-string-fixer = pre_commit_hooks.string_fixer:main',
             'end-of-file-fixer = pre_commit_hooks.end_of_file_fixer:end_of_file_fixer',
+            'fix-encoding-pragma = pre_commit_hooks.fix_encoding_pragma:main',
             'name-tests-test = pre_commit_hooks.tests_should_end_in_test:validate_files',
             'pretty-format-json = pre_commit_hooks.pretty_format_json:pretty_format_json',
             'requirements-txt-fixer = pre_commit_hooks.requirements_txt_fixer:fix_requirements_txt',

diff --git a/tests/fix_encoding_pragma_test.py b/tests/fix_encoding_pragma_test.py
new file mode 100644
index 0000000..e000a33
--- /dev/null
+++ b/tests/fix_encoding_pragma_test.py

@@ -0,0 +1,82 @@
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+import io
+
+import pytest
+
+from pre_commit_hooks.fix_encoding_pragma import fix_encoding_pragma
+from pre_commit_hooks.fix_encoding_pragma import main
+
+
+def test_integration_inserting_pragma(tmpdir):
+    """``main`` prepends the pragma to a file lacking one and returns 1."""
+    target = tmpdir.join('foo.py').strpath
+    with open(target, 'wb') as handle:
+        handle.write(b'import httplib\n')
+
+    exit_code = main([target])
+    assert exit_code == 1
+
+    with open(target, 'rb') as handle:
+        contents = handle.read()
+    assert contents == (
+        b'# -*- coding: utf-8 -*-\n'
+        b'import httplib\n'
+    )
+
+
+def test_integration_ok(tmpdir):
+    """``main`` returns 0 for a file that already starts with the pragma."""
+    target = tmpdir.join('foo.py').strpath
+    with open(target, 'wb') as handle:
+        handle.write(b'# -*- coding: utf-8 -*-\nx = 1\n')
+    exit_code = main([target])
+    assert exit_code == 0
+
+
+@pytest.mark.parametrize(
+    'input_str',
+    (
+        # empty file
+        b'',
+        # pragma on the first line
+        b'# -*- coding: utf-8 -*-\n',
+        # shebang first, pragma second
+        (
+            b'#!/usr/bin/env python\n'
+            b'# -*- coding: utf-8 -*-\n'
+            b'foo = "bar"\n'
+        ),
+    )
+)
+def test_ok_inputs(input_str):
+    """Already-acceptable inputs: the fixer returns 0 and changes nothing."""
+    stream = io.BytesIO(input_str)
+    status = fix_encoding_pragma(stream)
+    assert status == 0
+    assert stream.getvalue() == input_str
+
+
+@pytest.mark.parametrize(
+    ('input_str', 'output'),
+    (
+        # no pragma at all: inserted as the first line
+        (
+            b'import httplib\n',
+            b'# -*- coding: utf-8 -*-\n'
+            b'import httplib\n',
+        ),
+        # shebang only: pragma appended as the second line
+        (
+            b'#!/usr/bin/env python\n',
+            b'#!/usr/bin/env python\n'
+            b'# -*- coding: utf-8 -*-\n'
+        ),
+        # differently spelled coding line: replaced
+        (
+            b'#coding=utf-8\n',
+            b'# -*- coding: utf-8 -*-\n'
+        ),
+        # shebang plus differently spelled coding line: second line replaced
+        (
+            b'#!/usr/bin/env python\n'
+            b'#coding=utf8\n',
+            b'#!/usr/bin/env python\n'
+            b'# -*- coding: utf-8 -*-\n',
+        ),
+    )
+)
+def test_not_ok_inputs(input_str, output):
+    """Inputs needing a fix: the fixer returns 1 and produces ``output``."""
+    stream = io.BytesIO(input_str)
+    status = fix_encoding_pragma(stream)
+    assert status == 1
+    assert stream.getvalue() == output
commit	aa2ba6f94fba94c91740bba3894b1f9e0977f8dc	[log] [tgz]
author	Anthony Sottile <asottile@umich.edu>	Fri Nov 13 12:34:37 2015 -0800
committer	Anthony Sottile <asottile@umich.edu>	Fri Nov 13 12:39:43 2015 -0800
tree	24dea1e121bfb4532f8918cb984c63c1c2a1f9a1
parent	29bf11d13689a0a9a895c41eb3591c7e942d377d [diff]