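Don't require UTF-8 encoding for check-docstring-first: read files as bytes and tokenize them directly, so sources in other encodings (e.g. cp1252 with a coding cookie) are accepted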
diff --git a/pre_commit_hooks/check_docstring_first.py b/pre_commit_hooks/check_docstring_first.py index f4639f1..6c19381 100644 --- a/pre_commit_hooks/check_docstring_first.py +++ b/pre_commit_hooks/check_docstring_first.py
@@ -8,14 +8,23 @@ from typing import Optional from typing import Sequence +import six -NON_CODE_TOKENS = frozenset(( - tokenize.COMMENT, tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL, -)) +if six.PY2: # pragma: no cover (PY2) + from tokenize import generate_tokens as tokenize_tokenize + OTHER_NON_CODE = () +else: # pragma: no cover (PY3) + from tokenize import tokenize as tokenize_tokenize + OTHER_NON_CODE = (tokenize.ENCODING,) + +NON_CODE_TOKENS = frozenset( + (tokenize.COMMENT, tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL) + + OTHER_NON_CODE, +) def check_docstring_first(src, filename='<unknown>'): - # type: (str, str) -> int + # type: (bytes, str) -> int """Returns nonzero if the source has what looks like a docstring that is not at the beginning of the source. @@ -25,7 +34,7 @@ found_docstring_line = None found_code_line = None - tok_gen = tokenize.generate_tokens(io.StringIO(src).readline) + tok_gen = tokenize_tokenize(io.BytesIO(src).readline) for tok_type, _, (sline, scol), _, _ in tok_gen: # Looks like a docstring! if tok_type == tokenize.STRING and scol == 0: @@ -61,7 +70,7 @@ retv = 0 for filename in args.filenames: - with io.open(filename, encoding='UTF-8') as f: + with open(filename, 'rb') as f: contents = f.read() retv |= check_docstring_first(contents, filename=filename)
diff --git a/tests/check_docstring_first_test.py b/tests/check_docstring_first_test.py index aa9898d..0973a58 100644 --- a/tests/check_docstring_first_test.py +++ b/tests/check_docstring_first_test.py
@@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import unicode_literals @@ -10,37 +11,37 @@ # Contents, expected, expected_output TESTS = ( # trivial - ('', 0, ''), + (b'', 0, ''), # Acceptable - ('"foo"', 0, ''), + (b'"foo"', 0, ''), # Docstring after code ( - 'from __future__ import unicode_literals\n' - '"foo"\n', + b'from __future__ import unicode_literals\n' + b'"foo"\n', 1, '{filename}:2 Module docstring appears after code ' '(code seen on line 1).\n', ), # Test double docstring ( - '"The real docstring"\n' - 'from __future__ import absolute_import\n' - '"fake docstring"\n', + b'"The real docstring"\n' + b'from __future__ import absolute_import\n' + b'"fake docstring"\n', 1, '{filename}:3 Multiple module docstrings ' '(first docstring on line 1).\n', ), # Test multiple lines of code above ( - 'import os\n' - 'import sys\n' - '"docstring"\n', + b'import os\n' + b'import sys\n' + b'"docstring"\n', 1, '{filename}:3 Module docstring appears after code ' '(code seen on line 1).\n', ), # String literals in expressions are ok. - ('x = "foo"\n', 0, ''), + (b'x = "foo"\n', 0, ''), ) @@ -58,6 +59,13 @@ @all_tests def test_integration(tmpdir, capsys, contents, expected, expected_out): f = tmpdir.join('test.py') - f.write(contents) + f.write_binary(contents) assert main([f.strpath]) == expected assert capsys.readouterr()[0] == expected_out.format(filename=f.strpath) + + +def test_arbitrary_encoding(tmpdir): + f = tmpdir.join('f.py') + contents = '# -*- coding: cp1252\nx = "£"'.encode('cp1252') + f.write_binary(contents) + assert main([f.strpath]) == 0