Merge pull request #374 from pre-commit/check_docstring_first_no_encoding
Don't require an encoding for check-docstring-first
diff --git a/pre_commit_hooks/check_docstring_first.py b/pre_commit_hooks/check_docstring_first.py
index f4639f1..6c19381 100644
--- a/pre_commit_hooks/check_docstring_first.py
+++ b/pre_commit_hooks/check_docstring_first.py
@@ -8,14 +8,23 @@
from typing import Optional
from typing import Sequence
+import six
-NON_CODE_TOKENS = frozenset((
- tokenize.COMMENT, tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
-))
+if six.PY2: # pragma: no cover (PY2)
+ from tokenize import generate_tokens as tokenize_tokenize
+ OTHER_NON_CODE = ()
+else: # pragma: no cover (PY3)
+ from tokenize import tokenize as tokenize_tokenize
+ OTHER_NON_CODE = (tokenize.ENCODING,)
+
+NON_CODE_TOKENS = frozenset(
+ (tokenize.COMMENT, tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL) +
+ OTHER_NON_CODE,
+)
def check_docstring_first(src, filename='<unknown>'):
- # type: (str, str) -> int
+ # type: (bytes, str) -> int
"""Returns nonzero if the source has what looks like a docstring that is
not at the beginning of the source.
@@ -25,7 +34,7 @@
found_docstring_line = None
found_code_line = None
- tok_gen = tokenize.generate_tokens(io.StringIO(src).readline)
+ tok_gen = tokenize_tokenize(io.BytesIO(src).readline)
for tok_type, _, (sline, scol), _, _ in tok_gen:
# Looks like a docstring!
if tok_type == tokenize.STRING and scol == 0:
@@ -61,7 +70,7 @@
retv = 0
for filename in args.filenames:
- with io.open(filename, encoding='UTF-8') as f:
+ with open(filename, 'rb') as f:
contents = f.read()
retv |= check_docstring_first(contents, filename=filename)
diff --git a/tests/check_docstring_first_test.py b/tests/check_docstring_first_test.py
index aa9898d..0973a58 100644
--- a/tests/check_docstring_first_test.py
+++ b/tests/check_docstring_first_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
@@ -10,37 +11,37 @@
# Contents, expected, expected_output
TESTS = (
# trivial
- ('', 0, ''),
+ (b'', 0, ''),
# Acceptable
- ('"foo"', 0, ''),
+ (b'"foo"', 0, ''),
# Docstring after code
(
- 'from __future__ import unicode_literals\n'
- '"foo"\n',
+ b'from __future__ import unicode_literals\n'
+ b'"foo"\n',
1,
'{filename}:2 Module docstring appears after code '
'(code seen on line 1).\n',
),
# Test double docstring
(
- '"The real docstring"\n'
- 'from __future__ import absolute_import\n'
- '"fake docstring"\n',
+ b'"The real docstring"\n'
+ b'from __future__ import absolute_import\n'
+ b'"fake docstring"\n',
1,
'{filename}:3 Multiple module docstrings '
'(first docstring on line 1).\n',
),
# Test multiple lines of code above
(
- 'import os\n'
- 'import sys\n'
- '"docstring"\n',
+ b'import os\n'
+ b'import sys\n'
+ b'"docstring"\n',
1,
'{filename}:3 Module docstring appears after code '
'(code seen on line 1).\n',
),
# String literals in expressions are ok.
- ('x = "foo"\n', 0, ''),
+ (b'x = "foo"\n', 0, ''),
)
@@ -58,6 +59,13 @@
@all_tests
def test_integration(tmpdir, capsys, contents, expected, expected_out):
f = tmpdir.join('test.py')
- f.write(contents)
+ f.write_binary(contents)
assert main([f.strpath]) == expected
assert capsys.readouterr()[0] == expected_out.format(filename=f.strpath)
+
+
+def test_arbitrary_encoding(tmpdir):
+ f = tmpdir.join('f.py')
+ contents = '# -*- coding: cp1252\nx = "£"'.encode('cp1252')
+ f.write_binary(contents)
+ assert main([f.strpath]) == 0