Merge remote-tracking branch 'upstream/master' into ignore-uris
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 7ccff56..f6e8580 100755
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -29,6 +29,11 @@
import textwrap
word_regex_def = u"[\\w\\-'’`]+"
+# While we want to treat characters like ( or " as okay for a starting break,
+# these may occur unescaped in URIs, and so we are more restrictive on the
+# endpoint. Emails are more restrictive, so the endpoint remains flexible.
+uri_regex_def = (u"(\\b(?:https?|t?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
+ u"\\b[\\w.%+-]+@[\\w.-]+\\b)")
encodings = ('utf-8', 'iso-8859-1')
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -280,7 +285,7 @@
'patterns to ignore by treating as whitespace. '
'When writing regexes, consider ensuring there '
'are boundary non-word chars, e.g., '
- '"\\Wmatch\\W". Defaults to empty/disabled.')
+ '"\\bmatch\\b". Defaults to empty/disabled.')
parser.add_argument('-I', '--ignore-words',
action='append', metavar='FILE',
help='file that contains words which will be ignored '
@@ -292,6 +297,13 @@
help='comma separated list of words to be ignored '
'by codespell. Words are case sensitive based on '
'how they are written in the dictionary file')
+ parser.add_argument('--uri-ignore-words-list',
+ action='append', metavar='WORDS',
+ help='comma separated list of words to be ignored '
+ 'by codespell in URIs and emails only. Words are '
+ 'case sensitive based on how they are written in '
+ 'the dictionary file. If set to "*", all '
+                            'misspellings in URIs and emails will be ignored.')
parser.add_argument('-r', '--regex',
action='store', type=str,
help='regular expression which is used to find words. '
@@ -299,6 +311,10 @@
'underscore, the hyphen, and the apostrophe is '
'used to build words. This option cannot be '
'specified together with --write-changes.')
+ parser.add_argument('--uri-regex',
+ action='store', type=str,
+ help='regular expression which is used to find URIs '
+ 'and emails. A default expression is provided.')
parser.add_argument('-s', '--summary',
action='store_true', default=False,
help='print summary of fixes')
@@ -401,6 +417,15 @@
return options, parser
+def parse_ignore_words_option(ignore_words_option):
+ ignore_words = set()
+ if ignore_words_option:
+ for comma_separated_words in ignore_words_option:
+ for word in comma_separated_words.split(','):
+ ignore_words.add(word.strip())
+ return ignore_words
+
+
def build_exclude_hashes(filename, exclude_lines):
with codecs.open(filename, 'r') as f:
for line in f:
@@ -530,8 +555,20 @@
return word_regex.findall(text)
+def apply_uri_ignore_words(check_words, line, word_regex, ignore_word_regex,
+ uri_regex, uri_ignore_words):
+ if not uri_ignore_words:
+ return
+ for uri in re.findall(uri_regex, line):
+ for uri_word in extract_words(uri, word_regex,
+ ignore_word_regex):
+ if uri_word in uri_ignore_words:
+ check_words.remove(uri_word)
+
+
def parse_file(filename, colors, summary, misspellings, exclude_lines,
- file_opener, word_regex, ignore_word_regex, context, options):
+ file_opener, word_regex, ignore_word_regex, uri_regex,
+ uri_ignore_words, context, options):
bad_count = 0
lines = None
changed = False
@@ -596,7 +633,19 @@
fixed_words = set()
asked_for = set()
- for word in extract_words(line, word_regex, ignore_word_regex):
+ # If all URI spelling errors will be ignored, erase any URI before
+ # extracting words. Otherwise, apply ignores after extracting words.
+ # This ensures that if a URI ignore word occurs both inside a URI and
+ # outside, it will still be a spelling error.
+ if "*" in uri_ignore_words:
+ line = uri_regex.sub(' ', line)
+ check_words = extract_words(line, word_regex, ignore_word_regex)
+ if "*" not in uri_ignore_words:
+ apply_uri_ignore_words(check_words, line, word_regex,
+ ignore_word_regex, uri_regex,
+ uri_ignore_words)
+
+ for word in check_words:
lword = word.lower()
if lword in misspellings:
context_shown = False
@@ -716,7 +765,7 @@
ignore_word_regex = None
ignore_words_files = options.ignore_words or []
- ignore_words = set()
+ ignore_words = parse_ignore_words_option(options.ignore_words_list)
for ignore_words_file in ignore_words_files:
if not os.path.isfile(ignore_words_file):
print("ERROR: cannot find ignore-words file: %s" %
@@ -725,10 +774,15 @@
return EX_USAGE
build_ignore_words(ignore_words_file, ignore_words)
- ignore_words_list = options.ignore_words_list or []
- for comma_separated_words in ignore_words_list:
- for word in comma_separated_words.split(','):
- ignore_words.add(word.strip())
+ uri_regex = options.uri_regex or uri_regex_def
+ try:
+ uri_regex = re.compile(uri_regex)
+ except re.error as err:
+ print("ERROR: invalid --uri-regex \"%s\" (%s)" %
+ (uri_regex, err), file=sys.stderr)
+ parser.print_help()
+ return EX_USAGE
+ uri_ignore_words = parse_ignore_words_option(options.uri_ignore_words_list)
if options.dictionary:
dictionaries = options.dictionary
@@ -822,8 +876,8 @@
continue
bad_count += parse_file(
fname, colors, summary, misspellings, exclude_lines,
- file_opener, word_regex, ignore_word_regex, context,
- options)
+ file_opener, word_regex, ignore_word_regex, uri_regex,
+ uri_ignore_words, context, options)
# skip (relative) directories
dirs[:] = [dir_ for dir_ in dirs if not glob_match.match(dir_)]
@@ -831,7 +885,8 @@
else:
bad_count += parse_file(
filename, colors, summary, misspellings, exclude_lines,
- file_opener, word_regex, ignore_word_regex, context, options)
+ file_opener, word_regex, ignore_word_regex, uri_regex,
+ uri_ignore_words, context, options)
if summary:
print("\n-------8<-------\nSUMMARY:")
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index 2e8fd73..29c5de7 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -6,6 +6,7 @@
import inspect
import os
import os.path as op
+import re
from shutil import copyfile
import subprocess
import sys
@@ -13,7 +14,7 @@
import pytest
import codespell_lib as cs_
-from codespell_lib._codespell import EX_USAGE, EX_OK, EX_DATAERR
+from codespell_lib._codespell import uri_regex_def, EX_USAGE, EX_OK, EX_DATAERR
def test_constants():
@@ -455,8 +456,8 @@
assert 'ERROR' in lines[0]
-def test_ignore_regex_flag(tmpdir, capsys):
- """Test ignore regex flag functionality."""
+def test_ignore_regex_option(tmpdir, capsys):
+ """Test ignore regex option functionality."""
d = str(tmpdir)
# Invalid regex.
@@ -485,7 +486,247 @@
# Ignoring donn breaks them both.
assert cs.main(f.name, '--ignore-regex=donn') == 0
# Adding word breaks causes only one to be ignored.
- assert cs.main(f.name, r'--ignore-regex=\Wdonn\W') == 1
+ assert cs.main(f.name, r'--ignore-regex=\bdonn\b') == 1
+
+
+def test_uri_regex_option(tmpdir, capsys):
+ """Test --uri-regex option functionality."""
+ d = str(tmpdir)
+
+ # Invalid regex.
+ code, stdout, _ = cs.main('--uri-regex=(', std=True)
+ assert code == EX_USAGE
+ assert 'usage:' in stdout
+
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write('# Please see http://abandonned.com for info\n')
+
+ # By default, the standard regex is used.
+ assert cs.main(f.name) == 1
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0
+
+ # If empty, nothing matches.
+ assert cs.main(f.name, '--uri-regex=',
+ '--uri-ignore-words-list=abandonned') == 0
+
+ # Can manually match urls.
+ assert cs.main(f.name, '--uri-regex=\\bhttp.*\\b',
+ '--uri-ignore-words-list=abandonned') == 0
+
+ # Can also match arbitrary content.
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write('abandonned')
+ assert cs.main(f.name) == 1
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1
+ assert cs.main(f.name, '--uri-regex=.*') == 1
+ assert cs.main(f.name, '--uri-regex=.*',
+ '--uri-ignore-words-list=abandonned') == 0
+
+
+def test_uri_ignore_words_list_option_uri(tmpdir, capsys):
+    """Test --uri-ignore-words-list option functionality with URIs."""
+ d = str(tmpdir)
+
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write('# Please see http://example.com/abandonned for info\n')
+ # Test file has 1 invalid entry, and it's not ignored by default.
+ assert cs.main(f.name) == 1
+ # An empty list is the default value, and nothing is ignored.
+ assert cs.main(f.name, '--uri-ignore-words-list=') == 1
+ # Non-matching regex results in nothing being ignored.
+ assert cs.main(f.name, '--uri-ignore-words-list=foo,example') == 1
+ # A word can be ignored.
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0
+ assert cs.main(f.name, '--uri-ignore-words-list=foo,abandonned,bar') == 0
+ assert cs.main(f.name, '--uri-ignore-words-list=*') == 0
+ # The match must be for the complete word.
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonn') == 1
+
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write('abandonned http://example.com/abandonned\n')
+ # Test file has 2 invalid entries.
+ assert cs.main(f.name) == 2
+ # Ignoring the value in the URI won't ignore the word completely.
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1
+ assert cs.main(f.name, '--uri-ignore-words-list=*') == 1
+ # The regular --ignore-words-list will ignore both.
+ assert cs.main(f.name, '--ignore-words-list=abandonned') == 0
+
+ variation_option = '--uri-ignore-words-list=abandonned'
+
+ # Variations where an error is ignored.
+ for variation in ('# Please see http://abandonned for info\n',
+ '# Please see "http://abandonned" for info\n',
+ # This variation could be un-ignored, but it'd require a
+ # more complex regex as " is valid in parts of URIs.
+ '# Please see "http://foo"abandonned for info\n',
+ '# Please see https://abandonned for info\n',
+ '# Please see ftp://abandonned for info\n',
+ '# Please see http://example/abandonned for info\n',
+ '# Please see http://example.com/abandonned for info\n',
+ '# Please see http://exam.com/ple#abandonned for info\n',
+ '# Please see http://exam.com/ple?abandonned for info\n',
+ '# Please see http://127.0.0.1/abandonned for info\n',
+ '# Please see http://[2001:0db8:85a3:0000:0000:8a2e:0370'
+ ':7334]/abandonned for info\n'):
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write(variation)
+ assert cs.main(f.name) == 1, variation
+ assert cs.main(f.name, variation_option) == 0, variation
+
+ # Variations where no error is ignored.
+ for variation in ('# Please see abandonned/ for info\n',
+ '# Please see http:abandonned for info\n',
+ '# Please see foo/abandonned for info\n',
+ '# Please see http://foo abandonned for info\n'):
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write(variation)
+ assert cs.main(f.name) == 1, variation
+ assert cs.main(f.name, variation_option) == 1, variation
+
+
+def test_uri_ignore_words_list_option_email(tmpdir, capsys):
+    """Test --uri-ignore-words-list option functionality with emails."""
+ d = str(tmpdir)
+
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write('# Please see example@abandonned.com for info\n')
+ # Test file has 1 invalid entry, and it's not ignored by default.
+ assert cs.main(f.name) == 1
+ # An empty list is the default value, and nothing is ignored.
+ assert cs.main(f.name, '--uri-ignore-words-list=') == 1
+ # Non-matching regex results in nothing being ignored.
+ assert cs.main(f.name, '--uri-ignore-words-list=foo,example') == 1
+ # A word can be ignored.
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 0
+ assert cs.main(f.name, '--uri-ignore-words-list=foo,abandonned,bar') == 0
+ assert cs.main(f.name, '--uri-ignore-words-list=*') == 0
+ # The match must be for the complete word.
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonn') == 1
+
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write('abandonned example@abandonned.com\n')
+ # Test file has 2 invalid entries.
+ assert cs.main(f.name) == 2
+ # Ignoring the value in the URI won't ignore the word completely.
+ assert cs.main(f.name, '--uri-ignore-words-list=abandonned') == 1
+ assert cs.main(f.name, '--uri-ignore-words-list=*') == 1
+ # The regular --ignore-words-list will ignore both.
+ assert cs.main(f.name, '--ignore-words-list=abandonned') == 0
+
+ variation_option = '--uri-ignore-words-list=abandonned'
+
+ # Variations where an error is ignored.
+ for variation in ('# Please see example@abandonned for info\n',
+ '# Please see abandonned@example for info\n',
+ '# Please see abandonned@example.com for info\n',
+ '# Please see mailto:abandonned@example.com?subject=Test'
+ ' for info\n'):
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write(variation)
+ assert cs.main(f.name) == 1, variation
+ assert cs.main(f.name, variation_option) == 0, variation
+
+ # Variations where no error is ignored.
+ for variation in ('# Please see example @ abandonned for info\n',
+ '# Please see abandonned@ example for info\n'):
+ with open(op.join(d, 'flag.txt'), 'w') as f:
+ f.write(variation)
+ assert cs.main(f.name) == 1, variation
+ assert cs.main(f.name, variation_option) == 1, variation
+
+
+def test_uri_regex_def():
+ uri_regex = re.compile(uri_regex_def)
+
+ # Tests based on https://mathiasbynens.be/demo/url-regex
+ true_positives = (
+ 'http://foo.com/blah_blah',
+ 'http://foo.com/blah_blah/',
+ 'http://foo.com/blah_blah_(wikipedia)',
+ 'http://foo.com/blah_blah_(wikipedia)_(again)',
+ 'http://www.example.com/wpstyle/?p=364',
+ 'https://www.example.com/foo/?bar=baz&inga=42&quux',
+ 'http://✪df.ws/123',
+ 'http://userid:password@example.com:8080',
+ 'http://userid:password@example.com:8080/',
+ 'http://userid@example.com',
+ 'http://userid@example.com/',
+ 'http://userid@example.com:8080',
+ 'http://userid@example.com:8080/',
+ 'http://userid:password@example.com',
+ 'http://userid:password@example.com/',
+ 'http://142.42.1.1/',
+ 'http://142.42.1.1:8080/',
+ 'http://➡.ws/䨹',
+ 'http://⌘.ws',
+ 'http://⌘.ws/',
+ 'http://foo.com/blah_(wikipedia)#cite-1',
+ 'http://foo.com/blah_(wikipedia)_blah#cite-1',
+ 'http://foo.com/unicode_(✪)_in_parens',
+ 'http://foo.com/(something)?after=parens',
+ 'http://☺.damowmow.com/',
+ 'http://code.google.com/events/#&product=browser',
+ 'http://j.mp',
+ 'ftp://foo.bar/baz',
+ 'http://foo.bar/?q=Test%20URL-encoded%20stuff',
+ 'http://مثال.إختبار',
+ 'http://例子.测试',
+ 'http://उदाहरण.परीक्षा',
+ "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
+ 'http://1337.net',
+ 'http://a.b-c.de',
+ 'http://223.255.255.254',
+ )
+ true_negatives = (
+ 'http://',
+ '//',
+ '//a',
+ '///a',
+ '///',
+ 'foo.com',
+ 'rdar://1234',
+ 'h://test',
+ '://should.fail',
+ 'ftps://foo.bar/',
+ )
+ false_positives = (
+ 'http://.',
+ 'http://..',
+ 'http://../',
+ 'http://?',
+ 'http://??',
+ 'http://??/',
+ 'http://#',
+ 'http://##',
+ 'http://##/',
+ 'http:///a',
+ 'http://-error-.invalid/',
+ 'http://a.b--c.de/',
+ 'http://-a.b.co',
+ 'http://a.b-.co',
+ 'http://0.0.0.0',
+ 'http://10.1.1.0',
+ 'http://10.1.1.255',
+ 'http://224.1.1.1',
+ 'http://1.1.1.1.1',
+ 'http://123.123.123',
+ 'http://3628126748',
+ 'http://.www.foo.bar/',
+ 'http://www.foo.bar./',
+ 'http://.www.foo.bar./',
+ 'http://10.1.1.1',
+ )
+
+ boilerplate = 'Surrounding text %s more text'
+
+ for uri in true_positives + false_positives:
+ assert uri_regex.findall(uri) == [uri], uri
+ assert uri_regex.findall(boilerplate % uri) == [uri], uri
+
+ for uri in true_negatives:
+ assert not uri_regex.findall(uri), uri
+ assert not uri_regex.findall(boilerplate % uri), uri
def test_config(tmpdir, capsys):