Check for duplicates across dictionaries (#1501)
* Check for duplicates across dictionaries
Also no longer allow self-corrections within these checks (they are already excluded elsewhere)
* Improve the "is an error itself in another dictionary file" logging
* Try @larsoner's proposed improvement to global dictionary checking
* Fix the syntax error and (I think) fix the cross-dictionary detection
* Ensure we catch all the error cases
* ENH: Refactor
* Move "chack" back to rare
* Fix the substitution variables for an error
* Move "wen" back from rare to names
* Finish moving "chack" to rare
* Remove a few en-GB to en-US conversions from rare
Co-authored-by: Eric Larson <larson.eric.d@gmail.com>
diff --git a/.travis.yml b/.travis.yml
index 535d8c0..2f301f7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,7 +30,7 @@
- source venv/bin/activate
- python --version # just to check
- pip install -U pip wheel # upgrade to latest pip find 3.5 wheels; wheel to avoid errors
- - retry pip install pytest pytest-cov flake8 coverage codecov chardet setuptools docutils
+ - retry pip install pytest pytest-cov pytest-dependency flake8 coverage codecov chardet setuptools docutils
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then retry pip install aspell-python-py2; fi
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ]; then retry pip install aspell-python-py3; fi
- cd $SRC_DIR
diff --git a/appveyor.yml b/appveyor.yml
index 201e011..d7a9999 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -16,7 +16,7 @@
install:
- "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
- - "pip install pytest pytest-cov setuptools flake8 coverage chardet codecov"
+ - "pip install pytest pytest-cov pytest-dependency setuptools flake8 coverage chardet codecov"
- "python setup.py develop"
build: false # Not a C# project, build stuff at the test step instead.
diff --git a/codespell_lib/data/dictionary.txt b/codespell_lib/data/dictionary.txt
index 1e2ffb9..5b65e67 100644
--- a/codespell_lib/data/dictionary.txt
+++ b/codespell_lib/data/dictionary.txt
@@ -4223,7 +4223,6 @@
chache->cache
chached->cached
chacheline->cacheline
-chack->check, chalk,
chaeck->check
chaecked->checked
chaecker->checker
diff --git a/codespell_lib/data/dictionary_names.txt b/codespell_lib/data/dictionary_names.txt
index e58d4c7..a9113db 100644
--- a/codespell_lib/data/dictionary_names.txt
+++ b/codespell_lib/data/dictionary_names.txt
@@ -6,5 +6,4 @@
que->queue
sargent->sergeant, argent,
tim->time
-wen->when
-wight->weight, white, right, write,
+wen->we, when,
diff --git a/codespell_lib/data/dictionary_rare.txt b/codespell_lib/data/dictionary_rare.txt
index f79ee60..cd425c0 100644
--- a/codespell_lib/data/dictionary_rare.txt
+++ b/codespell_lib/data/dictionary_rare.txt
@@ -13,8 +13,7 @@
busses->buses
calculatable->calculable
cant->can't
-catalogue->catalog
-chack->check, cheque,
+chack->check, chalk, cheque,
chancel->cancel
chancels->cancels
circularly->circular
@@ -79,7 +78,6 @@
marge->merge
mater->matter, master, mother,
medias->media, mediums,
-memorise->memorize
midwifes->midwives
moil->soil, mohel,
mot->not
@@ -133,12 +131,11 @@
wan->want
want's->wants
wee->we
-wen->we, when,
whats->what's
whet->when, what, wet,
whiling->while
-wight->weight, white, right,
-wights->weights, whites, rights,
+wight->weight, white, right, write,
+wights->weights, whites, rights, writes,
wit->with
withe->with
wither->either, whether, weather,
diff --git a/codespell_lib/tests/test_dictionary.py b/codespell_lib/tests/test_dictionary.py
index a71e4c7..2349874 100644
--- a/codespell_lib/tests/test_dictionary.py
+++ b/codespell_lib/tests/test_dictionary.py
@@ -28,6 +28,8 @@
ws = re.compile(r'.*\s.*') # whitespace
comma = re.compile(r'.*,.*') # comma
+global_err_dicts = dict()
+global_pairs = set()
# Filename, should be seen as errors in aspell or not
_data_dir = op.join(op.dirname(__file__), '..', 'data')
@@ -169,21 +171,78 @@
_check_err_rep(err, rep, (err_aspell, rep_aspell), 'dummy')
+# allow some duplicates, like "m-i-n-i-m-i-s-e", or "c-a-l-c-u-l-a-t-a-b-l-e"
+allowed_dups = {
+ ('dictionary.txt', 'dictionary_en-GB_to_en-US.txt'),
+ ('dictionary.txt', 'dictionary_rare.txt'),
+}
+
+
@fname_params
+@pytest.mark.dependency(name='dictionary loop')
def test_dictionary_looping(fname, in_aspell):
"""Test that all dictionary entries are valid."""
- err_dict = dict()
+ this_err_dict = dict()
+ short_fname = op.basename(fname)
with open(fname, 'rb') as fid:
for line in fid:
err, rep = line.decode('utf-8').split('->')
err = err.lower()
- assert err not in err_dict, 'error %r already exists' % err
+ assert err not in this_err_dict, \
+ 'error %r already exists in %s' % (err, short_fname)
rep = rep.rstrip('\n')
reps = [r.strip() for r in rep.lower().split(',')]
reps = [r for r in reps if len(r)]
- err_dict[err] = reps
- # check for corrections that are errors (but not self replacements)
- for err in err_dict:
- for r in err_dict[err]:
- assert (r not in err_dict) or (r in err_dict[r]), \
- ('error %s: correction %s is an error itself' % (err, r))
+ this_err_dict[err] = reps
+ # 1. check the dict against itself (diagonal)
+ for err in this_err_dict:
+ for r in this_err_dict[err]:
+ assert r not in this_err_dict, \
+ ('error %s: correction %s is an error itself in the same '
+ 'dictionary file %s' % (err, r, short_fname))
+ pair = (short_fname, short_fname)
+ assert pair not in global_pairs
+ global_pairs.add(pair)
+ for other_fname, other_err_dict in global_err_dicts.items():
+ # error duplication (eventually maybe we should just merge?)
+ for err in this_err_dict:
+ assert err not in other_err_dict, \
+ ('error %r in dictionary %s already exists in dictionary '
+ '%s' % (err, short_fname, other_fname))
+ # 2. check corrections in this dict against other dicts (upper)
+ pair = (short_fname, other_fname)
+ if pair not in allowed_dups:
+ for err in this_err_dict:
+ assert err not in other_err_dict, \
+ ('error %r in dictionary %s already exists in dictionary '
+ '%s' % (err, short_fname, other_fname))
+ for r in this_err_dict[err]:
+ assert r not in other_err_dict, \
+ ('error %s: correction %s from dictionary %s is an '
+ 'error itself in dictionary %s'
+ % (err, r, short_fname, other_fname))
+ assert pair not in global_pairs
+ global_pairs.add(pair)
+ # 3. check corrections in other dicts against this dict (lower)
+ pair = (other_fname, short_fname)
+ if pair not in allowed_dups:
+ for err in other_err_dict:
+ for r in other_err_dict[err]:
+ assert r not in this_err_dict, \
+ ('error %s: correction %s from dictionary %s is an '
+ 'error itself in dictionary %s'
+ % (err, r, other_fname, short_fname))
+ assert pair not in global_pairs
+ global_pairs.add(pair)
+ global_err_dicts[short_fname] = this_err_dict
+
+
+@pytest.mark.dependency(depends=['dictionary loop'])
+def test_ran_all():
+ """Test that all pairwise tests ran."""
+ for f1, _ in _fnames_in_aspell:
+ f1 = op.basename(f1)
+ for f2, _ in _fnames_in_aspell:
+ f2 = op.basename(f2)
+ assert (f1, f2) in global_pairs
+ assert len(global_pairs) == len(_fnames_in_aspell) ** 2