Check for duplicates across dictionaries (#1501)

* Check for duplicates across dictionaries

Also, no longer allow self-corrections within these checks, given that we already exclude them elsewhere.

* Improve the "is an error itself in another dictionary file" logging

* Try @larsoner's proposed improvement to global dictionary checking

* Fix the syntax error and (I think) fix the cross-dictionary detection

* Ensure we catch all the error cases

* ENH: Refactor

* Move chack back to rare

* Fix the substitution variables for an error

* Move wen back from rare to names

* Finish moving chack to rare

* Remove a few en-GB to en-US conversions from rare

Co-authored-by: Eric Larson <larson.eric.d@gmail.com>
diff --git a/.travis.yml b/.travis.yml
index 535d8c0..2f301f7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,7 +30,7 @@
     - source venv/bin/activate
     - python --version  # just to check
     - pip install -U pip wheel  # upgrade to latest pip find 3.5 wheels; wheel to avoid errors
-    - retry pip install pytest pytest-cov flake8 coverage codecov chardet setuptools docutils
+    - retry pip install pytest pytest-cov pytest-dependency flake8 coverage codecov chardet setuptools docutils
     - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then retry pip install aspell-python-py2; fi
     - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ]; then retry pip install aspell-python-py3; fi
     - cd $SRC_DIR
diff --git a/appveyor.yml b/appveyor.yml
index 201e011..d7a9999 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -16,7 +16,7 @@
 
 install:
   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
-  - "pip install pytest pytest-cov setuptools flake8 coverage chardet codecov"
+  - "pip install pytest pytest-cov pytest-dependency setuptools flake8 coverage chardet codecov"
   - "python setup.py develop"
 
 build: false  # Not a C# project, build stuff at the test step instead.
diff --git a/codespell_lib/data/dictionary.txt b/codespell_lib/data/dictionary.txt
index 1e2ffb9..5b65e67 100644
--- a/codespell_lib/data/dictionary.txt
+++ b/codespell_lib/data/dictionary.txt
@@ -4223,7 +4223,6 @@
 chache->cache
 chached->cached
 chacheline->cacheline
-chack->check, chalk,
 chaeck->check
 chaecked->checked
 chaecker->checker
diff --git a/codespell_lib/data/dictionary_names.txt b/codespell_lib/data/dictionary_names.txt
index e58d4c7..a9113db 100644
--- a/codespell_lib/data/dictionary_names.txt
+++ b/codespell_lib/data/dictionary_names.txt
@@ -6,5 +6,4 @@
 que->queue
 sargent->sergeant, argent,
 tim->time
-wen->when
-wight->weight, white, right, write,
+wen->we, when,
diff --git a/codespell_lib/data/dictionary_rare.txt b/codespell_lib/data/dictionary_rare.txt
index f79ee60..cd425c0 100644
--- a/codespell_lib/data/dictionary_rare.txt
+++ b/codespell_lib/data/dictionary_rare.txt
@@ -13,8 +13,7 @@
 busses->buses
 calculatable->calculable
 cant->can't
-catalogue->catalog
-chack->check, cheque,
+chack->check, chalk, cheque,
 chancel->cancel
 chancels->cancels
 circularly->circular
@@ -79,7 +78,6 @@
 marge->merge
 mater->matter, master, mother,
 medias->media, mediums,
-memorise->memorize
 midwifes->midwives
 moil->soil, mohel,
 mot->not
@@ -133,12 +131,11 @@
 wan->want
 want's->wants
 wee->we
-wen->we, when,
 whats->what's
 whet->when, what, wet,
 whiling->while
-wight->weight, white, right,
-wights->weights, whites, rights,
+wight->weight, white, right, write,
+wights->weights, whites, rights, writes,
 wit->with
 withe->with
 wither->either, whether, weather,
diff --git a/codespell_lib/tests/test_dictionary.py b/codespell_lib/tests/test_dictionary.py
index a71e4c7..2349874 100644
--- a/codespell_lib/tests/test_dictionary.py
+++ b/codespell_lib/tests/test_dictionary.py
@@ -28,6 +28,8 @@
 ws = re.compile(r'.*\s.*')  # whitespace
 comma = re.compile(r'.*,.*')  # comma
 
+global_err_dicts = dict()
+global_pairs = set()
 
 # Filename, should be seen as errors in aspell or not
 _data_dir = op.join(op.dirname(__file__), '..', 'data')
@@ -169,21 +171,78 @@
         _check_err_rep(err, rep, (err_aspell, rep_aspell), 'dummy')
 
 
+# allow some duplicates, like "m-i-n-i-m-i-s-e", or "c-a-l-c-u-l-a-t-a-b-l-e"
+allowed_dups = {
+    ('dictionary.txt', 'dictionary_en-GB_to_en-US.txt'),
+    ('dictionary.txt', 'dictionary_rare.txt'),
+}
+
+
 @fname_params
+@pytest.mark.dependency(name='dictionary loop')
 def test_dictionary_looping(fname, in_aspell):
     """Test that all dictionary entries are valid."""
-    err_dict = dict()
+    this_err_dict = dict()
+    short_fname = op.basename(fname)
     with open(fname, 'rb') as fid:
         for line in fid:
             err, rep = line.decode('utf-8').split('->')
             err = err.lower()
-            assert err not in err_dict, 'error %r already exists' % err
+            assert err not in this_err_dict, \
+                'error %r already exists in %s' % (err, short_fname)
             rep = rep.rstrip('\n')
             reps = [r.strip() for r in rep.lower().split(',')]
             reps = [r for r in reps if len(r)]
-            err_dict[err] = reps
-    # check for corrections that are errors (but not self replacements)
-    for err in err_dict:
-        for r in err_dict[err]:
-            assert (r not in err_dict) or (r in err_dict[r]), \
-                ('error %s: correction %s is an error itself' % (err, r))
+            this_err_dict[err] = reps
+    # 1. check the dict against itself (diagonal)
+    for err in this_err_dict:
+        for r in this_err_dict[err]:
+            assert r not in this_err_dict, \
+                ('error %s: correction %s is an error itself in the same '
+                 'dictionary file %s' % (err, r, short_fname))
+    pair = (short_fname, short_fname)
+    assert pair not in global_pairs
+    global_pairs.add(pair)
+    for other_fname, other_err_dict in global_err_dicts.items():
+        # error duplication (eventually maybe we should just merge?)
+        for err in this_err_dict:
+            assert err not in other_err_dict, \
+                ('error %r in dictionary %s already exists in dictionary '
+                 '%s' % (err, short_fname, other_fname))
+        # 2. check corrections in this dict against other dicts (upper)
+        pair = (short_fname, other_fname)
+        if pair not in allowed_dups:
+            for err in this_err_dict:
+                assert err not in other_err_dict, \
+                    ('error %r in dictionary %s already exists in dictionary '
+                     '%s' % (err, short_fname, other_fname))
+                for r in this_err_dict[err]:
+                    assert r not in other_err_dict, \
+                        ('error %s: correction %s from dictionary %s is an '
+                         'error itself in dictionary %s'
+                         % (err, r, short_fname, other_fname))
+        assert pair not in global_pairs
+        global_pairs.add(pair)
+        # 3. check corrections in other dicts against this dict (lower)
+        pair = (other_fname, short_fname)
+        if pair not in allowed_dups:
+            for err in other_err_dict:
+                for r in other_err_dict[err]:
+                    assert r not in this_err_dict, \
+                        ('error %s: correction %s from dictionary %s is an '
+                         'error itself in dictionary %s'
+                         % (err, r, other_fname, short_fname))
+        assert pair not in global_pairs
+        global_pairs.add(pair)
+    global_err_dicts[short_fname] = this_err_dict
+
+
+@pytest.mark.dependency(depends=['dictionary loop'])
+def test_ran_all():
+    """Test that all pairwise tests ran."""
+    for f1, _ in _fnames_in_aspell:
+        f1 = op.basename(f1)
+        for f2, _ in _fnames_in_aspell:
+            f2 = op.basename(f2)
+            assert (f1, f2) in global_pairs
+    assert len(global_pairs) == len(_fnames_in_aspell) ** 2