Merge branch 'dict' into v1.2.x

commit: eeaa6dcdb342063bb6d3a7ed2e5003d772552f43 [log] [tgz]
author: Lucas De Marchi <lucas.de.marchi@gmail.com> Sat Oct 15 10:05:12 2011 -0300
committer: Lucas De Marchi <lucas.de.marchi@gmail.com> Sat Oct 15 10:05:12 2011 -0300
tree: 5c36a8f4c6b27f70497011ac7aa4ef38c97e3779
parent: be9d1ace364d29c528c83e48f0a861aa272c8368 [diff]
parent: 2329b456f5b07b8a6ee888288ff3d201718a739c [diff]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..6c54f02
--- /dev/null
+++ b/Makefile

@@ -0,0 +1,10 @@
+
+# Installation directories. They are assigned with ?=, so a value already
+# provided by the environment or the make command line takes precedence.
+prefix ?= /usr
+bindir ?= ${prefix}/bin
+datadir ?= ${prefix}/share/codespell
+
+
+# Create the target directories, then copy the data files (mode 644) and the
+# script (mode 755). DESTDIR is prepended to every destination path.
+install:
+	install -d ${DESTDIR}${datadir} ${DESTDIR}${bindir}
+	install -m644 -t ${DESTDIR}${datadir} data/dictionary.txt data/linux-kernel.exclude
+	install -m755 -t ${DESTDIR}${bindir} codespell.py

diff --git a/README b/README
index f9ed44b..47cba2d 100644
--- a/README
+++ b/README

@@ -4,7 +4,13 @@
 Fix common misspellings in text files. It's designed primarily for checking
 misspelled words in source code, but it can be used with other files as well.
 
-USAGE
+Information
+===========
+
+Mailing list:
+	codespell@googlegroups.com
+
+Usage
 =====
 
 Check usage with ./codespell -h. There are a few command line options. We ship

diff --git a/TODO b/TODO
new file mode 100644
index 0000000..4f86e06
--- /dev/null
+++ b/TODO

@@ -0,0 +1,5 @@
+- Add option to disable changes to source code, allowing them only on comments
+  and text files
+
+BUGS
+====

diff --git a/codespell.py b/codespell.py
index cf81490..4252542 100755
--- a/codespell.py
+++ b/codespell.py

@@ -14,21 +14,26 @@
 # along with this program; if not, see
 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
 """
-Copyright (C) 2010 Lucas De Marchi <lucas.de.marchi@gmail.com>
+Copyright (C) 2010-2011  Lucas De Marchi <lucas.de.marchi@gmail.com>
+Copyright (C) 2011  ProFUSION embedded systems
 """
 
 import sys
 import re
 from optparse import OptionParser
 import os
+import fnmatch
 
 USAGE = """
 \t%prog [OPTIONS] dict_filename [file1 file2 ... fileN]
 """
-VERSION = '1.0'
+VERSION = '1.2'
 
 misspellings = {}
+exclude_lines = set()
 options = None
+fileopener = None
+quiet_level = 0
 encodings = [ 'utf-8', 'iso-8859-1' ]
 
 #OPTIONS:
@@ -38,7 +43,32 @@
 #                        If set to '-', it will be read from stdin
 #    file1 .. fileN      Files to check spelling
 
-class Mispell:
+class QuietLevels:
+    """Bit flags that are OR-ed together to form the quiet-level bitmask."""
+    NONE = 0                      # print every message
+    ENCODING = 1 << 0             # hide warnings about wrong encoding
+    BINARY_FILE = 1 << 1          # hide warnings about binary files
+    DISABLED_FIXES = 1 << 2       # hide fixes disabled in the dictionary
+    NON_AUTOMATIC_FIXES = 1 << 3  # hide non-automatic fixes
+    FIXES = 1 << 4                # hide the report of fixed files
+
+class GlobMatch:
+    """Match file names against a comma-separated list of glob patterns."""
+
+    def __init__(self, pattern):
+        # An empty or missing pattern means that nothing ever matches.
+        self.pattern_list = pattern.split(',') if pattern else None
+
+    def match(self, filename):
+        """Return True when filename matches at least one stored glob."""
+        if self.pattern_list is None:
+            return False
+
+        return any(fnmatch.fnmatch(filename, glob)
+                   for glob in self.pattern_list)
+
+class Misspell:
     def __init__(self, data, fix, reason):
         self.data = data
         self.fix = fix
@@ -57,6 +87,104 @@
         self.FWORD = ''
         self.DISABLE = ''
 
+class Summary:
+    """Count how many times each misspelled word was seen."""
+
+    def __init__(self):
+        # Maps the wrong word to its number of occurrences.
+        self.summary = {}
+
+    def update(self, wrongword):
+        """Increment the counter kept for wrongword."""
+        self.summary[wrongword] = self.summary.get(wrongword, 0) + 1
+
+    def __str__(self):
+        # One row per word in alphabetical order; the count is padded so
+        # that rows line up at column 15 for words shorter than that.
+        rows = []
+        for word in sorted(self.summary):
+            rows.append("{0}{1:{width}}".format(word, self.summary[word],
+                                                width=15 - len(word)))
+        return "\n".join(rows)
+
+class FileOpener:
+    """Read a text file and report which encoding was used to decode it.
+
+    When use_chardet is true the encoding is detected with the optional
+    chardet package; otherwise each entry of the module-level ``encodings``
+    list is tried in order until one decodes the whole file.
+    """
+
+    def __init__(self, use_chardet):
+        self.use_chardet = use_chardet
+        if use_chardet:
+            self.init_chardet()
+
+    def init_chardet(self):
+        """Create the chardet detector; raise Exception if chardet is absent."""
+        try:
+            from chardet.universaldetector import UniversalDetector
+        except ImportError:
+            raise Exception("There's no chardet installed to import from. "
+                            "Please, install it and check your PYTHONPATH "
+                            "environment variable")
+
+        self.encdetector = UniversalDetector()
+
+    def open(self, filename):
+        """Return a (lines, encoding) tuple for filename."""
+        if self.use_chardet:
+            return self.open_with_chardet(filename)
+        return self.open_with_internal(filename)
+
+    def open_with_chardet(self, filename):
+        """Detect the encoding with chardet, then read the file with it.
+
+        UnicodeDecodeError and LookupError are reported on stderr and
+        re-raised.
+        """
+        self.encdetector.reset()
+        with open(filename, 'rb') as f:
+            for line in f:
+                self.encdetector.feed(line)
+                if self.encdetector.done:
+                    break
+        self.encdetector.close()
+        encoding = self.encdetector.result['encoding']
+
+        try:
+            # The with-statement closes the file on every path. When open()
+            # itself fails (e.g. LookupError for an unknown codec) there is
+            # no handle to close, so the original error is not masked.
+            with open(filename, encoding=encoding) as f:
+                lines = f.readlines()
+        except UnicodeDecodeError:
+            print('ERROR: Could not detect encoding: %s' % filename,
+                                                        file=sys.stderr)
+            raise
+        except LookupError:
+            print('ERROR: %s -- Don\'t know how to handle encoding %s'
+                                % (filename, encoding), file=sys.stderr)
+            raise
+
+        return lines, encoding
+
+    def open_with_internal(self, filename):
+        """Try every known encoding in order and return the first success.
+
+        An empty file yields an empty list of lines. Exception('Unknown
+        encoding') is raised only when no encoding can decode the file.
+        """
+        for curr, encoding in enumerate(encodings):
+            try:
+                with open(filename, 'r', encoding=encoding) as f:
+                    return f.readlines(), encoding
+            except UnicodeDecodeError:
+                if quiet_level & QuietLevels.ENCODING:
+                    continue
+
+                print('WARNING: Decoding file %s' % filename,
+                                                        file=sys.stderr)
+                print('WARNING: using encoding=%s failed. ' % encoding,
+                                                        file=sys.stderr)
+                # Announce the encoding that is tried next, not the one
+                # that has just failed, and only if there is one left.
+                if curr + 1 < len(encodings):
+                    print('WARNING: Trying next encoding: %s'
+                                                    % encodings[curr + 1],
+                                                        file=sys.stderr)
+
+        print('ERROR: Could not detect encoding: %s' % filename,
+                                                        file=sys.stderr)
+        raise Exception('Unknown encoding')
+
 # -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
 
 def parse_options(args):
@@ -73,6 +201,49 @@
                         action = 'store_true', default = False,
                         help = 'write changes in place if possible')
 
+    parser.add_option('-s', '--summary',
+                        action = 'store_true', default = False,
+                        help = 'print summary of fixes')
+
+    parser.add_option('-S', '--skip',
+                        help = 'Comma-separated list of files to skip. It '\
+                               'accepts globs as well. E.g.: if you want '\
+                               'codespell to skip .eps and .txt files, '\
+                               'you\'d give "*.eps,*.txt" to this option. '\
+                               'It is expecially useful if you are using in '\
+                               'conjunction with -r option.')
+
+    parser.add_option('-x', '--exclude-file',
+                        help = 'FILE with lines that should not be changed',
+                        metavar='FILE')
+
+    parser.add_option('-i', '--interactive',
+                        action='store', type='int', default=0,
+                        help = 'Set interactive mode when writing changes. '  \
+                                '0 is the same of no interactivity; 1 makes ' \
+                                'codespell ask confirmation; 2 ask user to '  \
+                                'choose one fix when more than one is ' \
+                                'available; 3 applies both 1 and 2')
+
+    parser.add_option('-q', '--quiet-level',
+                        action='store', type='int', default=0,
+                        help = 'Bitmask that allows codespell to run quietly. '\
+                                '0: the default, in which all messages are '\
+                                'printed. 1: disable warnings about wrong '\
+                                'encoding. 2: disable warnings about binary '\
+                                'file. 4: shut down warnings about automatic '\
+                                'fixes that were disabled in dictionary. '\
+                                '8: don\'t print anything for non-automatic '\
+                                'fixes. 16: don\'t print fixed files.')
+
+    parser.add_option('-e', '--hard-encoding-detection',
+                        action='store_true', default = False,
+                        help = 'Use chardet to detect the encoding of each '\
+                        'file. This can slow down codespell, but is more '\
+                        'reliable in detecting encodings other than utf-8, '\
+                        'iso8859-1 and ascii.')
+
+
     (o, args) = parser.parse_args()
     if (len(args) < 1):
         print('ERROR: you need to specify a dictionary!', file=sys.stderr)
@@ -83,6 +254,10 @@
 
     return o, args
 
+def build_exclude_hashes(filename):
+    """Add every line of filename to the global exclude_lines set."""
+    with open(filename, 'r') as exclude_file:
+        # Lines are stored verbatim, trailing newline included.
+        exclude_lines.update(exclude_file)
 
 def build_dict(filename):
     with open(filename, 'r') as f:
@@ -103,7 +278,7 @@
                 data = data[:fix]
                 fix = False
 
-            misspellings[key] = Mispell(data, fix, reason)
+            misspellings[key] = Misspell(data, fix, reason)
 
 def ishidden(filename):
     bfilename = os.path.basename(filename)
@@ -123,12 +298,73 @@
 
         return True
 
-def parse_file(filename, colors):
+def fix_case(word, fixword):
+    """Return fixword re-cased to follow the capitalisation of word."""
+    # Capitalized is checked before all-upper, so a word that satisfies
+    # both (e.g. a single upper-case letter) is treated as capitalized.
+    for recase in (str.capitalize, str.upper):
+        if word == recase(word):
+            return recase(fixword)
+    # Both are lower case, or the casing is mixed and cannot be guessed.
+    return fixword
+
+def ask_for_word_fix(line, wrongword, misspelling, interactivity):
+    """Interactively decide whether and how wrongword should be fixed.
+
+    line is echoed in the prompts, misspelling is the dictionary entry for
+    the word and interactivity is a bitmask: bit 1 asks for confirmation of
+    an automatic fix, bit 2 asks the user to pick one of several candidates.
+
+    Returns a (fix, fixword) tuple: whether the fix should be applied and
+    the replacement adjusted to the casing of wrongword. The answers are
+    stored on the misspelling object itself (fix, fixword, data).
+    """
+    # No interaction requested: report the dictionary entry unchanged.
+    if interactivity <= 0:
+        return misspelling.fix, fix_case(wrongword, misspelling.data)
+
+    if misspelling.fix and interactivity & 1:
+        r = ''
+        fixword = fix_case(wrongword, misspelling.data)
+        # Keep asking until the answer is Y or N; an empty answer means Y.
+        while not r:
+            print("%s\t%s  ==> %s (Y/n) " %  (line, wrongword, fixword), end='')
+            r = sys.stdin.readline().strip().upper()
+            if not r: r = 'Y'
+            if r != 'Y' and r != 'N':
+                print("Say 'y' or 'n'")
+                r = ''
+
+        if r == 'N':
+            # The refusal is recorded on the entry itself.
+            misspelling.fix = False
+            misspelling.fixword = ''
+
+    elif (interactivity & 2) and not misspelling.reason:
+        # if it is not disabled, i.e. it just has more than one possible fix,
+        # we ask the user which word to use
+
+        r = ''
+        # Candidate fixes are stored comma-separated in misspelling.data.
+        opt = list(map(lambda x: x.strip(), misspelling.data.split(',')))
+        while not r:
+            print("%s Choose an option (blank for none): " % line, end='')
+            for i in range(len(opt)):
+                fixword = fix_case(wrongword, opt[i])
+                print(" %d) %s" % (i, fixword), end='')
+            print(": ", end='')
+            sys.stdout.flush()
+
+            n = sys.stdin.readline().strip()
+            # A blank answer leaves the entry untouched.
+            if not n:
+                break
+
+            try:
+                n = int(n)
+                # NOTE(review): a negative number is accepted here and
+                # indexes opt from the end - confirm this is intended.
+                r = opt[n]
+            except (ValueError, IndexError):
+                print("Not a valid option\n")
+
+        if r:
+            # The chosen candidate replaces the candidate list on the entry.
+            misspelling.fix = True
+            misspelling.data = r
+
+    return misspelling.fix, fix_case(wrongword, misspelling.data)
+
+def parse_file(filename, colors, summary):
     lines = None
     changed = False
     global misspellings
     global options
     global encodings
+    global quiet_level
+
+    encoding = encodings[0]  # if not defined, use UTF-8
 
     if filename == '-':
         f = sys.stdin
@@ -136,49 +372,50 @@
     else:
         # ignore binary files
         if not istextfile(filename):
-            print("WARNING: Binary file: %s " % filename, file=sys.stderr)
+            if not quiet_level & QuietLevels.BINARY_FILE:
+                print("WARNING: Binary file: %s " % filename, file=sys.stderr)
             return
-
-        curr = 0
-        while True:
-            try:
-                f = open(filename, 'r', encoding=encodings[curr])
-                lines = f.readlines()
-                break
-            except UnicodeDecodeError:
-                print('WARNING: Decoding file %s' % filename, file=sys.stderr)
-                print('WARNING: using encoding=%s failed. '
-                                            % encodings[curr], file=sys.stderr)
-
-                curr += 1
-                print('WARNING: Trying next encoding: %s' % encodings[curr],
-                                                            file=sys.stderr)
-
-            finally:
-                f.close()
-
-        if not lines:
-            print('ERROR: Could not detect encoding: %s' % filename,
-                                                            file=sys.stderr)
+        try:
+            lines, encoding = fileopener.open(filename)
+        except:
             return
 
     i = 1
+    rx = re.compile(r"[\w']+")
     for line in lines:
-        for word in re.findall('\w+', line):
+        if line in exclude_lines:
+            i += 1
+            continue
+
+        fixed_words = set()
+        asked_for = set()
+
+        for word in rx.findall(line):
             lword = word.lower()
             if lword in misspellings:
-                if word == word.capitalize():
-                    fixword = misspellings[lword].data.capitalize()
-                elif word == word.upper():
-                    fixword = misspellings[lword].data.upper()
-                else:
-                    # even they are the same lower case or
-                    # or we don't have any idea
-                    fixword = misspellings[lword].data
+                fix = misspellings[lword].fix
+                fixword = fix_case(word, misspellings[lword].data)
 
-                if options.write_changes and misspellings[lword].fix:
+                if options.interactive and not lword in asked_for:
+                    fix, fixword = ask_for_word_fix(lines[i - 1], word,
+                                                    misspellings[lword],
+                                                    options.interactive)
+                    asked_for.add(lword)
+
+                if summary and fix:
+                    summary.update(lword)
+
+                if word in fixed_words:
+                    continue
+
+                if options.write_changes and fix:
                     changed = True
-                    lines[i - 1] = line.replace(word, fixword, 1)
+                    lines[i - 1] = re.sub(r'\b%s\b' % word, fixword, lines[i - 1])
+                    fixed_words.add(word)
+                    continue
+
+                # otherwise warning was explicitly set by interactive mode
+                if options.interactive & 2 and not fix and not misspellings[lword].reason:
                     continue
 
                 cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
@@ -187,10 +424,16 @@
                 crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)
 
                 if misspellings[lword].reason:
+                    if quiet_level & QuietLevels.DISABLED_FIXES:
+                        continue
+
                     creason = "  | %s%s%s" % (colors.FILE,
                                             misspellings[lword].reason,
                                             colors.DISABLE)
                 else:
+                    if quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
+                        continue
+
                     creason = ''
 
                 if filename != '-':
@@ -213,15 +456,18 @@
             for line in lines:
                 print(line, end='')
         else:
-            print("%sFIXED:%s %s" % (colors.FWORD, colors.DISABLE, filename),
-                                    file=sys.stderr)
-            f = open(filename, 'w')
+            if not quiet_level & QuietLevels.FIXES:
+                print("%sFIXED:%s %s" % (colors.FWORD, colors.DISABLE, filename),
+                                                                file=sys.stderr)
+            f = open(filename, 'w', encoding=encoding)
             f.writelines(lines)
             f.close()
 
-
 def main(*args):
     global options
+    global quiet_level
+    global fileopener
+
     (options, args) = parse_options(args)
 
     build_dict(args[0])
@@ -229,6 +475,21 @@
     if options.disable_colors:
         colors.disable()
 
+    if options.summary:
+        summary = Summary()
+    else:
+        summary = None
+
+    if options.exclude_file:
+        build_exclude_hashes(options.exclude_file)
+
+    if options.quiet_level:
+        quiet_level = options.quiet_level
+
+    fileopener = FileOpener(options.hard_encoding_detection)
+
+    glob_match = GlobMatch(options.skip)
+
     for filename in args[1:]:
         # ignore hidden files
         if ishidden(filename):
@@ -249,12 +510,17 @@
                 for file in files:
                     if os.path.islink(file):
                         continue
-
-                    parse_file(os.path.join(root, file), colors)
+                    if glob_match.match(file):
+                        continue
+                    parse_file(os.path.join(root, file), colors, summary)
 
             continue
 
-        parse_file(filename, colors)
+        parse_file(filename, colors, summary)
+
+    if summary:
+        print("\n-------8<-------\nSUMMARY:")
+        print(summary)
 
 if __name__ == '__main__':
     sys.exit(main(*sys.argv))

diff --git a/data/dictionary.txt b/data/dictionary.txt
index 8a9686b..5b2c793 100644
--- a/data/dictionary.txt
+++ b/data/dictionary.txt

@@ -648,6 +648,7 @@
 cannonical->canonical
 cannotation->connotation
 cannotations->connotations
+cant'->can't
 cant->can't
 caost->coast
 caperbility->capability
@@ -1207,6 +1208,7 @@
 dicovers->discovers
 dicovery->discovery
 dicussed->discussed
+didnt'->didn't
 didnt->didn't
 diea->idea, die,
 dieing->dying, dyeing,
@@ -1303,12 +1305,14 @@
 doctines->doctrines
 documenatry->documentary
 doens->does
+doesnt'->doesn't
 doesnt->doesn't
 doign->doing
 dominaton->domination
 dominent->dominant
 dominiant->dominant
 donig->doing
+dosent'->doesn't
 dosen't->doesn't
 doub->doubt, daub,
 doulbe->double
@@ -1821,6 +1825,7 @@
 harrassing->harassing
 harrassment->harassment
 harrassments->harassments
+hasnt'->hasn't
 hasnt->hasn't
 haviest->heaviest
 headquarer->headquarter
@@ -2147,6 +2152,7 @@
 irreplacable->irreplaceable
 irresistable->irresistible
 irresistably->irresistibly
+isnt'->isn't
 isnt->isn't
 Israelies->Israelis
 issueing->issuing
@@ -3459,6 +3465,7 @@
 shortwhile->short while
 shoudl->should
 shoudln->should, shouldn't,
+shouldnt'->shouldn't
 shouldnt->shouldn't
 shreak->shriek
 shrinked->shrunk
@@ -4116,6 +4123,7 @@
 wardobe->wardrobe
 warrent->warrant
 warrriors->warriors
+wasnt'->wasn't
 wasnt->wasn't
 wass->was
 watn->want

diff --git a/data/linux-kernel.exclude b/data/linux-kernel.exclude
new file mode 100644
index 0000000..d7dd699
--- /dev/null
+++ b/data/linux-kernel.exclude

@@ -0,0 +1,87 @@
+N: Tom Dyas
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ *          Tom Dyas
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@noc.rutgers.edu)
+MODULE_AUTHOR("Thomas K. Dyas (tdyas@noc.rutgers.edu) and Eddie C. Dost  (ecd@skynet.be)");
+ *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+ *		Tom Dyas		:	Module support.
+ *		Tom Dyas	:	Export net symbols.
+ * Copyright (C) 1996,1997 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+MODULE_AUTHOR("Thomas K. Dyas and David S. Miller");
+
+
+
+ * Sun people can't spell worth damn. "compatability" indeed.
+        .asciz "compatability"
+ * Sun people can't spell worth damn. "compatability" indeed.
+	.asciz "compatability"
+
+
+        Jonathan Teh Soon Yew <j.teh@iname.com>
+    <j.teh@iname.com> and Alex van Kaam <darkside@chello.nl>.)
+ <j.teh@iname.com>) */
+
+
+inv24    - change timings parameters for 24bpp modes on Millenium and
+           Millenium II. Specify this if you see strange color shadows around
+           non-Millenium.
+             Millenium I or II, because of these devices have hardware
+* ThrustMaster Millenium 3D Inceptor
+DC390F (Sym53c875) accepted this as well as my Millenium. But the Am53C974
+	{   1, "ThrustMaster Millenium 3D Inceptor",	  6, 2, { 4, 2 }, { 4, 6 }, tmdc_abs, tmdc_btn_joy },
+	 * DoC 2000 (it's in the Millenium docs), but it seems to work. */
+	minfo->millenium = 1;
+	int			millenium;
+						/* 0 except for 6MB Millenium */
+#define isMillenium(x)	 (x->millenium)
+#define isMillenium(x)	 (x->millennium)
+#define FB_AUX_TEXT_MGA_STEP16	3	/* MGA Millenium I: text, attr, 14 reserved bytes */
+#define FB_ACCEL_MATROX_MGA2064W 16	/* Matrox MGA2064W (Millenium)	*/
+#define FB_ACCEL_MATROX_MGA2164W 18	/* Matrox MGA2164W (Millenium II) */
+#define FB_ACCEL_MATROX_MGA2164W_AGP 19	/* Matrox MGA2164W (Millenium II) */
+
+
+ * Copyright (C) 2007 Marvell Internation Ltd.
+ * Copyright (C) 2007-2008 Marvell Internation Ltd.
+
+            dbug(1,dprintf("PTY/ECT/addCONF,relPLCI=%lx",relatedPLCIvalue));
+            /* send PTY/ECT req, cannot check all states because of US stuff */
+        dbug(1,dprintf("ECT OK"));
+   {   /* first indication after ECT-Request on Consultation Call */
+#define INT_CT_REJ 70       /* ECT rejected internal command        */
+	__u8 ip_ect;	/* ECT codepoint of IPv4 header, pre-shifted */
+			__u8 ece:1, cwr:1; /* TCP ECT bits */
+/* set ECT codepoint from IP header.
+		pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
+		/* Funny extension: if ECT is not set on a segment,
+		/* Not-retransmitted data segment: set ECT and inject CWR. */
+			/* ACK or retransmitted segment: clear ECT|CE */
+	 *  "The ECN-Capable Transport (ECT) bit would be set by the
+	 * Now setting the ECT bit all the time, as it should not cause
+
+ * Copyright © 2003 Agere Systems Inc.
+
+S: 1326 De Val-Brillant
+                <slot #2, id = 0x02, characters = "xtension whic">
+
+	at91_set_A_periph(AT91_PIN_PC1, 0);		/* [-SMOE-]{+SOME+} */
+
+	_REGISTER_CLOCK(NULL, "ect", ect_clk)
+infinit:
+	bra	infinit
+the read on an 8-byte boundary (e.g., if you seeked an odd number of bytes
+#define ISNT		   12 
+    (Some conversion-factor data were contributed by Jonathan Teh Soon Yew
+ From HWMon.cpp (Copyright 1998-2000 Jonathan Teh Soon Yew):
+ (These conversions were contributed by Jonathan Teh Soon Yew
+/* linear fits from HWMon.cpp (Copyright 1998-2000 Jonathan Teh Soon Yew)
+/* 	__u16 pallete:1; */
+	__be32 pallete;
+	struct diu_addr pallete;
+	int virtualX, virtualY;
+	struct regid		archType;
+		.ident = "Toshiba Satelite S1800-814",
+	at91_set_A_periph(AT91_PIN_PC1, 0);		/* SMOE */
commit	eeaa6dcdb342063bb6d3a7ed2e5003d772552f43	[log] [tgz]
author	Lucas De Marchi <lucas.de.marchi@gmail.com>	Sat Oct 15 10:05:12 2011 -0300
committer	Lucas De Marchi <lucas.de.marchi@gmail.com>	Sat Oct 15 10:05:12 2011 -0300
tree	5c36a8f4c6b27f70497011ac7aa4ef38c97e3779
parent	be9d1ace364d29c528c83e48f0a861aa272c8368 [diff]
parent	2329b456f5b07b8a6ee888288ff3d201718a739c [diff]