Merge branch 'dict' into v1.2.x
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..6c54f02
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+
+prefix ?= /usr
+bindir ?= ${prefix}/bin
+datadir ?= ${prefix}/share/codespell
+
+
+install:
+ install -d ${DESTDIR}${datadir} ${DESTDIR}${bindir}
+ install -m644 -t ${DESTDIR}${datadir} data/dictionary.txt data/linux-kernel.exclude
+ install -m755 -t ${DESTDIR}${bindir} codespell.py
diff --git a/README b/README
index f9ed44b..47cba2d 100644
--- a/README
+++ b/README
@@ -4,7 +4,13 @@
Fix common misspellings in text files. It's designed primarily for checking
misspelled words in source code, but it can be used with other files as well.
-USAGE
+Information
+===========
+
+Mailing list:
+ codespell@googlegroups.com
+
+Usage
=====
Check usage with ./codespell -h. There are a few command line options. We ship
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..4f86e06
--- /dev/null
+++ b/TODO
@@ -0,0 +1,5 @@
+- Add an option to disable changes to source code, allowing them only in
+ comments and text files
+
+BUGS
+====
diff --git a/codespell.py b/codespell.py
index cf81490..4252542 100755
--- a/codespell.py
+++ b/codespell.py
@@ -14,21 +14,26 @@
# along with this program; if not, see
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
-Copyright (C) 2010 Lucas De Marchi <lucas.de.marchi@gmail.com>
+Copyright (C) 2010-2011 Lucas De Marchi <lucas.de.marchi@gmail.com>
+Copyright (C) 2011 ProFUSION embedded systems
"""
import sys
import re
from optparse import OptionParser
import os
+import fnmatch
USAGE = """
\t%prog [OPTIONS] dict_filename [file1 file2 ... fileN]
"""
-VERSION = '1.0'
+VERSION = '1.2'
misspellings = {}
+exclude_lines = set()
options = None
+fileopener = None
+quiet_level = 0
encodings = [ 'utf-8', 'iso-8859-1' ]
#OPTIONS:
@@ -38,7 +43,32 @@
# If set to '-', it will be read from stdin
# file1 .. fileN Files to check spelling
-class Mispell:
+class QuietLevels:
+ NONE = 0
+ ENCODING = 1
+ BINARY_FILE = 2
+ DISABLED_FIXES = 4
+ NON_AUTOMATIC_FIXES = 8
+ FIXES = 16
+
+class GlobMatch:
+ def __init__(self, pattern):
+ if pattern:
+ self.pattern_list = pattern.split(',')
+ else:
+ self.pattern_list = None
+
+ def match(self, filename):
+ if self.pattern_list is None:
+ return False
+
+ for p in self.pattern_list:
+ if fnmatch.fnmatch(filename, p):
+ return True
+
+ return False
+
+class Misspell:
def __init__(self, data, fix, reason):
self.data = data
self.fix = fix
@@ -57,6 +87,104 @@
self.FWORD = ''
self.DISABLE = ''
+class Summary:
+ def __init__(self):
+ self.summary = {}
+
+ def update(self, wrongword):
+ if wrongword in self.summary:
+ self.summary[wrongword] += 1
+ else:
+ self.summary[wrongword] = 1
+
+ def __str__(self):
+ keys = list(self.summary.keys())
+ keys.sort()
+
+ return "\n".join(["{0}{1:{width}}".format(key, self.summary.get(key), width=15 - len(key)) for key in keys])
+
+class FileOpener:
+ def __init__(self, use_chardet):
+ self.use_chardet = use_chardet
+ if use_chardet:
+ self.init_chardet()
+
+ def init_chardet(self):
+ try:
+ from chardet.universaldetector import UniversalDetector
+ except ImportError:
+ raise Exception("There's no chardet installed to import from. "
+ "Please, install it and check your PYTHONPATH "
+ "environment variable")
+
+ self.encdetector = UniversalDetector()
+
+ def open(self, filename):
+ if self.use_chardet:
+ return self.open_with_chardet(filename)
+ else:
+ return self.open_with_internal(filename)
+
+ def open_with_chardet(self, filename):
+ self.encdetector.reset()
+ with open(filename, 'rb') as f:
+ for line in f:
+ self.encdetector.feed(line)
+ if self.encdetector.done:
+ break
+ self.encdetector.close()
+ encoding = self.encdetector.result['encoding']
+
+ try:
+ f = open(filename, encoding=encoding)
+ lines = f.readlines()
+ except UnicodeDecodeError:
+ print('ERROR: Could not detect encoding: %s' % filename,
+ file=sys.stderr)
+ raise
+ except LookupError:
+ print('ERROR: %s -- Don\'t know how to handle encoding %s'
+ % (filename, encoding), file=sys.stderr)
+ raise
+ finally:
+ f.close()
+
+ return lines, encoding
+
+
+ def open_with_internal(self, filename):
+ curr = 0
+ global encodings
+
+ while True:
+ try:
+ f = open(filename, 'r', encoding=encodings[curr])
+ lines = f.readlines()
+ break
+ except UnicodeDecodeError:
+ if not quiet_level & QuietLevels.ENCODING:
+ print('WARNING: Decoding file %s' % filename,
+ file=sys.stderr)
+ print('WARNING: using encoding=%s failed. '
+ % encodings[curr],
+ file=sys.stderr)
+ print('WARNING: Trying next encoding: %s' % encodings[curr],
+ file=sys.stderr)
+
+ curr += 1
+
+ finally:
+ f.close()
+
+ if not lines:
+ print('ERROR: Could not detect encoding: %s' % filename,
+ file=sys.stderr)
+ raise Exception('Unknown encoding')
+
+ encoding = encodings[curr]
+
+ return lines, encoding
+
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
def parse_options(args):
@@ -73,6 +201,49 @@
action = 'store_true', default = False,
help = 'write changes in place if possible')
+ parser.add_option('-s', '--summary',
+ action = 'store_true', default = False,
+ help = 'print summary of fixes')
+
+ parser.add_option('-S', '--skip',
+ help = 'Comma-separated list of files to skip. It '\
+ 'accepts globs as well. E.g.: if you want '\
+ 'codespell to skip .eps and .txt files, '\
+ 'you\'d give "*.eps,*.txt" to this option. '\
+ 'It is expecially useful if you are using in '\
+ 'conjunction with -r option.')
+
+ parser.add_option('-x', '--exclude-file',
+ help = 'FILE with lines that should not be changed',
+ metavar='FILE')
+
+ parser.add_option('-i', '--interactive',
+ action='store', type='int', default=0,
+ help = 'Set interactive mode when writing changes. ' \
+ '0 is the same of no interactivity; 1 makes ' \
+ 'codespell ask confirmation; 2 ask user to ' \
+ 'choose one fix when more than one is ' \
+ 'available; 3 applies both 1 and 2')
+
+ parser.add_option('-q', '--quiet-level',
+ action='store', type='int', default=0,
+ help = 'Bitmask that allows codespell to run quietly. '\
+ '0: the default, in which all messages are '\
+ 'printed. 1: disable warnings about wrong '\
+ 'encoding. 2: disable warnings about binary '\
+ 'file. 4: shut down warnings about automatic '\
+ 'fixes that were disabled in dictionary. '\
+ '8: don\'t print anything for non-automatic '\
+ 'fixes. 16: don\'t print fixed files.')
+
+ parser.add_option('-e', '--hard-encoding-detection',
+ action='store_true', default = False,
+ help = 'Use chardet to detect the encoding of each '\
+ 'file. This can slow down codespell, but is more '\
+ 'reliable in detecting encodings other than utf-8, '\
+ 'iso8859-1 and ascii.')
+
+
(o, args) = parser.parse_args()
if (len(args) < 1):
print('ERROR: you need to specify a dictionary!', file=sys.stderr)
@@ -83,6 +254,10 @@
return o, args
+def build_exclude_hashes(filename):
+ with open(filename, 'r') as f:
+ for line in f:
+ exclude_lines.add(line)
def build_dict(filename):
with open(filename, 'r') as f:
@@ -103,7 +278,7 @@
data = data[:fix]
fix = False
- misspellings[key] = Mispell(data, fix, reason)
+ misspellings[key] = Misspell(data, fix, reason)
def ishidden(filename):
bfilename = os.path.basename(filename)
@@ -123,12 +298,73 @@
return True
-def parse_file(filename, colors):
+def fix_case(word, fixword):
+ if word == word.capitalize():
+ return fixword.capitalize()
+ elif word == word.upper():
+ return fixword.upper()
+ # they are both lower case
+ # or we don't have any idea
+ return fixword
+
+def ask_for_word_fix(line, wrongword, misspelling, interactivity):
+ if interactivity <= 0:
+ return misspelling.fix, fix_case(wrongword, misspelling.data)
+
+ if misspelling.fix and interactivity & 1:
+ r = ''
+ fixword = fix_case(wrongword, misspelling.data)
+ while not r:
+ print("%s\t%s ==> %s (Y/n) " % (line, wrongword, fixword), end='')
+ r = sys.stdin.readline().strip().upper()
+ if not r: r = 'Y'
+ if r != 'Y' and r != 'N':
+ print("Say 'y' or 'n'")
+ r = ''
+
+ if r == 'N':
+ misspelling.fix = False
+ misspelling.fixword = ''
+
+ elif (interactivity & 2) and not misspelling.reason:
+ # if it is not disabled, i.e. it just has more than one possible fix,
+ # we ask the user which word to use
+
+ r = ''
+ opt = list(map(lambda x: x.strip(), misspelling.data.split(',')))
+ while not r:
+ print("%s Choose an option (blank for none): " % line, end='')
+ for i in range(len(opt)):
+ fixword = fix_case(wrongword, opt[i])
+ print(" %d) %s" % (i, fixword), end='')
+ print(": ", end='')
+ sys.stdout.flush()
+
+ n = sys.stdin.readline().strip()
+ if not n:
+ break
+
+ try:
+ n = int(n)
+ r = opt[n]
+ except (ValueError, IndexError):
+ print("Not a valid option\n")
+
+ if r:
+ misspelling.fix = True
+ misspelling.data = r
+
+ return misspelling.fix, fix_case(wrongword, misspelling.data)
+
+def parse_file(filename, colors, summary):
lines = None
changed = False
global misspellings
global options
global encodings
+ global quiet_level
+
+ encoding = encodings[0] # if not defined, use UTF-8
if filename == '-':
f = sys.stdin
@@ -136,49 +372,50 @@
else:
# ignore binary files
if not istextfile(filename):
- print("WARNING: Binary file: %s " % filename, file=sys.stderr)
+ if not quiet_level & QuietLevels.BINARY_FILE:
+ print("WARNING: Binary file: %s " % filename, file=sys.stderr)
return
-
- curr = 0
- while True:
- try:
- f = open(filename, 'r', encoding=encodings[curr])
- lines = f.readlines()
- break
- except UnicodeDecodeError:
- print('WARNING: Decoding file %s' % filename, file=sys.stderr)
- print('WARNING: using encoding=%s failed. '
- % encodings[curr], file=sys.stderr)
-
- curr += 1
- print('WARNING: Trying next encoding: %s' % encodings[curr],
- file=sys.stderr)
-
- finally:
- f.close()
-
- if not lines:
- print('ERROR: Could not detect encoding: %s' % filename,
- file=sys.stderr)
+ try:
+ lines, encoding = fileopener.open(filename)
+ except:
return
i = 1
+ rx = re.compile(r"[\w']+")
for line in lines:
- for word in re.findall('\w+', line):
+ if line in exclude_lines:
+ i += 1
+ continue
+
+ fixed_words = set()
+ asked_for = set()
+
+ for word in rx.findall(line):
lword = word.lower()
if lword in misspellings:
- if word == word.capitalize():
- fixword = misspellings[lword].data.capitalize()
- elif word == word.upper():
- fixword = misspellings[lword].data.upper()
- else:
- # even they are the same lower case or
- # or we don't have any idea
- fixword = misspellings[lword].data
+ fix = misspellings[lword].fix
+ fixword = fix_case(word, misspellings[lword].data)
- if options.write_changes and misspellings[lword].fix:
+ if options.interactive and not lword in asked_for:
+ fix, fixword = ask_for_word_fix(lines[i - 1], word,
+ misspellings[lword],
+ options.interactive)
+ asked_for.add(lword)
+
+ if summary and fix:
+ summary.update(lword)
+
+ if word in fixed_words:
+ continue
+
+ if options.write_changes and fix:
changed = True
- lines[i - 1] = line.replace(word, fixword, 1)
+ lines[i - 1] = re.sub(r'\b%s\b' % word, fixword, lines[i - 1])
+ fixed_words.add(word)
+ continue
+
+ # otherwise warning was explicitly set by interactive mode
+ if options.interactive & 2 and not fix and not misspellings[lword].reason:
continue
cfilename = "%s%s%s" % (colors.FILE, filename, colors.DISABLE)
@@ -187,10 +424,16 @@
crightword = "%s%s%s" % (colors.FWORD, fixword, colors.DISABLE)
if misspellings[lword].reason:
+ if quiet_level & QuietLevels.DISABLED_FIXES:
+ continue
+
creason = " | %s%s%s" % (colors.FILE,
misspellings[lword].reason,
colors.DISABLE)
else:
+ if quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
+ continue
+
creason = ''
if filename != '-':
@@ -213,15 +456,18 @@
for line in lines:
print(line, end='')
else:
- print("%sFIXED:%s %s" % (colors.FWORD, colors.DISABLE, filename),
- file=sys.stderr)
- f = open(filename, 'w')
+ if not quiet_level & QuietLevels.FIXES:
+ print("%sFIXED:%s %s" % (colors.FWORD, colors.DISABLE, filename),
+ file=sys.stderr)
+ f = open(filename, 'w', encoding=encoding)
f.writelines(lines)
f.close()
-
def main(*args):
global options
+ global quiet_level
+ global fileopener
+
(options, args) = parse_options(args)
build_dict(args[0])
@@ -229,6 +475,21 @@
if options.disable_colors:
colors.disable()
+ if options.summary:
+ summary = Summary()
+ else:
+ summary = None
+
+ if options.exclude_file:
+ build_exclude_hashes(options.exclude_file)
+
+ if options.quiet_level:
+ quiet_level = options.quiet_level
+
+ fileopener = FileOpener(options.hard_encoding_detection)
+
+ glob_match = GlobMatch(options.skip)
+
for filename in args[1:]:
# ignore hidden files
if ishidden(filename):
@@ -249,12 +510,17 @@
for file in files:
if os.path.islink(file):
continue
-
- parse_file(os.path.join(root, file), colors)
+ if glob_match.match(file):
+ continue
+ parse_file(os.path.join(root, file), colors, summary)
continue
- parse_file(filename, colors)
+ parse_file(filename, colors, summary)
+
+ if summary:
+ print("\n-------8<-------\nSUMMARY:")
+ print(summary)
if __name__ == '__main__':
sys.exit(main(*sys.argv))
diff --git a/data/dictionary.txt b/data/dictionary.txt
index 8a9686b..5b2c793 100644
--- a/data/dictionary.txt
+++ b/data/dictionary.txt
@@ -648,6 +648,7 @@
cannonical->canonical
cannotation->connotation
cannotations->connotations
+cant'->can't
cant->can't
caost->coast
caperbility->capability
@@ -1207,6 +1208,7 @@
dicovers->discovers
dicovery->discovery
dicussed->discussed
+didnt'->didn't
didnt->didn't
diea->idea, die,
dieing->dying, dyeing,
@@ -1303,12 +1305,14 @@
doctines->doctrines
documenatry->documentary
doens->does
+doesnt'->doesn't
doesnt->doesn't
doign->doing
dominaton->domination
dominent->dominant
dominiant->dominant
donig->doing
+dosent'->doesn't
dosen't->doesn't
doub->doubt, daub,
doulbe->double
@@ -1821,6 +1825,7 @@
harrassing->harassing
harrassment->harassment
harrassments->harassments
+hasnt'->hasn't
hasnt->hasn't
haviest->heaviest
headquarer->headquarter
@@ -2147,6 +2152,7 @@
irreplacable->irreplaceable
irresistable->irresistible
irresistably->irresistibly
+isnt'->isn't
isnt->isn't
Israelies->Israelis
issueing->issuing
@@ -3459,6 +3465,7 @@
shortwhile->short while
shoudl->should
shoudln->should, shouldn't,
+shouldnt'->shouldn't
shouldnt->shouldn't
shreak->shriek
shrinked->shrunk
@@ -4116,6 +4123,7 @@
wardobe->wardrobe
warrent->warrant
warrriors->warriors
+wasnt'->wasn't
wasnt->wasn't
wass->was
watn->want
diff --git a/data/linux-kernel.exclude b/data/linux-kernel.exclude
new file mode 100644
index 0000000..d7dd699
--- /dev/null
+++ b/data/linux-kernel.exclude
@@ -0,0 +1,87 @@
+N: Tom Dyas
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+ * Tom Dyas
+ * Copyright (C) 1996 Thomas K. Dyas (tdyas@noc.rutgers.edu)
+MODULE_AUTHOR("Thomas K. Dyas (tdyas@noc.rutgers.edu) and Eddie C. Dost (ecd@skynet.be)");
+ * Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+ * Tom Dyas : Module support.
+ * Tom Dyas : Export net symbols.
+ * Copyright (C) 1996,1997 Thomas K. Dyas (tdyas@eden.rutgers.edu)
+MODULE_AUTHOR("Thomas K. Dyas and David S. Miller");
+
+
+
+ * Sun people can't spell worth damn. "compatability" indeed.
+ .asciz "compatability"
+ * Sun people can't spell worth damn. "compatability" indeed.
+ .asciz "compatability"
+
+
+ Jonathan Teh Soon Yew <j.teh@iname.com>
+ <j.teh@iname.com> and Alex van Kaam <darkside@chello.nl>.)
+ <j.teh@iname.com>) */
+
+
+inv24 - change timings parameters for 24bpp modes on Millenium and
+ Millenium II. Specify this if you see strange color shadows around
+ non-Millenium.
+ Millenium I or II, because of these devices have hardware
+* ThrustMaster Millenium 3D Inceptor
+DC390F (Sym53c875) accepted this as well as my Millenium. But the Am53C974
+ { 1, "ThrustMaster Millenium 3D Inceptor", 6, 2, { 4, 2 }, { 4, 6 }, tmdc_abs, tmdc_btn_joy },
+ * DoC 2000 (it's in the Millenium docs), but it seems to work. */
+ minfo->millenium = 1;
+ int millenium;
+ /* 0 except for 6MB Millenium */
+#define isMillenium(x) (x->millenium)
+#define isMillenium(x) (x->millennium)
+#define FB_AUX_TEXT_MGA_STEP16 3 /* MGA Millenium I: text, attr, 14 reserved bytes */
+#define FB_ACCEL_MATROX_MGA2064W 16 /* Matrox MGA2064W (Millenium) */
+#define FB_ACCEL_MATROX_MGA2164W 18 /* Matrox MGA2164W (Millenium II) */
+#define FB_ACCEL_MATROX_MGA2164W_AGP 19 /* Matrox MGA2164W (Millenium II) */
+
+
+ * Copyright (C) 2007 Marvell Internation Ltd.
+ * Copyright (C) 2007-2008 Marvell Internation Ltd.
+
+ dbug(1,dprintf("PTY/ECT/addCONF,relPLCI=%lx",relatedPLCIvalue));
+ /* send PTY/ECT req, cannot check all states because of US stuff */
+ dbug(1,dprintf("ECT OK"));
+ { /* first indication after ECT-Request on Consultation Call */
+#define INT_CT_REJ 70 /* ECT rejected internal command */
+ __u8 ip_ect; /* ECT codepoint of IPv4 header, pre-shifted */
+ __u8 ece:1, cwr:1; /* TCP ECT bits */
+/* set ECT codepoint from IP header.
+ pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
+ /* Funny extension: if ECT is not set on a segment,
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ /* ACK or retransmitted segment: clear ECT|CE */
+ * "The ECN-Capable Transport (ECT) bit would be set by the
+ * Now setting the ECT bit all the time, as it should not cause
+
+ * Copyright © 2003 Agere Systems Inc.
+
+S: 1326 De Val-Brillant
+ <slot #2, id = 0x02, characters = "xtension whic">
+
+ at91_set_A_periph(AT91_PIN_PC1, 0); /* [-SMOE-]{+SOME+} */
+
+ _REGISTER_CLOCK(NULL, "ect", ect_clk)
+infinit:
+ bra infinit
+the read on an 8-byte boundary (e.g., if you seeked an odd number of bytes
+#define ISNT 12
+ (Some conversion-factor data were contributed by Jonathan Teh Soon Yew
+ From HWMon.cpp (Copyright 1998-2000 Jonathan Teh Soon Yew):
+ (These conversions were contributed by Jonathan Teh Soon Yew
+/* linear fits from HWMon.cpp (Copyright 1998-2000 Jonathan Teh Soon Yew)
+/* __u16 pallete:1; */
+ __be32 pallete;
+ struct diu_addr pallete;
+ int virtualX, virtualY;
+ struct regid archType;
+ .ident = "Toshiba Satelite S1800-814",
+ at91_set_A_periph(AT91_PIN_PC1, 0); /* SMOE */