blob: 794009bf56034e6a2539c9bcaef32a8f7d1f0725 [file] [log] [blame] [edit]
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see
# https://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
"""
Copyright (C) 2010-2011 Lucas De Marchi <lucas.de.marchi@gmail.com>
Copyright (C) 2011 ProFUSION embedded systems
"""
import argparse
import configparser
import ctypes
import fnmatch
import itertools
import os
import re
import sys
import textwrap
from typing import (
Any,
Dict,
Iterable,
List,
Match,
Optional,
Pattern,
Sequence,
Set,
TextIO,
Tuple,
)
if sys.platform == "win32":
from ctypes import wintypes
ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
STD_OUTPUT_HANDLE = wintypes.HANDLE(-11)
from ._spellchecker import Misspelling, build_dict
from ._text_util import fix_case
# autogenerated by setuptools_scm
from ._version import ( # type: ignore[import-not-found]
__version__ as VERSION, # noqa: N812
)
word_regex_def = r"[\w\-'’]+" # noqa: RUF001
# While we want to treat characters like ( or " as okay for a starting break,
# these may occur unescaped in URIs, and so we are more restrictive on the
# endpoint. Emails are more restrictive, so the endpoint remains flexible.
uri_regex_def = (
"(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|\\b[\\w.%+-]+@[\\w.-]+\\b)"
)
inline_ignore_regex = re.compile(r"[^\w\s]\s?codespell:ignore\b(\s+(?P<words>[\w,]*))?")
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
"""
supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
supported_languages = supported_languages_en
# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
_builtin_dictionaries = (
# name, desc, name, err in aspell, correction in aspell, \
# err dictionary array, rep dictionary array
# The arrays must contain the names of aspell dictionaries
# The aspell tests here aren't the ideal state, but the None's are
# realistic for obscure words
("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
(
"rare",
"for rare (but valid) words that are likely to be errors",
"_rare",
None,
None,
None,
None,
),
(
"informal",
"for making informal words more formal",
"_informal",
True,
True,
supported_languages_en,
supported_languages_en,
),
(
"usage",
"for replacing phrasing with recommended terms",
"_usage",
None,
None,
None,
None,
),
(
"code",
"for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)", # noqa: E501
"_code",
None,
None,
None,
None,
),
(
"names",
"for valid proper names that might be typos",
"_names",
None,
None,
None,
None,
),
(
"en-GB_to_en-US",
"for corrections from en-GB to en-US",
"_en-GB_to_en-US",
True,
True,
("en_GB",),
("en_US",),
),
)
_builtin_default = "clear,rare"
# docs say os.EX_USAGE et al. are only available on Unix systems, so to be safe
# we protect and just use the values they are on macOS and Linux
EX_OK = 0
EX_USAGE = 64
EX_DATAERR = 65
EX_CONFIG = 78
# OPTIONS:
#
# ARGUMENTS:
# dict_filename The file containing the dictionary of misspellings.
# If set to '-', it will be read from stdin
# file1 .. fileN Files to check spelling
class QuietLevels:
NONE = 0
ENCODING = 1
BINARY_FILE = 2
DISABLED_FIXES = 4
NON_AUTOMATIC_FIXES = 8
FIXES = 16
CONFIG_FILES = 32
class GlobMatch:
def __init__(self, pattern: List[str]) -> None:
self.pattern_list: List[str] = pattern
def match(self, filename: str) -> bool:
return any(fnmatch.fnmatch(filename, p) for p in self.pattern_list)
class TermColors:
def __init__(self) -> None:
self.FILE = "\033[33m"
self.WWORD = "\033[31m"
self.FWORD = "\033[32m"
self.DISABLE = "\033[0m"
def disable(self) -> None:
self.FILE = ""
self.WWORD = ""
self.FWORD = ""
self.DISABLE = ""
class Summary:
def __init__(self) -> None:
self.summary: Dict[str, int] = {}
def update(self, wrongword: str) -> None:
if wrongword in self.summary:
self.summary[wrongword] += 1
else:
self.summary[wrongword] = 1
def __str__(self) -> str:
keys = list(self.summary.keys())
keys.sort()
return "\n".join(
[f"{key}{self.summary.get(key):{15 - len(key)}}" for key in keys]
)
class FileOpener:
def __init__(
self,
use_chardet: bool,
quiet_level: int,
ignore_multiline_regex: Optional[Pattern[str]],
) -> None:
self.use_chardet = use_chardet
if use_chardet:
self.init_chardet()
self.quiet_level = quiet_level
self.ignore_multiline_regex = ignore_multiline_regex
def init_chardet(self) -> None:
try:
from chardet.universaldetector import UniversalDetector
except ImportError as e:
msg = (
"There's no chardet installed to import from. "
"Please, install it and check your PYTHONPATH "
"environment variable"
)
raise ImportError(msg) from e
self.encdetector = UniversalDetector()
def open(self, filename: str) -> Tuple[List[str], str]:
if self.use_chardet:
return self.open_with_chardet(filename)
return self.open_with_internal(filename)
def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
self.encdetector.reset()
with open(filename, "rb") as fb:
for line in fb:
self.encdetector.feed(line)
if self.encdetector.done:
break
self.encdetector.close()
encoding = self.encdetector.result["encoding"]
try:
f = open(filename, encoding=encoding, newline="")
except UnicodeDecodeError:
print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr)
raise
except LookupError:
print(
f"ERROR: Don't know how to handle encoding {encoding}: {filename}",
file=sys.stderr,
)
raise
else:
lines = self.get_lines(f)
f.close()
return lines, f.encoding
def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
encoding = None
first_try = True
for encoding in ("utf-8", "iso-8859-1"):
if first_try:
first_try = False
elif not self.quiet_level & QuietLevels.ENCODING:
print(f'WARNING: Trying next encoding "{encoding}"', file=sys.stderr)
with open(filename, encoding=encoding, newline="") as f:
try:
lines = self.get_lines(f)
except UnicodeDecodeError:
if not self.quiet_level & QuietLevels.ENCODING:
print(
f'WARNING: Cannot decode file using encoding "{encoding}": '
f"{filename}",
file=sys.stderr,
)
else:
break
else:
# reading with encoding "iso-8859-1" cannot fail with UnicodeDecodeError
msg = "Unknown encoding"
raise RuntimeError(msg) # pragma: no cover
return lines, encoding
def get_lines(self, f: TextIO) -> List[str]:
if self.ignore_multiline_regex:
text = f.read()
pos = 0
text2 = ""
for m in re.finditer(self.ignore_multiline_regex, text):
text2 += text[pos : m.start()]
# Replace with blank lines so line numbers are unchanged.
text2 += "\n" * m.group().count("\n")
pos = m.end()
text2 += text[pos:]
lines = text2.split("\n")
else:
lines = f.readlines()
return lines
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
# If someday this breaks, we can just switch to using RawTextHelpFormatter,
# but it has the disadvantage of not wrapping our long lines.
class NewlineHelpFormatter(argparse.HelpFormatter):
"""Help formatter that preserves newlines and deals with lists."""
def _split_lines(self, text: str, width: int) -> List[str]:
parts = text.split("\n")
out = []
for part in parts:
# Eventually we could allow others...
indent_start = "- "
offset = len(indent_start) if part.startswith(indent_start) else 0
part = part[offset:]
part = self._whitespace_matcher.sub(" ", part).strip()
parts = textwrap.wrap(part, width - offset)
parts = [" " * offset + p for p in parts]
if offset:
parts[0] = indent_start + parts[0][offset:]
out.extend(parts)
return out
def _toml_to_parseconfig(toml_dict: Dict[str, Any]) -> Dict[str, Any]:
"""Convert a dict read from a TOML file to the parseconfig.read_dict() format."""
return {
k: "" if v is True else ",".join(v) if isinstance(v, list) else v
for k, v in toml_dict.items()
if v is not False
}
def _supports_ansi_colors() -> bool:
if sys.platform == "win32":
# Windows Terminal enables ANSI escape codes by default. In other cases
# it is disabled.
# See https://ss64.com/nt/syntax-ansi.html for more information.
kernel32 = ctypes.WinDLL("kernel32")
# fmt: off
kernel32.GetConsoleMode.argtypes = (
wintypes.HANDLE, # _In_ hConsoleHandle
wintypes.LPDWORD, # _Out_ lpMode
)
# fmt: on
kernel32.GetConsoleMode.restype = wintypes.BOOL
mode = wintypes.DWORD()
handle = kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
if not kernel32.GetConsoleMode(handle, ctypes.byref(mode)):
# TODO: print a warning with the error message on stderr?
return False
return (mode.value & ENABLE_VIRTUAL_TERMINAL_PROCESSING) != 0
elif sys.platform == "wasi":
# WASI disables ANSI escape codes for security reasons.
# See https://github.com/WebAssembly/WASI/issues/162.
return False
elif sys.stdout.isatty():
return True
return False
def parse_options(
args: Sequence[str],
) -> Tuple[argparse.Namespace, argparse.ArgumentParser, List[str]]:
parser = argparse.ArgumentParser(formatter_class=NewlineHelpFormatter)
parser.set_defaults(colors=_supports_ansi_colors())
parser.add_argument("--version", action="version", version=VERSION)
parser.add_argument(
"-d",
"--disable-colors",
action="store_false",
dest="colors",
help="disable colors, even when printing to terminal",
)
parser.add_argument(
"-c",
"--enable-colors",
action="store_true",
dest="colors",
help="enable colors, even when not printing to terminal",
)
parser.add_argument(
"-w",
"--write-changes",
action="store_true",
default=False,
help="write changes in place if possible",
)
parser.add_argument(
"-D",
"--dictionary",
action="append",
help="comma-separated list of custom dictionary files that "
"contain spelling corrections. If this flag is not specified "
'or equals "-" then the default dictionary is used.',
)
builtin_opts = "\n- ".join(
[""] + [f"{d[0]!r} {d[1]}" for d in _builtin_dictionaries]
)
parser.add_argument(
"--builtin",
dest="builtin",
default=_builtin_default,
metavar="BUILTIN-LIST",
help="comma-separated list of builtin dictionaries "
'to include (when "-D -" or no "-D" is passed). '
"Current options are:" + builtin_opts + "\n"
"The default is %(default)r.",
)
parser.add_argument(
"--ignore-regex",
action="store",
type=str,
help="regular expression that is used to find "
"patterns to ignore by treating as whitespace. "
"When writing regular expressions, consider "
"ensuring there are boundary non-word chars, "
'e.g., "\\bmatch\\b". Defaults to '
"empty/disabled.",
)
parser.add_argument(
"--ignore-multiline-regex",
action="store",
type=str,
help="regular expression that is used to ignore "
"text that may span multi-line regions. "
"The regex is run with re.DOTALL. For example to "
"allow skipping of regions of Python code using "
"begin/end comments one could use: "
"--ignore-multiline-regex "
"'# codespell:ignore-begin *\\n.*# codespell:ignore-end *\\n'. "
"Defaults to empty/disabled.",
)
parser.add_argument(
"-I",
"--ignore-words",
action="append",
metavar="FILES",
help="comma-separated list of files that contain "
"words to be ignored by codespell. Files must contain "
"1 word per line. Words are case sensitive based on "
"how they are written in the dictionary file.",
)
parser.add_argument(
"-L",
"--ignore-words-list",
action="append",
metavar="WORDS",
help="comma-separated list of words to be ignored "
"by codespell. Words are case sensitive based on "
"how they are written in the dictionary file.",
)
parser.add_argument(
"--uri-ignore-words-list",
action="append",
metavar="WORDS",
help="comma-separated list of words to be ignored "
"by codespell in URIs and emails only. Words are "
"case sensitive based on how they are written in "
'the dictionary file. If set to "*", all '
"misspelling in URIs and emails will be ignored.",
)
parser.add_argument(
"-r",
"--regex",
action="store",
type=str,
help="regular expression that is used to find words. "
"By default any alphanumeric character, the "
"underscore, the hyphen, and the apostrophe are "
"used to build words. This option cannot be "
"specified together with --write-changes.",
)
parser.add_argument(
"--uri-regex",
action="store",
type=str,
help="regular expression that is used to find URIs "
"and emails. A default expression is provided.",
)
parser.add_argument(
"-s",
"--summary",
action="store_true",
default=False,
help="print summary of fixes",
)
parser.add_argument(
"--count",
action="store_true",
default=False,
help="print the number of errors as the last line of stderr",
)
parser.add_argument(
"-S",
"--skip",
action="append",
help="comma-separated list of files to skip. It "
"accepts globs as well. E.g.: if you want "
"codespell to skip .eps and .txt files, "
'you\'d give "*.eps,*.txt" to this option.',
)
parser.add_argument(
"-x",
"--exclude-file",
action="append",
type=str,
metavar="FILES",
help="ignore whole lines that match those in "
"the comma-separated list of files EXCLUDE. "
"The lines in these files should match the "
"to-be-excluded lines exactly",
)
parser.add_argument(
"-i",
"--interactive",
action="store",
type=int,
default=0,
choices=range(0, 4),
help="set interactive mode when writing changes:\n"
"- 0: no interactivity.\n"
"- 1: ask for confirmation.\n"
"- 2: ask user to choose one fix when more than one is available.\n"
"- 3: both 1 and 2",
metavar="MODE",
)
parser.add_argument(
"-q",
"--quiet-level",
action="store",
type=int,
default=34,
choices=range(0, 64),
help="bitmask that allows suppressing messages:\n"
"- 0: print all messages.\n"
"- 1: disable warnings about wrong encoding.\n"
"- 2: disable warnings about binary files.\n"
"- 4: omit warnings about automatic fixes that were disabled in the dictionary.\n" # noqa: E501
"- 8: don't print anything for non-automatic fixes.\n"
"- 16: don't print the list of fixed files.\n"
"- 32: don't print configuration files.\n"
"As usual with bitmasks, these levels can be "
"combined; e.g. use 3 for levels 1+2, 7 for "
"1+2+4, 23 for 1+2+4+16, etc. "
"The default mask is %(default)s.",
metavar="LEVEL",
)
parser.add_argument(
"-e",
"--hard-encoding-detection",
action="store_true",
default=False,
help="use chardet to detect the encoding of each "
"file. This can slow down codespell, but is more "
"reliable in detecting encodings other than "
"utf-8, iso8859-1, and ascii.",
)
parser.add_argument(
"-f",
"--check-filenames",
action="store_true",
default=False,
help="check file names as well",
)
parser.add_argument(
"-H",
"--check-hidden",
action="store_true",
default=False,
help='check hidden files and directories (those starting with ".") as well.',
)
parser.add_argument(
"-A",
"--after-context",
type=int,
metavar="LINES",
help="print LINES of trailing context",
)
parser.add_argument(
"-B",
"--before-context",
type=int,
metavar="LINES",
help="print LINES of leading context",
)
parser.add_argument(
"-C",
"--context",
type=int,
metavar="LINES",
help="print LINES of surrounding context",
)
parser.add_argument(
"--stdin-single-line",
action="store_true",
help="output just a single line for each misspelling in stdin mode",
)
parser.add_argument("--config", type=str, help="path to config file.")
parser.add_argument("--toml", type=str, help="path to a pyproject.toml file.")
parser.add_argument("files", nargs="*", help="files or directories to check")
# Parse command line options.
options = parser.parse_args(list(args))
# Load config files and look for ``codespell`` options.
cfg_files = ["setup.cfg", ".codespellrc"]
if options.config:
cfg_files.append(options.config)
config = configparser.ConfigParser(interpolation=None)
# Read toml before other config files.
toml_files = []
tomllib_raise_error = False
if os.path.isfile("pyproject.toml"):
toml_files.append("pyproject.toml")
if options.toml:
toml_files.append(options.toml)
tomllib_raise_error = True
if toml_files:
if sys.version_info >= (3, 11):
import tomllib
else:
try:
import tomli as tomllib # type: ignore[no-redef]
except ImportError as e:
if tomllib_raise_error:
msg = (
f"tomllib or tomli are required to read pyproject.toml "
f"but could not be imported, got: {e}"
)
raise ImportError(msg) from None
tomllib = None # type: ignore[assignment]
if tomllib is not None:
for toml_file in toml_files:
with open(toml_file, "rb") as f:
data = tomllib.load(f).get("tool", {})
if "codespell" in data:
data["codespell"] = _toml_to_parseconfig(data["codespell"])
config.read_dict(data)
# Collect which config files are going to be used
used_cfg_files = []
for cfg_file in cfg_files:
_cfg = configparser.ConfigParser()
_cfg.read(cfg_file)
if _cfg.has_section("codespell"):
used_cfg_files.append(cfg_file)
# Use config files
config.read(used_cfg_files)
if config.has_section("codespell"):
# Build a "fake" argv list using option name and value.
cfg_args = []
for key in config["codespell"]:
# Add option as arg.
cfg_args.append(f"--{key}")
# If value is blank, skip.
val = config["codespell"][key]
if val:
cfg_args.append(val)
# Parse config file options.
options = parser.parse_args(cfg_args)
# Re-parse command line options to override config.
options = parser.parse_args(list(args), namespace=options)
if not options.files:
options.files.append(".")
return options, parser, used_cfg_files
def process_ignore_words(
words: Iterable[str], ignore_words: Set[str], ignore_words_cased: Set[str]
) -> None:
for word in words:
word = word.strip()
if word == word.lower():
ignore_words.add(word)
else:
ignore_words_cased.add(word)
def parse_ignore_words_option(
ignore_words_option: List[str],
) -> Tuple[Set[str], Set[str]]:
ignore_words: Set[str] = set()
ignore_words_cased: Set[str] = set()
if ignore_words_option:
for comma_separated_words in ignore_words_option:
process_ignore_words(
(word.strip() for word in comma_separated_words.split(",")),
ignore_words,
ignore_words_cased,
)
return (ignore_words, ignore_words_cased)
def build_exclude_hashes(filename: str, exclude_lines: Set[str]) -> None:
with open(filename, encoding="utf-8") as f:
exclude_lines.update(line.rstrip() for line in f)
def build_ignore_words(
filename: str, ignore_words: Set[str], ignore_words_cased: Set[str]
) -> None:
with open(filename, encoding="utf-8") as f:
process_ignore_words(
(line.strip() for line in f), ignore_words, ignore_words_cased
)
def is_hidden(filename: str, check_hidden: bool) -> bool:
bfilename = os.path.basename(filename)
return bfilename not in ("", ".", "..") and (
not check_hidden and bfilename[0] == "."
)
def is_text_file(filename: str) -> bool:
with open(filename, mode="rb") as f:
s = f.read(1024)
return b"\x00" not in s
def ask_for_word_fix(
line: str,
match: Match[str],
misspelling: Misspelling,
interactivity: int,
colors: TermColors,
) -> Tuple[bool, str]:
wrongword = match.group()
if interactivity <= 0:
return misspelling.fix, fix_case(wrongword, misspelling.data)
line_ui = (
f"{line[: match.start()]}"
f"{colors.WWORD}{wrongword}{colors.DISABLE}"
f"{line[match.end() :]}"
)
if misspelling.fix and interactivity & 1:
r = ""
fixword = fix_case(wrongword, misspelling.data)
while not r:
print(f"{line_ui}\t{wrongword} ==> {fixword} (Y/n) ", end="", flush=True)
r = sys.stdin.readline().strip().upper()
if not r:
r = "Y"
if r not in ("Y", "N"):
print("Say 'y' or 'n'")
r = ""
if r == "N":
misspelling.fix = False
elif (interactivity & 2) and not misspelling.reason:
# if it is not disabled, i.e. it just has more than one possible fix,
# we ask the user which word to use
r = ""
opt = [w.strip() for w in misspelling.data.split(",")]
while not r:
print(f"{line_ui} Choose an option (blank for none): ", end="")
for i, o in enumerate(opt):
fixword = fix_case(wrongword, o)
print(f" {i}) {fixword}", end="")
print(": ", end="", flush=True)
n = sys.stdin.readline().strip()
if not n:
break
try:
i = int(n)
r = opt[i]
except (ValueError, IndexError):
print("Not a valid option\n")
if r:
misspelling.fix = True
misspelling.data = r
return misspelling.fix, fix_case(wrongword, misspelling.data)
def print_context(
lines: List[str],
index: int,
context: Tuple[int, int],
) -> None:
# context = (context_before, context_after)
for i in range(index - context[0], index + context[1] + 1):
if 0 <= i < len(lines):
print(f"{'>' if i == index else ':'} {lines[i].rstrip()}")
def _ignore_word_sub(
text: str,
ignore_word_regex: Optional[Pattern[str]],
) -> str:
if ignore_word_regex:
text = ignore_word_regex.sub(" ", text)
return text
def extract_words(
text: str,
word_regex: Pattern[str],
ignore_word_regex: Optional[Pattern[str]],
) -> List[str]:
return word_regex.findall(_ignore_word_sub(text, ignore_word_regex))
def extract_words_iter(
text: str,
word_regex: Pattern[str],
ignore_word_regex: Optional[Pattern[str]],
) -> List[Match[str]]:
return list(word_regex.finditer(_ignore_word_sub(text, ignore_word_regex)))
def apply_uri_ignore_words(
check_matches: List[Match[str]],
line: str,
word_regex: Pattern[str],
ignore_word_regex: Optional[Pattern[str]],
uri_regex: Pattern[str],
uri_ignore_words: Set[str],
) -> List[Match[str]]:
if not uri_ignore_words:
return check_matches
for uri in uri_regex.findall(line):
for uri_word in extract_words(uri, word_regex, ignore_word_regex):
if uri_word in uri_ignore_words:
# determine/remove only the first among matches
for i, match in enumerate(check_matches):
if match.group() == uri_word:
check_matches = check_matches[:i] + check_matches[i + 1 :]
break
return check_matches
def parse_file(
filename: str,
colors: TermColors,
summary: Optional[Summary],
misspellings: Dict[str, Misspelling],
ignore_words_cased: Set[str],
exclude_lines: Set[str],
file_opener: FileOpener,
word_regex: Pattern[str],
ignore_word_regex: Optional[Pattern[str]],
uri_regex: Pattern[str],
uri_ignore_words: Set[str],
context: Optional[Tuple[int, int]],
options: argparse.Namespace,
) -> int:
bad_count = 0
lines = None
changed = False
if filename == "-":
f = sys.stdin
encoding = "utf-8"
lines = f.readlines()
else:
if options.check_filenames:
for word in extract_words(filename, word_regex, ignore_word_regex):
if word in ignore_words_cased:
continue
lword = word.lower()
if lword not in misspellings:
continue
fix = misspellings[lword].fix
fixword = fix_case(word, misspellings[lword].data)
if summary and fix:
summary.update(lword)
cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
reason = misspellings[lword].reason
if reason:
if options.quiet_level & QuietLevels.DISABLED_FIXES:
continue
creason = f" | {colors.FILE}{reason}{colors.DISABLE}"
else:
if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
continue
creason = ""
bad_count += 1
print(f"{cfilename}: {cwrongword} ==> {crightword}{creason}")
# ignore irregular files
if not os.path.isfile(filename):
return bad_count
try:
text = is_text_file(filename)
except PermissionError as e:
print(f"WARNING: {e.strerror}: {filename}", file=sys.stderr)
return bad_count
except OSError:
return bad_count
if not text:
if not options.quiet_level & QuietLevels.BINARY_FILE:
print(f"WARNING: Binary file: {filename}", file=sys.stderr)
return bad_count
try:
lines, encoding = file_opener.open(filename)
except OSError:
return bad_count
for i, line in enumerate(lines):
if line.rstrip() in exclude_lines:
continue
extra_words_to_ignore = set()
match = inline_ignore_regex.search(line)
if match:
extra_words_to_ignore = set(
filter(None, (match.group("words") or "").split(","))
)
if not extra_words_to_ignore:
continue
fixed_words = set()
asked_for = set()
# If all URI spelling errors will be ignored, erase any URI before
# extracting words. Otherwise, apply ignores after extracting words.
# This ensures that if a URI ignore word occurs both inside a URI and
# outside, it will still be a spelling error.
if "*" in uri_ignore_words:
line = uri_regex.sub(" ", line)
check_matches = extract_words_iter(line, word_regex, ignore_word_regex)
if "*" not in uri_ignore_words:
check_matches = apply_uri_ignore_words(
check_matches,
line,
word_regex,
ignore_word_regex,
uri_regex,
uri_ignore_words,
)
for match in check_matches:
word = match.group()
if word in ignore_words_cased:
continue
lword = word.lower()
if lword in misspellings and lword not in extra_words_to_ignore:
# Sometimes we find a 'misspelling' which is actually a valid word
# preceded by a string escape sequence. Ignore such cases as
# they're usually false alarms; see issue #17 among others.
char_before_idx = match.start() - 1
if (
char_before_idx >= 0
and line[char_before_idx] == "\\"
# bell, backspace, formfeed, newline, carriage-return, tab, vtab.
and word.startswith(("a", "b", "f", "n", "r", "t", "v"))
and lword[1:] not in misspellings
):
continue
context_shown = False
fix = misspellings[lword].fix
fixword = fix_case(word, misspellings[lword].data)
if options.interactive and lword not in asked_for:
if context is not None:
context_shown = True
print_context(lines, i, context)
fix, fixword = ask_for_word_fix(
lines[i],
match,
misspellings[lword],
options.interactive,
colors=colors,
)
asked_for.add(lword)
if summary and fix:
summary.update(lword)
if word in fixed_words: # can skip because of re.sub below
continue
if options.write_changes and fix:
changed = True
lines[i] = re.sub(rf"\b{word}\b", fixword, lines[i])
fixed_words.add(word)
continue
# otherwise warning was explicitly set by interactive mode
if (
options.interactive & 2
and not fix
and not misspellings[lword].reason
):
continue
cfilename = f"{colors.FILE}{filename}{colors.DISABLE}"
cline = f"{colors.FILE}{i + 1}{colors.DISABLE}"
cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}"
crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}"
reason = misspellings[lword].reason
if reason:
if options.quiet_level & QuietLevels.DISABLED_FIXES:
continue
creason = f" | {colors.FILE}{reason}{colors.DISABLE}"
else:
if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES:
continue
creason = ""
# If we get to this point (uncorrected error) we should change
# our bad_count and thus return value
bad_count += 1
if (not context_shown) and (context is not None):
print_context(lines, i, context)
if filename != "-":
print(
f"{cfilename}:{cline}: {cwrongword} ==> {crightword}{creason}"
)
elif options.stdin_single_line:
print(f"{cline}: {cwrongword} ==> {crightword}{creason}")
else:
print(
f"{cline}: {line.strip()}\n\t{cwrongword} "
f"==> {crightword}{creason}"
)
if changed:
if filename == "-":
print("---")
for line in lines:
print(line, end="")
else:
if not options.quiet_level & QuietLevels.FIXES:
print(
f"{colors.FWORD}FIXED:{colors.DISABLE} {filename}",
file=sys.stderr,
)
with open(filename, "w", encoding=encoding, newline="") as f:
f.writelines(lines)
return bad_count
def flatten_clean_comma_separated_arguments(
arguments: Iterable[str],
) -> List[str]:
"""
>>> flatten_clean_comma_separated_arguments(["a, b ,\n c, d,", "e"])
['a', 'b', 'c', 'd', 'e']
>>> flatten_clean_comma_separated_arguments([])
[]
"""
return [
item.strip() for argument in arguments for item in argument.split(",") if item
]
def _script_main() -> int:
"""Wrap to main() for setuptools."""
try:
return main(*sys.argv[1:])
except KeyboardInterrupt:
# User has typed CTRL+C
sys.stdout.write("\n")
return 130
def _usage_error(parser: argparse.ArgumentParser, message: str) -> int:
parser.print_usage()
print(message, file=sys.stderr)
return EX_USAGE
def main(*args: str) -> int:
"""Contains flow control"""
try:
options, parser, used_cfg_files = parse_options(args)
except configparser.Error as e:
print(
f"ERROR: ill-formed config file: {e.message}",
file=sys.stderr,
)
return EX_CONFIG
# Report used config files
if not options.quiet_level & QuietLevels.CONFIG_FILES:
if len(used_cfg_files) > 0:
print("Used config files:")
for ifile, cfg_file in enumerate(used_cfg_files, start=1):
print(f" {ifile}: {cfg_file}")
if options.interactive > 0:
options.write_changes = True
if options.regex and options.write_changes:
return _usage_error(
parser,
"ERROR: --write-changes cannot be used together with --regex",
)
word_regex = options.regex or word_regex_def
try:
word_regex = re.compile(word_regex)
except re.error as e:
return _usage_error(
parser,
f'ERROR: invalid --regex "{word_regex}" ({e})',
)
if options.ignore_regex:
try:
ignore_word_regex = re.compile(options.ignore_regex)
except re.error as e:
return _usage_error(
parser,
f'ERROR: invalid --ignore-regex "{options.ignore_regex}" ({e})',
)
else:
ignore_word_regex = None
if options.ignore_multiline_regex:
try:
ignore_multiline_regex = re.compile(
options.ignore_multiline_regex, re.DOTALL
)
except re.error as e:
return _usage_error(
parser,
f"ERROR: invalid --ignore-multiline-regex "
f'"{options.ignore_multiline_regex}" ({e})',
)
else:
ignore_multiline_regex = None
ignore_words, ignore_words_cased = parse_ignore_words_option(
options.ignore_words_list
)
if options.ignore_words:
ignore_words_files = flatten_clean_comma_separated_arguments(
options.ignore_words
)
for ignore_words_file in ignore_words_files:
if not os.path.isfile(ignore_words_file):
return _usage_error(
parser,
f"ERROR: cannot find ignore-words file: {ignore_words_file}",
)
build_ignore_words(ignore_words_file, ignore_words, ignore_words_cased)
uri_regex = options.uri_regex or uri_regex_def
try:
uri_regex = re.compile(uri_regex)
except re.error as e:
return _usage_error(
parser,
f'ERROR: invalid --uri-regex "{uri_regex}" ({e})',
)
uri_ignore_words = set(
itertools.chain(*parse_ignore_words_option(options.uri_ignore_words_list))
)
dictionaries = flatten_clean_comma_separated_arguments(options.dictionary or ["-"])
use_dictionaries = []
for dictionary in dictionaries:
if dictionary == "-":
# figure out which builtin dictionaries to use
use = sorted(set(options.builtin.split(",")))
for u in use:
for builtin in _builtin_dictionaries:
if builtin[0] == u:
use_dictionaries.append(
os.path.join(_data_root, f"dictionary{builtin[2]}.txt")
)
break
else:
return _usage_error(
parser,
f"ERROR: Unknown builtin dictionary: {u}",
)
else:
if not os.path.isfile(dictionary):
return _usage_error(
parser,
f"ERROR: cannot find dictionary file: {dictionary}",
)
use_dictionaries.append(dictionary)
misspellings: Dict[str, Misspelling] = {}
for dictionary in use_dictionaries:
build_dict(dictionary, misspellings, ignore_words)
colors = TermColors()
if not options.colors:
colors.disable()
summary = Summary() if options.summary else None
context = None
if options.context is not None:
if (options.before_context is not None) or (options.after_context is not None):
return _usage_error(
parser,
"ERROR: --context/-C cannot be used together with "
"--context-before/-B or --context-after/-A",
)
context_both = max(0, options.context)
context = (context_both, context_both)
elif (options.before_context is not None) or (options.after_context is not None):
context_before = 0
context_after = 0
if options.before_context is not None:
context_before = max(0, options.before_context)
if options.after_context is not None:
context_after = max(0, options.after_context)
context = (context_before, context_after)
exclude_lines: Set[str] = set()
if options.exclude_file:
exclude_files = flatten_clean_comma_separated_arguments(options.exclude_file)
for exclude_file in exclude_files:
build_exclude_hashes(exclude_file, exclude_lines)
file_opener = FileOpener(
options.hard_encoding_detection,
options.quiet_level,
ignore_multiline_regex,
)
glob_match = GlobMatch(
flatten_clean_comma_separated_arguments(options.skip) if options.skip else []
)
try:
glob_match.match("/random/path") # does not need a real path
except re.error:
return _usage_error(
parser,
"ERROR: --skip/-S has been fed an invalid glob, "
"try escaping special characters",
)
bad_count = 0
for filename in sorted(options.files):
# ignore hidden files
if is_hidden(filename, options.check_hidden):
continue
if os.path.isdir(filename):
for root, dirs, files in os.walk(filename):
if glob_match.match(root): # skip (absolute) directories
dirs.clear()
continue
if is_hidden(root, options.check_hidden): # dir itself hidden
continue
for file_ in sorted(files):
# ignore hidden files in directories
if is_hidden(file_, options.check_hidden):
continue
if glob_match.match(file_): # skip files
continue
fname = os.path.join(root, file_)
if glob_match.match(fname): # skip paths
continue
bad_count += parse_file(
fname,
colors,
summary,
misspellings,
ignore_words_cased,
exclude_lines,
file_opener,
word_regex,
ignore_word_regex,
uri_regex,
uri_ignore_words,
context,
options,
)
# skip (relative) directories
dirs[:] = [
dir_
for dir_ in dirs
if not glob_match.match(dir_)
and not is_hidden(dir_, options.check_hidden)
]
elif not glob_match.match(filename): # skip files
bad_count += parse_file(
filename,
colors,
summary,
misspellings,
ignore_words_cased,
exclude_lines,
file_opener,
word_regex,
ignore_word_regex,
uri_regex,
uri_ignore_words,
context,
options,
)
if summary:
print("\n-------8<-------\nSUMMARY:")
print(summary)
if options.count:
print(bad_count, file=sys.stderr)
return EX_DATAERR if bad_count else EX_OK