blob: 0eda6a2cb60891773621a8a99508bb21ecdd53cd [file] [log] [blame]
# Copyright (c) 2009-2010 Arista Networks, Inc. - James Lingard
# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE).
# Copyright 2012 Google Inc.
#
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Checker for string formatting operations.
"""
import sys
import tokenize
import string
try:
import numbers
except ImportError:
numbers = None
import astroid
from pylint.interfaces import ITokenChecker, IAstroidChecker, IRawChecker
from pylint.checkers import BaseChecker, BaseTokenChecker
from pylint.checkers import utils
from pylint.checkers.utils import check_messages
_PY3K = sys.version_info[:2] >= (3, 0)
_PY27 = sys.version_info[:2] == (2, 7)
MSGS = {
'E1300': ("Unsupported format character %r (%#02x) at index %d",
"bad-format-character",
"Used when a unsupported format character is used in a format\
string."),
'E1301': ("Format string ends in middle of conversion specifier",
"truncated-format-string",
"Used when a format string terminates before the end of a \
conversion specifier."),
'E1302': ("Mixing named and unnamed conversion specifiers in format string",
"mixed-format-string",
"Used when a format string contains both named (e.g. '%(foo)d') \
and unnamed (e.g. '%d') conversion specifiers. This is also \
used when a named conversion specifier contains * for the \
minimum field width and/or precision."),
'E1303': ("Expected mapping for format string, not %s",
"format-needs-mapping",
"Used when a format string that uses named conversion specifiers \
is used with an argument that is not a mapping."),
'W1300': ("Format string dictionary key should be a string, not %s",
"bad-format-string-key",
"Used when a format string that uses named conversion specifiers \
is used with a dictionary whose keys are not all strings."),
'W1301': ("Unused key %r in format string dictionary",
"unused-format-string-key",
"Used when a format string that uses named conversion specifiers \
is used with a dictionary that conWtains keys not required by the \
format string."),
'E1304': ("Missing key %r in format string dictionary",
"missing-format-string-key",
"Used when a format string that uses named conversion specifiers \
is used with a dictionary that doesn't contain all the keys \
required by the format string."),
'E1305': ("Too many arguments for format string",
"too-many-format-args",
"Used when a format string that uses unnamed conversion \
specifiers is given too many arguments."),
'E1306': ("Not enough arguments for format string",
"too-few-format-args",
"Used when a format string that uses unnamed conversion \
specifiers is given too few arguments"),
'W1302': ("Invalid format string",
"bad-format-string",
"Used when a PEP 3101 format string is invalid.",
{'minversion': (2, 7)}),
'W1303': ("Missing keyword argument %r for format string",
"missing-format-argument-key",
"Used when a PEP 3101 format string that uses named fields "
"doesn't receive one or more required keywords.",
{'minversion': (2, 7)}),
'W1304': ("Unused format argument %r",
"unused-format-string-argument",
"Used when a PEP 3101 format string that uses named "
"fields is used with an argument that "
"is not required by the format string.",
{'minversion': (2, 7)}),
'W1305': ("Format string contains both automatic field numbering "
"and manual field specification",
"format-combined-specification",
"Usen when a PEP 3101 format string contains both automatic "
"field numbering (e.g. '{}') and manual field "
"specification (e.g. '{0}').",
{'minversion': (2, 7)}),
'W1306': ("Missing format attribute %r in format specifier %r",
"missing-format-attribute",
"Used when a PEP 3101 format string uses an "
"attribute specifier ({0.length}), but the argument "
"passed for formatting doesn't have that attribute.",
{'minversion': (2, 7)}),
'W1307': ("Using invalid lookup key %r in format specifier %r",
"invalid-format-index",
"Used when a PEP 3101 format string uses a lookup specifier "
"({a[1]}), but the argument passed for formatting "
"doesn't contain or doesn't have that key as an attribute.",
{'minversion': (2, 7)})
}
OTHER_NODES = (astroid.Const, astroid.List, astroid.Backquote,
astroid.Lambda, astroid.Function,
astroid.ListComp, astroid.SetComp, astroid.GenExpr)
if _PY3K:
import _string
def split_format_field_names(format_string):
return _string.formatter_field_name_split(format_string)
else:
def _field_iterator_convertor(iterator):
for is_attr, key in iterator:
if isinstance(key, numbers.Number):
yield is_attr, int(key)
else:
yield is_attr, key
def split_format_field_names(format_string):
keyname, fielditerator = format_string._formatter_field_name_split()
# it will return longs, instead of ints, which will complicate
# the output
return keyname, _field_iterator_convertor(fielditerator)
def collect_string_fields(format_string):
""" Given a format string, return an iterator
of all the valid format fields. It handles nested fields
as well.
"""
formatter = string.Formatter()
parseiterator = formatter.parse(format_string)
try:
for result in parseiterator:
if all(item is None for item in result[1:]):
# not a replacement format
continue
name = result[1]
nested = result[2]
yield name
if nested:
for field in collect_string_fields(nested):
yield field
except ValueError:
# probably the format string is invalid
# should we check the argument of the ValueError?
raise utils.IncompleteFormatString(format_string)
def parse_format_method_string(format_string):
"""
Parses a PEP 3101 format string, returning a tuple of
(keys, num_args, manual_pos_arg),
where keys is the set of mapping keys in the format string, num_args
is the number of arguments required by the format string and
manual_pos_arg is the number of arguments passed with the position.
"""
keys = []
num_args = 0
manual_pos_arg = set()
for name in collect_string_fields(format_string):
if name and str(name).isdigit():
manual_pos_arg.add(str(name))
elif name:
keyname, fielditerator = split_format_field_names(name)
if isinstance(keyname, numbers.Number):
# In Python 2 it will return long which will lead
# to different output between 2 and 3
keyname = int(keyname)
keys.append((keyname, list(fielditerator)))
else:
num_args += 1
return keys, num_args, len(manual_pos_arg)
def get_args(callfunc):
""" Get the arguments from the given `CallFunc` node.
Return a tuple, where the first element is the
number of positional arguments and the second element
is the keyword arguments in a dict.
"""
positional = 0
named = {}
for arg in callfunc.args:
if isinstance(arg, astroid.Keyword):
named[arg.arg] = utils.safe_infer(arg.value)
else:
positional += 1
return positional, named
def get_access_path(key, parts):
""" Given a list of format specifiers, returns
the final access path (e.g. a.b.c[0][1]).
"""
path = []
for is_attribute, specifier in parts:
if is_attribute:
path.append(".{}".format(specifier))
else:
path.append("[{!r}]".format(specifier))
return str(key) + "".join(path)
class StringFormatChecker(BaseChecker):
"""Checks string formatting operations to ensure that the format string
is valid and the arguments match the format string.
"""
__implements__ = (IAstroidChecker,)
name = 'string'
msgs = MSGS
@check_messages(*(MSGS.keys()))
def visit_binop(self, node):
if node.op != '%':
return
left = node.left
args = node.right
if not (isinstance(left, astroid.Const)
and isinstance(left.value, basestring)):
return
format_string = left.value
try:
required_keys, required_num_args = \
utils.parse_format_string(format_string)
except utils.UnsupportedFormatCharacter, e:
c = format_string[e.index]
self.add_message('bad-format-character',
node=node, args=(c, ord(c), e.index))
return
except utils.IncompleteFormatString:
self.add_message('truncated-format-string', node=node)
return
if required_keys and required_num_args:
# The format string uses both named and unnamed format
# specifiers.
self.add_message('mixed-format-string', node=node)
elif required_keys:
# The format string uses only named format specifiers.
# Check that the RHS of the % operator is a mapping object
# that contains precisely the set of keys required by the
# format string.
if isinstance(args, astroid.Dict):
keys = set()
unknown_keys = False
for k, _ in args.items:
if isinstance(k, astroid.Const):
key = k.value
if isinstance(key, basestring):
keys.add(key)
else:
self.add_message('bad-format-string-key',
node=node, args=key)
else:
# One of the keys was something other than a
# constant. Since we can't tell what it is,
# supress checks for missing keys in the
# dictionary.
unknown_keys = True
if not unknown_keys:
for key in required_keys:
if key not in keys:
self.add_message('missing-format-string-key',
node=node, args=key)
for key in keys:
if key not in required_keys:
self.add_message('unused-format-string-key',
node=node, args=key)
elif isinstance(args, OTHER_NODES + (astroid.Tuple,)):
type_name = type(args).__name__
self.add_message('format-needs-mapping',
node=node, args=type_name)
# else:
# The RHS of the format specifier is a name or
# expression. It may be a mapping object, so
# there's nothing we can check.
else:
# The format string uses only unnamed format specifiers.
# Check that the number of arguments passed to the RHS of
# the % operator matches the number required by the format
# string.
if isinstance(args, astroid.Tuple):
num_args = len(args.elts)
elif isinstance(args, OTHER_NODES + (astroid.Dict, astroid.DictComp)):
num_args = 1
else:
# The RHS of the format specifier is a name or
# expression. It could be a tuple of unknown size, so
# there's nothing we can check.
num_args = None
if num_args is not None:
if num_args > required_num_args:
self.add_message('too-many-format-args', node=node)
elif num_args < required_num_args:
self.add_message('too-few-format-args', node=node)
class StringMethodsChecker(BaseChecker):
__implements__ = (IAstroidChecker,)
name = 'string'
msgs = {
'E1310': ("Suspicious argument in %s.%s call",
"bad-str-strip-call",
"The argument to a str.{l,r,}strip call contains a"
" duplicate character, "),
}
@check_messages(*(MSGS.keys()))
def visit_callfunc(self, node):
func = utils.safe_infer(node.func)
if (isinstance(func, astroid.BoundMethod)
and isinstance(func.bound, astroid.Instance)
and func.bound.name in ('str', 'unicode', 'bytes')):
if func.name in ('strip', 'lstrip', 'rstrip') and node.args:
arg = utils.safe_infer(node.args[0])
if not isinstance(arg, astroid.Const):
return
if len(arg.value) != len(set(arg.value)):
self.add_message('bad-str-strip-call', node=node,
args=(func.bound.name, func.name))
elif func.name == 'format':
if _PY27 or _PY3K:
self._check_new_format(node, func)
def _check_new_format(self, node, func):
""" Check the new string formatting. """
# TODO: skip (for now) format nodes which don't have
# an explicit string on the left side of the format operation.
# We do this because our inference engine can't properly handle
# redefinitions of the original string.
# For more details, see issue 287.
if not isinstance(node.func.expr, astroid.Const):
return
try:
strnode = func.bound.infer().next()
except astroid.InferenceError:
return
if not isinstance(strnode, astroid.Const):
return
if node.starargs or node.kwargs:
# TODO: Don't complicate the logic, skip these for now.
return
try:
positional, named = get_args(node)
except astroid.InferenceError:
return
try:
fields, num_args, manual_pos = parse_format_method_string(strnode.value)
except utils.IncompleteFormatString:
self.add_message('bad-format-string', node=node)
return
manual_fields = set(field[0] for field in fields
if isinstance(field[0], numbers.Number))
named_fields = set(field[0] for field in fields
if isinstance(field[0], basestring))
if num_args and manual_pos:
self.add_message('format-combined-specification',
node=node)
return
check_args = False
# Consider "{[0]} {[1]}" as num_args.
num_args += sum(1 for field in named_fields
if field == '')
if named_fields:
for field in named_fields:
if field not in named and field:
self.add_message('missing-format-argument-key',
node=node,
args=(field, ))
for field in named:
if field not in named_fields:
self.add_message('unused-format-string-argument',
node=node,
args=(field, ))
# num_args can be 0 if manual_pos is not.
num_args = num_args or manual_pos
if positional or num_args:
empty = any(True for field in named_fields
if field == '')
if named or empty:
# Verify the required number of positional arguments
# only if the .format got at least one keyword argument.
# This means that the format strings accepts both
# positional and named fields and we should warn
# when one of the them is missing or is extra.
check_args = True
else:
check_args = True
if check_args:
# num_args can be 0 if manual_pos is not.
num_args = num_args or manual_pos
if positional > num_args:
# We can have two possibilities:
# * "{0} {1}".format(a, b)
# * "{} {} {}".format(a, b, c, d)
# We can check the manual keys for the first one.
if len(manual_fields) != positional:
self.add_message('too-many-format-args', node=node)
elif positional < num_args:
self.add_message('too-few-format-args', node=node)
self._check_new_format_specifiers(node, fields, named)
def _check_new_format_specifiers(self, node, fields, named):
"""
Check attribute and index access in the format
string ("{0.a}" and "{0[a]}").
"""
for key, specifiers in fields:
# Obtain the argument. If it can't be obtained
# or infered, skip this check.
if key == '':
# {[0]} will have an unnamed argument, defaulting
# to 0. It will not be present in `named`, so use the value
# 0 for it.
key = 0
if isinstance(key, numbers.Number):
try:
argname = utils.get_argument_from_call(node, key)
except utils.NoSuchArgumentError:
continue
else:
if key not in named:
continue
argname = named[key]
if argname in (astroid.YES, None):
continue
try:
argument = argname.infer().next()
except astroid.InferenceError:
continue
if not specifiers or argument is astroid.YES:
# No need to check this key if it doesn't
# use attribute / item access
continue
if argument.parent and isinstance(argument.parent, astroid.Arguments):
# Check to see if our argument is kwarg or vararg,
# and skip the check for this argument if so, because when inferring,
# astroid will return empty objects (dicts and tuples) and
# that can lead to false positives.
if argname.name in (argument.parent.kwarg, argument.parent.vararg):
continue
previous = argument
parsed = []
for is_attribute, specifier in specifiers:
if previous is astroid.YES:
break
parsed.append((is_attribute, specifier))
if is_attribute:
try:
previous = previous.getattr(specifier)[0]
except astroid.NotFoundError:
if (hasattr(previous, 'has_dynamic_getattr') and
previous.has_dynamic_getattr()):
# Don't warn if the object has a custom __getattr__
break
path = get_access_path(key, parsed)
self.add_message('missing-format-attribute',
args=(specifier, path),
node=node)
break
else:
warn_error = False
if hasattr(previous, 'getitem'):
try:
previous = previous.getitem(specifier)
except (IndexError, TypeError):
warn_error = True
else:
try:
# Lookup __getitem__ in the current node,
# but skip further checks, because we can't
# retrieve the looked object
previous.getattr('__getitem__')
break
except astroid.NotFoundError:
warn_error = True
if warn_error:
path = get_access_path(key, parsed)
self.add_message('invalid-format-index',
args=(specifier, path),
node=node)
break
try:
previous = previous.infer().next()
except astroid.InferenceError:
# can't check further if we can't infer it
break
class StringConstantChecker(BaseTokenChecker):
"""Check string literals"""
__implements__ = (ITokenChecker, IRawChecker)
name = 'string_constant'
msgs = {
'W1401': ('Anomalous backslash in string: \'%s\'. '
'String constant might be missing an r prefix.',
'anomalous-backslash-in-string',
'Used when a backslash is in a literal string but not as an '
'escape.'),
'W1402': ('Anomalous Unicode escape in byte string: \'%s\'. '
'String constant might be missing an r or u prefix.',
'anomalous-unicode-escape-in-string',
'Used when an escape like \\u is encountered in a byte '
'string where it has no effect.'),
}
# Characters that have a special meaning after a backslash in either
# Unicode or byte strings.
ESCAPE_CHARACTERS = 'abfnrtvx\n\r\t\\\'\"01234567'
# TODO(mbp): Octal characters are quite an edge case today; people may
# prefer a separate warning where they occur. \0 should be allowed.
# Characters that have a special meaning after a backslash but only in
# Unicode strings.
UNICODE_ESCAPE_CHARACTERS = 'uUN'
def process_module(self, module):
self._unicode_literals = 'unicode_literals' in module.future_imports
def process_tokens(self, tokens):
for (tok_type, token, (start_row, start_col), _, _) in tokens:
if tok_type == tokenize.STRING:
# 'token' is the whole un-parsed token; we can look at the start
# of it to see whether it's a raw or unicode string etc.
self.process_string_token(token, start_row, start_col)
def process_string_token(self, token, start_row, start_col):
for i, c in enumerate(token):
if c in '\'\"':
quote_char = c
break
prefix = token[:i].lower() # markers like u, b, r.
after_prefix = token[i:]
if after_prefix[:3] == after_prefix[-3:] == 3 * quote_char:
string_body = after_prefix[3:-3]
else:
string_body = after_prefix[1:-1] # Chop off quotes
# No special checks on raw strings at the moment.
if 'r' not in prefix:
self.process_non_raw_string_token(prefix, string_body,
start_row, start_col)
def process_non_raw_string_token(self, prefix, string_body, start_row,
start_col):
"""check for bad escapes in a non-raw string.
prefix: lowercase string of eg 'ur' string prefix markers.
string_body: the un-parsed body of the string, not including the quote
marks.
start_row: integer line number in the source.
start_col: integer column number in the source.
"""
# Walk through the string; if we see a backslash then escape the next
# character, and skip over it. If we see a non-escaped character,
# alert, and continue.
#
# Accept a backslash when it escapes a backslash, or a quote, or
# end-of-line, or one of the letters that introduce a special escape
# sequence <http://docs.python.org/reference/lexical_analysis.html>
#
# TODO(mbp): Maybe give a separate warning about the rarely-used
# \a \b \v \f?
#
# TODO(mbp): We could give the column of the problem character, but
# add_message doesn't seem to have a way to pass it through at present.
i = 0
while True:
i = string_body.find('\\', i)
if i == -1:
break
# There must be a next character; having a backslash at the end
# of the string would be a SyntaxError.
next_char = string_body[i+1]
match = string_body[i:i+2]
if next_char in self.UNICODE_ESCAPE_CHARACTERS:
if 'u' in prefix:
pass
elif (_PY3K or self._unicode_literals) and 'b' not in prefix:
pass # unicode by default
else:
self.add_message('anomalous-unicode-escape-in-string',
line=start_row, args=(match, ))
elif next_char not in self.ESCAPE_CHARACTERS:
self.add_message('anomalous-backslash-in-string',
line=start_row, args=(match, ))
# Whether it was a valid escape or not, backslash followed by
# another character can always be consumed whole: the second
# character can never be the start of a new backslash escape.
i += 2
def register(linter):
"""required method to auto register this checker """
linter.register_checker(StringFormatChecker(linter))
linter.register_checker(StringMethodsChecker(linter))
linter.register_checker(StringConstantChecker(linter))