# pylint: disable=W0622
# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE).
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""a similarities / code duplication command line tool and pylint checker
"""
from __future__ import print_function
import sys
from collections import defaultdict
from logilab.common.ureports import Table
from pylint.interfaces import IRawChecker
from pylint.checkers import BaseChecker, table_lines_from_stats
import six
from six.moves import zip
class Similar(object):
"""finds copy-pasted lines of code in a project"""
def __init__(self, min_lines=4, ignore_comments=False,
ignore_docstrings=False, ignore_imports=False):
self.min_lines = min_lines
self.ignore_comments = ignore_comments
self.ignore_docstrings = ignore_docstrings
self.ignore_imports = ignore_imports
self.linesets = []
def append_stream(self, streamid, stream, encoding=None):
"""append a file to search for similarities"""
if encoding is None:
readlines = stream.readlines
else:
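            # decode each byte line with the declared encoding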
readlines = lambda: [line.decode(encoding) for line in stream]
try:
self.linesets.append(LineSet(streamid,
readlines(),
self.ignore_comments,
self.ignore_docstrings,
self.ignore_imports))
except UnicodeDecodeError:
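            # the stream could not be decoded with the given encoding:
            # skip the file rather than aborting the whole run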
pass
def run(self):
"""start looking for similarities and display results on stdout"""
self._display_sims(self._compute_sims())
def _compute_sims(self):
"""compute similarities in appended files"""
no_duplicates = defaultdict(list)
for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
duplicate = no_duplicates[num]
for couples in duplicate:
if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
couples.add((lineset1, idx1))
couples.add((lineset2, idx2))
break
else:
duplicate.append(set([(lineset1, idx1), (lineset2, idx2)]))
sims = []
for num, ensembles in six.iteritems(no_duplicates):
for couples in ensembles:
sims.append((num, couples))
sims.sort()
sims.reverse()
return sims
def _display_sims(self, sims):
"""display computed similarities on stdout"""
nb_lignes_dupliquees = 0
for num, couples in sims:
print()
print(num, "similar lines in", len(couples), "files")
couples = sorted(couples)
for lineset, idx in couples:
print("==%s:%s" % (lineset.name, idx))
# pylint: disable=W0631
for line in lineset._real_lines[idx:idx+num]:
print(" ", line.rstrip())
nb_lignes_dupliquees += num * (len(couples)-1)
nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
print("TOTAL lines=%s duplicates=%s percent=%.2f" \
% (nb_total_lignes, nb_lignes_dupliquees,
nb_lignes_dupliquees*100. / nb_total_lignes))
def _find_common(self, lineset1, lineset2):
"""find similarities in the two given linesets"""
lines1 = lineset1.enumerate_stripped
lines2 = lineset2.enumerate_stripped
find = lineset2.find
index1 = 0
min_lines = self.min_lines
while index1 < len(lineset1):
skip = 1
num = 0
for index2 in find(lineset1[index1]):
non_blank = 0
for num, ((_, line1), (_, line2)) in enumerate(
zip(lines1(index1), lines2(index2))):
if line1 != line2:
if non_blank > min_lines:
yield num, lineset1, index1, lineset2, index2
skip = max(skip, num)
break
if line1:
non_blank += 1
else:
                    # we may have reached the end
num += 1
if non_blank > min_lines:
yield num, lineset1, index1, lineset2, index2
skip = max(skip, num)
index1 += skip
def _iter_sims(self):
"""iterate on similarities among all files, by making a cartesian
product
"""
for idx, lineset in enumerate(self.linesets[:-1]):
for lineset2 in self.linesets[idx+1:]:
for sim in self._find_common(lineset, lineset2):
yield sim
def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
"""return lines with leading/trailing whitespace and any ignored code
features removed
"""
strippedlines = []
docstring = None
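    # while inside a docstring, `docstring` holds its opening delimiter
    # ('"""' or "'''") so we know which delimiter terminates it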
for line in lines:
line = line.strip()
if ignore_docstrings:
if not docstring and \
(line.startswith('"""') or line.startswith("'''")):
docstring = line[:3]
line = line[3:]
if docstring:
if line.endswith(docstring):
docstring = None
line = ''
if ignore_imports:
if line.startswith("import ") or line.startswith("from "):
line = ''
if ignore_comments:
# XXX should use regex in checkers/format to avoid cutting
# at a "#" in a string
line = line.split('#', 1)[0].strip()
strippedlines.append(line)
return strippedlines
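# hypothetical illustration: with ignore_comments and ignore_imports enabled,
# stripped_lines(['import os\n', 'x = 1  # set x\n'], True, False, True)
# returns ['', 'x = 1']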
class LineSet(object):
"""Holds and indexes all the lines of a single source file"""
def __init__(self, name, lines, ignore_comments=False,
ignore_docstrings=False, ignore_imports=False):
self.name = name
self._real_lines = lines
self._stripped_lines = stripped_lines(lines, ignore_comments,
ignore_docstrings,
ignore_imports)
self._index = self._mk_index()
def __str__(self):
return '<Lineset for %s>' % self.name
def __len__(self):
return len(self._real_lines)
def __getitem__(self, index):
return self._stripped_lines[index]
def __lt__(self, other):
return self.name < other.name
def __hash__(self):
return id(self)
def enumerate_stripped(self, start_at=0):
"""return an iterator on stripped lines, starting from a given index
if specified, else 0
"""
idx = start_at
if start_at:
lines = self._stripped_lines[start_at:]
else:
lines = self._stripped_lines
for line in lines:
yield idx, line
idx += 1
def find(self, stripped_line):
"""return positions of the given stripped line in this set"""
return self._index.get(stripped_line, ())
def _mk_index(self):
"""create the index for this set"""
index = defaultdict(list)
for line_no, line in enumerate(self._stripped_lines):
if line:
index[line].append(line_no)
return index
MSGS = {'R0801': ('Similar lines in %s files\n%s',
'duplicate-code',
                  'Indicates that a set of similar lines has been detected \
among multiple files. This usually means that the code should \
be refactored to avoid this duplication.')}
def report_similarities(sect, stats, old_stats):
"""make a layout with some stats about duplication"""
lines = ['', 'now', 'previous', 'difference']
lines += table_lines_from_stats(stats, old_stats,
('nb_duplicated_lines',
'percent_duplicated_lines'))
sect.append(Table(children=lines, cols=4, rheaders=1, cheaders=1))
# wrapper to get a pylint checker from the similar class
class SimilarChecker(BaseChecker, Similar):
"""checks for similarities and duplicated code. This computation may be
memory / CPU intensive, so you should disable it if you experiment some
problems.
"""
__implements__ = (IRawChecker,)
# configuration section name
name = 'similarities'
# messages
msgs = MSGS
# configuration options
# for available dict keys/values see the optik parser 'add_option' method
options = (('min-similarity-lines',
{'default' : 4, 'type' : "int", 'metavar' : '<int>',
'help' : 'Minimum lines number of a similarity.'}),
('ignore-comments',
{'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
'help': 'Ignore comments when computing similarities.'}
),
('ignore-docstrings',
{'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
'help': 'Ignore docstrings when computing similarities.'}
),
('ignore-imports',
{'default' : False, 'type' : 'yn', 'metavar' : '<y or n>',
'help': 'Ignore imports when computing similarities.'}
),
)
# reports
reports = (('RP0801', 'Duplication', report_similarities),)
def __init__(self, linter=None):
BaseChecker.__init__(self, linter)
Similar.__init__(self, min_lines=4,
ignore_comments=True, ignore_docstrings=True)
self.stats = None
def set_option(self, optname, value, action=None, optdict=None):
"""method called to set an option (registered in the options list)
        overridden to report option settings to Similar
"""
BaseChecker.set_option(self, optname, value, action, optdict)
if optname == 'min-similarity-lines':
self.min_lines = self.config.min_similarity_lines
elif optname == 'ignore-comments':
self.ignore_comments = self.config.ignore_comments
elif optname == 'ignore-docstrings':
self.ignore_docstrings = self.config.ignore_docstrings
elif optname == 'ignore-imports':
self.ignore_imports = self.config.ignore_imports
def open(self):
"""init the checkers: reset linesets and statistics information"""
self.linesets = []
self.stats = self.linter.add_stats(nb_duplicated_lines=0,
percent_duplicated_lines=0)
def process_module(self, node):
"""process a module
the module's content is accessible via the stream object
stream must implement the readlines method
"""
with node.stream() as stream:
self.append_stream(self.linter.current_name,
stream,
node.file_encoding)
def close(self):
"""compute and display similarities on closing (i.e. end of parsing)"""
total = sum([len(lineset) for lineset in self.linesets])
duplicated = 0
stats = self.stats
for num, couples in self._compute_sims():
msg = []
for lineset, idx in couples:
msg.append("==%s:%s" % (lineset.name, idx))
msg.sort()
# pylint: disable=W0631
for line in lineset._real_lines[idx:idx+num]:
msg.append(line.rstrip())
self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
duplicated += num * (len(couples) - 1)
stats['nb_duplicated_lines'] = duplicated
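        # the `total and ...` short-circuit avoids a ZeroDivisionError
        # when no line was processed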
stats['percent_duplicated_lines'] = total and duplicated * 100. / total
def register(linter):
"""required method to auto register this checker """
linter.register_checker(SimilarChecker(linter))
def usage(status=0):
"""display command line usage information"""
print("finds copy pasted blocks in a set of files")
print()
print('Usage: symilar [-d|--duplicates min_duplicated_lines] \
[-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...')
sys.exit(status)
def Run(argv=None):
"""standalone command line access point"""
if argv is None:
argv = sys.argv[1:]
from getopt import getopt
s_opts = 'hdi'
l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports',
'ignore-docstrings')
min_lines = 4
ignore_comments = False
ignore_docstrings = False
ignore_imports = False
opts, args = getopt(argv, s_opts, l_opts)
for opt, val in opts:
if opt in ('-d', '--duplicates'):
min_lines = int(val)
elif opt in ('-h', '--help'):
usage()
elif opt in ('-i', '--ignore-comments'):
ignore_comments = True
elif opt in ('--ignore-docstrings',):
ignore_docstrings = True
elif opt in ('--ignore-imports',):
ignore_imports = True
if not args:
usage(1)
sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports)
for filename in args:
with open(filename) as stream:
sim.append_stream(filename, stream)
sim.run()
sys.exit(0)
if __name__ == '__main__':
Run()