| # pylint: disable=W0622 |
| # Copyright (c) 2004-2006 LOGILAB S.A. (Paris, FRANCE). |
| # http://www.logilab.fr/ -- mailto:contact@logilab.fr |
| # |
| # This program is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free Software |
| # Foundation; either version 2 of the License, or (at your option) any later |
| # version. |
| # |
| # This program is distributed in the hope that it will be useful, but WITHOUT |
| # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details |
| # |
| # You should have received a copy of the GNU General Public License along with |
| # this program; if not, write to the Free Software Foundation, Inc., |
| # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| """a similarities / code duplication command line tool and pylint checker |
| """ |
| from __future__ import generators |
| |
| import sys |
| from itertools import izip |
| |
| from logilab.common.ureports import Table |
| |
| from pylint.interfaces import IRawChecker |
| from pylint.checkers import BaseChecker, table_lines_from_stats |
| |
| |
| class Similar: |
| """finds copy-pasted lines of code in a project""" |
| |
| def __init__(self, min_lines=4, ignore_comments=False, |
| ignore_docstrings=False): |
| self.min_lines = min_lines |
| self.ignore_comments = ignore_comments |
| self.ignore_docstrings = ignore_docstrings |
| self.linesets = [] |
| |
| def append_stream(self, streamid, stream): |
| """append a file to search for similarities""" |
| stream.seek(0) # XXX may be removed with astng > 0.23 |
| self.linesets.append(LineSet(streamid, |
| stream.readlines(), |
| self.ignore_comments, |
| self.ignore_docstrings)) |
| |
| def run(self): |
| """start looking for similarities and display results on stdout""" |
| self._display_sims(self._compute_sims()) |
| |
| def _compute_sims(self): |
| """compute similarities in appended files""" |
| no_duplicates = {} |
| for num, lineset1, idx1, lineset2, idx2 in self._iter_sims(): |
| duplicate = no_duplicates.setdefault(num, []) |
| for couples in duplicate: |
| if (lineset1, idx1) in couples or (lineset2, idx2) in couples: |
| couples.add( (lineset1, idx1) ) |
| couples.add( (lineset2, idx2) ) |
| break |
| else: |
| duplicate.append( set([(lineset1, idx1), (lineset2, idx2)]) ) |
| sims = [] |
| for num, ensembles in no_duplicates.iteritems(): |
| for couples in ensembles: |
| sims.append( (num, couples) ) |
| sims.sort() |
| sims.reverse() |
| return sims |
| |
| def _display_sims(self, sims): |
| """display computed similarities on stdout""" |
| nb_lignes_dupliquees = 0 |
| for num, couples in sims: |
| print |
| print num, "similar lines in", len(couples), "files" |
| couples = sorted(couples) |
| for lineset, idx in couples: |
| print "==%s:%s" % (lineset.name, idx) |
| # pylint: disable=W0631 |
| for line in lineset._real_lines[idx:idx+num]: |
| print " ", line, |
| nb_lignes_dupliquees += num * (len(couples)-1) |
| nb_total_lignes = sum([len(lineset) for lineset in self.linesets]) |
| print "TOTAL lines=%s duplicates=%s percent=%.2f" \ |
| % (nb_total_lignes, nb_lignes_dupliquees, |
| nb_lignes_dupliquees*100. / nb_total_lignes) |
| |
| def _find_common(self, lineset1, lineset2): |
| """find similarities in the two given linesets""" |
| lines1 = lineset1.enumerate_stripped |
| lines2 = lineset2.enumerate_stripped |
| find = lineset2.find |
| index1 = 0 |
| min_lines = self.min_lines |
| while index1 < len(lineset1): |
| skip = 1 |
| num = 0 |
| for index2 in find( lineset1[index1] ): |
| non_blank = 0 |
| for num, ((_, line1), (_, line2)) in enumerate( |
| izip(lines1(index1), lines2(index2))): |
| if line1 != line2: |
| if non_blank > min_lines: |
| yield num, lineset1, index1, lineset2, index2 |
| skip = max(skip, num) |
| break |
| if line1: |
| non_blank += 1 |
| else: |
| # we may have reach the end |
| num += 1 |
| if non_blank > min_lines: |
| yield num, lineset1, index1, lineset2, index2 |
| skip = max(skip, num) |
| index1 += skip |
| |
| def _iter_sims(self): |
| """iterate on similarities among all files, by making a cartesian |
| product |
| """ |
| for idx, lineset in enumerate(self.linesets[:-1]): |
| for lineset2 in self.linesets[idx+1:]: |
| for sim in self._find_common(lineset, lineset2): |
| yield sim |
| |
| def stripped_lines(lines, ignore_comments, ignore_docstrings): |
| strippedlines = [] |
| docstring = None |
| for line in lines: |
| line = line.strip() |
| if ignore_docstrings: |
| if not docstring and \ |
| (line.startswith('"""') or line.startswith("'''")): |
| docstring = line[:3] |
| line = line[3:] |
| if docstring: |
| if line.endswith(docstring): |
| docstring = None |
| line = '' |
| if ignore_comments: |
| # XXX should use regex in checkers/format to avoid cutting |
| # at a "#" in a string |
| line = line.split('#', 1)[0].strip() |
| strippedlines.append(line) |
| return strippedlines |
| |
| class LineSet: |
| """Holds and indexes all the lines of a single source file""" |
| def __init__(self, name, lines, ignore_comments=False, |
| ignore_docstrings=False): |
| self.name = name |
| self._real_lines = lines |
| self._stripped_lines = stripped_lines(lines, ignore_comments, |
| ignore_docstrings) |
| self._index = self._mk_index() |
| |
| def __str__(self): |
| return '<Lineset for %s>' % self.name |
| |
| def __len__(self): |
| return len(self._real_lines) |
| |
| def __getitem__(self, index): |
| return self._stripped_lines[index] |
| |
| def __lt__(self, other): |
| return self.name < other.name |
| |
| def __hash__(self): |
| return id(self) |
| |
| def enumerate_stripped(self, start_at=0): |
| """return an iterator on stripped lines, starting from a given index |
| if specified, else 0 |
| """ |
| idx = start_at |
| if start_at: |
| lines = self._stripped_lines[start_at:] |
| else: |
| lines = self._stripped_lines |
| for line in lines: |
| #if line: |
| yield idx, line |
| idx += 1 |
| |
| def find(self, stripped_line): |
| """return positions of the given stripped line in this set""" |
| return self._index.get(stripped_line, ()) |
| |
| def _mk_index(self): |
| """create the index for this set""" |
| index = {} |
| for line_no, line in enumerate(self._stripped_lines): |
| if line: |
| index.setdefault(line, []).append( line_no ) |
| return index |
| |
| |
| MSGS = {'R0801': ('Similar lines in %s files\n%s', |
| 'Indicates that a set of similar lines has been detected \ |
| among multiple file. This usually means that the code should \ |
| be refactored to avoid this duplication.')} |
| |
| def report_similarities(sect, stats, old_stats): |
| """make a layout with some stats about duplication""" |
| lines = ['', 'now', 'previous', 'difference'] |
| lines += table_lines_from_stats(stats, old_stats, |
| ('nb_duplicated_lines', |
| 'percent_duplicated_lines')) |
| sect.append(Table(children=lines, cols=4, rheaders=1, cheaders=1)) |
| |
| |
| # wrapper to get a pylint checker from the similar class |
| class SimilarChecker(BaseChecker, Similar): |
| """checks for similarities and duplicated code. This computation may be |
| memory / CPU intensive, so you should disable it if you experiment some |
| problems. |
| """ |
| |
| __implements__ = (IRawChecker,) |
| # configuration section name |
| name = 'similarities' |
| # messages |
| msgs = MSGS |
| # configuration options |
| # for available dict keys/values see the optik parser 'add_option' method |
| options = (('min-similarity-lines', |
| {'default' : 4, 'type' : "int", 'metavar' : '<int>', |
| 'help' : 'Minimum lines number of a similarity.'}), |
| ('ignore-comments', |
| {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', |
| 'help': 'Ignore comments when computing similarities.'} |
| ), |
| ('ignore-docstrings', |
| {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', |
| 'help': 'Ignore docstrings when computing similarities.'} |
| ), |
| ) |
| # reports |
| reports = ( ('R0801', 'Duplication', report_similarities), ) # XXX actually a Refactoring message |
| |
| def __init__(self, linter=None): |
| BaseChecker.__init__(self, linter) |
| Similar.__init__(self, min_lines=4, |
| ignore_comments=True, ignore_docstrings=True) |
| self.stats = None |
| |
| def set_option(self, optname, value, action=None, optdict=None): |
| """method called to set an option (registered in the options list) |
| |
| overridden to report options setting to Similar |
| """ |
| BaseChecker.set_option(self, optname, value, action, optdict) |
| if optname == 'min-similarity-lines': |
| self.min_lines = self.config.min_similarity_lines |
| elif optname == 'ignore-comments': |
| self.ignore_comments = self.config.ignore_comments |
| elif optname == 'ignore-docstrings': |
| self.ignore_docstrings = self.config.ignore_docstrings |
| |
| def open(self): |
| """init the checkers: reset linesets and statistics information""" |
| self.linesets = [] |
| self.stats = self.linter.add_stats(nb_duplicated_lines=0, |
| percent_duplicated_lines=0) |
| |
| def process_module(self, node): |
| """process a module |
| |
| the module's content is accessible via the stream object |
| |
| stream must implement the readlines method |
| """ |
| self.append_stream(self.linter.current_name, node.file_stream) |
| |
| def close(self): |
| """compute and display similarities on closing (i.e. end of parsing)""" |
| total = sum([len(lineset) for lineset in self.linesets]) |
| duplicated = 0 |
| stats = self.stats |
| for num, couples in self._compute_sims(): |
| msg = [] |
| for lineset, idx in couples: |
| msg.append("==%s:%s" % (lineset.name, idx)) |
| msg.sort() |
| # pylint: disable=W0631 |
| for line in lineset._real_lines[idx:idx+num]: |
| msg.append(line.rstrip()) |
| self.add_message('R0801', args=(len(couples), '\n'.join(msg))) |
| duplicated += num * (len(couples) - 1) |
| stats['nb_duplicated_lines'] = duplicated |
| stats['percent_duplicated_lines'] = total and duplicated * 100. / total |
| |
| |
| def register(linter): |
| """required method to auto register this checker """ |
| linter.register_checker(SimilarChecker(linter)) |
| |
| def usage(status=0): |
| """display command line usage information""" |
| print "finds copy pasted blocks in a set of files" |
| print |
| print 'Usage: symilar [-d|--duplicates min_duplicated_lines] \ |
| [-i|--ignore-comments] file1...' |
| sys.exit(status) |
| |
| def run(argv=None): |
| """standalone command line access point""" |
| if argv is None: |
| argv = sys.argv[1:] |
| from getopt import getopt |
| s_opts = 'hdi' |
| l_opts = ('help', 'duplicates=', 'ignore-comments') |
| min_lines = 4 |
| ignore_comments = False |
| opts, args = getopt(argv, s_opts, l_opts) |
| for opt, val in opts: |
| if opt in ('-d', '--duplicates'): |
| min_lines = int(val) |
| elif opt in ('-h', '--help'): |
| usage() |
| elif opt in ('-i', '--ignore-comments'): |
| ignore_comments = True |
| if not args: |
| usage(1) |
| sim = Similar(min_lines, ignore_comments) |
| for filename in args: |
| sim.append_stream(filename, open(filename)) |
| sim.run() |
| |
| if __name__ == '__main__': |
| run() |