# anyduplicate.rb #1

#!/usr/bin/ruby

#################################################################
#
# Copyright (c) 2008,2010 Perforce Software, Inc.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL PERFORCE SOFTWARE, INC. BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#  
# = Description
#
#   Parse a checkpoint and identify duplicate files based on their md5 
#   checksum. Lazy copies and rcs files are ignored.
#
# = Usage
#
#   anyduplicate.rb <checkpoint_name>
#
#################################################################

# Read-only pairing of a file label with its size.
#
# In this script the label is built as "depotFile#rev" and the size is the
# integer parsed from the checkpoint record, but the class itself stores
# whatever it is given without validation or conversion.
class FileSize
  # filename:: label identifying the file revision
  # size::     size associated with that revision
  def initialize(filename, size)
    @filename, @size = filename, size
  end

  # Accessors only; instances expose no writers.
  attr_reader :filename
  attr_reader :size
end

# Command-line validation: the script needs the path of an existing
# checkpoint file as its first argument.
#
# Both failure paths keep the original exit status of 0 so that existing
# wrappers relying on it are not affected.
if ARGV[0].nil?
  puts("Usage: anyduplicate.rb <checkpoint_name>")
  exit(0)
end
# File.exists? was deprecated and later removed (Ruby 3.2); File.exist? is
# the supported spelling and behaves the same on older interpreters.
unless File.exist?(ARGV[0])
  printf("%s does not exist!\n", ARGV[0])
  exit(0)
end

# Scan the checkpoint line by line and group every selected db.rev record by
# its digest.
#
# Result: md5Hash maps digest => Array of FileSize, where each FileSize holds
# "depotFile#rev" and the record's size. A progress indicator is written to
# stdout in 10% steps while reading.
md5Hash = Hash.new
# Captures, in order: depotFile, rev, digest, size, lbrIsLazy, lbrType.
re_rev = Regexp.new('@pv@ \d+ @db.rev@ @(.*)@ (\d+) \d+ \d+ \d+ \d+ \d+ (.*) (\d+) \d+ (\d+) @.*@ @.*@ (\d+)')
ckpSize = File.size(ARGV[0])
readBytes = 0.0
mod = 10   # next progress milestone (percent) still to be printed

printf("Processing checkpoint: 0%%")
$stdout.flush
while (line = $<.gets)
  # File.size reports bytes, so progress must count bytes, not characters.
  readBytes += line.bytesize
  progress = (readBytes / ckpSize) * 100.0
  # Print every milestone that has been reached, even when a single line
  # crosses several of them at once; never go beyond 100%.
  while mod <= 100 && progress >= mod
    printf("...%d%%", mod)
    $stdout.flush
    mod += 10
  end

  match = re_rev.match(line)
  next unless match

  depotFile = match[1]
  rev = match[2]
  digest = match[3]
  size = match[4].to_i
  lbrIsLazy = match[5]
  lbrType = match[6].to_i

  # Only the low byte of lbrType is inspected; values 1 and 3 are the ones
  # this script treats as candidates (presumably the non-RCS storage kinds,
  # per the file header - confirm against the Perforce schema).
  storage = lbrType & 255
  # The lazy-copy test must apply to BOTH storage kinds. The previous
  # "a && b || c" form let lazy copies with storage 3 slip through.
  next unless lbrIsLazy == "0" && (storage == 1 || storage == 3)

  file = FileSize.new(depotFile + "#" + rev, size)
  (md5Hash[digest] ||= []).push(file)
end

# Report phase: for every digest shared by more than one file, list all the
# files and count every file after the first as a duplicate. A grand total is
# printed at the end.
printf("\nAnalysing results...\n")
grandBytes = 0
grandFiles = 0
md5Hash.each_value do |group|
  # A digest seen only once has no duplicates and adds nothing to the totals.
  next unless group.length > 1

  group.each { |entry| printf("%s ", entry.filename) }

  # The first file is treated as the original copy; the rest are duplicates.
  extras = group.drop(1)
  dupBytes = extras.inject(0) { |sum, entry| sum + entry.size }
  dupFiles = extras.length
  printf(" = %d bytes duplicated in %d file(s)\n", dupBytes, dupFiles)

  grandBytes += dupBytes
  grandFiles += dupFiles
end
printf("%d bytes duplicated in %d file(s)\n", grandBytes, grandFiles)
#	Change	User	Description	Committed
#1	7674	Pascal Soccard	Script to identify identical depot files stored on the server