#!/usr/bin/ruby
#################################################################
#
# Copyright (c) 2008,2010 Perforce Software, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL PERFORCE SOFTWARE, INC. BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# = Description
#
# Parse a checkpoint and identify duplicate files based on their md5
# checksum. Lazy copies and rcs files are ignored.
#
# = Usage
#
# anyduplicate.rb <checkpoint_name>
#
#################################################################
# Read-only pairing of a file label with its size, as handed to the
# constructor. The script stores one instance per collected revision.
class FileSize
  attr_reader :filename, :size

  # filename:: label identifying the file (stored unchanged)
  # size::     size associated with the file (stored unchanged)
  def initialize(filename, size)
    @filename, @size = filename, size
  end
end
# Validate the command line: a checkpoint path is required and must exist.
# In both failure cases a message is printed to stdout and the script exits
# with status 0.
if ARGV[0].nil?
  puts("Usage: anyduplicate.rb <checkpoint_name>")
  exit(0)
end
# File.exist? is used because the File.exists? alias is no longer available
# on current Ruby versions.
unless File.exist?(ARGV[0])
  printf("%s does not exist!\n", ARGV[0])
  exit(0)
end
# Scan the checkpoint named by ARGV[0] line by line and collect, per digest,
# one FileSize entry for every matching record that passes the filter below.
# A coarse progress indicator (10% steps) is written to stdout meanwhile.

# digest => Array of FileSize entries sharing that digest.
md5Hash = Hash.new
# Captures, in order: depot file, revision, digest, size, lazy flag and
# librarian type.
re_rev = Regexp.new('@pv@ \d+ @db.rev@ @(.*)@ (\d+) \d+ \d+ \d+ \d+ \d+ (.*) (\d+) \d+ (\d+) @.*@ @.*@ (\d+)')
ckpSize = File.size(ARGV[0])
readBytes = 0.0
progress = 0.0
mod = 10
# Read raw bytes: a line holding bytes that are invalid in the default
# encoding would otherwise make the regexp match raise, and byte-for-byte
# reading keeps the progress count in step with the on-disk size.
$<.binmode
line = $<.gets
printf("Processing checkpoint: 0%%")
$stdout.flush
while line
  # ckpSize is a byte count, so progress must be counted in bytes too.
  readBytes += line.bytesize
  progress = (readBytes / ckpSize) * 100.0
  # Print every 10% mark reached so far. Looping (instead of testing a
  # single mark) keeps the indicator alive when one line advances the
  # progress by more than one step, e.g. on small checkpoints.
  reported = false
  while mod <= 100 && progress >= mod
    printf("...%d%%", mod)
    mod += 10
    reported = true
  end
  $stdout.flush if reported
  if (match = re_rev.match(line))
    depotFile = match[1]
    rev = match[2]
    digest = match[3]
    size = match[4].to_i
    lbrIsLazy = match[5]
    lbrType = match[6].to_i
    # Only the low byte of the librarian type is compared.
    # NOTE(review): 1 and 3 are presumably the non-RCS storage formats
    # (the file header says rcs files are ignored) - confirm against the
    # Perforce schema.
    storage = lbrType & 255
    # The lazy check must apply to both accepted storage values; without
    # the parentheses `&&` binds tighter than `||` and lazy copies with
    # storage value 3 would be collected as well.
    if lbrIsLazy == "0" && (storage == 1 || storage == 3)
      file = FileSize.new(depotFile + "#" + rev, size)
      (md5Hash[digest] ||= []).push(file)
    end
  end
  line = $<.gets
end
# Report phase: for every digest collected more than once, list all of its
# files on one line followed by the bytes and file count that are redundant,
# then print the grand totals.
printf("\nAnalysing results...\n")
totalSize = 0
totalFile = 0
md5Hash.each_value do |entries|
  # A digest with a single entry has no duplicate and adds nothing.
  next unless entries.length > 1

  entries.each { |entry| printf("%s ", entry.filename) }
  # The first entry is treated as the one to keep; only the remaining
  # entries count as duplicated data.
  extras = entries.drop(1)
  dupBytes = extras.inject(0) { |sum, entry| sum + entry.size }
  printf(" = %d bytes duplicated in %d file(s)\n", dupBytes, extras.length)
  totalSize += dupBytes
  totalFile += extras.length
end
printf("%d bytes duplicated in %d file(s)\n", totalSize, totalFile)