#!/usr/bin/perl -w

# Generate a profile of a set of files whose names are read from stdin.
# This can be used with profilecmp to determine how much duplication exists
# between two sets of files.

use strict;
use Digest::MD5;

# Given a stream of names from stdin, look for duplicates

# Maps "<md5 hexdigest>:<size in bytes>" to an array ref holding the names of
# every file that produced that digest and size.
my %known;

# Read one file name per input line, hash the file's contents, and group the
# names by digest and size.  Files that cannot be opened are skipped silently;
# files that cannot be read are reported on STDERR and skipped.
while (my $name = <>) {
	chomp $name;

	# Three-argument open with an explicit mode: the name comes from
	# external input, and the two-argument form would interpret names such
	# as ">file", "cmd |" or "-" (and strip surrounding whitespace) instead
	# of opening the file literally.
	open(my $fh, '<', $name) or next;

	# Digest the raw bytes, without any newline/encoding translation.
	binmode($fh);

	my $size = (stat($fh))[7];

	# addfile() croaks on a read error (e.g. when the name refers to a
	# directory) rather than returning false, so trap the exception to
	# keep one unreadable entry from aborting the whole run.
	my $ctx = Digest::MD5->new;
	if (!eval { $ctx->addfile($fh); 1 }) {
		warn "Failed to read $name: $@";
		close $fh;
		next;
	}
	close $fh;

	my $key = $ctx->hexdigest . ":$size";

	# push autovivifies the array ref the first time a key is seen.
	push @{$known{$key}}, $name;
}

# Emit one profile line per distinct digest/size group, in the form
#   "<first 6 hex digits of the digest> <rounded size> <number of files>"
# Keys are visited in sorted order so the output is reproducible from run to
# run; the order returned by a bare keys() is not stable between runs.
foreach my $key (sort keys %known) {
	my $count = scalar @{$known{$key}};

	# Keys have the form "<hexdigest>:<size>".
	my ($hash, $size) = split /:/, $key;

	# Keep only a short prefix of the digest.
	$hash = substr($hash, 0, 6);

	# Round the size up to the next multiple of 4096 bytes
	# (presumably to approximate block-granular disk usage -- confirm
	# against what profilecmp expects).
	$size = int(($size + 4095) / 4096) * 4096;

	print "$hash $size $count\n";
}
