#! /usr/bin/perl -w

# Check submission format for KDD Cup 2007 tasks
# Usage:
#	 perl check_format.pl [-h|-w] <submission_filename>
#
# Specify exactly one of:
# -h	Check submission file for "How Many Ratings 2006"
# -w	Check submission file for "Who Rated What 2006"
#
use strict;
use warnings;
use Getopt::Std;
# Global settings, expected line counts, and command-line handling.
$^W = 1; # Global warnings on (in addition to -w and "use warnings")
$| = 1; # Turn on auto-flush
# Exact number of prediction lines a submission must contain, per task.
my ($wrw_lines,$hmr_lines) = (100000,8863);
# Getopt::Std stores the -w / -h switches in these package variables.
our ($opt_w,$opt_h);
# getopts returns false when it meets a switch outside "hw"; stop right away
# rather than carrying on after a mere warning.
getopts("hw") or die "Unrecognized option. Provide one of -h or -w.";
# Exactly one task must be selected: reject both "neither" and "both".
die "Which type to check again? Provide one of -h or -w." unless (defined($opt_w) xor defined($opt_h));

# The first argument left after option parsing is the file to check.
my $submission_filename = $ARGV[0];
die "No file to check!" unless defined $submission_filename;
# Read the submission one line at a time, counting lines and dying on the
# first one that is not a well-formed non-negative decimal number.
my $lines = 0;
# Three-argument open with a lexical handle: the filename is used verbatim,
# so whitespace or mode characters in it are never interpreted by open.
open my $submission_fh, '<', $submission_filename
    or die "Unable to open $submission_filename! ($!)";
# Digits with at most one decimal point and at least one digit
# (e.g. "12", "0.5", ".5", "3."); no sign and no exponent.  A bare character
# class such as [\d.]+ would also let through "1.2.3", ".." or ".".
my $decimal_re = qr/\A(?:[0-9]+\.?[0-9]*|\.[0-9]+)\z/;
while (my $prediction = <$submission_fh>) {
    $prediction =~ s/[\cJ\cM]//g; # lose line terminators anywhere in the string
    $lines++;
    next if $prediction =~ $decimal_re;
    # Both tasks use the same numeric check; only the message wording differs.
    die "Line $.: Improperly formatted probability prediction for WRW: $prediction" if $opt_w;
    die "Line $.: Improperly formatted numeric prediction for HMR: $prediction" if $opt_h;
}
close $submission_fh;
# The selected task must have exactly the expected number of lines.  Report
# the direction of the mismatch: a file that is too long is not "insufficient".
for my $check ( [ 'WRW', $opt_w, $wrw_lines ], [ 'HMR', $opt_h, $hmr_lines ] ) {
    my ($task, $selected, $expected) = @$check;
    next unless defined $selected; # only the task chosen on the command line
    die "Insufficient lines for $task: $lines vs $expected" if $lines < $expected;
    die "Too many lines for $task: $lines vs $expected" if $lines > $expected;
}
print "OK!\n";
