#! /usr/bin/perl -w
# Check submission format for KDD Cup 2007 tasks
# Usage:
#   perl check_format.pl [-h|-w] <submission_file>
#
# Specify exactly one of:
#   -h   Check submission file for "How Many Ratings 2006"
#   -w   Check submission file for "Who Rated What 2006"
#
# The script dies with a diagnostic on the first malformed line or on a
# wrong total line count, and prints "OK!" when the file passes.

use strict;
use warnings;
use Getopt::Std;

$^W = 1;
$|  = 1;    # Turn on auto-flush

# Number of prediction lines expected for each task.
my ($wrw_lines, $hmr_lines) = (100000, 8863);

our ($opt_w, $opt_h);
getopts("hw");

# Exactly one of -h / -w must be given.
die "Which type to check again?  Provide one of -h or -w."
    if (defined $opt_w and defined $opt_h)
    or (not defined $opt_w and not defined $opt_h);

my $submission_filename = $ARGV[0];
die "No file to check!" unless defined $submission_filename;

# Per-task settings, so the per-line check and the final count check are
# each written once instead of being duplicated per option.
my ($task, $kind, $expected_lines) =
    $opt_w
    ? ('WRW', 'probability', $wrw_lines)
    : ('HMR', 'numeric',     $hmr_lines);

# A single non-negative decimal number: "5", "5.", ".5" or "0.25".
# Rejects strings such as "1.2.3" or "." that a bare [\d.]+ would let through.
my $number_re = qr/\A(?:\d+\.?\d*|\.\d+)\z/;

my $lines = 0;

# Three-argument open with a lexical handle: the filename is never
# interpreted as an open mode.
open my $submission, '<', $submission_filename
    or die "Unable to open $submission_filename! ($!)";

while (my $prediction = <$submission>) {
    # Lose line terminators (LF and CR) anywhere in the string, so files
    # with Unix, DOS or mixed line endings are handled alike.
    $prediction =~ tr/\r\n//d;
    $lines++;

    die "Line $.: Improperly formatted $kind prediction for $task: $prediction"
        unless $prediction =~ $number_re;
}

close $submission;

# The count must match exactly; too many lines is as wrong as too few.
die "Wrong number of lines for $task: $lines vs $expected_lines"
    if $lines != $expected_lines;

print "OK!\n";