#!/usr/bin/perl -w use strict; # Check a TREC 2008 feedback track retrieval track submission for various # common errors: # * extra fields # * multiple run tags # * missing or extraneous topics # * invalid retrieved documents # * duplicate retrieved documents in a single topic # * too many documents retrieved for a topic # * fewer than maximum allowed retrieved for a topic (warning) # Messages regarding submission are printed to an error log # Results input file is in the form # topic_num Q0 docno rank sim tag # # Change these variable values to the directory in which the error log # should be put my $errlog_dir = "."; # If more than 25 errors, then stop processing; something drastically # wrong with the file. my $MAX_ERRORS = 25; my @topics = (702, 704, 706, 708, 710, 714, 716, 722, 726, 738, 740, 742, 744, 746, 748, 760, 762, 764, 766, 768, 770, 772, 776, 782, 786, 788, 790, 792, 794, 796, 798, 800, 802, 804, 806, 808, 812, 814, 818, 820, 826, 828, 834, 836, 838, 840, 842, 846, 848, 850, 2010, 2098, 2106, 2126, 2134, 2174, 2186, 2234, 2270, 2378, 2386, 2430, 2454, 2466, 2474, 2478, 2510, 2562, 2650, 2674, 2682, 2722, 2752, 2906, 2962, 2994, 3030, 3038, 3094, 3158, 3206, 3210, 3230, 3238, 3254, 3310, 3338, 3362, 3374, 3390, 3402, 3510, 3554, 3558, 3566, 3618, 3718, 3802, 3918, 3938, 3972, 4050, 4082, 4150, 4162, 4258, 4278, 4290, 4294, 4330, 4354, 4358, 4398, 4414, 4438, 4566, 4594, 4638, 4670, 4710, 4726, 4766, 4830, 4878, 4910, 4946, 4970, 5026, 5094, 5106, 5126, 5134, 5214, 5250, 5294, 5310, 5330, 5454, 5554, 5590, 5654, 5686, 5730, 5734, 5738, 5806, 5814, 5898, 6010, 6018, 6026, 6030, 6038, 6042, 6062, 6086, 6110, 6122, 6142, 6172, 6198, 6206, 6222, 6286, 6322, 6338, 6414, 6450, 6462, 6486, 6498, 6518, 6602, 6618, 6624, 6634, 6654, 6710, 6718, 6730, 6766, 6786, 6834, 6854, 6906, 6942, 7038, 7046, 7130, 7142, 7234, 7262, 7286, 7314, 7318, 7322, 7462, 7522, 7544, 7566, 7586, 7606, 7662, 7710, 7746, 7762, 7766, 7802, 7830, 7858, 7878, 8002, 8138, 8218, 8290, 8318, 8366, 8370, 8386, 8406, 8486, 8554, 8566, 8602, 8642, 8750, 8754, 8762, 8818, 8842, 8850, 8886, 8894, 8906, 8930, 8946, 8978, 9086, 9098, 9130, 9138, 9150, 9198, 9290, 9314, 9394, 9402, 9442, 9482, 9550, 9566, 9570, 9578, 9602, 9670, 9714, 9722, 9726, 9762, 9786, 9798, 9922, 9958, 9994); my $MAX_RET = 2500; my %docnos; # hash of all valid docnos my %numret; # number of docs retrieved per topic my $task; # A, B, C, D, E, or F my $results_file; # input file to be checked my $errlog; # file name of error log my ($q0warn, $num_errors); # flags for errors detected my $d; # current docid my $line; # current input line my ($topic_string,$q0,$docno,$rank,$sim,$tag,$rest); my $line_num; # current input line number my ($topic, $old_topic); my $run_id; my $found; my ($i,$t,$col1,$col2,$last_i); my $usage = "Usage: $0 [ABDCEF] resultsfile\n"; $task = shift @ARGV or die $usage; $results_file = shift @ARGV or die $usage; die $usage unless ($task =~ /[ABCDEF]/); # Initialize data structures used in checks if ($task eq "F") { @topics = (704, 726, 746, 762, 770, 776, 792, 808, 812, 814, 820, 828, 836, 2722, 4358, 4766, 5214, 6222, 6518, 6786, 6834, 7286, 8138, 8818, 9130); } # number retrived difficulty value assigned per topic foreach $t (@topics) { $numret{$t} = 0; } open RESULTS, "<$results_file" || die "Unable to open results file $results_file: $!\n"; my @path = split "/", $results_file; my $base = pop @path; $errlog = $errlog_dir . "/" . $base . ".errlog"; open ERRLOG, ">$errlog" || die "Cannot open error log for writing\n"; $q0warn = 0; $num_errors = 0; $line_num = 0; $old_topic = "-1"; $run_id = ""; while ($line = ) { chomp $line; next if ($line =~ /^\s*$/); undef $tag; my @fields = split " ", $line; $line_num++; if (scalar(@fields) == 6) { ($topic_string,$q0,$docno,$rank,$sim,$tag) = @fields; } else { &error("Too many fields"); exit 255; } # make sure runtag is ok if (! $run_id) { # first line --- remember tag $run_id = $tag; if ($run_id !~ /^[A-Za-z0-9.]{1,12}$/) { &error("Run tag `$run_id' is malformed"); next; } } else { # otherwise just make sure one tag used if ($tag ne $run_id) { &error("Run tag inconsistent (`$tag' and `$run_id')"); next; } } # get topic number ($topic = $topic_string) =~ s/^0+//; if (!exists($numret{$topic})) { &error("Unknown topic ($topic_string)"); $topic = 0; next; } # make sure second field is "Q0" if ($q0 ne "Q0" && ! $q0warn) { $q0warn = 1; &error("Field 2 is `$q0' not `Q0'"); } # make sure DOCNO known and not duplicated if ($docno =~ /^GX\d\d\d-\d\d-\d\d\d\d\d\d\d\d?$/) { # valid DOCNO if (exists $docnos{$docno} && $docnos{$docno} eq $topic) { &error("Document `$docno' retrieved more than once for topic $topic"); next; } $docnos{$docno} = $topic; } else { # invalid DOCNO &error("Unknown document `$docno'"); next; } # remove leading 0's from rank (but keep final 0!) $rank =~ s/^0*//; if (! $rank) { $rank = "0"; } $numret{$topic}++; } # Do global checks: # error if some topic has no (or too many) documents retrieved for it # warning if too few documents retrieved for a topic foreach $t (@topics) { if ($numret{$t} == 0) { &error("No documents retrieved for topic $t"); } elsif ($numret{$t} > $MAX_RET) { &error("Too many documents ($numret{$t}) retrieved for topic $t"); } elsif ($numret{$t} < $MAX_RET) { print ERRLOG "$0 of $results_file: WARNING: only $numret{$t} documents retrieved for topic $t\n" } } print ERRLOG "Finished processing $results_file\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; if ($num_errors) { exit 255; } exit 0; # print error message, keeping track of total number of errors # line numbers refer to SORTED file since that is the actual input file sub error { my $msg_string = pop(@_); print ERRLOG "$0 of $results_file: Error on line $line_num --- $msg_string\n"; $num_errors++; if ($num_errors > $MAX_ERRORS) { print ERRLOG "$0 of $results_file: Quit. Too many errors!\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; exit 255; } }