#!/usr/local/bin/perl -w use strict; ################################# # History: # # version 3 # * bug fix in SOURCE decoding # * option added to not count spaces (to accommodate systems that add spaces) # # version 2 # * zero-pad TIMEX2 ID's to facilitate chronological ordering in ace-eval # # version 1 # * adapted from score_timex2 # * bug fix in GetTags # * whitespace characters kept in start/end character count # ################################# ####################################################################### ## Converting TIMEX2 annotations in MITRE TERN format to APF format ## ---------------------------- ## ## USAGE ## ----- ## Invoke this script as follows: ## perl tern2apf.pl -f [-w -s "source" -t "type"] > ## ## use the -w to ignore whitespace in computing character indices ## ## Example: ## perl tern2apf.pl -f APW0001.sgml -s newswire -t text >APW0001.apf ## ## If you're on a PC, you have to have perl installed and then you can do ## the same thing in a command prompt window. ## ## WARNINGS / INPUT ASSUMPTIONS ## ---------------------------- ## 1. All open and close tags are paired (i.e. no at this time) ## 2. Closing tags must look like ## 3. No text inside of text ... ... ... ## (That really means the TEXT tag. For other tags it is OK.) ## ####################################################################### ####################################################################### ### Variables for user to modify ####################################################################### my $DocType = "DOCTYPE"; my $DocTag = "DOC"; my $DocLabel = "DOC_?ID"; my $DocNo = "DOCNO"; # srl - hack to handle both lowercase and uppercase attribute names # enter both cases in table ## Count specified attributes ### Use 1 to count attribute, 0 to ignore my %CountableTags = ("TIMEX2" => { "TEXT" => 1, "VAL" => 1, "MOD" => 1, "SET" => 1, "GRANULARITY" => 1, # srl - obsolete "NON_SPECIFIC" => 1, "PERIODICITY" => 1, # srl - obsolete "ANCHOR_DIR" => 1, "ANCHOR_VAL" => 1, "TYPE", => 0, "COMMENTS" => 0, "COMMENT" => 0 }, ); ## Ignore tags like this my %IgnoreTags = ( "S" => 1, "s" => 1, "BR" => 1, "br" => 1 ); ####################################################################### ### Nothing to modify below this ####################################################################### undef $/; ## Global variables my $TAG = "(?:<[^>]*>)"; my $usage = "Usage: perl tern2apf.pl -f [-w -s \"source\" -t \"type\"]\n"; use vars qw ($opt_h $opt_w $opt_f $opt_s $opt_t $source); use Getopt::Std; getopts ('hwf:s:t:'); not defined $opt_h or die $usage; defined $opt_f or die $usage; #### Load Files my $File1; open (FILE, $opt_f) or die "\nUnable to open file '$opt_f'", $usage; $File1 .= $_ while (); close FILE; $File1 =~ s/[\013]([\n]?)/\n/g; #normalize line termination (discard carriage returns) $opt_s = "newswire" unless defined $opt_s; $opt_t = "text" unless defined $opt_t; print "\n"; print "\n"; while($File1 =~ /<\/$DocTag>/mo) { # srl - look for end of a doc my $Doc1 = $` . $&; # srl - get next doc, leave on start tag $File1 = $'; # srl - rest of file $Doc1 =~ s/.*<$DocTag>(.*)<\/$DocTag>/$1/s; my $new_source = $1 if $Doc1 =~ /<$DocType\s[^>]*SOURCE\s*=\s*\"([^\">]*)\"/s; if (not $source or ($new_source and $new_source ne $source)) { print "\n" if $source; $source = $new_source ? $new_source : $opt_s; print "\n"; } ScoreText($Doc1); print "\n"; } ## End of document processing loop print "\n"; ############################################### ###### END of Main Program ###### ############################################### ############################################## ## sub ScoreText ## Runs scoring and prints out results ## Takes three inputs: KeyText, TestText and ErrID ## The ErrID is used for display when errors occur ## Returns a hash of arrays ## Keys are attribute names ## Array values are (Correct, Incorrect, Missing, Spurious) ############################################## sub ScoreText { my ($Key) = @_; my(%KeyTags, @KeySK, %TextTags, @TextSK); my($KeyExt, $TextExt, $KeyTH, $TextTH, %KeyAV, %TextAV); my($tmpKTag, $KTag, $tmpTTag, $TTag, $KStart, $TStart,$KEnd, $TEnd); my ($docid, $sn); my $LAB_FORMAT = "%4s %15s | %30s | %30s\n"; #eliminate unclosed tags foreach my $tag ("TURN") { $Key =~ s/<$tag>//gs; } %KeyTags = GetTags($Key); # srl - docs are same length, get tags @KeySK = sort(keys %KeyTags); foreach $KTag (@KeySK) { next unless $KTag =~ /DOCNO/; print "\n" if defined $docid; $docid = $KeyTags{$KTag}[1]; $docid =~ s/^\s*|\s*$//g; print "\n"; $sn = 0; last; } while($KeySK[0]) { # srl - loop through tags in order $KeySK[0] =~ /(\d+):(\d+):(\w+)/o; $KStart = $1; $KEnd = $2; $tmpKTag = $3; my @temp = @{$KeyTags{$KeySK[0]} }; # srl - entry value $KTag = uc $tmpKTag; # srl uppercase all tags for comparison purposes if (defined ($CountableTags{$KTag})) { $KeyTH = $temp[0]; # srl - tag $KeyExt = $temp[1]; # srl - text between start & end tag %KeyAV = AttVal($KeyTH); $sn++; printf "\n". " \n"; $KeyExt =~ s/\n/ /gs; printf " \n". " $KeyExt\n". " \n". " \n", $KStart, $KEnd; print "\n"; } else { if($KTag =~ /\ABAD_DATA_/o) { my $temp = $'; printf STDERR ($LAB_FORMAT, "BAD ", $temp, $KeyTags{$KeySK[0]}[0], ""); if($KeyTags{$KeySK[0]}[1]) { printf STDERR ($LAB_FORMAT, "BAD ", "TEXT", $KeyTags{$KeySK[0]}[1], ""); } } } shift @KeySK; } } ############################################## ## sub AttVal ## Extracts attributes and values from a tag ############################################## sub AttVal { my($tag, %AV); ($tag) = @_; my($content, $temp); unless($tag =~ /<([^>]+)>/om) { print STDERR "*** BAD TAG *** $tag\n"; } $content = $1; while($content =~ /(\S+)=\"([^\"]+)\"/) { $AV{$1} = $2; $temp = quotemeta($&); $content =~ s/$temp//; } return(%AV); } ############################################## ## sub GetTags ## Finds all tags in a string ## Takes one input: The string ## Returns a hash of arrays ## Keys are strings of the form StartPosition:EndPosition ## Array values are (TagContents, TaggedData) ############################################## sub GetTags { my($string, %TagHash); ($string) = @_; my($SPos, $EPos, $key); foreach my $tag (keys %IgnoreTags) { $string =~ s/<($IgnoreTags{$tag})[^>]*>|<\/$IgnoreTags{$tag}>//gs; } ## Get all tags!!! while($string =~ /<\/(\w+)>/s) { # srl - find an end tag my $type = $1; my $before_endtag = $`; my $after_endtag = $'; if ($before_endtag =~ /^(.*)(<$type[^>]*>)/s) { # find matching start tag my $before_tag = $1; my $TagHead = $2; my $Int = $'; $string = $before_tag . $Int . $after_endtag; $before_tag =~ s/$TAG//sg; # srl - get rid of tags $SPos = length $before_tag; # srl - start position $SPos -= $before_tag =~ s/\s//gs if $opt_w; # don't count whitespace $EPos = length($Int) - 1 + $SPos; # srl - end position $EPos -= $Int =~ s/(\s)/$1/gs if $opt_w; # don't count whitespace if ($Int =~ /$TAG/s) { ## Unclosed tag, or crossing tags $key = sprintf("%07d:%07d:%s", $SPos, $EPos, "BAD_DATA_UNCLOSED_TAG"); $TagHash{$key} = [uc $TagHead, $Int]; } $key = sprintf("%07d:%07d:%s", $SPos, $EPos, uc $type); # srl - key is startpos, endpos, tagtype #srl upcase tag if(defined($TagHash{$key})) { ## Duplicate tag at one location $key = sprintf("%07d:%07d:%s", $SPos, $EPos, "BAD_DATA_DUP_TAG"); } $TagHash{$key} = [uc $TagHead , $Int]; # srl - add to hashtable, value=starttag . text between start & end tags #srl upcase taghead } else { ## Unmatched close tag $string = $before_endtag . $after_endtag; $before_endtag =~ s/$TAG//sg; # srl - get rid of tags $SPos = length $before_endtag; # srl - start position $key = sprintf("%07d:%07d:%s", $SPos, $SPos, "BAD_DATA_UNMATCHED_CLOSE"); $TagHash{$key} = [$type , ""]; } } return(%TagHash); }