#!/usr/bin/perl -w use strict; ################################# # History: # # version 1: adapted from ctm+mdtm-to-rtml # version 2: sort_ips modified as George Doddington requested. # Confidence field of CTM and MDTM records now preserved. # NON-LEX subtypes now reflect content of ortho field. # version 3: October 1, 2003 # Now merges adjacent IPs, as specified in section 4.2.3 # of version 9 of the EARS 2003 Evaluation Plan. # # To facilitate testing, the program now accepts several MDTM # types that parallel types in the RTTM data format. No # MDTM data that includes those types has been distributed, # and there is no plan to distribute such data. # ################################# ###### # Intro my ($date, $time) = date_time_stamp(); print ";; CTM+MDTM-to-RTTM run on $date at $time\n"; print ";; command line: ", $0, " ", join(" ", @ARGV), "\n"; my $usage = "\n\nUsage: $0 [-h] -c [-m ]\n\n". "Description: This Perl script reads a CTM file and an MDTM file,\n". " reformats the records in these files into RTTM format, then\n". " time-orders the data (in ascending order of Tbeg) and writes\n". " the time-ordered RTTM-formatted records to STDOUT. (The CTM\n". " and MDTM data should both correspond to the same source speech\n". " data file(s), of course.)\n". "INPUT:\n". " -c A file containing word-level transcripts,\n". " in CTM format\n". " -m A file containing metadata information,\n". " in MDTM format\n". "OUTPUT:\n". " RTTM formatted data are written to STDOUT\n". "\n"; ###### #global data my %speaker_data; my %sort_order = ("OMIT" => 0, "NOSCORE" => 1, "NO_RT_METADATA" => 2, "SEGMENT" => 3, "SPEAKER" => 4, "SU" => 5, "A/P" => 6, "CB" => 7, "IP" => 8, "EDIT" => 9, "FILLER" => 10, "NON-SPEECH" => 11, "NON-LEX" => 12, "LEXEME" => 13, "SPKR-INFO" => 14, "SUboundary" => 15); my $dt_MaxIPMerge = 0.3; #maximum between-IP gap to merge two IP's into one. use vars qw ($opt_h $opt_c $opt_m); use Getopt::Std; getopts ('hc:m:'); die $usage if defined($opt_h); die "\n\nFATAL ERROR: no CTM data file specified$usage" unless defined $opt_c; { my $rttm = {}; my $comments = []; my ($file, $chnl); get_data ("CTM", $rttm, $comments, $opt_c); get_data ("MDTM", $rttm, $comments, $opt_m) if $opt_m; foreach $file (keys %$rttm) { foreach $chnl (keys %{$rttm->{$file}}) { add_IPs ($chnl, $rttm->{$file}{$chnl}); } } output_rttm ($rttm, $comments); } ################################# sub output_rttm { my ($data, $comments) = @_; my ($file, $chnl, $name, $item); print ";;This is an RTTM file. Each record contains 9 whitespace separated fields:\n"; print ";; 1:type 2:file 3:chnl 4:tbeg 5:tdur 6:ortho 7:subtype 8:spkrname 9:conf\n"; foreach $item (@$comments) { print "$item\n"; } foreach $file (sort keys %$data) { foreach $chnl (sort keys %{$data->{$file}}) { foreach $name (sort keys %{$speaker_data{$file}}) { print_object ($speaker_data{$file}{$name}); } @{$data->{$file}{$chnl}} = sort sort_objects @{$data->{$file}{$chnl}}; foreach $item (@{$data->{$file}{$chnl}}) { print_object ($item) if $item->{type} =~ /^(LEXEME|NON-LEX|NON-SPEECH|FILLER|EDIT|IP|SU|SPEAKER|SEGMENT|CB|NOSCORE|NO_RT_METADATA)$/; } } } } ################################# sub sort_objects { return ($a->{tbeg} < $b->{tbeg}-0.0001 ? -1 : ($a->{tbeg} > $b->{tbeg}+0.0001 ? 1 : (($a->{type} =~ /end/ and $b->{type} =~ /beg/) ? -1 : (($a->{type} =~ /beg/ and $b->{type} =~ /end/) ? 1 : $sort_order{$a->{type}} <=> $sort_order{$b->{type}})))); } ################################# sub print_object { my ($object) = @_; if ($object->{type} =~ /SPKR-INFO/) { if ($object->{name} =~ //) { return; } elsif ($object->{name} =~ /unknown/) { $object->{stype} = "unknown"; } } printf "%-12s %-12s %3s", $object->{type}, $object->{file}, $object->{chnl}; defined $object->{tbeg} ? (printf " %8.3f%s", $object->{tbeg} , ($object->{tbegValid} ? " ":"*")) : (printf " %9s", " "); if (defined $object->{tdur} and $object->{tdur} =~ //) { print " "; } else { defined $object->{tdur} ? (printf " %6.3f%s", $object->{tdur} , ($object->{tendValid} ? " ":"*")) : (printf " %7s", " "); } printf " %-16s %-16s %-16s %s\n", (defined $object->{ortho} ? $object->{ortho} : ""), (defined $object->{stype} ? $object->{stype} : ""), (defined $object->{name} ? $object->{name} : ""), (defined $object->{conf} ? $object->{conf} : ""); } ################################# sub get_data { my ($type, $data, $comments, $file) = @_; my ($record, @fields, $rec); open DATA, $file or die "\n\nFATAL ERROR: unable to open $type file '$file'$usage"; while ($record = ) { chomp $record; (push @$comments, $record), next if $record =~ /^\s*[\#;]|^\s*$/; @fields = split /\s+/, $record; shift @fields if $fields[0] eq ""; die "\n\nFATAL ERROR: insufficient number of fields in record '$record'\n\n" if @fields < 5; undef $rec; $rec->{TEXT} = $record; $rec->{FILE} = shift @fields; $rec->{CHNL} = shift @fields; $rec->{TBEG} = shift @fields; $rec->{TDUR} = shift @fields; if ($type eq "CTM") { $rec->{RECORD_TYPE} = "CTM"; $rec->{WORD} = shift @fields; $rec->{CONF} = shift @fields; $rec->{TYPE} = shift @fields; } elsif ($type eq "MDTM") { $rec->{RECORD_TYPE} = "MDTM"; $rec->{TYPE} = shift @fields; $rec->{CONF} = shift @fields; $rec->{SUBT} = shift @fields; } $rec->{SPKR} = shift @fields; add_rttm_keys ($rec); push @{$data->{$rec->{FILE}}{$rec->{CHNL}}}, $rec unless $rec->{RECORD_TYPE} eq "OMIT" or ($rec->{RECORD_TYPE} eq "MDTM" and $rec->{TYPE} eq "ip"); } close DATA; } ################################# sub add_rttm_keys { my ($rec) = @_; my ($spkr_info); $rec->{file} = $rec->{FILE}; $rec->{chnl} = $rec->{CHNL}; $rec->{tbeg} = $rec->{TBEG}; $rec->{tbegValid} = 1; $rec->{tdur} = $rec->{TDUR}; $rec->{tend} = ($rec->{tdur} =~ //) ? $rec->{tbeg} : $rec->{tbeg}+$rec->{tdur}; $rec->{tendValid} = 1; $rec->{name} = $rec->{SPKR}; if (defined $rec->{CONF}) { if ($rec->{CONF} eq "-" or $rec->{CONF} eq "NA") { $rec->{conf} = ""; } else { $rec->{conf} = $rec->{CONF}; } } else { $rec->{conf} = ""; } $rec->{TYPE} = lc $rec->{TYPE}; $rec->{TYPE} or die "\n\nFATAL ERROR: no record TYPE in record '$rec->{TEXT}'\n\n"; if ($rec->{RECORD_TYPE} =~ /CTM/) { $rec->{ortho} = $rec->{WORD}; ($rec->{type}, $rec->{stype}) = ($rec->{TYPE} =~ /^lex$/ ? ("LEXEME", "lex") : ($rec->{TYPE} =~ /^frag$/ ? ("LEXEME", "frag") : ($rec->{TYPE} =~ /^fp$/ ? ("LEXEME", "fp") : ($rec->{TYPE} =~ /^un-lex$/ ? ("LEXEME", "un-lex") : ($rec->{TYPE} =~ /^for-lex$/ ? ("LEXEME", "for-lex") : ($rec->{TYPE} =~ /^non-speech$/ ? ("NON-SPEECH", "noise") : ($rec->{TYPE} =~ /^non-lex$/ ? ("NON-LEX", "noise") : # see code at end of sub ($rec->{TYPE} =~ /^misc$/ ? ("OMIT", "other") : ($rec->{TYPE} =~ /^noscore$/ ? ("NOSCORE", "") : die "\n\nFATAL ERROR: unknown record TYPE in record '$rec->{TEXT}'\n\n" ))))))))); if ($rec->{type} eq "LEXEME" and $rec->{stype} eq "lex" and $rec->{ortho} =~ /^[A-Za-z]\.$/) { $rec->{stype} = "alpha"; } if ($rec->{TYPE} eq "NOSCORE") { $rec->{RECORD_TYPE} = "MDTM"; } elsif ($rec->{TYPE} eq "OMIT") { $rec->{RECORD_TYPE} = "OMIT"; # this rec will vanish } } elsif ($rec->{RECORD_TYPE} =~ /MDTM/) { $rec->{SUBT} or die "\n\nFATAL ERROR: no record SUBTYPE in record '$rec->{TEXT}'\n\n"; ($rec->{type}, $rec->{stype}) = ($rec->{TYPE} =~ /^filler$/ ? ($rec->{SUBT} !~ /^(filled_pause|discourse_marker|explicit_editing_term)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("FILLER", $rec->{SUBT})) : ($rec->{TYPE} =~ /^edit$/ ? ($rec->{SUBT} !~ /^(none|simple|repetition|revision|restart|complex)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("EDIT", $rec->{SUBT})) : ($rec->{TYPE} =~ /^su$/ ? ($rec->{SUBT} !~ /^(statement|question|backchannel|incomplete|unannotated)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("SU", $rec->{SUBT})) : ($rec->{TYPE} =~ /^ip$/ ? ($rec->{SUBT} !~ /^(edit|filler|edit&filler)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("IP", $rec->{SUBT})) : ($rec->{TYPE} =~ /^segment$/ ? ($rec->{SUBT} !~ /^(|)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("SEGMENT", "")) : ($rec->{TYPE} =~ /^noscore$/ ? ($rec->{SUBT} !~ /^(|)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("NOSCORE", "")) : ($rec->{TYPE} =~ /^no_rt_metadata$/ ? ($rec->{SUBT} !~ /^(|)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("NO_RT_METADATA", "")) : ($rec->{TYPE} =~ /^cb$/ ? ($rec->{SUBT} !~ /^(coordinating|clausal|other)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("CB", $rec->{SUBT})) : ($rec->{TYPE} =~ /^speaker$/ ? ($rec->{SUBT} !~ /^(adult_male|adult_female|child|unknown)$/ ? die "\n\nFATAL ERROR: unknown $rec->{TYPE} record SUBTYPE $rec->{SUBT} in record '$rec->{TEXT}'\n\n" : ("SPEAKER", "")) : die "\n\nFATAL ERROR: unknown record TYPE in record '$rec->{TEXT}'\n\n" ))))))))); delete $rec->{stype} if $rec->{stype} =~ /^(none|unknown)$/; undef $spkr_info; $spkr_info = {type => "SPKR-INFO", file => $rec->{file}, chnl => $rec->{chnl}, name => $rec->{name}}; $spkr_info->{stype} = ($rec->{SUBT} =~ /unknown/ or $rec->{name} =~ /unknown/) ? "unknown" : $rec->{SUBT} if $rec->{type} eq "SPEAKER"; if (defined $rec->{name}) { if (my $spkr=$speaker_data{$rec->{file}}{$rec->{name}}) { ($spkr->{chnl} eq $spkr_info->{chnl}) or die "\n\nFATAL ERROR: inconsistent speaker chnl info for speaker $spkr->{name} in file $rec->{file}:" ." '$spkr->{chnl}' and '$spkr_info->{chnl}'\n\n"; (not defined $spkr->{stype} or not defined $spkr_info->{stype} or $spkr->{stype} eq $spkr_info->{stype}) or die "\n\nFATAL ERROR: inconsistent speaker type info for speaker $spkr->{name} in file $rec->{file}:" ." '$spkr->{stype}' and '$spkr_info->{stype}'\n\n"; $spkr_info->{stype} = $spkr->{stype} if defined $spkr->{stype}; } } $speaker_data{$rec->{file}}{$rec->{name}} = $spkr_info; } # If type is non-lex and subtype is noise, then fix up the subtype to correspond to the ortho if ($rec->{type} eq "NON-LEX" and $rec->{stype} eq "noise") { if ($rec->{ortho} =~ /^(breath|breathes|breaths|breathing|breath_sound|breath-sound)$/) { $rec->{stype} = "breath"; } elsif ($rec->{ortho} =~ /^(cough|coughs|coughing)$/) { $rec->{stype} = "cough"; } elsif ($rec->{ortho} =~ /^(laugh|laughs|laughter|laughing)$/) { $rec->{stype} = "laugh"; } elsif ($rec->{ortho} =~ /^(lipsmack|lip-smack|lip_smack)$/) { $rec->{stype} = "lipsmack"; } elsif ($rec->{ortho} =~ /^(sneeze|sneezes|sneezing)$/) { $rec->{stype} = "sneeze"; } else { $rec->{stype} = "other"; } } } ################################# sub add_IPs { my ($chnl, $rttm) = @_; my ($object, $ip, $ipp, $prev_edit, @ipdata, @ips); foreach $object (@$rttm) { push @ipdata, $object if $chnl eq $object->{chnl}; } @ipdata = sort sort_ips @ipdata; foreach $object (@ipdata) { if ($object->{type} =~ /FILLER|EDIT/) { undef $ip; $ip->{type} = "IP"; $ip->{file} = $object->{file}; $ip->{chnl} = $object->{chnl}; $ip->{name} = $object->{name}; if ($object->{type} eq "FILLER") { $ipp = $ips[@ips-1] if $prev_edit; if ($prev_edit) { $ipp->{stype} = "edit&filler"; } else { $ip->{stype} = "filler"; $ip->{tbeg} = $ip->{tend} = $object->{tbeg}; $ip->{tbegValid} = $ip->{tendValid} = $object->{tbegValid}; push @ips, $ip; } } else { $ip->{stype} = "edit"; $ip->{tbeg} = $ip->{tend} = $object->{tend}; $ip->{tbegValid} = $ip->{tendValid} = $object->{tendValid}; push @ips, $ip; $prev_edit = 1; } } #elsif ($object->{type} =~ /LEXEME|NON-LEX/) { elsif ($object->{type} =~ /LEXEME|SU/) { undef $prev_edit; } } push @$rttm, @ips if @ips; } ################################# sub sort_ips { return (ip_time($a) < ip_time($b)-0.0001 ? -1 : (ip_time($a) > ip_time($b)+0.0001 ? 1 : $sort_order{$a->{type}} <=> $sort_order{$b->{type}})); } ################################# sub ip_time { my ($object) = @_; return $object->{type} =~ /EDIT/ ? $object->{tend} : $object->{tbeg}; } ################################# sub prev_sort_ips { return (($a->{type} =~ /EDIT/ ? $a->{tend} : $a->{tbeg}) > ($b->{type} =~ /EDIT/ ? $b->{tend} : $b->{tbeg}) - 0.0001 ? 1 : ($b->{type} =~ /EDIT/ ? $b->{tend} : $b->{tbeg}) > ($a->{type} =~ /EDIT/ ? $a->{tend} : $a->{tbeg}) -0.0001 ? -1 : 0); } ################################# sub date_time_stamp { my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my ($date, $time); $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; return ($date, $time); }