#!/usr/bin/perl -w use strict; ################# # # RTTM_TO_CTM+MDTM.PL # # v01 Created (in part) by re-using code from df_eval. # v02 Fixed failure to lower-case type fields for comparison with allowed values. # ################# # # Intro # my ($date, $time) = date_time_stamp(); print "rttm_to_ctm+mdtm.pl run on $date at $time\n"; print "command line: ", $0, " ", join(" ", @ARGV), "\n"; my $usage = "\n\nUsage: $0 [-h] -r -c -m \n\n". "Description: rttm_to_ctm+mdtm creates CTM and MDTM files\n". " from an RTTM input file\n". "INPUT:\n". " -r A file containing data, in RTTM format.\n". " -c A CTM file with this name will be created.\n". " -m An MDTM file with this name will be created.\n". "\n"; ################# # # Global data # my %df_types = (filler => {filled_pause => 1, discourse_marker => 1, explicit_editing_term => 1}, edit => {repetition => 1, revision => 1, restart => 1, simple => 1, complex => 1}, ip => {filler => 1, edit => 1, "edit&filler" => 1}); use vars qw ($opt_h $opt_r $opt_c $opt_m); use Getopt::Std; getopts ('r:c:m:h'); die $usage if defined($opt_h); die "\n\nFATAL ERROR: no rttm input file specified$usage" unless defined $opt_r; die "\n\nFATAL ERROR: no ctm output data file specified$usage" unless defined $opt_c; my $exp_id = ""; my $inputs = ""; { my (%ref); get_rttm_data (\%ref, $opt_r); gen_ctm(\%ref); if ($opt_m) { gen_mdtm(\%ref); } } ################################# sub get_rttm_data { my ($data, $file) = @_; my ($record, @fields, $data_type, $word, @words, $token, $chnl); open DATA, $file or die "\n\nFATAL ERROR: unable to open RTTM file '$file'$usage"; while ($record = ) { chomp $record; next if $record =~ /^\s*[\#;]|^\s*$/; @fields = split /\s+/, $record; shift @fields if $fields[0] eq ""; $data_type = shift @fields; if ($data_type =~ /^(LEXEME|NON-LEX|NON-SPEECH)$/) { undef $word; $word->{FILE} = shift @fields; $word->{CHNL} = shift @fields; $word->{TBEG} = shift @fields; $word->{TBEG} =~ s/\*//; $word->{TDUR} = shift @fields; $word->{TDUR} =~ s/\*//; $word->{WORD} = shift @fields; $word->{stype} = shift @fields; $word->{SPKR} = shift @fields; $word->{CONF} = shift @fields; $word->{CONF} = "-" if (not defined $word->{CONF} or $word->{CONF} eq ""); $word->{SPKR} = "unknown" if (not defined $word->{SPKR} or $word->{SPKR} eq ""); $word->{TEND} = $word->{TBEG}+$word->{TDUR}; $word->{TMID} = $word->{TBEG}+$word->{TDUR}/2; $word->{TYPE} = ($data_type =~ /^NON-SPEECH$/ ? "misc" : ($data_type =~ /^NON-LEX$/ ? "non-lex" : ($word->{stype} =~ /^fp$/ ? "fp" : ($word->{stype} =~ /^frag$/ ? "frag" : ($word->{stype} =~ /^un-lex$/ ? "un-lex" : ($word->{stype} =~ /^for-lex$/ ? "for-lex" : "lex")))))); push @{$data->{CTM}{$word->{FILE}}{$word->{CHNL}}}, $word; } elsif ($data_type eq "SPKR-INFO") { undef $token; $token->{FILE} = shift @fields; $token->{CHNL} = shift @fields; shift @fields; shift @fields; shift @fields; $token->{SUBT} = lc shift @fields; $token->{SPKR} = shift @fields; shift @fields; $token->{SUBT} =~ /^(adult_female|adult_male|child|unknown)$/ or die("\n\nFATAL ERROR: unknown spkr_type ($token->{SUBT}) of speaker ($token->{SPKR}) in file '$file'\n\trecord is '$record'\n\n"); $data->{SPKR_TYPE}{$token->{SPKR}} = $token->{SUBT}; } elsif ($data_type =~ /^(FILLER|EDIT|IP|SPEAKER)$/) { undef $token; $token->{TYPE} = $data_type; $token->{FILE} = shift @fields; $token->{CHNL} = shift @fields; $token->{TBEG} = shift @fields; $token->{TBEG} =~ s/\*//; $token->{TDUR} = shift @fields; $token->{TDUR} =~ s/\*//; $token->{TDUR} = 0 if $token->{TDUR} eq ""; $token->{ortho} = shift @fields; $token->{SUBT} = lc shift @fields; $token->{SPKR} = shift @fields; $token->{CONF} = shift @fields; $token->{TEND} = $token->{TBEG}+$token->{TDUR}; $token->{TMID} = $token->{TBEG}+$token->{TDUR}/2; if ($data_type ne "SPEAKER" and not defined $df_types{lc $token->{TYPE}}{$token->{SUBT}}) { die("\n\nFATAL ERROR: unknown disfluency subtype ($token->{SUBT}) in file '$file'\n". " record is '$record'\n\n"); } push @{$data->{MDTM}{$token->{FILE}}{$token->{CHNL}}}, $token; push @{$data->{filler}{$token->{FILE}}{$token->{CHNL}}}, $token if $token->{TYPE} eq "filler"; push @{$data->{edit}{$token->{FILE}}{$token->{CHNL}}}, $token if $token->{TYPE} eq "edit"; push @{$data->{ip}{$token->{FILE}}{$token->{CHNL}}}, $token if $token->{TYPE} eq "ip"; } } close DATA; } ################################# sub gen_ctm { my ($ref_data) = @_; my ($file, $spkr, $chnl); my ($ref_wds, $wd); open CTMFILE, ">$opt_c" or die "\n\nFATAL ERROR: unable to open ctm file '$opt_c'$usage"; foreach $file (sort keys %{$ref_data->{CTM}}) { die "\n\nFATAL ERROR: no words output for file '$file'\n\n" unless defined $ref_data->{CTM}{$file}; foreach $chnl (sort keys %{$ref_data->{CTM}{$file}}) { $ref_wds = $ref_data->{CTM}{$file}{$chnl}; for (my $i=0; $i<@$ref_wds; $i++) { $wd = $ref_wds->[$i]; if ($wd->{CONF} eq "-" or $wd->{CONF} eq "") { $wd->{CONF} = "NA"; } printf CTMFILE "$file $chnl %.3f %.3f $wd->{WORD} $wd->{CONF} $wd->{TYPE} $wd->{SPKR}\n", $wd->{TBEG}, $wd->{TDUR}; } } } close CTMFILE; } ################################# sub gen_mdtm { my ($ref_data) = @_; my ($file, $spkr, $chnl); my ($ref_toks, $tok); open MDTMFILE, ">$opt_m" or die "\n\nFATAL ERROR: unable to open mdtm file '$opt_m'$usage"; foreach $file (sort keys %{$ref_data->{MDTM}}) { die "\n\nFATAL ERROR: no metadata output for file '$file'\n\n" unless defined $ref_data->{MDTM}{$file}; foreach $chnl (sort keys %{$ref_data->{MDTM}{$file}}) { $ref_toks = $ref_data->{MDTM}{$file}{$chnl}; for (my $i=0; $i<@$ref_toks; $i++) { $tok = $ref_toks->[$i]; if ($tok->{CONF} eq "-" or $tok->{CONF} eq "") { $tok->{CONF} = "NA"; } if ($tok->{TYPE} eq "SPEAKER") { print MDTMFILE "$file $chnl $tok->{TBEG} $tok->{TDUR} speaker $tok->{CONF} $ref_data->{SPKR_TYPE}{$tok->{SPKR}} $tok->{SPKR}\n"; } elsif ($tok->{TYPE} eq "FILLER" or $tok->{TYPE} eq "filler") { print MDTMFILE "$file $chnl $tok->{TBEG} $tok->{TDUR} filler $tok->{CONF} $tok->{SUBT} $tok->{SPKR}\n"; } elsif ($tok->{TYPE} eq "EDIT" or $tok->{TYPE} eq "edit") { print MDTMFILE "$file $chnl $tok->{TBEG} $tok->{TDUR} edit $tok->{CONF} $tok->{SUBT} $tok->{SPKR}\n"; } elsif ($tok->{TYPE} eq "IP" or $tok->{TYPE} eq "ip") { print MDTMFILE "$file $chnl $tok->{TBEG} $tok->{TDUR} ip $tok->{CONF} $tok->{SUBT} $tok->{SPKR}\n"; } else { print "unknown tok $file $chnl type=$tok->{TYPE} beg=$tok->{TBEG} spkr=$tok->{SPKR}\n"; } } } } close MDTMFILE; } ################################# sub date_time_stamp { my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime(); my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); my ($date, $time); $time = sprintf "%2.2d:%2.2d:%2.2d", $hour, $min, $sec; $date = sprintf "%4.4s %3.3s %s", 1900+$year, $months[$mon], $mday; return ($date, $time); } #################################