b2txt25/language_model/tools/validate_dict_dir.pl

#!/usr/bin/env perl

# Apache 2.0.
# Copyright  2012 Guoguo Chen
#            2015 Daniel Povey
#            2017 Johns Hopkins University (Jan "Yenda" Trmal <jtrmal@gmail.com>)
#
# Validation script for 'dict' directories (e.g. data/local/dict)

# this function reads the opened file (supplied as a first
# parameter) into an array of lines. For each
# line, it tests whether it's a valid utf-8 compatible
# line. If all lines are valid utf-8, it returns the lines
# decoded as utf-8, otherwise it assumes the file's encoding
# is one of those 1-byte encodings, such as ISO-8859-x
# or Windows CP-X.
# Please recall we do not really care about
# the actually encoding, we just need to
# make sure the length of the (decoded) string
# is correct (to make the output formatting looking right).
sub get_utf8_or_bytestream {
  use Encode qw(decode encode);
  my $is_utf_compatible = 1;
  my @unicode_lines;
  my @raw_lines;
  my $raw_text;
  my $lineno = 0;
  my $file = shift;

  while (<$file>) {
    $raw_text = $_;
    last unless $raw_text;
    if ($is_utf_compatible) {
      my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ;
      $is_utf_compatible = $is_utf_compatible && defined($decoded_text);
      push @unicode_lines, $decoded_text;
    } else {
      #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n";
      ;
    }
    push @raw_lines, $raw_text;
    $lineno += 1;
  }

  if (!$is_utf_compatible) {
    return (0, @raw_lines);
  } else {
    return (1, @unicode_lines);
  }
}

# check if the given unicode string contain unicode whitespaces
# other than the usual four: TAB, LF, CR and SPACE
sub validate_utf8_whitespaces {
  my $unicode_lines = shift;
  use feature 'unicode_strings';
  for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) {
    my $current_line = $unicode_lines->[$i];
    if ((substr $current_line, -1) ne "\n"){
      print STDERR "$0: The current line (nr. $i) has invalid newline\n";
      return 1;
    }
    my @A = split(" ", $current_line);
    my $utt_id = $A[0];
    # we replace TAB, LF, CR, and SPACE
    # this is to simplify the test
    if ($current_line =~ /\x{000d}/) {
      print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n";
      return 1;
    }
    $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g;
    if ($current_line =~/\s/) {
      print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n";
      return 1;
    }
  }
  return 0;
}

# checks if the text in the file (supplied as the argument) is utf-8 compatible
# if yes, checks if it contains only allowed whitespaces. If no, then does not
# do anything. The function seeks to the original position in the file after
# reading the text.
sub check_allowed_whitespace {
  my $file = shift;
  my $pos = tell($file);
  (my $is_utf, my @lines) = get_utf8_or_bytestream($file);
  seek($file, $pos, SEEK_SET);
  if ($is_utf) {
    my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines);
    print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n";
    if ($has_invalid_whitespaces) {
      print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n";
      return 0;
    } else {
      print "--> text contains only allowed whitespaces\n";
    }
  } else {
    print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n";
  }
  return 1;
}


if(@ARGV != 1) {
  die "Usage: validate_dict_dir.pl <dict-dir>\n" .
      "e.g.: validate_dict_dir.pl data/local/dict\n";
}

$dict = shift @ARGV;
$dict =~ s:/$::;

$exit = 0;
$success = 1;  # this is re-set each time we read a file.

sub set_to_fail { $exit = 1; $success = 0; }

# Checking silence_phones.txt -------------------------------
print "Checking $dict/silence_phones.txt ...\n";
if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;}
$idx = 1;
%silence = ();
$crlf = 1;

print "--> reading $dict/silence_phones.txt\n";
check_allowed_whitespace(\*S) || set_to_fail();
while(<S>) {
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n";
  }
  foreach(0 .. @col-1) {
    my $p = $col[$_];
    if($silence{$p}) {
      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n";
    } else {
      $silence{$p} = 1;
    }
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx ++;
}
close(S);
$success == 0 || print "--> $dict/silence_phones.txt is OK\n";
print "\n";

# Checking optional_silence.txt -------------------------------
print "Checking $dict/optional_silence.txt ...\n";
if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;}
if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;}
$idx = 1;
$success = 1;
$crlf = 1;
print "--> reading $dict/optional_silence.txt\n";
check_allowed_whitespace(\*OS) or exit 1;
while(<OS>) {
  chomp;
  my @col = split(" ", $_);
  if ($idx > 1 or @col > 1) {
    set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n";
  } elsif (!$silence{$col[0]}) {
    set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n";
  }
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  $idx ++;
}
close(OS);
$success == 0 || print "--> $dict/optional_silence.txt is OK\n";
print "\n";

# Checking nonsilence_phones.txt -------------------------------
print "Checking $dict/nonsilence_phones.txt ...\n";
if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;}
if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;}
$idx = 1;
%nonsilence = ();
$success = 1;
$crlf = 1;
print "--> reading $dict/nonsilence_phones.txt\n";
check_allowed_whitespace(\*NS) or set_to_fail();
while(<NS>) {
  if ($crlf == 1 && m/\r/) {
    print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n";
    set_to_fail();
    $crlf = 0;
  }
  if (! s/\n$//) {
    print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n";
    set_to_fail();
  }
  my @col = split(" ", $_);
  if (@col == 0) {
    set_to_fail();
    print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n";
  }
  foreach(0 .. @col-1) {
    my $p = $col[$_];
    if($nonsilence{$p}) {
      set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n";
    } else {
      $nonsilence{$p} = 1;
    }
    # phones that start with the pound sign/hash may be mistaken for
    # disambiguation symbols; phones ending in _B, _E, _S or _I will cause
    # problems with word-position-dependent systems, and <eps> is obviously
    # confusable with epsilon.
    if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "<eps>"){
      set_to_fail();
      print "--> ERROR: phone \"$p\" has disallowed written form\n";
    }
  }
  $idx ++;
}
close(NS);
$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n";
print "\n";

# Checking disjoint -------------------------------
sub intersect {
  my ($a, $b) = @_;
  @itset = ();
  %itset = ();
  foreach(keys %$a) {
    if(exists $b->{$_} and !$itset{$_}) {
      push(@itset, $_);
      $itset{$_} = 1;
    }
  }
  return @itset;
}

print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n";
@itset = intersect(\%silence, \%nonsilence);
if(@itset == 0) {print "--> disjoint property is OK.\n";}
else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";}
print "\n";


sub check_lexicon {
  my ($lex, $num_prob_cols, $num_skipped_cols) = @_;
  print "Checking $lex\n";
  !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail();
  my %seen_line = {};
  $idx = 1; $success = 1; $crlf = 1;
  print "--> reading $lex\n";
  check_allowed_whitespace(\*L) or set_to_fail();
  while (<L>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $lex contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    if (defined $seen_line{$_}) {
      print "--> ERROR: line '$_' of $lex is repeated\n";
      set_to_fail();
    }
    $seen_line{$_} = 1;
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $lex does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    $word = shift @col;
    if (!defined $word) {
      print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail();
    }
    if ($word eq "<s>" || $word eq "</s>" || $word eq "<eps>" || $word eq "#0") {
      print "--> ERROR: lexicon.txt contains forbidden word $word\n";
      set_to_fail();
    }
    for ($n = 0; $n < $num_prob_cols; $n++) {
      $prob = shift @col;
      if (!($prob > 0.0 && $prob <= 1.0)) {
        print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n";
        set_to_fail();
      }
    }
    for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; }
    if (@col == 0) {
      print "--> ERROR: lexicon.txt contains word $word with empty ";
      print "pronunciation.\n";
      set_to_fail();
    }
    foreach (0 .. @col-1) {
      if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
        print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt ";
        print "(line $idx)\n";
        set_to_fail();
      }
    }
    $idx ++;
  }
  close(L);
  $success == 0 || print "--> $lex is OK\n";
  print "\n";
}

if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); }
if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); }
if (-f "$dict/lexiconp_silprob.txt") {
  # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also
  # exist.
  check_lexicon("$dict/lexiconp_silprob.txt", 2, 2);
  if (-f "$dict/silprob.txt") {
    !open(SP, "<$dict/silprob.txt") &&
      print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail();
      $crlf = 1;
    while (<SP>) {
      if ($crlf == 1 && m/\r/) {
        print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n";
        set_to_fail();
        $crlf = 0;
      }
      chomp; my @col = split;
      @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail();
      if ($col[0] eq "<s>" || $col[0] eq "overall") {
        if (!($col[1] > 0.0 && $col[1] <= 1.0)) {
          set_to_fail();
          print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n";
        }
      } elsif ($col[0] eq "</s>_s" || $col[0] eq "</s>_n") {
        if ($col[1] <= 0.0) {
          set_to_fail();
          print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n";
        }
      } else {
        print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n";
        set_to_fail();
      }
    }
    close(SP);
  } else {
    set_to_fail();
    print "--> ERROR: expecting $dict/silprob.txt to exist\n";
  }
}

if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) {
  print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n";
  set_to_fail();
}

sub check_lexicon_pair {
  my ($lex1, $num_prob_cols1, $num_skipped_cols1,
      $lex2, $num_prob_cols2, $num_skipped_cols2) = @_;
  # We have checked individual lexicons already.
  open(L1, "<$lex1"); open(L2, "<$lex2");
  print "Checking lexicon pair $lex1 and $lex2\n";
  my $line_num = 0;
  while(<L1>) {
    $line_num++;
    @A = split;
    $line_B = <L2>;
    if (!defined $line_B) {
      print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
      set_to_fail(); last;
    }
    @B = split(" ", $line_B);
    # Check if the word matches.
    if ($A[0] ne $B[0]) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); last;
    }
    shift @A; shift @B;
    for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; }
    for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; }
    # Check if the pronunciation matches
    if (join(" ", @A) ne join(" ", @B)) {
      print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n";
      set_to_fail(); last;
    }
  }
  $line_B = <L2>;
  if (defined $line_B && $exit == 0) {
    print "--> ERROR: $lex1 and $lex2 have different number of lines.\n";
    set_to_fail();
  }
  $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n";
}

# If more than one lexicon exist, we have to check if they correspond to each
# other. It could be that the user overwrote one and we need to regenerate the
# other, but we do not know which is which.
if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") {
  check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0);
}
if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") {
  check_lexicon_pair("$dict/lexiconp.txt", 1, 0,
                     "$dict/lexiconp_silprob.txt", 2, 2);
}

# Checking extra_questions.txt -------------------------------
%distinguished = (); # Keep track of all phone-pairs including nonsilence that
                     # are distinguished (split apart) by extra_questions.txt,
                     # as $distinguished{$p1,$p2} = 1.  This will be used to
                     # make sure that we don't have pairs of phones on the same
                     # line in nonsilence_phones.txt that can never be
                     # distinguished from each other by questions.  (If any two
                     # phones appear on the same line in nonsilence_phones.txt,
                     # they share a tree root, and since the automatic
                     # question-building treats all phones that appear on the
                     # same line of nonsilence_phones.txt as being in the same
                     # group, we can never distinguish them without resorting to
                     # questions in extra_questions.txt.
print "Checking $dict/extra_questions.txt ...\n";
if (-s "$dict/extra_questions.txt") {
  if (!open(EX, "<$dict/extra_questions.txt")) {
    set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n";
  }
  $idx = 1;
  $success = 1;
  $crlf = 1;
  print "--> reading $dict/extra_questions.txt\n";
  check_allowed_whitespace(\*EX) or set_to_fail();
  while(<EX>) {
    if ($crlf == 1 && m/\r/) {
      print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n";
      set_to_fail();
      $crlf = 0;
    }
    if (! s/\n$//) {
      print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n";
      set_to_fail();
    }
    my @col = split(" ", $_);
    if (@col == 0) {
      set_to_fail();  print "--> ERROR: empty line in $dict/extra_questions.txt\n";
    }
    foreach (0 .. @col-1) {
      if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) {
        set_to_fail();  print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence_phones.txt (line $idx, block ", $_+1, ")\n";
      }
      $idx ++;
    }
    %col_hash = ();
    foreach $p (@col) { $col_hash{$p} = 1; }
    foreach $p1 (@col) {
      # Update %distinguished hash.
      foreach $p2 (keys %nonsilence) {
        if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not
                                       # in this question (and in nonsilence
                                       # phones)... mark p1,p2 as being split apart
          $distinguished{$p1,$p2} = 1;
          $distinguished{$p2,$p1} = 1;
        }
      }
    }
  }
  close(EX);
  $success == 0 || print "--> $dict/extra_questions.txt is OK\n";
} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";}

if (-f "$dict/nonterminals.txt") {
  open(NT, "<$dict/nonterminals.txt") || die "opening $dict/nonterminals.txt";
  my %nonterminals = ();
  my $line_number = 1;
  while (<NT>) {
    chop;
    my @line = split(" ", $_);
    if (@line != 1 || ! m/^#nonterm:/ || defined $nonterminals{$line[0]}) {
      print "--> ERROR: bad (or duplicate) line $line_number: '$_' in $dict/nonterminals.txt\n"; exit 1;
    }
    $nonterminals{$line[0]} = 1;
    $line_number++;
  }
  print "--> $dict/nonterminals.txt is OK\n";
}


# check nonsilence_phones.txt again for phone-pairs that are never
# distnguishable.  (note: this situation is normal and expected for silence
# phones, so we don't check it.)
if(!open(NS, "<$dict/nonsilence_phones.txt")) {
  print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1;
}

$num_warn_nosplit = 0;
$num_warn_nosplit_limit = 10;
while(<NS>) {
  my @col = split(" ", $_);
  foreach $p1 (@col) {
    foreach $p2 (@col) {
      if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) {
        set_to_fail();
        if ($num_warn_nosplit <= $num_warn_nosplit_limit) {
          print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n";
        }
        if ($num_warn_nosplit == $num_warn_nosplit_limit) {
          print "... Not warning any more times about this issue.\n";
        }
        if ($num_warn_nosplit == 0) {
          print "    (note: we started checking for this only recently.  You can still build a system but\n";
          print "     phones $p1 and $p2 will be acoustically indistinguishable).\n";
        }
        $num_warn_nosplit++;
      }
    }
  }
}


if ($exit == 1) {
  print "--> ERROR validating dictionary directory $dict (see detailed error ";
  print "messages above)\n\n";
  exit 1;
} else {
  print "--> SUCCESS [validating dictionary directory $dict]\n\n";
}

exit 0;