#!/usr/bin/env perl
# $XTermId: annotate-enc,v 1.12 2021/02/18 10:41:35 tom Exp $
# -----------------------------------------------------------------------------
# Copyright 2021 by Thomas E. Dickey
#
# All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appear in all copies and that
# both that copyright notice and this permission notice appear in
# supporting documentation, and that the name of the above listed
# copyright holder(s) not be used in advertising or publicity pertaining
# to distribution of the software without specific, written prior
# permission.
#
# THE ABOVE LISTED COPYRIGHT HOLDER(S) DISCLAIM ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS, IN NO EVENT SHALL THE ABOVE LISTED COPYRIGHT HOLDER(S) BE
# LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# -----------------------------------------------------------------------------
# Annotate ".enc" files using the descriptions from the Unicode data file.
# Also, check that the mapping is unique.

use strict;
use warnings;

use Getopt::Std;

$| = 1;

our ( $opt_d, $opt_l, $opt_v );
our $UnicodeData = "UnicodeData.txt";
our %annotations;

sub read_file($) {
    my $path = shift;
    open( FP, $path ) || die "cannot open $path";
    my (@input) = <FP>;
    close(FP);
    chomp @input;
    return @input;
}

sub codepoint($) {
    my $code = shift;
    $code =~ s/^0x//i;
    $code = uc $code;
    $code = "0" . $code while ( length($code) < 6 );
    $code = "0x" . $code;
    return $code;
}

sub parse_unicode() {
    my @data = &read_file($UnicodeData);
    for my $n ( 0 .. $#data ) {
        next unless ( $data[$n] =~ /^[[:xdigit:]]+;.*;/ );
        my $code = $data[$n];
        $code =~ s/;.*//;
        $code = &codepoint($code);
        my $text = $data[$n];
        $text =~ s/^[^;]*;//;
        $text =~ s/;.*//;
        $text = lc $text if ($opt_l);
        $annotations{$code} = $text;
        printf "'%s' ->%s\n", $code, $text if ($opt_d);
    }
}

sub private_use($) {
    my $value  = shift;
    my $result = 0;
    my $lo     = hex '0xE000';
    my $hi     = hex '0xF8FF';
    $value = hex $value;
    $result = 1 if ( $value >= $lo and $value <= $hi );
    return $result;
}

sub expected($$$$) {
    my $path = shift;
    my $line = shift;
    my $text = shift;
    my $want = shift;
    printf STDERR "expected %s\n", $want;
    printf STDERR "%s:%d:%s\n", $path, 1 + $line, $text;
}

sub parse_encoding($) {
    my $path   = shift;
    my @data   = &read_file($path);
    my $state  = 0;
    my $ignore = 0;
    my %mapped;
    for my $n ( 0 .. $#data ) {
        my $data = $data[$n];
        $data =~ s/^\s+//;
        $data =~ s/\s+$//;
        if ( $data =~ /^#/ ) {
            printf "[%d:%d]", $state, $n + 1 if ($opt_v);
            printf "%s\n", $data;
            next;
        }
        if ( $state == 0 ) {
            if ( $data =~ /^STARTENCODING\s+.*$/ ) {
                $state++;
            }
            else {
                &expected( $path, $n, $data, "STARTENCODING" );
            }
        }
        elsif ( $state == 1 ) {
            if ( $data =~ /^SIZE\s+(0x)?[[:xdigit:]]+$/ ) {

                # ignore
            }
            elsif ( $data =~ /^ALIAS\s+[[:alnum:].-]+$/ ) {

                # ignore
            }
            elsif ( $data =~ /^STARTMAPPING\s+unicode$/ ) {
                $state++;
                $ignore = 0;
            }
            elsif ( $data =~ /^STARTMAPPING\s+[[:alnum:].-]+$/ ) {
                $state++;
                $ignore = 1;
            }
            else {
                &expected( $path, $n, $data, "STARTMAPPING" );
            }
        }
        elsif ( $state == 2 ) {
            if ( $data =~ /^ENDMAPPING\b/ ) {
                $state++;
            }
            elsif ($ignore) {
            }
            elsif ( $data =~ /^UNDEFINE(\s+0x[[:xdigit:]]){1,2}/ ) {

                # ignore
            }
            elsif ( $data =~ /^0x[[:xdigit:]]+\s+0x[[:xdigit:]]+.*/ ) {
                my $source = $data;
                $source =~ s/\s.*//;
                $source = hex $source;
                my $target = $data;
                $target =~ s/^[^\s]+\s+//;
                $target =~ s/[\s#].*//;
                $target = &codepoint($target);
                if ( $target !~ /^0x[[:xdigit:]]+$/ ) {
                    &expected( $path, $n, $data, "hex target, not $target" );
                    next;
                }
                my $expect = "";
                if ( &private_use($target) ) {

                    # ignore
                }
                elsif ( $annotations{$target} ) {
                    $expect = $annotations{$target};
                }
                else {
                    &expected( $path, $n, $data, "Unicode for \"$target\"" );
                    next;
                }
                my $actual = "";
                if ( $data =~ /#/ ) {
                    $actual = $data;
                    $actual =~ s/^[^#]*#\s*//;
                    $actual = "" if ( $actual eq "XXX" );
                }
                if ( $actual ne "" and index( $actual, $expect ) != 0 ) {
                    &expected( $path, $n, $data, "Description: $expect" );
                    next;
                }
                $target = hex $target;
                next if ( $target == 0 );
                if ( $mapped{$target} ) {
                    &expected( $path, $n, $data,
                        "distinct mapping of $target" );
                }
                $mapped{$target} = $source;
                $data =~ s/\s*#.*//;
                $data .= "\t# $expect" if ( $expect ne "" );
            }
            elsif ( $data =~ /^0x[[:xdigit:]]+(\s+0\b)?\s*(#.*)?$/ ) {

                #ignore
            }
            else {
                &expected( $path, $n, $data, "mapping" );
            }
        }
        elsif ( $state == 4 ) {
            if ( $data =~ /^ENDENCODING\b/ ) {
                $state++;
            }
        }
        printf "[%d:%d]", $state, $n + 1 if ($opt_v);
        printf "%s\n", $data;
    }
}

sub main::HELP_MESSAGE() {
    printf STDERR <<EOF
Usage: $0 [options]

Options:
    -d      dump $UnicodeData codepoints and descriptions
    -l      use lowercase for descriptions from $UnicodeData
    -v      print state information on each line
EOF
      ;
    exit 1;
}
$Getopt::Std::STANDARD_HELP_VERSION = 1;
&getopts('dlv') || main::HELP_MESSAGE;

&parse_unicode;

while ( $#ARGV >= 0 ) {
    &parse_encoding( shift @ARGV );
}

1;
