use strict;
use warnings;

my @els5;
my %attrs5;

{
	open my $source, 'source' or die $!;

	my $el = '_global';
	while (<$source>)
	{
		if (/<h4.*>The (.*) element<\/h4>/)
		{
			my $c = $1;
			$c =~ />([^<>]+)</ or die $c;
			$el = $1;
			push @els5, $el;
			$attrs5{$el} ||= [ @{$attrs5{_global}} ];
		}
		elsif (/<dd><code title="(attr-.*)">(.*?)<\/code>.*<\/dd>/)
		{
			my ($id, $attr) = ($1, $2);
			push @{$attrs5{$el}}, $attr;
		}
		elsif (/<li><code title="handler-(\w+)">(\w+)<\/code><\/li>/)
		{
			die $_ unless $1 eq $2;
			push @{$attrs5{_global}}, $1;
		}
		last if /Acknowledgements/;
	}

	for ('h1'..'h6')
	{
		push @els5, $_;
		$attrs5{$_} = $attrs5{_global};
	}
}

sub split_things {
	my $x = $_[0];
	$x =~ s/$_->[0]/$_->[1]/ for @{$_[1]};
	my @r;
	for (split /\|/, $x) {
		s/[()]//;
		push @r, $_;
	}
	@r;
}

sub dtd {
	my @els4;
	my %attrs4;

	open my $dtd, $_[0] or die $!;
	my @els;
	my %ents;
	my $ent;
	while (<$dtd>)
	{
		if (/<!ENTITY % (coreattrs|i18n|events|reserved|cellhalign|cellvalign|bodycolors|align)/)
		{
			next if $1 eq 'reserved';
			$ent = $1;
		}
		elsif ($ent and /^ +"?>/)
		{
			undef $ent;
		}
		elsif (@els and /^ +>/)
		{
			@els = ();
		}
		elsif ($ent and /^ *[ "](\w+)/)
		{
			push @{$ents{$ent}}, $1;
		}
		elsif (/^<!ELEMENT (\S+)/)
		{
			push @els4, lc $_ for split_things($1, $_[1]);
			$attrs4{$_} ||= [] for @els4;
		}
		elsif (/^<!ATTLIST (\S+)/)
		{
			@els = map lc $_, split_things($1, $_[1]);
		}
		elsif (@els and /^ +%attrs/)
		{
			for my $el (@els) {			
				push @{$attrs4{$el}}, map @$_, @ents{qw(coreattrs events i18n)};
			}
		}
		elsif (@els and /^ +%(version|align)/)
		{
			for my $el (@els) {			
				push @{$attrs4{$el}}, $1;
			}
		}
		elsif (@els and /^ +%(\w+)/)
		{
			do { warn $_; next } unless $ents{$1};
			for my $el (@els) {			
				push @{$attrs4{$el}}, @{$ents{$1}};
			}
		}
		elsif (@els and /^  ?([\w-]+)/)
		{
			for my $el (@els) {
				push @{$attrs4{$el}}, $1;
			}
		}
	}

	return (\@els4, \%attrs4);
}

my @els4t;
my %attrs4t;
my @els4s;
my %attrs4s;

{ my ($e, $a) = dtd('loose.dtd', [
	[ '%fontstyle;', 'TT|I|B|BIG|SMALL' ],
	[ '%phrase;', 'EM|STRONG|DFN|CODE|SAMP|KBD|VAR|CITE|ABBR|ACRONYM' ],
	[ '%heading;', 'H1|H2|H3|H4|H5|H6' ],
]); @els4t = @$e; %attrs4t = %$a }
{ my ($e, $a) = dtd('strict.dtd', [
	[ '%fontstyle;', 'TT|I|B|U|S|STRIKE|BIG|SMALL' ],
	[ '%phrase;', 'EM|STRONG|DFN|CODE|SAMP|KBD|VAR|CITE|ABBR|ACRONYM' ],
	[ '%heading;', 'H1|H2|H3|H4|H5|H6' ],
]); @els4s = @$e; %attrs4s = %$a }

sub uniq { my %u; @u{@_} = (); sort keys %u }

sub a_and_b
{
	my ($a, $b) = @_;
	my @r;
	for my $x (@$a) { push @r, $x if grep $_ eq $x, @$b }
	return @r;
}

sub a_and_not_b
{
	my ($a, $b) = @_;
	my @r;
	for my $x (@$a) { push @r, $x unless grep $_ eq $x, @$b }
	return @r;
}

print "Elements only in HTML4 Transitional:\n";
for my $e (sort(a_and_not_b(\@els4t, [@els5, @els4s])))
{
	print "  $e\n";
}

print "Elements only in HTML4:\n";
for my $e (uniq sort(a_and_not_b([@els4s, @els4t], \@els5)))
{
	print "  $e\n";
}

print "Elements only in HTML5:\n";
for my $e (sort(a_and_not_b(\@els5, [@els4s, @els4t])))
{
	print "  $e\n";
}

print "Elements only in HTML5 and HTML4 Transitional:\n";
for my $e (sort(a_and_not_b([ a_and_b(\@els5, \@els4t) ], \@els4s)))
{
	print "  $e\n";
}

my $is_global = qr/@{[ join '|', @{$attrs5{_global}} ]}/;

print "\nIgnoring removal of attribute 'style', and addition of HTML5 global attributes:\n\n";

print "Elements not new in HTML5:\n";
for my $e (sort(a_and_b(\@els5, [ @els4s, @els4t ])))
{
	my @a;
	print "  $e:\n";

	@a = sort(a_and_not_b($attrs4t{$e}, [ @{$attrs5{$e}||[]}, @{$attrs4s{$e}||[]} ]));
	@a = grep !/^(style)$/, @a;
	if (@a) {
		print "    Attributes only in HTML4 Transitional (deprecated by HTML4, dropped by HTML5):\n";
		print "      $_\n" for @a;
	}

	@a = uniq sort(a_and_not_b($attrs4s{$e}, $attrs5{$e}));
	@a = grep !/^(style)$/, @a;
	if (@a) {
		print "    Attributes only in HTML4 (dropped by HTML5):\n";
		print "      $_\n" for @a;
	}

	@a = sort(a_and_not_b($attrs5{$e}, [ @{$attrs4s{$e}||[]}, @{$attrs4t{$e}||[]} ] ));
	@a = grep !/^($is_global)$/, @a;
	if (@a) {
		print "    Attributes only in HTML5 (added by HTML5):\n";
		print "      $_\n" for @a;
	}

	@a = sort(a_and_not_b([ a_and_b($attrs5{$e}, $attrs4t{$e}) ], $attrs4s{$e} ));
	@a = grep !/^($is_global)$/, @a;
	if (@a) {
		print "    Attributes only in HTML5 and HTML4 Transitional (deprecated by HTML4, restored by HTML5):\n";
		print "      $_\n" for @a;
	}
}


