perl script to legalize HTML files

Dan Connolly <connolly@pixel.convex.com>
Message-id: <9207160349.AA25229@pixel.convex.com>
To: www-talk@nxoc01.cern.ch
Subject: perl script to legalize HTML files
Date: Wed, 15 Jul 92 22:49:21 CDT
From: Dan Connolly <connolly@pixel.convex.com>
#!/usr/local/bin/perl
#
# USE
#   fix-html.pl <W3-file.html >W3-file.sgml
#
# SEE ALSO
#   the html.dtd.
#

# Main pass: write a DOCTYPE declaration, then copy the input to the
# output, stopping at every '<' to see whether the tag that follows is
# one of the two forms this script repairs:
#
#   <A attr=value ...>   handed to &fix_anchor, which prints a replacement
#   <NEXTID 27>          rewritten as <NEXTID N=27>
#
# Any other '<' is copied through unchanged.
print "<!DOCTYPE HTML SYSTEM>\n";

@html = <>;			# read whole file
$_ = join('', @html);

# Each pass removes the text up to and including the next '<' from the
# front of $_; $1 is the text that preceded the '<'.
while(s/^([^<]*)<//){
    &out($1);
    if(s/^A\s+//i){
	# Anchor with attributes: &fix_anchor consumes the rest of the
	# tag from $_ and prints the rewritten tag.
	&fix_anchor;
    }elsif(s/^NEXTID\s+(\d+)\s*>//i){
	# Matched case-insensitively, like the anchor test above, so
	# that <nextid 27> is repaired too.
	&out("<NEXTID N=$1>");
    }else{
	&out('<');
    }
}

# Whatever follows the last '<' handled above.
&out($_);

# out(TEXT) -- print TEXT with no explicit filehandle.  Only the first
# argument is written; any further arguments are ignored.
sub out{
    local($text) = @_;
    print $text;
}

# fix_anchor -- rewrite the attribute list of an anchor start tag.
#
# Takes no arguments.  Expects $_ to begin with the attributes of the
# tag, i.e. what is left once "<A" and the white space after it have been
# removed.  Reads attr=value pairs off the front of $_, discards the rest
# of the tag up to and including the first '>', and prints a replacement
# through &out:
#
#     <A NAME="..." TYPE="..." HREF="...">
#
# Only NAME, TYPE and HREF (matched case-insensitively) are carried over,
# always in that order and always double-quoted; any other attribute is
# dropped.  If an attribute is given more than once the last value wins.
# On return $_ holds the text that followed the tag.
sub fix_anchor{
    local($name, $href, $type);

    # A value is either a "double-quoted literal" or a bare run of
    # characters ending at white space or '>'.
    # NOTE(review): 'single-quoted' literals are not recognised here.
    while(s/^(\w+)\s*=\s*(\"([^\"]*)\"|([^\s>]+))\s*//){
	# $3 is the inside of a quoted literal, without its quotes, so the
	# quotes added on output are not doubled; $4 is a bare value.
	# defined() rather than || keeps a quoted "0" or "" from being
	# mistaken for "no quoted value".
	local($attr, $value) = ($1, defined($3) ? $3 : $4);
	$href = $value if $attr =~ /^href$/i;
	$name = $value if $attr =~ /^name$/i;
	$type = $value if $attr =~ /^type$/i;
    }

    # Drop anything that did not parse as attr=value, through the '>'.
    s/[^>]*>//;

    # defined() rather than ne '' so that an attribute given an empty
    # value (HREF="") is still written out.
    &out("<A");
    &out(" NAME=\"$name\"") if defined($name);
    &out(" TYPE=\"$type\"") if defined($type);
    &out(" HREF=\"$href\"") if defined($href);
    &out(">");
}