#! perl # # Mary Holstege (holstege@mathling.com) 2001/05/15 # From: http://www.mathling.com/xmlschema/dtd2xsd.pl # derived from # # Yuichi Koike ($Id: dtd2xsd.pl,v 1.2 2001/05/15 13:08:21 mholstege Exp $) # derived from # # Dan Connolly # derived from # # Bert Bos # Created: 17 Mar 1998 # use strict; # Handling command line argument my $targetNS = "http://www.w3.org/namespace/"; my $prefix = "t"; my $alias = 0; my $file = ""; my %SimpleTypes; my @AttrGroupPatterns; my @ModelGroupPatterns; my @SubstitutionGroupPatterns; my %SubstitutionGroup; my @ComplexTypePatterns; my %Mixed; my %ModelGroup; my $mapping_file; my $pcdata_flag = 0; my $pcdata_simpletype = "string"; my $debug = 0; while ($#ARGV >= 0) { my $para = shift(@ARGV); if ($para eq "-ns") { $targetNS = shift(@ARGV); } elsif ($para eq "-prefix") { $prefix = shift(@ARGV); } elsif ($para eq "-alias") { $alias = 1; } elsif ($para eq "-pcdata") { # Treat #PCDATA by itself as being string (or other simple type # if so designated in the mapping file) $pcdata_flag = 1; } elsif ($para eq "-mapfile") { $mapping_file = shift(@ARGV); } elsif ($para eq "-simpletype") { my($pat) = shift(@ARGV); my($b) = shift(@ARGV); $SimpleTypes{$pat} = $b; } elsif ($para eq "-attrgroup") { push(@AttrGroupPatterns, shift(@ARGV)); } elsif ($para eq "-modelgroup") { push(@ModelGroupPatterns, shift(@ARGV)); } elsif ($para eq "-substgroup") { push(@SubstitutionGroupPatterns, shift(@ARGV)); } elsif ($para eq "-complextype") { push(@ComplexTypePatterns, shift(@ARGV)); } elsif ($para eq "-debug") { $debug = 1; } else { $file = $para; } } open( INTERMEDIATE, ">intermediate.out"); # Alias dictionary: defaults my %alias_dic; $alias_dic{"URI"} = "uriReference"; $alias_dic{"LANG"} = "language"; $alias_dic{"NUMBER"} = "nonNegativeInteger"; $alias_dic{"Date"} = "date"; $alias_dic{"Boolean"} = "boolean"; if ( $mapping_file ) { print STDERR "Open mapping $mapping_file "; if ( !open( MAPPINGS, "<$mapping_file" ) ) { print STDERR "unsuccessful.\n"; } else { print STDERR "successful.\n"; while ( ) { chop; if ( /^alias\s+([^ \t]+)\s*=\s*([^ \t]+)\s*/i ) { $alias_dic{$1} = $2; } elsif ( /^simpletype\s+([^ \t]+)\s*=\s*([^ \t]+)\s*/i ) { $SimpleTypes{$1} = $2; } elsif ( /^attrgroup\s+([^ \t]+)\s*/i ) { push( @AttrGroupPatterns, $1 ); } elsif ( /^modelgroup\s+([^ \t]+)\s*/i ) { push( @ModelGroupPatterns, $1 ); } elsif ( /^substgroup\s+([^ \t]+)\s*/i ) { push( @SubstitutionGroupPatterns, $1 ); } elsif ( /^complextype\s+([^ \t]+)\s*/i ) { push( @ComplexTypePatterns, $1 ); } elsif ( /^pcdata\s+([^ \t]+)\s*/i ) { ## BUGLET: doesn't pay attention to prefix; just a special alias $pcdata_simpletype = $1; } } } foreach my $key (keys(%alias_dic)) { print STDERR "Alias \%$key to $alias_dic{$key}\n" } } # Variable declaration my $linelen = 72; my $PROG = substr($0, rindex($0, "/") + 1); my $USAGE = "Usage: $PROG file\n"; my $str = "(?:\"([^\"]*)\"|\'([^\']*)\')"; my %pent; # Parameter entities my %attributes; # Attribute lists my @element; # Elements in source order my %model; # Content models # Main $/ = undef; # Open file, remove comment and include external entity my $buf = openFile($file); open( RAW, ">raw.out"); print RAW $buf; # Alias treatment my $alias_ident = "_alias_"; if ($alias eq 1) { foreach my $key (keys(%alias_dic)) { my $aliaskey = sprintf("%s%s%s", $alias_ident, $key, $alias_ident); $buf =~ s/\%$key;/$aliaskey/gsie; } } my %imports; # store external parameter entities while ($buf =~ s///sie) { print STDERR "$1 is $4.$5\n"; $imports{$1} = $4.$5; } while ($buf =~ s///sie) { print STDERR "$1 is $2.$3\n"; $imports{$1} = $2.$3; } foreach my $key (keys(%imports)) { $pent{$key} = " "; } # store all parameter entities while ($buf =~ s///sie) { my($n, $repltext) = ($1, $2.$3); my ($pat); next if $pent{$n}; # only the first declaration of an entity counts foreach $pat (keys %SimpleTypes){ if ($n =~ /^$pat$/){ $buf .= " "; $pent{$n} = "#DATATYPEREF $n"; undef $n; last; } } foreach $pat (@AttrGroupPatterns){ if ($n =~ /^$pat$/){ $buf .= " "; $pent{$n} = "#ATTRGROUPREF $n"; undef $n; last; } } foreach $pat (@ModelGroupPatterns){ if ($n =~ /^$pat$/){ $buf .= " "; $pent{$n} = "#MODELGROUPREF $n"; undef $n; last; } } foreach $pat (@SubstitutionGroupPatterns){ if ($n =~ /^$pat$/){ $buf .= " "; $pent{$n} = "#SUBSTGROUPREF $n"; undef $n; last; } } foreach $pat (@ComplexTypePatterns) { if ($n =~ /^$pat$/){ $buf .= " "; $pent{$n} = "#COMPLEXTYPEREF $n"; undef $n; last; } } $pent{$n}=$repltext if $n; } open( MUNGED, ">munged.out"); print MUNGED $buf; # remove all general entities $buf =~ s///gsie; # loop until parameter entities fully expanded my $i; do { # count # of substitutions $i = 0; # expand parameter entities $buf =~ s/%([a-zA-Z0-9_\.-]+);?/$i++,$pent{$1}/gse; } while ($i != 0); # treat conditional sections # BUG: Doesn't handle nested conditional sections right while($buf =~ s//$1/sie) {} while($buf =~ s//$1/sie) {} open( EXPANDED, ">expanded.out"); print EXPANDED $buf; # store attribute lists $buf =~ s//store_att($1, $2)/gsie; # store content models $buf =~ s//store_elt($1, $2)/gsie; #print "\n"; print "\n"; # find maximum length of non-terminals #my $maxlen = max(map(length, @element)) + 4; # write imports $buf =~ s//write_import($1)/gsie; # write simple type declarations $buf =~ s//write_simpleType($1, $2, $3)/gsie; # write complex type declarations $buf =~ s//write_complexType($1, $2)/gsie; # write attribute groups $buf =~ s//write_attrGroup($1, $2)/gsie; # write model groups $buf =~ s//write_modelGroup($1, $2)/gsie; # write subsitution groups $buf =~ s//write_substitutionGroup($1, $2)/gsie; my $str2 = "(\"[^\"]*\"|\'[^\']*\')"; # write notation declarations $buf =~ s//write_notation($1, $2, $3)/gsie; $buf =~ s//write_notation($1, $2)/gsie; print INTERMEDIATE $buf; my($e); # loop over elements, writing XML schema foreach $e (@element) { my $h = $model{$e}; my $h2 = $attributes{$e}; my @model = @$h; my $isSimple = ($pcdata_flag eq 1) && ($model[1] eq '#PCDATA') && ( ($#model eq 2) || ( ($#model eq 3) && ($model[3] eq '*') ) ); my $substGroup = $SubstitutionGroup{$e}; if ( $substGroup ) { $substGroup = " substitutionGroup='$substGroup'"; } # print rule for element $e if ( $isSimple && ! $h2 ) { # Assume (#PCDATA) is string print "\n \n"; } else { print "\n \n"; } if ( $isSimple ) { # Assume (#PCDATA) is string if ( $h2 ) { print " \n"; print " \n"; print " \n"; } } else { # print rule for $e's content model print " \n"; } else { print ">\n"; } } elsif ( $model[0] eq 'ANY' ) { print ">\n"; print " \n"; print " \n"; print " \n"; } else { if ( $debug eq 1 ) { print STDERR "==mixed? @model\n"; #@@ } if (&isMixed(@model)) { print " mixed='true'>\n"; } else { print ">\n"; } my @list = &makeChildList('', @model); &printChildList(3, @list); } } # print rule for $e's attributes if (! $h2) { # nothing } else { &printAttrDecls(@$h2); if ( $isSimple ) { print " \n"; print " \n"; } } if ( !$h2 && $isSimple ) { # Do nothing } elsif ($h2 || $model[0] ne 'EMPTY') { print " \n"; } print " \n"; } print "\n"; exit; sub printSpace { my ($num) = $_[0]; for (my $i=0; $i<$num; $i++) { print " "; } } sub printChildList { my ($num, @list) = @_; my @currentTag = (); for (my $i=0; $i<= $#list; $i++) { my $n = $list[$i]; if ($n eq 0 || $n eq 1 || $n eq 2 || $n eq 3) { if ( ($pcdata_flag eq 0) && ($n eq 0 || $n eq 1) && $list[$i+1] eq 20) { # The whole list is 0 20 or 1 20; i.e. (#PCDATA) or (#PCDATA)*. # Don't generate a sequence child; mixed handles all this. } else { if ( $currentTag[$#currentTag] eq "" && $n eq 0 ) { push(@currentTag, ""); } printSpace($num); $num++; print "\n"; push(@currentTag, "sequence"); } } elsif ($n eq 10 || $n eq 11 || $n eq 12 || $n eq 13) { printSpace($num); $num++; print "\n"; push(@currentTag, "choice"); } elsif ($n eq 20) { my $tag = pop(@currentTag); if ($tag ne "") { $num--; printSpace($num); print "\n"; } } else { printSpace($num); if ($n eq '#MODELGROUPREF') { print "\n"; } } } sub makeChildList { my ($groupName, @model) = @_; print INTERMEDIATE "GROUPNAME=", $groupName, "; MODEL=", @model, "\n"; my @ret = (); my @brace = (); for (my $i=0; $i<=$#model; $i++) { my $n = $model[$i]; if ($n eq "(") { push(@ret, 0); push(@brace, $#ret); } elsif ($n eq ")") { if ($model[$i+1] eq "*") { $ret[$brace[$#brace]] += 1; $i++; } elsif ($model[$i+1] eq "+") { $ret[$brace[$#brace]] += 2; $i++; } elsif ($model[$i+1] eq "?") { $ret[$brace[$#brace]] += 3; $i++; } pop(@brace); push(@ret, 20); } elsif ($n eq ",") { $ret[$brace[$#brace]] = 0; } elsif ($n eq "|") { $ret[$brace[$#brace]] = 10; } elsif ($n eq "#PCDATA") { if ($model[$i+1] eq "|") { $i++; } if($groupName){ $Mixed{$groupName} = 1; } } else { push(@ret, $n); } } # "( ( a | b | c )* )" gets mapped to "0 10 a b c 20 20" which will generate # a spurious sequence element. This is not too harmful when this is an # element content model, but with model groups it is incorrect. # In general we need to strip off 0 20 from the ends when it is redundant. # Redundant means: there is some other group that bounds the whole list. # Note that it gets a little tricky: # ( (a|b),(c|d) ) gets mapped to "0 10 a b 20 10 c d 20 20". If one # naively chops off the 0 and 20 on the groups that there is a 10 on one # end and a 20 on the other, one loses the bounding sequence, which is # required in this case. # if ( $ret[0] eq 0 && $ret[$#ret] eq 20 && $ret[$#ret-1] eq 20 && ( $ret[1] eq 0 || $ret[1] eq 1 || $ret[1] eq 2 || $ret[1] eq 3 || $ret[1] eq 10 || $ret[1] eq 11 || $ret[1] eq 12 || $ret[1] eq 13 ) ) { # OK, it is possible that the 0 20 is redundant. Now scan for balance: # All interim 20 between the proposed new start and the proposed new # final one should be at level 1 or above. my $depth = 0; my $redundant_paren = 1; # Assume redundant until proved otherwise for ( my $i = 1; $i <= $#ret-1; $i++ ) { if ( $ret[$i] eq 20 ) { $depth--; if ( $i < $#ret-1 && $depth < 1 ) { $redundant_paren = 0; # print STDERR "i=$i,depth=$depth\n"; } } elsif ( $ret[$i] eq 0 || $ret[$i] eq 1 || $ret[$i] eq 2 || $ret[$i] eq 3 || $ret[$i] eq 10 || $ret[$i] eq 11 || $ret[$i] eq 12 || $ret[$i] eq 13 ) { $depth++; } } # for if ( $redundant_paren eq 1 ) { print STDERR "Truncating @ret\n"; @ret = @ret[1..$#ret-1]; } } if ( $debug eq 1 ) { print STDERR "@model to @ret\n"; } return @ret; } sub printAttrDecls{ my @atts = @_; for (my $i = 0; $i <= $#atts; $i++) { if ($atts[$i] eq '#ATTRGROUPREF'){ print " \n"; $i ++; } else { # attribute name print " \n"; } else { # minOccurs if ($atts[$i] eq "#REQUIRED") { print " use='required'"; } elsif ($atts[$i] eq "#IMPLIED") { print " use='optional'"; } else { print " use='default' value='$atts[$i]'"; } # enumerate if ($#enume eq -1) { print "/>\n"; } else { print ">\n"; print " \n"; print " \n"; &write_enum(@enume); print " \n"; print " \n"; print " \n"; } } } } } sub write_enum{ my(@enume) = @_; for (my $j = 0; $j <= $#enume; $j++) { print " \n"; } } # Parse a string into an array of "words". # Words are whitespace-separated sequences of non-whitespace characters, # or quoted strings ("" or ''), with the quotes removed. # HACK: added () stuff for attlist stuff # Parse words for attribute list sub parsewords { my $line = $_[0]; $line =~ s/(\(|\)|\|)/ $1 /g; my $token; my @words = (); while ($line ne '') { if ($line =~ /^\s+/) { # Skip whitespace } elsif ($line =~ /^\"((?:[^\"]|\\\")*)\"/) { $token = $1; $token =~ s/^://gso; $token =~ s/$prefix://gso; push(@words, $token); } elsif ($line =~ /^\'((?:[^\']|\\\')*)\'/) { $token = $1; $token =~ s/^://gso; $token =~ s/$prefix://gso; push(@words, $token); } elsif ($line =~ /^\S+/) { $token = $&; $token =~ s/^://gso; $token =~ s/$prefix://gso; push(@words, $token); } else { die "Cannot happen\n"; } $line = $'; } return @words; } # Store content model, return empty string sub store_elt { my ($name, $model) = @_; $model =~ s/\s+/ /gso; $name =~ s/$prefix://gso; $name =~ s/^[^:]+://gso; ###XYZZY latest print INTERMEDIATE "NAME=", $name, "\n"; print INTERMEDIATE "MODEL=", $model, "\n"; push(@element, $name); my @words; while ($model =~ s/^\s*(\(|\)|,|\+|\?|\||[\w:_\.-]+|\#\w+|\*)//) { push(@words, $1); print INTERMEDIATE "WORD=", $1, "\n"; }; $model{$name} = [ @words ]; return ''; } # Store attribute list, return empty string sub store_att { my ($element, $atts) = @_; my @words = parsewords($atts); $element =~ s/://gso; $element =~ s/$prefix://gso; $attributes{$element} = [ @words ]; return ''; } sub write_import { my($file) = @_; $file =~ s/dtd$/xsd/; print "\n \n"; } # write_import sub write_simpleType{ my($n, $b, $stuff) = @_; my @words = parsewords($stuff); print "\n \n"; print " \n"; # print STDERR "\n==stuff:\n$stuff \n\n===\n", join('|', @words); my $i = 0; my @enume; if ($words[$i] eq "(") { $i++; while ($words[$i] ne ")") { if ($words[$i] ne "|") { push(@enume, $words[$i]); } $i++; } write_enum(@enume); } print " \n"; print " \n"; } sub write_complexType { my($n, $stuff) = @_; my @words = parsewords($stuff); my($n, $stuff) = @_; my @words = parsewords($stuff); print "\n \n"; print "\n"; my @list = &makeChildList($n, '(', @words, ')'); &printChildList(3, @list); print " \n"; } # write_complexType sub write_attrGroup{ my($n, $stuff) = @_; my @words = parsewords($stuff); print "\n \n"; # print STDERR "\n==stuff:\n$stuff \n\n===\n", join('|', @words); printAttrDecls(@words); print " \n"; } sub write_modelGroup{ my($n, $stuff) = @_; my @words = parsewords($stuff); print "\n \n"; print "\n"; my @list = &makeChildList($n, '(', @words, ')'); &printChildList(3, @list); $ModelGroup{$n} = \@list; print " \n"; } sub write_substitutionGroup { my($n, $stuff) = @_; my @words = parsewords($stuff); print "\n \n"; my @list = &makeChildList($n, '(', @words, ')'); for ( my $i = 0; $i < $#list; $i++ ) { $SubstitutionGroup{ $list[$i] } = $n; } print " \n"; } sub write_notation { my($n, $p, $s) = @_; # No quotes around $p and $s, we already pulled them in print "\n \n"; } sub isMixed{ my(@model) = @_; my $isSimple = ($pcdata_flag eq 1) && ($model[1] eq '#PCDATA') && ( ($#model eq 2) || ( ($#model eq 3) && ($model[3] eq '*') ) ); if ( $debug eq 1 ) { print STDERR "++ mixed? @model\n"; #@@ } if ( $isSimple ) { if ( $debug eq 1 ) { print STDERR "++ no; simple type. @model\n"; #@@ } return 0; } my($i); for ($i = 0; $i <= $#model; $i++) { if ( $model[$i] eq '#PCDATA' || ($model[$i] eq '#MODELGROUPREF' && $Mixed{$model[$i+1]}) || ($model[$i] eq '#SUBSTGROUPREF' && $Mixed{$model[$i+1]}) ) { if ( $debug eq 1 ) { print STDERR "++ yes! $i @model\n"; #@@ } return 1; } } if ( $debug eq 1 ) { print STDERR "++ no. @model\n"; #@@ } return 0; } # Return maximum value of an array of numbers sub max { my $max = $_[0]; foreach my $i (@_) { if ($i > $max) {$max = $i;} } return $max; } # 1) Open file # 2) Remove comment, processing instructions, and general entities # 3) Include external parameter entities recursively # 4) Return the contents of opened file sub openFile { my $file = $_[0]; my %extent; my $bufbuf; if ($file ne "") { print STDERR "open $file "; if(! open AAA, $file) { print STDERR " failed!!\n"; return ""; } print STDERR " successful\n"; $bufbuf = ; } else { print STDERR "open STDIN successful\n"; $bufbuf = <>; } # Strip newlines $bufbuf =~ s/\n//gso; # remove comments $bufbuf =~ s///gso; # remove processing instructions $bufbuf =~ s/<\?.*?>//gso; # store external parameter entities while ($bufbuf =~ s///sie) { print STDERR "$1 is $4.$5\n"; $extent{$1} = $4.$5; } while ($bufbuf =~ s///sie) { print STDERR "$1 is $2.$3\n"; $extent{$1} = $2.$3; } # read external entity files foreach my $key (keys(%extent)) { $bufbuf =~ s/%$key;/openFile($extent{$key})/gsie; } return $bufbuf; } # # Changes: 20001/05/15 mh # Changed to namespace of rec. # Changes: 2001/01/10 mh # Switch to CR syntax # Support external mapping file for type aliases, simple types, model and # attribute groups # Map ANY correctly to wildcard rather than element 'ANY' # Support treating lead PCDATA as string or other aliased simple type instead # of as mixed content (may be more appropriate for data-oriented DTDs) # e.g. => # Support subsitution groups. # # 2001/01/12 mh # Support NOTATION declarations # Fix handling of nested conditional sections # Attempt to compensate for DTDs that are already constructed for namespaces # BUG: Easy to get this wrong ## TODO: Change import of external DTDs into schema import ## remove xmlns 'attributes'