package HTML::TreeBuilder;
use strict;
use integer; # vroom vroom!
use Carp ();
use vars qw(@ISA $VERSION $DEBUG);
$VERSION = '3.23';
#---------------------------------------------------------------------------
# Make a 'DEBUG' constant...
BEGIN {
# We used to have things like
# print $indent, "lalala" if $Debug;
# But there were an awful lot of having to evaluate $Debug's value.
# If we make that depend on a constant, like so:
# sub DEBUG () { 1 } # or whatever value.
# ...
# print $indent, "lalala" if DEBUG;
# Which at compile-time (thru the miracle of constant folding) turns into:
# print $indent, "lalala";
# or, if DEBUG is a constant with a true value, then that print statement
# is simply optimized away, and doesn't appear in the target code at all.
# If you don't believe me, run:
# perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
# $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
# and see for yourself (substituting whatever value you want for $DEBUG
# there).
if(defined &DEBUG) {
# Already been defined! Do nothing.
} elsif($] < 5.00404) {
# Grudgingly accomodate ancient (pre-constant) versions.
eval 'sub DEBUG { $Debug } ';
} elsif(!$DEBUG) {
eval 'sub DEBUG () {0}'; # Make it a constant.
} elsif($DEBUG =~ m<^\d+$>s) {
eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
} else { # WTF?
warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
eval 'sub DEBUG () { $DEBUG }'; # I guess.
}
}
#---------------------------------------------------------------------------
use HTML::Entities ();
use HTML::Tagset 3.02 ();
use HTML::Element ();
use HTML::Parser ();
@ISA = qw(HTML::Element HTML::Parser);
# This looks schizoid, I know.
# It's not that we ARE an element AND a parser.
# We ARE an element, but one that knows how to handle signals
# (method calls) from Parser in order to elaborate its subtree.
# Legacy aliases:
*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
*HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
#==========================================================================
# Two little shortcut constructors:
sub new_from_file { # or from a FH
my $class = shift;
Carp::croak("new_from_file takes only one argument")
unless @_ == 1;
Carp::croak("new_from_file is a class method only")
if ref $class;
my $new = $class->new();
$new->parse_file($_[0]);
return $new;
}
sub new_from_content { # from any number of scalars
my $class = shift;
Carp::croak("new_from_content is a class method only")
if ref $class;
my $new = $class->new();
foreach my $whunk (@_) {
if(ref($whunk) eq 'SCALAR') {
$new->parse($$whunk);
} else {
$new->parse($whunk);
}
last if $new->{'_stunted'}; # might as well check that.
}
$new->eof();
return $new;
}
# TODO: document more fully?
sub parse_content { # from any number of scalars
my $tree = shift;
my $retval;
foreach my $whunk (@_) {
if(ref($whunk) eq 'SCALAR') {
$retval = $tree->parse($$whunk);
} else {
$retval = $tree->parse($whunk);
}
last if $tree->{'_stunted'}; # might as well check that.
}
$tree->eof();
return $retval;
}
#---------------------------------------------------------------------------
sub new { # constructor!
my $class = shift;
$class = ref($class) || $class;
my $self = HTML::Element->new('html'); # Initialize HTML::Element part
{
# A hack for certain strange versions of Parser:
my $other_self = HTML::Parser->new();
%$self = (%$self, %$other_self); # copy fields
# Yes, multiple inheritance is messy. Kids, don't try this at home.
bless $other_self, "HTML::TreeBuilder::_hideyhole";
# whack it out of the HTML::Parser class, to avoid the destructor
}
# The root of the tree is special, as it has these funny attributes,
# and gets reblessed into this class.
# Initialize parser settings
$self->{'_implicit_tags'} = 1;
$self->{'_implicit_body_p_tag'} = 0;
# If true, trying to insert text, or any of %isPhraseMarkup right
# under 'body' will implicate a 'p'. If false, will just go there.
$self->{'_tighten'} = 1;
# whether ignorable WS in this tree should be deleted
$self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
$self->{'_element_class'} = 'HTML::Element';
$self->{'_ignore_unknown'} = 1;
$self->{'_ignore_text'} = 0;
$self->{'_warn'} = 0;
$self->{'_no_space_compacting'}= 0;
$self->{'_store_comments'} = 0;
$self->{'_store_declarations'} = 1;
$self->{'_store_pis'} = 0;
$self->{'_p_strict'} = 0;
# Parse attributes passed in as arguments
if(@_) {
my %attr = @_;
for (keys %attr) {
$self->{"_$_"} = $attr{$_};
}
}
# rebless to our class
bless $self, $class;
$self->{'_element_count'} = 1;
# undocumented, informal, and maybe not exactly correct
$self->{'_head'} = $self->insert_element('head',1);
$self->{'_pos'} = undef; # pull it back up
$self->{'_body'} = $self->insert_element('body',1);
$self->{'_pos'} = undef; # pull it back up again
return $self;
}
#==========================================================================
sub _elem # universal accessor...
{
my($self, $elem, $val) = @_;
my $old = $self->{$elem};
$self->{$elem} = $val if defined $val;
return $old;
}
# accessors....
sub implicit_tags { shift->_elem('_implicit_tags', @_); }
sub implicit_body_p_tag { shift->_elem('_implicit_body_p_tag', @_); }
sub p_strict { shift->_elem('_p_strict', @_); }
sub no_space_compacting { shift->_elem('_no_space_compacting', @_); }
sub ignore_unknown { shift->_elem('_ignore_unknown', @_); }
sub ignore_text { shift->_elem('_ignore_text', @_); }
sub ignore_ignorable_whitespace { shift->_elem('_tighten', @_); }
sub store_comments { shift->_elem('_store_comments', @_); }
sub store_declarations { shift->_elem('_store_declarations', @_); }
sub store_pis { shift->_elem('_store_pis', @_); }
sub warn { shift->_elem('_warn', @_); }
#==========================================================================
sub warning {
my $self = shift;
CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
# should maybe say HTML::TreeBuilder instead
}
#==========================================================================
{
# To avoid having to rebuild these lists constantly...
my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
my $indent;
sub start {
return if $_[0]{'_stunted'};
# Accept a signal from HTML::Parser for start-tags.
my($self, $tag, $attr) = @_;
# Parser passes more, actually:
# $self->start($tag, $attr, $attrseq, $origtext)
# But we can merrily ignore $attrseq and $origtext.
if($tag eq 'x-html') {
print "Ignoring open-x-html tag.\n" if DEBUG;
# inserted by some lame code-generators.
return; # bypass tweaking.
}
$tag =~ s{/$}{}s; # So turns into . Silently forgive.
unless($tag =~ m/^[-_a-zA-Z0-9:%]+$/s) {
DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
return;
# This avoids having Element's new() throw an exception.
}
my $ptag = (
my $pos = $self->{'_pos'} || $self
)->{'_tag'};
my $already_inserted;
#my($indent);
if(DEBUG) {
# optimization -- don't figure out indenting unless we're in debug mode
my @lineage = $pos->lineage;
$indent = ' ' x (1 + @lineage);
print
$indent, "Proposing a new \U$tag\E under ",
join('/', map $_->{'_tag'}, reverse($pos, @lineage)) || 'Root',
".\n";
#} else {
# $indent = ' ';
}
#print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
# $attr = {%$attr};
foreach my $k (keys %$attr) {
# Make sure some stooge doesn't have "".
# That happens every few million Web pages.
$attr->{' ' . $k} = delete $attr->{$k}
if length $k and substr($k,0,1) eq '_';
# Looks bad, but is fine for round-tripping.
}
my $e =
($self->{'_element_class'} || 'HTML::Element')->new($tag, %$attr);
# Make a new element object.
# (Only rarely do we end up just throwing it away later in this call.)
# Some prep -- custom messiness for those damned tables, and strict P's.
if($self->{'_implicit_tags'}) { # wallawallawalla!
unless($HTML::TreeBuilder::isTableElement{$tag}) {
if ($ptag eq 'table') {
print $indent,
" * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
if DEBUG > 1;
$self->insert_element('tr', 1);
$pos = $self->insert_element('td', 1); # yes, needs updating
} elsif ($ptag eq 'tr') {
print $indent,
" * Phrasal \U$tag\E right under TR makes an implicit TD\n"
if DEBUG > 1;
$pos = $self->insert_element('td', 1); # yes, needs updating
}
$ptag = $pos->{'_tag'}; # yes, needs updating
}
# end of table-implication block.
# Now maybe do a little dance to enforce P-strictness.
# This seems like it should be integrated with the big
# "ALL HOPE..." block, further below, but that doesn't
# seem feasable.
if(
$self->{'_p_strict'}
and $HTML::TreeBuilder::isKnown{$tag}
and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag}
) {
my $here = $pos;
my $here_tag = $ptag;
while(1) {
if($here_tag eq 'p') {
print $indent,
" * Inserting $tag closes strict P.\n" if DEBUG > 1;
$self->end(\q{p});
# NB: same as \'q', but less confusing to emacs cperl-mode
last;
}
#print("Lasting from $here_tag\n"),
last if
$HTML::TreeBuilder::isKnown{$here_tag}
and not $HTML::Tagset::is_Possible_Strict_P_Content{$here_tag};
# Don't keep looking up the tree if we see something that can't
# be strict-P content.
$here_tag = ($here = $here->{'_parent'} || last)->{'_tag'};
}# end while
$ptag = ($pos = $self->{'_pos'} || $self)->{'_tag'}; # better update!
}
# end of strict-p block.
}
# And now, get busy...
#----------------------------------------------------------------------
if (!$self->{'_implicit_tags'}) { # bimskalabim
# do nothing
print $indent, " * _implicit_tags is off. doing nothing\n"
if DEBUG > 1;
#----------------------------------------------------------------------
} elsif ($HTML::TreeBuilder::isHeadOrBodyElement{$tag}) {
if ($pos->is_inside('body')) { # all is well
print $indent,
" * ambilocal element \U$tag\E is fine under BODY.\n"
if DEBUG > 1;
} elsif ($pos->is_inside('head')) {
print $indent,
" * ambilocal element \U$tag\E is fine under HEAD.\n"
if DEBUG > 1;
} else {
# In neither head nor body! mmmmm... put under head?
if ($ptag eq 'html') { # expected case
# TODO?? : would there ever be a case where _head would be
# absent from a tree that would ever be accessed at this
# point?
die "Where'd my head go?" unless ref $self->{'_head'};
if ($self->{'_head'}{'_implicit'}) {
print $indent,
" * ambilocal element \U$tag\E makes an implicit HEAD.\n"
if DEBUG > 1;
# or rather, points us at it.
$self->{'_pos'} = $self->{'_head'}; # to insert under...
} else {
$self->warning(
"Ambilocal element <$tag> not under HEAD or BODY!?");
# Put it under HEAD by default, I guess
$self->{'_pos'} = $self->{'_head'}; # to insert under...
}
} else {
# Neither under head nor body, nor right under html... pass thru?
$self->warning(
"Ambilocal element <$tag> neither under head nor body, nor right under html!?");
}
}
#----------------------------------------------------------------------
} elsif ($HTML::TreeBuilder::isBodyElement{$tag}) {
# Ensure that we are within
if($ptag eq 'body') {
# We're good.
} elsif($HTML::TreeBuilder::isBodyElement{$ptag} # glarg
and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
) {
# Special case: Save ourselves a call to is_inside further down.
# If our $ptag is an isBodyElement element (but not an
# isHeadOrBodyElement element), then we must be under body!
print $indent, " * Inferring that $ptag is under BODY.\n",
if DEBUG > 3;
# I think this and the test for 'body' trap everything
# bodyworthy, except the case where the parent element is
# under an unknown element that's a descendant of body.
} elsif ($pos->is_inside('head')) {
print $indent,
" * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
if DEBUG > 1;
$ptag = (
$pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating
|| die "Where'd my body go?"
)->{'_tag'}; # yes, needs updating
} elsif (! $pos->is_inside('body')) {
print $indent,
" * body-element \U$tag\E makes implicit BODY.\n"
if DEBUG > 1;
$ptag = (
$pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating
|| die "Where'd my body go?"
)->{'_tag'}; # yes, needs updating
}
# else we ARE under body, so okay.
# Handle implicit endings and insert based on and position
# ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
if ($tag eq 'p' or
$tag eq 'h1' or $tag eq 'h2' or $tag eq 'h3' or
$tag eq 'h4' or $tag eq 'h5' or $tag eq 'h6' or
$tag eq 'form'
# Hm, should