Yet Another CPAN Grep

HTML-HTML5-Outline/lib/HTML/HTML5/Outline.pm

package HTML::HTML5::Outline;

use 5.008;
use strict;

use Carp qw[];
use HTML::HTML5::Outline::Outlinee;
use HTML::HTML5::Outline::Section;
use HTML::HTML5::Parser;
use Scalar::Util qw[blessed];
use XML::LibXML;

our $VERSION = '0.006';

my $HAS_RDF = undef;

sub import
{
	my ($class, %import) = @_;
	if (exists $import{rdf} and !$import{rdf})
	{
		$HAS_RDF = 0;
	}
	else
	{
		local $@;
		$HAS_RDF = eval 'require HTML::HTML5::Outline::RDF; 1' || 0;
		Carp::croak("RDF support not available: $@\n")
			if (exists $import{rdf} and $import{rdf} and !$HAS_RDF);
	}
}

sub has_rdf
{
	return $HAS_RDF;
}

sub new
{
	my ($class, $dom, %options) = @_;

	$options{parser} = 'html' unless defined $options{parser};

	unless (blessed($dom) and $dom->isa('XML::LibXML::Document'))
	{
		my $parser = (lc $options{parser} eq 'xml' or lc $options{parser} eq 'xhtml')
			? XML::LibXML->new
			: HTML::HTML5::Parser->new;
		$dom = $parser->parse_string($dom);
	}

	my $self = bless {
		count            => 0,
		current_outlinee => undef,
		current_section  => undef,
		stack            => [],
		outlines         => {},
		page             => \%options,
		options          => \%options,
		dom              => $dom,
		element_subjects => $options{element_subjects},
	}, $class;
	
	my @roots = $dom->getElementsByTagName('html');
	my $root  = $roots[0];
	
	return unless $root;
	
	my $node = $root;
	my $r;
	START: while ($node)
	{
		$r = $self->tag('start', $node);
		if ($node->firstChild)
		{
			$node = $node->firstChild;
			next START;
		}
		while ($node)
		{
			$r = $self->tag('end', $node);
			last START if ($r<0);
			if ($node->nextSibling)
			{
				$node = $node->nextSibling;
				next START;
			}
			if ($node == $root)
				{ $node = undef; }
			else
				{ $node = $node->parentNode; }
		}
	}

	$self->{primary_outlinee} = k($self->{current_outlinee});
	$self->{primary_outline}  = $self->{outlines}->{$self->{primary_outlinee}};
	
	return $self;
}

sub primary_outlinee { return $_[0]->{primary_outline}; }

sub to_hashref
{
	my ($self) = @_;

	unless (defined $self->{hashref})
	{
		$self->{hashref} = $self->primary_outlinee->to_hashref;
	}
	
	return $self->{hashref};
}

sub _mk_outlinee
{
	my ($self, %options) = @_;
	$options{outliner} = $self;
	return HTML::HTML5::Outline::Outlinee->new(%options);
}

sub _mk_section
{
	my ($self, %options) = @_;
	$options{outliner} = $self;
	return HTML::HTML5::Outline::Section->new(%options);
}

sub tag
{
	my $self = shift;
	my $type = shift;  # 'start' or 'end'
	my $node = shift;
	
	my $top_of_stack;
	if ($self->{stack})
			{ $top_of_stack = $self->{stack}->[-1]; }
	
	# If the top of the stack is an element, and you are exiting that element
	if ($type eq 'end' && $top_of_stack && (k($top_of_stack) eq k($node)))
	{
		# Note: The element being exited is a heading content element.
		warn("This element should be a heading content element!\n")
			unless ( $self->is_heading_content($node) );
		
		# Pop that element from the stack.
		pop @{ $self->{stack} };
	}
	
	# If the top of the stack is a heading content element
	elsif ( $top_of_stack && $self->is_heading_content($top_of_stack) )
	{
		# Do nothing.
	}
	
	# When entering a sectioning content element or a sectioning root element
	elsif ($type eq 'start' && ($self->is_sectioning_content($node)||$self->is_sectioning_root($node)))
	{
		# XXX: If current outlinee is not null, and the current section has no heading, create an implied heading and let that be the heading for the current section.
		
		# If current outlinee is not null, push current outlinee onto the stack.
		push @{ $self->{stack} }, $self->{current_outlinee}
			if (defined $self->{current_outlinee});
		
		# Let current outlinee be the element that is being entered.
		$self->{current_outlinee} = $node;
		
		# Let current section be a newly created section for the current outlinee
		#     element.
		$self->{current_section} = $self->_mk_section(
			document_order => $self->{count}++,
			);
		
		# Let there be a new outline for the new current outlinee, initialized
		#     with just the new current section as the only section in the
		#     outline.
		$self->{outlines}->{ k($self->{current_outlinee}) } = $self->_mk_outlinee(
			document_order => $self->{count}++,
			sections => [$self->{current_section}],
			element  => $self->{current_outlinee},
			tagname  => $self->{current_outlinee}->tagName,
			);
	}
	
	# When exiting a sectioning content element, if the stack is not empty
	elsif ($type eq 'end' && $top_of_stack && $self->is_sectioning_content($node))
	{
		# Pop the top element from the stack, and let the current outlinee be
		#     that element.
		$self->{current_outlinee} = pop @{ $self->{stack} };
		
		# Let current section be the last section in the outline of the current
		#     outlinee element.
		my $ootco = $self->{outlines}->{ k($self->{current_outlinee}) };
		$self->{current_section} = $ootco->{sections}->[-1]
			if defined $ootco->{sections}->[-1];
			
		# Append the outline of the sectioning content element being exited to
		#     the current section. (This does not change which section is the
		#     last section in the outline.)
		push @{ $self->{current_section}->{outlines} },
			$self->{outlines}->{ k($node) };
	}
	
	# When exiting a sectioning root element, if the stack is not empty
	elsif ($type eq 'end' && $top_of_stack && $self->is_sectioning_root($node))
	{
		# Pop the top element from the stack, and let the current outlinee be
		#     that element.
		$self->{current_outlinee} = pop @{ $self->{stack} };
		
		# Let current section be the last section in the outline of the current
		#     outlinee element.
		$self->{current_section} =
			$self->{outlines}->{ k($self->{current_outlinee}) }->{sections}->[-1];
		
		# Finding the deepest child: If current section has no child sections,
		#    stop these steps.
		FINDING_DEEPEST_CHILD: while ($self->{current_section}->{sections}->[0])
		{
			# Let current section be the last child section of the current 'current
			#     section'.
			$self->{current_section} = $self->{current_section}->{sections}->[-1];
		}
	}

	# When exiting a sectioning content element or a sectioning root element	
	elsif ($type eq 'end' && ($self->is_sectioning_content($node)||$self->is_sectioning_root($node)))
	{
		# Note: The current outlinee is the element being exited.
		warn("The current outlinee seems to be wrong.\n")
			unless (k($self->{current_outlinee}) eq k($node));
			
		# Let current section be the first section in the outline of the current
		#     outlinee element.
		$self->{current_section} =
			$self->{outlines}->{ k($self->{current_outlinee}) }->{sections}->[0];
			
		# Skip to the next step in the overall set of steps. (The walk is over.)
		return -1;
	}
	
	# If the current outlinee is null.
	elsif (!defined $self->{current_outlinee})
	{
		# Do nothing.
	}
	
	# When entering a heading content element
	elsif ($type eq 'start' && $self->is_heading_content($node))
	{
		# If the current section has no heading, let the element being entered be
		#     the heading for the current section.
		if (!defined $self->{current_section}->{header})
		{
			$self->{current_section}->{header}  = $node;
			$self->{current_section}->{heading} = $self->stringify($node);
		}
			
		# Otherwise, if the element being entered has a rank equal to or greater
		#     than the heading of the last section of the outline of the current
		#     outlinee, then create a new section and append it to the outline of
		#     the current outlinee element, so that this new section is the new
		#     last section of that outline. Let current section be that new
		#     section. Let the element being entered be the new heading for the
		#     current section.
		elsif (rank_of($node) >= rank_of($self->{outlines}->{ k($self->{current_outlinee}) }->{sections}->[-1]->{header}))
		{
			$self->{current_section} = $self->_mk_section(
				document_order => $self->{count}++,
				header   => $node,
				heading  => $self->stringify($node),
				);
			push @{ $self->{outlines}->{ k($self->{current_outlinee}) }->{sections} },
				$self->{current_section};
		}

		# Otherwise, run these substeps:
		else
		{
			# Let candidate section be current section.
			my $candidate = $self->{current_section};
			
			while (1)
			{
				# If the element being entered has a rank lower than the rank of
				#     the heading of the candidate section, then create a new
				#     section, and append it to candidate section. (This does not
				#     change which section is the last section in the outline.)
				#     Let current section be this new section. Let the element
				#     being entered be the new heading for the current section.
				#     Abort these substeps.
				if (rank_of($node) < rank_of($candidate->{header}))
				{
					$self->{current_section} = $self->_mk_section(
						document_order => $self->{count}++,
						header   => $node,
						heading  => $self->stringify($node),
						parent   => $candidate,
						);
					push @{ $candidate->{sections} }, $self->{current_section};
					last;
				}
				
				# Let new candidate section be the section that contains candidate
				#     section in the outline of current outlinee.
				# Let candidate section be new candidate section.
				$candidate = $candidate->{parent};
			}
			
			# Push the element being entered onto the stack. (This causes the
			#     algorithm to skip any descendants of the element.)
			push @{ $self->{stack} }, $node;
		}
		
	}
	
	# Otherwise
	else
	{
		# Do nothing.
	}
	
	# In addition, whenever you exit a node, after doing the steps above, if
	#     current section is not null, associate the node with the section 
	#     current section.
	if ($type eq 'end' && $self->{current_section})
	{
		push @{ $self->{current_section}->{elements} }, $node;
	}
	
	return 1; # continue
}


sub is_sectioning_content
{
	my ($self, $node) = @_;	
	return 0 unless $node->nodeType == XML_ELEMENT_NODE;

	if ( $node->tagName =~ /^(section|nav|article|aside)$/i )
		{ return 1; }
		
	return 0;
}

sub is_sectioning_root
{
	my ($self, $node) = @_;	
	return 0 unless $node->nodeType == XML_ELEMENT_NODE;

	my @bodies = $self->{dom}->getElementsByTagName('body');
	
	if ( @bodies && $node->tagName =~ /^(blockquote|body|details|fieldset|figure|td|datagrid|th)$/i ) # <datagrid> from earlier HTML5 drafts; <th> I've added
		{ return 1; }

	# Some tagsoup parsers don't add in BODY elements when they're missing
	# from the markup, so if there is no <body> element found, treat <html>
	# as a sectioning root instead.
	elsif ( (!@bodies) && $node->tagName =~ /^(blockquote|html|details|fieldset|figure|td|datagrid|th)$/i )
		{ return 1; }

	# Support for figure microformat
	elsif ($node->hasAttribute('class') && $node->getAttribute('class') =~ /\bfigure\b/ && $self->{options}->{microformats})
		{ return 1; }
	
	# Support for XOXO
	elsif ($node->tagName =~ /^(ul|li)$/i && $node->hasAttribute('class') && $node->getAttribute('class') =~ /\bxoxo\b/ && $self->{options}->{microformats})
		{ return 1; }

	return 0;
}

sub is_heading_content
{
	my ($self, $node) = @_;	
	return 0 unless $node->nodeType == XML_ELEMENT_NODE;

	if ( $node->tagName =~ /^(h[1-6]|h|heading|hgroup)$/i )  # <h> from XHTML2; <heading> from early HTML5 drafts
		{ return 1; }
	# Perhaps add <caption>?
	return 0;
}

sub stringify
{
	my ($self, $node) = @_;
	return $node->textContent;
}

# Recall that h1 has the highest rank, and h6 has the lowest rank.
sub rank_of
{
	# not a method
	my $node = shift;
	return 0 unless ($node->nodeType==XML_ELEMENT_NODE);
	
	if ( $node->tagName =~ /^h([1-6])$/i )
		{ return 0 - $1; }
	if ( $node->tagName =~ /^h$/i )
		{ return 1; }
	if ( $node->tagName =~ /^(header|hgroup)$/i )
	{
		foreach my $c ($node->getElementsByTagName('*'))
			{ return rank_of($c) if (is_heading_content($c)); }
		return 1;
	}
	
	return 0;
}

sub k
{
	# not a method
	my $node = shift;
	return '/html/body' unless ($node);
	return $node->nodePath();
}

sub _node_lang
{
	my $self = shift;
	my $node = shift;

	my $XML_XHTML_NS = 'http://www.w3.org/1999/xhtml';

	if ($node->hasAttributeNS(XML_XML_NS, 'lang'))
	{
		return _valid_lang($node->getAttributeNS(XML_XML_NS, 'lang')) ?
			$node->getAttributeNS(XML_XML_NS, 'lang'):
			undef;
	}

	if ($node->hasAttributeNS($XML_XHTML_NS, 'lang'))
	{
		return _valid_lang($node->getAttributeNS($XML_XHTML_NS, 'lang')) ?
			$node->getAttributeNS($XML_XHTML_NS, 'lang'):
			undef;
	}

	if ($node->hasAttributeNS(undef, 'lang'))
	{
		return _valid_lang($node->getAttributeNS(undef, 'lang')) ?
			$node->getAttributeNS(undef, 'lang'):
			undef;
	}

	if ($node != $self->{'dom'}->documentElement
	&&  defined $node->parentNode
	&&  $node->parentNode->nodeType == XML_ELEMENT_NODE)
	{
		return $self->_node_lang($node->parentNode);
	}
	
	return $self->{'options'}->{'default_language'};
}

sub _valid_lang
{
	my $value_to_test = shift;

	return 1 if (defined $value_to_test) && ($value_to_test eq '');
	return 0 unless defined $value_to_test;
	
	# Regex for recognizing RFC 4646 well-formed tags
	# http://www.rfc-editor.org/rfc/rfc4646.txt
	# http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21

	# The structure requires no forward references, so it reverses the order.
	# It uses Java/Perl syntax instead of the old ABNF
	# The uppercase comments are fragments copied from RFC 4646

	# Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.

	my $alpha      = '[a-z]';      # ALPHA
	my $digit      = '[0-9]';      # DIGIT
	my $alphanum   = '[a-z0-9]';   # ALPHA / DIGIT
	my $x          = 'x';          # private use singleton
	my $singleton  = '[a-wyz]';    # other singleton
	my $s          = '[_-]';       # separator -- lenient parsers will use [_-] -- strict will use [-]

	# Now do the components. The structure is slightly different to allow for capturing the right components.
	# The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.

	my $language   = '([a-z]{2,8}) | ([a-z]{2,3} $s [a-z]{3})';
	
	# ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA  --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8} 
	# We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).

	# Note: extlang invalid in Unicode language tags

	my $script = '[a-z]{4}' ;   # 4ALPHA 

	my $region = '(?: [a-z]{2}|[0-9]{3})' ;    # 2ALPHA / 3DIGIT

	my $variant    = '(?: [a-z0-9]{5,8} | [0-9] [a-z0-9]{3} )' ;  # 5*8alphanum / (DIGIT 3alphanum)

	my $extension  = '(?: [a-wyz] (?: [_-] [a-z0-9]{2,8} )+ )' ; # singleton 1*("-" (2*8alphanum))

	my $privateUse = '(?: x (?: [_-] [a-z0-9]{1,8} )+ )' ; # "x" 1*("-" (1*8alphanum))

	# Define certain grandfathered codes, since otherwise the regex is pretty useless.
	# Since these are limited, this is safe even later changes to the registry --
	# the only oddity is that it might change the type of the tag, and thus
	# the results from the capturing groups.
	# http://www.iana.org/assignments/language-subtag-registry
	# Note that these have to be compared case insensitively, requiring (?i) below.

	my $grandfathered  = '(?:
			  (en [_-] GB [_-] oed)
			| (i [_-] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu ))
			| (no [_-] (?: bok | nyn ))
			| (sgn [_-] (?: BE [_-] (?: fr | nl) | CH [_-] de ))
			| (zh [_-] min [_-] nan)
			)';

	# old:         | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );
	# For well-formedness, we don't need the ones that would otherwise pass.
	# For validity, they need to be checked.

	# $grandfatheredWellFormed = (?:
	#         art $s lojban
	#     | cel $s gaulish
	#     | zh $s (?: guoyu | hakka | xiang )
	# );

	# Unicode locales: but we are shifting to a compatible form
	# $keyvalue = (?: $alphanum+ \= $alphanum+);
	# $keywords = ($keyvalue (?: \; $keyvalue)*);

	# We separate items that we want to capture as a single group

	my $variantList   = $variant . '(?:' . $s . $variant . ')*' ;     # special for multiples
	my $extensionList = $extension . '(?:' . $s . $extension . ')*' ; # special for multiples

	my $langtag = "
			($language)
			($s ( $script ) )?
			($s ( $region ) )?
			($s ( $variantList ) )?
			($s ( $extensionList ) )?
			($s ( $privateUse ) )?
			";

	# Here is the final breakdown, with capturing groups for each of these components
	# The variants, extensions, grandfathered, and private-use may have interior '-'
	
	my $r = ($value_to_test =~ 
		/^(
			($langtag)
		 | ($privateUse)
		 | ($grandfathered)
		 )$/xi);
	
	return $r;
}

1;

__END__

=head1 NAME

HTML::HTML5::Outline - implementation of the HTML5 Outline algorithm

=head1 SYNOPSIS

	use JSON;
	use HTML::HTML5::Outline;
	
	my $html = <<'HTML';
	<!doctype html>
	<h1>Hello</h1>
	<h2>World</h2>
	<h1>Good Morning</h1>
	<h2>Vietnam</h2>
	HTML
	
	my $outline = HTML::HTML5::Outline->new($html);
	print to_json($outline->to_hashref, {pretty=>1,canonical=>1});

=head1 DESCRIPTION

This is an implementation of the HTML5 Outline algorithm, as per
L<http://www.w3.org/TR/html5/sections.html#outlines>.

The module can output a JSON-friendly hashref, or an RDF model.

=head2 Constructor

=over

=item * C<< HTML::HTML5::Outline->new($html, %options) >>

Construct a new outline. C<< $html >> is the HTML to generate an outline from,
either as an HTML or XHTML string, or as an L<XML::LibXML::Document> object.

Options:

=over

=item * B<default_language> - default language to assume text is in when no
lang/xml:lang attribute is available. e.g. 'en-gb'.

=item * B<element_subjects> - rather advanced feature that doesn't bear
explaining. See USE WITH RDF::RDFA::PARSER for an example.

=item * B<microformats> - support C<< <ul class="xoxo"> >>,
C<< <ol class="xoxo"> >> and C<< <whatever class="figure"> >> as
sectioning elements (like C<< <section> >>, C<< <figure> >>, etc).
Boolean, defaults to false.

=item * B<parser> - 'html' (default) or 'xml' - choose the parser to use for
XHTML/HTML. If the constructor is passed an XML::LibXML::Document, this is
ignored.

=item * B<suppress_collections> - allows rdf:List stuff to be suppressed
from RDF output. RDF output - especially in Turtle format - looks somewhat
nicer without them, but if you care about the order of headings and sections,
then you'll want them. Boolean, defaults to false.

=item * B<uri> - the document URI for resolving relative URI references.
Only really used by the RDF output.

=back

=back

=head2 Object Methods

=over

=item * C<< to_hashref >>

Returns data as a nested hashref/arrayref structure. Dump it as JSON and
you'll figure out the format pretty easily.

=item * C<< to_rdf >>

Returns data as a n L<RDF::Trine::Model>. Requires RDF::Trine to be
installed. Otherwise this method won't exist.

=item * C<< primary_outlinee >>

Returns a L<HTML::HTML5::Outline::Outlinee> element representing the
outline for the page.

=back

=head2 Class Methods

=over

=item * C<< has_rdf >>

Indicates whether the C<< to_rdf >> object method exists.

=back


=head1 USE WITH RDF::RDFA::PARSER

This module produces RDF data where many of the resources described
are HTML elements. RDFa data typically does not, but RDF::RDFa::Parser
does also support some extensions to RDFa which do (e.g. support for the
C<cite> and C<role> attributes). It's useful to combine the RDF data
from each, and RDF::RDFa::Parser 1.093 and upwards contains a few shims
to make this possible.

Without further ado...

	use HTML::HTML5::Outline;
	use RDF::RDFa::Parser 1.093;
	use RDF::TrineShortcuts;

	my $rdfa = RDF::RDFa::Parser->new(
		$html_source,
		$base_url,
		RDF::RDFa::Parser::Config->new(
			'html5', '1.1',
			role_attr     => 1,
			cite_attr     => 1,
			longdesc_attr => 1,
			),
		)->consume;
	
	my $outline = HTML::HTML5::Outline->new(
		$rdfa->dom,
		uri              => $rdfa->uri,
		element_subjects => $rdfa->element_subjects,
		);
	
	# Merging two graphs is pretty complicated in RDF::Trine
	# but a little easier with RDF::TrineShortcuts...
	my $combined = rdf_parse();
	rdf_parse($rdfa->graph,     model => $combined);
	rdf_parse($outline->to_rdf, model => $combined);
	
	my $NS = {
		dc    => 'http://purl.org/dc/terms/',
		o     => 'http://ontologi.es/outline#',
		type  => 'http://purl.org/dc/dcmitype/',
		xs    => 'http://www.w3.org/2001/XMLSchema#',
		xhv   => 'http://www.w3.org/1999/xhtml/vocab#',
		};
	
	print rdf_string($combined => 'Turtle', namespaces => $NS);

=head1 SEE ALSO

L<HTML::HTML5::Outline::RDF>,
L<HTML::HTML5::Outline::Outlinee>,
L<HTML::HTML5::Outline::Section>.

L<HTML::HTML5::Parser>, L<HTML::HTML5::Sanity>.

=head1 AUTHOR

Toby Inkster, E<lt>tobyink@cpan.orgE<gt>

=head1 ACKNOWLEDGEMENTS

This module is a fork of the document structure parser from Swignition
<http://buzzword.org.uk/swignition/>.

That in turn includes the following credits: thanks to Ryan King and
Geoffrey Sneddon for pointing me towards [the HTML5] algorithm. I also
used Geoffrey's python implementation as a crib sheet to help me figure
out what was supposed to happen when the HTML5 spec was ambiguous.

=head1 COPYRIGHT AND LICENCE

Copyright (C) 2008-2011 by Toby Inkster

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut