Yet Another CPAN Grep

HTML-Element-Convert/lib/HTML/Element/Convert.pm

package HTML::Element::Convert;

use warnings;
use strict;

=head1 NAME

HTML::Element::Convert - Monkeypatch content conversion methods into HTML::Element

=head1 VERSION

Version 0.10

=cut

our $VERSION = '0.10';

=head1 SYNOPSIS

  use HTML::TreeBulder;
  use HTML::Element::Convert;

  my $tree = HTML::TreeBuilder->new_from_content($html);

  # Search for some JSON-encoded meta-data embedded in the document and extract it:
  my $element = $tree->look_down(...);
  my $hash = $element->extract_content;

  # This time extract the YAML data and delete the containing element from the tree:
  $element = $tree->look_down(...);
  $hash = $element->pull_content;

  # Convert the content of any <div> with a 'lang="markdown"' attribute into HTML:
  $tree->convert_content;

=over 4

=item $element->convert_content

Look for every div below $element containing a C<lang> attribute. If it recognizes C<lang>, it
will convert and replace the div's content.

Currently, only markdown is supported.

=item $content = $element->extract_content([TYPE])

Extract and parse the content of $element. If C<TYPE> is given, then this method will assume the content is of the given type, and try to parse it accordingly. Otherwise it will use the C<lang> attribute of $element to detemine the type.

=item $content = $element->pull_content([TYPE])

Like C<extract_content>, extract and parse the content of $element. It will also delete $element from the tree.

=back

=cut

use Carp;

our %PARSE_FUNC;
our %EXTRACT_FUNC;
BEGIN {
	1 and			do { eval { require Text::Markdown };	$PARSE_FUNC{markdown} = \&Text::Markdown::markdown unless $@ };
	1 and			do { eval { require JSON };		$PARSE_FUNC{JSON} = \&JSON::jsonToObj unless $@ };
	1 and			do { eval { require YAML::Syck };	$PARSE_FUNC{YAML} = \&YAML::Syck::Load unless $@ };
	$PARSE_FUNC{YAML} or	do { eval { require YAML };		$PARSE_FUNC{YAML} = \&YAML::Load unless $@ };
}


sub _as_text($) { return shift->as_text }
sub _as_raw_HTML($) { return join '', map { if (ref $_) { $_ = $_->as_XML; chomp $_; $_ =~ s/"/'/g } $_ } shift->content_list }

sub _extract_text { return _as_text shift }
$EXTRACT_FUNC{text} = \&_extract_text;

sub _extract_YAML { return $PARSE_FUNC{YAML}->(_as_raw_HTML(shift) . "\n") }
$EXTRACT_FUNC{YAML} = \&_extract_YAML;

sub _extract_JSON { return $PARSE_FUNC{JSON}->(_as_raw_HTML shift) }
$EXTRACT_FUNC{JSON} = \&_extract_JSON;

sub _extract_markdown { return HTML::TreeBuilder->new_from_content($PARSE_FUNC{markdown}->(_as_text shift)) }
$EXTRACT_FUNC{markdown} = \&_extract_markdown;

sub _extract {
	my $element = shift;
	my $type = shift;

	$type = "plain" if $type eq "text";
	my $func;
	for (qw(plain JSON YAML markdown)) {
		if ($type =~ m/^$_$/i) {
			$func = $_;
			last;
		}
	}

	return unless $func;
	return $EXTRACT_FUNC{$func}->($element);
}

package HTML::Element;

use Carp;
use UNIVERSAL;

sub extract_content {
	my $self = shift;
	my $type = shift;
	$type ||= $self->attr("lang");
	$type ||= "text";
	my $content;
	if	(! $type)	{}
	else 			{ $content = HTML::Element::Convert::_extract($self, $type) }
	return $content;
}

sub convert_content {
	my $self = shift;
	for (qw(markdown)) {
		my @elements = $self->look_down("_tag", "div", "lang", qr/$_/i);
		for my $element (@elements) {
			my $content = $element->extract_content;
			if (UNIVERSAL::can($content, "guts")) {
				my $new_element = $content->guts;
				if ($element eq $self) {
					$self->delete_content;
					$self->push_content($new_element->content_list);
					$self->attr("lang", undef);
					$new_element->delete;
				}
				else {
					$element->replace_with($new_element)->delete;
				}
			}
		}
	}
}

sub pull_content {
	my $self = shift;
	my $content = $self->extract_content(@_);
	$self->delete;
	return $content;
}

# TODO Check to see if we're using HTML::TreeBuilder::Select first...

# Alpha function
sub _extract_child_content {
	my $self = shift;
	if (UNIVERSAL::can($self, "select")) {
		my $query = shift or croak "Need a query (a CSS selector or XPath)";
		return $self->select($query => 'extract-content');
	}
}

# Alpha function
sub _pull_child_content {
	my $self = shift;
	if (UNIVERSAL::can($self, "select")) {
		my $query = shift or croak "Need a query (a CSS selector or XPath)";
		return $self->select($query => 'pull-content');
	}
}


=head1 AUTHOR

Robert Krimen, C<< <rkrimen at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-html-element-convert at rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Element-Convert>.
I will be notified, and then you'll automatically be notified of progress on
your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc HTML::Element::Convert

You can also look for information at:

=over 4

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/HTML-Element-Convert>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/HTML-Element-Convert>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Element-Convert>

=item * Search CPAN

L<http://search.cpan.org/dist/HTML-Element-Convert>

=back

=head1 ACKNOWLEDGEMENTS

=head1 COPYRIGHT & LICENSE

Copyright 2007 Robert Krimen, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of HTML::Element::Convert
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.