Group
Extension

HTML-Microdata/lib/HTML/Microdata.pm

package HTML::Microdata;

use strict;
use warnings;

use HTML::TreeBuilder::LibXML;
use Hash::MultiValue;
use Scalar::Util qw(refaddr);
use JSON;
use URI;

our $VERSION = '0.05';

sub new {
	my ($class, %args) = @_;
	bless {
		items => [],
		base  => $args{base} ? URI->new($args{base}) : undef,
	}, $class;
}

sub extract {
	my ($class, $content, %opts) = @_;
	my $self = $class->new(%opts);
	$self->_parse($content);
	$self
}

sub as_json {
	my ($self) = @_;
	encode_json +{
		items => $self->{items},
	};
}

sub items {
	my ($self) = @_;
	$self->{items};
}

sub _parse {
	my ($self, $content) = @_;

	my $items = {};
	my $tree = HTML::TreeBuilder::LibXML->new_from_content($content);
	my $scopes = $tree->findnodes('//*[@itemscope]');
	my $number = 0;

	for my $scope (@$scopes) {
		my $type = $scope->attr('itemtype');
		my $id   = $scope->attr('itemid');

		unless ($scope->id) {
			$scope->id($number++);
		}

		my $item = {
			($id   ? (id   => $id)   : ()),
			($type ? (type => $type) : ()),
			properties => Hash::MultiValue->new,
		};

		$items->{ $scope->id } = $item;

		unless ($scope->attr('itemprop')) {
			# This is top level item
			push @{ $self->{items} }, $item;
		}
	}

	for my $scope (@$scopes) {
		if (my $refs = $scope->attr('itemref')) {
			my $ids = [ split /\s+/, $refs ];
			for my $id (@$ids) {
				my $props = $tree->findnodes('//*[@id="' . $id . '"]/descendant-or-self::*[@itemprop]');
				for my $prop (@$props) {
					my $name = $prop->attr('itemprop');
					my $value = $self->extract_value($prop, items => $items);
					$items->{ $scope->id }->{properties}->add($name => $value);
					$prop->delete;
				}
			}
		}
	}

	my $props = $tree->findnodes('//*[@itemscope]/descendant-or-self::*[@itemprop]');
	for my $prop (@$props) {
		my $value = $self->extract_value($prop, items => $items);
		my $scope = $prop->findnodes('./ancestor::*[@itemscope]')->[-1];
		for my $name (split /\s+/, $prop->attr('itemprop')) {
			$items->{ $scope->id }->{properties}->add($name => $value);
		}
	}

	for my $key (keys %$items) {
		my $item = $items->{$key};
		$item->{properties} = $item->{properties}->multi;
	}

}

sub absolute {
	my ($self, $uri) = @_;
	if (defined $uri) {
		if ($self->{base}) {
			URI->new_abs($uri, $self->{base}).q();
		} else {
			$uri;
		}
	} else {
		"";
	}
}

sub extract_value {
	my ($self, $prop, %opts) = @_;

	my $value;
	if (defined $prop->attr('itemscope')) {
		# XXX : inifinite loop
		$value = $opts{items}->{ $prop->id };
	} elsif ($prop->tag eq 'meta') {
		$value = $prop->attr('content');
	} elsif ($prop->tag =~ m{^audio|embed|iframe|img|source|video|track$}) {
		$value = $self->absolute($prop->attr('src'));
	} elsif ($prop->tag =~ m{^a|area|link$}) {
		$value = $self->absolute($prop->attr('href'));
	} elsif ($prop->tag eq 'object') {
		$value = $self->absolute($prop->attr('data'));
	} elsif ($prop->tag eq 'data') {
		$value = $prop->attr('value');
	} elsif ($prop->tag eq 'time' && $prop->attr('datetime')) {
		$value = $prop->attr('datetime');
	} elsif (defined $prop->attr('content')) {
		$value = $prop->attr('content');
	} else {
		$value = $prop->findvalue('normalize-space(.)');
	}

	$value;
}

1;
__END__

=encoding utf8

=head1 NAME

HTML::Microdata - Extractor of microdata from HTML.

=head1 SYNOPSIS

  use HTML::Microdata;

  my $microdata = HTML::Microdata->extract(<<EOF, base => 'http://example.com/');
  ...
  EOF
  my $json = $microdata->as_json;

  use Data::Dumper;
  warn Dumper $microdata->items; # returns top level items


=head1 DESCRIPTION

HTML::Microdata is extractor of microdata from HTML to JSON etc.

Implementation of http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#microdata .

=head1 TODO

itemref implementation has not been completed.

=head1 WHY

There already is HTML::HTML5::Microdata::Parser in CPAN. But it has very heavy dependency and I can't install it. And more, package name should not include "HTML5" because HTML5 is just HTML now.

=head1 AUTHOR

cho45 E<lt>cho45@lowreal.netE<gt>

=head1 SEE ALSO

L<HTML::HTML5::Microdata::Parser|HTML::HTML5::Microdata::Parser>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.