Yet Another CPAN Grep

XML-Atom-Microformats/lib/XML/Atom/Microformats.pm

package XML::Atom::Microformats;

use 5.010;
use autodie;
use strict;
use utf8;

use HTML::Microformats 0.100 qw();
use JSON 2.00 qw();
use RDF::Trine 0.135 qw();
use RDF::Query 2.900 qw();
use XML::Atom::OWL 0.100 qw();

our ($VERSION, $AUTHORITY, $HAS_RDFA);
BEGIN {
	$XML::Atom::Microformats::VERSION   = '0.004';
	$XML::Atom::Microformats::AUTHORITY = 'cpan:TOBYINK';
	$XML::Atom::Microformats::HAS_RDFA  =
		eval "use RDF::RDFa::Parser 1.097; 1" || 0;
}

sub new_feed
{
	my ($class, $xml, $base_uri) = @_;
	
	my $awol;

	if ($HAS_RDFA)
	{
		my $cfg = RDF::RDFa::Parser::Config->new(
			RDF::RDFa::Parser::Config->HOST_ATOM,
			RDF::RDFa::Parser::Config->RDFA_11,
			atom_parser => 1,
			);
		$awol = RDF::RDFa::Parser->new($xml, $base_uri, $cfg, RDF::Trine::Store::Memory->temporary_store)->consume;
	}
	else
	{
		$awol = XML::Atom::OWL->new($xml, $base_uri, undef, RDF::Trine::Store::Memory->temporary_store)->consume;
	}
	
	my $self = bless { 'AWOL' => $awol , 'base' => $base_uri }, $class;
	return $self->_find_contexts->_prepare_contexts;	
}

sub _find_contexts
{
	my ($self) = @_;
	
	my $sparql = <<SPARQL;
PREFIX awol: <http://bblfish.net/work/atom-owl/2006-06-06/#>
PREFIX iana: <http://www.iana.org/assignments/relation/>
SELECT ?entry ?entryid ?entrylink ?contenttype ?contentbody ?contentbase ?contentlang ?profile
WHERE
{
	?entry a awol:Entry ;
		awol:content ?content ;
		awol:id ?entryid .
	?content a awol:Content ;
		awol:type ?contenttype ;
		awol:body ?contentbody .
	OPTIONAL { ?entry iana:self ?entrylink . }
	OPTIONAL { ?content awol:base ?contentbase . }
	OPTIONAL { ?content awol:lang ?contentlang . }
	OPTIONAL
	{
		{ ?feed awol:entry ?entry ; iana:profile ?profile . }
		UNION { ?entry iana:profile ?profile . }
	}
}
SPARQL

	my $query  = RDF::Query->new($sparql);
	my $result = $query->execute($self->{'AWOL'}->graph);
	my $data = {};
	while (my $row = $result->next)
	{
		my $e = $row->{'entry'}->as_ntriples;
		
		$data->{$e}->{'entryid'}     ||= $row->{'entryid'}->literal_value;
		$data->{$e}->{'contentbody'} ||= $row->{'contentbody'}->literal_value;
		$data->{$e}->{'contenttype'} ||= $row->{'contenttype'}->literal_value;
		$data->{$e}->{'contentlang'} ||= $row->{'contentlang'}->literal_value
			if defined $row->{'contentlang'};
		$data->{$e}->{'contentbase'} ||= $row->{'contentbase'}->uri
			if defined $row->{'contentbase'};
		$data->{$e}->{'entrylink'}   ||= $row->{'entrylink'}->uri
			if defined $row->{'entrylink'};
		
		if (defined $row->{'profile'})
		{
			push @{ $data->{$e}->{'profiles'} }, $row->{'profile'}->uri;
		}
	}
	$self->{'contexts'} = [values %$data];
	
	return $self;
}

sub _prepare_contexts
{
	my ($self) = @_;
	
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'contenttype'} eq 'text/html'
			|| $context->{'contenttype'} eq 'application/xhtml+xml';
		
		my $dom;
		my $html = sprintf("<html xml:lang=\"%s\" lang=\"%s\" xmlns=\"http://www.w3.org/1999/xhtml\"><head><title></title></head><body><div>%s</div></body></html>",
			$context->{'contentlang'},
			$context->{'contentlang'},
			$context->{'contentbody'});
		
		my $hmf = HTML::Microformats->new_document(
			$html,
			($context->{'contentbase'} || $self->{'base'}),
			type => $context->{'contenttype'});
		
		if ($@ || !defined $hmf)
		{
			warn sprintf("ENTRY <%s>: %s",
				$context->{'entryid'},
				($@ || "Could not process entry."));
			next;
		}
		
		$hmf->{'context'}->{'document_uri'} = $context->{'entrylink'} || $context->{'entryid'};
		$hmf->add_profile( @{$context->{'profiles'}} );
		
		$context->{'HMF'} = $hmf;
	}
	
	return $self;
}

sub add_profile
{
	my ($self, @profiles) = @_;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		$context->{'HMF'}->add_profile(@profiles);
	}
	return $self;
}

sub entry_add_profile
{
	my ($self, $entry, @profiles) = @_;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		next unless $context->{'entryid'} eq $entry;
		$context->{'HMF'}->add_profile(@profiles);
	}
	return $self;
}

sub assume_profile
{
	my ($self, @profiles) = @_;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		$context->{'HMF'}->assume_profile(@profiles);
	}
	return $self;
}

sub entry_assume_profile
{
	my ($self, $entry, @profiles) = @_;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		next unless $context->{'entryid'} eq $entry;
		$context->{'HMF'}->assume_profile(@profiles);
	}
	return $self;
}

sub assume_all_profiles
{
	my ($self) = @_;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		$context->{'HMF'}->assume_all_profiles;
	}
	return $self;
}

sub entry_assume_all_profiles
{
	my ($self, $entry) = @_;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		next unless $context->{'entryid'} eq $entry;
		$context->{'HMF'}->assume_all_profiles;
	}
	return $self;
}

sub parse_microformats
{
	my ($self) = @_;
	return $self if $self->{'parsed'};
	
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		$context->{'objects'} = $context->{'HMF'}->objects;
	}
	
	$self->{'parsed'} = 1;
	return $self;
}


sub clear_microformats
{
	my ($self) = @_;
	
	foreach my $context (@{$self->{'contexts'}})
	{
		$context->{'objects'} = undef;
		next unless $context->{'HMF'};
		$context->{'HMF'}->clear_microformats;
	}
	
	$self->{'parsed'} = 0;
	return $self;
}

sub objects
{
	my ($self, $format, $entry) = @_;
	$self->parse_microformats;
	
	my @rv;
	
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		
		if ($entry eq $context->{'entryid'} || !defined $entry)
		{
			my @these = $context->{'HMF'}->objects($format);
			push @rv, @these;
		}
	}
	
	return @rv if (wantarray);
	return \@rv;
}

sub entry_objects
{
	my ($self, $entry, $format) = @_;
	return $self->objects($format, $entry);
}

sub all_objects
{
	my ($self, $entry) = @_;
	my $rv = {};
	
	foreach my $format (HTML::Microformats->formats)
	{
		$rv->{$format} = $self->objects($format, $entry);
	}
	
	return $rv;
}

*entry_all_objects = \&all_objects;

sub TO_JSON
{
	return  $_[0]->all_objects;
}

sub json
{
	my ($self, %opts) = @_;
	
	$opts{'convert_blessed'} = 1
		unless defined $opts{'convert_blessed'};
	
	$opts{'utf8'} = 1
		unless defined $opts{'utf8'};

	return JSON::to_json($self->all_objects, \%opts);
}

sub entry_json
{
	my ($self, $entry, %opts) = @_;
	
	$opts{'convert_blessed'} = 1
		unless defined $opts{'convert_blessed'};
	
	$opts{'utf8'} = 1
		unless defined $opts{'utf8'};

	return JSON::to_json($self->entry_all_objects($entry), \%opts);
}

sub model
{
	my ($self, %opts)  = @_;
	my $model = RDF::Trine::Model->temporary_model;
	$self->add_to_model($model, %opts);
	return $model;
}

sub entry_model
{
	my ($self, $entry, %opts)  = @_;
	my $model = RDF::Trine::Model->temporary_model;
	$self->entry_add_to_model($model, %opts);
	return $model;
}

sub add_to_model
{
	my ($self, $model, %opts) = @_;
	$self->parse_microformats;
	
	my %entry_opts = %opts;
	$entry_opts{'atomowl'} = 0;
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		$self->entry_add_to_model($context->{'entryid'}, $model, %entry_opts);
	}
	
	if ($opts{'atomowl'})
	{
		my $iter = $self->{'AWOL'}->graph->as_stream;
		while (my $st = $iter->next)
		{
			$model->add_statement($st);
		}
	}
	
	return $self;
}

sub entry_add_to_model
{
	my ($self, $entry, $model, %opts) = @_;
	$self->parse_microformats;
	
	foreach my $context (@{$self->{'contexts'}})
	{
		next unless $context->{'HMF'};
		next unless $entry eq $context->{'entryid'};
		
		my $iter = $context->{'HMF'}->model->as_stream;
		while (my $st = $iter->next)
		{
			$model->add_statement(
				RDF::Trine::Statement->new(($st->nodes)[0..2]),
				RDF::Trine::Node::Resource->new($entry),
				);
		}
	}
	
	if ($opts{'atomowl'})
	{
		my $iter = $self->{'AWOL'}->graph->as_stream;
		while (my $st = $iter->next)
		{
			$model->add_statement($st);
		}
	}
	
	return $self;
}

1;

__END__

=head1 NAME

XML::Atom::Microformats - parse microformats in Atom content

=head1 SYNOPSIS

 use XML::Atom::Microformats;
 
 my $feed = XML::Atom::Microformats
    -> new_feed( $xml, $base_uri )
    -> assume_profile( qw(hCard hCalendar) );
 print $feed->json(pretty => 1);
 
 my $results = RDF::Query
    -> new( $sparql )
    -> execute( $feed->model );
 
=head1 DESCRIPTION

The XML::Atom::Microformats module brings the functionality of
L<HTML::Microformats> to Atom 1.0 Syndication feeds. It finds
microformats embedded in the E<lt>contentE<gt> elements
(note: not E<lt>summaryE<gt>) of Atom entries.

The general pattern of usage is to create an XML::Atom::Microformats
object (which corresponds to an Atom 1.0 feed) using the
C<new_feed> method; tell it which types of Microformat you expect
to find there; then ask for the data, as a Perl hashref,
a JSON string, or an RDF::Trine model.

=head2 Constructor

=over

=item C<< XML::Atom::Microformats->new_feed($xml, $base_url) >>

Constructs a feed object.

$xml is the Atom source (string) or an XML::LibXML::Document.

$base_url is the feed URL, important for resolving relative URL references.

=back

=head2 Profile Management Methods

HTML::Microformats uses HTML profiles (i.e. the profile attribute on the
HTML <head> element) to detect which Microformats are used on a page. Any
microformats which do not have a profile URI declared will not be parsed.

XML::Atom::Microformats uses a similar mechanism. Because Atom does not
have a E<lt>headE<gt> element, Atom E<lt>linkE<gt> is used instead:

  <link rel="profile" href="http://ufs.cc/x/hcalendar" />

These links can be used on a per-entry basis, or for the whole feed.

Because many feeds fail to properly declare which profiles they use, there
are various profile management methods to tell XML::Atom::Microformats to
assume the presence of particular profile URIs, even if they're actually
missing.

=over

=item C<< add_profile(@profiles) >>

Using C<add_profile> you can add one or more profile URIs, and they are
treated as if they were found on the document.

For example:

 $feed->add_profile('http://microformats.org/profile/rel-tag')

This is useful for adding profile URIs declared outside the document itself
(e.g. in HTTP headers).

=item C<< entry_add_profile($entryid, @profiles) >>

C<entry_add_profile> is a variant to allow you to add a profile which applies
only to one specific entry within the feed, if you know that entry's ID.

=item C<< assume_profile(@microformats) >>

For example:

 $feed->assume_profile(qw(hCard adr geo))

This method acts similarly to C<add_profile> but allows you to use
names of microformats rather than URIs. Microformat names are case
sensitive, and must match HTML::Microformats::Format::Foo module names.

=item C<< entry_assume_profile($entryid, @profiles) >>

C<entry_assume_profile> is a variant to allow you to add a profile which applies
only to one specific entry within the feed, if you know that entry's ID.

=item C<< assume_all_profiles >>

This method is equivalent to calling C<assume_profile> for
all known microformats.

=item C<< entry_assume_all_profiles($entryid) >>

This method is equivalent to calling C<entry_assume_profile> for
all known microformats.

=back

=head2 Parsing Methods

You can probably skip this section. The C<data>, C<json> and
C<model> methods will automatically do this for you.

=over

=item C<< parse_microformats >>

Scans through the feed, finding microformat objects.

On subsequent calls, does nothing (as everything is already parsed).

=item C<< clear_microformats >>

Forgets information gleaned by C<parse_microformats> and thus allows
C<parse_microformats> to be run again. This is useful if you've
added some profiles between runs of C<parse_microformats>.

=back

=head2 Data Retrieval Methods

These methods allow you to retrieve the feed's data, and do things
with it.

=over

=item C<< objects($format) >>

$format is, for example, 'hCard', 'adr' or 'RelTag'.

Returns a list of objects of that type. (If called in scalar context,
returns an arrayref.)

Each object is, for example, an HTML::Microformat::hCard object, or an
HTML::Microformat::RelTag object, etc. See the relevent documentation
for details.

=item C<< entry_objects($entryid, $format) >>

C<entry_objects> is a variant to allow you to fetch data for
one specific entry within the feed, if you know that entry's ID.

=item C<< all_objects >>

Returns a hashref of data. Each hashref key is the name of a microformat
(e.g. 'hCard', 'RelTag', etc), and the values are arrayrefs of objects.

Each object is, for example, an HTML::Microformat::hCard object, or an
HTML::Microformat::RelTag object, etc. See the relevent documentation
for details.

=item C<< entry_all_objects($entryid) >>

C<entry_all_objects> is a variant to allow you to fetch data for
one specific entry within the feed, if you know that entry's ID.

=item C<< json(%opts) >>

Returns data roughly equivalent to the C<all_objects> method, but as a JSON
string.

%opts is a hash of options, suitable for passing to the L<JSON>
module's to_json function. The 'convert_blessed' and 'utf8' options are
enabled by default, but can be disabled by explicitly setting them to 0, e.g.

  print $feed->json( pretty=>1, canonical=>1, utf8=>0 );

=item C<< entry_json($entryid, %opts) >>

C<entry_json> is a variant to allow you to fetch data for
one specific entry within the feed, if you know that entry's ID.

=item C<< model(%opts) >>

Returns data as an RDF::Trine::Model, suitable for serialising as
RDF or running SPARQL queries. Quads are used (rather than
triples) which allows you to trace statements to the entries from
which they came.

$opts{'atomowl'} is a boolean indicating whether or not to
include data from XML::Atom::OWL in the returned model.
If enabled, this always includes AtomOWL data for the whole
feed (not just for a specific entry), even if you use the
C<entry_model> method.

If RDF::RDFa::Parser 1.096 or above is installed, then
$opts{'atomowl'} will automatically pull in DataRSS data
too.

=item C<< entry_model($entryid, %opts) >>

C<entry_model> is a variant to allow you to fetch data for
one specific entry within the feed, if you know that entry's ID.

=item C<< add_to_model($model, %opts) >>

Adds data to an existing RDF::Trine::Model. Otherwise, the same as C<model>.

=item C<< entry_add_to_model($entry, $model, %opts) >>

Adds data to an existing RDF::Trine::Model. Otherwise, the same as C<entry_model>.

=begin private

=item C<< TO_JSON >>

Provided for the benefit of L<JSON>'s convert_blessed feature.

=end private

=back

=head1 BUGS

Please report any bugs to L<http://rt.cpan.org/>.

=head1 SEE ALSO

L<HTML::Microformats>,
L<XML::Atom::OWL>,
L<XML::Atom::FromOWL>,
L<RDF::RDFa::Parser>.

L<http://microformats.org/>,
L<http://www.perlrdf.org/>.

=head1 AUTHOR

Toby Inkster E<lt>tobyink@cpan.orgE<gt>.

=head1 COPYRIGHT

Copyright 2010-2012 Toby Inkster

This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 DISCLAIMER OF WARRANTIES

THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.