XML-GRDDL/lib/XML/GRDDL.pm
package XML::GRDDL;
use 5.008;
use strict;
use constant GRDDL_NS => 'http://www.w3.org/2003/g/data-view#';
use constant XHTML_NS => 'http://www.w3.org/1999/xhtml';
use Carp;
use Data::UUID;
use RDF::RDFa::Parser '1.097';
use RDF::Trine qw[iri statement];
use Scalar::Util qw[blessed];
use URI;
use URI::Escape qw[uri_escape];
use XML::GRDDL::Namespace;
use XML::GRDDL::Profile;
use XML::GRDDL::Transformation;
use XML::LibXML;
our $VERSION = '0.004';
use base 'Exporter';
our @EXPORT_OK = qw( GRDDL_NS XHTML_NS );
sub new
{
my ($class) = @_;
return bless { cache=>{}, ua=>undef, }, $class;
}
sub ua
{
my $self = shift;
if (@_)
{
my $rv = $self->{'ua'};
$self->{'ua'} = shift;
croak "Set UA to something that is not an LWP::UserAgent!"
unless blessed $self->{'ua'} && $self->{'ua'}->isa('LWP::UserAgent');
return $rv;
}
unless (blessed $self->{'ua'} && $self->{'ua'}->isa('LWP::UserAgent'))
{
$self->{'ua'} = LWP::UserAgent->new(agent=>sprintf('%s/%s ', __PACKAGE__, $VERSION));
}
return $self->{'ua'};
}
sub data
{
my ($self, $document, $uri, %options) = @_;
unless (blessed($document) && $document->isa('XML::LibXML::Document'))
{
my $parser = XML::LibXML->new;
$document = $parser->parse_string($document);
}
my $model = RDF::Trine::Model->temporary_model;
my @transformations = $self->discover($document, $uri, %options, strings => 0);
foreach my $t (@transformations)
{
my $m = $t->model($document);
if ($m)
{
my $context = iri('urn:uuid:'.Data::UUID->new->create_str);
my $rootnode = iri('urn:uuid:'.Data::UUID->new->create_str);
my $property = iri('http://ontologi.es/grddl?transformation='.uri_escape($t->uri).'#result');
$model->add_hashref($m->as_hashref, $context);
if ($options{metadata})
{
$model->add_statement(statement(
iri($uri),
iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
iri(GRDDL_NS.'InformationResource'),
));
$model->add_statement(statement(
iri($uri),
iri(GRDDL_NS.'rootNode'),
$rootnode,
));
$model->add_statement(statement(
$rootnode,
iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
iri(GRDDL_NS.'RootNode'),
));
$model->add_statement(statement(
iri($uri),
iri(GRDDL_NS.'result'),
$context,
));
$model->add_statement(statement(
$context,
iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
iri(GRDDL_NS.'RDFGraph'),
));
$model->add_statement(statement(
iri($t->uri),
iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
iri(GRDDL_NS.'Transformation'),
));
$model->add_statement(statement(
iri($t->uri),
iri(GRDDL_NS.'transformationProperty'),
$property,
));
$model->add_statement(statement(
$property,
iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
iri(GRDDL_NS.'TransformationProperty'),
));
$model->add_statement(statement(
$rootnode,
$property,
$context,
));
}
}
}
return $model;
}
sub discover
{
my ($self, $document, $uri, %options) = @_;
unless (blessed($document) && $document->isa('XML::LibXML::Document'))
{
my $parser = XML::LibXML->new;
$document = $parser->parse_string($document);
}
my @transformations;
push @transformations,
$self->_discover_from_rel_attribute($document, $uri, %options);
push @transformations,
$self->_discover_from_transformation_attribute($document, $uri, %options);
push @transformations,
$self->_discover_from_profiles($document, $uri, %options);
push @transformations,
$self->_discover_from_namespace($document, $uri, %options);
if ($options{'strings'})
{
return @transformations;
}
else
{
return map { XML::GRDDL::Transformation->new($_, $uri, $self); } @transformations;
}
}
sub _discover_from_rel_attribute
{
my ($self, $document, $uri, %options) = @_;
my @transformations;
my $profile_found = $options{'force_rel'};
my $xpc = XML::LibXML::XPathContext->new;
$xpc->registerNs(xhtml => XHTML_NS);
unless ($profile_found)
{
my @nodes = $xpc->findnodes('/xhtml:html/xhtml:head[@profile]', $document);
foreach my $head (@nodes)
{
if ($head->getAttribute('profile') =~ m!(^|\s) http://www\.w3\.org/2003/g/data-view\#? (\s|$)!x)
{
$profile_found = 1;
last;
}
}
}
if ($profile_found)
{
my $is_html = $document->documentElement->namespaceURI eq XHTML_NS;
my $rdfa = $self->_rdf_model($document, $uri, $is_html?'application/xhtml+xml':'application/xml');
my $iter = $rdfa->get_statements(iri($uri), iri(GRDDL_NS.'transformation'), undef);
while (my $st = $iter->next)
{
next unless $st->object->is_resource;
push @transformations, $st->object->uri;
}
}
return @transformations;
}
sub _discover_from_transformation_attribute
{
my ($self, $document, $uri, %options) = @_;
my @transformations;
# Right now just doing this on root element. Supposed to also check others??
my $attr = $document->documentElement->getAttributeNS(GRDDL_NS, 'transformation');
my @t = split /\s+/, $attr;
foreach my $t (@t)
{
next unless $t =~ /[a-z0-9\.]/i;
push @transformations, $self->_resolve_relative_ref($t, $uri);
}
return @transformations;
}
sub _discover_from_profiles
{
my ($self, $document, $uri, %options) = @_;
my @transformations;
my $xpc = XML::LibXML::XPathContext->new;
$xpc->registerNs(xhtml => XHTML_NS);
my @profiles;
my @nodes = $xpc->findnodes('/xhtml:html/xhtml:head[@profile]', $document);
foreach my $head (@nodes)
{
my @t = split /\s+/, $head->getAttribute('profile');
foreach my $t (@t)
{
next unless $t =~ /[a-z0-9\.]/i;
push @profiles, $self->_resolve_relative_ref($t, $uri);
}
}
foreach my $profile (@profiles)
{
my $profile_object = XML::GRDDL::Profile->new($profile, $uri, $self);
push @transformations, $profile_object->transformations;
}
return @transformations;
}
sub _discover_from_namespace
{
my ($self, $document, $uri, %options) = @_;
my $ns = $document->documentElement->namespaceURI;
my $ns_obj = XML::GRDDL::Namespace->new($ns, $uri, $self);
return $ns_obj->transformations;
}
sub _fetch
{
my ($self, $document, %headers) = @_;
$self->{'cache'}->{$document} ||= $self->ua->get($document, %headers);
return $self->{'cache'}->{$document};
}
sub _rdf_model
{
my ($self, $document, $uri, $type, $nocache) = @_;
if ($nocache || !$self->{'cached-rdf'}->{$uri})
{
if ($type eq 'application/xhtml+xml'
or $type eq 'text/html'
or $type eq 'application/atom+xml'
or $type eq 'image/svg+xml')
{
my $config = RDF::RDFa::Parser::Config->new(
$type,
'1.1',
initial_context => 'http://www.w3.org/2003/g/data-view',
);
my $parser = RDF::RDFa::Parser->new($document, $uri, $config);
return $parser->graph if $nocache;
$self->{'cached-rdf'}->{$uri} = $parser->graph;
}
else
{
if (blessed($document))
{
$document = $document->toString;
}
my $model = RDF::Trine::Model->temporary_model;
my $pclass = $RDF::Trine::Parser::media_types{ $type };
my $parser = ($pclass && $pclass->can('new'))
? $pclass->new
: RDF::Trine::Parser::RDFXML->new;
$parser->parse_into_model($uri, $document, $model);
return $model if $nocache;
$self->{'cached-rdf'}->{$uri} = $model;
}
}
return $self->{'cached-rdf'}->{$uri};
}
sub _resolve_relative_ref
{
my ($self, $ref, $base) = @_;
return $ref unless $base; # keep relative unless we have a base URI
if ($ref =~ /^([a-z][a-z0-9\+\.\-]*)\:/i)
{
return $ref; # already an absolute reference
}
# create absolute URI
my $abs = URI->new_abs($ref, $base)->canonical->as_string;
while ($abs =~ m!^(http://.*)(\.\./|\.)+(\.\.|\.)?$!i)
{ $abs = $1; } # fix edge case of 'http://example.com/../../../'
return $abs;
}
1;
__END__
=head1 NAME
XML::GRDDL - transform XML and XHTML to RDF
=head1 SYNOPSIS
High-level interface:
my $grddl = XML::GRDDL->new;
my $model = $grddl->data($xmldoc, $baseuri);
# $model is an RDF::Trine::Model
Low-level interface:
my $grddl = XML::GRDDL->new;
my @transformations = $grddl->discover($xmldoc, $baseuri);
foreach my $t (@transformations)
{
# $t is an XML::GRDDL::Transformation
my ($output, $mediatype) = $t->transform($xmldoc);
# $output is a string of type $mediatype.
}
=head1 DESCRIPTION
GRDDL is a W3C Recommendation for extracting RDF data from arbitrary
XML and XHTML via a transformation, typically written in XSLT. See
L<http://www.w3.org/TR/grddl/> for more details.
This module implements GRDDL in Perl. It offers both a low level interface,
allowing you to generate a list of transformations associated with the
document being processed, and thus the ability to selectively run the
transformation; and a high-level interface where a single RDF model
is returned representing the union of the RDF graphs generated by
applying all available transformations.
=head2 Constructor
=over 4
=item C<< XML::GRDDL->new >>
The constructor accepts no parameters and returns an XML::GRDDL
object.
=back
=head2 Methods
=over 4
=item C<< $grddl->discover($xml, $base, %options) >>
Processes the document to discover the transformations associated
with it. $xml is the raw XML source of the document, or an
XML::LibXML::Document object. ($xml cannot be "tag soup" HTML,
though you should be able to use L<HTML::HTML5::Parser> to
parse tag soup into an XML::LibXML::Document.) $base is the
base URI for resolving relative references.
Returns a list of L<XML::GRDDL::Transformation> objects.
Options include:
=over 4
=item * B<force_rel> - boolean; interpret XHTML rel="transformation" even in the absence of the GRDDL profile.
=item * B<strings> - boolean; return a list of plain strings instead of blessed objects.
=back
=item C<< $grddl->data($xml, $base, %options) >>
Processes the document, discovers the transformations associated
with it, applies the transformations and merges the results into a
single RDF model. $xml and $base are as per C<discover>.
Returns an RDF::Trine::Model containing the data. Statement contexts
(a.k.a. named graphs / quads) are used to distinguish between data
from the result of each transformation.
Options include:
=over 4
=item * B<force_rel> - boolean; interpret XHTML rel="transformation" even in the absence of the GRDDL profile.
=item * B<metadata> - boolean; include provenance information in the default graph (a.k.a. nil context).
=back
=item C<< $grddl->ua( [$ua] ) >>
Get/set the user agent used for HTTP requests. $ua, if supplied, must be
an LWP::UserAgent.
=back
=head2 Constants
These constants may be exported upon request.
=over
=item C<< GRDDL_NS >>
=item C<< XHTML_NS >>
=back
=head1 FEATURES
XML::GRDDL supports transformations written in XSLT 1.0, and in RDF-EASE.
XML::GRDDL is a good HTTP citizen: Referer headers are included in requests,
and appropriate Accept headers supplied. To be an even better citizen, I
recommend changing the User-Agent header to advertise the name of the
application:
$grddl->ua->default_header(user_agent => 'MyApp/1.0 ');
Provenance information for GRDDL transformations is returned using the
GRDDL vocabulary at L<http://www.w3.org/2003/g/data-view#>.
Certain XHTML profiles and XML namespaces known not to contain any
transformations, or to contain useless transformations are skipped. See
L<XML::GRDDL::Namespace> and L<XML::GRDDL::Profile> for details. In
particular profiles for RDFa and many Microformats are skipped, as
L<RDF::RDFa::Parser> and L<HTML::Microformats> will typically yield
far superior results.
=head1 BUGS
Please report any bugs to L<http://rt.cpan.org/>.
Known limitations:
=over 4
=item * Recursive GRDDL doesn't work yet.
That is, the profile documents and namespace documents linked to from
your primary document cannot themselves rely on GRDDL.
=back
=head1 SEE ALSO
L<XML::GRDDL::Transformation>,
L<XML::GRDDL::Namespace>,
L<XML::GRDDL::Profile>,
L<XML::GRDDL::Transformation::RDF_EASE::Functional>,
L<XML::Saxon::XSLT2>.
L<HTML::HTML5::Parser>,
L<RDF::RDFa::Parser>,
L<HTML::Microformats>.
L<JSON::GRDDL>.
L<http://www.w3.org/TR/grddl/>.
L<http://www.perlrdf.org/>.
This module is derived from Swignition L<http://buzzword.org.uk/swignition/>.
=head1 AUTHOR
Toby Inkster E<lt>tobyink@cpan.orgE<gt>.
=head1 COPYRIGHT AND LICENCE
Copyright 2008-2012 Toby Inkster
This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=head1 DISCLAIMER OF WARRANTIES
THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.