Plack-App-RDF-Files/lib/Plack/App/RDF/Files.pm
package Plack::App::RDF::Files;
use strict;
use warnings;
use v5.10;
use parent 'Plack::Component';
use Plack::Util;
use Plack::Request;
use Plack::Middleware::ConditionalGET;
use Plack::Util;
use RDF::Trine qw(statement iri);
use File::Spec::Functions qw(catfile catdir);
use URI;
use Scalar::Util qw(blessed reftype);
use Carp qw(croak);
use Digest::MD5;
use HTTP::Date;
use List::Util qw(max);
use Encode qw(is_utf8 encode_utf8);
use Moo;
our $VERSION = '0.12';
our %FORMATS = (
ttl => 'Turtle',
nt => 'NTriples',
n3 => 'Notation3',
json => 'RDFJSON',
rdfxml => 'RDFXML'
);
has base_dir => (
is => 'ro', required => 1,
isa => sub { die "base_dir not found" unless -d $_[0] },
);
has base_uri => (
is => 'ro', coerce => sub { URI->new($_[0]) },
);
has file_types => (
is => 'ro',
default => sub { [qw(rdfxml nt ttl)] },
coerce => sub {
my $types = join '|', @{$_[0]}; qr/^($types)$/;
},
);
has path_map => (
is => 'ro', default => sub { sub { $_[0] } }
);
has index_property => (
is => 'rw',
coerce => sub {
$_[0] ?
iri($_[0] eq '1' ? 'http://www.w3.org/2000/01/rdf-schema#seeAlso' : $_[0])
: undef
},
);
has namespaces => (
is => 'ro'
);
has normalize => (
is => 'ro',
coerce => sub {
return unless $_[0];
require Unicode::Normalize;
$_[0] =~ /^nf(k?[dc])$/i ? uc($1) : die "unknown normalization form: $_[0]";
}
);
# find out which URI to retrieve for
sub _uri {
my ($self, $env) = @_;
my $req = Plack::Request->new($env);
if (!$env->{'rdf.uri'}) {
$env->{'rdf.uri'} = URI->new(
($self->base_uri // $req->base) . $self->_path($env)
);
} elsif (!ref $env->{'rdf.uri'}) {
$env->{'rdf.uri'} = URI->new( $env->{'rdf.uri'} );
}
$env->{'rdf.uri'};
}
sub _path {
my ($self, $env) = @_;
my $path = substr(Plack::Request->new($env)->path, 1);
if ($path =~ /^[a-z0-9:\._@\/-]*$/i and $path !~ /\.\.\/|^\//) {
return $path;
} else {
return;
}
}
sub _dir {
my ($self, $path) = @_;
my $dir = catdir( $self->base_dir, $self->path_map->($path) );
return (-d $dir ? $dir : undef);
}
sub files {
my ($self, $env) = @_;
my $path = $self->_path($env);
return if !defined $path;
return if $path eq '' and !$self->index_property;
my $dir = $self->_dir($path);
return unless eval { -r $dir };
return unless opendir(my $dh, $dir);
my $files = { };
while ( readdir $dh ) {
next if $_ !~ /\.(\w+)$/;
next if $1 !~ $self->file_types;
my @stat = stat(catfile($dir,$_));
$files->{$_} = {
location => $dir,
size => $stat[7],
mtime => $stat[9],
}
}
closedir $dh;
return $files;
}
sub index_statements {
my ($self, $req) = @_;
my $uri = $self->_uri($req->env);
my $subject = iri($uri);
my $predicate = $self->index_property;
my $statements = [ ];
my $path = $self->_path($req->env);
return [ ] if !defined $path;
my $dir = $self->_dir($path);
if ( opendir(my $dirhandle, $dir) ) {
foreach my $p (readdir $dirhandle) {
next unless -d catdir( $dir, $p ) and $p !~ /^\.\.?$/;
push @$statements, statement(
$subject,
$predicate,
RDF::Trine::Node::Resource->new( "$uri$p" )
);
}
closedir $dirhandle;
}
return $statements;
}
sub call {
my ($self, $env) = @_;
my $req = Plack::Request->new($env);
return [405, ['Content-type' => 'text/plain', 'Allow' => 'GET,HEAD'], ['Method not allowed']]
unless (($req->method eq 'GET') || ($req->method eq 'HEAD'));
# find out which RDF files to retrieve
my $files = $self->files($env);
return [404, ['Content-type' => 'text/plain'], ['Not found']]
unless $files;
my $uri = $self->_uri($env);
my $headers = $self->headers($files);
# negotiate serialization format
my $serializer;
eval {
my %options = (
base_url => $env->{'rdf.uri'},
namespaces => $self->namespaces // { },
);
if ( $env->{'negotiate.format'} ) {
my $format = $FORMATS{$env->{'negotiate.format'}}
// $env->{'negotiate.format'};
$serializer = RDF::Trine::Serializer->new($format, %options);
} else {
my $type;
($type, $serializer) = RDF::Trine::Serializer->negotiate(
request_headers => $req->headers, %options );
$headers->set('Content-type' => $type);
$headers->set( Vary => 'Accept');
}
};
if ($@ or !$serializer) { # RDF::Trine::Error::SerializationError
$serializer = RDF::Trine::Serializer->new('NTriples', base_uri => $uri);
$headers->set('Content-type' => 'text/plain');
}
# don't bother parsing and serializing on HEAD request or conditional GET
# (this implies these requests will not detect RDF parsing errors)
if ( Plack::Middleware::ConditionalGET->etag_matches($headers, $env) ||
Plack::Middleware::ConditionalGET->not_modified_since($headers, $env) ) {
return [304, $headers->headers, []];
}
if ($req->method eq 'HEAD') {
return [200, $headers->headers, []];
}
# parse RDF
my $model = RDF::Trine::Model->new;
my $triples = 0;
my $add_statement = $self->normalize
? sub {
my $s = $_[0];
if ($s->object->is_literal) {
my $value = Unicode::Normalize::normalize($self->normalize, $s->object->literal_value);
$s->object->literal_value($value);
}
$model->add_statement($s);
}
: sub {
$model->add_statement($_[0]);
};
while (my ($name, $file) = each %$files) {
my $fullname = catdir($file->{location},$name);
my $parser = RDF::Trine::Parser->guess_parser_by_filename( $fullname );
$parser = $parser->new unless ref $parser;
eval { # parse file into model
$model->begin_bulk_ops;
$parser->parse_file( $uri, $fullname, $add_statement );
$model->end_bulk_ops();
};
if ($@) {
$file->{error} = $@;
} else {
$file->{triples} = $model->size - $triples;
$triples = $model->size;
}
}
$env->{'rdf.files'} = $files;
my $iterator = $model->as_stream;
# add listing on base URI
if ( $self->index_property and "$uri" eq ($self->base_uri // $req->base) ) {
my $stms = $self->index_statements($req);
if (@$stms) {
$iterator = $iterator->concat( RDF::Trine::Iterator::Graph->new( $stms ) );
}
}
# add axiomatic triple to empty graphs
if ($iterator->finished) {
$iterator = RDF::Trine::Iterator::Graph->new( [ statement(
iri($uri),
iri('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
iri('http://www.w3.org/2000/01/rdf-schema#Resource')
) ] );
}
# construct PSGI response
if ( $env->{'psgi.streaming'} ) {
$env->{'rdf.iterator'} = $iterator;
return sub {
my $responder = shift;
my $body = $self->_serialize_body( $serializer, $iterator );
$responder->( [ 200, $headers->headers, $body ] );
};
} else {
my $body = $self->_serialize_body( $serializer, $iterator );
return [ 200, $headers->headers, $body ];
}
}
sub _serialize_body {
my ($self, $serializer, $iterator) = @_;
# serialize as last as possible
return Plack::Util::inline_object(
getline => sub {
return if !$iterator or $iterator->finished;
my $string = '';
open ( my $fh, '>:encoding(UTF-8)', \$string );
$serializer->serialize_iterator_to_file($fh, $iterator);
close $fh;
$iterator = 0;
return $string;
},
close => sub { $iterator = 0 },
);
}
sub headers {
my ($self, $files) = @_;
# calculate Etag based on file names, locations, sizes, and mtimes
my $md5 = Digest::MD5->new;
foreach my $name (sort keys %$files) {
$md5->add( map { $files->{$name}->{$_} } sort keys %{$files->{$name}} );
}
# get last modification time
my $lastmod = max map { $_->{mtime} } values %$files;
Plack::Util::headers([
'ETag' => 'W/"'.$md5->hexdigest.'"',
'Last-Modified' => HTTP::Date::time2str($lastmod)
]);
}
use parent 'Exporter';
our @EXPORT_OK = qw(app);
sub app { Plack::App::RDF::Files->new(@_) }
1;
__END__
=head1 NAME
Plack::App::RDF::Files - serve RDF data from files
=begin markdown
# STATUS
[](https://travis-ci.org/nichtich/Plack-App-RDF-Files)
[](https://coveralls.io/r/nichtich/Plack-App-RDF-Files)
[](http://cpants.cpanauthors.org/dist/Plack-App-RDF-Files)
=end markdown
=head1 SYNOPSIS
Create and run a Linked Open Data server in one line:
plackup -e 'use Plack::App::RDF::Files "app"; app(base_dir=>"/path/to/rdf")'
In more detail, create a file C<app.psgi>:
use Plack::App::RDF::Files;
Plack::App::RDF::Files->new(
base_dir => '/path/to/rdf/', # mandatory
base_uri => 'http://example.org/' # optional
)->to_app;
Run it as web application by calling C<plackup>. Request URLs are then mapped
to URIs and directories to return data from RDF files as following:
http://localhost:5000/foo => http://example.org/foo
/path/to/rdf/foo/
/path/to/rdf/foo/*.(nt|ttl|rdfxml)
http://localhost:5000/x/y => http://example.org/x/y
/path/to/rdf/x/y/
/path/to/rdf/x/y/*.(nt|ttl|rdfxml)
In short, each subdirectory corresponds to an RDF resource.
=head1 DESCRIPTION
This L<PSGI> application serves RDF from files. Each accessible RDF resource
corresponds to a (sub)directory, located in a common based directory. All RDF
files in a directory are merged and returned as RDF graph. If no RDF data was
found in an existing subdirectory, an axiomatic triple is returned:
$REQUEST_URI <a <http://www.w3.org/2000/01/rdf-schema#Resource> .
Requesting the base directory, however will result in a HTTP 404 error unless
option C<index_property> is enabled.
HTTP HEAD and conditional GET requests are supported by ETag and
Last-Modified-Headers (see L<Plack::Middleware::ConditionalGET>).
=head1 CONFIGURATION
=over 4
=item base_dir
Mandatory base directory that all resource directories are located in.
=item base_uri
The base URI of all resources. If no base URI has been specified, the
base URI is taken from the PSGI request.
=item file_types
An array of RDF file types, given as extensions to look for. Set to
C<['rdfxml','nt','ttl']> by default.
=item index_property
By default a HTTP 404 error is returned if one tries to access the base
directory. Enable this option by setting it to 1 or to an URI, to also serve
RDF data from the base directory. By default
C<http://www.w3.org/2000/01/rdf-schema#seeAlso> is used as index property, if
enabled.
=item path_map
Optional code reference that maps a local part of an URI to a relative
directory. Set to the identity mapping by default.
=item namespaces
Optional namespaces for serialization, passed to L<RDF::Trine::Serializer>.
=item normalize
Optional Unicode Normalization form (NFD, NFKC, NFC, NFKC). Requires
L<Unicode::Normalize>.
=back
=head1 METHODS
=head2 call( $env )
Core method of the PSGI application.
The following PSGI environment variables are read and/or set by the
application.
=over 4
=item rdf.uri
The requested URI as string or L<URI> object.
=item rdf.iterator
The L<RDF::Trine::Iterator> that will be used for serializing, if
C<psgi.streaming> is set. One can use this variable to catch the RDF
data in another post-processing middleware.
=item rdf.files
An hash of source filenames, each with the number of triples (on success)
as property C<size>, an error message as C<error> if parsing failed, and
the timestamp of last modification as C<mtime>. C<size> and C<error> may
not be given before parsing, if C<rdf.iterator> is set.
=item negotiate.format
RDF serialization format (See L<Plack::Middleware::Negotiate>). Supported
values are C<ttl>, C<nt>, C<n3>, C<json>, and C<rdfxml>.
=back
If an existing resource does not contain triples, the axiomatic triple
C<< $uri rdf:type rdfs:Resource >> is returned.
=head2 files( $env )
Get a list of RDF files (as hash reference) that will be read for a given
request, given as L<PSGI> environment.
The requested URI is saved in field C<rdf.uri> of the request environment. On
success returns the base directory and a list of files, each mapped to its last
modification time. Undef is returned if the request contained invalid
characters (everything but C<a-zA-Z0-9:.@/-> and the forbidden sequence C<../>
or a sequence starting with C</>), or if called with the base URI and
C<index_property> not enabled.
=head2 headers( $files )
Get a response headers object (as provided by L<Plack::Util>::headers) with
ETag and Last-Modified from a list of RDF files given as returned by the files
method.
=head1 FUNCTIONS
=head2 app( %options )
This shortcut for C<< Plack::App::RDF::Files->new >> can be exported on request
to simplify one-liners.
=head1 SEE ALSO
Use L<Plack::Middleware::Negotiate> to add content negotiation based on
an URL parameter and/or suffix.
See L<RDF::LinkedData> for a different module to serve RDF as linked data.
See also L<RDF::Flow> and L<RDF::Lazy> for processing RDF data.
See L<http://foafpress.org/> for a similar approach in PHP.
=head1 COPYRIGHT AND LICENSE
Copyright Jakob Voss, 2014-
This library is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.
=cut