Yet Another CPAN Grep

Data-SplitSerializer/lib/Data/SplitSerializer.pm

package Data::SplitSerializer;

our $VERSION = '0.91'; # VERSION
# ABSTRACT: Modules that "split serialize" data structures

#############################################################################
# Modules

use sanity;
use Moo;
use Types::Standard qw(Bool Str HashRef InstanceOf HasMethods);

use Module::Runtime qw( use_module );
use Hash::Merge;
use Try::Tiny;
use Scalar::Util qw( blessed );

use namespace::clean;
no warnings 'uninitialized';

#############################################################################
# Custom Hash::Merge behaviors

my $default_behavior = 'LEFT_PRECEDENT_STRICT_ARRAY_INDEX';

Hash::Merge::specify_behavior(
   {
      # NOTE: Undef is still considered 'SCALAR'.
      SCALAR => {
         SCALAR => sub { $_[1] },
         ARRAY  => sub {
            return $_[1] unless defined $_[0];
            die sprintf('mismatched type (%s vs. %s) found during merge: $scalar = %s', 'SCALAR', 'ARRAY', $_[0]);
         },
         HASH   => sub {
            return $_[1] unless defined $_[0];
            die sprintf('mismatched type (%s vs. %s) found during merge: $scalar = %s', 'SCALAR', 'HASH',  $_[0]);
         },
      },
      ARRAY => {
         SCALAR => sub {
            return $_[0] unless defined $_[1];
            die sprintf('mismatched type (%s vs. %s) found during merge: $scalar = %s', 'ARRAY', 'SCALAR', $_[1]);
         },
         ARRAY  => sub {
            # Handle arrays by index, not by combining
            my ($l, $r) = @_;
            $l->[$_] = $r->[$_] for (
               grep { defined $r->[$_] }
               (0 .. $#{$_[1]})
            );
            return $l;
         },
         HASH   => sub { die sprintf('mismatched type (%s vs. %s) found during merge', 'ARRAY', 'HASH'); },
      },
      HASH => {
         SCALAR => sub {
            return $_[0] unless defined $_[1];
            die sprintf('mismatched type (%s vs. %s) found during merge: $scalar = %s', 'HASH', 'SCALAR', $_[1]);
         },
         ARRAY  => sub { die sprintf('mismatched type (%s vs. %s) found during merge', 'HASH', 'ARRAY'); },
         HASH   => sub { Hash::Merge::_merge_hashes( $_[0], $_[1] ) },
      },
   },
   $default_behavior,
);

#############################################################################
# Attributes

has _merge_obj => (
   is      => 'rw',
   isa     => InstanceOf['Hash::Merge'],
   default => sub { Hash::Merge->new($default_behavior); },
   handles => { qw(
      merge             merge
      specify_behavior  specify_merge_behavior
      set_behavior      set_merge_behavior
   ) },
);

has path_style => (
   is      => 'ro',
   isa     => Str,
   default => sub { 'DZIL' },
   coerce  => sub {
      'Parse::Path::'.$_[0] unless ($_[0] =~ s/^\=//);  # NOTE: kill two birds with one stone
   },
);

has path_options => (
   is      => 'ro',
   isa     => HashRef,
   default => sub { {
      auto_normalize => 1,
      auto_cleanup   => 1,
   } },
);

has remove_undefs => (
   is      => 'ro',
   isa     => Bool,
   default => sub { 1 },
);

#############################################################################
# Pre/post-BUILD


sub BUILD {
   my $self = $_[0];

   # Load the path class
   use_module $self->path_style;

   return $self;
}

#############################################################################
# Methods

### FLATTENING ###

sub serialize {
   my ($self, $ref) = @_;

   my $type = ref $ref;
   die 'Reference must be an unblessed HASH or ARRAY!'
      unless (defined $ref && !blessed $ref && $type =~ /HASH|ARRAY/);

   return $self->serialize_refpath('', $ref);
}

sub serialize_refpath {
   my ($self, $path, $ref) = @_;
   $path //= '';

   my $prh = { $path => $ref };  # single row answer

   return $prh if blessed $ref;  # down that path leads madness...
   my $type = ref $ref || return $prh;        # that covers SCALARs...
   return $prh unless $type =~ /HASH|ARRAY/;  # ...and all other endpoints

   # Blessed is the path
   unless (blessed $path) {
      $path = $self->path_style->new(
         %{ $self->path_options },
         stash_obj => $self,
         path => $path,
      );
   }

   die sprintf("Too deep down the rabbit hole, stopped at '%s'", $path)
      if ($path->step_count > 255);

   my $hash = {};
   my @keys = $type eq 'HASH' ? (keys %$ref) : (0 .. $#$ref);
   foreach my $key (@keys) {
      my $val = $type eq 'HASH' ? $ref->{$key} : $ref->[$key];

      # Add on to $path
      my $newpath = $path->clone;
      $newpath->push( $newpath->key2hash($key, $type) );

      # Recurse back to give us a full set of $path => $val pairs
      my $newhash = $self->serialize_refpath($newpath, $val);

      # Merge (shallowly)
      $hash->{$_} = $newhash->{$_} for (grep { defined $newhash->{$_} or !$self->remove_undefs } keys %$newhash);
   }

   return $hash;
}

### EXPANSION ###

sub deserialize {
   my ($self, $hash) = @_;

   my $root;  # not sure if it's a hash or array yet
   foreach my $path (sort keys %$hash) {
      my $branch = $self->deserialize_pathval($path, $hash->{$path}) || return;  # error already set

      # New root?
      unless (defined $root) {
         $root = $branch;
         next;
      }

      # Our merge behavior might die on us (or Hash::Merge itself)
      my $err;
      try   { $root = $self->merge($root, $branch); }
      catch { $err = $_; };

      # Add path to error
      die sprintf("In path '%s', %s", $path, $err) if ($err);
   }

   return $root;
}

sub deserialize_pathval {
   my ($self, $path, $val) = @_;

   my ($root, $leaf, $hash_steps);
   $path = $self->path_style->new(
      %{ $self->path_options },
      path => $path,
   );

   for my $i (0 .. $path->step_count - 1) {
      my $hash_step = $path->_path->[$i];
      my $next_step = ($i == $path->step_count - 1) ? undef : $path->_path->[$i+1];

      # Construct $root if we need to
      $root = $leaf = ( $hash_step->{type} eq 'HASH' ? {} : [] ) unless ($i);

      # Add in the key, construct the next ref, and move the leaf forward
      my $type_str = substr($hash_step->{type}, 0, 1);
      $type_str   .= substr($next_step->{type}, 0, 1) if $next_step;

      my $key = $hash_step->{key};

      # (RIP for/when)
      if    ($type_str eq 'HH') { $leaf = $leaf->{$key} = {};   }
      elsif ($type_str eq 'HA') { $leaf = $leaf->{$key} = [];   }
      elsif ($type_str eq 'AH') { $leaf = $leaf->[$key] = {};   }
      elsif ($type_str eq 'AA') { $leaf = $leaf->[$key] = [];   }
      elsif ($type_str eq 'H')  {         $leaf->{$key} = $val; }
      elsif ($type_str eq 'A')  {         $leaf->[$key] = $val; }
   }

   return $root;
}

42;

__END__

=pod

=encoding utf-8

=head1 NAME

Data::SplitSerializer - Modules that "split serialize" data structures

=head1 SYNOPSIS

    use Data::SplitSerializer;
 
    my $dss = Data::SplitSerializer->new( path_style => 'DZIL' );
    my $serialized = {
       'gophers[0].holes'      => 3,
       'gophers[0].food.type'  => 'grubs',
       'gophers[0].food.count' => 7,
 
       'gophers[1].holes'      => 1,
       'gophers[1].food.type'  => 'fruit',
       'gophers[1].food.count' => 5,
    };
    my $deserialized = $dss->deserialize($serialized);
 
    my $more_gophers = [];
    $more_gophers->[2] = {
       holes => 2,
       food  => {
          type  => 'earthworms',
          count => 15,
       },
    };
 
    $deserialized = $dss->merge( $deserialized, $more_gophers );

=head1 DESCRIPTION

Split serialization is a unique form of serialization that only serializes part of the data structure (as a path on the left side) and
leaves the rest of the data, typically a scalar, untouched (as a value on the right side).  Consider the gopher example above:

    my $deserialized = {
       gophers => [
          {
             holes => 3,
             food  => {
                type  => 'grubs',
                count => 7,
             },
          },
          {
             holes => 1,
             food  => {
                type  => 'fruit',
                count => 5,
             },
          },
          {
             holes => 2,
             food  => {
                type  => 'earthworms',
                count => 15,
             },
          }
       ],
    };

A full serializer, like L<Data::Serializer> or L<Data::Dumper>, would turn the entire object into a string, much like the real code
above.  Or into JSON, XML, BerkleyDB, etc.  But, the end values would be lost in the stream.  If you were given an object like this,
how would you be able to store the data in an easy-to-access form for a caching module like L<CHI>?  It requires keyE<sol>value pairs.  Same
goes for L<KiokuDB> or various other storageE<sol>ORM modules.

Data::SplitSerializer uses split serialization to turn the data into a path like this:

    my $serialized = {
       'gophers[0].holes'      => 3,
       'gophers[0].food.type'  => 'grubs',
       'gophers[0].food.count' => 7,
 
       'gophers[1].holes'      => 1,
       'gophers[1].food.type'  => 'fruit',
       'gophers[1].food.count' => 5,
 
       'gophers[2].holes'      => 2,
       'gophers[2].food.type'  => 'earthworms',
       'gophers[2].food.count' => 15,
    };

Now, you can stash the data into whatever storage engine you want... or use just use it as a simple hash.

=for Pod::Coverage BUILD

=head1 CONSTRUCTOR

    # Defaults shown
    my $stash = Data::Stash->new(
       path_style   => 'DZIL',
       path_options => {
          auto_normalize => 1,
          auto_cleanup   => 1,
       },
    );

Creates a new serializer object.  Accepts the following arguments:

=head2 path_style

    path_style => 'File::Unix'
    path_style => '=MyApp::Parse::Path::Foobar'

Class used to create new L<path objects|Parse::Path> for path parsing.  With a C<<< = >>> prefix, it will use that as the full
class.  Otherwise, the class will be intepreted as C<<< Parse::Path::$class >>>.

Default is L<DZIL|Parse::Path::DZIL>.

=head2 path_options

    path_options => {
       auto_normalize => 1,
       auto_cleanup   => 1,
    }

Hash of options to pass to new path objects.  Typically, the default set of options are recommended to ensure a more commutative
path.

=head2 remove_undefs

    remove_undefs => 0

Boolean to indicate whether to remove   See L</Undefined values> for more information.

Default is on.

=head1 METHODS

=head2 serialize

    my $serialized = $dss->serialize($deserialized);

SerializesE<sol>flattens a ref.  Returns a serialized hashref of pathE<sol>value pairs.

=head2 serialize_refpath

    my $serialized = $dss->serialize_refpath($path_prefix, $deserialized);
 
    # serialize is basically this with some extra sanity checks
    my $serialized = $dss->serialize_refpath('', $deserialized);

The real workhorse for C<<< serialize_ref >>>.  Recursively dives down the different pieces of the deserialized tree and eventually comes
back with the serialized hashref.  The path prefix can be used for prepending all of the paths returned in the serialized hashref.

=head2 deserialize

    my $deserialized = $dss->deserialize($serialized);

DeserializesE<sol>expands a hash of pathE<sol>data pairs.  Returns the expanded object, which is usually a hashref, but might be an arrayref.
For example:

    # Starts with an array
    my $serialized = {
       '[0].thingy' => 1,
       '[1].thingy' => 2,
    };
    my $deserialized = $dss->deserialize($serialized);
 
    # Returns:
    $deserialized = [
       { thingy => 1 },
       { thingy => 2 },
    ];

=head2 deserialize_pathval

    my $deserialized = $dss->deserialize_pathval($path, $value);

DeserializesE<sol>expands a single pathE<sol>data pair.  Returns the expanded object.

=head2 merge

    my $newhash = $dss->merge($hash1, $hash2);

Merges two hashes.  This is a direct handle to C<<< merge >>> from an (internal) L<Hash::Merge> object, and is used by L</deserialize> to
combine individual expanded objects.

=head2 set_merge_behavior

Handle to C<<< set_behavior >>> from the (internal) L<Hash::Merge> object.  B<Advanced usage only!>

Data::SplitSerializer uses a special custom type called C<<< LEFT_PRECEDENT_STRICT_ARRAY_INDEX >>>, which properly handles array
indexes and dies on any non-array-or-hash refs.

=head2 specify_merge_behavior

Handle to C<<< specify_behavior >>> from the (internal) L<Hash::Merge> object.  B<Advanced usage only!>

=head1 CAVEATS

=head2 Undefined values

Flattening will remove pathE<sol>values if the value is undefined.  This is to clean up unused array values that appeared as holes in a
sparse array.  For example:

    # From one of the basic tests
    my $round_trip = $dss->serialize( $dss->deserialize_pathval(
       'a[0][1][1][1][1][2].too' => 'long'
    ) );
 
    # Without undef removal, this returns:
    $round_trip = {
       'a[0][0]'                 => undef,
       'a[0][1][0]'              => undef,
       'a[0][1][1][0]'           => undef,
       'a[0][1][1][1][0]'        => undef,
       'a[0][1][1][1][1][0]'     => undef,
       'a[0][1][1][1][1][1]'     => undef,
       'a[0][1][1][1][1][2].too' => 'long',
    };

You can disable this with the L</remove_undefs> switch.

=head2 Refs in split serialization

Split serialization works by looking for HASH or ARRAY refs and diving further into them, adding path prefixes as it goes down.  If
it encounters some other ref (like a SCALAR), it will stop and consider that to be the value for that path.  In terms of ref parsing,
this means two things:

=over

=item 1.

Only HASH and ARRAYs can be examined deeper.

=item 2.

If you have a HASH or ARRAY as a "value", serialization cannot tell the difference and it will be included in the path.

=back

The former isn't that big of a problem, since deeper dives with other kinds of refs are either not possible or dangerous (like CODE).

The latter could be a problem if you started with a hashref with a pathE<sol>data pair, expanded it, and tried to flatten it again.  This
can be solved by protecting the hash with a REF.  Consider this example:

    my $round_trip = $dss->serialize( $dss->deserialize_pathval(
       'a[0]' => { your => 'hash' }
    ) );
 
    # Returns:
    $round_trip = {
       'a[0].your' => 'hash',
    };
 
    # Now protect the hash
    my $round_trip = $dss->serialize( $dss->deserialize_pathval(
       'a[0]' => \{ your => 'hash' }
    ) );
 
    # Returns:
    $round_trip = {
       'a[0]' => \{ your => 'hash' }
    };

=head2 Sparse arrays and memory usage

Since arrays within paths are based on indexes, there's a potential security issue with large indexes causing abnormal memory usage.
In Perl, these two arrays would have drastically different memory footprints:

    my @small;
    $small[0] = 1;
 
    my @large;
    $large[999999] = 1;

This can be mitigated by making sure the Path style you use will limit the total digits for array indexes.  L<Parse::Path> handles
this on all of its paths, but it's something to be aware of if you create your own path classes.

=head1 TODO

This module might split off into individual split serializers, but so far, this is the only one "out in the wild".

=head1 SEE ALSO

L<Parse::Path>

=head1 ACKNOWLEDGEMENTS

Kent Fredric for getting me started on the basic idea.

=head1 AVAILABILITY

The project homepage is L<https://github.com/SineSwiper/Data-SplitSerializer/wiki>.

The latest version of this module is available from the Comprehensive Perl
Archive Network (CPAN). Visit L<http://www.perl.com/CPAN/> to find a CPAN
site near you, or see L<https://metacpan.org/module/Data::SplitSerializer/>.

=for :stopwords cpan testmatrix url annocpan anno bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan

=head1 SUPPORT

=head2 Internet Relay Chat

You can get live help by using IRC ( Internet Relay Chat ). If you don't know what IRC is,
please read this excellent guide: L<http://en.wikipedia.org/wiki/Internet_Relay_Chat>. Please
be courteous and patient when talking to us, as we might be busy or sleeping! You can join
those networks/channels and get help:

=over 4

=item *

irc.perl.org

You can connect to the server at 'irc.perl.org' and talk to this person for help: SineSwiper.

=back

=head2 Bugs / Feature Requests

Please report any bugs or feature requests via L<https://github.com/SineSwiper/Data-SplitSerializer/issues>.

=head1 AUTHOR

Brendan Byrd <BBYRD@CPAN.org>

=head1 CONTRIBUTOR

Brendan Byrd <bbyrd@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2013 by Brendan Byrd.

This is free software, licensed under:

  The Artistic License 2.0 (GPL Compatible)

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.