Yet Another CPAN Grep

lsn-data-hub/lib/Data/Format/Hash.pm

package Data::Format::Hash;
use strict;
our $VERSION = 2.1;

use Exporter qw(import);
use Perl::Module;
use Data::OrderedHash;
use Error::Programatic;
use Data::Hub::Address;
use Data::Hub::Util qw(addr_parent addr_pop);

our @EXPORT = qw();
our @EXPORT_OK = qw(
  HF_VERSION_MAJOR
  HF_VERSION_MINOR
  HF_VERSION_STR
  hf_parse
  hf_format
);
our %EXPORT_TAGS = (all => [@EXPORT_OK],);

# Format version
sub HF_VERSION_MAJOR  {2}
sub HF_VERSION_MINOR  {1}
sub HF_VERSION_STR    {HF_VERSION_MAJOR . '.' . HF_VERSION_MINOR}

# Constants
our $NEWLINE            = "\n";
our $SPACE              = ' ';
our $INDENT             = '  ';

# Literal constants
our $LIT_OPEN           = '{';
our $LIT_CLOSE          = '}';
our $LIT_HASH           = '%';
our $LIT_ARRAY          = '@';
our $LIT_SCALAR         = '$';
our $LIT_ASSIGN         = '=>';
our $LIT_COMMENT        = '#';
our $LIT_COMMENT_BEGIN  = '#{';
our $LIT_COMMENT_END    = '#}';

# Used in regular expressions
our $PAT_OPEN           = $LIT_OPEN;
our $PAT_CLOSE          = $LIT_CLOSE;
our $PAT_HASH           = $LIT_HASH;
our $PAT_ARRAY          = $LIT_ARRAY;
our $PAT_SCALAR         = "\\$LIT_SCALAR";
our $PAT_ASSIGN         = "(?<!\\\\)$LIT_ASSIGN";
our $PAT_ASSIGN_STRUCT  = '[\$\%\@]';
our $PAT_ASSIGN_BLOCK   = '<<';
our $PAT_COMMENT        = $LIT_COMMENT;
our $PAT_COMMENT_BEGIN  = $LIT_COMMENT_BEGIN;
our $PAT_COMMENT_END    = $LIT_COMMENT_END;
#our $PAT_LVAL           = '[\w\d\.\_\-\s]';
our $PAT_LVAL           = '[^\{\=]';
our $PAT_IRREGULAR_LVAL = '[\{\=]';
our $PAT_UNESCAPE       = '[\%\@\$\{\}\>\=\#]'; # backward compat
our $PAT_BLOCK_END      = '[a-zA-Z0-9_-]';

# Alias

sub hf_parse {goto &parse}
sub hf_format {goto &format}

# ------------------------------------------------------------------------------
# parse - Parse text into perl data structures
# parse \$text, [options]
# options:
#   -as_array=1         # Treat text as an array list (and return an array ref)
#   -hint=hint          # Usually a filename, used in debug/error output
# ------------------------------------------------------------------------------
#|test(!abort) use Data::Format::Hash qw(hf_parse);
#|test(match) # Parse a simple nested collection
#|my $d = q(
#|baz => biz
#|foo => %{
#|  bar => @{
#|    tender
#|    stool
#|  }
#|}
#|);
#|my $h = hf_parse(\$d);
#|$$h{'baz'};
#=biz
# ------------------------------------------------------------------------------

sub parse {
  my ($opts, $str) = my_opts(\@_, {
    'hint'      => undef,
    'as_array'  => 0,
  });
  my $text = isa($str, 'SCALAR') ? $str : \$str;
  my $root = $$opts{'into'} ? $$opts{'into'} : ();
  $root ||= $$opts{'as_array'} ? [] : Data::OrderedHash->new();
  my $ptr = $root;
  my $block_comment = 0;
  my $block_text = 0;
  my @parents = ();
  local $. = 0;

  for (split /\r?\n\r?/, $$text) {
    $.++;

    if ($block_comment) {
      # End of a block comment?
      /\s*$PAT_COMMENT_END/ and do {
        next if (ref($ptr) eq 'SCALAR');
        _trace($., "comment-e", $_);
        $block_comment = 0;
        next;
      };
      _trace($., "comment+", $_);
      next;
    }

    if ($block_text) {
      # End of a text block?
      /\s*$block_text\s*/ and do {
        _trace($., "txtblk-e", $_);
        $block_text = 0;
        $ptr = pop @parents;
        next;
      };
      _trace($., "txtblk+", $_);
      $$ptr .= $$ptr ? $NEWLINE . _unescape($_) : _unescape($_);
      next;
    }

    # Begin of a new hash structure
    /^\s*$PAT_HASH($PAT_LVAL*)\s*$PAT_OPEN\s*$/ and do {
      _trace($., "hash", $_);
      push @parents, $ptr;
      my $h = Data::OrderedHash->new();
      my $var_name = _trim_whitespace(\$1);
      isa($ptr, 'HASH') and $ptr->{$var_name} = $h;
      isa($ptr, 'ARRAY') and push @$ptr, $h;
      $ptr = $h;
      next;
    };

    # Begin of a new array structure
    /^\s*$PAT_ARRAY($PAT_LVAL*)\s*$PAT_OPEN\s*$/ and do {
      _trace($., "array", $_);
      push @parents, $ptr;
      my $a = [];
      my $var_name = _trim_whitespace(\$1);
      isa($ptr, 'HASH') and $ptr->{$var_name} = $a;
      isa($ptr, 'ARRAY') and push @$ptr, $a;
      $ptr = $a;
      next;
    };

    # Begin of a new scalar structure
    /^\s*$PAT_SCALAR($PAT_LVAL*)\s*$PAT_OPEN\s*$/ and do {
      _trace($., "scalar", $_);
      push @parents, $ptr;
      if (isa($ptr, 'HASH')) {
        my $var_name = _trim_whitespace(\$1);
        $ptr->{$var_name} = '';
        $ptr = \$ptr->{$var_name};
      } elsif (isa($ptr, 'ARRAY')) {
        push @$ptr, '';
        $ptr = \$ptr->[$#$ptr];
      }
      next;
    };

    # A block comment
    /^\s*$PAT_COMMENT_BEGIN/ and do {
      next if (ref($ptr) eq 'SCALAR');
      _trace($., "comment-b", $_);
      $block_comment = 1;
      next;
    };

    # A one-line comment
    /^\s*$PAT_COMMENT/ and do {
      if ($. == 1) {
        _trace($., "crown", $_);
        my @parts = split '\s';
        if (@parts >= 3 && $parts[0] =~ /^Hash(File|Format)$/) {
          my ($major, $minor) = split '\.', $parts[2];
          if ($major > HF_VERSION_MAJOR) {
            die "Hash format version '$major' is too new",
                _get_hint($., $_, $$opts{'hint'});
          }
        }
      } else {
        _trace($., "comment", $_);
      }
      next unless (ref($ptr) eq 'SCALAR');
    };

    # A one-line hash member value
    /^\s*($PAT_LVAL+)\s*$PAT_ASSIGN\s*(.*)/ and do {
      my $lval = $1;
      my $rval = $2;
      my $var_name = _trim_whitespace(\$lval);

      # Structure assignment
      $rval =~ /^($PAT_ASSIGN_STRUCT)\s*$PAT_OPEN\s*$/ and do {
        _trace($., "assign-$1", $_);
        unless (isa($ptr, 'HASH')) {
          warn "Cannot assign structure to '$ptr'",
              _get_hint($., $_, $$opts{'hint'});
          next;
        }
        push @parents, $ptr;
        if ($1 eq $LIT_HASH) {
          my $h = Data::OrderedHash->new();
          $ptr->{$var_name} = $h;
          $ptr = $h;
        } elsif ($1 eq $LIT_ARRAY) {
          my $a = [];
          $ptr->{$var_name} = $a;
          $ptr = $a;
        } elsif ($1 eq $LIT_SCALAR) {
          $ptr->{$var_name} = '';
          $ptr = \$ptr->{$var_name};
        } else {
          warn "Unexpected structure assignment",
              _get_hint($., $_, $$opts{'hint'});
        }
        next;
      };

      # Block assignment
      $rval =~ /$PAT_ASSIGN_BLOCK\s*($PAT_BLOCK_END+)\s*$/ and do {
        _trace($., "txtblk", $_);
        push @parents, $ptr;
        if (isa($ptr, 'HASH')) {
          $ptr->{$var_name} = '';
          $ptr = \$ptr->{$var_name};
        } elsif (isa($ptr, 'ARRAY')) {
          push @$ptr, '';
          $ptr = \$ptr->[$#$ptr];
        }
        $block_text = $1;
        next;
      };

      # Value assignment
      _trace($., "assign", $_);
      unless (isa($ptr, 'HASH')) {
        warn "Cannot assign variable to '$ptr'", _get_hint($., $_, $$opts{'hint'});
        isa($ptr, 'ARRAY') and push @$ptr, $_;
        isa($ptr, 'SCALAR') and $$ptr .= $_;
        next;
      }
      $ptr->{$var_name} = _unescape($rval);
      next;
    };

    # Close a structure
    /^\s*$PAT_CLOSE\s*$/ and do {
      _trace($., "close", $_);
      $ptr = pop @parents;
      unless (defined $ptr) {
        warn "No parent" . _get_hint($., $_, $$opts{'hint'});
      }
      next;
    };

    # A one-line array item
    ref($ptr) eq 'ARRAY' and do {
      _trace($., "array+", $_);
      s/^\s+//g;
      next if $_ eq ''; # Could be a blank line (arrays of hashes)
      push @$ptr, _unescape($_);
      next;
    };

    # Part of a scalar
    ref($ptr) eq 'SCALAR' and do {
      _trace($., "scalar+", $_);
      $$ptr .= $$ptr ? $NEWLINE . _unescape($_) : _unescape($_);
#     $$ptr .= $$ptr ? $NEWLINE . $_ : $_;
      next;
    };

    _trace($., "?", $_);
  }

  warn "Unclosed structure" . _get_hint($., 'EOF', $$opts{'hint'}) if @parents > 1;
  return $root;
}

# ------------------------------------------------------------------------------
# _trace - Debug output while parsing
# ------------------------------------------------------------------------------

sub _trace {
# warn sprintf("%4d: %10s %s\n", @_);
}

# ------------------------------------------------------------------------------
# _make_crown - Return the hash-format file crown
# ------------------------------------------------------------------------------

sub _make_crown {
  return '# HashFile ' . HF_VERSION_STR . "\n";
}

# ------------------------------------------------------------------------------
# format - Format nested data structure as string
# format [options]
#
# options:
#
#   -as_ref => 1        Return a reference (default 0)
#   -with_crown => 1    Prepend output with "# HashFile M.m" (where M.m is version)
# ------------------------------------------------------------------------------
#|test(!abort) use Data::Format::Hash qw(hf_format);
#|test(match) # Format a simple nested collection
#|my $d = {foo=>{bar=>['tender','stool']}};
#|hf_format($d)
#=foo => %{
#=  bar => @{
#=    tender
#=    stool
#=  }
#=}
# ------------------------------------------------------------------------------

sub format {
  my ($opts, $ref) = my_opts(\@_, {
    'as_ref' => 0, # Return a scalar reference instead of a scalar
    'with_crown' => 0, # Prepend the '# HashFile 2.1' firstline
    'indent_level' => 0, # Base indentation level
  });
  croak "Provide a reference" unless ref($ref);
  $opts->{'addr'} ||= new Data::Hub::Address();
  my $result = $$opts{'with_crown'} ? _make_crown() : '';
  _format($ref, undef, $$opts{'indent_level'}, undef, $opts, \$result);
  chomp $result;
  return $$opts{'as_ref'} ? \$result : $result;
}

# ------------------------------------------------------------------------------
# _format - Implementation of format
# ------------------------------------------------------------------------------

sub _format {
  my $ref = shift;
  my $name = shift;
  my $level = shift || 0;
  my $parent = shift;
  my $opts = shift;
  my $result = shift || str_ref();
  my $is_named = defined $name && $name ne ''; # 0 is a valid name

  # TODO handle undefined names (when other than level 0) otherwise the error
  # will happen on the parsing, i.e., a bad format is created.

  # Tame beastly names
  if ($is_named && $name =~ /$PAT_IRREGULAR_LVAL/) {
    $name =~ s/([^A-Za-z0-9_])/sprintf("_0x%2x_", unpack("C", $1))/eg;
    #TODO to unpack the name, add this to the `parse` function:
    #$str =~ s/_0x([a-fA-F0-9][a-fA-F0-9])_/pack("C",hex($1))/eg;
  }

  $is_named and $name =~ s/&#x([a-fA-F0-9]{2,3});/pack("C",hex($1))/eg;

  if ($opts->{ignore} && grep_first(sub {isa($ref, $_)}, @{$opts->{ignore}})) {

    $$result .= _get_indent($level) . $LIT_COMMENT.$SPACE;
    $$result .= $name.$SPACE.$LIT_ASSIGN.$SPACE if $is_named;
    $$result .= ref($ref).$NEWLINE;

  } elsif (isa($ref, 'HASH') || isa($ref, 'ARRAY')) {

    # Structure declaration and name
    if ($level > 0) {
      my $symbol = isa($ref, 'HASH') ? $LIT_HASH : $LIT_ARRAY;
#     if (defined $parent && isa($parent, 'HASH')) {
      if ($is_named) {
        $$result .= _get_indent($level) 
          .$name.$SPACE.$LIT_ASSIGN.$SPACE.$symbol.$LIT_OPEN.$NEWLINE;
      } else {
#       $$result .= _get_indent($level) .$symbol.$name.$LIT_OPEN.$NEWLINE;
        $$result .= _get_indent($level) .$symbol.$LIT_OPEN.$NEWLINE;
      }
    }

    # Contents
    if (isa($ref, 'HASH')) {
      $level++;
      for (keys %$ref) {
        $opts->{'addr'}->push($_);
        if (ref($$ref{$_})) {
          $$result .= ${_format($$ref{$_}, $_, $level, $ref, $opts)};
        } else {
          $$result .= ${_format(\$$ref{$_}, $_, $level, $ref, $opts)};
        }
        $opts->{'addr'}->pop();
      }
      $level--;
    } elsif (isa($ref, 'ARRAY')) {
      $level++;
      my $idx = 0;
      for (@$ref) {
        $opts->{'addr'}->push($idx++);
        $$result .= ref($_) ?
          ${_format($_, '', $level, $ref, $opts)} :
          ${_format(\$_, '', $level, $ref, $opts)};
        $opts->{'addr'}->pop();
      }
      $level--;
    }

    # Close the structure
    $$result .= _get_indent($level) . $LIT_CLOSE.$NEWLINE
      if $level > 0;

  } elsif (ref($ref) eq 'SCALAR') {

    my $value = $$ref;
    $value = '' unless defined $value;

    # Scalar
    if (index($value, "\n") > -1 || $value =~ /^\s+/
        || (defined $parent && isa($parent, 'ARRAY') && $value eq '')) {
      $$result .= _get_indent($level);
      if (defined $parent && isa($parent, 'HASH')) {
        $$result .= $name.$SPACE.$LIT_ASSIGN.$SPACE.$LIT_SCALAR.$LIT_OPEN.$NEWLINE;
      } else {
        $$result .= $LIT_SCALAR.$name.$LIT_OPEN.$NEWLINE;
      }
      # Write a scalar block to protect data
      $$result .= _escape($value).$NEWLINE;
      $$result .= _get_indent($level) .$LIT_CLOSE.$NEWLINE;
    } else {
      # One-line scalar (key/value)
      if ($is_named) {
        $$result .= _get_indent($level) .
        $name.$SPACE.$LIT_ASSIGN.$SPACE.$value.$NEWLINE;
      } else {
        $$result .= _get_indent($level)._escape($value).$NEWLINE;
      }
    }

  } elsif (isa($ref, 'JSON::XS::Boolean')) {
      $$result .= _get_indent($level) .
      $name.$SPACE.$LIT_ASSIGN.$SPACE.$ref.$NEWLINE;
  } else {
#   $ref = '' unless defined $ref;
    $$result .= _get_indent($level) . $LIT_COMMENT.$SPACE;
    $$result .= $name.$SPACE.$LIT_ASSIGN.$SPACE if $is_named;
    $$result .= $ref.'('.ref($ref).')'.$NEWLINE;
  }
  return $result;
}

sub _trim_whitespace {
  my $result = ${$_[0]};
  $result =~ s/^\s+|\s+$//g;
  return $result;
}

# ------------------------------------------------------------------------------
# _escape - Esacape patterns which would be interpred as control characters
# ------------------------------------------------------------------------------

sub _escape {
  my $result = $_[0];
  $result =~ s/(?<!\\)(=>|[\$\@\%]\{)/\\$1/g;
  $result =~ s/^(\s*)(?<!\\)\}/$1\\\}/gm;
  $result =~ s/^(\s*)(?<!\\)#/$1\\#/gm;
  return $result;
}#_escape

# ------------------------------------------------------------------------------
# _unescape - Remove protective backslashes
# ------------------------------------------------------------------------------

sub _unescape {
  my $result = $_[0];
# $result =~ s/\\($PAT_UNESCAPE)/$1/g;
  $result =~ s/\\(=>|[\$\@\%]\{)/$1/g;
  $result =~ s/^(\s*)\\\}/$1\}/gm;
  $result =~ s/^(\s*)\\#/$1#/gm;
  return $result;
}#_unescape

# ------------------------------------------------------------------------------
# _get_indent - Get the indent for formatting nested sructures
# _get_indent $level
# ------------------------------------------------------------------------------

sub _get_indent {
  my $indent = $INDENT;
  return $_[0] > 1 ? $indent x= ($_[0] - 1): '';
}

# ------------------------------------------------------------------------------
# _get_hint - Context information for error messages
# _get_hint $line_num, $line_text
# ------------------------------------------------------------------------------

sub _get_hint {
  my $result = '';
  if (defined $_[2]) {
    $result = " ($_[2])";
  }
  my $str =  substr($_[1], 0, 40);
  $str =~ s/^\s+//g;
  $result .= " at line $_[0]: '$str'";
  return $result;
}

1;

__END__

=pod:summary Parse and format perl data structures

=pod:synopsis

=pod:description

=cut
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.