Group
Extension

Text-CSV-Flatten/lib/Text/CSV/Flatten.pm

package Text::CSV::Flatten;

use v5.014;
use strict;
use warnings;

our $VERSION = '0.04';

use JSON qw/ encode_json /;
use Text::CSV;

my @KNOWN_ARGS= qw/ column_name /;

sub new {
    my ($class, $pattern, %args)= @_;

    my $data= delete $args{data};

    my %known_args;
    @known_args{@KNOWN_ARGS}= delete @args{@KNOWN_ARGS};
    if(keys %args) {
        my $unknown_keys= join ",", keys %args;
        die "Unknown arguments: $unknown_keys";
    }

    my $self= bless {
        %known_args,
        data_matrix => {},
    }, $class;

    $self->_set_pattern($pattern);
    $self->data($data) if $data;

    return $self;
}

sub _set_pattern {
    my ($self, $pattern_definition)= @_;

    my @pattern_def= split / /, $pattern_definition;

    my %index_column_names;
    my @pattern_parts;
    for my $pattern (@pattern_def) {
        $pattern =~ /^\.(.*)$/
            or die "invalid pattern part: <$pattern>";
        my $p= $1;
        my @pattern= split /\./, $p;

        my @index_column_names= map { /^<(.*)>$/ ? $1 : () } @pattern;
        $index_column_names{ join("\0", @index_column_names) }= 1;

        push @pattern_parts, \@pattern;
    }

    if(keys %index_column_names == 1) {
        $self->{index_column_names}= [ split "\0", (keys %index_column_names)[0] ];
    } else {
        die "Invalid pattern: the different pattern chunks have different index columns";
    }

    $self->{pattern_parts}= \@pattern_parts;
}

sub data {
    my ($self, $data)= @_;

    my $data_matrix= $self->{data_matrix};
    my $pattern_parts= $self->{pattern_parts};

    my @default_column_names;
    if(my $default_column_name= $self->{column_name}) {
        if(ref $default_column_name eq 'ARRAY') {
            @default_column_names= @$default_column_name;
        } else {
            @default_column_names= ($self->{column_name}) x @$pattern_parts;
        }
    }
    for my $pattern (@$pattern_parts) {
        my $has_column_name= scalar grep {
            $_ eq '*' || /^{(.*)}$/
        } @$pattern;
        $self->{_default_column_name}= shift @default_column_names
            if !$has_column_name;
        $self->_recurse_pattern($data, $pattern, [], []);
    }

    return $self;
}

sub csv {
    my ($self)= @_;

    my $data_matrix= $self->{data_matrix};
    my $index_column_names= $self->{index_column_names};

    my @records;
    my %column_names;
    for my $index (sort keys %$data_matrix) {
        my $data= $data_matrix->{$index};
        my %record;
        @record{@$index_column_names}= _deserialize_tuple($index);
        for my $column_key (keys %$data) {
            my $friendly_column_name= join "_", _deserialize_tuple($column_key);
            $record{$friendly_column_name}= $data->{$column_key};
        }
        @column_names{keys %record}= (1) x keys %record;

        push @records, \%record;
    }
    my @column_names= sort keys %column_names;
    my $render_header= scalar grep $_, @column_names;

    my $csv= Text::CSV->new({binary => 1});

    my @result;
    if($render_header) {
        if(my $status= $csv->combine(@column_names)) {
            push @result, $csv->string();
        } else {
            my $error= $csv->error_input();
            die "Error while rendering header row: $error";
        }
    }
    for my $record (@records) {
        my @columns= @$record{@column_names};
        if(my $status= $csv->combine(@columns)) {
            push @result, $csv->string();
        } else {
            my $error= $csv->error_input();
            die "Error while rendering row: $error";
        }
    }

    return join "\n", @result;
}

# utility function to iterate over key => value pairs with the added
# bonus that it also works for arrays
sub _foreach(&$) {
    my ($codeblock, $it)= @_;

    if(!defined $it || !ref $it) {
        return;
    } elsif('ARRAY' eq ref $it) {
        for my $i (0 .. @$it - 1) {
            $codeblock->($i, $it->[$i]);
        }
    } elsif('HASH' eq ref $it) {
        for my $i (keys %$it) {
            $codeblock->($i, $it->{$i});
        }
    } elsif($it->can('TO_JSON')) {
        no warnings 'prototype';            # avoid "_foreach() called too early to check prototype"
        _foreach($codeblock, $it->TO_JSON);
    } else {
        die "Can't iterate over item";
    }
}

sub _serialize_tuple {
    return pack "(S/a)*", @_;
}

sub _deserialize_tuple {
    return unpack "(S/a)*", $_[0];
}

sub _recurse_pattern {
    my ($self, $cur_data, $pattern, $column_name_prefix, $index_prefix)= @_;

    if(@$pattern) {
        my ($p, @p)= @$pattern;
        eval {
            if($p eq '*') {
                _foreach {
                    my ($key, $value)= @_;
                    _recurse_pattern($self, $value, \@p, [@$column_name_prefix, $key], $index_prefix);
                } $cur_data;
            } elsif($p =~ /^{(.*)}$/) {
                my @keys= split ',', $1;
                for my $key (@keys) {
                    my $recurse_data;
                    if(ref $cur_data eq 'HASH' && exists $cur_data->{$key}) {
                        _recurse_pattern($self, $cur_data->{$key}, \@p, [@$column_name_prefix, $key], $index_prefix)
                    } elsif(ref $cur_data eq 'ARRAY' && exists $cur_data->[$key]) {
                        _recurse_pattern($self, $cur_data->[$key], \@p, [@$column_name_prefix, $key], $index_prefix)
                    }
                }
            } elsif($p =~ /^<(.*)>$/) {
                _foreach {
                    my ($key, $value)= @_;
                    _recurse_pattern($self, $value, \@p, $column_name_prefix, [@$index_prefix, $key]);
                } $cur_data;
            } else {
                if(ref $cur_data eq 'HASH' && exists $cur_data->{$p}) {
                    _recurse_pattern($self, $cur_data->{$p}, \@p, $column_name_prefix, $index_prefix)
                } elsif(ref $cur_data eq 'ARRAY' && exists $cur_data->[$p]) {
                    _recurse_pattern($self, $cur_data->[$p], \@p, $column_name_prefix, $index_prefix)
                }
            }
            1;
        } or do {
            my $error= $@ || "Zombie error";
            my $debugstr= join(".", "-->$p<--", @p);
            die "Error while applying pattern chunk $debugstr: $error";
        }
    } else {
        my $cell_value= ref $cur_data
                      ? encode_json($cur_data)
                      : $cur_data;
        my @column_tuple= @$column_name_prefix ? @$column_name_prefix : ($self->{_default_column_name} || '');
        $self->{data_matrix}{_serialize_tuple(@$index_prefix)}{_serialize_tuple(@column_tuple)}= $cell_value;
    }
}


1;
__END__

=head1 NAME

Text::CSV::Flatten - Perl extension for transforming hierarchical data (nested
arrays/hashes) to comma-separated value (csv) output according to a compact,
readable, user-specified pattern.


=head1 SYNOPSIS

  use Text::CSV::Flatten;
  Text::CSV::Flatten->new(
    '.<index>.*',
    data    => [{ a => 1, b => 2 }, { a => 3, b => 4 }],
  )->csv();

=head1 DESCRIPTION

This module transforms hierarchical data (nested arrays/hashes) to
comma-separated value (csv) output according to a compact, readable,
user-specified pattern.

For example, the pattern C<< .<index>.* >> transforms a data structure
of the form

    [{ a => 1, b => 2 }, { a => 3, b => 4 }]

to the CSV output

    a,b,index
    1,2,0
    3,4,1

The pattern C<.*.*> applied to the same data gives the output

    0_a,0_b,1_a,1_b
    1,2,3,4

The pattern C<< .*.<key> >> gives the output

    0,1,key
    1,3,a
    2,4,b

It is hoped that the pattern specification is sufficiently powerful for this
module to replace a lot of simple boiler-plate data transformations.

=head1 PATTERN SPECIFICATION

The dot-separated components represent the following:

=over

=item

C<< <name> >> represents that the keys at that position should be put in a
column named name in the csv output. This column will be considered a primary
key, and the values belonging to those keys become rows;

=item

C<*> represents that the keys at that position in the pattern should be
interpreted as column names; their values should be the values for that column,
all beloning to the same row;

=item

C<{column_name}> or C<{column_name_1,column_name_2,...}> is similar to C<*>,
but instead of capturing all the keys at that level of the hierarchy, it only
captures the named columns.

=item

anything else represents a literal key name.

=item

If your pattern does not contain C<*> or C<{...}>, you need to pass an
additional C<< column_name => >> parameter to the constructor to specify the
name for the single column where the value will go.

=back

For the purposes of this description, an array should be seen as a collection
of index => value pairs.

It is possible to specify several dot-separated paths in a single pattern,
separated by spaces. In that case, all the paths need to have the same primary
key (that is, the same set of names in C<< <...> >>). Rows will be formed by
joining the columns resulting from the different paths.

=head1 SEE ALSO

  Text::CSV

=head1 AUTHOR

Timo Kluck, E<lt>tkluck@infty.nlE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2016 by Timo Kluck

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.22.1 or,
at your option, any later version of Perl 5 you may have available.


=cut


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.