Group
Extension

Text-BSV/lib/Text/BSV/BsvParsing.pm

####
# BsvParsing.pm:  A Perl module containing helper functions for parsing and
# generating BSV data.
#
#   NOTE:  For a complete specification of the BSV (Bar-Separated Values)
#   format, see "bsv_format.txt".
#
####
#
# Copyright 2010 by Benjamin Fitch.
#
# This library is free software; you can redistribute it and/or modify it
# under the same terms as Perl itself.
####
package Text::BSV::BsvParsing;

use 5.010001;
use strict;
use warnings;
use utf8;

use English "-no_match_vars";
use Hash::Util ("lock_keys");
use List::Util ("first", "max", "min", "sum");
use Scalar::Util ("looks_like_number");
use Exporter;

use Text::BSV::Exception;

# Version:
our $VERSION = '1.04';

# Specify default exports:
our @ISA = ("Exporter");
our @EXPORT = (
  "get_field_delimiter",
  "parse_header_row",
  "parse_row",
  "generate_header_row",
  "generate_row",
  );

# Constants:
my $POUND     = "#";
my $SQ        = "'";
my $DQ        = "\"";
my $SEMICOLON = ";";
my $CR        = "\r";
my $LF        = "\n";
my $SPACE     = " ";
my $EMPTY     = "";
my $TRUE      = 1;
my $FALSE     = 0;

my $INCLUDE_TRAILING_EMPTY_FIELDS = -1;

####
# Exported helper functions for use by the other Text::BSV::* modules:
####

# The get_field_delimiter() function takes a string containing the header
# row and a string containing the first non-header row.  The function
# returns a string containing the field delimiter.
#
# If the field delimiter cannot be unambiguously determined, the function
# throws an exception of type $Text::BSV::Exception::INVALID_DATA_FORMAT.
sub get_field_delimiter {
    my $header_row = $_[0];
    my $first_nonheader_row = $_[1];
    my $field_delimiter;

    if (index($header_row, "|") > -1) {
        $field_delimiter = "|";
    }
    elsif (index($header_row, $DQ) > -1
      || index($first_nonheader_row, $DQ) > -1) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::INVALID_DATA_FORMAT,
          "There is no vertical bar in the header row, but there is a "
          . "double quotation mark in the header row or in the first "
          . "non-header row.");
    }
    else {
        my @potential_delimiters;

        for my $delim (",", $SEMICOLON, "\t") {
            if (index($header_row, $delim) > -1) {
                push @potential_delimiters, $delim;
            } # end if
        } # next $delim

        given (scalar @potential_delimiters) {
            when (0) {
                die Text::BSV::Exception->new(
                  $Text::BSV::Exception::INVALID_DATA_FORMAT,
                  "The field delimiter in the BSV data cannot be "
                  . "unambiguously determined.");
            }
            when (1) {
                $field_delimiter = $potential_delimiters[0];
            }
            default {
                my $found_winner = $FALSE;

                for my $potential_delimiter (@potential_delimiters) {
                    if (num_delimiters($header_row, $potential_delimiter)
                      == num_delimiters(
                      $first_nonheader_row, $potential_delimiter)) {
                        if ($found_winner) {
                            die Text::BSV::Exception->new(
                              $Text::BSV::Exception::INVALID_DATA_FORMAT,
                              "The field delimiter in the BSV data "
                              . "cannot be unambiguously determined.");
                        }
                        else {
                            $found_winner = $TRUE;
                            $field_delimiter = $potential_delimiter;
                        } # end if
                    } # end if
                } # next $potential_delimiter

                unless ($found_winner) {
                    die Text::BSV::Exception->new(
                      $Text::BSV::Exception::INVALID_DATA_FORMAT,
                      "The field delimiter in the BSV data cannot be "
                          . "unambiguously determined.");
                } # end unless
            } # end when
        } # end given
    } # end if

    return $field_delimiter;
} # end sub

# The parse_header_row() function takes a string containing the header row
# (without end-of-line characters) and a string containing the field
# delimiter.  The function parses the header row, verifies that the field
# names are unique, and then returns a reference to an array of strings
# containing the field names.  If the field delimiter is the vertical bar,
# the function translates any escape sequences in the field names into the
# corresponding actual characters.
#
# If the header row contains any newline or carriage-return characters or
# the specified field delimiter is not supported by the BSV format, the
# function throws an exception of type
# $Text::BSV::Exception::ILLEGAL_ARGUMENT.  If the field names are not
# unique, or if the header row contains a double quotation mark but the
# field delimiter is not the vertical bar, the function throws an exception
# of type $Text::BSV::Exception::INVALID_DATA_FORMAT.
sub parse_header_row {
    my $header_row = $_[0];
    my $field_delimiter = $_[1];
    my @field_names;

    if ($header_row =~ /[\r\n]/s) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::ILLEGAL_ARGUMENT,
          "The header row passed to the parse_header_row() function "
          . "contains newline or carriage-return characters.");
    } # end if

    if ($field_delimiter eq "|") {
        my $bsv_data = $header_row;
        my @pieces;

        $bsv_data =~ s/\\\\/\n/gs;
        @pieces = split /(?<!\\)\|/s, $header_row,
          $INCLUDE_TRAILING_EMPTY_FIELDS;

        for my $piece (@pieces) {
            my $field_name = $piece;

            $field_name =~ s/\n/\\\\/gs;
            $field_name = translate_from_bsv($field_name);

            if ($field_name ~~ @field_names) {
                die Text::BSV::Exception->new(
                  $Text::BSV::Exception::INVALID_DATA_FORMAT,
                  "Duplicate field names.");
            } # end if

            push @field_names, $field_name;
        } # next $piece
    }
    elsif (index($header_row, $DQ) > -1) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::INVALID_DATA_FORMAT,
          "The field delimiter in the BSV data is not the vertical bar, "
          . "but there is a double quotation mark in the header row.");
    }
    elsif ($field_delimiter eq ","
      || $field_delimiter eq $SEMICOLON
      || $field_delimiter eq "\t") {
        my @pieces = split /$field_delimiter/s, $header_row,
          $INCLUDE_TRAILING_EMPTY_FIELDS;

        for my $piece (@pieces) {
            if ($piece ~~ @field_names) {
                die Text::BSV::Exception->new(
                  $Text::BSV::Exception::INVALID_DATA_FORMAT,
                  "Duplicate field names.");
            } # end if

            push @field_names, $piece;
        } # next $piece
    }
    else {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::ILLEGAL_ARGUMENT,
          "The field delimiter passed to the parse_header_row() function "
          . "is not supported by the BSV format.");
    } # end if

    return \@field_names;
} # end sub

# The parse_row() function takes a string containing a non-header row
# (without end-of-line characters), a string containing the field delimiter,
# and a reference to an array of strings containing the field names.  The
# function parses the row, verifies that the number of field values is
# correct, and then returns a reference to a hash in which the keys are the
# field names and the values are the record's field values.  If the field
# delimiter is the vertical bar, the function translates any escape
# sequences in the field values into the appropriate actual characters.
#
# The function assumes that the field names passed in are already translated
# from BSV format into ordinary text.
#
# If the row contains any newline or carriage-return characters or the
# specified field delimiter is not supported by the BSV format, the
# function throws an exception of type
# $Text::BSV::Exception::ILLEGAL_ARGUMENT.  If the row does not contain the
# number of fields matching the number of field names passed in, or if the
# row contains a double quotation mark but the field delimiter is not the
# vertical bar, the function throws an exception of type
# $Text::BSV::Exception::INVALID_DATA_FORMAT.
sub parse_row {
    my $row = $_[0];
    my $field_delimiter = $_[1];
    my $field_names = $_[2];
    my $record = {};

    if ($row =~ /[\r\n]/s) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::ILLEGAL_ARGUMENT,
          "The row passed to the parse_row() function contains "
          . "newline or carriage-return characters.");
    } # end if

    if ($field_delimiter eq "|") {
        my $bsv_data = $row;
        my @pieces;

        $bsv_data =~ s/\\\\/\n/gs;
        @pieces = split /(?<!\\)\|/s, $row,
          $INCLUDE_TRAILING_EMPTY_FIELDS;

        unless (scalar(@pieces) == scalar(@{ $field_names })) {
            die Text::BSV::Exception->new(
              $Text::BSV::Exception::INVALID_DATA_FORMAT,
              "The number of fields in a row passed to the "
              . "parse_row() function does not match the number of "
              . "field names passed in.");
        } # end unless

        for my $dex (0..$#pieces) {
            my $field_value = $pieces[$dex];

            $field_value =~ s/\n/\\\\/gs;
            $field_value = translate_from_bsv($field_value);
            $record->{$field_names->[$dex]} = $field_value;
        } # next $piece
    }
    elsif (index($row, $DQ) > -1) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::INVALID_DATA_FORMAT,
          "The field delimiter in the BSV data is not the vertical bar, "
          . "but there is a double quotation mark in at least one "
          . "of the rows.");
    }
    elsif ($field_delimiter eq ","
      || $field_delimiter eq $SEMICOLON
      || $field_delimiter eq "\t") {
        my @pieces = split /$field_delimiter/s, $row,
          $INCLUDE_TRAILING_EMPTY_FIELDS;

        unless (scalar(@pieces) == scalar(@{ $field_names })) {
            die Text::BSV::Exception->new(
              $Text::BSV::Exception::INVALID_DATA_FORMAT,
              "The number of fields in a row passed to the "
              . "parse_row() function does not match the number of "
              . "field names passed in.");
        } # end unless

        for my $dex (0..$#pieces) {
            $record->{$field_names->[$dex]} = $pieces[$dex];
        } # next $dex
    }
    else {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::ILLEGAL_ARGUMENT,
          "The field delimiter passed to the parse_row() function "
          . "is not supported by the BSV format.");
    } # end if

    return $record
} # end sub

# The generate_header_row() function takes a reference to an array of
# strings containing field names, and returns a string containing a BSV
# header row (with no end-of-line characters added).
#
# If the field names passed in are not unique, the function throws an
# exception of type $Text::BSV::Exception::ILLEGAL_ARGUMENT.
sub generate_header_row {
    my $field_names = $_[0];
    my @validated_field_names = ();
    my $header_row = $EMPTY;

    for my $dex (0..$#{ $field_names }) {
        if ($field_names->[$dex] ~~ @validated_field_names) {
            die Text::BSV::Exception->new(
              $Text::BSV::Exception::ILLEGAL_ARGUMENT,
              "Field names are not unique.");
        } # end if

        push @validated_field_names, $field_names->[$dex];

        if ($dex > 0) {
            $header_row .= "|";
        } # end if

        $header_row .= translate_to_bsv($field_names->[$dex]);
    } # next $dex

    return $header_row;
} # end sub

# The generate_row() function takes a reference to a hash that encapsulates
# a BSV record and a reference to an array of strings containing the field
# names.  Using the array of field names to determine the order of the
# fields, the function generates and returns a string containing a BSV
# non-header row (with no end-of-line characters added).
#
# If the number and names of the fields in the passed-in hash do not match
# the passed-in array of field names, the function throws an exception of
# type $Text::BSV::Exception::ILLEGAL_ARGUMENT.
sub generate_row {
    my $record = $_[0];
    my $field_names = $_[1];
    my $row = $EMPTY;

    unless (scalar(keys %{ $record }) == scalar(@{ $field_names })) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::ILLEGAL_ARGUMENT,
          "The number of field names passed to the generate_row() function "
          . "does not match the number field values in the record.");
    } # end unless

    for my $dex (0..$#{ $field_names }) {
        unless (exists $record->{$field_names->[$dex]}) {
            die Text::BSV::Exception->new(
              $Text::BSV::Exception::ILLEGAL_ARGUMENT,
              "The record passed to the generate_row() function does not "
              . "match the list of field names passed in.");
        } # end unless

        if ($dex > 0) {
            $row .= "|";
        } # end if

        $row .= translate_to_bsv($record->{$field_names->[$dex]});
    } # next $dex

    return $row;
} # end sub

####
# Private helper functions for use by this module:
####

# The private num_delimiters() function takes a string to be searched and a
# string containing a delimiter.  The function returns the number of
# occurrences of the delimiter within the string to be searched.
sub num_delimiters {
    my $str = $_[0];
    my $delim = $_[1];
    my $result = 0;

    for my $dex (0..(length($str) - 1)) {
        if (substr($str, $dex, 1) eq $delim) {
            $result++;
        } # end if
    } # next $dex

    return $result;
} # end sub

# The private translate_from_bsv() function takes a string containing a BSV
# field name or value and returns a string containing the translated version
# in which escape sequences have been replaced with the correct
# corresponding characters.
#
# If the field name or value contains any instances of invalid backslash
# usage, the function throws an exception of type
# $Text::BSV::Exception::INVALID_DATA_FORMAT.  If the field name or value
# contains any newline or carriage-return characters, the function throws an
# exception of type $Text::BSV::Exception::ILLEGAL_ARGUMENT.
sub translate_from_bsv {
    my $bsv_str = $_[0];
    my $normal_str = $bsv_str;

    if ($bsv_str =~ /[\r\n]/s) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::ILLEGAL_ARGUMENT,
          "A BSV field name or value cannot contain newline or "
          . "carriage-return characters.");
    } # end if

    $normal_str =~ s/\\\\/\n/gs;

    if ($normal_str =~ /\\[^|n]/s) {
        die Text::BSV::Exception->new(
          $Text::BSV::Exception::INVALID_DATA_FORMAT,
          "Invalid backslash usage in BSV data.");
    } # end if

    $normal_str =~ s/\n/\\z/gs;
    $normal_str =~ s/\\\|/|/gs;
    $normal_str =~ s/\\n/\n/gs;
    $normal_str =~ s/\\z/\\/gs;

    return $normal_str;
} # end sub

# The private translate_to_bsv() function takes a string containing a
# field name or value and returns a string containing the BSV version,
# in which carriage returns have been stripped and then backslashes,
# vertical bars, and newline characters have been escaped.
sub translate_to_bsv {
    my $bsv_str = $_[0];

    $bsv_str =~ s/\r/$EMPTY/gs;
    $bsv_str =~ s/\\/\\\\/gs;
    $bsv_str =~ s/\|/\\|/gs;
    $bsv_str =~ s/\n/\\n/gs;

    return $bsv_str;
} # end sub

# Module return value:
1;


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.