Group
Extension

JSON_minify/lib/JSON_minify.pm

#!/usr/bin/perl
##
## JSON_minify.pm
## Copyright ©2018 Rémi Cohen-Scali
##
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the “Software”), to 
## deal in the Software without restriction, including without limitation the 
## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
## sell copies of the Software, and to permit persons to whom the Software is 
## furnished to do so, subject to the following conditions:
## 
## The above copyright notice and this permission notice shall be included in
## all copies or substantial portions of the Software.
## 
## THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
## IN THE SOFTWARE.
##

package JSON_minify;

use strict;
use warnings;

# Declared after the package statement so that the version is stored in
# $JSON_minify::VERSION (not $main::VERSION). That is the variable looked up
# by JSON_minify->VERSION and by "use JSON_minify <version>".
our $VERSION = '1.1.1';

##
## new
##
## Constructor. Builds an empty hash based object blessed into the class it
## was invoked on; the instance carries no state. Extra arguments are ignored.
##
sub new {
    my ($class) = @_;
    my $self = {};
    bless $self, $class;
    return $self;
}

##
## minify_string
##
## Minify the json content given in the 'input_string' parameter: C/C++ like
## comments (/* ... */ and // ...) are removed and, by default, so is every
## whitespace character located outside of a string literal.
##
## Parameters:
##   self          the object instance (not used by the algorithm)
##   input_string  a string containing json, possibly with comments
##   strip_space   optional boolean, defaults to 1 (true)
##                 true : whitespace outside strings is dropped and comments
##                        are removed
##                 false: whitespace is kept and every character of a comment
##                        is replaced by a space (line ends inside comments
##                        are kept as is), so the result has the same length
##                        and the same line/column layout as the input
##
## Returns the minified json as a string ('' when input_string is undefined).
## An unterminated comment runs until the end of the input.
##
sub minify_string {
    # self param: the object instance
    my $self = shift;
    # input_string param: a string containing json
    my $input_string = shift;
    # strip_space param: a boolean that specify if caller
    # wants to strip spaces
    my $strip_space = (@_ ? shift : 1);

    # Nothing to minify (and no 'uninitialized value' warnings)
    return '' unless defined $input_string;

    # Returned value: a string containing the minified json
    my $new_str = '';
    # Position just after the last processed token
    my $index = 0;

    # Flag indicating if processing is currently inside a multi line comment
    my $in_multi = 0;
    # Flag indicating if processing is currently inside a single line comment
    my $in_single = 0;
    # Flag indicating if processing is currently inside a string
    my $in_string = 0;

    # Tokenization: tokens of interest are ", /*, */, // and any whitespace
    # character. As every whitespace character is a token, the text found
    # between two tokens (the 'gap') never contains whitespace.
    while ($input_string =~ m{("|/\*|\*/|//|[[:space:]])}g)
    {
        my $token       = $1;
        my $token_end   = pos $input_string;
        my $token_start = $token_end - length($token);
        # Plain text located between the previous token and this one
        my $gap = substr $input_string, $index, $token_start - $index;
        $index = $token_end;

        my $is_eol = ($token eq "\r" || $token eq "\n");

        # Inside a comment: nothing is copied. When spaces are preserved the
        # comment is blanked out character per character (line ends are kept)
        # so that lines and columns of the remaining json do not move.
        if ($in_multi || $in_single)
        {
            if (! $strip_space)
            {
                $new_str .= ' ' x length($gap);
                $new_str .= $is_eol ? $token : ' ' x length($token);
            }
            # A multi line comment ends on '*/', a single line one on \r or \n
            if ($in_multi)
            {
                $in_multi = 0 if $token eq '*/';
            }
            elsif ($is_eol)
            {
                $in_single = 0;
            }
            next;
        }

        # Outside of comments the text between tokens is always copied
        $new_str .= $gap;

        if ($token eq '"')
        {
            # Count the backslashes located just before the dbl quote: an odd
            # count means the dbl quote itself is escaped. Walking backward
            # avoids copying and scanning the whole left context.
            my $backslashes = 0;
            my $probe = $token_start - 1;
            while ($probe >= 0 && substr($input_string, $probe, 1) eq '\\')
            {
                $backslashes++;
                $probe--;
            }
            # A dbl quote outside a string always opens one; inside a string
            # it closes it only when it is not escaped
            if (! $in_string || $backslashes % 2 == 0)
            {
                $in_string = $in_string ? 0 : 1;
            }
            $new_str .= $token;
        }
        elsif ($in_string || $token eq '*/')
        {
            # String content is copied verbatim (spaces and comment markers
            # included). A '*/' met outside of any comment is not ours to
            # remove: leave it so that the json parser can report it.
            $new_str .= $token;
        }
        elsif ($token eq '/*' || $token eq '//')
        {
            # A comment starts here
            if ($token eq '/*') {$in_multi = 1;}
            else                {$in_single = 1;}
            # Blank the comment marker when spaces are preserved
            if (! $strip_space) {$new_str .= ' ' x length($token);}
        }
        elsif (! $strip_space)
        {
            # Whitespace outside strings and comments, caller wants to keep it
            $new_str .= $token;
        }
    }

    # Remaining text after the last token. If the input ends inside a comment
    # this text belongs to the comment and must not reach the output.
    my $tail = substr $input_string, $index;
    if ($in_multi || $in_single)
    {
        if (! $strip_space) {$new_str .= ' ' x length($tail);}
    }
    else
    {
        $new_str .= $tail;
    }

    return $new_str;
}

1;

__END__

=head1 NAME

JSON_minify.pm - minify a JSON document and remove its comments

=head1 SYNOPSIS

 use JSON_minify;
 my $minifier = JSON_minify->new();
 my $json_string = "<a json contents with comments>";
 my $minified_json = $minifier->minify_string($json_string, 0);

=head1 DESCRIPTION

 This module provides a single method for minifying a JSON string. The
 string may contain C/C++ like comments, which are removed. The minify_string
 method accepts two arguments: first the JSON content as a string, then an
 optional boolean strip_space (default: true, i.e. whitespace is stripped).

=head2 Exports
 
=over

=item :minify_string 

=back

=cut



Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.