Group
Extension

JSON-XS-ByteString/ByteString.xs

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

#include "ppport.h"

// Branch hints: fall back to the plain expression when no earlier header defined them.
#ifndef UNLIKELY
#  define UNLIKELY(x) (x)
#endif
#ifndef LIKELY
#  define LIKELY(x) (x)
#endif

// Fall back to SvREFCNT_dec when SvREFCNT_dec_NN has not been defined already.
#ifndef SvREFCNT_dec_NN
#  define SvREFCNT_dec_NN SvREFCNT_dec
#endif

// Two-level token pasting: going through CONCAT lets macro arguments
// be expanded before CONCAT_PASTE glues them together.
#define CONCAT_PASTE(prefix, suffix) prefix ## suffix
#define CONCAT(prefix, suffix) CONCAT_PASTE(prefix, suffix)

// Growable array of SV pointers; allocated in BOOT and enlarged by
// reserve_visited_capacity(). Presumably consumed by the code generated from
// encode_gen.h to track values already seen -- TODO confirm against that header.
static SV** visited;
// Number of slots currently allocated in `visited`.
int visited_capacity;
// Fill index for `visited`; the encode_json* XSUBs reset it to 0 before the
// estimate pass and again before the encode pass.
int visited_p;

// Make sure `visited` has room for at least one more entry at index visited_p.
// Capacity grows by 25% per step (never by less than one slot) until it
// exceeds visited_p, then the array is reallocated once.
static inline void reserve_visited_capacity(void){
    if( visited_p < visited_capacity )
        return;

    while( visited_p >= visited_capacity ){
        int growth = visited_capacity / 4;
        // a capacity below 4 would otherwise never grow
        visited_capacity += growth > 0 ? growth : 1;
    }
    Renew(visited, visited_capacity, SV*);
}

// Compute how many bytes encode_str() will emit for the `len` bytes at `str`:
// the two surrounding quotes plus every byte and its escape overhead.
static inline STRLEN estimate_str(unsigned char * str, STRLEN len){
    STRLEN total = len + 2;   // opening and closing quote
    STRLEN i;
    for(i = 0; i < len; ++i){
        unsigned char c = str[i];
        if( c=='\n' || c=='\t' || c=='\r' || c=='\b' || c=='\f' || c=='\\' || c=='"' )
            total += 1;       // two-character escape: backslash + one letter/char
        else if( c < 0x20 )
            total += 5;       // \u00XX is six bytes, one of which is already counted
        else if( c=='/' && i > 0 && str[i-1]=='<' )
            total += 1;       // "</" is written as "<\/"
    }
    return total;
}

// Map a nibble value (0..15) to its uppercase hexadecimal digit.
static inline char hex(unsigned char ch){
    return ch > 9 ? 'A' + ch - 10 : '0' + ch;
}
// Convert one hexadecimal digit character to its numeric value.
// The caller is expected to have validated the character (the callers in this
// file test isXDIGIT first); no validation happens here.
static inline unsigned int decode_hex(unsigned char ch){
    unsigned char base;
    unsigned int offset;

    if( ch > 'Z' ){          // lowercase range
        base = 'a';
        offset = 10;
    }
    else if( ch > '9' ){     // uppercase range
        base = 'A';
        offset = 10;
    }
    else{                    // decimal digit
        base = '0';
        offset = 0;
    }
    return ch - base + offset;
}

// Write the `len` bytes at `str` into `buffer` as a double-quoted JSON string
// and return the position just past the closing quote. The buffer must be at
// least estimate_str(str, len) bytes long.
static inline unsigned char * encode_str(unsigned char * buffer, unsigned char * str, STRLEN len){
    unsigned char * out = buffer;
    STRLEN i;

    *out++ = '"';
    for(i = 0; i < len; ++i){
        unsigned char c = str[i];
        char short_escape = 0;

        // control characters that have a dedicated one-letter escape
        switch( c ){
            case '\n': short_escape = 'n'; break;
            case '\t': short_escape = 't'; break;
            case '\r': short_escape = 'r'; break;
            case '\b': short_escape = 'b'; break;
            case '\f': short_escape = 'f'; break;
            default: ;
        }
        if( short_escape ){
            *out++ = '\\';
            *out++ = short_escape;
            continue;
        }

        // every other control character becomes \u00XX
        if( c < 0x20 ){
            *out++ = '\\';
            *out++ = 'u';
            *out++ = '0';
            *out++ = '0';
            *out++ = hex(c >> 4);
            *out++ = hex(c & 15);
            continue;
        }

        // backslash and quote are always escaped; '/' only right after '<'
        if( c=='\\' || c=='"' || (c=='/' && i > 0 && str[i-1]=='<') )
            *out++ = '\\';
        *out++ = c;
    }
    *out++ = '"';
    return out;
}

// encode_gen.h is included four times, once per (UNBLESSED, PRETTY) pair.
// Each inclusion presumably produces an estimate_<NAME>/encode_<NAME> pair
// (the XSUBs below call estimate_normal, encode_normal, ...) -- TODO confirm
// against encode_gen.h. The macros are undefined again after every inclusion
// so the next stanza can redefine them.

// NAME=normal: UNBLESSED off, PRETTY off
#define NAME normal
#define UNBLESSED FALSE
#define PRETTY FALSE
#include "encode_gen.h"
#undef PRETTY
#undef UNBLESSED
#undef NAME

// NAME=normal_pretty: UNBLESSED off, PRETTY on
#define NAME normal_pretty
#define UNBLESSED FALSE
#define PRETTY TRUE
#include "encode_gen.h"
#undef PRETTY
#undef UNBLESSED
#undef NAME

// NAME=unblessed: UNBLESSED on, PRETTY off
#define NAME unblessed
#define UNBLESSED TRUE
#define PRETTY FALSE
#include "encode_gen.h"
#undef PRETTY
#undef UNBLESSED
#undef NAME

// NAME=unblessed_pretty: UNBLESSED on, PRETTY on
#define NAME unblessed_pretty
#define UNBLESSED TRUE
#define PRETTY TRUE
#include "encode_gen.h"
#undef PRETTY
#undef UNBLESSED
#undef NAME

// Step over a leading UTF-8 byte order mark (EF BB BF) if one is present;
// otherwise return `str` unchanged.
static inline unsigned char * skip_bom(unsigned char * str, unsigned char * str_end){
    static const unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };

    if( str_end - str < 3 )
        return str;
    if( str[0]!=bom[0] || str[1]!=bom[1] || str[2]!=bom[2] )
        return str;
    return str + 3;
}

// Advance past any run of isSPACE() characters, never going beyond str_end.
static inline unsigned char * skip_space(unsigned char * str, unsigned char * str_end){
    unsigned char * cur;
    for(cur = str; cur != str_end; ++cur){
        if( !isSPACE(*cur) )
            break;
    }
    return cur;
}

static inline bool is_identity(unsigned char ch){
    return !isSPACE(ch) && ch!=',' && ch!=':' && ch!=']' && ch!='}';
}

static inline bool is_key(unsigned char ch){
    return !isSPACE(ch) && ch!=':';
}

// Measure the number literal at `str`. Returns the count of significant
// characters (sign, digits, '.', exponent marker, exponent sign, exponent
// digits) or 0 when there is no number. Whitespace between a leading sign and
// the digits is accepted but is NOT included in the returned count.
static inline STRLEN is_number(unsigned char * str, unsigned char * str_end){
    unsigned char * p = str;
    STRLEN count = 0;
    STRLEN int_digits = 0;
    STRLEN frac_digits = 0;

    if( p==str_end )
        return 0;

    // optional sign, optionally followed by spaces
    if( *p=='+' || *p=='-' ){
        count = 1;
        p = skip_space(p+1, str_end);
    }

    // integer part
    for(; p!=str_end && isDIGIT(*p); ++p)
        ++int_digits;
    count += int_digits;

    // optional decimal point and fraction
    if( p!=str_end && *p=='.' ){
        ++count;
        ++p;
    }
    for(; p!=str_end && isDIGIT(*p); ++p)
        ++frac_digits;
    count += frac_digits;

    // at least one digit is required on either side of the point
    if( int_digits + frac_digits == 0 )
        return 0;

    // optional exponent
    if( p!=str_end && (*p=='e' || *p=='E') ){
        ++count;
        ++p;
        if( p!=str_end && (*p=='+' || *p=='-') ){
            ++count;
            ++p;
        }
        for(; p!=str_end && isDIGIT(*p); ++p)
            ++count;
    }
    return count;
}

// Copy the number literal at `str` into the buffer *out as a NUL-terminated
// string, dropping any whitespace between the sign and the digits.
//   *out              buffer; allocated when NULL, grown when too small
//   *out_capacity_end set to one past the end of the buffer when it is (re)allocated
//   *out_end          set to the terminating NUL, or to NULL when no number is present
// Returns the input position after the number (or `str` unchanged on failure).
static inline unsigned char * decode_number_r(unsigned char * str, unsigned char * str_end, unsigned char ** out, unsigned char ** out_capacity_end, unsigned char ** out_end){
    STRLEN remaining = is_number(str, str_end);
    if( remaining==0 ){
        *out_end = NULL;
        return str;
    }

    STRLEN needed = remaining + 1;   // room for the trailing NUL
    if( *out==NULL ){
        Newx(*out, needed, unsigned char);
        *out_capacity_end = *out + needed;
    }
    else if( *out_capacity_end - *out < needed ){
        Renew(*out, needed, unsigned char);
        *out_capacity_end = *out + needed;
    }

    unsigned char * dst = *out;
    *out_end = dst + remaining;
    **out_end = 0;

    // the sign is copied on its own so the spaces after it can be skipped
    if( *str=='+' || *str=='-' ){
        *dst++ = *str;
        str = skip_space(str+1, str_end);
        --remaining;
    }
    for(; remaining; --remaining)
        *dst++ = *str++;
    return str;
}

static inline STRLEN estimate_orig_key(unsigned char * str, unsigned char * str_end){
    if( str==str_end )
        return 0;
    if( *str=='"' || *str=='\'' ){
        char delimiter = *str;
        ++str;
        STRLEN len = 0;
        while(TRUE){
            if( str==str_end )
                return -1;
            if( *str==delimiter )
                return len;
            if( *str=='\\' ){
                ++str;
                switch( *str++ ){
                    case 'u': {
                        unsigned int d = 0;

                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);

                        if( d <= 0x7f )
                            ++len;
                        else if( d <= 0x7ff )
                            len += 2;
                        else if( d <= 0xffff )
                            len += 3;
                        else
                            len += 4;

                        break;
                    }
                    case 'n': case '\\': case 't': case 'r': case '/': case 'b': case 'f':
                        ++len;
                        break;
                    default:
                        if( *(str-1)==delimiter )
                            ++len;
                        else
                            len += 2;
                }
            }
            else{
                ++len;
                ++str;
            }
        }
    }
    else{
        STRLEN len = 0;
        while( str!=str_end && is_key(*str) ){
            ++len;
            ++str;
        }
        return len;
    }
}

static inline STRLEN estimate_orig_str(unsigned char * str, unsigned char * str_end){
    if( str==str_end )
        return -1;
    if( *str=='"' || *str=='\'' ){
        char delimiter = *str;
        ++str;
        STRLEN len = 0;
        while(TRUE){
            if( str==str_end )
                return -1;
            if( *str==delimiter )
                return len;
            if( *str=='\\' ){
                ++str;
                switch( *str++ ){
                    case 'u': {
                        unsigned int d = 0;

                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);

                        if( d <= 0x7f )
                            ++len;
                        else if( d <= 0x7ff )
                            len += 2;
                        else if( d <= 0xffff )
                            len += 3;
                        else
                            len += 4;

                        break;
                    }
                    case 'n': case '\\': case 't': case 'r': case '/': case 'b': case 'f':
                        ++len;
                        break;
                    default:
                        if( *(str-1)==delimiter )
                            ++len;
                        else
                            len += 2;
                }
            }
            else{
                ++len;
                ++str;
            }
        }
    }
    else
        return -1;
}

// Decode the object key at `str` (quoted with " or ', or a bare word) into
// the buffer *out as a NUL-terminated byte string.
//   *out              buffer; allocated when NULL, grown when too small
//   *out_capacity_end set to one past the end of the buffer when it is (re)allocated
//   *out_end          set to the terminating NUL, or to NULL on failure
// Returns the input position after the key (past the closing quote for a
// quoted key), or `str` unchanged on failure.
static inline unsigned char * decode_key_r(unsigned char * str, unsigned char * str_end, unsigned char ** out, unsigned char ** out_capacity_end, unsigned char ** out_end){
    // first pass: measure the decoded size; -1 signals an unterminated quoted key
    STRLEN len = estimate_orig_key(str, str_end);
    if( len==-1 ){
        *out_end = NULL;
        return str;
    }

    // make sure the buffer holds len bytes plus the trailing NUL
    if( !*out ){
        Newx(*out, len+1, unsigned char);
        *out_capacity_end = *out + len + 1;
    }
    else if( *out_capacity_end - *out < len + 1 ){
        Renew(*out, len+1, unsigned char);
        *out_capacity_end = *out + len + 1;
    }

    *out_end = *out + len;
    **out_end = 0;
    unsigned char * out_cur = *out;

    if( *str=='"' || *str=='\'' ){
        char delimiter = *str;
        ++str;
        // second pass: this loop does not test str_end before looking for the
        // delimiter or a backslash; it relies on estimate_orig_key() having
        // already located the closing delimiter
        while(TRUE){
            if( *str==delimiter )
                return str+1;
            if( *str=='\\' ){
                ++str;
                switch( *str++ ){
                    case 'u': {
                        // \uXXXX: read up to four hex digits, emit as UTF-8
                        unsigned int d = 0;

                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);
                        if( str!=str_end && isXDIGIT(*str) )
                            d = (d << 4) + decode_hex(*str++);

                        if( d <= 0x7f )
                            *out_cur++ = (unsigned char) d;
                        else if( d <= 0x7ff ){
                            *out_cur++ = (unsigned char)( d >> 6          | 0xC0);
                            *out_cur++ = (unsigned char)((d       & 0x3F) | 0x80);
                        }
                        else if( d <= 0xffff ){
                            *out_cur++ = (unsigned char)( d >> 12         | 0xE0);
                            *out_cur++ = (unsigned char)((d >> 6  & 0x3F) | 0x80);
                            *out_cur++ = (unsigned char)((d       & 0x3F) | 0x80);
                        }
                        else{
                            *out_cur++ = (unsigned char)( d >> 18         | 0xF0);
                            *out_cur++ = (unsigned char)((d >> 12 & 0x3F) | 0x80);
                            *out_cur++ = (unsigned char)((d >>  6 & 0x3F) | 0x80);
                            *out_cur++ = (unsigned char)((d       & 0x3F) | 0x80);
                        }

                        break;
                    }
                    case 'n':
                        *out_cur++ = '\n';
                        break;
                    case '\\':
                        *out_cur++ = '\\';
                        break;
                    case 't':
                        *out_cur++ = '\t';
                        break;
                    case 'r':
                        *out_cur++ = '\r';
                        break;
                    case '/':
                        *out_cur++ = '/';
                        break;
                    case 'b':
                        *out_cur++ = '\b';
                        break;
                    case 'f':
                        *out_cur++ = '\f';
                        break;
                    default:
                        // an escaped delimiter becomes the bare delimiter; any
                        // other unknown escape is copied with its backslash
                        if( *(str-1)!=delimiter )
                            *out_cur++ = '\\';
                        *out_cur++ = *(str-1);
                }
            }
            else
                *out_cur++ = *str++;
        }
    }
    else{
        // bare key: copy until whitespace, ':' or the end of input
        while( str!=str_end && is_key(*str) )
            *out_cur++ = *str++;
        return str;
    }
}

// Decode the quoted string (" or ') at `str` into the buffer *out as a
// NUL-terminated byte string.
//   *out              buffer; allocated when NULL, grown when too small
//   *out_capacity_end set to one past the end of the buffer when it is (re)allocated
//   *out_end          set to the terminating NUL, or to NULL on failure
// Returns the input position just past the closing quote, or `str` unchanged
// on failure.
static inline unsigned char * decode_str_r(unsigned char * str, unsigned char * str_end, unsigned char ** out, unsigned char ** out_capacity_end, unsigned char ** out_end){
    // first pass: measure the decoded size; -1 means no quote or no terminator
    STRLEN len = estimate_orig_str(str, str_end);
    if( len==-1 ){
        *out_end = NULL;
        return str;
    }

    // make sure the buffer holds len bytes plus the trailing NUL
    if( !*out ){
        Newx(*out, len+1, unsigned char);
        *out_capacity_end = *out + len + 1;
    }
    else if( *out_capacity_end - *out < len + 1 ){
        Renew(*out, len+1, unsigned char);
        *out_capacity_end = *out + len + 1;
    }

    *out_end = *out + len;
    **out_end = 0;
    unsigned char * out_cur = *out;

    // estimate_orig_str() only succeeds when *str is a quote character
    char delimiter = *str;
    ++str;
    // second pass: this loop does not test str_end before looking for the
    // delimiter or a backslash; it relies on estimate_orig_str() having
    // already located the closing delimiter
    while(TRUE){
        if( *str==delimiter )
            return str+1;
        if( *str=='\\' ){
            ++str;
            switch( *str++ ){
                case 'u': {
                    // \uXXXX: read up to four hex digits, emit as UTF-8
                    unsigned int d = 0;

                    if( str!=str_end && isXDIGIT(*str) )
                        d = (d << 4) + decode_hex(*str++);
                    if( str!=str_end && isXDIGIT(*str) )
                        d = (d << 4) + decode_hex(*str++);
                    if( str!=str_end && isXDIGIT(*str) )
                        d = (d << 4) + decode_hex(*str++);
                    if( str!=str_end && isXDIGIT(*str) )
                        d = (d << 4) + decode_hex(*str++);

                    if( d <= 0x7f )
                        *out_cur++ = (unsigned char) d;
                    else if( d <= 0x7ff ){
                        *out_cur++ = (unsigned char)( d >> 6          | 0xC0);
                        *out_cur++ = (unsigned char)((d       & 0x3F) | 0x80);
                    }
                    else if( d <= 0xffff ){
                        *out_cur++ = (unsigned char)( d >> 12         | 0xE0);
                        *out_cur++ = (unsigned char)((d >> 6  & 0x3F) | 0x80);
                        *out_cur++ = (unsigned char)((d       & 0x3F) | 0x80);
                    }
                    else{
                        *out_cur++ = (unsigned char)( d >> 18         | 0xF0);
                        *out_cur++ = (unsigned char)((d >> 12 & 0x3F) | 0x80);
                        *out_cur++ = (unsigned char)((d >>  6 & 0x3F) | 0x80);
                        *out_cur++ = (unsigned char)((d       & 0x3F) | 0x80);
                    }

                    break;
                }
                case 'n':
                    *out_cur++ = '\n';
                    break;
                case '\\':
                    *out_cur++ = '\\';
                    break;
                case 't':
                    *out_cur++ = '\t';
                    break;
                case 'r':
                    *out_cur++ = '\r';
                    break;
                case '/':
                    *out_cur++ = '/';
                    break;
                case 'b':
                    *out_cur++ = '\b';
                    break;
                case 'f':
                    *out_cur++ = '\f';
                    break;
                default:
                    // an escaped delimiter becomes the bare delimiter; any
                    // other unknown escape is copied with its backslash
                    if( *(str-1)!=delimiter )
                        *out_cur++ = '\\';
                    *out_cur++ = *(str-1);
            }
        }
        else
            *out_cur++ = *str++;
    }
}

// Parse one value starting at `str` (leading whitespace is skipped).
// On success *out receives a newly created SV with refcnt=1 and the return
// value points just past the parsed value. On failure *out is set to NULL,
// anything built so far is released, and the returned pointer is the position
// where parsing stopped.
// Accepted forms: [...] arrays, {...} hashes (quoted or bare keys), strings
// quoted with " or ', case-insensitive true / false / null, and numbers
// (kept as strings).
unsigned char * decode(unsigned char * str, unsigned char * str_end, SV**out){
    str = skip_space(str, str_end);
    if( str==str_end )
        goto GIVEUP;

    switch( *str ){
        case '[': {
            AV * av = newAV();
            *out = newRV_noinc((SV*) av);
            str = skip_space(str+1, str_end);

            while(TRUE){
                if( str==str_end )
                    goto ROLLBACK;
                if( *str == ']' )
                    return str+1;

                SV * elem = NULL;
                str = decode(str, str_end, &elem);
                if( elem==NULL )
                    goto ROLLBACK;
                av_push(av, elem);

                str = skip_space(str, str_end);
                if( str==str_end )
                    goto ROLLBACK;
                if( *str == ']' )
                    return str+1;
                if( *str==',' )
                    str = skip_space(str+1, str_end);
                else
                    goto ROLLBACK;
            }
        }
        case '{': {
            HV * hv = newHV();
            *out = newRV_noinc((SV*) hv);
            str = skip_space(str+1, str_end);
            // scratch buffer for keys, reused across entries and freed on exit
            unsigned char *key_buffer=0, *key_buffer_end, *key_end;
            // every `break` out of this loop is a parse failure
            while(TRUE){
                if( str==str_end )
                    break;
                if( *str=='}' ){
                    if( key_buffer )
                        Safefree(key_buffer);
                    return str+1;
                }
                str = decode_key_r(str, str_end, &key_buffer, &key_buffer_end, &key_end);
                if( !key_end )
                    break;
                str = skip_space(str, str_end);

                SV * elem = NULL;
                // check the bound first: the key may be the last thing in the input
                if( str!=str_end && *str==':' )
                    str = decode(str+1, str_end, &elem);
                if( elem==NULL )
                    break;
                // hv_store takes ownership only when it succeeds
                if( !hv_store(hv, (char*)key_buffer, key_end-key_buffer, elem, 0) )
                    SvREFCNT_dec_NN(elem);

                str = skip_space(str, str_end);
                if( str==str_end )
                    break;
                if( *str=='}' ){
                    Safefree(key_buffer);
                    return str+1;
                }
                if( *str!=',' )
                    break;
                str = skip_space(str+1, str_end);
            }
            if( key_buffer )
                Safefree(key_buffer);
            goto ROLLBACK;
        }
        case '"': case '\'': {
            unsigned char *value_buffer=0, *value_buffer_end, *value_end;
            str = decode_str_r(str, str_end, &value_buffer, &value_buffer_end, &value_end);
            // on failure decode_str_r has not allocated anything
            if( !value_end )
                goto GIVEUP;
            // hand the decoded buffer over to a new string SV without copying
            *out = newSV(0);
            sv_upgrade(*out, SVt_PV);
            SvPOK_on(*out);
            SvPV_set(*out, (char*)value_buffer);
            SvCUR_set(*out, value_end - value_buffer);
            SvLEN_set(*out, value_buffer_end - value_buffer);
            return str;
        }
        default: {
            // literals must be followed by the end of input or a non-identity character
            if( str_end-str==4 || (str_end-str>4 && !is_identity(str[4])) ){
                if( (str[0]=='T' || str[0]=='t') && (str[1]=='R' || str[1]=='r') && (str[2]=='U' || str[2]=='u') && (str[3]=='E' || str[3]=='e') ){
                    *out = newSViv(1);
                    return str+4;
                }
                if( (str[0]=='N' || str[0]=='n') && (str[1]=='U' || str[1]=='u') && (str[2]=='L' || str[2]=='l') && (str[3]=='L' || str[3]=='l') ){
                    *out = newSV(0);
                    return str+4;
                }
            }
            if( str_end-str==5 || (str_end-str>5 && !is_identity(str[5])) ){
                if( (str[0]=='F' || str[0]=='f') && (str[1]=='A' || str[1]=='a') && (str[2]=='L' || str[2]=='l') && (str[3]=='S' || str[3]=='s') && (str[4]=='E' || str[4]=='e') ){
                    *out = newSVpvn("", 0);
                    return str+5;
                }
            }

            unsigned char *value_buffer=0, *value_buffer_end, *value_end;
            str = decode_number_r(str, str_end, &value_buffer, &value_buffer_end, &value_end);
            if( value_end ){
                // numbers are stored as their textual form in a string SV
                *out = newSV(0);
                sv_upgrade(*out, SVt_PV);
                SvPOK_on(*out);
                SvPV_set(*out, (char*)value_buffer);
                SvCUR_set(*out, value_end - value_buffer);
                SvLEN_set(*out, value_buffer_end - value_buffer);
                return str;
            }

            goto GIVEUP;
        }
    }

ROLLBACK:
    // drops the container reference, which also releases the elements stored so far
    SvREFCNT_dec_NN(*out);
GIVEUP:
    *out = NULL;
    return str;
}

MODULE = JSON::XS::ByteString		PACKAGE = JSON::XS::ByteString		

void
encode_json(SV * data)
    PPCODE:
        /* Two passes over `data`: estimate_normal() sizes the output buffer,
         * encode_normal() then fills it. visited_p is reset before each pass. */
        visited_p = 0;
        STRLEN need_size = estimate_normal(data, 0);
        SV * out_sv = sv_2mortal(newSV(need_size));
        SvPOK_only(out_sv);
        visited_p = 0;
        char * cur = (char*)encode_normal((unsigned char*)SvPVX(out_sv), data, 0);
        STRLEN real_size = cur - SvPVX(out_sv);
        /* sanity check: both passes must agree on the size.
         * UVuf matches the width of the UV arguments; plain %d does not. */
        if( real_size != need_size )
            croak("need_size(%" UVuf ") != real_size(%" UVuf ")", (UV)need_size, (UV)real_size);
        SvCUR_set(out_sv, real_size);
        *SvEND(out_sv) = 0;
        PUSHs(out_sv);

void
encode_json_pretty(SV * data)
    PPCODE:
        /* Two passes over `data`: estimate_normal_pretty() sizes the output
         * buffer, encode_normal_pretty() then fills it. visited_p is reset
         * before each pass. */
        visited_p = 0;
        STRLEN need_size = estimate_normal_pretty(data, 0);
        SV * out_sv = sv_2mortal(newSV(need_size));
        SvPOK_only(out_sv);
        visited_p = 0;
        char * cur = (char*)encode_normal_pretty((unsigned char*)SvPVX(out_sv), data, 0);
        STRLEN real_size = cur - SvPVX(out_sv);
        /* sanity check: both passes must agree on the size.
         * UVuf matches the width of the UV arguments; plain %d does not. */
        if( real_size != need_size )
            croak("need_size(%" UVuf ") != real_size(%" UVuf ")", (UV)need_size, (UV)real_size);
        SvCUR_set(out_sv, real_size);
        *SvEND(out_sv) = 0;
        PUSHs(out_sv);

void
encode_json_unblessed(SV * data)
    PPCODE:
        /* Two passes over `data`: estimate_unblessed() sizes the output
         * buffer, encode_unblessed() then fills it. visited_p is reset before
         * each pass. */
        visited_p = 0;
        STRLEN need_size = estimate_unblessed(data, 0);
        SV * out_sv = sv_2mortal(newSV(need_size));
        SvPOK_only(out_sv);
        visited_p = 0;
        char * cur = (char*)encode_unblessed((unsigned char*)SvPVX(out_sv), data, 0);
        STRLEN real_size = cur - SvPVX(out_sv);
        /* sanity check: both passes must agree on the size.
         * UVuf matches the width of the UV arguments; plain %d does not. */
        if( real_size != need_size )
            croak("need_size(%" UVuf ") != real_size(%" UVuf ")", (UV)need_size, (UV)real_size);
        SvCUR_set(out_sv, real_size);
        *SvEND(out_sv) = 0;
        PUSHs(out_sv);

void
encode_json_unblessed_pretty(SV * data)
    PPCODE:
        /* Two passes over `data`: estimate_unblessed_pretty() sizes the output
         * buffer, encode_unblessed_pretty() then fills it. visited_p is reset
         * before each pass. */
        visited_p = 0;
        STRLEN need_size = estimate_unblessed_pretty(data, 0);
        SV * out_sv = sv_2mortal(newSV(need_size));
        SvPOK_only(out_sv);
        visited_p = 0;
        char * cur = (char*)encode_unblessed_pretty((unsigned char*)SvPVX(out_sv), data, 0);
        STRLEN real_size = cur - SvPVX(out_sv);
        /* Same exact-size check as the other three encoders, which are built
         * from the same encode_gen.h template (no off-by-one adjustment).
         * NOTE(review): confirm against encode_gen.h.
         * UVuf matches the width of the UV arguments; plain %d does not. */
        if( real_size != need_size )
            croak("need_size(%" UVuf ") != real_size(%" UVuf ")", (UV)need_size, (UV)real_size);
        SvCUR_set(out_sv, real_size);
        *SvEND(out_sv) = 0;
        PUSHs(out_sv);

void
decode_json(SV * json, bool warn2die=FALSE)
    PPCODE:
        /* Decode `json` (an optional UTF-8 BOM is skipped) and push the
         * resulting value. When decoding fails or input remains after the
         * value, either croak (warn2die true) or warn and push undef. */
        unsigned char *str, *str_end, *str_adv;
        STRLEN len;
        SV * out = NULL;
        str = (unsigned char*) SvPV(json, len);
        str_end = str + len;
        str_adv = skip_space(decode(skip_bom(str, str_end), str_end, &out), str_end);
        if( str_end != str_adv || out==NULL ){
            /* Release a value that was decoded before trailing garbage was
             * found. This must happen before croak(), which does not return;
             * SvREFCNT_dec accepts NULL. */
            SvREFCNT_dec(out);
            if( warn2die )
                croak("decode_json: Unconsumed characters from offset %d", (int)(str_adv-str));
            warn("decode_json: Unconsumed characters from offset %d", (int)(str_adv-str));
            PUSHs(&PL_sv_undef);
        }
        else
            PUSHs(sv_2mortal(out));

void
decode_json_safe(SV * json)
    PPCODE:
        /* Silent variant of decode_json: pushes the decoded value, or undef
         * when decoding fails or input remains after the value. */
        STRLEN len;
        unsigned char * str = (unsigned char*) SvPV(json, len);
        unsigned char * str_end = str + len;
        SV * out = NULL;
        unsigned char * value_end = decode(skip_bom(str, str_end), str_end, &out);
        if( skip_space(value_end, str_end) != str_end ){
            /* trailing garbage: discard whatever was decoded (may be NULL) */
            SvREFCNT_dec(out);
            out = NULL;
        }
        if( out )
            PUSHs(sv_2mortal(out));
        else
            PUSHs(&PL_sv_undef);

BOOT:
    /* Allocate the `visited` array with an initial 32 slots;
     * reserve_visited_capacity() enlarges it later when it fills up. */
    visited_capacity = 32;
    Newx(visited, visited_capacity, SV*);


Powered by Groonga
Maintained by Kenichi Ishigaki <ishigaki@cpan.org>. If you find anything, submit it on GitHub.