jsonescapes.cc #1

# include <stdhdrs.h>
# include <strbuf.h>
# include "jsonescapes.h"

static void codepointToUtf8( StrBuf &out, unsigned int codepoint );

// Look for and replace:
// \" \\ \/ \b \f \n \r \t \uxxxx
// We silently discard all control characters  (0x00-0x1f),
// unknown escape characters (e.g. \a) and incomplete
// sequences, e.g. \uxxx
void
jsonEscapes( const char *input, int len, StrBuf &out )
{
	const char *in = input;
	int c;
	// We want to avoid 'Append'ing one character at a time
	// to the 'out' StrBuf, so we just remember the unescaped
	// characters we have not copied to the 'out' yet. When
	// we spot an escape sequence we 'flush' these characters,
	// and reset 'begin' so we don't flush them again.
	// At the end of the loop see if we have any to flush.
	// Most strings won't have any escapes, so this just costs
	// us the scan.
	const char *begin = in;

	// 0 = normal
	// 1-4, number of hex bytes to process (after a \u sequence)
	// -1, have seen a '\'
	int state = 0;
	unsigned int codepoint;
	while( in < &input[len] )
	{
	    if( !begin && ( state == 0) )
	        begin = in;

	    c = ( unsigned int )*in++;

	    if( state == 0 )
	    {
	        if( c == '\\' )
	        {
	            state = -1;
	            if( ( in - begin - 1 ) > 0 )
	                out.Append(begin, in - begin - 1);
	            begin = 0;
	        }
	        else
	        {
	            if( ( ( c >= 0 ) && ( c <= 0x1f ) ) )
	            {
	                // we are not allowed a control character
	                // in a JSON string. Silently ignore it.
	                if( ( in - begin - 1 ) > 0 )
	                    out.Append(begin, in - begin - 1);
	                begin = 0;
	            }
	        }
	    }
	    else
	    {
	        if( state == -1 )
	        {
	            switch( c )
	            {
	            case '"':
	                out << "\"";
	                state = 0;
	                begin = 0;
	                break;
	            case '\\':
	                out << "\\";
	                state = 0;
	                begin = 0;
	                break;
	            case '/':
	                out << "/";
	                state = 0;
	                begin = 0;
	                break;
	            case 'b':
	                out << "\b";
	                state = 0;
	                begin = 0;
	                break;
	            case 'f':
	                out << "\f";
	                state = 0;
	                begin = 0;
	                break;
	            case 'n':
	                out << "\n";
	                state = 0;
	                begin = 0;
	                break;
	            case 'r':
	                out << "\r";
	                state = 0;
	                begin = 0;
	                break;
	            case 't':
	                out << "\t";
	                state = 0;
	                begin = 0;
	                break;
	            case 'u':
	                state = 4;
	                codepoint = 0;
	                break;
	            default:
	                // silently discard
	                state = 0;
	                begin = 0;
	                break;
	            }
	        }
	        else
	        {
	            codepoint <<= 4;
	            if( ( c >= '0' ) && ( c <= '9' ) )
	            {
	                codepoint += c - '0';
	            }
	            else
	            {
	                if( ( c >= 'a' ) && ( c <= 'f' ) )
	                {
	                    codepoint += c - 'a' + 10;
	                }
	                else
	                {
	                    if( ( c >= 'A' ) && ( c <= 'F' ) )
	                    {
	                        codepoint += c - 'A' + 10;
	                    }
	                    else
	                    {
	                        // a bad character, backup and
	                        // discard
	                        in--;
	                        state = 0;
	                        begin = 0;
	                        continue;
	                    }
	                }
	            }
	            state--;

	            if( state == 0 )
	            {
	                // check for a hidden null.
	                if( codepoint )
	                {
	                    // push this codepoint coded as utf8
	                    // on to the output stream.
	                    codepointToUtf8( out, codepoint );
	                }
	                begin = 0;
	            }
	        }
	    }
	}
	if( begin && ( ( in - begin ) > 0 ) )
	        out.Append( begin, in - begin );
	out.Terminate();
}

// Convert the codepoint into the correct sequence of utf8
// bytes.
void
codepointToUtf8( StrBuf &out, unsigned int c )
{
	char b[4];
	int len = 0;
	// Ranges taken from https://en.wikipedia.org/wiki/UTF-8
	if( c <= 0x7f )
	{
	    b[0] = c;
	    len = 1;
	}
	else if( ( c >= 0x80 ) && ( c <= 0x7ff ) )
	{
	    b[0] = 0xc0 | ( ( c >> 6 ) & 0x1f );
	    b[1] = 0x80 | ( c & 0x3f );
	    len = 2;
	} else if( ( c >= 0x800 ) && ( c <= 0xffff ) )
	{
	    b[0] = 0xe0 | ( ( c >> 12 ) & 0xf );
	    b[1] = 0x80 | ( ( c >> 6 ) & 0x3f );
	    b[2] = 0x80 | ( c & 0x3f ) ;
	    len = 3;
	} else if( ( c <= 0x10000 ) && ( c <= 0x10ffff ) )
	{
	    b[0] = 0xf0 | ( ( c >> 18 ) & 0x7 );
	    b[1] = 0x80 | ( ( c >> 12 ) & 0x3f );
	    b[2] = 0x80 | ( ( c >> 6 ) & 0x3f );
	    b[3] = 0x80 | ( c & 0x3f );
	    len = 4;
	}
	if( len )
	    out.Append( b, len );
}
#	Change	User	Description	Committed
#1	23020	Nick Poole	Import 2017.2 code drop