[Libwebsockets] Lightweight JSON parser in C

"Andy Green (林安廸)" andy at warmcat.com
Fri Feb 22 16:59:47 CET 2013


On 22/02/13 23:28, the mail apparently from "Andy Green (林安廸)" included:
> Hi -
>
> Is there any interest in a very lightweight but 100% correct JSON parser?
>
> I don't mean like you have in a browser where it goes through and spawns
> objects for everything in the JSON.  The idea is it is basically a
> stateful stream reader, which can do something when the object path it
> is on top of matches what you're interested in.
>
> It does not recurse, nor malloc anything.  You call it with cursor
> structs which start off zero'd and hold all parsing state.  The cursor
> structs have no pointers and may be copied.  You can call it multiple
> times with the same cursor as new JSON pieces come in and it will parse
> on from the previous state; it's a bytewise parser so there are no
> fragmentation issues.  It means you don't need to hold the JSON all at
> once anywhere but can still parse the whole JSON.
>
> So the style would be parse until the node of interest is held in the
> cursor struct, then you can use a copy of that as a starting point for
> rummaging around.
>
> I have the parser working - it is very lightweight, the cursor struct is
> the only storage and it's < 256 bytes - but I am a bit stumped about how
> to most flexibly expose the wanted results.  I know what I want it for
> and can hack that up easily, but I am wondering if there are other
> real-world uses for very cheap JSON parsing in C... if so, what does the
> JSON look like and what are the operations needed on it?

Here is the code I have at the moment.  It's just the parser part with 
no way to deliver the results.

You can build it with gcc, then pipe JSON into it

$ cat file.json | ./lejp

I used the examples from here to test it

http://json.org/example.html

I will add a scheme for getting the results, but exactly how to come at 
that is open.

The code maintains a "path" in ctx->path like "menu.items[11].label", 
this can be the basis for matching.

However delivery of the parsed results after that is the next problem. 
I can do it easily for the simple case that I need, but that won't cover 
things like arrays inside the results or new objects inside the results 
cleanly.

What kind of results are people asking from their JSON parser?

-Andy

-------------- next part --------------
/*
 * Lightweight Embedded JSON Parser
 * 
 * Initial POC version
 * 
 * Copyright (C) 2013 Andy Green <andy at warmcat.com>
 * This code is licensed under LGPL 2.1
 * http://www.gnu.org/licenses/lgpl-2.1.html
 */

#include <string.h>
#include <unistd.h>
#include <stdio.h>

/*
 * States carrying this bit want to see whitespace characters themselves;
 * in every other state the parser silently skips whitespace.
 */
#define LEJP_FLAG_WS_KEEP 0x40

/* Parser states, kept per stack level in lejp_ctx.s[] */
enum lejp_states {
	LEJP_IDLE			= 0,	/* waiting for '{' */
	LEJP_MEMBERS			= 1,	/* just inside an object */
	LEJP_M_P			= 2,	/* waiting for a name's '"' */

	/* inside a string; the U1..U4 values must stay consecutive */
	LEJP_M_P_STRING			= (LEJP_FLAG_WS_KEEP | 3),
	LEJP_M_P_STRING_ESC		= (LEJP_FLAG_WS_KEEP | 4),
	LEJP_M_P_STRING_ESC_U1		= (LEJP_FLAG_WS_KEEP | 5),
	LEJP_M_P_STRING_ESC_U2		= (LEJP_FLAG_WS_KEEP | 6),
	LEJP_M_P_STRING_ESC_U3		= (LEJP_FLAG_WS_KEEP | 7),
	LEJP_M_P_STRING_ESC_U4		= (LEJP_FLAG_WS_KEEP | 8),

	LEJP_M_P_DELIM			= 9,	/* waiting for ':' */
	LEJP_M_P_VALUE			= 10,	/* waiting for a value start */

	/* inside a number or a true / false / null token */
	LEJP_M_P_VALUE_NUMBER_INT	= (LEJP_FLAG_WS_KEEP | 11),
	LEJP_M_P_VALUE_NUMBER_EXP	= (LEJP_FLAG_WS_KEEP | 12),
	LEJP_M_P_VALUE_TOK		= (LEJP_FLAG_WS_KEEP | 13),

	LEJP_M_P_COMMA_OR_END		= 14,	/* after a value */
	LEJP_M_P_ARRAY_END		= 15	/* array level, elements open */
};

/*
 * Why a parse was rejected.  lejp_parse_object() hands the negated value
 * back to its caller, so every code here is nonzero.
 */
enum lejp_reasons {
	/* object / member-name framing */
	LEJP_REJECT_IDLE_NO_BRACE		= 1,
	LEJP_REJECT_MEMBERS_NO_CLOSE		= 2,
	LEJP_REJECT_M_P_NO_OPEN_QUOTE		= 3,

	/* strings and escapes */
	LEJP_REJECT_M_P_STRING_UNDERRUN		= 4,
	LEJP_REJECT_M_P_ILLEGAL_CTRL		= 5,
	LEJP_REJECT_M_P_STRING_ESC_ILLEGAL_ESC	= 6,
	LEJP_REJECT_ILLEGAL_HEX			= 7,

	/* name / value delimiter and value start */
	LEJP_REJECT_M_P_DELIM_MISSING_COLON	= 8,
	LEJP_REJECT_M_P_DELIM_BAD_VALUE_START	= 9,

	/* numbers and true / false / null tokens */
	LEJP_REJECT_M_P_VAL_NUM_INT_NO_FRAC	= 10,
	LEJP_REJECT_M_P_VAL_NUM_INT_NO_NUMBER	= 11,
	LEJP_REJECT_M_P_VAL_NUM_EXP_BAD_EXP	= 12,
	LEJP_REJECT_M_P_VAL_TOK_UNKNOWN		= 13,

	/* what follows a value */
	LEJP_REJECT_M_P_C_OR_E_UNDERF		= 14,
	LEJP_REJECT_M_P_C_OR_E_NOTARRAY		= 15,
	LEJP_REJECT_M_P_ARRAY_END_MISSING	= 16,

	/* fixed-size storage exhausted */
	LEJP_REJECT_STACK_OVERFLOW		= 17,
	LEJP_REJECT_NUM_TOO_LONG		= 18,

	LEJP_REJECT_M_P_C_OR_E_NEITHER		= 19,
	LEJP_REJECT_UNKNOWN			= 20
};

#define LEJP_MAX_DEPTH 10	/* entries in each per-level stack array */
#define LEJP_MAX_PATH 128	/* bytes in the path buffer, NUL included */

/*
 * Complete parser state.  It holds no pointers, so it may be copied, and
 * it must be zeroed before the first call to lejp_parse_object().
 */
struct lejp_ctx {
	/* stack: one entry per nesting level, indexed by spos */
	char s[LEJP_MAX_DEPTH]; /* lejp_state stack*/
	int i[LEJP_MAX_DEPTH]; /* index stack */
	char p[LEJP_MAX_DEPTH];	/* scope length: ppos saved when pushing */
	char spos; /* stack head */

	/*
	 * uni is scratch shared by two users: the \uXXXX hex accumulator
	 * and the read offset into the true / false / null match table.
	 */
	unsigned char uni;

	/* number */
	char num[30]; /* number assembly buffer */
	char npos; /* bytes used in num[] */
	/*
	 * Integer digit count while >= 0; set to -(npos) when '.' is seen,
	 * so it must be able to hold negative values.  Plain char is
	 * unsigned on some targets, hence the explicit signed char.
	 */
	signed char dcount;

	/* path */
	char path[LEJP_MAX_PATH]; /* e.g. "menu.items[11].label" */
	unsigned char ppos; /* bytes used in path[] */
};

char
lejp_parse_object(struct lejp_ctx *ctx, const char *json, int len)
{
	char c, n, s, ret = LEJP_REJECT_UNKNOWN;
	static const char esc_char[] = "\"\\/bfnrt";
	static const char esc_tran[] = "\"\\/\b\f\n\r\t";
	static const char tokens[] = "rue alse ull ";

	while (len--) {
		c = *json++;

		/* skip whitespace unless we should care */
		if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
			if (!(s & LEJP_FLAG_WS_KEEP))
				continue;

		s = ctx->s[ctx->spos];
//		fprintf(stderr, "%c %02X (%d / %d)\n", c, c, s, ctx->spos);

		switch (s) {
		case LEJP_IDLE:
			if (c != '{') {
				ret = LEJP_REJECT_IDLE_NO_BRACE;
				goto reject;
			}
			ctx->s[ctx->spos] = LEJP_MEMBERS;
			break;
		case LEJP_MEMBERS:
			if (c == '}') {
				ctx->s[ctx->spos] = LEJP_IDLE;
				ret = LEJP_REJECT_MEMBERS_NO_CLOSE;
				goto reject;
			}
			ctx->s[ctx->spos] = LEJP_M_P;
			goto redo_character;
		case LEJP_M_P:
			if (c != '\"') {
				ret = LEJP_REJECT_M_P_NO_OPEN_QUOTE;
				goto reject;
			}
			/* push */
			ctx->s[ctx->spos] = LEJP_M_P_DELIM;
			c = LEJP_M_P_STRING;
			goto add_stack_level;

		case LEJP_M_P_STRING:
			if (c == '\"') {
				if (!ctx->spos) {
					ret = LEJP_REJECT_M_P_STRING_UNDERRUN;
					goto reject;
				}
				/* pop */
				ctx->spos--;
				break;
			}
			if (c == '\\') {
				ctx->s[ctx->spos] = LEJP_M_P_STRING_ESC;
				break;
			}
			if (c < ' ') {/* "control characters" not allowed */
				ret = LEJP_REJECT_M_P_ILLEGAL_CTRL;
				goto reject;
			}

			goto emit_string_char;

		case LEJP_M_P_STRING_ESC:
			if (c == 'u') {
				ctx->s[ctx->spos] = LEJP_M_P_STRING_ESC_U1;
				ctx->uni = 0;
				break;
			}
			for (n = 0; n < sizeof esc_char; n++) {
				if (c != esc_char[n])
					continue;
				/* found it */
				c = esc_tran[n];
				ctx->s[ctx->spos] = LEJP_M_P_STRING;
				goto emit_string_char;
			}
			ret = LEJP_REJECT_M_P_STRING_ESC_ILLEGAL_ESC;
			/* illegal escape char */
			goto reject;

		case LEJP_M_P_STRING_ESC_U1:
		case LEJP_M_P_STRING_ESC_U2:
		case LEJP_M_P_STRING_ESC_U3:
		case LEJP_M_P_STRING_ESC_U4:
			ctx->uni <<= 4;
			if (c >= '0' && c <= '9')
				ctx->uni |= c - '0';
			else
				if (c >= 'a' && c <= 'f')
					ctx->uni = c - 'a' + 10;
				else
					if (c >= 'A' && c <= 'F')
						ctx->uni = c - 'A' + 10;
					else {
						ret = LEJP_REJECT_ILLEGAL_HEX;
						goto reject;
					}
			ctx->s[ctx->spos]++;
			c = ctx->uni;
			if (s == LEJP_M_P_STRING_ESC_U2)
				goto emit_string_char;

			if (s == LEJP_M_P_STRING_ESC_U4) {
				ctx->s[ctx->spos] = LEJP_M_P_STRING;
				goto emit_string_char;
			}
			break;

		case LEJP_M_P_DELIM:
			if (c != ':') {
				ret = LEJP_REJECT_M_P_DELIM_MISSING_COLON;
				goto reject;
			}
			ctx->s[ctx->spos] = LEJP_M_P_VALUE;
			ctx->path[ctx->ppos] = '\0';
			fprintf(stderr, "%s (%d)\n", ctx->path, ctx->ppos);
			break;

		case LEJP_M_P_VALUE:

			if (c >= '0' && c <= '9') {
				ctx->npos = 0;
				ctx->dcount = 0;
				ctx->s[ctx->spos] = LEJP_M_P_VALUE_NUMBER_INT;
				goto redo_character;
			}
			switch (c) {
			case'\"':
				/* push */
				ctx->s[ctx->spos] = LEJP_M_P_COMMA_OR_END;
				c = LEJP_M_P_STRING;
				goto add_stack_level;

			case '{':
				/* push */
				ctx->s[ctx->spos] = LEJP_M_P_COMMA_OR_END;
				c = LEJP_MEMBERS;
				goto add_stack_level;

			case '[':
				/* push */
				ctx->s[ctx->spos] = LEJP_M_P_ARRAY_END;
				c = LEJP_M_P_VALUE;
				ctx->ppos += sprintf(
					&ctx->path[ctx->ppos], "[0]");
				goto add_stack_level;

			case 't': /* true */
				ctx->uni = 0;
				ctx->s[ctx->spos] = LEJP_M_P_VALUE_TOK;
				break;

			case 'f':
				ctx->uni = 4;
				ctx->s[ctx->spos] = LEJP_M_P_VALUE_TOK;
				break;

			case 'n':
				ctx->uni = 4 + 5;
				ctx->s[ctx->spos] = LEJP_M_P_VALUE_TOK;
				break;
			default:
				ret = LEJP_REJECT_M_P_DELIM_BAD_VALUE_START;
				goto reject;
			}
			break;

		case LEJP_M_P_VALUE_NUMBER_INT:
			if (!ctx->npos && c == '-')
				goto append_npos;

			if (ctx->dcount < 10 && c >= '0' && c <= '9') {
				if (ctx->dcount >= 0)
					ctx->dcount++;
				goto append_npos;
			}
			if (ctx->dcount > 0 && c == '.') {
				ctx->dcount = -ctx->npos; /* mark as seen . */
				goto append_npos;
			}
			/*
			 * before exponent, if we had . we must have had at
			 * least one more digit
			 */
			if (ctx->dcount < 0 && ctx->dcount == -ctx->npos) {
				ret = LEJP_REJECT_M_P_VAL_NUM_INT_NO_FRAC;
				goto reject;
			}

			if (c == 'e' || c == 'E') {
				ctx->s[ctx->spos] = LEJP_M_P_VALUE_NUMBER_EXP;
				goto append_npos;
			}
			/* if none of the above, did we even have a number? */
			if (!ctx->dcount) {
				ret = LEJP_REJECT_M_P_VAL_NUM_INT_NO_NUMBER;
				goto reject;
			}

			/* then this is the post-number character, loop */
			ctx->s[ctx->spos] = LEJP_M_P_COMMA_OR_END;
			goto redo_character;

		case LEJP_M_P_VALUE_NUMBER_EXP:
			ctx->s[ctx->spos] = LEJP_M_P_VALUE_NUMBER_INT;
			if (c >= '0' && c <= '9')
				goto redo_character;
			if (c == '+' || c == '-')
				goto append_npos;
			ret = LEJP_REJECT_M_P_VAL_NUM_EXP_BAD_EXP;
			goto reject;

		case LEJP_M_P_VALUE_TOK: /* true, false, null */
			if (c != tokens[ctx->uni]) {
				ret = LEJP_REJECT_M_P_VAL_TOK_UNKNOWN;
				goto reject;
			}
			ctx->uni++;
			if (tokens[ctx->uni] == ' ') {
				switch (ctx->uni) {
				case 3: /* emit 'true' */
					break;
				case 8: /* emit 'false' */
					break;
				case 12: /* emit 'null' */
					break;
				}
				ctx->s[ctx->spos] = LEJP_M_P_COMMA_OR_END;
			}
			break;

		case LEJP_M_P_COMMA_OR_END:
			ctx->path[ctx->ppos] = '\0';
			fprintf(stderr, "%s (LEJP_M_P_CA_OR_END)\n", ctx->path);

			if (c == ',') {
				/* increment this stack level's index */
				ctx->i[ctx->spos]++;
				ctx->s[ctx->spos] = LEJP_M_P;
				if (!ctx->spos)
					break;
				ctx->ppos = ctx->p[ctx->spos - 1];
				if (ctx->s[ctx->spos - 1] != LEJP_M_P_ARRAY_END)
					break;
				ctx->s[ctx->spos] = LEJP_M_P_VALUE;
				while (ctx->path[ctx->ppos] != '[')
					ctx->ppos--;
				ctx->ppos += sprintf(&ctx->path[ctx->ppos],
						     "[%u]", ctx->i[ctx->spos]);
				break;
			}
			if (c == ']') {
				if (!ctx->spos) {
					ret = LEJP_REJECT_M_P_C_OR_E_UNDERF;
					goto reject;
				}
				/* pop */
				ctx->spos--;
				if (ctx->s[ctx->spos] != LEJP_M_P_ARRAY_END) {
					ret = LEJP_REJECT_M_P_C_OR_E_NOTARRAY;
					goto reject;
				}
				/* drop the path [n] bit */
				ctx->ppos = ctx->p[ctx->spos - 1];

				goto redo_character;
			}
			if (c == '}') {
				if (ctx->spos == 0) {
					/* done */
					return 0;
				}
				/* pop */
				ctx->spos--;
				ctx->ppos = ctx->p[ctx->spos -1];
				break;
			}

			ret = LEJP_REJECT_M_P_C_OR_E_NEITHER;
			goto reject;

		case LEJP_M_P_ARRAY_END:
			ctx->path[ctx->ppos] = '\0';
			fprintf(stderr, "%s (LEJP_M_P_ARRAY_END spos=%d)\n",
							ctx->path, ctx->spos);


			if (c == ',') {
				/* increment this stack level's index */
				ctx->i[ctx->spos]++;
				ctx->s[ctx->spos] = LEJP_M_P_VALUE;
				if (ctx->spos) {
					ctx->ppos = ctx->p[ctx->spos - 1];
					ctx->ppos += sprintf(
						&ctx->path[ctx->ppos],
						     "[%u]", ctx->i[ctx->spos]);
				}
				break;
			}
			if (c != ']') {
				ret = LEJP_REJECT_M_P_ARRAY_END_MISSING;
				goto reject;
			}
			ctx->s[ctx->spos] = LEJP_M_P_COMMA_OR_END;
			break;
		}

		continue;
emit_string_char:
		if (ctx->spos)
			if (ctx->s[ctx->spos - 1] == LEJP_M_P_DELIM)
				ctx->path[ctx->ppos++] = c;
		continue;

add_stack_level:
		/* add a level on the object stack */
		ctx->p[ctx->spos] = ctx->ppos;
		ctx->spos++;
		if (ctx->spos == sizeof ctx->s) {
			ret = LEJP_REJECT_STACK_OVERFLOW;
			goto reject;
		}
		if (ctx->ppos && ctx->path[ctx->ppos - 1] != '.')
			ctx->path[ctx->ppos++] = '.';
		ctx->s[ctx->spos] = c;
		ctx->i[ctx->spos] = 0;
		continue;

append_npos:
		if (ctx->npos >= sizeof ctx->num) {
			ret = LEJP_REJECT_NUM_TOO_LONG;
			goto reject;
		}

		ctx->num[ctx->npos++] = c;
		continue;
redo_character:
		json--;
		len++;
	}

	return 1;

reject:
	return -ret;
}

/*
 * Test driver: reads stdin in chunks of up to sizeof buf bytes and feeds
 * each chunk to lejp_parse_object() with one shared context, printing
 * every result to stderr.
 *
 * Exits with 0 at end of input and with 1 on a read error or as soon as
 * the parser rejects the input.
 */
int main(void)
{
	struct lejp_ctx ctx;
	char buf[512];
	int n;

	/* the parser requires its context to start out zeroed */
	memset(&ctx, 0, sizeof ctx);

	while (1) {
		n = read(STDIN_FILENO, buf, sizeof buf);
		if (n < 0) {
			/* never hand a negative length to the parser */
			perror("read");
			return 1;
		}
		if (!n)
			return 0;

		n = lejp_parse_object(&ctx, buf, n);
		fprintf(stderr, "returned %d\n", n);
		if (n < 0)
			return 1;
	}
}


More information about the Libwebsockets mailing list