scan.c #5

/*
 * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
 *
 * This file is part of Jam - see jam.c for Copyright information.
 */

/*
 * scan.c - the jam yacc scanner
 *
 * 12/26/93 (seiwald) - bump buf in yylex to 10240 - yuk.
 * 09/16/94 (seiwald) - check for overflows, unmatched {}'s, etc.
 *			Also handle tokens abutting EOF by remembering
 *			to return EOF no matter how many times yylex()
 *			reinvokes yyline().
 * 02/11/95 (seiwald) - honor only punctuation keywords if SCAN_PUNCT.
 * 07/27/95 (seiwald) - Include jamgram.h after scan.h, so that YYSTYPE is
 *			defined before Linux's yacc tries to redefine it.
 * 01/10/01 (seiwald) - \ can now escape any whitespace char
 * 11/04/02 (seiwald) - const-ing for string literals
 */

# include "jam.h"
# include "lists.h"
# include "parse.h"
# include "scan.h"
# include "jamgram.h"
# include "jambase.h"
# include "newstr.h"

#ifdef GRAPHISOFT_LEXSCAN
/***********************************************************************

  separators:
   "(", ")"
   "[", "]"
   "{", "}"
   ":",";"
   "!","!=",
   "?","?=",
   "&&",
   "||",
   "=",
   " ", "\t", "\n", "\r", EOF

   todo:
   + should be too, but C++ is used in jamfile...

 ***********************************************************************/
/*
 * Lookup tables indexed by character code; both are filled in by initscan().
 * tokenseparator[c] is non-zero when c acts as a token separator.
 * tokenseparatornext[c] is the character that, when it directly follows the
 * separator c, is scanned together with it as one token (0 when there is none).
 */
char	tokenseparator [ 256 ]  ;
char	tokenseparatornext [ 256 ] ; /* characters that form a token if they follow a tokenseparator */
#endif

#ifdef GRAPHISOFT_MPW_FIX
#if defined (macintosh)
# include "CursorCtl.h"
static int totalline = 0;
#endif
#endif

/*
 * keyword - one entry of the scanner's keyword table: the literal spelling
 * of a keyword and the token type yylex() reports when a word matches it.
 */
struct keyword {
	const char	*word;	/* keyword text; 0 marks the end of the table */
	int		type;	/* token type stored in yylval.type on a match */
};

/* The entries come from jamgramtab.h; a { 0, 0 } sentinel ends the table. */
struct keyword keywords[] = {
# include "jamgramtab.h"
	{ 0, 0 }
};

#ifdef GRAPHISOFT_FIX_NONNATIVENEWLINE

/*
 * XFILE - small buffered reader that accepts any newline convention
 *
 *	xfopen()  - open a file for reading through an XFILE
 *	xfgets()  - fgets() work-alike: CR, LF and CR LF all end a line and
 *		    are always stored as a single '\n'
 *	xfclose() - close the stream and release the XFILE
 */

#define XFILEBUFSIZE 4096
typedef struct XFILE {
	char 	buffer [XFILEBUFSIZE];	/* raw bytes read from f */
	size_t	start;			/* index of the next unread byte */
	size_t	end;			/* number of valid bytes in buffer */
	FILE*	f;			/* underlying stdio stream */
} XFILE;

/*
 * xfopen() - fopen() the file and wrap it in a freshly allocated XFILE
 *
 * Returns NULL when the file cannot be opened or memory runs out; in the
 * latter case the stream is closed again so nothing leaks.
 */
static XFILE* xfopen ( const char* file, const char* mode)
{
	FILE* f = fopen (file, mode);
	XFILE* result;

	if (f == NULL)
		return NULL;

	result = (XFILE*)(malloc (sizeof (XFILE)));
	if (result == NULL) {
		fclose (f);
		return NULL;
	}
	result->f = f;
	result->start = 0;
	result->end = 0;
	return result;
}

/*
 * xfrefill() - drop the consumed buffer and read the next chunk
 *
 * Returns non-zero when at least one byte is available, 0 at end of file
 * or on a read error (both leave start == end == 0).
 */
static int xfrefill (XFILE* f)
{
	f->start = 0;
	f->end = fread (f->buffer, sizeof(char), XFILEBUFSIZE, f->f);
	return f->end != 0;
}

/*
 * xfgets() - read one line of at most size-1 characters into buf
 *
 * A line ends at CR, LF or CR LF; whichever was found is stored as one
 * '\n'.  Lines longer than size-1 characters are returned in pieces, like
 * fgets() does.  buf is always NUL terminated and never overrun.
 *
 * Returns buf, or NULL when size is 0 or when end of file is reached
 * before any character could be stored.  A final line that lacks a line
 * terminator is still returned.
 */
static char* xfgets (char* buf, size_t size, XFILE* f)
{
	/* The bytes 0x0D and 0x0A, whichever of '\n' and '\r' they map to */
	/* on this platform (some Macintosh compilers swap the two). */
	const char cr = ('\n' == 0x0D) ? '\n' : '\r' ;
	const char lf = ('\n' == 0x0D) ? '\r' : '\n' ;
	size_t len = 0;

	if (size == 0)
		return NULL;

	/* Always keep one byte in reserve for the terminating NUL. */
	while (len + 1 < size) {
		char c;

		if (f->start == f->end && !xfrefill (f)) {
			if (len == 0)
				return NULL; /* EOF and nothing read */
			break; /* EOF ends an unterminated last line */
		}

		c = f->buffer[f->start++];

		if (c == '\n' || c == '\r') {
			if (c == cr) {
				/* Swallow the LF of a CR LF pair, even when */
				/* the pair straddles two buffer loads. */
				if (f->start == f->end)
					xfrefill (f);
				if (f->start < f->end && f->buffer[f->start] == lf)
					f->start++;
			}
			buf[len++] = '\n';
			break;
		}

		buf[len++] = c;
	}

	buf[len] = 0;
	return buf;
}

/*
 * xfclose() - close the underlying stream and free the XFILE
 *
 * A NULL argument is ignored.
 */
static void xfclose (XFILE* f)
{
	if (f == NULL)
		return;

	fclose (f->f);
	free ((void*)f);
}

#endif


/*
 * include - one input source on the scanner's stack of sources
 *
 * yyfparse() pushes a new entry onto the chain headed by incp; yyline()
 * reads from the head entry and pops and frees it when it is exhausted.
 * An entry reads either from a list of in-memory strings (strings) or
 * from a file that yyline() opens on first use (file, fname).
 */
struct include {
	struct include 	*next;		/* next serial include file */
	const char 	*string;	/* pointer into current line: next unread character */
	char		**strings;	/* for yyfparse() -- text to parse; list ends with a 0 entry */
#ifdef GRAPHISOFT_FIX_NONNATIVENEWLINE
	XFILE 		*file;		/* for yyfparse() -- file being read; 0 until yyline() opens it */
#else
	FILE 		*file;		/* for yyfparse() -- file being read; 0 until yyline() opens it */
#endif
	const char 	*fname;		/* for yyfparse() -- file name */
	int 		line;		/* line counter for error messages */
	char 		buf[ 512 ];	/* for yyfparse() -- line buffer filled by yyline() */
} ;

/* Scanner state shared by the functions in this file. */

static struct include *incp = 0; /* current file; head of chain */

static int scanmode = SCAN_NORMAL;	/* current scanning mode; changed by yymode() */
static int anyerrors = 0;		/* incremented by every yyerror() call */
static char *symdump( YYSTYPE *s );	/* formats a token for messages; defined at end of file */
#ifdef GRAPHISOFT_LEXSCAN
static int lexscanmode = LEXSCANMODE_COMPATIBLE;	/* separator handling mode; changed by initscan() */
#endif

# define BIGGEST_TOKEN 10240	/* no single token can be larger */

#ifdef GRAPHISOFT_LEXSCAN
/*
 * initscan() - fill the token separator tables and select the lexscan mode
 *
 * tokenseparator[c] becomes 0 for ordinary characters and non-zero for
 * separators:
 *	1 - the character is a separator on its own
 *	2 - '&' and '|': not valid tokens alone, but "&&" and "||" are
 *	3 - '+': not a valid token alone, but "+=" is ("++" should not split)
 * tokenseparatornext[c] becomes the character that completes a two
 * character token started by c ("?=", "!=", "+=", "<=", ">=", "&&", "||").
 */
void initscan (int inlexscanmode)
{
	/* Characters that separate tokens all by themselves. */
	static const char plain[] = "[](){}!?=:; \t\n\r<>";
	/* Two character tokens, stored as consecutive (first, second) pairs. */
	static const char pairs[] = "?=!=+=<=>=&&||";
	const char *p;
	int code;

	for (code = 0; code < 256; code++) {
		tokenseparator[code] = 0;
		tokenseparatornext[code] = 0;
	}

	for (p = plain; *p; p++)
		tokenseparator[(unsigned char)*p] = 1;

	tokenseparator[(unsigned char)'&'] = 2;
	tokenseparator[(unsigned char)'|'] = 2;
	tokenseparator[(unsigned char)'+'] = 3;

	for (p = pairs; *p; p += 2)
		tokenseparatornext[(unsigned char)p[0]] = p[1];

	lexscanmode = inlexscanmode;
}
#endif

/*
 * yymode() - select the scanning mode for the following yylex() calls
 *
 * n is stored unchanged in scanmode, which starts out as SCAN_NORMAL and
 * is compared against SCAN_STRING and SCAN_PUNCT by yylex().
 */

void
yymode( int n )
{
	scanmode = n;
}

/*
 * yyerror() - report a parse error and count it
 *
 * Prints the current file name and line (when an input source is active),
 * the message s and a description of the current token (yylval).  The
 * layout of the location prefix depends on the build: Graphisoft builds
 * use their own layouts for OS_MAC and NT, everything else uses the
 * classic "file: line N: " prefix.  Every call increments anyerrors.
 */
void
yyerror( const char *s )
{
#if defined( GRAPHISOFT_JAM ) && defined( OS_MAC )
	if( incp )
	    printf( "File '%s' ; line %d \n", incp->fname, incp->line );
	printf( "# %s at %s\n", s, symdump( &yylval ) );
#elif defined( GRAPHISOFT_JAM ) && defined( NT )
	if( incp )
	    printf( "%s(%d) : ", incp->fname, incp->line );
	printf( "%s at %s\n", s, symdump( &yylval ) );
#else
	if( incp )
	    printf( "%s: line %d: ", incp->fname, incp->line );
	printf( "%s at %s\n", s, symdump( &yylval ) );
#endif

	anyerrors++;
}

#ifdef GRAPHISOFT_JAM
/*
 * yywarning() - print a warning about the current token
 *
 * Prints the current file name and line (when an input source is active),
 * the message s and a description of the current token (yylval).  The
 * error counter is not touched.
 */
void
yywarning( const char *s )
{
	const char *token = symdump( &yylval );

#ifdef OS_MAC
	if( incp )
	    printf( "File '%s' ; line %d \n", incp->fname, incp->line );
	printf( "# %s at %s\n", s, token );
#else
	if( incp )
	    printf( "%s: line %d: ", incp->fname, incp->line );
	printf( "%s at %s\n", s, token );
#endif
}
#endif

/*
 * yyanyerrors() - tell whether yyerror() has been called at least once
 *
 * Returns 1 when the error counter is non-zero, 0 otherwise.
 */
int
yyanyerrors()
{
	return anyerrors ? 1 : 0;
}

void
yyfparse( const char *s )
{
	struct include *i = (struct include *)malloc( sizeof( *i ) );

	/* Push this onto the incp chain. */

	i->string = "";
	i->strings = 0;
	i->file = 0;
	i->fname = copystr( s );
	i->line = 0;
	i->next = incp;
	incp = i;

	/* If the filename is "+", it means use the internal jambase. */

	if( !strcmp( s, "+" ) )
	    i->strings = jambase;
}

/*
 * yyline() - read new line and return first character
 *
 * Fabricates a continuous stream of characters across include files,
 * returning EOF at the bitter end.
 *
 * Characters are returned as unsigned char values converted to int, the
 * way getc() does, so a byte such as 0xFF can never be mistaken for EOF
 * and the result is always a valid argument for the <ctype.h> functions.
 *
 * When the current source is exhausted it is popped from the include
 * chain, its file is closed, its name and entry are freed, and EOF is
 * returned.  The file name "-" stands for stdin, which is never closed.
 */

int
yyline()
{
	struct include *i = incp;

	if( !incp )
	    return EOF;

#ifdef GRAPHISOFT_MPW_FIX
#if defined (macintosh)
	/* Keep the MPW cursor spinning while large inputs are read. */
	if (!(++totalline % 1000))
		SpinCursor (1);
#endif
#endif

	/* If there is more data in this line, return it. */

	if( *i->string )
	    return (unsigned char)*i->string++;

	/* If we're reading from an internal string list, go to the */
	/* next string. */

	if( i->strings )
	{
	    if( !*i->strings )
		goto next;

	    i->line++;
	    i->string = *(i->strings++);
	    return (unsigned char)*i->string++;
	}

	/* If necessary, open the file */

	if( !i->file )
	{
#ifdef GRAPHISOFT_FIX_NONNATIVENEWLINE
	    XFILE *f = NULL;

	    if( !strcmp( i->fname, "-" ) )
	    {
		/* Wrap stdin in an XFILE of our own so that "-" works */
		/* here just as it does with plain stdio. */

		f = (XFILE *)malloc( sizeof( XFILE ) );

		if( f )
		{
		    f->f = stdin;
		    f->start = 0;
		    f->end = 0;
		}
		else
		{
		    perror( i->fname );
		}
	    }
	    else if( !( f = xfopen( i->fname, "r" ) ) )
	    {
		perror( i->fname );
	    }
#else
	    FILE *f = stdin;

	    if( strcmp( i->fname, "-" ) && !( f = fopen( i->fname, "r" ) ) )
		perror( i->fname );
#endif

	    i->file = f;
	}

	/* If there's another line in this file, start it. */

#ifdef GRAPHISOFT_FIX_NONNATIVENEWLINE
	if( i->file && xfgets( i->buf, sizeof( i->buf ), i->file ) )
#else
	if( i->file && fgets( i->buf, sizeof( i->buf ), i->file ) )
#endif
	{
	    i->line++;
	    i->string = i->buf;
	    return (unsigned char)*i->string++;
	}

    next:
	/* This include is done.  */
	/* Free it up and return EOF so yyparse() returns to parse_file(). */

	incp = i->next;

	/* Close file, free name */

#ifdef GRAPHISOFT_FIX_NONNATIVENEWLINE
	if( i->file )
	{
	    if( i->file->f == stdin )
		free( (char *)i->file );	/* release the wrapper, keep stdin open */
	    else
		xfclose( i->file );
	}
#else
	if( i->file && i->file != stdin )
	    fclose( i->file );
#endif
	freestr( i->fname );
	free( (char *)i );

	return EOF;
}

/*
 * yylex() - set yylval to current token; return its type
 *
 * Macros to move things along:
 *
 *	yychar() - return and advance character; invalid after EOF
 *	yyprev() - back up one character; invalid before yychar()
 *
 * yychar() returns a continuous stream of characters, until it hits
 * the EOF of the current include file.
 */

# define yychar() ( *incp->string ? *incp->string++ : yyline() )
# define yyprev() ( incp->string-- )

int
yylex()
{
	int c;
	char buf[BIGGEST_TOKEN];
	char *b = buf;

	if( !incp )
	    goto eof;

	/* Get first character (whitespace or of token) */

	c = yychar();

	if( scanmode == SCAN_STRING )
	{
	    /* If scanning for a string (action's {}'s), look for the */
	    /* closing brace.  We handle matching braces, if they match! */

	    int nest = 1;

	    while( c != EOF && b < buf + sizeof( buf ) )
	    {
		    if( c == '{' )
			nest++;

		    if( c == '}' && !--nest )
			break;

		    *b++ = c;

		    c = yychar();
	    }

	    /* We ate the ending brace -- regurgitate it. */

	    if( c != EOF )
		yyprev();

	    /* Check obvious errors. */

	    if( b == buf + sizeof( buf ) )
	    {
		yyerror( "action block too big" );
		goto eof;
	    }

	    if( nest )
	    {
		yyerror( "unmatched {} in action block" );
		goto eof;
	    }

	    *b = 0;
	    yylval.type = STRING;
	    yylval.string = newstr( buf );

	}
	else
	{
	    char *b = buf;
	    struct keyword *k;
	    int inquote = 0;
#ifdef GRAPHISOFT_LEXSCAN
		int	invariable = 0; /* If !=0 we are inside a $(..) variable evaluation expression. */
		int	wasdollar = 0; /* Last charater was a '$' */
		int istokenseparator = 0;
#endif
	    int notkeyword;
		
	    /* Eat white space */

	    for( ;; )
	    {
		/* Skip past white space */

		while( c != EOF && isspace( c ) )
			c = yychar();

		/* Not a comment?  Swallow up comment line. */

		if( c != '#' )
			break;
		while( ( c = yychar() ) != EOF && c != '\n' && c != '\r' )
			;
	    }

	    /* c now points to the first character of a token. */

	    if( c == EOF )
		goto eof;

	    /* While scanning the word, disqualify it for (expensive) */
	    /* keyword lookup when we can: $anything, "anything", \anything */

	    notkeyword = c == '$';

		#ifdef GRAPHISOFT_LEXSCAN
		/* See if this is a single char token from tokenseparator */
		if ((lexscanmode != LEXSCANMODE_OLD) && (c != EOF && tokenseparator[c])) {
			char oldc = c;
			*b++ = c;
			c = yychar();
			
			if (c == tokenseparatornext[oldc]) { /* parse tokens staring with a tokensaparator: ?=, <=, >=, &&, || */
				*b++ = c;
				c = yychar();
			}
			
			if (lexscanmode == LEXSCANMODE_COMPATIBLE) {
				if (c != EOF && !isspace (c) && tokenseparator[oldc] != 3) {
					if( incp )
						printf( "%s: line %d: Warning: %c will be a token separator, use spaces or quotes!\n", incp->fname, incp->line, *(b-1) );
				}
			}
		} else {
		#endif
	    /* look for white space to delimit word */
	    /* "'s get stripped but preserve white space */
	    /* \ protects next character */

	    while( 
		c != EOF &&
		b < buf + sizeof( buf ) &&
		( inquote || !istokenseparator ) )
	    {
		int isdollar = 0;

		if( c == '"' )
		{
		    /* begin or end " */
		    inquote = !inquote;
		    notkeyword = 1;
		}
		else if( c != '\\' )
		{
		    if (!inquote)
		    {
			if (c == '$') {
			    isdollar = 1;
			} else if (c == '(') {
			    if (wasdollar && !invariable) {
				invariable = 1;
			    } else if (invariable) {
				invariable++;
			    }
			} else if (c == ')' && invariable) {
			    invariable--;
			}
		    }
		    /* normal char */
		    *b++ = c;
		}
		else if( ( c = yychar()) != EOF )
		{
		    /* \c */
		    *b++ = c;
		    notkeyword = 1;
		}
		else
		{
		    /* \EOF */
		    break;
		}
		wasdollar = isdollar;
		
		c = yychar();
		istokenseparator = (c == EOF) || (!invariable && !inquote && !(wasdollar && c == '(') && tokenseparator[c] && tokenseparator[c] != 3);

		/* incompatiblity in new and old lexscanmode */
		if (!inquote && istokenseparator && !isspace (c)) {
			if (lexscanmode == LEXSCANMODE_COMPATIBLE) {
				if( incp )
					printf( "%s: line %d: Warning: %c will be a token separator, use spaces or quotes!\n", incp->fname, incp->line, c );
			}
			if (lexscanmode == LEXSCANMODE_COMPATIBLE || lexscanmode == LEXSCANMODE_OLD) {
				istokenseparator = 0;
			}
		}
	    }
		}
	    /* Check obvious errors. */

	    if( b == buf + sizeof( buf ) )
	    {
		yyerror( "string too big" );
		goto eof;
	    }

	    if( inquote )
	    {
		yyerror( "unmatched \" in string" );
		goto eof;
	    }

	    /* We looked ahead a character - back up. */

	    if( c != EOF )
		yyprev();

	    /* scan token table */
	    /* don't scan if it's obviously not a keyword or if its */
	    /* an alphabetic when were looking for punctuation */

	    *b = 0;
	    yylval.type = ARG;

	    if( !notkeyword && !( isalpha( *buf ) && scanmode == SCAN_PUNCT ) )
	    {
		for( k = keywords; k->word; k++ )
		    if( *buf == *k->word && !strcmp( k->word, buf ) )
		{
		    yylval.type = k->type;
		    yylval.string = k->word;	/* used by symdump */
		    break;
		}
	    }

	    if( yylval.type == ARG )
		yylval.string = newstr( buf );
	}

	if( DEBUG_SCAN )
		printf( "scan %s\n", symdump( &yylval ) );

	return yylval.type;

eof:
	yylval.type = EOF;
	return yylval.type;
}

/*
 * symdump() - describe a token for error and debug messages
 *
 * Returns a pointer to a static buffer holding a printable description
 * of *s ("EOF", "argument xxx", "keyword xxx", ...).  The buffer is
 * overwritten by the next call.
 *
 * The token text is printed with an explicit precision of BIGGEST_TOKEN
 * characters so that the static buffer cannot be overrun, and a missing
 * (null) text is printed as "(null)" instead of being passed to %s.
 */
static char *
symdump( YYSTYPE *s )
{
	/* Longest fixed part is "unknown symbol " (15 chars) plus the NUL. */
	static char buf[ BIGGEST_TOKEN + 20 ];
	const char *text = s->string ? s->string : "(null)";
	const int limit = BIGGEST_TOKEN;

	switch( s->type )
	{
	case EOF:
		sprintf( buf, "EOF" );
		break;
	case 0:
		sprintf( buf, "unknown symbol %.*s", limit, text );
		break;
	case ARG:
		sprintf( buf, "argument %.*s", limit, text );
		break;
	case STRING:
		sprintf( buf, "string \"%.*s\"", limit, text );
		break;
	default:
		sprintf( buf, "keyword %.*s", limit, text );
		break;
	}
	return buf;
}
#	Change	User	Description
#9	2985	Miklos Fazekas	Scan.c bugfix
#8	2983	Miklos Fazekas	Fixed error in handling cr/lf problem
#7	2642	Miklos Fazekas	Sync to 2.5rc2
#6	2579	Miklos Fazekas	GSJam to 2.5rc1 integration
#5	2578	Miklos Fazekas	Integrate new lexical scanner code to GSJam
#4	2539	Miklos Fazekas	Updated sources
#3	2519	Miklos Fazekas	Sync to 2.5rc1
#2	1395	Miklos Fazekas	Merge with main jam
#1	1212	Miklos Fazekas	Created a Jam branch
//guest/perforce_software/jam/src/scan.c
#2	486	Perforce staff	Jam 2.3. See RELNOTES for a list of changes from 2.2.x. Just about every source file was touched when jam got ANSI-fied.
#1	2	laura	Add Jam/MR 2.2 source