#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
#define PCRE2_CODE_UNIT_WIDTH 32
#include <pcre2.h>
#include <defs.h>
#include <main.h>
#include <error.h>
#include <msglog.h>
#include <xalloc.h>
#include <utf8ing.h>
#include <symtab.h>
#include <parse.h>
#include <lex.h>
/* Current input line; incremented by the scanner each time it consumes a newline. */
int lineno = 0;
/* Current input column; GETC() increments it and UNGETC() decrements it. */
int colno = 0;
/* Capacity of token_buffer, counted in wide characters. */
static int maxtoken;
/* Wide-character text of the token being scanned. */
static wchar_t *token_buffer;
/* Capacity of token_utf8_buffer, counted in characters (the allocations multiply it by 6). */
static int max8token;
/* UTF-8 rendering of the current token, filled by copy_ucs4_to_utf8(). */
static utf8_t *token_utf8_buffer;
int indent_level = 0; /* Number of '{' minus number of '}'. */
/* Set to 1 by yylex() once it has read WEOF. */
static int end_of_file = 0;
/* One-character look-ahead left over by yylex(); -1 when there is none. */
static int nextchar = -1;
/* Value returned by setlocale() in init_lex(); handed back to setlocale() in fini_lex(). */
static char *locale;
/*
  GETC(): read the next wide character from `config' (declared outside this
  file section) and advance colno.  The macro parameter is unused; callers
  write GETC().  Implemented as a GNU statement expression.
*/
#define GETC(c) ({ wint_t ret; ++colno; ret = fgetwc( config ); ret; })
/*
  UNGETC(c): push `c' back onto `config' with ungetwc() and step colno back.
  Evaluates to the ungetwc() result.
*/
#define UNGETC(c) ({ wint_t ret; --colno; ret = ungetwc( c, config ); ret; })
/*
  Enlarge token_buffer: the capacity becomes 2*maxtoken + 10 wide characters
  (two extra slots are allocated on top of that).  `p' points into the old
  buffer; the returned pointer addresses the same element in the new one.
*/
static wchar_t *extend_token_buffer( wchar_t *p )
{
  int used = p - token_buffer;   /* index of p, valid across the realloc */

  maxtoken = 2 * maxtoken + 10;
  token_buffer = (wchar_t *)xrealloc( token_buffer, sizeof(wchar_t) * (maxtoken + 2) );

  return( &token_buffer[used] );
}
/*
  Enlarge token_utf8_buffer: max8token becomes 2*max8token + 10 and the
  allocation is (max8token + 2) * 6 bytes.  `p' points into the old buffer;
  the returned pointer addresses the same element in the new one.
*/
static utf8_t *extend_token_utf8_buffer( utf8_t *p )
{
  int used = p - token_utf8_buffer;   /* index of p, valid across the realloc */

  max8token = 2 * max8token + 10;
  token_utf8_buffer = (utf8_t *)xrealloc( token_utf8_buffer, 6 * (max8token + 2) );

  return( &token_utf8_buffer[used] );
}
/*
  Error hook with the yacc/bison yyerror() signature: forwards the message `s'
  to error() through a "%s" format so that the text itself is never
  interpreted as a format string.
*/
void yyerror( char const *s )
{
error( "%s", s );
}
/*
  Prepare the lexer for a new input: switch to the en_US.utf8 locale, reset
  the position counters and scanner state, and allocate the two token buffers
  with an initial capacity of 40 characters.
*/
void init_lex( void )
{
  /*
    NOTE(review): when given a non-NULL locale name, setlocale() returns the
    name of the locale it has just installed (or NULL on failure), not the
    previous one.  fini_lex() passes this value back to setlocale(), so the
    pre-existing locale is not actually restored -- confirm whether that was
    the intent.
  */
  locale = setlocale( LC_ALL, "en_US.utf8" );

  lineno = 0;
  colno = 0;
  nextchar = -1;
  maxtoken = 40;
  max8token = 40;
  indent_level = 0;
  end_of_file = 0;

  /*
    Size the buffers exactly as extend_token_buffer() and
    extend_token_utf8_buffer() do: capacity plus two spare ELEMENTS.  The
    scanner stores a terminator at index `maxtoken' (and, for string
    constants, a closing quote followed by a terminator), so the spare room
    must be counted in wide characters, not in bytes.
  */
  token_buffer = (wchar_t *)xmalloc( (maxtoken + 2) * sizeof(wchar_t) );
  token_utf8_buffer = (utf8_t *)xmalloc( (max8token + 2) * 6 );
}
/*
  Tear down the lexer: hand the string saved by init_lex() back to
  setlocale(), release both token buffers and return every piece of scanner
  state to its initial value.
*/
void fini_lex( void )
{
  locale = setlocale( LC_ALL, locale );

  /* free( NULL ) is a no-op, so no guard is needed. */
  free( token_buffer );
  token_buffer = NULL;
  free( token_utf8_buffer );
  token_utf8_buffer = NULL;

  maxtoken = 0;
  max8token = 0;

  lineno = 0;
  colno = 0;
  indent_level = 0;
  end_of_file = 0;
  nextchar = -1;
}
/*
  Called after a newline has been consumed.  Counts the new line, skips
  leading blanks and tabs and, when the first visible character is '#',
  discards the rest of that line.

  Returns the first non-blank character of the line, or -- for a '#' line --
  the '\n' or WEOF that ended it.
*/
static wint_t check_newline( void )
{
  wint_t ch;

  ++lineno;
  colno = 0;   /* GETC()/UNGETC() do the counting; here it only has to be reset */

  /* Read the first non-white character on the line. */
  do
  {
    ch = GETC();
  } while( ch == ' ' || ch == '\t' );

  if( ch != '#' )
    return( ch );

  /* '#' line: skip the rest of it. */
  while( ch != '\n' && ch != WEOF )
    ch = GETC();

  return( ch );
}
/*
  Skip one comment.  `c' is the character that selects the comment kind:
    '*'         the body of a block comment, up to and including the
                closing star-slash pair;
    '/' or '#'  a line comment, up to the end of the line.
  Any other value is returned unchanged.

  Returns the first character after the comment (passed through
  check_newline() when a newline is crossed), or WEOF after calling
  unterminated_comment() when the input ends first.
*/
static wint_t skip_comment( int c )
{
  if( c == '*' )
  {
    for( ;; )
    {
      /* Advance to the next '*', counting the lines that are crossed. */
      do
      {
        c = GETC();
        if( c == '\n' ) { ++lineno; colno = 0; }
      } while( c != '*' && c != WEOF );

      if( c == WEOF )
      {
        unterminated_comment();
        return( WEOF );
      }

      c = GETC();
      if( c == '/' )
        break;

      /* Not the terminator: give the character back and keep scanning. */
      UNGETC( c );
    }

    c = GETC();
    if( c == '\n' ) c = check_newline();
    return( c );
  }

  if( c == '/' || c == '#' )
  {
    do
    {
      c = GETC();
    } while( c != '\n' && c != WEOF );

    if( c == WEOF )
    {
      unterminated_comment();
      return( WEOF );
    }

    c = check_newline();
    return( c );
  }

  return( c );
} /* End skip_comment() */
/*
  Starting from `c', consume white space, comments and backslash-newline
  pairs, and return the first character that belongs to a token.

  A '/' that does not open a comment is returned as '/' after the following
  character has been pushed back.  A backslash that is not followed by a
  newline produces the "Stray '\\' in program" warning; in both backslash
  cases the character after the backslash is consumed.
*/
static wint_t skip_white_space( wint_t c )
{
  while( 1 )
  {
    if( c == '\n' )
    {
      c = check_newline();
    }
    else if( c == '#' )
    {
      c = skip_comment( c );
    }
    else if( c == '/' )
    {
      c = GETC();
      if( c != '/' && c != '*' )
      {
        /* A plain slash: it is a token of its own. */
        UNGETC( c );
        return( '/' );
      }
      c = skip_comment( c );
    }
    else if( c == ' '  || c == '\t' || c == '\f' ||
             c == '\v' || c == '\b' || c == '\r' )
    {
      c = GETC();
    }
    else if( c == '\\' )
    {
      c = GETC();
      if( c == '\n' )
      {
        ++lineno; colno = 0;
      }
      else
      {
        warning( "%s", "Stray '\\' in program" );
      }
      c = GETC();
    }
    else
    {
      return( c );
    }
  } /* End while( 1 ) */
} /* End skip_white_space() */
/*
  Read the escape sequence that follows a backslash which has already been
  consumed, and return the character it denotes.

    \x<hex digits>   value of the hex digits (an error is reported when no
                     digit follows, a warning when the value is too wide)
    \<octal digits>  value of up to six octal digits
    \\ \' \"         the character itself
    \n \t \r \f \b \a \v   the usual control characters
    backslash-newline      stores 1 in *ignore_ptr and returns 0

  Any other character after the backslash is returned unchanged.
*/
static wint_t readescape( int *ignore_ptr )
{
  wint_t c = GETC();
  wint_t code = 0;
  unsigned count = 0;      /* significant digits seen so far */
  unsigned firstdig = 0;   /* first non-zero hex digit */
  int seen_digit = 0;

  if( c == 'x' )
  {
    for( ;; )
    {
      unsigned digit;

      c = GETC();
      if( c >= '0' && c <= '9' )      digit = c - '0';
      else if( c >= 'a' && c <= 'f' ) digit = c - 'a' + 10;
      else if( c >= 'A' && c <= 'F' ) digit = c - 'A' + 10;
      else
      {
        UNGETC( c );
        break;
      }

      code = code * 16 + digit;

      /* Leading zeros are not counted as significant digits. */
      if( code != 0 || count != 0 )
      {
        if( count == 0 ) firstdig = code;
        count++;
      }
      seen_digit = 1;
    }

    if( !seen_digit )
    {
      error( "%s", "\\x used with no following hex digits" );
    }
    else if( count == 0 )
    {
      /* Digits are all 0's. Ok. */
    }
    else if( (count - 1) * 4 >= 32 || /* 32 == bits per INT */
             (count > 1 && ((1 << (32 - (count-1) * 4)) <= firstdig )))
    {
      warning( "%s", "Hex escape out of range" );
    }
    return( code );
  }

  if( c >= '0' && c <= '7' )
  {
    while( (c <= '7') && (c >= '0') && (count++ < 6) )
    {
      code = (code * 8) + (c - '0');
      c = GETC();
    }
    UNGETC( c );
    return( code );
  }

  switch( c )
  {
    case '\n':
      lineno++; colno = 0;
      *ignore_ptr = 1;
      return( 0 );

    case 'a': return( '\a' );
    case 'b': return( '\b' );
    case 'f': return( '\f' );
    case 'n': return( '\n' );
    case 'r': return( '\r' );
    case 't': return( '\t' );
    case 'v': return( '\v' );

    default:
      /* Backslash, quotes and unknown escapes stand for themselves. */
      break;
  }

  return( c );
} /* End of readescape() */
/*
  Tell whether `str' BEGINS with an HTML character reference, i.e. matches

      ^&[#A-Za-z0-9]*;

  (an ampersand, zero or more of '#', ASCII letters or digits, then a
  semicolon).  Only the start of the string is examined.

  Returns 1 on a match and 0 otherwise; a NULL `str' yields 0.

  The test is a direct scan rather than a regular expression: the pattern is
  anchored and needs no backtracking, so scanning gives the same answer
  without compiling a pattern on every call and without assuming anything
  about the width of wchar_t.
*/
int html_symbol_name( wchar_t *str )
{
  const wchar_t *p = str;

  if( p == NULL || *p != L'&' )
    return 0;

  ++p;
  while( (*p >= L'A' && *p <= L'Z') ||
         (*p >= L'a' && *p <= L'z') ||
         (*p >= L'0' && *p <= L'9') ||
          *p == L'#' )
  {
    ++p;
  }

  /* The terminating NUL is not in the class, so the scan cannot overrun. */
  return( *p == L';' );
}
int yylex( void )
{
wint_t c;
wchar_t *p;
int value;
if( nextchar >= 0 )
c = nextchar, nextchar = -1;
else
c = GETC();
while( 1 )
{
switch( c )
{
case ' ':
case '\t':
case '\f':
case '\v':
case '\b':
c = skip_white_space( c );
break;
case '\r':
case '\n':
case '/':
case '#':
case '\\':
c = skip_white_space( c );
default:
goto found_nonwhite;
} /* End switch( c ) */
found_nonwhite:
token_buffer[0] = c;
token_buffer[1] = 0;
switch( c )
{
case WEOF:
end_of_file = 1;
token_buffer[0] = 0;
value = 0;
goto done;
break;
case '$': /* dollar in identifier */
if( 1 ) goto letter;
return '$';
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
case 'P': case 'Q': case 'R': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
case 'p': case 'q': case 'r': case 's': case 't':
case 'u': case 'v': case 'w': case 'x': case 'y':
case 'z':
case '_':
/* RUSSIAN */
case L'А': case L'Б': case L'В': case L'Г': case L'Д':
case L'Е': case L'Ё': case L'Ж': case L'З': case L'И':
case L'Й': case L'К': case L'Л': case L'М': case L'Н':
case L'О': case L'П': case L'Р': case L'С': case L'Т':
case L'У': case L'Ф': case L'Х': case L'Ц': case L'Ч':
case L'Ш': case L'Щ': case L'Ъ': case L'Ы': case L'Ь':
case L'Э': case L'Ю': case L'Я':
case L'а': case L'б': case L'в': case L'г': case L'д':
case L'е': case L'ё': case L'ж': case L'з': case L'и':
case L'й': case L'к': case L'л': case L'м': case L'н':
case L'о': case L'п': case L'р': case L'с': case L'т':
case L'у': case L'ф': case L'х': case L'ц': case L'ч':
case L'ш': case L'щ': case L'ъ': case L'ы': case L'ь':
case L'э': case L'ю': case L'я':
letter:
p = token_buffer;
while( iswalnum( c ) || c == '_' || c == '$' || c == '@' || c == '-' || c == '.' || c == ':' )
{
if( p >= token_buffer + maxtoken )
{
p = extend_token_buffer( p );
extend_token_utf8_buffer( token_utf8_buffer );
}
*p++ = c;
c = GETC();
}
*p = 0;
nextchar = c;
value = VARIABLE;
(void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer );
/*********************
install into symtab
*********************/
{
if( !strcmp( "section", (const char *)token_utf8_buffer ) )
{
value = SECTION;
yylval.sym = install( NULL, SECTION, NULL );
}
else if( !strcmp( "repo", (const char *)token_utf8_buffer ) )
{
value = REPO;
yylval.sym = install( NULL, REPO, NULL );
}
else
{
SYMBOL *sp = NULL;
if( (sp = lookup( (const char *)token_utf8_buffer )) == (SYMBOL *)0 )
sp = install( (const char *)token_utf8_buffer, VARIABLE, 0 );
/******************************************************************
Если переменная уже в таблице, то мы предполагаем, что она имеет
тип равный одному из допустимых: NUMERICAL, STRING, или PATH.
******************************************************************/
if( sp->type != VARIABLE )
{
switch( sp->type )
{
case NUMERICAL:
case STRING:
case PATH:
value = sp->type;
break;
default:
/* error */
break;
}
}
yylval.sym = sp;
}
}
token_buffer[0] = 0;
token_utf8_buffer[0] = 0;
goto done;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
int constant = 0;
/* integer: */
p = token_buffer;
while( iswdigit( c ) )
{
if( p >= token_buffer + maxtoken )
{
p = extend_token_buffer( p );
extend_token_utf8_buffer( token_utf8_buffer );
}
*p++ = c;
c = GETC();
}
*p = 0;
nextchar = c;
value = NUMERICAL;
(void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer );
/*********************
install into symtab
*********************/
{
(void)swscanf( (const wchar_t *)token_buffer, L"%d", &constant );
yylval.sym = install( NULL, NUMERICAL, constant );
}
token_buffer[0] = 0;
token_utf8_buffer[0] = 0;
goto done;
break;
}
case '\'':
/* path_constant: */
{
int num_chars = 0;
unsigned int width = 8; /* to allow non asscii in path set width = 16 */
while( 1 )
{
tryagain:
c = GETC();
if( c == '\'' || c == WEOF ) break;
if( c == '\\' )
{
int ignore = 0;
c = readescape( &ignore );
if( ignore ) goto tryagain;
if( (unsigned)c >= (1 << width) )
{
warning( "%s", "Escape sequence out of range" );
}
}
else if( c == '\n' ) { lineno++; colno = 0; }
num_chars++;
if( num_chars > maxtoken - 4 )
{
extend_token_buffer( token_buffer );
extend_token_utf8_buffer( token_utf8_buffer );
}
token_buffer[num_chars] = c;
} /* End while( 1 ) */
token_buffer[num_chars + 1] = '\'';
token_buffer[num_chars + 2] = 0;
if( c != '\'' )
{
error( "%s", "Malformated path constant" );
}
else if( num_chars == 0 )
{
error( "%s", "Empty path constant" );
}
/* build path: */
{
wchar_t *s, *string = NULL;
wchar_t *p = &token_buffer[0];
while( *p )
{
if( *p == '\n' || *p == '\t' ) *p = ' ';
++p;
}
string = (wchar_t *)malloc( maxtoken * 4 + 10 );
p = &token_buffer[1];
s = &string[0];
while( *p == ' ' ) ++p;
while( *p )
{
if( *p != ' ' )
*s++ = *p++;
else
++p;
}
--s; *s = 0;
while( *(s-1) == ' ' ) --s;
*s = 0;
(void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)string );
free( string );
}
/*********************
install into symtab
*********************/
{
yylval.sym = install( NULL, PATH, (char *)token_utf8_buffer );
}
token_buffer[0] = 0;
token_utf8_buffer[0] = 0;
value = PATH;
goto done;
}
case '"':
/* string_constant: */
{
c = GETC();
p = token_buffer + 1;
while( c != '"' && c >= 0 )
{
if( c == '\\' )
{
int ignore = 0;
c = readescape( &ignore );
if( ignore ) goto skipnewline;
}
else if( c == '\n' ) lineno++;
if( p == token_buffer + maxtoken )
{
p = extend_token_buffer( p );
extend_token_utf8_buffer( token_utf8_buffer );
}
*p++ = c;
skipnewline:
c = GETC();
} /* End while( " ) */
*p = 0;
if( c < 0 )
{
error( "%s", "Unterminated string constant" );
}
*p++ = '"';
*p = 0;
/* build string: */
{
wchar_t *s, *string = NULL;
wchar_t *p = &token_buffer[0];
while( *p )
{
if( *p == '\n' || *p == '\t' ) *p = ' ';
++p;
}
string = (wchar_t *)malloc( maxtoken * 4 + 10 );
p = &token_buffer[1];
s = &string[0];
while( *p == ' ' ) ++p;
while( *p )
{
if( *p != ' ' )
{
switch( *p )
{
case '&':
/************************************************
Skip HTML symbol names such as  ,... etc.:
*/
if( ! html_symbol_name( p ) )
{
*s++ = '&'; *s++ = 'a'; *s++ = 'm'; *s++ = 'p'; *s++ = ';'; ++p;
}
else
{
*s++ = *p++;
}
break;
case '<':
*s++ = '&'; *s++ = 'l'; *s++ = 't'; *s++ = ';'; ++p;
break;
case '>':
*s++ = '&'; *s++ = 'g'; *s++ = 't'; *s++ = ';'; ++p;
break;
default:
*s++ = *p++;
break;
}
}
else
{
/* skip multiple spaces */
if( *(p+1) != ' ' )
*s++ = *p++;
else
++p;
}
}
--s; *s = 0;
while( *(s-1) == ' ' ) --s;
*s = 0;
(void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)string );
free( string );
}
/*********************
install into symtab
*********************/
{
yylval.sym = install( NULL, STRING, (char *)token_utf8_buffer );
}
token_buffer[0] = 0;
token_utf8_buffer[0] = 0;
value = STRING;
goto done;
}
case 0:
value = 1;
goto done;
break;
case '{':
indent_level++;
value = c;
goto done;
break;
case '}':
indent_level--;
value = c;
goto done;
break;
default:
value = c;
goto done;
break;
} /* End switch( c ) */
} /* End while( 1 ) */
done:
return( value );
}