cScm Configuration Daemon

cScm is a tool that converts SCM configuration files into a binary format and stores the result in shared memory, where it is read by the cSvn-ui and cGit-ui CGI scripts.

12 Commits   0 Branches   1 Tag

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>

#define PCRE2_CODE_UNIT_WIDTH 32
#include <pcre2.h>

#include <defs.h>

#include <main.h>
#include <error.h>
#include <msglog.h>
#include <xalloc.h>
#include <utf8ing.h>
#include <symtab.h>
#include <parse.h>

#include <lex.h>



/*
  Current position in the input.  lineno is advanced wherever a newline is
  consumed; colno is maintained by the GETC()/UNGETC() macros below and is
  reset to 0 at each newline.
 */
int lineno = 0;
int colno  = 0;

/* Wide-character token buffer and its capacity in characters;
   grown by extend_token_buffer(). */
static int       maxtoken;
static wchar_t  *token_buffer;

/* UTF-8 copy of the current token and its capacity counter;
   grown by extend_token_utf8_buffer(). */
static int       max8token;
static utf8_t   *token_utf8_buffer;

int       indent_level = 0; /* Number of '{' minus number of '}'. */

/* Set to 1 by yylex() once WEOF has been read. */
static int       end_of_file = 0;
/* One character of look-ahead left by yylex() for its next call; -1 if none. */
static int       nextchar = -1;

/* Value returned by setlocale() in init_lex(); passed back to setlocale()
   in fini_lex(). */
static char     *locale;

/*
  GETC() reads one wide character from the stream `config' (not declared in
  this part of the file) and increments colno; UNGETC(c) pushes c back and
  decrements colno.  Both are GNU statement expressions that yield the
  result of fgetwc()/ungetwc().  The parameter of GETC is unused; callers
  invoke it with an empty argument.
 */
#define GETC(c)    ({ wint_t ret; ++colno; ret = fgetwc( config ); ret; })
#define UNGETC(c)  ({ wint_t ret; --colno; ret = ungetwc( c, config ); ret; })


/*
  Grow token_buffer to (2 * maxtoken + 10) characters plus two spare slots.

  p is a position inside the current token_buffer; the return value is the
  same position expressed relative to the (possibly relocated) buffer.
 */
static wchar_t *extend_token_buffer( wchar_t *p )
{
   /* Remember the position as an index before the buffer is reallocated. */
   int index = p - token_buffer;

   maxtoken     = 2 * maxtoken + 10;
   token_buffer = (wchar_t *)xrealloc( token_buffer, sizeof(wchar_t) * (maxtoken + 2) );

   return( &token_buffer[index] );
}

/*
  Grow token_utf8_buffer: max8token becomes 2 * max8token + 10 and the
  buffer is reallocated to (max8token + 2) * 6 bytes.

  p is a position inside the current token_utf8_buffer; the return value is
  the same position relative to the (possibly relocated) buffer.
 */
static utf8_t *extend_token_utf8_buffer( utf8_t *p )
{
   /* Remember the position as an index before the buffer is reallocated. */
   int index = p - token_utf8_buffer;

   max8token         = 2 * max8token + 10;
   token_utf8_buffer = (utf8_t *)xrealloc( token_utf8_buffer, 6 * (max8token + 2) );

   return( &token_utf8_buffer[index] );
}


/*
  Error callback with the yyerror() name and signature: forwards the
  message s to error() through a fixed "%s" format, so the message text is
  never interpreted as a format string.
 */
void yyerror( char const *s )
{
  error( "%s", s );
}


/*
  Prepare the lexer for a new input: switch to a UTF-8 locale, reset the
  position counters and look-ahead state, and allocate the token buffers.
 */
void init_lex( void )
{
  /*
    NOTE(review): setlocale() returns the name of the locale it has just
    installed (or NULL on failure), not the previously active one, so
    `locale' does not hold the caller's original locale for fini_lex() to
    restore.  Confirm whether a real save/restore is intended.
   */
  locale = setlocale( LC_ALL, "en_US.utf8" );

  lineno = 0;
  colno  = 0;

  nextchar  = -1;
  maxtoken  = 40;
  max8token = 40;

  indent_level = 0;
  end_of_file  = 0;

  /*
    Size the buffers exactly as extend_token_buffer() and
    extend_token_utf8_buffer() do: two spare ELEMENTS beyond the nominal
    capacity.  yylex() writes a closing quote and a terminator at indexes
    maxtoken and maxtoken + 1, which two spare bytes cannot hold.
   */
  token_buffer = (wchar_t *)xmalloc( (maxtoken + 2) * sizeof(wchar_t) );
  token_utf8_buffer = (utf8_t *)xmalloc( (max8token + 2) * 6 );
}

/*
  Tear down the lexer: hand the locale name recorded by init_lex() back to
  setlocale(), release the token buffers and reset all lexer state.
 */
void fini_lex( void )
{
  locale = setlocale( LC_ALL, locale );

  /* free() accepts NULL, so no guard is needed. */
  free( token_buffer );
  token_buffer = NULL;

  free( token_utf8_buffer );
  token_utf8_buffer = NULL;

  lineno = 0;
  colno  = 0;

  nextchar  = -1;
  maxtoken  =  0;
  max8token =  0;

  end_of_file  = 0;
  indent_level = 0;
}

/*
  Called after a newline has been consumed.  Counts the line, skips leading
  blanks and tabs, and if the line starts with '#' discards it up to (and
  including) the next newline.

  Returns the first significant character of the line, or the '\n' / WEOF
  that ended a '#' line.
 */
static wint_t check_newline( void )
{
  wint_t  ch;

  /* colno is counted by GETC()/UNGETC(); here it only has to be zeroed. */
  colno = 0;
  ++lineno;

  /* Find the first character that is neither a blank nor a tab. */
  do
  {
    ch = GETC();

  } while( ch == ' ' || ch == '\t' );

  if( ch != '#' ) return( ch );

  /* '#' line: throw away everything up to the end of the line. */
  do
  {
    ch = GETC();

  } while( ch != '\n' && ch != WEOF );

  return( ch );
}

/*
  Skip a comment whose introducer has already been read.

  c selects the comment kind: '*' for a block comment (the body is skipped
  up to the closing star-slash pair), '/' or '#' for a comment that runs to
  the end of the line.  Any other value is returned unchanged.

  Returns the first character after the comment.  If WEOF is reached before
  the comment ends, unterminated_comment() is called and WEOF is returned.
 */
static wint_t skip_comment( int c )
{
  if( c == '*' )
  {
    for( ;; )
    {
      /* Advance to the next '*', keeping the line counter up to date. */
      do
      {
         c = GETC();
         if( c == '\n' ) { ++lineno; colno = 0; }

      } while( c != '*' && c != WEOF );

      if( c == WEOF )
      {
         unterminated_comment();
         return( WEOF );
      }

      c = GETC();
      if( c == '/' ) break;

      /* Not the terminator: re-scan this character, it may be another '*'. */
      UNGETC( c );
    }

    c = GETC();
    if( c == '\n' ) c = check_newline();
    return( c );
  }

  if( c == '/' || c == '#' )
  {
    do
    {
       c = GETC();

    } while( c != '\n' && c != WEOF );

    if( c == WEOF )
    {
       unterminated_comment();
       return( WEOF );
    }

    return( check_newline() );
  }

  return( c );

} /* End skip_comment() */

/*
  Starting with the already-read character c, skip white space, newlines,
  comments and backslash-newline pairs.

  Returns the first character that belongs to a token.  A '/' that does not
  start a comment is returned as '/', with the character after it pushed
  back onto the input.
 */
static wint_t skip_white_space( wint_t c )
{
  for( ;; )
  {
    if( c == '\n' )
    {
      c = check_newline();
      continue;
    }

    if( c == '#' )
    {
      c = skip_comment( c );
      continue;
    }

    if( c == '/' )
    {
      c = GETC();
      if( c != '/' && c != '*' )
      {
        UNGETC( c );
        return( '/' );
      }
      c = skip_comment( c );
      continue;
    }

    if( c == ' '  || c == '\t' || c == '\f' ||
        c == '\v' || c == '\b' || c == '\r'   )
    {
      c = GETC();
      continue;
    }

    if( c == '\\' )
    {
      /* Backslash-newline joins lines; any other character is dropped
         after a warning. */
      c = GETC();
      if( c == '\n' ) { ++lineno; colno = 0; }
      else
      {
        warning( "%s", "Stray '\\' in program" );
      }
      c = GETC();
      continue;
    }

    return( c );

  } /* End for( ;; ) */

} /* End skip_white_space() */

/*
  Read the remainder of an escape sequence (the backslash itself has
  already been consumed) and return the character it denotes.

  For backslash-newline, 1 is stored in *ignore_ptr and 0 is returned;
  *ignore_ptr is not written in any other case.  An unrecognised escape
  character is returned as is.
 */
static wint_t readescape( int *ignore_ptr )
{
  wint_t    c = GETC();
  wint_t    code = 0;
  unsigned  count = 0;     /* hex: digits counted from the first non-zero one */
  unsigned  firstdig = 0;  /* hex: value of the first non-zero digit          */
  int       seen_digit = 0;

  if( c == 'x' )
  {
     for( ;; )
     {
        unsigned digit;

        c = GETC();
        if(      c >= '0' && c <= '9' ) digit = c - '0';
        else if( c >= 'a' && c <= 'f' ) digit = c - 'a' + 10;
        else if( c >= 'A' && c <= 'F' ) digit = c - 'A' + 10;
        else
        {
           UNGETC( c );
           break;
        }

        code = code * 16 + digit;
        if( code != 0 || count != 0 )
        {
           if( count == 0 ) firstdig = code;
           count++;
        }
        seen_digit = 1;
     }

     if( !seen_digit )
     {
        error( "%s", "\\x used with no following hex digits" );
     }
     else if( count != 0 && /* count == 0: digits are all 0's. Ok. */
              ( (count - 1) * 4 >= 32 || /* 32 == bits per INT */
                (count > 1 && ((1 << (32 - (count-1) * 4)) <= firstdig )) ) )
     {
        warning( "%s", "Hex escape out of range" );
     }
     return( code );
  }

  if( c >= '0' && c <= '7' )
  {
     /* Octal escape: at most six digits. */
     for( count = 0; c >= '0' && c <= '7' && count < 6; ++count )
     {
        code = (code * 8) + (c - '0');
        c = GETC();
     }
     UNGETC( c );
     return( code );
  }

  switch( c )
  {
     case '\n':
        lineno++; colno = 0;
        *ignore_ptr = 1;
        return( 0 );

     case 'n':  return( '\n' );
     case 't':  return( '\t' );
     case 'r':  return( '\r' );
     case 'f':  return( '\f' );
     case 'b':  return( '\b' );
     case 'a':  return( '\a' );
     case 'v':  return( '\v' );

     default:   /* backslash, quotes and anything unrecognised: unchanged */
        break;
  }

  return( c );

} /* End of readescape() */


/*
  Tell whether str begins with an HTML character reference such as
  "&nbsp;" or "&#169;".

  The accepted form is the one the pattern  ^&[#A-Za-z0-9]*;  describes:
  an '&', zero or more characters from the set # A-Z a-z 0-9, then ';'.
  The scan is done by hand so that no regular expression has to be
  compiled and freed on every call.

  Returns 1 on a match, 0 otherwise (including for a NULL str).
 */
int html_symbol_name( wchar_t *str )
{
  const wchar_t *q;

  if( str == NULL || *str != L'&' ) return 0;

  for( q = str + 1; ; ++q )
  {
    wchar_t ch = *q;

    /* Explicit ranges, not iswalnum(): only ASCII may appear in the name. */
    if( ch == L'#' ||
        (ch >= L'0' && ch <= L'9') ||
        (ch >= L'A' && ch <= L'Z') ||
        (ch >= L'a' && ch <= L'z')   )
      continue;

    /* First character outside the set decides: only ';' completes a name
       (the terminating NUL ends the scan here as a non-match). */
    return( ch == L';' );
  }
}


/*
  Replace every newline and tab in the NUL-terminated wide string text
  with a blank.
 */
static void blank_line_breaks( wchar_t *text )
{
  for( ; *text; ++text )
  {
    if( *text == '\n' || *text == '\t' ) *text = ' ';
  }
}

/*
  Finish a normalised constant that occupies [begin, end): drop its last
  character (the copied closing quote), then any trailing blanks, and
  NUL-terminate it.  Never steps in front of begin.
 */
static void trim_constant_tail( wchar_t *begin, wchar_t *end )
{
  if( end > begin ) --end;
  while( end > begin && *(end - 1) == ' ' ) --end;
  *end = 0;
}

/*
  Convert the path constant held in token_buffer (opening quote, text,
  closing quote, NUL) to UTF-8 in token_utf8_buffer.  Newlines and tabs
  count as blanks and every blank is removed.
 */
static void store_path_constant( void )
{
  wchar_t *src, *dst, *text;

  blank_line_breaks( token_buffer );

  /* The text only shrinks, so the length of the token is enough. */
  text = (wchar_t *)xmalloc( (wcslen( token_buffer ) + 1) * sizeof(wchar_t) );

  dst = text;
  for( src = &token_buffer[1]; *src; ++src )
  {
    if( *src != ' ' ) *dst++ = *src;
  }
  trim_constant_tail( text, dst );

  (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)text );

  free( text );
}

/*
  Convert the string constant held in token_buffer (opening quote, text,
  closing quote, NUL) to UTF-8 in token_utf8_buffer.  Newlines and tabs
  count as blanks; leading and trailing blanks are removed and runs of
  blanks collapse to one.  '<' and '>' become "&lt;" and "&gt;"; '&'
  becomes "&amp;" unless it already starts an HTML symbol name.
 */
static void store_string_constant( void )
{
  wchar_t *src, *dst, *text;

  blank_line_breaks( token_buffer );

  /* Worst case every character expands to the five characters of "&amp;". */
  text = (wchar_t *)xmalloc( (wcslen( token_buffer ) * 5 + 1) * sizeof(wchar_t) );

  src = &token_buffer[1];
  while( *src == ' ' ) ++src;

  dst = text;
  while( *src )
  {
    const wchar_t *entity = NULL;

    switch( *src )
    {
      case ' ':
        /* skip multiple spaces: only the last blank of a run is kept */
        if( *(src + 1) == ' ' ) { ++src; continue; }
        break;

      case '&':
        /* Leave HTML symbol names such as &nbsp; untouched. */
        if( ! html_symbol_name( src ) ) entity = L"&amp;";
        break;

      case '<':
        entity = L"&lt;";
        break;

      case '>':
        entity = L"&gt;";
        break;

      default:
        break;
    }

    if( entity )
    {
      while( *entity ) *dst++ = *entity++;
    }
    else
    {
      *dst++ = *src;
    }
    ++src;
  }
  trim_constant_tail( text, dst );

  (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)text );

  free( text );
}

/*
  Return the next token of the configuration text.

  The result is 0 at end of input; SECTION or REPO for those keywords;
  VARIABLE (or the NUMERICAL / STRING / PATH type already recorded for the
  symbol) for an identifier; NUMERICAL for a decimal number; PATH for a
  '...' constant; STRING for a "..." constant; 1 for a NUL character; and
  the character itself for anything else ('{' and '}' also adjust
  indent_level).  For every token except the last three kinds yylval.sym
  is set to the corresponding symbol table entry.

  White space and comments in front of the token are skipped.  The
  character that ends an identifier or a number is kept in nextchar for
  the following call.
 */
int yylex( void )
{
  wint_t   c;
  wchar_t *p;
  int      value;

  if( nextchar >= 0 )
  {
    c = nextchar;
    nextchar = -1;
  }
  else
    c = GETC();

  switch( c )
  {
    case ' ':
    case '\t':
    case '\f':
    case '\v':
    case '\b':
    case '\r':
    case '\n':
    case '/':
    case '#':
    case '\\':
      c = skip_white_space( c );
      break;

    default:
      break;

  } /* End switch( c ) */

  token_buffer[0] = c;
  token_buffer[1] = 0;

  switch( c )
  {
    case WEOF:
      end_of_file = 1;
      token_buffer[0] = 0;
      value = 0;
      break;

    case '$': /* dollar in identifier */

    case 'A': case 'B': case 'C': case 'D': case 'E':
    case 'F': case 'G': case 'H': case 'I': case 'J':
    case 'K': case 'L': case 'M': case 'N': case 'O':
    case 'P': case 'Q': case 'R': case 'S': case 'T':
    case 'U': case 'V': case 'W': case 'X': case 'Y':
    case 'Z':
    case 'a': case 'b': case 'c': case 'd': case 'e':
    case 'f': case 'g': case 'h': case 'i': case 'j':
    case 'k': case 'l': case 'm': case 'n': case 'o':
    case 'p': case 'q': case 'r': case 's': case 't':
    case 'u': case 'v': case 'w': case 'x': case 'y':
    case 'z':
    case '_':

    /* RUSSIAN */
    case L'А': case L'Б': case L'В': case L'Г': case L'Д':
    case L'Е': case L'Ё': case L'Ж': case L'З': case L'И':
    case L'Й': case L'К': case L'Л': case L'М': case L'Н':
    case L'О': case L'П': case L'Р': case L'С': case L'Т':
    case L'У': case L'Ф': case L'Х': case L'Ц': case L'Ч':
    case L'Ш': case L'Щ': case L'Ъ': case L'Ы': case L'Ь':
    case L'Э': case L'Ю': case L'Я':

    case L'а': case L'б': case L'в': case L'г': case L'д':
    case L'е': case L'ё': case L'ж': case L'з': case L'и':
    case L'й': case L'к': case L'л': case L'м': case L'н':
    case L'о': case L'п': case L'р': case L'с': case L'т':
    case L'у': case L'ф': case L'х': case L'ц': case L'ч':
    case L'ш': case L'щ': case L'ъ': case L'ы': case L'ь':
    case L'э': case L'ю': case L'я':

      /* identifier or keyword */
      p = token_buffer;
      while( iswalnum( c ) || c == '_' || c == '$' || c == '@' || c == '-' || c == '.' || c == ':' )
      {
        if( p >= token_buffer + maxtoken )
        {
          p = extend_token_buffer( p );
          extend_token_utf8_buffer( token_utf8_buffer );
        }

        *p++ = c;
        c = GETC();
      }
      *p = 0;
      nextchar = c;
      value = VARIABLE;

      (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer );

      /*********************
        install into symtab
       *********************/
      if( !strcmp( "section", (const char *)token_utf8_buffer ) )
      {
        value = SECTION;
        yylval.sym = install( NULL, SECTION, NULL );
      }
      else if( !strcmp( "repo", (const char *)token_utf8_buffer ) )
      {
        value = REPO;
        yylval.sym = install( NULL, REPO, NULL );
      }
      else
      {
        SYMBOL *sp = NULL;

        if( (sp = lookup( (const char *)token_utf8_buffer )) == (SYMBOL *)0 )
          sp = install( (const char *)token_utf8_buffer, VARIABLE, 0 );

        /******************************************************************
          If the variable is already in the table, we assume that its type
          is one of the permitted ones: NUMERICAL, STRING, or PATH.
         ******************************************************************/
        if( sp->type != VARIABLE )
        {
          switch( sp->type )
          {
            case NUMERICAL:
            case STRING:
            case PATH:
              value = sp->type;
              break;
            default:
              /* error */
              break;
          }
        }
        yylval.sym = sp;
      }

      token_buffer[0] = 0;
      token_utf8_buffer[0] = 0;
      break;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        int constant = 0;
/* integer: */
        p = token_buffer;
        while( iswdigit( c ) )
        {
          if( p >= token_buffer + maxtoken )
          {
            p = extend_token_buffer( p );
            extend_token_utf8_buffer( token_utf8_buffer );
          }

          *p++ = c;
          c = GETC();
        }
        *p = 0;
        nextchar = c;
        value = NUMERICAL;

        (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer );

        /*********************
          install into symtab
         *********************/
        /* NOTE(review): a number that does not fit in an int is not
           diagnosed here. */
        (void)swscanf( (const wchar_t *)token_buffer, L"%d", &constant );
        yylval.sym = install( NULL, NUMERICAL, constant );

        token_buffer[0] = 0;
        token_utf8_buffer[0] = 0;
        break;
      }

    case '\'':
/* path_constant: */
      {
        int           num_chars = 0;
        unsigned int  width = 8; /* to allow non-ASCII in a path set width = 16 */

        for( ;; )
        {
          c = GETC();

          if( c == '\'' || c == WEOF ) break;
          if( c == '\\' )
          {
            int ignore = 0;
            c = readescape( &ignore );
            if( ignore ) continue;  /* backslash-newline: nothing to store */
            if( (unsigned)c >= (1 << width) )
            {
              warning( "%s", "Escape sequence out of range" );
            }
          }
          else if( c == '\n' ) { lineno++; colno = 0; }

          num_chars++;
          if( num_chars > maxtoken - 4 )
          {
            extend_token_buffer( token_buffer );
            extend_token_utf8_buffer( token_utf8_buffer );
          }

          token_buffer[num_chars] = c;

        } /* End for( ;; ) */

        token_buffer[num_chars + 1] = '\'';
        token_buffer[num_chars + 2] = 0;

        if( c != '\'' )
        {
          error( "%s", "Malformated path constant" );
        }
        else if( num_chars == 0 )
        {
          error( "%s", "Empty path constant" );
        }

        store_path_constant();

        /*********************
          install into symtab
         *********************/
        yylval.sym = install( NULL, PATH, (char *)token_utf8_buffer );

        token_buffer[0] = 0;
        token_utf8_buffer[0] = 0;
        value = PATH;
        break;
      }

    case '"':
/* string_constant: */
      {
        c = GETC();
        p = token_buffer + 1;

        /* wint_t may be unsigned, so end of input has to be detected by
           comparing with WEOF, not with a sign test. */
        while( c != '"' && c != WEOF )
        {
          int ignore = 0;

          if( c == '\\' ) c = readescape( &ignore );
          else if( c == '\n' ) { lineno++; colno = 0; }

          if( !ignore ) /* backslash-newline stores nothing */
          {
            if( p >= token_buffer + maxtoken )
            {
              p = extend_token_buffer( p );
              extend_token_utf8_buffer( token_utf8_buffer );
            }
            *p++ = c;
          }

          c = GETC();

        } /* End while( " ) */

        if( c == WEOF )
        {
          error( "%s", "Unterminated string constant" );
        }

        *p++ = '"';
        *p = 0;

        store_string_constant();

        /*********************
          install into symtab
         *********************/
        yylval.sym = install( NULL, STRING, (char *)token_utf8_buffer );

        token_buffer[0] = 0;
        token_utf8_buffer[0] = 0;
        value = STRING;
        break;
      }

    case 0:
      value = 1;
      break;

    case '{':
      indent_level++;
      value = c;
      break;

    case '}':
      indent_level--;
      value = c;
      break;

    default:
      value = c;
      break;

  } /* End switch( c ) */

  return( value );
}