Main Page | Alphabetical List | Data Structures | File List | Data Fields | Globals

lexer.c

Go to the documentation of this file.
00001 /*************************************************************************
00002  *
00003  *  file:  lexer.c
00004  *
00005  * =======================================================================
00006  *
00007  *                              lexer.c
00008  *
00009  *  The lexer reads files and returns a stream of lexemes.  Get_lexeme() is
00010  *  the main routine; it looks for the next lexeme in the input, and stores
00011  *  it in the global variable "lexeme".  See the structure definition below.
00012  *
00013  *  Restrictions:  the lexer cannot read individual input lines longer than
00014  *  MAX_LEXER_LINE_LENGTH characters.  Thus, a single lexeme can't be longer
00015  *  than that either.
00016  *
00017  *  The lexer maintains a stack of files being read, in order to handle nested
00018  *  loads.  Start_lex_from_file() and stop_lex_from_file() push and pop the
00019  *  stack.  Immediately after start_lex_from_file(), the current lexeme (global
00020  *  variable) is undefined.  Immediately after stop_lex_from_file(), the 
00021  *  current lexeme is automatically restored to whatever it was just before
00022  *  the corresponding start_lex_from_file() call.
00023  *  
00024  *  Determine_possible_symbol_types_for_string() is a utility routine which
00025  *  figures out what kind(s) of symbol a given string could represent.
00026  *  
00027  *  Print_location_of_most_recent_lexeme() is used to print an indication
00028  *  of where a parser error occurred.  It tries to print out the current
00029  *  source line with a pointer to where the error was detected.
00030  *  
00031  *  Current_lexer_parentheses_level() returns the current level of parentheses
00032  *  nesting (0 means no open paren's have been encountered).
00033  *  Skip_ahead_to_balanced_parentheses() eats lexemes until the appropriate
00034  *  closing paren is found (0 means eat until back at the top level).
00035  *  
00036  *  Fake_rparen_at_next_end_of_line() tells the lexer to insert a fake
00037  *  R_PAREN_LEXEME token the next time it reaches the end of a line.
00038  *  
00039  *  Set_lexer_allow_ids() tells the lexer whether to allow identifiers to
00040  *  be read.  If FALSE, things that look like identifiers will be returned
00041  *  as SYM_CONSTANT_LEXEME's instead.
00042  *
00043  *  BUGBUG There are still problems with Soar not being very friendly
00044  *  when users have typos in productions, particularly with mismatched
00045  *  braces and parens.  see also parser.c
00046  * =======================================================================
00047  *
00048  * Copyright 1995-2003 Carnegie Mellon University,
00049  *                                                                               University of Michigan,
00050  *                                                                               University of Southern California/Information
00051  *                                                                               Sciences Institute. All rights reserved.
00052  *                                                                              
00053  * Redistribution and use in source and binary forms, with or without
00054  * modification, are permitted provided that the following conditions are met:
00055  *
00056  * 1.   Redistributions of source code must retain the above copyright notice,
00057  *              this list of conditions and the following disclaimer. 
00058  * 2.   Redistributions in binary form must reproduce the above copyright notice,
00059  *              this list of conditions and the following disclaimer in the documentation
00060  *              and/or other materials provided with the distribution. 
00061  *
00062  * THIS SOFTWARE IS PROVIDED BY THE SOAR CONSORTIUM ``AS IS'' AND ANY EXPRESS OR
00063  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
00064  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
00065  * EVENT SHALL THE SOAR CONSORTIUM  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
00066  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00067  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00068  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00069  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00070  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00071  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00072  * The views and conclusions contained in the software and documentation are
00073  * those of the authors and should not be interpreted as representing official
00074  * policies, either expressed or implied, of Carnegie Mellon University, the
00075  * University of Michigan, the University of Southern California/Information
00076  * Sciences Institute, or the Soar consortium.
00077  * =======================================================================
00078  */
00079 /* ======================================================================
00080                              lexer.c
00081 
00082     See comments in soarkernel.h for an overview.
00083    ====================================================================== */
00084 
00085 #include "soarkernel.h"
00086 #include <ctype.h>
00087 #include <errno.h>
00088 #include <string.h>
00089 #include <stdlib.h>
00090 
/* Character-class lookup tables used by the lexer, one slot per character code. */

/* may the character appear inside a symbol? */
bool constituent_char[256];

/* does the character count as whitespace? */
bool whitespace[256];

/* may the character begin a number? */
bool number_starters[256];
00094 
00095 /* ======================================================================
00096                        Start/Stop Lex from File
00097                        
00098   The lexer maintains a stack of files being read, in order to handle nested
00099   loads.  Start_lex_from_file() and stop_lex_from_file() push and pop the
00100   stack.  Immediately after start_lex_from_file(), the current lexeme (agent
00101   variable) is undefined.  Immediately after stop_lex_from_file(), the 
00102   current lexeme is automatically restored to whatever it was just before
00103   the corresponding start_lex_from_file() call.
00104 ====================================================================== */
00105 
00106 void start_lex_from_file(char *filename, FILE * already_opened_file)
00107 {
00108     lexer_source_file *lsf;
00109 
00110     lsf = allocate_memory(sizeof(lexer_source_file), MISCELLANEOUS_MEM_USAGE);
00111     lsf->saved_lexeme = current_agent(lexeme);
00112     lsf->saved_current_char = current_agent(current_char);
00113     lsf->parent_file = current_agent(current_file);
00114     current_agent(current_file) = lsf;
00115     lsf->filename = make_memory_block_for_string(filename);
00116     lsf->file = already_opened_file;
00117     lsf->fake_rparen_at_eol = FALSE;
00118     lsf->allow_ids = TRUE;
00119     lsf->parentheses_level = 0;
00120     lsf->column_of_start_of_last_lexeme = 0;
00121     lsf->line_of_start_of_last_lexeme = 0;
00122     lsf->current_line = 0;
00123     lsf->current_column = 0;
00124     lsf->buffer[0] = 0;
00125     current_agent(current_char) = ' ';  /* whitespace--to force immediate read of first line */
00126 }
00127 
00128 void stop_lex_from_file(void)
00129 {
00130     lexer_source_file *lsf;
00131 
00132     if (reading_from_top_level()) {
00133         print("Internal error: tried to stop_lex_from_file at top level\n");
00134         return;
00135     }
00136     lsf = current_agent(current_file);
00137     current_agent(current_file) = current_agent(current_file)->parent_file;
00138     current_agent(current_char) = lsf->saved_current_char;
00139     current_agent(lexeme) = lsf->saved_lexeme;
00140 
00141     free_memory_block_for_string(lsf->filename);
00142     free_memory(lsf, MISCELLANEOUS_MEM_USAGE);
00143 }
00144 
00145 /* ======================================================================
00146                              Get next char
00147 
00148   Get_next_char() gets the next character from the current input file and
00149   puts it into the agent variable current_char.
00150 ====================================================================== */
00151 
00152 void get_next_char(void)
00153 {
00154     char *s;
00155 
00156     /* 
00157        Previously this block of code was only used in conjunction with the
00158        Tcl interface, however, the more robust alternate_input_string
00159        mechanism subsumes input_sting, so I am using this block in 
00160        all future builds of Soar8.
00161 
00162        081699 SW 
00163      */
00164 
00165     /* Soar-Bugs #54, TMH */
00166     if (current_agent(alternate_input_exit) &&
00167         (current_agent(alternate_input_string) == NULL) && (current_agent(alternate_input_suffix) == NULL)) {
00168         current_agent(current_char) = EOF_AS_CHAR;
00169         control_c_handler(0);
00170         return;
00171     }
00172 
00173     if (current_agent(alternate_input_string) != NULL) {
00174         current_agent(current_char) = *current_agent(alternate_input_string)++;
00175 
00176         if (current_agent(current_char) == '\0') {
00177             current_agent(alternate_input_string) = NIL;
00178             current_agent(current_char) = *current_agent(alternate_input_suffix)++;
00179         }
00180     } else if (current_agent(alternate_input_suffix) != NULL) {
00181         current_agent(current_char) = *current_agent(alternate_input_suffix)++;
00182 
00183         if (current_agent(current_char) == '\0') {
00184             current_agent(alternate_input_suffix) = NIL;
00185 
00186             /* Soar-Bugs #54, TMH */
00187             if (current_agent(alternate_input_exit)) {
00188                 current_agent(current_char) = EOF_AS_CHAR;
00189                 control_c_handler(0);
00190                 return;
00191             }
00192 
00193             current_agent(current_char) = current_agent(current_file)->buffer
00194                 [current_agent(current_file)->current_column++];
00195         }
00196     } else {
00197         current_agent(current_char) = current_agent(current_file)->buffer
00198             [current_agent(current_file)->current_column++];
00199     }
00200 
00201     if (current_agent(current_char))
00202         return;
00203 
00204     if ((current_agent(current_file)->current_column == BUFSIZE) &&
00205         (current_agent(current_file)->buffer[BUFSIZE - 2] != '\n') &&
00206         (current_agent(current_file)->buffer[BUFSIZE - 2] != EOF_AS_CHAR)) {
00207         char msg[512];
00208         snprintf(msg, MESSAGE_SIZE,
00209                  "lexer.c: Error:  line too long (max allowed is %d chars)\nFile %s, line %lu\n",
00210                  MAX_LEXER_LINE_LENGTH, current_agent(current_file)->filename,
00211                  current_agent(current_file)->current_line);
00212         msg[MESSAGE_SIZE - 1] = 0;      /* snprintf doesn't set last char to null if output is truncated */
00213         abort_with_fatal_error(msg);
00214     }
00215 
00216     s = fgets(current_agent(current_file)->buffer, BUFSIZE, current_agent(current_file)->file);
00217 
00218     if (s) {
00219         current_agent(current_file)->current_line++;
00220         if (reading_from_top_level()) {
00221             tell_printer_that_output_column_has_been_reset();
00222             if (current_agent(logging_to_file))
00223                 print_string_to_log_file_only(current_agent(current_file)->buffer);
00224         }
00225     } else {
00226         /* s==NIL means immediate eof encountered or read error occurred */
00227         if (!feof(current_agent(current_file)->file)) {
00228             if (reading_from_top_level()) {
00229 
00230                 control_c_handler(0);   /* AGR 581 */
00231 
00232                 return;
00233             } else {
00234                 print("I/O error while reading file %s; ignoring the rest of it.\n",
00235                       current_agent(current_file)->filename);
00236             }
00237         }
00238         current_agent(current_file)->buffer[0] = EOF_AS_CHAR;
00239         current_agent(current_file)->buffer[1] = 0;
00240     }
00241     current_agent(current_char) = current_agent(current_file)->buffer[0];
00242     current_agent(current_file)->current_column = 1;
00243 }
00244 
00245 /* ======================================================================
00246 
00247                          Lexer Utility Routines
00248 
00249 ====================================================================== */
00250 
/*
 * Remember where the lexeme about to be read starts: the column of the
 * character most recently fetched (current_column - 1) and the current line.
 * Wrapped in do { } while (0) so that it behaves as a single statement and
 * can be used safely in an unbraced if/else; call it with a trailing ';'.
 */
#define record_position_of_start_of_lexeme() do { \
  current_agent(current_file)->column_of_start_of_last_lexeme = \
    current_agent(current_file)->current_column - 1; \
  current_agent(current_file)->line_of_start_of_last_lexeme = \
    current_agent(current_file)->current_line; } while (0)
00256 
/*
 * Append the current character to the lexeme being built, then fetch the
 * next input character.
 *
 * History: an earlier definition folded upper-case letters to lower case
 * before storing them; it was redefined for Soar 7 so that case is kept,
 * matching Tcl (KJC 5/96).
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement and
 * can be used safely in an unbraced if/else; call it with a trailing ';'.
 */
#define store_and_advance() do { \
  current_agent(lexeme).string[current_agent(lexeme).length++] = \
    (char)current_agent(current_char); \
  get_next_char(); } while (0)
00268 
/*
 * NUL-terminate the lexeme text at its current length.
 * Wrapped in do { } while (0) so that it behaves as a single statement and
 * can be used safely in an unbraced if/else; call it with a trailing ';'.
 */
#define finish() do { current_agent(lexeme).string[current_agent(lexeme).length]=0; } while (0)
00270 
00271 void read_constituent_string(void)
00272 {
00273 #ifdef __SC__
00274     char *buf;
00275     int i, len;
00276 #endif
00277 
00278     while ((current_agent(current_char) != EOF_AS_CHAR) &&
00279            constituent_char[(unsigned char) current_agent(current_char)])
00280         store_and_advance();
00281     finish();
00282 }
00283 
00284 void read_rest_of_floating_point_number(void)
00285 {
00286     /* --- at entry, current_char=="."; we read the "." and rest of number --- */
00287     store_and_advance();
00288     while (isdigit(current_agent(current_char)))
00289         store_and_advance();    /* string of digits */
00290     if ((current_agent(current_char) == 'e') || (current_agent(current_char) == 'E')) {
00291         store_and_advance();    /* E */
00292         if ((current_agent(current_char) == '+') || (current_agent(current_char) == '-'))
00293             store_and_advance();        /* optional leading + or - */
00294         while (isdigit(current_agent(current_char)))
00295             store_and_advance();        /* string of digits */
00296     }
00297     finish();
00298 
00299 #ifdef __SC__
00300     if (strcmp("soar>", current_agent(lexeme).string)) {        /* if the lexeme doesn't equal "soar>" */
00301         if (!(strncmp("soar>", current_agent(lexeme).string, 5))) {     /* but the first 5 chars are "soar>" */
00302             /* then SIOW messed up so ignore the "soar>" */
00303             buf =
00304                 (char *) allocate_memory((len = (strlen(current_agent(lexeme).string) + 1)) * sizeof(char),
00305                                          STRING_MEM_USAGE);
00306             for (i = 0; i <= len; i++) {
00307                 buf[i] = current_agent(lexeme).string[i];
00308             }
00309             for (i = 5; i <= len; i++) {
00310                 current_agent(lexeme).string[i - 5] = buf[i];
00311             }
00312             free_memory_block_for_string(buf);
00313         }
00314     }
00315 #endif
00316 }
00317 
00318 void determine_type_of_constituent_string(void)
00319 {
00320     bool possible_id, possible_var, possible_sc, possible_ic, possible_fc;
00321     bool rereadable;
00322 
00323     determine_possible_symbol_types_for_string(current_agent(lexeme).string,
00324                                                current_agent(lexeme).length,
00325                                                &possible_id,
00326                                                &possible_var, &possible_sc, &possible_ic, &possible_fc, &rereadable);
00327 
00328     /* --- check whether it's a variable --- */
00329     if (possible_var) {
00330         current_agent(lexeme).type = VARIABLE_LEXEME;
00331         return;
00332     }
00333 
00334     /* --- check whether it's an integer --- */
00335     if (possible_ic) {
00336         errno = 0;
00337         current_agent(lexeme).type = INT_CONSTANT_LEXEME;
00338         current_agent(lexeme).int_val = strtol(current_agent(lexeme).string, NULL, 10);
00339         if (errno) {
00340             print("Error: bad integer (probably too large)\n");
00341             print_location_of_most_recent_lexeme();
00342             current_agent(lexeme).int_val = 0;
00343         }
00344         return;
00345     }
00346 
00347     /* --- check whether it's a floating point number --- */
00348     if (possible_fc) {
00349         errno = 0;
00350         current_agent(lexeme).type = FLOAT_CONSTANT_LEXEME;
00351         /*current_agent(lexeme).float_val = (float) strtod (current_agent(lexeme).string,NULL,10); */
00352         current_agent(lexeme).float_val = (float) strtod(current_agent(lexeme).string, NULL);
00353         if (errno) {
00354             print("Error: bad floating point number\n");
00355             print_location_of_most_recent_lexeme();
00356             current_agent(lexeme).float_val = 0.0;
00357         }
00358         return;
00359     }
00360 
00361     /* --- check if it's an identifier --- */
00362     if (current_agent(current_file)->allow_ids && possible_id) {
00363         current_agent(lexeme).id_letter = (char) toupper(current_agent(lexeme).string[0]);
00364         errno = 0;
00365         current_agent(lexeme).type = IDENTIFIER_LEXEME;
00366         current_agent(lexeme).id_number = strtoul(&(current_agent(lexeme).string[1]), NULL, 10);
00367         if (errno) {
00368             print("Error: bad number for identifier (probably too large)\n");
00369             print_location_of_most_recent_lexeme();
00370             current_agent(lexeme).id_number = 0;
00371         }
00372         return;
00373     }
00374 
00375     /* --- otherwise it must be a symbolic constant --- */
00376     if (possible_sc) {
00377         current_agent(lexeme).type = SYM_CONSTANT_LEXEME;
00378         if (current_agent(sysparams)[PRINT_WARNINGS_SYSPARAM]) {
00379             if (current_agent(lexeme).string[0] == '<') {
00380                 if (current_agent(lexeme).string[1] == '<') {
00381                     print("Warning: Possible disjunctive encountered in reading symbolic constant\n");
00382                     print("         If a disjunctive was intended, add a space after <<\n");
00383                     print("         If a constant was intended, surround constant with vertical bars\n");
00384                     print_location_of_most_recent_lexeme();
00385                 } else {
00386                     print("Warning: Possible variable encountered in reading symbolic constant\n");
00387                     print("         If a constant was intended, surround constant with vertical bars\n");
00388                     print_location_of_most_recent_lexeme();
00389                 }
00390             } else {
00391                 if (current_agent(lexeme).string[current_agent(lexeme).length - 1] == '>') {
00392                     if (current_agent(lexeme).string[current_agent(lexeme).length - 2] == '>') {
00393                         print("Warning: Possible disjunctive encountered in reading symbolic constant\n");
00394                         print("         If a disjunctive was intended, add a space before >>\n");
00395                         print("         If a constant was intended, surround constant with vertical bars\n");
00396                         print_location_of_most_recent_lexeme();
00397                     } else {
00398                         print("Warning: Possible variable encountered in reading symbolic constant\n");
00399                         print("         If a constant was intended, surround constant with vertical bars\n");
00400                         print_location_of_most_recent_lexeme();
00401                     }
00402                 }
00403             }
00404         }
00405         return;
00406     }
00407 
00408     /* 
00409        previously the following statement was used only with the Tcl Interface
00410        but its functionality is required for the API.  Thus, I am leaving
00411        it in, and commenting out the block which follows it.
00412 
00413        081699 SW
00414      */
00415     current_agent(lexeme).type = QUOTED_STRING_LEXEME;
00416 
00417     /*
00418        char msg[128];
00419        strcpy (msg, "Internal error: can't determine_type_of_constituent_string\n");
00420        abort_with_fatal_error(msg);
00421      */
00422 }
00423 
00424 void do_fake_rparen(void)
00425 {
00426     record_position_of_start_of_lexeme();
00427     current_agent(lexeme).type = R_PAREN_LEXEME;
00428     current_agent(lexeme).length = 1;
00429     current_agent(lexeme).string[0] = ')';
00430     current_agent(lexeme).string[1] = 0;
00431     if (current_agent(current_file)->parentheses_level > 0)
00432         current_agent(current_file)->parentheses_level--;
00433     current_agent(current_file)->fake_rparen_at_eol = FALSE;
00434 }
00435 
00436 /* ======================================================================
00437                         Lex such-and-such Routines
00438 
00439   These routines are called from get_lexeme().  Which routine gets called
00440   depends on the first character of the new lexeme being read.  Each routine's
00441   job is to finish reading the lexeme and store the necessary items in 
00442   the agent variable "lexeme".
00443 ====================================================================== */
00444 
/* One lexing routine per possible first character of a lexeme. */
void (*lexer_routines[256])(void);
00446 
00447 void lex_eof(void)
00448 {
00449     if (current_agent(current_file)->fake_rparen_at_eol) {
00450         do_fake_rparen();
00451         return;
00452     }
00453     store_and_advance();
00454     finish();
00455     current_agent(lexeme).type = EOF_LEXEME;
00456 }
00457 
00458 void lex_at(void)
00459 {
00460     store_and_advance();
00461     finish();
00462     current_agent(lexeme).type = AT_LEXEME;
00463 }
00464 
00465 void lex_tilde(void)
00466 {
00467     store_and_advance();
00468     finish();
00469     current_agent(lexeme).type = TILDE_LEXEME;
00470 }
00471 
00472 void lex_up_arrow(void)
00473 {
00474     store_and_advance();
00475     finish();
00476     current_agent(lexeme).type = UP_ARROW_LEXEME;
00477 }
00478 
00479 void lex_lbrace(void)
00480 {
00481     store_and_advance();
00482     finish();
00483     current_agent(lexeme).type = L_BRACE_LEXEME;
00484 }
00485 
00486 void lex_rbrace(void)
00487 {
00488     store_and_advance();
00489     finish();
00490     current_agent(lexeme).type = R_BRACE_LEXEME;
00491 }
00492 
00493 void lex_exclamation_point(void)
00494 {
00495     store_and_advance();
00496     finish();
00497     current_agent(lexeme).type = EXCLAMATION_POINT_LEXEME;
00498 }
00499 
00500 void lex_comma(void)
00501 {
00502     store_and_advance();
00503     finish();
00504     current_agent(lexeme).type = COMMA_LEXEME;
00505 }
00506 
00507 void lex_equal(void)
00508 {
00509     /* Lexeme might be "=", or symbol */
00510     /* Note: this routine relies on = being a constituent character */
00511 
00512     read_constituent_string();
00513     if (current_agent(lexeme).length == 1) {
00514         current_agent(lexeme).type = EQUAL_LEXEME;
00515         return;
00516     }
00517     determine_type_of_constituent_string();
00518 }
00519 
00520 void lex_ampersand(void)
00521 {
00522     /* Lexeme might be "&", or symbol */
00523     /* Note: this routine relies on & being a constituent character */
00524 
00525     read_constituent_string();
00526     if (current_agent(lexeme).length == 1) {
00527         current_agent(lexeme).type = AMPERSAND_LEXEME;
00528         return;
00529     }
00530     determine_type_of_constituent_string();
00531 }
00532 
00533 void lex_lparen(void)
00534 {
00535     store_and_advance();
00536     finish();
00537     current_agent(lexeme).type = L_PAREN_LEXEME;
00538     current_agent(current_file)->parentheses_level++;
00539 }
00540 
00541 void lex_rparen(void)
00542 {
00543     store_and_advance();
00544     finish();
00545     current_agent(lexeme).type = R_PAREN_LEXEME;
00546     if (current_agent(current_file)->parentheses_level > 0)
00547         current_agent(current_file)->parentheses_level--;
00548 }
00549 
00550 void lex_greater(void)
00551 {
00552     /* Lexeme might be ">", ">=", ">>", or symbol */
00553     /* Note: this routine relies on =,> being constituent characters */
00554 
00555     read_constituent_string();
00556     if (current_agent(lexeme).length == 1) {
00557         current_agent(lexeme).type = GREATER_LEXEME;
00558         return;
00559     }
00560     if (current_agent(lexeme).length == 2) {
00561         if (current_agent(lexeme).string[1] == '>') {
00562             current_agent(lexeme).type = GREATER_GREATER_LEXEME;
00563             return;
00564         }
00565         if (current_agent(lexeme).string[1] == '=') {
00566             current_agent(lexeme).type = GREATER_EQUAL_LEXEME;
00567             return;
00568         }
00569     }
00570     determine_type_of_constituent_string();
00571 }
00572 
00573 void lex_less(void)
00574 {
00575     /* Lexeme might be "<", "<=", "<=>", "<>", "<<", or variable */
00576     /* Note: this routine relies on =,<,> being constituent characters */
00577 
00578     read_constituent_string();
00579     if (current_agent(lexeme).length == 1) {
00580         current_agent(lexeme).type = LESS_LEXEME;
00581         return;
00582     }
00583     if (current_agent(lexeme).length == 2) {
00584         if (current_agent(lexeme).string[1] == '>') {
00585             current_agent(lexeme).type = NOT_EQUAL_LEXEME;
00586             return;
00587         }
00588         if (current_agent(lexeme).string[1] == '=') {
00589             current_agent(lexeme).type = LESS_EQUAL_LEXEME;
00590             return;
00591         }
00592         if (current_agent(lexeme).string[1] == '<') {
00593             current_agent(lexeme).type = LESS_LESS_LEXEME;
00594             return;
00595         }
00596     }
00597     if (current_agent(lexeme).length == 3) {
00598         if ((current_agent(lexeme).string[1] == '=') && (current_agent(lexeme).string[2] == '>')) {
00599             current_agent(lexeme).type = LESS_EQUAL_GREATER_LEXEME;
00600             return;
00601         }
00602     }
00603     determine_type_of_constituent_string();
00604 
00605 }
00606 
00607 void lex_period(void)
00608 {
00609     store_and_advance();
00610     finish();
00611     /* --- if we stopped at '.', it might be a floating-point number, so be
00612        careful to check for this case --- */
00613     if (isdigit(current_agent(current_char)))
00614         read_rest_of_floating_point_number();
00615     if (current_agent(lexeme).length == 1) {
00616         current_agent(lexeme).type = PERIOD_LEXEME;
00617         return;
00618     }
00619     determine_type_of_constituent_string();
00620 }
00621 
00622 void lex_plus(void)
00623 {
00624     /* Lexeme might be +, number, or symbol */
00625     /* Note: this routine relies on various things being constituent chars */
00626     int i;
00627     bool could_be_floating_point;
00628 
00629     read_constituent_string();
00630     /* --- if we stopped at '.', it might be a floating-point number, so be
00631        careful to check for this case --- */
00632     if (current_agent(current_char) == '.') {
00633         could_be_floating_point = TRUE;
00634         for (i = 1; i < current_agent(lexeme).length; i++)
00635             if (!isdigit(current_agent(lexeme).string[i]))
00636                 could_be_floating_point = FALSE;
00637         if (could_be_floating_point)
00638             read_rest_of_floating_point_number();
00639     }
00640     if (current_agent(lexeme).length == 1) {
00641         current_agent(lexeme).type = PLUS_LEXEME;
00642         return;
00643     }
00644     determine_type_of_constituent_string();
00645 }
00646 
00647 void lex_minus(void)
00648 {
00649     /* Lexeme might be -, -->, number, or symbol */
00650     /* Note: this routine relies on various things being constituent chars */
00651     int i;
00652     bool could_be_floating_point;
00653 
00654     read_constituent_string();
00655     /* --- if we stopped at '.', it might be a floating-point number, so be
00656        careful to check for this case --- */
00657     if (current_agent(current_char) == '.') {
00658         could_be_floating_point = TRUE;
00659         for (i = 1; i < current_agent(lexeme).length; i++)
00660             if (!isdigit(current_agent(lexeme).string[i]))
00661                 could_be_floating_point = FALSE;
00662         if (could_be_floating_point)
00663             read_rest_of_floating_point_number();
00664     }
00665     if (current_agent(lexeme).length == 1) {
00666         current_agent(lexeme).type = MINUS_LEXEME;
00667         return;
00668     }
00669     if (current_agent(lexeme).length == 3) {
00670         if ((current_agent(lexeme).string[1] == '-') && (current_agent(lexeme).string[2] == '>')) {
00671             current_agent(lexeme).type = RIGHT_ARROW_LEXEME;
00672             return;
00673         }
00674     }
00675     determine_type_of_constituent_string();
00676 }
00677 
00678 void lex_digit(void)
00679 {
00680     int i;
00681     bool could_be_floating_point;
00682 
00683     read_constituent_string();
00684     /* --- if we stopped at '.', it might be a floating-point number, so be
00685        careful to check for this case --- */
00686     if (current_agent(current_char) == '.') {
00687         could_be_floating_point = TRUE;
00688         for (i = 1; i < current_agent(lexeme).length; i++)
00689             if (!isdigit(current_agent(lexeme).string[i]))
00690                 could_be_floating_point = FALSE;
00691         if (could_be_floating_point)
00692             read_rest_of_floating_point_number();
00693     }
00694     determine_type_of_constituent_string();
00695 }
00696 
00697 void lex_unknown(void)
00698 {
00699     if (reading_from_top_level() && current_agent(current_char) == 0) {
00700     } else {
00701         print("Error:  Unknown character encountered by lexer, code=%d\n", current_agent(current_char));
00702         print("File %s, line %lu, column %lu.\n", current_agent(current_file)->filename,
00703               current_agent(current_file)->current_line, current_agent(current_file)->current_column);
00704         /*
00705            As far as we can tell this won't work
00706            currently, even if it ever does get called.
00707            Thus, we're taking it out.
00708            081799 SW
00709          */
00710         /*
00711            if (! reading_from_top_level()) {
00712            respond_to_load_errors ();
00713            if (current_agent(load_errors_quit))
00714            current_agent(current_char) = EOF_AS_CHAR;
00715            }
00716          */
00717     }
00718     get_next_char();
00719     get_lexeme();
00720 }
00721 
/*
 * Lex a token that begins with an ordinary constituent character: gather the
 * whole constituent string, then work out what kind of lexeme it is.
 */
void lex_constituent_string(void)
{
    /* step 1: collect the text */
    read_constituent_string();

    /* step 2: classify it */
    determine_type_of_constituent_string();
}
00727 
00728 void lex_vbar(void)
00729 {
00730     current_agent(lexeme).type = SYM_CONSTANT_LEXEME;
00731     get_next_char();
00732 
00733     for (;;) {
00734         if ((current_agent(current_char) == EOF_AS_CHAR) || (current_agent(lexeme).length == MAX_LEXEME_LENGTH)) {
00735             print("Error:  opening '|' without closing '|'\n");
00736             print_location_of_most_recent_lexeme();
00737             /* BUGBUG if reading from top level, don't want to signal EOF */
00738             current_agent(lexeme).type = EOF_LEXEME;
00739             current_agent(lexeme).string[0] = EOF_AS_CHAR;
00740             current_agent(lexeme).string[1] = 0;
00741             current_agent(lexeme).length = 1;
00742             return;
00743         }
00744         if (current_agent(current_char) == '\\') {
00745             get_next_char();
00746             current_agent(lexeme).string[current_agent(lexeme).length++] = (char) current_agent(current_char);
00747             get_next_char();
00748         } else if (current_agent(current_char) == '|') {
00749             get_next_char();
00750             break;
00751         } else {
00752             current_agent(lexeme).string[current_agent(lexeme).length++] = (char) current_agent(current_char);
00753             get_next_char();
00754         }
00755     }
00756 
00757     current_agent(lexeme).string[current_agent(lexeme).length] = 0;
00758 }
00759 
00760 void lex_quote(void)
00761 {
00762     current_agent(lexeme).type = QUOTED_STRING_LEXEME;
00763     get_next_char();
00764     for (;;) {
00765         if ((current_agent(current_char) == EOF_AS_CHAR) || (current_agent(lexeme).length == MAX_LEXEME_LENGTH)) {
00766             print("Error:  opening '\"' without closing '\"'\n");
00767             print_location_of_most_recent_lexeme();
00768             /* BUGBUG if reading from top level, don't want to signal EOF */
00769             current_agent(lexeme).type = EOF_LEXEME;
00770             current_agent(lexeme).string[0] = EOF_AS_CHAR;
00771             current_agent(lexeme).string[1] = 0;
00772             current_agent(lexeme).length = 1;
00773             return;
00774         }
00775         if (current_agent(current_char) == '\\') {
00776             get_next_char();
00777             current_agent(lexeme).string[current_agent(lexeme).length++] = (char) current_agent(current_char);
00778             get_next_char();
00779         } else if (current_agent(current_char) == '"') {
00780             get_next_char();
00781             break;
00782         } else {
00783             current_agent(lexeme).string[current_agent(lexeme).length++] = (char) current_agent(current_char);
00784             get_next_char();
00785         }
00786     }
00787     current_agent(lexeme).string[current_agent(lexeme).length] = 0;
00788 }
00789 
00790 /* AGR 562 begin */
00791 
00792 /* There are 2 functions here, for 2 different schemes for handling the
00793    shell escape.
00794    Scheme 1:  A '$' signals that all the rest of the text up to the '\n'
00795    is to be passed to the system() command verbatim.  The whole string,
00796    including the '$' as its first character, is stored in a single
00797    lexeme which has the type DOLLAR_STRING_LEXEME.
00798    Scheme 2:  A '$' is a single lexeme, much like a '(' or '&'.  All the
00799    subsequent lexemes are gotten individually with calls to get_lexeme().
00800    This makes it easier to parse the shell command, so that commands like
00801    cd, pushd, popd, etc. can be trapped and the equivalent Soar commands
00802    executed instead.  The problem with this scheme is that pulling the
00803    string apart into lexemes eliminates any special spacing the user may
00804    have done in specifying the shell command.  For that reason, my current
00805    plan is to follow scheme 1.  AGR 3-Jun-94  */
00806 
00807 void lex_dollar(void)
00808 {
00809     current_agent(lexeme).type = DOLLAR_STRING_LEXEME;
00810     current_agent(lexeme).string[0] = '$';
00811     current_agent(lexeme).length = 1;
00812     get_next_char();            /* consume the '$' */
00813     while ((current_agent(current_char) != '\n') &&
00814            (current_agent(current_char) != EOF_AS_CHAR) && (current_agent(lexeme).length < MAX_LEXEME_LENGTH - 1)) {
00815         current_agent(lexeme).string[current_agent(lexeme).length++] = current_agent(current_char);
00816         get_next_char();
00817     }
00818     current_agent(lexeme).string[current_agent(lexeme).length] = '\0';
00819 }
00820 
00821 /*
00822 void lex_dollar (void) {
00823   store_and_advance();
00824   finish();
00825   current_agent(lexeme).type = DOLLAR_STRING_LEXEME;
00826 }
00827 */
00828 
00829 /* AGR 562 end */
00830 
00831 /* ======================================================================
00832                              Get lexeme
00833 
00834   This is the main routine called from outside the lexer.  It reads past 
00835   any whitespace, then calls some lex_xxx routine (using the lexer_routines[]
00836   table) based on the first character of the lexeme.
00837 ====================================================================== */
00838 
00839 void get_lexeme(void)
00840 {
00841 
00842     /* AGR 568 begin */
00843     if (current_agent(lex_alias)) {
00844         current_agent(lexeme) = current_agent(lex_alias)->lexeme;
00845         current_agent(lex_alias) = current_agent(lex_alias)->next;
00846         return;
00847     }
00848     /* AGR 568 end */
00849 
00850     current_agent(lexeme).length = 0;
00851     current_agent(lexeme).string[0] = 0;
00852 
00853     /* 
00854        A block of code was removed from here which seemed 
00855        to deal with a number of interface
00856        and formatting details.  I am taking out ancient stuff which
00857        deals with the unix interface (this should be pushed into 
00858        a real interface layer if it to be used anyway) and leaving
00859        behind the new Soar8 behavior which is what we used to get
00860        only in the cases of using the Tcl interface
00861        081699 SW
00862      */
00863 
00864 /* AGR 534  The only time a prompt should be printed out is if there's
00865    a command being expected; ie. the prompt shouldn't print out if we're
00866    in the middle of entering a production.  So if we're in the middle of
00867    entering a production, then the parentheses level will be > 0, so that's
00868    the criteria we will use.  AGR  5-Apr-94  */
00869 
00870     current_agent(load_errors_quit) = FALSE;    /* AGR 527c */
00871 
00872     while (current_agent(load_errors_quit) == FALSE) {  /* AGR 527c */
00873         if (current_agent(current_char) == EOF_AS_CHAR)
00874             break;
00875         if (whitespace[(unsigned char) current_agent(current_char)]) {
00876             if (current_agent(current_char) == '\n') {
00877                 if (current_agent(current_file)->fake_rparen_at_eol) {
00878                     do_fake_rparen();
00879                     return;
00880                 }
00881 
00882                 /* 
00883                    A block of code was removed from here which seemed 
00884                    to deal with a number of interface
00885                    and formatting details.  I am taking out ancient stuff which
00886                    deals with the unix interface (this should be pushed into 
00887                    a real interface layer if it to be used anyway) and leaving
00888                    behind the new Soar8 behavior which is what we used to get
00889                    only in the cases of using the Tcl interface
00890                    081699 SW
00891                  */
00892 
00893             }
00894             get_next_char();
00895             continue;
00896         }
00897 
00898         /* 
00899            The following section deals with parsing Soar8 and Soar7 syntax.
00900            I am removing the old (Soar7 and prior) stuff, and leaving
00901            the behavior which previously was used only in conjunction
00902            with the Tcl interface.
00903 
00904            081699 SW
00905          */
00906 
00907         if (current_agent(current_char) == ';') {
00908             /* --- skip the semi-colon, forces newline in Soar8 --- */
00909             get_next_char();    /* consume it */
00910             continue;
00911         }
00912         if (current_agent(current_char) == '#') {
00913             /* --- read from hash to end-of-line --- */
00914             while ((current_agent(current_char) != '\n') && (current_agent(current_char) != EOF_AS_CHAR))
00915                 get_next_char();
00916             if (current_agent(current_file)->fake_rparen_at_eol) {
00917                 do_fake_rparen();
00918                 return;
00919             }
00920             if (current_agent(current_char) != EOF_AS_CHAR)
00921                 get_next_char();
00922             continue;
00923         }
00924 
00925         break;                  /* if no whitespace or comments found, break out of the loop */
00926     }
00927     /* --- no more whitespace, so go get the actual lexeme --- */
00928     record_position_of_start_of_lexeme();
00929     if (current_agent(current_char) != EOF_AS_CHAR)
00930         (*(lexer_routines[(unsigned char) current_agent(current_char)])) ();
00931     else
00932         lex_eof();
00933 }
00934 
00935 /* ======================================================================
00936                             Init lexer
00937 
00938   This should be called before anything else in this file.  It does all 
00939   the necessary init stuff for the lexer, and starts the lexer reading from
00940   standard input.
00941 ====================================================================== */
00942 
00943 char extra_constituents[] = "$%&*+-/:<=>?_";
00944 
00945 void init_lexer(void)
00946 {
00947     unsigned int i;
00948 
00949     /* --- setup constituent_char array --- */
00950     for (i = 0; i < 256; i++)
00951         if (isalnum(i))
00952             constituent_char[i] = TRUE;
00953         else
00954             constituent_char[i] = FALSE;
00955     for (i = 0; i < strlen(extra_constituents); i++)
00956         constituent_char[(int) extra_constituents[i]] = TRUE;
00957 
00958     /* --- setup whitespace array --- */
00959     for (i = 0; i < 256; i++)
00960         if (isspace(i))
00961             whitespace[i] = TRUE;
00962         else
00963             whitespace[i] = FALSE;
00964 
00965     /* --- setup number_starters array --- */
00966     for (i = 0; i < 256; i++)
00967         if (isdigit(i))
00968             number_starters[i] = TRUE;
00969         else
00970             number_starters[i] = FALSE;
00971     number_starters['+'] = TRUE;
00972     number_starters['-'] = TRUE;
00973     number_starters['.'] = TRUE;
00974 
00975     /* --- setup lexer_routines array --- */
00976     for (i = 0; i < 256; i++)
00977         lexer_routines[i] = lex_unknown;
00978     for (i = 0; i < 256; i++)
00979         if (constituent_char[i])
00980             lexer_routines[i] = lex_constituent_string;
00981     for (i = 0; i < 256; i++)
00982         if (isdigit(i))
00983             lexer_routines[i] = lex_digit;
00984     lexer_routines['@'] = lex_at;
00985     lexer_routines['('] = lex_lparen;
00986     lexer_routines[')'] = lex_rparen;
00987     lexer_routines['+'] = lex_plus;
00988     lexer_routines['-'] = lex_minus;
00989     lexer_routines['~'] = lex_tilde;
00990     lexer_routines['^'] = lex_up_arrow;
00991     lexer_routines['{'] = lex_lbrace;
00992     lexer_routines['}'] = lex_rbrace;
00993     lexer_routines['!'] = lex_exclamation_point;
00994     lexer_routines['>'] = lex_greater;
00995     lexer_routines['<'] = lex_less;
00996     lexer_routines['='] = lex_equal;
00997     lexer_routines['&'] = lex_ampersand;
00998     lexer_routines['|'] = lex_vbar;
00999     lexer_routines[','] = lex_comma;
01000     lexer_routines['.'] = lex_period;
01001     lexer_routines['"'] = lex_quote;
01002     lexer_routines['$'] = lex_dollar;   /* AGR 562 */
01003 
01004     /* --- initially we're reading from the standard input --- */
01005     start_lex_from_file("[standard input]", stdin);
01006 }
01007 
01008 /* ======================================================================
01009                    Print location of most recent lexeme
01010 
01011   This routine is used to print an indication of where a parser or interface
01012   command error occurred.  It tries to print out the current source line
01013   with a pointer to where the error was detected.  If the current source
01014   line is no longer available, it just prints out the line number instead.
01015 
01016   BUGBUG: if the input line contains any tabs, the pointer comes out in
01017   the wrong place.
01018 ====================================================================== */
01019 
01020 void print_location_of_most_recent_lexeme(void)
01021 {
01022     int i;
01023 
01024     if (current_agent(current_file)->line_of_start_of_last_lexeme == current_agent(current_file)->current_line) {
01025         /* --- error occurred on current line, so print out the line --- */
01026         if (!reading_from_top_level()) {
01027             print("File %s, line %lu:\n", current_agent(current_file)->filename,
01028                   current_agent(current_file)->current_line);
01029             /*       respond_to_load_errors ();     AGR 527a */
01030         }
01031         if (current_agent(current_file)->buffer[strlen(current_agent(current_file)->buffer) - 1] == '\n')
01032             print_string(current_agent(current_file)->buffer);
01033         else
01034             print("%s\n", current_agent(current_file)->buffer);
01035         for (i = 0; i < current_agent(current_file)->column_of_start_of_last_lexeme; i++)
01036             print_string("-");
01037         print_string("^\n");
01038 
01039         /*
01040            As far as we can tell this won't work
01041            currently, even if it ever does get called.
01042            Thus, we're taking it out.
01043            081799 SW
01044          */
01045         /*
01046            if (! reading_from_top_level()) {
01047            respond_to_load_errors (); 
01048            if (current_agent(load_errors_quit))
01049            current_agent(current_char) = EOF_AS_CHAR;
01050            }
01051          */
01052 /* AGR 527a  The respond_to_load_errors call came too early (above),
01053    and the "continue" prompt appeared before the offending line was printed
01054    out, so the respond_to_load_errors call was moved here.
01055    AGR 26-Apr-94 */
01056 
01057     } else {
01058         /* --- error occurred on a previous line, so just give the position --- */
01059         print("File %s, line %lu, column %lu.\n", current_agent(current_file)->filename,
01060               current_agent(current_file)->line_of_start_of_last_lexeme,
01061               current_agent(current_file)->column_of_start_of_last_lexeme + 1);
01062 
01063         /*
01064            As far as we can tell this won't work
01065            currently, even if it ever does get called.
01066            Thus, we're taking it out.
01067            081799 SW
01068          */
01069         /*
01070            if (! reading_from_top_level()) {
01071            respond_to_load_errors ();
01072            if (current_agent(load_errors_quit))
01073            current_agent(current_char) = EOF_AS_CHAR;
01074            }
01075          */
01076     }
01077 }
01078 
01079 /* ======================================================================
01080                        Parentheses Utilities
01081 
01082   Current_lexer_parentheses_level() returns the current level of parentheses
01083   nesting (0 means no open paren's have been encountered).
01084 
01085   Skip_ahead_to_balanced_parentheses() eats lexemes until the appropriate
01086   closing paren is found (0 means eat until back at the top level).
01087   
01088   Fake_rparen_at_next_end_of_line() tells the lexer to insert a fake
01089   R_PAREN_LEXEME token the next time it reaches the end of a line.
01090 ====================================================================== */
01091 
01092 int current_lexer_parentheses_level(void)
01093 {
01094     return current_agent(current_file)->parentheses_level;
01095 }
01096 
01097 void skip_ahead_to_balanced_parentheses(int parentheses_level)
01098 {
01099     for (;;) {
01100         if (current_agent(lexeme).type == EOF_LEXEME)
01101             return;
01102         if ((current_agent(lexeme).type == R_PAREN_LEXEME) &&
01103             (parentheses_level == current_agent(current_file)->parentheses_level))
01104             return;
01105         get_lexeme();
01106     }
01107 }
01108 
01109 void fake_rparen_at_next_end_of_line(void)
01110 {
01111     current_agent(current_file)->parentheses_level++;
01112     current_agent(current_file)->fake_rparen_at_eol = TRUE;
01113 }
01114 
01115 /* ======================================================================
01116                         Set lexer allow ids
01117 
01118   This routine should be called to tell the lexer whether to allow
01119   identifiers to be read.  If FALSE, things that look like identifiers
01120   will be returned as SYM_CONSTANT_LEXEME's instead.
01121 ====================================================================== */
01122 
01123 void set_lexer_allow_ids(bool allow_identifiers)
01124 {
01125     current_agent(current_file)->allow_ids = allow_identifiers;
01126 }
01127 
01128 /* ======================================================================
01129                Determine possible symbol types for string
01130 
01131   This is a utility routine which figures out what kind(s) of symbol a 
01132   given string could represent.  At entry:  s, length_of_s represent the
01133   string.  At exit:  possible_xxx is set to TRUE/FALSE to indicate
01134   whether the given string could represent that kind of symbol; rereadable
01135   is set to TRUE if the lexer would read the given string back as a symbol
01136   with exactly the same name (as opposed to treating it as a special lexeme
01137   like "+", changing upper to lower case, etc.).
01138 ====================================================================== */
01139 
01140 void determine_possible_symbol_types_for_string(char *s,
01141                                                 int length_of_s,
01142                                                 bool * possible_id,
01143                                                 bool * possible_var,
01144                                                 bool * possible_sc,
01145                                                 bool * possible_ic, bool * possible_fc, bool * rereadable)
01146 {
01147     char *ch;
01148     bool rereadability_dead, rereadability_questionable;
01149 
01150     *possible_id = FALSE;
01151     *possible_var = FALSE;
01152     *possible_sc = FALSE;
01153     *possible_ic = FALSE;
01154     *possible_fc = FALSE;
01155     *rereadable = FALSE;
01156 
01157     /* --- check if it's an integer or floating point number --- */
01158     if (number_starters[(unsigned char) (*s)]) {
01159         ch = s;
01160         if ((*ch == '+') || (*ch == '-'))
01161             ch++;               /* optional leading + or - */
01162         while (isdigit(*ch))
01163             ch++;               /* string of digits */
01164         if ((*ch == 0) && (isdigit(*(ch - 1))))
01165             *possible_ic = TRUE;
01166         if (*ch == '.') {
01167             ch++;               /* decimal point */
01168             while (isdigit(*ch))
01169                 ch++;           /* string of digits */
01170             if ((*ch == 'e') || (*ch == 'E')) {
01171                 ch++;           /* E */
01172                 if ((*ch == '+') || (*ch == '-'))
01173                     ch++;       /* optional leading + or - */
01174                 while (isdigit(*ch))
01175                     ch++;       /* string of digits */
01176             }
01177             if (*ch == 0)
01178                 *possible_fc = TRUE;
01179         }
01180     }
01181 
01182     /* --- make sure it's entirely constituent characters --- */
01183     for (ch = s; *ch != 0; ch++)
01184         if (!constituent_char[(unsigned char) (*ch)])
01185             return;
01186 
01187     /* --- check for rereadability --- */
01188     rereadability_questionable = FALSE;
01189     rereadability_dead = FALSE;
01190     for (ch = s; *ch != 0; ch++) {
01191         if (islower(*ch) || isdigit(*ch))
01192             continue;           /* these guys are fine */
01193         if (isupper(*ch)) {
01194             rereadability_dead = TRUE;
01195             break;
01196         }
01197         rereadability_questionable = TRUE;
01198     }
01199     if (!rereadability_dead) {
01200         if ((!rereadability_questionable) ||
01201             (length_of_s >= LENGTH_OF_LONGEST_SPECIAL_LEXEME) || ((length_of_s == 1) && (*s == '*')))
01202             *rereadable = TRUE;
01203     }
01204 
01205     /* --- any string of constituents could be a sym constant --- */
01206     *possible_sc = TRUE;
01207 
01208     /* --- check whether it's a variable --- */
01209     if ((*s == '<') && (*(s + length_of_s - 1) == '>'))
01210         *possible_var = TRUE;
01211 
01212     /* --- check if it's an identifier --- */
01213     if (isalpha(*s)) {
01214         /* --- is the rest of the string an integer? --- */
01215         ch = s + 1;
01216         while (isdigit(*ch))
01217             ch++;               /* string of digits */
01218         if ((*ch == 0) && (isdigit(*(ch - 1))))
01219             *possible_id = TRUE;
01220     }
01221 }

Generated on Thu Dec 11 13:00:16 2003 for Soar Kernel by doxygen 1.3.5