/*
 * A small hand-written scanner: reads characters from a stream and groups
 * them into tokens (single-character operators, upper-case reserved words,
 * lower-case identifiers, integers and double-quoted strings).
 *
 * Headers: stdio.h for FILE/getc/ungetc/printf, string.h for strcmp,
 * ctype.h for the character-class tests, limits.h for INT_MAX.
 */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>

/* Each different token type has its own unique code */
#define T_SEMICOLON ';'   // use ASCII values for single char tokens
#define T_LPAREN '('
#define T_RPAREN ')'
#define T_ASSIGN '='
#define T_LT '<'
#define T_GT '>'
#define T_DIVIDE '/'
#define T_CONCAT '.'
// ...
#define T_WHILE 257       // reserved words
#define T_IF 258
#define T_RETURN 259
// ...
#define T_IDENTIFIER 268  // identifiers, constants, etc.
#define T_INTEGER 269
#define T_DOUBLE 270
#define T_STRING 271
#define T_END 349         // code used when at end of file
#define T_UNKNOWN 350     // token was unrecognized by scanner

/* One scanned token: its type code plus the lexeme value, if it has one. */
struct token_t {
    int type;                  // one of the token codes from above
    union {
        char stringValue[256]; // holds lexeme value if string/identifier
        int intValue;          // holds lexeme value if integer
        double doubleValue;    // holds lexeme value if double
    } val;
};

/*
 * Map an upper-case word to its reserved-word token code.
 *
 * kw: NUL-terminated word to look up (must not be NULL).
 * Returns the matching T_* code, or T_UNKNOWN when kw is not a reserved
 * word, so that the caller never reads an indeterminate return value.
 */
int lookup_reserved(const char *kw)
{
    if (strcmp("WHILE", kw) == 0)
        return T_WHILE;
    if (strcmp("IF", kw) == 0)
        return T_IF;
    if (strcmp("RETURN", kw) == 0)
        return T_RETURN;
    // ...
    return T_UNKNOWN;
}

/*
 * Discard the remainder of a comment whose two-character opener has already
 * been consumed. second is the opener's second character: '/' for a comment
 * running to end of line, '*' for a comment closed by the pair '*' '/'.
 * Reaching end of input also ends the comment.
 */
static void skip_comment(FILE *fp, int second)
{
    int ch;

    if (second == '/') {
        while ((ch = getc(fp)) != EOF && ch != '\n')
            ;
        return;
    }

    int prev = 0; // not '*', so the opener's own '*' cannot close the comment
    while ((ch = getc(fp)) != EOF) {
        if (prev == '*' && ch == '/')
            return;
        prev = ch;
    }
}

/*
 * Collect a run of characters accepted by in_class into token's string
 * buffer, starting with the already-read character first. Characters that
 * do not fit in the buffer are consumed but dropped, so the lexeme is
 * truncated rather than written out of bounds. The first character that
 * does not belong to the run is pushed back onto the stream.
 */
static void gather_word(FILE *fp, struct token_t *token, int first,
                        int (*in_class)(int))
{
    const size_t cap = sizeof token->val.stringValue;
    size_t len = 0;
    int ch = first;

    do {
        if (len < cap - 1)
            token->val.stringValue[len++] = (char)ch;
        ch = getc(fp);
    } while (ch != EOF && in_class(ch));

    ungetc(ch, fp); // pushing back EOF is a harmless no-op
    token->val.stringValue[len] = '\0';
}

/*
 * Scan the body of a string literal; the opening quote has been consumed.
 * The raw text between the quotes is stored (backslashes included). A
 * backslash escapes the character after it, so an escaped quote does not
 * end the string. Text beyond the buffer capacity is consumed but dropped.
 *
 * Returns T_STRING, or T_UNKNOWN if end of input arrives before the
 * closing quote; the partial text is still left in the string buffer.
 */
static int scan_string(FILE *fp, struct token_t *token)
{
    const size_t cap = sizeof token->val.stringValue;
    size_t len = 0;
    int escaped = 0;
    int ch;

    while ((ch = getc(fp)) != EOF) {
        if (ch == '"' && !escaped)
            break;
        if (len < cap - 1)
            token->val.stringValue[len++] = (char)ch;
        // a backslash escapes the next char unless it is itself escaped
        escaped = (ch == '\\') && !escaped;
    }
    token->val.stringValue[len] = '\0';

    token->type = (ch == EOF) ? T_UNKNOWN : T_STRING;
    return token->type;
}

/*
 * Scan a decimal integer whose first digit has already been read.
 * The value saturates at INT_MAX instead of overflowing; every digit of
 * the literal is still consumed.
 */
static int scan_integer(FILE *fp, struct token_t *token, int first)
{
    int value = first - '0';
    int ch;

    while (isdigit(ch = getc(fp))) {
        int digit = ch - '0';
        if (value > (INT_MAX - digit) / 10)
            value = INT_MAX; // would overflow: clamp
        else
            value = value * 10 + digit;
    }
    ungetc(ch, fp);

    token->type = T_INTEGER;
    token->val.intValue = value;
    return T_INTEGER;
}

/*
 * Read the next token from fp into *token.
 *
 * Leading white space and comments are discarded. token->type is always
 * set, and the same code is returned:
 *   - a single-character token's own character code,
 *   - a reserved-word code, or T_UNKNOWN for an upper-case word that is
 *     not reserved (the word is in val.stringValue either way),
 *   - T_IDENTIFIER for a run of lower-case letters (val.stringValue),
 *   - T_INTEGER for a run of digits (val.intValue),
 *   - T_STRING for a double-quoted string (val.stringValue),
 *   - T_END at end of input,
 *   - T_UNKNOWN for any other character (its code is in val.intValue).
 */
static int ScanOneToken(FILE *fp, struct token_t *token)
{
    int ch;

    for (;;) {
        ch = getc(fp);          // read next char from input stream
        while (isspace(ch))     // discard any white space
            ch = getc(fp);

        if (ch != '/')
            break;

        // '/' could either begin a comment or be the T_DIVIDE operator
        int nextch = getc(fp);
        if (nextch != '/' && nextch != '*') {
            ungetc(nextch, fp); // plain divide: leave the next char unread
            break;
        }
        skip_comment(fp, nextch); // then look for a token after the comment
    }

    if (ch == EOF) {
        token->type = T_END;
        return T_END;
    }

    // No setlocale() call is made, so the "C" locale applies and these
    // tests match exactly A-Z, a-z and 0-9.
    if (isupper(ch)) {
        gather_word(fp, token, ch, isupper);
        token->type = lookup_reserved(token->val.stringValue);
        return token->type;
    }
    if (islower(ch)) {
        gather_word(fp, token, ch, islower);
        token->type = T_IDENTIFIER;
        return T_IDENTIFIER;
    }
    if (isdigit(ch))
        return scan_integer(fp, token, ch);

    switch (ch) {
    case '/':
    case ';':
    case '(':
    case ')':
    case ',':
    case '=':
    case '<':
    case '>':
    case '.':
        // ... and other single char tokens
        token->type = ch;       // ASCII value is used as token type
        return ch;

    case '"':
        return scan_string(fp, token);

    default:                    // anything else is not recognized
        token->val.intValue = ch;
        token->type = T_UNKNOWN;
        return T_UNKNOWN;
    }
}

/*
 * Scan standard input to the end, printing one line per token: its type
 * code followed by its value where the token carries one.
 */
int main(int argc, char *argv[])
{
    struct token_t token;

    (void)argc; // command-line arguments are not used
    (void)argv;

    while (ScanOneToken(stdin, &token) != T_END) {
        // here is where you would process each token
        printf("type:%d", token.type);
        switch (token.type) {
        case T_WHILE:
        case T_IF:
        case T_RETURN:
        case T_IDENTIFIER:
            printf(" value:\'%s\'", token.val.stringValue);
            break;
        case T_STRING:
            printf(" value:\"%s\"", token.val.stringValue);
            break;
        case T_INTEGER:
            printf(" value:%d", token.val.intValue);
            break;
        case T_DOUBLE:
            printf(" value:%lf", token.val.doubleValue);
            break;
        default:
            break;
        }
        if (token.type < 127)   // single-char tokens: type is the character
            printf(" value:\'%c\'", token.type);
        printf("\n");
    }
    return 0;
}