lexer done

1e561222 · Ronald Charles Moore · 0f3fa0ff · 1e561222 · 1e561222
Commit 1e561222 authored 5 years ago by Ronald Charles Moore
--- a/recursiveDescentParsers/cplusplus/interpreter/interpreter
+++ b/recursiveDescentParsers/cplusplus/interpreter/interpreter
--- a/recursiveDescentParsers/cplusplus/interpreter/interpreter.cpp
+++ b/recursiveDescentParsers/cplusplus/interpreter/interpreter.cpp
@@ -2,59 +2,158 @@
 // Prof. Ronald Moore  
 //     https://fbi.h-da.de/personen/ronald-moore/  
 //     mailto:ronald.moore@h-da.de
-// with no warranties whatsoever
-//
-// The grammar we are going to parse here is:
-// Grammar:
-// 	E	→ T E´
-// 	E´	→ + T E´  | - T E´ | ε
-// 	T	→ F T´
-// 	T´	→ * F T´  |  / F T´  |  ε
-// 	F	→ ( E ) | num
-// where the following are taken to be tokens:
+// with no warranties whatsoever!
+
+
+#include <cassert>
+#include <cctype> // for isspace
+#include <cstdlib> // for strtod
+#include <iostream>
+#include <fstream>
+#include <string>
+// include <vector>
+
+// ===================
+// LEXICAL ANALYSIS
+// The following are taken to be tokens:
 // left and right parenthesis, the plus and minus characters,
 // as well as asterisk and forward slash -- and numbers.
 // In the script, subtraction and division are not supported,
 // but it seems like time to add them.
-//
-// Note that the recursive descent function for (e.g.) E´ 
-// is nameded "E2ndHalf"-
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>

 // Preliminaries and Utilities
 // ============================

-// global variables -- sue me if you don't like that!
-std::string	currentLine( "" );
-int		currentLineNumber = 0;
-int		currentColumnNumber = 0;
-int		currentTokenLength = 0;
 // Utility Types
 typedef double numberType; // feel free to change this to something else like int or float or bigint....
+// Token kinds handed out by the lexer. Every enumerator is given a printable
+// character as its value (the operator character itself where one exists),
+// so a token can be shown by converting it to char.
+enum Token {
+	tok_number = 'n', // a number
+	tok_lparen = '(', // left parenthesis
+	tok_rparen = ')', // right parenthesis
+	tok_plus   = '+', // plus sign
+	tok_minus  = '-', // minus sign
+	tok_times  = '*', // asterisk
+	tok_div    = '/', // forward slash
+	tok_eof    = 'E', // end of input
+	bad_tok    = 'X'  // anything that is not a valid token
+};
+
+// Lexer state, kept in file-local globals -- sue me if you don't like that!
+static std::istream *input{ &std::cin }; // standard input, until proven otherwise
+static std::string	currentLine; // starts out empty
+static int	currentLineNumber{ -1 };
+static int	currentColumnNumber{ 0 };
+static int	currentTokenLength{ 0 };

-// the tokens
-enum Token { 
-	tok_number,
-	tok_lparen,
-	tok_rparen,
-	tok_plus,
-	tok_minus,
-	tok_times,
-	tok_div
-} next_token; // again with the global variables... 
+// More globals: the token the driver is currently looking at, and the numeric
+// value that goes with it. Both start out zero-valued.
+static Token next_token{};
+static numberType currentNumber{};

 // The Lexer
 // ==========
+// Moves the global cursor (currentLine / currentColumnNumber) forward to the
+// next non-whitespace character, fetching further lines from *input whenever
+// the current line is used up.
+// Returns true if the cursor now rests on a visible character, false if no
+// further line could be read (end of input or a read error).
+static bool skippedWhiteSpace( ) {
+	while ( true ) {
+		const int currentLineLength = static_cast<int>( currentLine.length() );
+		while ( currentColumnNumber < currentLineLength ) {
+			// isspace() is undefined for negative arguments other than EOF,
+			// so a plain char has to be converted to unsigned char first.
+			const unsigned char ch =
+				static_cast<unsigned char>( currentLine[ currentColumnNumber ] );
+			if ( ! isspace( ch ) )
+				return true;
+			currentColumnNumber++;
+		}
+
+		// if we're here, we're at the end of a line: get the next one.
+		std::getline( *input, currentLine );
+		currentLineNumber++;
+		currentColumnNumber = 0;
+		if ( ! *input ) // nothing more could be read
+			return false;
+		// else, repeat!
+		// Which is the same as
+		// return skippedWhiteSpace()  -- i.e. tail recursion.
+	}
+}

+// Reads the next token from *input and returns its kind.
+//   - tok_eof   once the input is exhausted (also on every later call)
+//   - bad_tok   if the stream failed for a reason other than end of input,
+//               or if the text at the cursor is neither an operator, a
+//               parenthesis nor something strtod() accepts as a number
+//   - tok_number with the value stored in currentNumber
+//   - otherwise the token for the single operator/parenthesis character
+// After bad_tok for unrecognised text the cursor has advanced by one
+// character, so the next call continues behind the offending character.
+static Token gettok( ) {
+	assert( input ); // we assume nullptr != input
+	if ( ! *input ) {
+		// A stream that ran into its end keeps reporting end-of-file on
+		// repeated calls; only other failures are reported as a bad token.
+		return input->eof() ? tok_eof : bad_tok;
+	}
+	// else, we can read from input
+
+	// Skip white space, going to next line as necessary
+	if ( ! skippedWhiteSpace( ) ) return tok_eof;
+
+	// We have visible text in front of us.
+	const char currentChar = currentLine[ currentColumnNumber ];
+	currentColumnNumber++; // usually, but see the number case below...
+	switch ( currentChar ) {
+		case '(' : return tok_lparen;
+		case ')' : return tok_rparen;
+		case '+' : return tok_plus;
+		case '-' : return tok_minus;
+		case '*' : return tok_times;
+		case '/' : return tok_div;
+		default  : break; // not a one-character token
+	}
+
+	// either we have a number in front of us, or we don't
+	assert( 0 < currentColumnNumber );
+	// minus one because we incremented the column before the switch
+	char *alpha = &(currentLine[ currentColumnNumber-1 ]);
+	char *omega = nullptr; // until we call strtod...
+	const double tmpValue = strtod( alpha, &omega );
+	if ( alpha == omega )
+		return bad_tok; // strtod consumed nothing, so this is not a number
+
+	// else strtod found a real number (or at least a double)
+	currentNumber = tmpValue; // let C++ do the conversion
+	// minus one again, because one character was already counted above
+	currentColumnNumber += static_cast<int>( omega - alpha ) - 1;
+	return tok_number;
+} // end gettok
+	
+// PARSING!!!
+// ===========
+// 
+// The grammar we are going to parse here is:
+// Grammar:
+// 	E	→ T E´
+// 	E´	→ + T E´  | - T E´ | ε
+// 	T	→ F T´
+// 	T´	→ * F T´  |  / F T´  |  ε
+// 	F	→ ( E ) | num
+// Note that the recursive descent function for (e.g.) E´ 
+// is named "E2ndHalf".
+
+
+
+
 // main (!)
 // =========

 int main( int argc, char **argv ) {
+	// Token-dump driver. Takes exactly one argument: a file name, or "-" to
+	// keep reading from standard input. Prints every token until end of input.
+	// Returns 0 on success, -1 on a usage error or if the file cannot be opened.

+	if (2 != argc) {
+		std::cerr << "Usage: " << argv[0] << " <fileName>.\n" 
+		          << "You provided " << argc-1 << " arguments, we take exactly one (only).\n";
+		return -1;
+	}
+	// else 2 == argc, i.e. exactly one argument was given
+	const std::string	fileName( argv[1] );
+	if ( "-" != fileName ) {
+		// static, because `input` still points at the stream after this block ends
+		static std::ifstream ifs( fileName );
+		if ( ! ifs ) {
+			// Without this check gettok() would return bad_tok on every call
+			// and the dump loop below would never terminate.
+			std::cerr << "Error: cannot open file '" << fileName << "' for reading.\n";
+			return -1;
+		}
+		input = &ifs;
+	}
+
+	// Prime the pump!
+	next_token = gettok( );
+	
+	// get tokens and dump them... 
+	while ( tok_eof != next_token ) {
+		std::cout << "TOKEN = " << static_cast<char>( next_token )
+				  << " current num = " << currentNumber
+				  << std::endl;
+		next_token = gettok();
+	}
+	std::cout << "EOF" << std::endl;
+	
 	return 0; // All good!
 }