/* input: { f = 1; i = 1; while (i < 7) do { f = (f * i); i = (i + 1) } print f } desired output: 5040 first stage: '{', ' ', 'f', ' ', '=', ... '}', EOF second stage: "{", punctuation, 0 "f", identifier, 0 "=", operator, 0 "1", number, 1 ... "while", reserved_word, 0 ... "^D", end_of_file, 0 Lexical analysis third stage: Parsing ------------------------- | block | v | v | v | v | ----------v---v---v---v-- v v v v v v v ------------- v | print | v | v ----------v-- v v v v v ------------------ v | variable | "f" | v ------------------ v ----------------- | while | v | v | ----------v---v-- v v ----------------- | block | v | v | ----------v---v-- v v v ------------------------ | assignment | "f" | v | ---------------------v-- v ---------------------------- | expression | "*" | v | v | ---------------------v---v-- v v To do this, you of course must have exact knowledge of everything that is allowed in your language. Identify all the different kinds of syntax and give them all names and write out their definitions. This is usually done using the BNF notation. Here is what we need for our example language: ::= while do ::= print operator ")" ::= { ";" } OR, if we want to do it (almost) the C++ way ::= ( ";" ) * ::= "{" "}" fourth stage: Interpretation big recursive function(s) to obey the tree. */ #include #include #include using namespace std; const char control_d = 'D' - 64; struct inputsystem { string line; int pos, number; inputsystem() { line = ""; pos = 0; number = 0; } char get() { while (true) { int len = line.length(); if (pos == len) { pos += 1; return ' '; } else if (pos > len) { getline(cin, line); if (cin.fail()) return control_d; pos = 0; number += 1; continue; } char c = line[pos]; pos += 1; return c; } } void unget() { pos -= 1; } void error(string message) { cout << "Line " << number << " Error " << message << "\n"; cout << line << "\n"; exit(1); } }; inputsystem is; struct symbol { string form; string type; int value; symbol(string f, string t, int v = 0) { form = f; type = t; value = v; } }; symbol lexan() { static symbol sym("", "error"); char c = is.get(); while (c <= ' ' && c != control_d) c = is.get(); if (c >= '0' && c <= '9') { string form = ""; int value = 0; while (c >= '0' && c <= '9') { form += c; value = value * 10 + c - '0'; c = is.get(); } is.unget(); return sym = symbol(form, "number", value); } else if (c == '(') return sym = symbol("(", "punctuation"); else if (c == ')') return sym = symbol(")", "punctuation"); else if (c == '{') return sym = symbol("{", "punctuation"); else if (c == '}') return sym = symbol("}", "punctuation"); else if (c == ';') return sym = symbol(";", "punctuation"); else if (c == '=') { c = is.get(); if (c == '=') return sym = symbol("==", "operator"); is.unget(); return sym = symbol("=", "operator"); } else if (c == '+') return sym = symbol("+", "operator"); else if (c == '*') return sym = symbol("*", "operator"); else if (c == '<') return sym = symbol("<", "operator"); else if (c == '>') { c = is.get(); if (c == '=') return sym = symbol(">=", "operator"); is.unget(); return sym = symbol(">", "operator"); } else if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') { string form = ""; while (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') { form += c; c = is.get(); } is.unget(); if (form == "while") return sym = symbol(form, "reserved_word"); else if (form == "print") return sym = symbol(form, "reserved_word"); else return sym = symbol(form, "identifier"); } else if (c == control_d) return sym = symbol("eof", "end_of_file"); else is.error("unrecognised character"); return sym = symbol("", "error"); } struct node { string type; int value; vector child; node(string t, int v = 0) { type = t; value = v; } }; int main() { while (true) { symbol s = lexan(); cout << s.form << " " << s.type << " " << s.value << "\n"; if (s.type == "end_of_file") break; } }