/* input: { f = 1; i = 1; while (i <= 7) do { f = (f * i); i = (i + 1); }; print f; } desired output: 5040 first stage: '{', ' ', 'f', ' ', '=', ... '}', EOF second stage: "{", punctuation, 0 "f", identifier, 0 "=", operator, 0 "1", number, 1 ... "while", reserved_word, 0 ... "^D", end_of_file, 0 Lexical analysis third stage: Parsing ------------------------- | block | v | v | v | v | ----------v---v---v---v-- v v v v v v v ------------- v | print | v | v ----------v-- v v v v v ------------------ v | variable | "f" | v ------------------ v ----------------- | while | v | v | ----------v---v-- v v ----------------- | block | v | v | ----------v---v-- v v v ------------------------ | assignment | "f" | v | ---------------------v-- v ---------------------------- | expression | "*" | v | v | ---------------------v---v-- v v To do this, you of course must have exact knowledge of everything that is allowed in your language. Identify all the different kinds of syntax and give them all names and write out their definitions. This is usually done using the BNF notation. Here is what we need for our example language: ::= while do ::= print operator ")" ::= { ";" } OR, if we want to do it (almost) the C++ way ::= ( ";" ) * ::= "{" "}" fourth stage: Interpretation big recursive function(s) to obey the tree. */ #include #include #include #include #include using namespace std; const char control_d = 'D' - 64; struct inputsystem { string line; int pos, number; inputsystem() { line = ""; pos = 0; number = 0; } char get() { while (true) { int len = line.length(); if (pos == len) { pos += 1; return ' '; } else if (pos > len) { getline(cin, line); if (cin.fail()) return control_d; pos = 0; number += 1; continue; } char c = line[pos]; pos += 1; return c; } } void unget() { pos -= 1; } void error(string message) { cout << "Line " << number << " Error " << message << "\n"; cout << line << "\n"; exit(1); } }; inputsystem is; struct symbol { string form; string type; int value; symbol(string f, string t, int v = 0) { form = f; type = t; value = v; } }; bool unreading = false; void lexunread() { unreading = true; } symbol lexan() { static symbol sym("", "error"); if (unreading) { unreading = false; return sym; } char c = is.get(); while (c <= ' ' && c != control_d) c = is.get(); if (c >= '0' && c <= '9') { string form = ""; int value = 0; while (c >= '0' && c <= '9') { form += c; value = value * 10 + c - '0'; c = is.get(); } is.unget(); return sym = symbol(form, "number", value); } else if (c == '(') return sym = symbol("(", "punctuation"); else if (c == ')') return sym = symbol(")", "punctuation"); else if (c == '{') return sym = symbol("{", "punctuation"); else if (c == '}') return sym = symbol("}", "punctuation"); else if (c == ';') return sym = symbol(";", "punctuation"); else if (c == '=') { c = is.get(); if (c == '=') return sym = symbol("==", "operator"); is.unget(); return sym = symbol("=", "operator"); } else if (c == '+') return sym = symbol("+", "operator"); else if (c == '*') return sym = symbol("*", "operator"); else if (c == '<') { c = is.get(); if (c == '=') return sym = symbol("<=", "operator"); is.unget(); return sym = symbol("<", "operator"); } else if (c == '>') { c = is.get(); if (c == '=') return sym = symbol(">=", "operator"); is.unget(); return sym = symbol(">", "operator"); } else if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') { string form = ""; while (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') { form += c; c = is.get(); } is.unget(); if (form == "while") return sym = symbol(form, "reserved_word"); else if (form == "print") return sym = symbol(form, "reserved_word"); else return sym = symbol(form, "identifier"); } else if (c == control_d) return sym = symbol("eof", "end_of_file"); else is.error("unrecognised character"); return sym = symbol("", "error"); } struct node { string type, form; int value; vector child; node(string t, string f = "", int v = 0) { type = t; form = f; value = v; } }; node * parse_expression() { symbol s = lexan(); if (s.type == "identifier") return new node("identifier", s.form); else if (s.type == "number") return new node("number", s.form, s.value); else if (s.form == "(") { node * L = parse_expression(); s = lexan(); if (s.type != "operator") { is.error("operator expected"); return NULL; } string op = s.form; node * R = parse_expression(); s = lexan(); if (s.form != ")") { is.error("close parenthesis expected"); return NULL; } node * n = new node("expression", op); n->child.push_back(L); n->child.push_back(R); return n; } else { is.error("Illegal beginning of expression"); return NULL; } } node * parse_statement(); node * parse_while_statement() { symbol s = lexan(); if (s.form != "while") { is.error("while expected"); return NULL; } node * E = parse_expression(); s = lexan(); if (s.form != "do") { is.error("do expected"); return NULL; } node * S = parse_statement(); node * n = new node("while"); n->child.push_back(E); n->child.push_back(S); return n; } node * parse_print_statement() { symbol s = lexan(); if (s.form != "print") { is.error("print expected"); return NULL; } node * E = parse_expression(); node * n = new node("print"); n->child.push_back(E); return n; } node * parse_assignment_statement() { symbol s = lexan(); if (s.type != "identifier") { is.error("identifier expected"); return NULL; } string id = s.form; s = lexan(); if (s.form != "=") { is.error("= expected"); return NULL; } node * E = parse_expression(); node * n = new node("assign", id); n->child.push_back(E); return n; } node * parse_statement_sequence() { node * n = new node("sequence"); while (true) { symbol s = lexan(); lexunread(); if (s.form == "}") break; n->child.push_back(parse_statement()); s = lexan(); if (s.form != ";") { is.error("; expected"); return NULL; } } return n; } node * parse_block() { symbol s = lexan(); if (s.form != "{") { is.error("{ expected"); return NULL; } node * S = parse_statement_sequence(); s = lexan(); if (s.form != "}") { is.error("} expected"); return NULL; } node * n = new node("block"); n->child.push_back(S); return n; } node * parse_statement() { symbol s = lexan(); lexunread(); if (s.form == "while") return parse_while_statement(); if (s.form == "print") return parse_print_statement(); if (s.type == "identifier") return parse_assignment_statement(); if (s.form == "{") return parse_block(); is.error("statement expected"); return NULL; } void print(node * n, int indent = 0) { if (n == NULL) return; cout << setw(indent*4) << "" << n->type << " " << n->form << " " << n->value << "\n"; for (int i = 0; i < n->child.size(); i += 1) print(n->child[i], indent + 1); } unordered_map memory; int evaluate(node * n) { if (n->type == "identifier") return memory[n->form]; if (n->type == "number") return n->value; int L = evaluate(n->child[0]); int R = evaluate(n->child[1]); if (n->form == "+") return L + R; if (n->form == "-") return L - R; if (n->form == "*") return L * R; if (n->form == "==") return L == R; if (n->form == "<") return L < R; if (n->form == ">") return L > R; if (n->form == "<=") return L <= R; if (n->form == ">=") return L >= R; cout << "Unexpected operator \"" << n->form << "\"\n"; exit(1); } void execute(node * n) { if (n->type == "print") { int E = evaluate(n->child[0]); cout << E << "\n"; return; } if (n->type == "block") { execute(n->child[0]); return; } if (n->type == "assign") { memory[n->form] = evaluate(n->child[0]); return; } if (n->type == "sequence") { for (int i = 0; i < n->child.size(); i += 1) execute(n->child[i]); return; } if (n->type == "while") { while (true) { int E = evaluate(n->child[0]); if (! E) break; execute(n->child[1]); } return; } cout << "Unexpected statement type \"" << n->type << "\"\n"; exit(1); } int main() { while (true) { node * n = parse_statement(); if (n == NULL) continue; print(n); execute(n); } }