/* input: { fact = 1; i = 1; while (i < 7) do { fact = (fact * i); i = (i + 1) } print f } desired output: 5040 To do this, you of course must have exact knowledge of everything that is allowed in your language. Identify all the different kinds of syntax and give them all names and write out their definitions. This is usually done using the BNF notation. Here is what we need for our example language: ::= name | number | "(" ")" ::= ( ) * ::= "+" | "-" | "*" | "/" | "<" | ">" | "<=" | ">=" | "==" | "!=" ::= | | | | ::= "print" ::= name "=" ::= "while" "do" ::= "if" "then" { "else" } "fi" ::= "{" { } "}" ::= ( ";" ) * first stage: seems trivial but it's important '{', ' ', 'f', ' ', '=', ... '}', EOF second stage: "{", punctuation, 0 "fact", identifier, 0 "=", operator, 0 "1", number, 1 ... "while", reserved_word, 0 ... "^D", end_of_file, 0 Lexical analysis third stage: Parsing ------------------------- | block | | | | | | | | | ----------|---|---|---|-- | | | | | | | v | | | ------------- v v | | print | | | | ----------|-- . . | | . . | v . . | --------------------- | | variable | "fact" | | --------------------- v ----------------- | while | | | | | ----------|---|-- | v ----------------- | block | | | | | ----------|---|-- | | | | | v | | . | . | . | v --------------------------- | assignment | "fact" | | | ------------------------|-- v ---------------------------- | expression | "*" | | | | | ---------------------|---|-- | | | | | | v v fourth stage: Interpretation . . big recursive function(s) to obey the tree. . . . . OR fourth stage: Code generation big recursive function(s) to produce assembly code that can be converted into an executable file */ #include #include #include #include #include #include using namespace std; struct symbol { string form; string type; int value; bool hasvalue; symbol(string t, string f = "") { type = t; form = f; hasvalue = false; } symbol(string t, int v) { type = t; form = ""; value = v; hasvalue = true; } symbol(string t, string f, int v) { type = t; form = f; value = v; hasvalue = true; } string to_string() const { ostringstream os; os << "symbol[type = \"" << type << "\""; if (form != "") os << ", form = \"" << form << "\""; if (hasvalue) os << ", value = " << value; os << "]"; return os.str(); } }; ostream & operator<<(ostream & out, const symbol & sym) { out << sym.to_string(); return out; } struct inputsystem { string line; int pos, len, linenumber, numerrors; ifstream & file; bool dontread; static const char control_d = 'D' - 64; inputsystem(ifstream & f): file(f) { line = ""; pos = 0; len = 0; numerrors = 0; linenumber = 0; dontread = false; } char get() { while (true) { if (pos == len) { pos += 1; return ' '; } else if (pos > len) { getline(file, line); if (file.fail()) return control_d; pos = 0; len = line.length(); linenumber += 1; continue; } char c = line[pos]; pos += 1; return c; } } void unget() { pos -= 1; } void error(string message, string detail = "") { cerr << "Line " << linenumber << " Error " << message << detail << "\n"; cerr << line << "\n"; numerrors += 1; if (numerrors >= 10) { cerr << "Too many errors, giving up.\n"; exit(1); } } symbol lexan() { static symbol sym("error"); if (dontread) { dontread = false; return sym; } char c = get(); while (c <= ' ' && c != control_d) c = get(); if (c >= '0' && c <= '9') { string form = ""; int value = 0; while (c >= '0' && c <= '9') { form += c; value = value * 10 + c - '0'; c = get(); } unget(); return sym = symbol("number", value); } else if (c == '{') return sym = symbol("punctuation", "{"); else if (c == '}') return sym = symbol("punctuation", "}"); else if (c == ';') return sym = symbol("punctuation", ";"); else if (c == '=') { c = get(); if (c == '=') return sym = symbol("operator", "=="); unget(); return sym = symbol("operator", "="); } else if (c == '+') return sym = symbol("operator", "+"); else if (c == '-') return sym = symbol("operator", "-"); else if (c == '*') return sym = symbol("operator", "*"); else if (c == '/') return sym = symbol("operator", "/"); else if (c == '>') { c = get(); if (c == '=') return sym = symbol("operator", ">="); unget(); return sym = symbol("operator", ">"); } else if (c == '<') { c = get(); if (c == '=') return sym = symbol("operator", "<="); unget(); return sym = symbol("operator", "<"); } else if (c == '!') { c = get(); if (c == '=') return sym = symbol("operator", "!="); error("lone !"); unget(); } else if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z') { string form = ""; while (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') { form += c; c = get(); } unget(); if (form == "while" || form == "do" || form == "if" || form == "then" || form == "else" || form == "fi" || form == "print") return sym = symbol("reserved_word", form); else return sym = symbol("name", form); } else if (c == control_d) return sym = symbol("end_of_file", "eof"); else { string msg = "unrecognised character: "; if (c >= ' ' && c <= '~') msg = msg + "'" + c + "'"; else msg = msg + "ASCII[" + to_string((int)c) + "]"; error(msg); } return sym = symbol("error"); } void unlexan() { dontread = true; } void test() { while (true) { symbol sy = lexan(); cout << sy << "\n"; if (sy.type == "end_of_file") break; } } }; int main(int argc, char * argv[]) { string fname = "/dev/tty"; if (argc > 1) fname = argv[1]; ifstream inf(fname); if (inf.fail()) { cerr << "Can't read \"" << fname << "\"\n"; exit(1); } inputsystem is(inf); is.test(); inf.close(); } /* struct node { string type, form; int value; bool hasvalue; vector part; node(string t, string f = "") { type = t; form = f; hasvalue = false; } node(string t, string f, int v) { type = t; form = f; value = v; hasvalue = true; } node(string t, int v) { type = t; form = ""; value = v; hasvalue = true; } void print(ostream & out, int depth = 0) { out << setw(depth * 3) << "" << type; if (form != "") out << " " << form; if (hasvalue) out << " " << value; out << "\n"; for (node * n: part) n->print(out, depth + 1); } }; class parser { protected: inputsystem & is; node * parse_value() { symbol sy = is.lexan(); if (sy.type == "number") return new node("number", sy.value); else if (sy.type == "name") return new node("name", sy.form); else { is.error("unrecognised beginning of value: ", sy.to_string()); return new node("error"); } } public: parser(inputsystem & isys): is(isys) { } node * parse() { symbol sy = is.lexan(); if (sy.type == "end_of_file") return NULL; is.unlexan(); return parse_value(); } }; int main(int argc, char * argv[]) { string fname = "/dev/tty"; if (argc > 1) fname = argv[1]; ifstream inf(fname); if (inf.fail()) { cerr << "Can't read \"" << fname << "\"\n"; exit(1); } inputsystem is(inf); parser p(is); while (true) { node * n = p.parse(); if (n == NULL) break; n->print(cout); cout << "\n"; } inf.close(); } */