手写编译器
- code -> tokenizer -> tokens
- tokens -> parser -> ast
- ast -> transformer -> newAst
- newAst -> generator -> output
```javascript
// 第一步:代码分析阶段。parseing
// 代码分析包括 词法分析 lexical analysis【就是tokenizer阶段】 和 语法分析 syntactic analysis
// (add 2 (subtract 4 2)) => [{type: “parent”, value: “(“}, //…]
function tokenizer(code) {
// 用于定位当前代码中的位置
let current = 0;
// tokens用于保存截取的token
let tokens = [];
while (current < code.length) {
// char保存 code中current位置的字符
let char = code[current];
//@1: 判断当前是否为 ‘(‘
if (char === “(“) {
} //@2: 判断当前是否为 ‘)’ if (char === “)”) {// 如果是左括号,就向 tokens.push新的{type: "parent", value: "("}tokens.push({type: "parent",value: "(",});current++;continue;
} //@3: 检查空格符,如果是空格,就使用continue,结束本次循环,直接进入下次循环 const WHITESPACE = /\s/; if (WHITESPACE.test(char)) {// 如果是左括号,就向 tokens.push新的{type: "parent", value: "("}tokens.push({type: "parent",value: ")",});current++;continue;
} //@4: 检查数字 const NUMBERS = /[0-9]/; if (NUMBERS.test(char)) {current++;continue;
} // 接下来处理有双引号包括起来的文本 (concat “aaa” “bbb”) //@5: 首先 检查 “ 双引号 if (char === ‘“‘) {// value作为存储多个数字的中间物let value = "";while (NUMBERS.test(char)) {value += char;char = code[++current];}tokens.push({ type: "number", value: value });continue;
} //@6: 检查 a-z 字符类型,处理非双引号包裹的字符,该类字符具有特殊含义的函数 (add 2 5) let LETTERS = /[a-z]/i; if (LETTERS.test(char)) {let value = "";// 跳过左侧的双引号,不把它加入到token中char = code[++current];// 遍历字符,直到第二个双引号while (char !== '"') {value += char;char = code[++current];}char = code[++current];tokens.push({ type: "string", value: value });continue;
} // 到这里还没被匹配到的字符,就抛出异常,并退出程序 throw new TypeError(“Invalid character” + char); } return tokens; } // 词法分析:接收tokens数组,然后将其转换为AST // [{type: “parent”, value: “(“}] => {type: “Program”, body: […]} function parser(tokens) { let current = 0; // 创建递归函数,这里面要深度优先进行遍历 function walk() { let token = tokens[current]; // token的类型为number if (token.type === “number”) {let value = "";while (LETTERS.test(char)) {value += char;char = code[++current];}tokens.push({ type: "name", value: value });continue;
} // token的类型为string, if (token.type === “string”) {current++;return {type: "NumberLiteral",value: token.value,};
} // 处理CallExpressions,当遇到一个左括号时就开始处理 if (token.type === “parent” && token.value === “(“) {current++;return {type: "StringLiteral",value: token.value,};
} throw new TypeError(“Invalid token type: “ + token.type); } let ast = { type: “Program”, body: [], }; while (current < tokens.length) { ast.body.push(walk()); } return ast; } // 经过词法分析和语法分析,接着定义traverser函数,它接收AST和visitor[定义的是转换规则]作为参数 function traverser(ast, visitor) { function traverseArray(array, visitor) { array.forEach((child) => {// 递增current变量,直接跳过左括号,在AST中并不关注它token = tokens[++current];// 创建节点node,类型为CallExpression,并将name设置为token的value,因为左括号后的下一个token就是函数名let node = {type: "CallExpression",name: token.value,params: [],};// 递增current,跳过刚才的函数名tokentoken = tokens[++current];// 创建while循环,直到token是右括号时停止循环while (token.type !== "parent" ||(token.type === "parent" && token.value !== ")")) {// 调用walk 函数,并将它返回的节点 加入node.paramsnode.params.push(walk());token = tokens[current];}// 递增current,跳过右括号current++;return node;
}); } function traverseNode(node, parent) { //首先判断node节点的类型,在visitor中是否存在对应的方法 let methods = visitor[node.type]; // 如果存在 enter 方法,则调用它,并以 node和parent作为参数 if (methods && methods.enter) {traverseNode(child, visitor);
} // 按照节点类型分别处理 switch (node.type) {methods.enter(node, parent);
} if (methods && methods.exit) {case "Program":traverseArray(node.body, node);break;case "CallExpression":traverseArray(node.params, node);break;case "NumberLiteral":case "StringLiteral":break;default:throw new Error("unknown node type : " + node.type);
} } traverseNode(ast, null); } // 第二步:代码转换 transformation function transformer(ast) { // 创建新的AST语法结构 let newAst = { type: “Program”, body: [], }; // _context是旧的AST上对新AST的引用 ast._context = newAst.body; // 调用traverser函数,并传入AST和visitor traverser(ast, { NumberLiteral: {methods.exit(node, parent);
}, StringLiteral: {// 使用enter方法访问enter(node, parent) {parent._context.push({type: "NumberLiteral",value: node.value,});},
}, CallExpression: {enter(node, parent) {parent._context.push({type: "StringLiteral",value: node.value,});},
}, }); return newAst; }enter(node, parent) {let expression = {type: "CallExpression",callee: {type: "Identifier",name: node.name,},arguments: [],};node._context = expression.arguments;// 判断父节点是否为CallExpression,如果不是if (parent.type !== "CallExpression") {// 这样设置的原因:在js中顶层CallExpression实际是声明expression = {type: "ExpressionStatement",expression: expression,};}parent._context.push(expression);},
// 第三步:代码生成 function codeGenerator(node) { switch (node.type) { // 遇到Program节点,使用codeGenerator遍历body中的每个节点,并对得到的数组以换行符为参数进行 join 操作 case “Program”: return node.body.map(codeGenerator).join(“\n”); // 遇到ExpressionStatement类型节点,则使用codeGenerator 处理node.expression case “ExpressionStatement”: return codeGenerator(node.expression) + “;”; case “CallExpression”: return ( codeGenerator(node.callee) + “(“ + node.arguments.map(codeGenerator).join(“, “) + “)” ); case “Identifier”: return node.name; case “NumberLiteral”: return node.value; case “StringLiteral”: return ‘“‘ + node.value + ‘“‘; default: throw new Error(node.type + “Error”); } }
// 最后创建compiler编译函数 function compiler(node) { let tokens = tokenizer(node); let ast = parser(tokens); let newAst = transformer(ast, tokens); let output = codeGenerator(newAst); return output; }
let str = “(add 2 (subtract 4 2) (plus 8 10))”; console.dir(compiler(str), { depth: null, });
[链接](https://github.com/shenshuai89/the-super-tiny-compiler)<a name="Ykqr7"></a>## 借助npm包实现编译器安装包`esprima、estraverse、escodegen`<br />`yarn add esprima、estraverse、escodegen -D````javascriptconst esprima = require("esprima");const estraverse = require("estraverse");const escodegen = require("escodegen");const source = "async function foo(){}";let ast = esprima.parse(source);console.log(ast, "ast---------");let indent = 0;const padding = () => " ".repeat(indent);// 第二个参数是visitor,表示怎么定义新的语法规范estraverse.traverse(ast, {enter(node) {console.log(padding() + "enter " + node.type);if (node.type === "FunctionDeclaration") {//如果是函数,就把函数名转为大写形式node.id.name = node.id.name.toUpperCase();}indent += 2;},leave(node) {indent -= 2;console.log(padding() + "leave " + node.type);},});let newCode = escodegen.generate(ast);console.log("newCode", newCode);
