-
Notifications
You must be signed in to change notification settings - Fork 0
Home
Drakoon is a toy compiler from Moonbit to LLVM. This wiki explains some design choices and introduces how Drakoon works to interested contributors :)
The compiler takes a source file and produces LLVM IR:
- Lexing, tokenizes the source (Logos library)
- Parsing, builds the AST using LALRPOP (LARLPOP library)
- Code generation, walks the AST and emits LLVM IR text (codegen.rs)
let lexer = Lexer::new(&source);
let parser = ScriptParser::new();
let ast = parser.parse(lexer).unwrap();
let mut codegen = CodeGen::new();
for stmt in &ast {
codegen.append_stmt(stmt);
}// ... other code
#[derive(Logos, Clone, Debug, PartialEq)]
#[logos(skip r"[ \t\n\f]+", skip r"//[^\n]*",skip r"///[^\n]*", skip r"///\|[^\n]*", error = LexicalError)]
pub enum Token {
#[token("fn")]
Fn,
// ... other code
#[regex(r"[A-Za-z_][A-Za-z0-9_]*", |lex| lex.slice().to_string())]
Id(String),
#[regex(r"[1-9][0-9]*|0", |lex| lex.slice().parse())]
Int(i32),
#[regex(r"[0-9]+\.[0-9]*([eE][+-]?[0-9]+)?", |lex| lex.slice().parse())]
Double(f64),
#[regex(r#""([^"\\]|\\.)*""#, |lex| lex.slice()[1..lex.slice().len() - 1].to_string())]
#[regex(r#"'([^'\\]|\\.)*'"#, |lex| lex.slice()[1..lex.slice().len() - 1].to_string())]
String(String),
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:?}", self)
}
}When multiple token patterns in Logos could match the same input, the lexer follows specific selection rules. Generally:
- The longer match takes precedence over the shorter one
- The more specific pattern is preferred over the more generic one (exact match win over regex match)
Implementing the Display trait is necessary because LALRPOP uses the token’s textual representation when generating error messages during parsing.
use crate::tokens::{Token, LexicalError};
use crate::ast;
use crate::types::Type;
grammar;
pub Script: Vec<ast::Stmt> = {
<gstmts:GlobalStmt*> => gstmts
}
pub GlobalStmt: ast::Stmt = {
StmtMainDef,
StmtGlobalVarDef,
StmtFnDef,
}
pub Stmt: ast::Stmt = {
StmtVarDef,
StmtVarAssign,
StmtFixedArrayDef,
StmtPrint,
StmtFnCall,
StmtReturn,
StmtIf,
StmtWhile,
StmtFor,
}
pub StmtMainDef: ast::Stmt = {
"fn" "main" "{" <body:Stmt*> "}" => {
ast::Stmt::MainDef {
body,
}
},
}
StmtIf: ast::Stmt = {
"if" <cond:Expr> "{" <then_body:Stmt*> "}" => ast::Stmt::If {
cond,
then_body,
else_body: None,
},
"if" <cond:Expr> "{" <then_body:Stmt*> "}" "else" "{" <else_body:Stmt*> "}" => ast::Stmt::If {
cond,
then_body,
else_body: Some(else_body),
},
};
pub StmtWhile: ast::Stmt = {
"while" <cond:Expr> "{" <body:Stmt*> "}" => ast::Stmt::While { cond, body },
}
pub StmtVarDefFor: ast::Stmt = {
<name:"id"> "=" <value:Expr> => {
ast::Stmt::VarDef { name, annot: None, value, mutable: true }
},
}
pub StmtFor: ast::Stmt = {
"for" <init:StmtVarDefFor?> ";" <cond:Expr?> ";" <step:StmtVarAssign?> "{" <body:Stmt*> "}" => {
ast::Stmt::For { init: init.map(Box::new), cond, step: step.map(Box::new), body }
},
}
pub StmtReturn: ast::Stmt = {
"return" <value:Expr> => ast::Stmt::Return { value: Some(value) }
}
pub StmtFnDef: ast::Stmt = {
"fn" <name:"id"> "(" <params:ParamList?> ")" "->" <ret:BasicType> "{" <body:Stmt*> "}" => {
ast::Stmt::FnDef {
name,
params: params.unwrap_or_default(),
ret_type: ret,
body,
}
},
}
pub StmtFnCall: ast::Stmt = {
<name:"id"> "(" <args:ExprList?> ")" => {
ast::Stmt::FnCall { name, args: args.unwrap_or_default() }
},
}
pub ExprValue: ast::Expr = {
<e:Expr> => *e,
}
pub ExprList: Vec<ast::Expr> = {
<first:ExprValue> <rest:("," <ExprValue>)*> => {
let mut v = vec![first];
v.extend(rest);
v
}
}
pub ParamList: Vec<(String, Type)> = {
<first:Param> <rest:("," <Param>)*> => {
let mut v = vec![first];
v.extend(rest);
v
}
}
pub Param: (String, Type) = {
<name:"id"> ":" <ty:ParamType> => (name, ty)
}
pub ParamType: Type = {
BasicType,
"FixedArray" "[" <elem:BasicType>"]" => {
Type::FixedArray(Box::new(elem), None)
},
}
// other code ...AST nodes struct live under src/ast. The most relevant:
- Expr: literals, variables, binary ops, calls, and array indexing.
- Stmt: variable/array defs, assignments, function defs/calls, control flow, printing.
Cool thigs about LALRPOP grammar:
- Macros let you reuse grammar parts
- Repeated and optional constructs like
Expr*andExpr?
The core of codegen is CodeGen (src/codegen.rs). It emits text IR and accumulates errors.
#[derive(Default)]
struct Scope {
vars: HashMap<String, ValueObj>,
}
#[derive(Default, Clone)]
pub struct FnInfo {
params: Vec<(String, Type)>,
ret_type: Type,
}
pub struct CodeGen {
pub output: String,
pub errors: String,
pub sem_errors: u32,
pub functions: HashMap<String, FnInfo>,
scopes: Vec<Scope>,
tmp_count: u32,
printf_declared: bool,
string_literals: HashMap<String, (String, usize)>,
ret_type: Option<Type>,
label_count: u32,
param_types: HashMap<String, Type>,
}-
output, is the buffer for the LLVM IR being generated, chosen as a single String to make emission simple and predictable.
-
errors, accumulates human readable diagnostics and counts them, and avoid exiting until the end so multiple issues can be reported at once.
-
functions: records declared functions, their parameter types, and return type, to enable validation at fn call.
-
scopes, a stack of symbol tables, inner scopes shadow outer ones, obviously lookup walks from innermost to outermost.
-
tmp_count, generates unique SSA names (%t0, %t1, …).
-
printf_declared, ensures we declare printf only once at the top of the IR.
-
string_literals, to avoid globals for identical strings.
-
ret_type, used to validate that the declared return type matches the implementation.
-
label_count, enerates unique labels for control flow.
-
param_type:
- Tracks the declared types of parameters in the current function.
- Critical to supporting FixedArray parameters “by reference”: distinguishes parameters that are T* (for FixedArray[T]) from local [N x T].
fn main {
let arr : FixedArray[Int] = [1,2,3]
println(arr[0])
}@.str.0 = private constant [4 x i8] c"%d\0A\00"
declare i32 @printf(i8*, ...)
define i32 @main() {
entry:
%arr = alloca [3 x i32], align 4
%t0 = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i32 0, i32 0
store i32 1, i32* %t0, align 4
%t1 = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i32 0, i32 1
store i32 2, i32* %t1, align 4
%t2 = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i32 0, i32 2
store i32 3, i32* %t2, align 4
%t3 = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i32 0, i32 0
%t4 = load i32, i32* %t3, align 4
%t5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.0, i32 0, i32 0), i32 %t4)
ret i32 0
}