harb/tools/push3-transpiler/src/parser.ts

/**
 * Push3 parser — converts Push3 source text into an AST.
 *
 * Node types:
 *   { kind: 'int',  value: bigint }       — decimal integer literal with optional leading '-' (arbitrarily large, e.g. 10^18; exponent forms such as 1e18 are NOT ints)
 *   { kind: 'bool', value: boolean }      — TRUE / FALSE
 *   { kind: 'instr', name: string }       — DYADIC.+, EXEC.IF, etc.
 *   { kind: 'name',  text: string }       — unbound identifier (TAXRATE etc.)
 *   { kind: 'list',  items: Node[] }      — ( ... )
 */

/**
 * AST node produced by the parser. A discriminated union: switch on `kind`
 * to find out which payload field is present.
 */
export type Node =
  // Integer literal; stored as a bigint so large values keep full precision.
  | {
      kind: 'int';
      value: bigint;
    }
  // Boolean literal, written TRUE or FALSE in the source text.
  | {
      kind: 'bool';
      value: boolean;
    }
  // Instruction token, e.g. DYADIC.+ or EXEC.IF; `name` is the raw token.
  | {
      kind: 'instr';
      name: string;
    }
  // Any other bare token (e.g. TAXRATE); `text` is the raw token.
  | {
      kind: 'name';
      text: string;
    }
  // Parenthesised sequence of child nodes, in source order.
  | {
      kind: 'list';
      items: Node[];
    };

// Namespace prefixes that mark a token as an instruction. Matching is by
// prefix only (case-sensitive); there is no list of exact instruction names.
const KNOWN_INSTR_PREFIXES = [
  'DYADIC.',
  'EXEC.',
  'BOOLEAN.',
  'CODE.',
  'NAME.',
  'INDEX.',
  'INTVECTOR.',
  'FLOATVECTOR.',
  'BOOLVECTOR.',
  'GRAPH.',
];

/**
 * Report whether `token` begins with one of KNOWN_INSTR_PREFIXES.
 *
 * @param token A single token as produced by the tokenizer.
 * @returns true when the token starts with a known prefix, false otherwise.
 */
function isInstruction(token: string): boolean {
  return KNOWN_INSTR_PREFIXES.some((prefix) => token.startsWith(prefix));
}

/**
 * Split Push3 source text into a flat token list.
 *
 * Everything from `;;` up to (not including) the next newline is discarded as
 * a comment. Each `(` and `)` is a token of its own; every other token is a
 * maximal run of characters that are neither whitespace nor parentheses.
 *
 * @param src Raw program text.
 * @returns Tokens in source order; an empty array when nothing remains.
 */
function tokenize(src: string): string[] {
  // A comment is replaced by a single space so the text on either side of it
  // cannot fuse into one token.
  const withoutComments = src.replace(/;;[^\n]*/g, ' ');
  // One global match does the work of padding parens and splitting on
  // whitespace: a token is either one paren or a run of other visible chars.
  const found = withoutComments.match(/[()]|[^\s()]+/g);
  return found ?? [];
}

/**
 * Parse exactly one node starting at `tokens[pos]`.
 *
 * Recursive descent: a `(` opens a list whose children are parsed by
 * recursive calls until the matching `)`; any other token becomes a leaf.
 *
 * @param tokens Full token array (as produced by `tokenize`).
 * @param pos    Index of the first token of the node to parse.
 * @returns A pair `[node, next]` where `next` is the index of the first
 *          token after the parsed node.
 * @throws Error 'Unexpected end of tokens' when `pos` is past the end.
 * @throws Error 'Unmatched )' when the token at `pos` is a closing paren.
 * @throws Error 'Unmatched (' when a list is never closed.
 */
function parseTokens(tokens: string[], pos: number): [Node, number] {
  const token = tokens[pos];
  if (token === undefined) throw new Error('Unexpected end of tokens');

  // A ')' can never start a node. The list loop below stops before every
  // closing paren, so reaching one here means it has no matching '('.
  // Without this guard it would fall through and be returned as a name node.
  if (token === ')') throw new Error('Unmatched )');

  if (token === '(') {
    // Parse children until the matching ')'.
    const items: Node[] = [];
    let i = pos + 1;
    while (i < tokens.length && tokens[i] !== ')') {
      const [node, next] = parseTokens(tokens, i);
      items.push(node);
      i = next;
    }
    // The loop also exits when tokens run out; that is an unclosed list.
    if (tokens[i] !== ')') throw new Error('Unmatched (');
    // Skip past the closing ')'.
    return [{ kind: 'list', items }, i + 1];
  }

  // Boolean literals are matched case-sensitively.
  if (token === 'TRUE')  return [{ kind: 'bool', value: true },  pos + 1];
  if (token === 'FALSE') return [{ kind: 'bool', value: false }, pos + 1];

  // Integer literal: optional '-' then decimal digits. BigInt keeps values
  // that exceed Number's safe integer range exact.
  if (/^-?\d+$/.test(token)) {
    return [{ kind: 'int', value: BigInt(token) }, pos + 1];
  }

  if (isInstruction(token)) {
    return [{ kind: 'instr', name: token }, pos + 1];
  }

  // Otherwise: unbound name (e.g. TAXRATE, STAKED, DELTAS, EFFIDX)
  return [{ kind: 'name', text: token }, pos + 1];
}

/**
 * Parse a complete Push3 program into a single AST node.
 *
 * The source must tokenize to exactly one top-level node (typically a list);
 * anything left over after that node is an error.
 *
 * @param src Raw program text.
 * @returns The root node of the program.
 * @throws Error 'Empty program' when the source contains no tokens.
 * @throws Error 'Unexpected tokens after position …' when tokens remain after
 *         the first top-level node.
 */
export function parse(src: string): Node {
  const tokens = tokenize(src);
  if (tokens.length === 0) {
    throw new Error('Empty program');
  }

  const [root, end] = parseTokens(tokens, 0);
  if (end === tokens.length) {
    return root;
  }

  // The first node ended before the token stream did.
  throw new Error(`Unexpected tokens after position ${end}: ${tokens[end]}`);
}