/** Further utilities for retrieving information from grammars * (lookupRule, findSym), for turning strings into rule lists * (CFGparseRules), for turning tipless CFGsyntaxTrees into Strings * formed by pasting their terminal symbols together * (CFGtreeSentence), for turning Sexps into CFGsyntaxTrees * (CFGparseTree), and doing the reverse (CFGtreeUnparse). The * "grammars" for these operations are explained in the comments * before CFGparseRules, CFGparseTree, and CFGtreeUnparse. */ object CFGaux { import CFG._ import CFGutil._ final val separatorRegex = ",".r final val arrowRegex = "->".r final val spaceRegex = " +".r /** Retrieves the `n`th rule of context-free grammar `g` -- */ def lookupRule(g: CFG, n: Int) = { val rules = g.rules if (n < 0 || n >= rules.length) throw new Exception( "Meaningless rule number: " + n + " in grammar \n " + g ) else rules(n) } /** Given a string, parse into a list of rules. * The grammar of ruleLists is * ruleList ::= rule | rule (; rule)* * rule ::= nonterm -> ruleRHS * ruleRHS ::= emptyRHS | (terminal | nonterminal)+ * emptyRHs ::= | _e_ */ def CFGparseRules(s: String, terms: List[CFGterminal], nonterms: List[CFGnonterminal]): List[CFGrule] = { def findSym(name: String): CFGsymbol = { terms.find(_.name == name) match { case Some(term) => term case None => nonterms.find(_.name == name) match { case Some(nonterm) => nonterm case None => { println("Undefined symbol: [" + name + "]") syntaxError("Undefined symbol: " + name) } }}} def nonblank(s: String): Boolean = s.exists(!_.isWhitespace) def splitStringAtBlanks(ruleSide: String): Array[String] = { spaceRegex.split(ruleSide).filter(nonblank) } private def stringArrayToNonterminal(lhsStrings: Array[String]): CFGnonterminal = { if (lhsStrings.length != 1) syntaxError("Illegal left-hand side (length " + lhsStrings.length + ")\n [" + lhsStrings.toList + "]\n in rule " + s) else { val lhs: CFGnonterminal = findSym(lhsStrings(0)) match { case s: CFGterminal => syntaxError("Left-hand side of rule is a terminal" + " symbol: " + s) case CFGempty => syntaxError("Left-hand side of rule is " + CFGempty.name) case nonterm: CFGnonterminal => nonterm } lhs } } // Body of CFGparseRules starts here -- val split1 = separatorRegex.split(s) //// println("Split string into rules = " + split1.toList) for (rule <- split1.toList if nonblank(rule)) yield { //// println("Parsing rule /" + rule + "/") val rulePieces = arrowRegex.split(rule) if (rulePieces.length == 2) { val lhsStrings = spaceRegex.split(rulePieces(0)).filter(nonblank) val rhsStrings = spaceRegex.split(rulePieces(1)).filter(nonblank) val lhs = stringArrayToNonterminal(lhsStrings) val rhsSyms: List[CFGsymbol] = rhsStrings match { case Array("_e_") => List() case _ => if (rhsStrings.contains("_e_")) syntaxError( "Emptymarker (\"_e_\") must appear alone on rhs of" + " rule or not at all" + "\n Rule: " + rule ) else rhsStrings.toList.map(findSym(_)) } CFGrule(lhs, rhsSyms) } else syntaxError("Rule must have exactly one arrow: " + rule) } } /** Parses a string into a CFG syntax tree * The grammar of syntax trees: * syntaxTree ::= emptyTree | subTree * emptyTree ::= _e_ * subTree ::= terminal // -- leaf * subTree ::= (* nonterminal) // -- tip * subTree ::= (nonterminal ruleNum subTree*) * ruleNum ::= number between 0 and number of rules - 1 */ def CFGparseTree(grammar: CFG, s: String): CFGsyntaxTree = { import Sexp._ def sexpToParseTree(e: Sexp, depth: Int = 0): CFGsyntaxTree = e match { case SexpAtom(name) => { if (name == CFGempty.name) CFGtreeEmpty(depth) else CFGtreeLeaf(grammar.lookupTerminal(name), depth) } case SexpList(el) => el match { case Nil => syntaxErrorInString(s, "() found in parse tree:" + e, "from") case List(SexpAtom("*"), SexpAtom(nt)) => CFGtreeTip(grammar.lookupNonterminal(nt), depth) case SexpAtom(nt) :: SexpAtom(ruleNumString) :: subTreeSexps => { val expandedNT: CFGnonterminal = grammar.lookupNonterminal(nt) val ruleNum = try { ruleNumString.toInt } catch { case _: java.lang.NumberFormatException => { syntaxErrorInString( s, "Number expected, found \"" + ruleNumString, "from" )}} val rule: CFGrule = lookupRule(grammar, ruleNum) val ruleLHS: CFGnonterminal = rule.lhs if (ruleLHS != expandedNT) syntaxErrorInString(s, "Left-hand side of rule " + ruleNum + " doesn't match nontermininal of expansion:" + "\n Rule lhs = " + ruleLHS + " Nonterminal of expansion = " + expandedNT + "\n ", "from" ) val subTrees = subTreeSexps.map(e => sexpToParseTree(e, depth + 1)) // Check that the rule really does license this expansion: val numSubs = subTrees.length if (rule.rhs.length != numSubs) syntaxErrorInString(s, "Rule " + ruleNum + " doesn't match subtree list in length:" + "\n Rule length = " + rule.rhs.length + " Subtree list length = " + numSubs + "\n Sexp: " + e, "from" ) else { val rhsDiscreps: List[((CFGsymbol, CFGsyntaxTree), Int)] = ((rule.rhs zip subTrees).zipWithIndex.filter { case ((terminal1: CFGterminal, CFGtreeLeaf(terminal2, _)), ruleNum) => terminal1 != terminal2 case ((nonterm1: CFGnonterminal, CFGexpansion(nonterm2, _, _, _, _)), ruleNum) => nonterm1 != nonterm2 case ((nonterm1: CFGnonterminal, CFGtreeTip(nonterm2, _)), ruleNum) => nonterm1 != nonterm2 case _ => true }) if (rhsDiscreps.nonEmpty) { syntaxErrorInString(s, "Rule " + ruleNum + "'s right-hand-side doesn't match" + " subtree list" + "\n Discrepancies at positions " + rhsDiscreps.map({ case (_, i) => i }).mkString("(", ", ", ")") + "\n Sexp: " + e, "from") } // Finally, we've verified it -- else { val maxDepth = subTrees.foldLeft(depth)((fromLeft, nextSub) => Math.max(fromLeft, nextSub.maxDepth) ) CFGexpansion(expandedNT, subTrees, rule, depth, maxDepth) }}}}} // Body of CFGparseTree starts here val prelim: Sexp = entireStringToSexp(s) sexpToParseTree(prelim) } /** Turn a tree back into an Sexp. * The form on output is different; to display depths, * Leaf(term, d) comes out as (term d) * Tip(nonterm, d) comes out as (* nonterm d) * Expansion(nonterm subtrees rule d m) comes out as * (nonterm (i d m) --subtrees--) * where i is the index of `rule` in the grammar's * rule list. */ def CFGtreeUnparse(tr: CFGsyntaxTree, grammar: CFG): Sexp = { def intAtom(n: Int): SexpAtom = SexpAtom(n.toString) def sexpsList(sexps: Sexp*): SexpList = SexpList(sexps.toList) def unparse(tr: CFGsyntaxTree): Sexp = tr match { case CFGtreeEmpty(d) => SexpAtom(CFGempty.name) case CFGtreeTip(nonterm, d) => sexpsList(SexpAtom("*"), SexpAtom(nonterm.name), intAtom(d)) case CFGtreeLeaf(term, d) => sexpsList(SexpAtom(term.name), intAtom(d)) case CFGexpansion(nonterm, expansion, rule, d, m) => { // If `rule` is not in the index, we'll get a -1, which // is as good as anything -- val ruleIndex: Int = grammar.rules.indexOf(rule) val numList = SexpList(List(ruleIndex, d, m).map(intAtom)) SexpList(SexpAtom(nonterm.name) :: numList :: expansion.map(unparse)) } } // Body of CFGtreeUnparse unparse(tr) } /** Turns a tipless tree into a sentence, with terminals separated * by `separator`. */ def CFGtreeSentence(tr: CFGsyntaxTree, separator: String = " "): String = tr match { case CFGtreeEmpty(_) => "" case CFGtreeLeaf(term, _) => term.name case CFGtreeTip(_, _) => throw new Exception( "Can't produce 'sentence' for trees with tips such as \n" + tr.toString ) case CFGexpansion(_, expansion, _, _, _) => (expansion.map(sub => CFGtreeSentence(sub, separator) )).mkString(separator) } }