import collection.immutable.HashMap // We settle the vexed question of how to handle acronyms in // camelCase. For our purposes an *acronym* is a string of letters // standing for a phrase, said string normally being printed in // capital letters. So XML and UNESCO are acronyms, whereas // "scuba" is not (even though it originally was an acronym // standing for Self-Contained Underwater Breathing Apparatus; // I'm sure there are better examples.). // Note that XML is normally spelled out; UNESCO is pronounced as // a word, but if it's all caps it counts as an acronym for my // purposes. (If you think it should be written Unesco, it isn't // an acronym for you any more.) // The problem is what to do with acronyms in camelCase. Doing // a Google search finds few consistent answers; purists want XML // treated like any other word and written Xml; pragmatists // look for compromises (such as relaxing the rules for acronyms // like "US"). If two acronyms occur one after the other, as in // XMLHTTPRequest, things get bad, and it seems that writing this // as XmlHttpRequest is the only sane approach. In my opinion, // you don't have to have two acronyms in a row; even HTTPRequest // is painful to read. // If I were running the world I would abandon camelCase; I don't think // the reasons given for why we should avoid internal underscores in // Scala make any sense. // // I'm not running the world, but I am running this file, where the // acronym CFG (for Content-Free Grammar) occurs repeatedly. // I propose the following convention: Acronyms cause the // capitalization of the following word to be reversed. // So the "XML_HTTP" example becomes XMLhttpRequest. // More generally, if the case of the last letter in a symbol // would be the same as the case of the first letter in the next symbol, // the capitalization of the next symbol is reversed. // By "reversing a symbol's capitalization," I mean that if it is a // spelled-out acronym, all its letters are changed in case; if it is // a pronounced word, only the first letter's case is reversed. // So myiMac becomes myIMac // HTTPRequest becomes HTTPrequest // XMLHTTPRequest becomes XMLhttpRequest sealed trait CFGsymbol { def name: String override def toString: String = { "<" + (this match { case nt: CFGnonterminal => "." + nt.name case t: CFGterminal => t.name case CFGempty => CFG.Epsilon // this.name }) + ">" } } // What we have in mind for terminals is, for example, `N`[oun], not // `lamp`. That will mean "all possible strings" will have strings // like `List(Det, N, V, Det, Noun)` --- which isn't a string at // all, but a list giving a possible sequence of nonterminals. // To actually make a string out of this, you would have to select // strings of the appropriate category, which allows us to make wise // choices and get (e.g.) "The girl loved the tree" and avoid // "A girls love tunafish", thus postponing solving agreement problems // once again. case class CFGterminal(val name: String) extends CFGsymbol { } case class CFGnonterminal(val name: String) extends CFGsymbol { } // The empty string. Of use in the definition of some grammars. case object CFGempty extends CFGsymbol { val name: String = "_e_" } case class CFGrule(lhs: CFGnonterminal, rhs: List[CFGsymbol]) { override def toString: String = "Rule[" + lhs.toString + " --> " + rhs.toString + "]" } case class CFG(terms: List[CFGterminal], nonterms: List[CFGnonterminal], rules: List[CFGrule], startSym: CFGnonterminal) { lazy val leftIndex: HashMap[CFGnonterminal, List[CFGrule]] = { rules.foldLeft(new HashMap[CFGnonterminal, List[CFGrule]])( (indexSoFar, nextRule) => // nextRule adds to list of rules already accumulated // for the nonterminal on the left-hand side: indexSoFar + (nextRule.lhs -> (nextRule :: indexSoFar.getOrElse(nextRule.lhs, Nil)))) } /** Finds all rules with `nonterm` as their left-hand side -- */ def lookupLeft(nonterm: CFGnonterminal): List[CFGrule] = leftIndex.getOrElse(nonterm, Nil) /** Returns the terminal symbol with name `s` or else uh-oh -- */ def lookupTerminal(s: String) = terms.find(_.name == s).getOrElse { throw new Exception("Unknown terminal: " + s) } /** Returns the nonterminal symbol with name `s` or else uh-oh -- */ def lookupNonterminal(s: String) = nonterms.find(_.name == s).getOrElse { throw new Exception("Unknown nonterminal: " + s) } } object CFG { final val Epsilon = "\u03B5" // -- Greek letter, lower-case def legalityCheck(cfg: CFG): Boolean = { (cfg.nonterms contains cfg.startSym) && cfg.rules.forall(legalRule(_, cfg)) } def listsDisjoint[A](l1: List[A], l2: List[A]): Boolean = l1.forall(x1 => !l2.contains(x1)) def legalRule(rule: CFGrule, cfg: CFG): Boolean = { cfg.nonterms.contains(rule.lhs) && (rule.rhs == List(CFGempty) || rule.rhs.forall(s => (s != CFGempty && ( cfg.terms.contains(s) || cfg.nonterms.contains(s))))) } /** Returns well-formed serial conjunction of strings, plus correct * verb ending for it ("s" for 1 element, "" for more than 1, ...). * For example, englishConjunction(List("red", "white", "blue")) * returns ("red, white, and blue", "") * englishConjunction(List("S")) = ("S", "s") * Of course, the verb ending is somewhat simplistic. */ def englishConjunction(l: List[String]): (String, String) = { val n = l.length if (n == 0) ("nothing", "s") else if (n == 1) (l.head, "s") else if (n == 2) (l.head ++ " and " ++ l.tail.head, "") else { val (front, back) = l.splitAt(n-2) (front.mkString(", ") ++ ", and " ++ back.head, "") } } /** Returns text specifying reasons why legalityCheck(cfg) == false. * If legalityCheck(cfg) == true, the text will be the empty string. */ def diagnosis(cfg: CFG): String = { def ruleDiagnosis(r: CFGrule): String = { if (CFG.legalRule(r, cfg)) "" else { "In rule " ++ r.toString ++ ":\n" ++ (if (cfg.nonterms.contains(r.lhs)) "" else { " lhs is not in nonterminals list\n" }) ++ { if (r.rhs == List(CFGempty)) "" else { val empties = r.rhs.count(_ == CFGempty) val badOnRight = r.rhs.filter(s => s == CFGempty || !cfg.terms.contains(s) && !cfg.nonterms.contains(s)) (if (empties == 0) "" else { val singular = (empties == 1) "There " + (if (singular) "is an" else ("are " + empties)) + " occurrence" + (if (singular) "" else "s") + " of " + CFG.Epsilon + " in the rhs\n" }) ++ (if (badOnRight == Nil) "" else { val (badConj, ve) = englishConjunction(badOnRight.map(_.toString)) " In rhs, " ++ badConj ++ (if (ve == "s") " does" else " do") ++ " not occur in either terminals or nonterminals\n" }) } } } } (if (cfg.nonterms contains cfg.startSym) "" else cfg.startSym.toString ++ " is not one of the nonterminals\n") ++ // disjointnessDiagnosis() ++ cfg.rules.foldRight("")((rule, d) => ruleDiagnosis(rule) ++ d) } } /** Syntax tree occurring at level `depth` in some bigger tree (unless * `depth` = 0, when this is the root). * maxDepth is the depth of the deepest node of the tree (wrt the * root) * If depth = maxDepth, then this tree is a leaf. */ abstract class CFGsyntaxTree(val depth: Int, val maxDepth: Int) { } /** A tree consisting of a single terminal symbol */ case class CFGtreeLeaf(val term: CFGterminal, d: Int) extends CFGsyntaxTree(d, d) { } /** A tree consisting of a single unexpanded nonterminal */ case class CFGtreeTip(val nonterm: CFGnonterminal, d: Int) extends CFGsyntaxTree(d, d) { } /** A tree consisting of nothing. Does *not* increase the maxDepth * of any tree it occurs in. */ case class CFGtreeEmpty(d: Int) extends CFGsyntaxTree(d, d-1) { } /** A tree recording the expansion of a nonterminal using a rule. * If d = m, then the rule's right-hand side is empty, and this * is a "terminal nonterminal" node. */ case class CFGexpansion(val nonterm: CFGnonterminal, val expansion: List[CFGsyntaxTree], val rule: CFGrule, d: Int, m: Int) extends CFGsyntaxTree(d, m) { }