// ── programs/extract_corpus/emit.es ──
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: emit.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// Corpus output: write each example to a standalone `.es` file (by topic, or to authored/
// programs), plus a structured manifest, a JSONL training payload, and a coverage report.
// Featured curation is applied post-hoc from an external `featured.txt` — tests are never
// touched. A literal `{` anywhere in a string is mis-lexed as an interpolation-hole start
// (known compiler gap), so JSON braces are emitted as char codes (123='{', 125='}') and
// banners/ids are built with `+` rather than interpolation holes that contain `{`.

// ---- Topic classification -------------------------------------------------

// Taxonomy bucket from the host test file name (fast path).
//
// `file` is matched by substring, top to bottom; the first hit wins and "core" is the
// fallback when nothing matches. Because the tests are substring tests, a more specific
// name must be checked before any shorter name it contains.
func topicOf(file: string) -> string {
    // "FunctionPointers" contains "Pointer", so it has to be tested before the generic
    // pointer bucket — otherwise this branch can never be reached.
    if file.Contains("FunctionPointers") { return "function-pointers" }
    if file.Contains("HeapPointer") || file.Contains("Refs") || file.Contains("Pointer") { return "pointers" }
    if file.Contains("Inheritance") { return "inheritance" }
    if file.Contains("Async") { return "async" }
    if file.Contains("Delegates") || file.Contains("Events") { return "delegates-events" }
    if file.Contains("Const") { return "const" }
    if file.Contains("Embedding") { return "embedding" }
    if file.Contains("StaticFunc") { return "static-func" }
    if file.Contains("Result") || file.Contains("Combinator") { return "result" }
    if file.Contains("FieldDefaults") { return "field-defaults" }
    if file.Contains("Interop") || file.Contains("External") { return "interop" }
    if file.Contains("New") { return "allocation" }
    if file.Contains("TaskScope") || file.Contains("Concurrency") { return "concurrency" }
    if file.Contains("DataContract") { return "data" }
    return "core"
}

// True when an ordinary "..."
// string in `source` contains an interpolation hole ({letter).
// The scan toggles an in-string flag at every '"' and reports the first '{' that sits
// inside a string and is directly followed by a letter. Escaped quotes are not
// special-cased, so this is a heuristic rather than a lexer.
func hasInterpolation(source: string) -> bool {
    var inStr = false
    var i = 0
    while i < source.Length {
        let c = source[i]
        if inStr {
            if c == '"' {
                inStr = false
            } else if c == '{' && i + 1 < source.Length && char.IsLetter(source[i + 1]) {
                return true
            }
        } else if c == '"' {
            inStr = true
        }
        i += 1
    }
    return false
}

// True when `source` uses a pointer type `*T` ('*' directly followed by an uppercase letter).
func hasStarType(source: string) -> bool {
    var i = 0
    while i + 1 < source.Length {
        if source[i] == '*' && char.IsUpper(source[i + 1]) { return true }
        i += 1
    }
    return false
}

// Content-based bucket for examples the filename heuristic dropped into "core". Order is
// precedence: the most distinctive feature wins. Returns "core" when no marker matches.
func topicOfContent(source: string) -> string {
    if source.Contains("task func") || source.Contains("await ") || source.Contains("async ") || source.Contains("Job<") || source.Contains("chan<") { return "async" }
    if source.Contains("open ref data") || source.Contains("abstract ref data") || source.Contains("virtual func") || source.Contains("abstract func") || source.Contains(": base(") { return "inheritance" }
    if source.Contains("choice ") || source.Contains("match ") { return "choice" }
    if hasStarType(source) || source.Contains("HeapPointer") || source.Contains("StackAlloc") || source.Contains("HeapAlloc") { return "pointers" }
    if source.Contains("enum ") { return "enum" }
    if source.Contains("Result<") || source.Contains("ok(") || source.Contains("error(") { return "result" }
    if source.Contains("delegate func") || source.Contains("event ") || source.Contains("raise ") || source.Contains("&(") { return "delegates-events" }
    // NOTE(review): both needles on this line were empty strings in this copy of the file.
    // `Contains("")` is true for every string, so every example that reached this line was
    // bucketed as "generics" and all checks below it were dead. `<T>` / `<T,` are a
    // reconstruction of generic-parameter markers — confirm against the original needles.
    if source.Contains("<T>") || source.Contains("<T,") { return "generics" }
    if hasInterpolation(source) { return "interpolation" }
    if source.Contains("static func ") { return "static-func" }
    if source.Contains("using \"System") || source.Contains("using \"Microsoft") || source.Contains("StringBuilder") || source.Contains("Dictionary<") { return "interop" }
    if source.Contains("data ") { return "data" }
    return "core"
}

// Final topic: filename heuristic first, content classification for the "core" residue.
func classifyTopic(file: string, source: string) -> string {
    let t = topicOf(file)
    if t == "core" { return topicOfContent(source) }
    return t
}

// ---- Identity / banner / paths --------------------------------------------

// Strip a trailing ".cs" or ".es" extension. Any other name is returned unchanged.
func stripExt(file: string) -> string {
    if file.EndsWith(".cs") || file.EndsWith(".es") {
        // Both extensions are exactly three characters long.
        return file.Substring(0, file.Length - 3)
    }
    return file
}

// Stable provenance id, as a free function over primitives. Program files:
// `<program>__<file>`. Otherwise `<file>[__<method>]`. In both forms `<file>` has its
// ".cs"/".es" extension stripped. Kept receiver-free so it can be called on `let`-bound
// Fact locals (a promoted method on a `let`-local `ref data` receiver currently
// mis-emits — see tickets/compiler-gaps).
func computeId(program: string, file: string, method: string) -> string {
    if program.Length > 0 { return program + "__" + stripExt(file) }
    let f = stripExt(file)
    if method.Length > 0 { return f + "__" + method }
    return f
}

// Promoted convenience: `fact.idOf()`. Safe on loop-variable / parameter receivers.
func idOf(fact: Fact) -> string {
    return computeId(fact.program, fact.file, fact.method)
}

// Relative path of the example's `.es` within the corpus dir:
//   kind "program"  -> programs/<program>/<file>
//   kind "authored" -> authored/<id>.es
//   anything else   -> examples/<topic>/<id>.es
func esPathOf(fact: Fact) -> string {
    if fact.kind == "program" { return "programs/" + fact.program + "/" + fact.file }
    if fact.kind == "authored" { return "authored/" + fact.idOf() + ".es" }
    return "examples/" + fact.topic + "/" + fact.idOf() + ".es"
}

// Per-`.es` header banner: an E#-identity line (doubles as the .es-vs-ECMAScript
// disambiguator), provenance, and the verified behavior. Built with `+` so an `expected`
// value containing `{` never trips interpolation-hole lexing.
func banner(fact: Fact) -> string {
    // Third banner line: the behavior claim, picked by kind. Any kind other than the four
    // handled here keeps the compile-only wording.
    var claimLine = "// compiles cleanly (no auto-run claim was extracted)"
    if fact.kind == "runnable" {
        claimLine = "// verified behavior: Test." + fact.entry + "(...) == " + fact.expected
    } else if fact.kind == "negative" {
        claimLine = "// verified behavior: reports diagnostic " + fact.diag
    } else if fact.kind == "authored" {
        claimLine = "// hand-authored, idiomatic E# — verified through the E# compiler"
    } else if fact.kind == "program" {
        claimLine = "// part of extract_corpus — the first real E# program (multi-file, dogfood)"
    }

    // Provenance is `file`, or `file::method` when a method name is present.
    var provenance = fact.file
    if fact.method.Length > 0 {
        provenance = fact.file + "::" + fact.method
    }

    var statusWord = "unverified"
    if fact.verified {
        statusWord = "verified"
    }

    let identityLine = "// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).\n"
    let provenanceLine = "// provenance: " + provenance + " topic: " + fact.topic + " status: " + statusWord + "\n"
    // The banner ends with a blank line so the example source starts on its own line.
    return identityLine + provenanceLine + claimLine + "\n\n"
}

// Escape `s` for use inside a JSON string literal: backslash and double quote are
// backslash-escaped, carriage returns are dropped, newline and tab become \n and \t.
// Backslashes go first so the escapes added by later steps are not escaped again.
func jsonEscape(s: string) -> string {
    let withBackslashes = s.Replace("\\", "\\\\")
    let withQuotes = withBackslashes.Replace("\"", "\\\"")
    let withoutCr = withQuotes.Replace("\r", "")
    let withNewlines = withoutCr.Replace("\n", "\\n")
    return withNewlines.Replace("\t", "\\t")
}

// Append a JSON string literal (quotes + escaped value), piece-by-piece — a hole with a
// NESTED call (`{jsonEscape(f.idOf())}`, parens depth > 1) currently mis-parses.
func appendStr(sb: StringBuilder, value: string) {
    let escaped = jsonEscape(value)
    sb.Append("\"")
    sb.Append(escaped)
    sb.Append("\"")
}

// ---- Dedup ----------------------------------------------------------------

// Conservative content key: normalize line endings + outer whitespace only (do not strip
// comments) so we only collapse genuinely identical programs.
func normalizeBody(source: string) -> string {
    // CRLF first, then any remaining lone CR, so every line ending becomes a single "\n".
    var s = source.Replace("\r\n", "\n")
    s = s.Replace("\r", "\n")
    return s.Trim()
}

// Canonical preference (inlined in dedup to avoid a two-`ref data`-param promoted
// method, which currently mis-emits the receiver — see tickets/compiler-gaps):
// a non-core topic beats core, then verified beats unverified, then the shorter id.
// Returns true when the candidate should replace the current canonical; a full tie
// returns false, so the earlier fact is kept.
func betterCanonical(candTopic: string, candVerified: bool, candIdLen: int, curTopic: string, curVerified: bool, curIdLen: int) -> bool {
    let candCore = candTopic == "core"
    let curCore = curTopic == "core"
    if candCore != curCore { return !candCore }
    if candVerified != curVerified { return candVerified }
    return candIdLen < curIdLen
}

// Collapse identical-source facts to one canonical each, preserving every absorbed
// `file::method` in `origins` and the group size in `duplicateCount`. No silent drops:
// Σ duplicateCount == input count, and every input id lands in some `origins`.
//
// Implementation note: every Fact member is touched only through a *loop variable*.
// All maps hold strings / List<string> / bool / int — never a Fact — because member
// access on a ref-data value pulled out of a generic collection currently mis-emits (the
// receiver is loaded by address). See tickets/compiler-gaps-corpus-extractor.md.
//
// Side effect: `origins`, `duplicateCount` and `canonical` are written on the input facts.
func dedup(facts: List<Fact>) -> List<Fact> {
    // Pass 1 — group by normalized source. Track all ids per key (the future
    // `origins`), and the chosen-canonical's identity as primitives.
    let idsByKey = Dictionary<string, List<string>>()   // key -> every id in the group
    let bestId = Dictionary<string, string>()           // key -> chosen canonical id
    let bestTopic = Dictionary<string, string>()        // key -> canonical's topic
    let bestVer = Dictionary<string, bool>()            // key -> canonical's verified flag
    let bestLen = Dictionary<string, int>()             // key -> canonical's id length
    for fact in facts {
        let key = normalizeBody(fact.source)
        let id = computeId(fact.program, fact.file, fact.method)
        if !idsByKey.ContainsKey(key) {
            idsByKey[key] = List<string>()
            bestId[key] = id
            bestTopic[key] = fact.topic
            bestVer[key] = fact.verified
            bestLen[key] = id.Length
        } else if betterCanonical(fact.topic, fact.verified, id.Length, bestTopic[key], bestVer[key], bestLen[key]) {
            bestId[key] = id
            bestTopic[key] = fact.topic
            bestVer[key] = fact.verified
            bestLen[key] = id.Length
        }
        idsByKey[key].Add(id)
    }

    // Pass 2 — emit the canonical fact of each group (the one whose id is `bestId`),
    // stamping its provenance; mark the rest non-canonical. Both happen on loop vars.
    let result = List<Fact>()
    // Two facts in one group can share an id (same file and method name). `emitted` keeps
    // the group to a single canonical so the duplicateCount sum still equals the input count.
    let emitted = Dictionary<string, bool>()
    for fact in facts {
        let key = normalizeBody(fact.source)
        let id = computeId(fact.program, fact.file, fact.method)
        if id == bestId[key] && !emitted.ContainsKey(key) {
            emitted[key] = true
            fact.origins = idsByKey[key]
            fact.duplicateCount = idsByKey[key].Count
            fact.canonical = true
            result.Add(fact)
        } else {
            fact.canonical = false
        }
    }
    return result
}

// ---- Writers --------------------------------------------------------------

// Delete the generated example trees so a re-run never leaves stale files (a renamed
// topic, a removed test, a different dedup choice) behind in the published corpus.
// Only the generated subdirs are touched; manifest.json / corpus.jsonl / coverage.md are
// overwritten in place, and featured.txt (curation input) is left alone.
func cleanOutput(corpusDir: string) {
    let dirs = List<string>()
    dirs.Add("examples")
    dirs.Add("authored")
    dirs.Add("programs")
    dirs.Add("featured")
    for d in dirs {
        let p = System.IO.Path.Combine(corpusDir, d)
        // Recursive delete; a subdir that does not exist yet is skipped.
        if System.IO.Directory.Exists(p) { System.IO.Directory.Delete(p, true) }
    }
}

// Write every example to its `.es` path (examples/<topic>/, authored/, or programs/
// <program>/).
// The generated trees are wiped first (cleanOutput), then each fact is written as its
// banner followed by its source; the parent directory is created as needed.
func writeCorpus(facts: List<Fact>, corpusDir: string) {
    cleanOutput(corpusDir)
    for fact in facts {
        let path = System.IO.Path.Combine(corpusDir, fact.esPathOf())
        let dir = System.IO.Path.GetDirectoryName(path)
        System.IO.Directory.CreateDirectory(dir)
        System.IO.File.WriteAllText(path, fact.banner() + fact.source)
    }
}

// Write manifest.json — one entry per example, hand-rolled so source stays in the .es
// files (esPath points at them) and only metadata lands here. The output is a JSON array
// with one object per line; object braces are appended as char codes 123 / 125 rather
// than written inside a string literal.
func writeManifest(facts: List<Fact>, corpusDir: string) {
    let sb = StringBuilder()
    sb.Append("[\n")
    var first = true
    for fact in facts {
        // Separator goes before every entry except the first.
        if !first { sb.Append(",\n") }
        first = false
        var v = "false"
        if fact.verified { v = "true" }
        let dc = fact.duplicateCount
        sb.Append(" ")
        sb.Append(Convert.ToChar(123))
        sb.Append("\"id\":")
        appendStr(sb, fact.idOf())
        sb.Append(",\"topic\":")
        appendStr(sb, fact.topic)
        sb.Append(",\"kind\":")
        appendStr(sb, fact.kind)
        sb.Append(",\"program\":")
        appendStr(sb, fact.program)
        sb.Append(",\"esPath\":")
        appendStr(sb, fact.esPathOf())
        sb.Append(",\"entry\":")
        appendStr(sb, fact.entry)
        sb.Append(",\"expected\":")
        appendStr(sb, fact.expected)
        sb.Append(",\"diag\":")
        appendStr(sb, fact.diag)
        sb.Append(",\"verified\":")
        sb.Append(v)
        sb.Append(",\"duplicateCount\":")
        sb.Append(Convert.ToString(dc))
        sb.Append(",\"args\":[")
        var af = true
        for a in fact.args {
            if !af { sb.Append(",") }
            af = false
            appendStr(sb, a)
        }
        sb.Append("]")
        sb.Append(",\"origins\":[")
        var of = true
        for o in fact.origins {
            if !of { sb.Append(",") }
            of = false
            appendStr(sb, o)
        }
        sb.Append("]")
        sb.Append(Convert.ToChar(125))
    }
    sb.Append("\n]\n")
    System.IO.File.WriteAllText(System.IO.Path.Combine(corpusDir, "manifest.json"), sb.ToString())
}

// Write corpus.jsonl — one JSON object per line with INLINE source + metadata. This is
// the HuggingFace-ready training payload; consumers load it directly.
func writeJsonl(facts: List<Fact>, corpusDir: string) {
    let sb = StringBuilder()
    for fact in facts {
        var v = "false"
        if fact.verified { v = "true" }
        let dc = fact.duplicateCount
        // Object braces are appended as char codes (123 = '{', 125 = '}') instead of being
        // written inside a string literal.
        sb.Append(Convert.ToChar(123))
        sb.Append("\"id\":")
        appendStr(sb, fact.idOf())
        sb.Append(",\"topic\":")
        appendStr(sb, fact.topic)
        sb.Append(",\"kind\":")
        appendStr(sb, fact.kind)
        sb.Append(",\"program\":")
        appendStr(sb, fact.program)
        sb.Append(",\"entry\":")
        appendStr(sb, fact.entry)
        sb.Append(",\"expected\":")
        appendStr(sb, fact.expected)
        sb.Append(",\"diag\":")
        appendStr(sb, fact.diag)
        sb.Append(",\"verified\":")
        sb.Append(v)
        sb.Append(",\"duplicateCount\":")
        sb.Append(Convert.ToString(dc))
        sb.Append(",\"source\":")
        appendStr(sb, fact.source)
        sb.Append(Convert.ToChar(125))
        // One record per line; newlines inside values are escaped by appendStr.
        sb.Append("\n")
    }
    System.IO.File.WriteAllText(System.IO.Path.Combine(corpusDir, "corpus.jsonl"), sb.ToString())
}

// Write coverage.md — totals per bucket, verification rate, and dedup summary. No silent
// drops: the unknown bucket and the collapsed-duplicate counts are reported.
func writeCoverage(facts: List<Fact>, corpusDir: string, fileCount: int) {
    var runnable = 0
    var negative = 0
    var unknown = 0
    var authored = 0
    var program = 0
    var verified = 0
    var redundant = 0
    var groups = 0
    for fact in facts {
        // Any kind outside the four named ones is counted as "unknown".
        if fact.kind == "runnable" {
            runnable += 1
        } else if fact.kind == "negative" {
            negative += 1
        } else if fact.kind == "authored" {
            authored += 1
        } else if fact.kind == "program" {
            program += 1
        } else {
            unknown += 1
        }
        if fact.verified { verified += 1 }
        // A fact with duplicateCount > 1 stands for a collapsed group; every member beyond
        // the canonical one counts as redundant.
        if fact.duplicateCount > 1 {
            groups += 1
            redundant += fact.duplicateCount - 1
        }
    }
    let sb = StringBuilder()
    sb.Append("# E# corpus — extraction & verification coverage\n\n")
    sb.Append("Generated by `tools/extract_corpus` (written in E#).\n\n")
    sb.Append("| metric | count |\n")
    sb.Append("|---|---|\n")
    sb.Append("| host files scanned | {fileCount} |\n")
    sb.Append("| canonical examples | {facts.Count} |\n")
    sb.Append("| runnable (value claim) | {runnable} |\n")
    sb.Append("| negative (diagnostic) | {negative} |\n")
    sb.Append("| unknown (compile-only) | {unknown} |\n")
    sb.Append("| authored (artisanal) | {authored} |\n")
    sb.Append("| program (flagship) | {program} |\n")
    sb.Append("| re-verified through the E# compiler | {verified} |\n")
    sb.Append("| duplicate groups collapsed | {groups} |\n")
    sb.Append("| redundant files removed | {redundant} |\n")
    System.IO.File.WriteAllText(System.IO.Path.Combine(corpusDir, "coverage.md"), sb.ToString())
}

// Featured curation, applied post-hoc: read `featured.txt` (one id per line) and copy the
// named examples into `featured/`. The test files are never touched.
func applyFeatured(facts: List<Fact>, corpusDir: string) {
    let listPath = System.IO.Path.Combine(corpusDir, "featured.txt")
    // No curation list means nothing to do.
    if !System.IO.File.Exists(listPath) { return }
    let wanted = List<string>()
    for line in System.IO.File.ReadAllLines(listPath) {
        let t = line.Trim()
        // Blank lines and lines starting with '#' are skipped.
        if t.Length > 0 && !t.StartsWith("#") { wanted.Add(t) }
    }
    if wanted.Count == 0 { return }
    let featuredDir = System.IO.Path.Combine(corpusDir, "featured")
    System.IO.Directory.CreateDirectory(featuredDir)
    for fact in facts {
        let id = fact.idOf()
        if wanted.Contains(id) {
            let dst = System.IO.Path.Combine(featuredDir, id + ".es")
            System.IO.File.WriteAllText(dst, fact.banner() + fact.source)
        }
    }
}

// ── programs/extract_corpus/extract.es ──
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: extract.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// Stage 1 — Extract: walk the test corpus's C# host files via Roslyn and lift the
// embedded `.es` programs + their behavioral claims out of each `[Fact]`. This is
// the interop-aggressive half of the dogfood: E# consuming Roslyn's generic,
// type-pattern-heavy API. LINQ (`OfType`) resolves from the implicit System.Linq
// import; the Roslyn syntax types come from the imports below. The walk is
// deliberately string-centric (`.ToString()` on nodes) rather than type-pattern
// matching, so it leans on Roslyn's enumerables, not `is`-patterns over CLR types.

using "Microsoft.CodeAnalysis"
using "Microsoft.CodeAnalysis.CSharp"
using "Microsoft.CodeAnalysis.CSharp.Syntax"

// One extracted example: an `.es` program plus the behavioral claim its [Fact] made.
// id (provenance) is `file::method`; kind is the classification bucket.
ref data Fact {
    var file: string = ""
    var method: string = ""
    var source: string = ""
    var kind: string = "unknown"                 // "runnable" | "negative" | "authored" | "program" | "unknown"
    var entry: string = ""                       // E# function to invoke (runnable)
    var args: List<string> = List<string>()      // literal arg texts (runnable)
    var expected: string = ""                    // literal expected text (runnable)
    var diag: string = ""                        // expected diagnostic code (negative)
    var verified: bool = false                   // re-verified through the E# compiler (Stage 2)
    var topic: string = "core"                   // taxonomy bucket (file + content)
    var origins: List<string> = List<string>()   // every file::method whose source is identical (dedup)
    var duplicateCount: int = 1                  // how many test methods carried this exact source
    var canonical: bool = true                   // false for absorbed duplicates (not written)
    var program: string = ""                     // non-empty for multi-file flagship-program files (kind "program")
}

// True when a method declaration carries an xUnit [Fact] or [Theory] attribute.
// The attribute name is compared as written, so only the bare spellings `Fact` and
// `Theory` match.
func isTestMethod(m: MethodDeclarationSyntax) -> bool {
    for a in m.DescendantNodes().OfType<AttributeSyntax>() {
        let name = a.Name.ToString()
        if name == "Fact" || name == "Theory" { return true }
    }
    return false
}

// Heuristic: a string literal is an embedded E# program if it carries E# structure.
func looksLikeEsharp(s: string) -> bool {
    return s.Contains("namespace ") || s.Contains("func ") || s.Contains("data ") || s.Contains("choice ")
}

// "ES2151" / "ES3012" etc. — a diagnostic-code literal (E + S + four digits).
func isDiagCode(s: string) -> bool {
    if s.Length != 6 { return false }
    if s[0] != 'E' || s[1] != 'S' { return false }
    // Positions 2..5 must all be digits.
    var i = 2
    while i < 6 {
        if !char.IsDigit(s[i]) { return false }
        i += 1
    }
    return true
}

// Strip a single layer of surrounding double quotes from an argument's source text.
func unquote(s: string) -> string {
    // Only a text that both starts and ends with '"' is unwrapped; anything else is
    // returned as-is.
    if s.Length >= 2 && s[0] == '"' && s[s.Length - 1] == '"' {
        return s.Substring(1, s.Length - 2)
    }
    return s
}

// Split a call's argument text at top-level commas (ignoring commas nested inside
// (), [], <>, {}, ordinary "..." strings, or """...""" raw strings). Works off the
// invocation's source text so it never touches Roslyn's `SeparatedSyntaxList` struct
// (a known interop gap). Brace + raw-string awareness is what keeps object initializers
// like `new object?[] { "/users/{id}", x }` and multi-line raw-string args from
// mis-splitting at an interior comma.
//
// Each returned piece is trimmed. An empty `inner` yields an empty list, and nothing is
// added for an empty tail after a trailing comma.
// NOTE(review): this is a text heuristic, not a parser — escaped quotes inside strings
// and `<` / `>` used as comparison operators are not special-cased.
func splitTopLevel(inner: string) -> List<string> {
    let result = List<string>()
    var depth = 0
    var inStr = false   // inside an ordinary "..." literal
    var inRaw = false   // inside a """...""" raw-string literal
    var start = 0
    var i = 0
    while i < inner.Length {
        let c = inner[i]
        if inRaw {
            // Only a closing triple-quote exits a raw string; everything else is inert.
            if c == '"' && i + 2 < inner.Length && inner[i + 1] == '"' && inner[i + 2] == '"' {
                inRaw = false
                i += 3
                continue
            }
        } else if inStr {
            if c == '"' { inStr = false }
        } else if c == '"' && i + 2 < inner.Length && inner[i + 1] == '"' && inner[i + 2] == '"' {
            // The triple-quote test must come before the single-quote test below.
            inRaw = true
            i += 3
            continue
        } else if c == '"' {
            inStr = true
        } else if c == '(' || c == '[' || c == '<' || c == '{' {
            depth += 1
        } else if c == ')' || c == ']' || c == '>' || c == '}' {
            depth -= 1
        } else if c == ',' && depth == 0 {
            result.Add(inner.Substring(start, i - start).Trim())
            start = i + 1
        }
        i += 1
    }
    // Whatever follows the last top-level comma is the final argument.
    if start < inner.Length {
        result.Add(inner.Substring(start, inner.Length - start).Trim())
    }
    return result
}

// An E# identifier: [A-Za-z_][A-Za-z0-9_]* (specification/lexical.md). Used to reject
// mis-parsed `entry` values (object initializers, `new`, fragments with spaces/braces)
// before a fact is allowed to claim a runnable behavior.
func isIdentifier(s: string) -> bool {
    if s.Length == 0 { return false }
    // First character: a letter or '_'.
    let c0 = s[0]
    if !char.IsLetter(c0) && c0 != '_' { return false }
    // Remaining characters: letters, digits or '_'.
    var i = 1
    while i < s.Length {
        let c = s[i]
        if !char.IsLetter(c) && !char.IsDigit(c) && c != '_' { return false }
        i += 1
    }
    return true
}

// The argument-list text inside the outermost parentheses of a call expression's
// source, e.g. `Invoke(asm, "Test", "sumTo", 10)` -> `asm, "Test", "sumTo", 10`.
// Spans from the first '(' to the last ')'; returns "" when no such pair exists.
func innerArgs(callText: string) -> string {
    let open = callText.IndexOf('(')
    let close = callText.LastIndexOf(')')
    if open < 0 || close <= open { return "" }
    return callText.Substring(open + 1, close - open - 1)
}

// The source-text of each argument of an invocation, parsed from its source text.
func argTexts(inv: InvocationExpressionSyntax) -> List<string> {
    return splitTopLevel(innerArgs(inv.ToString()))
}

// The first embedded E# program in a test method (most tests carry exactly one).
// Returns "" when no literal in the method looks like E#.
func primarySource(m: MethodDeclarationSyntax) -> string {
    for lit in m.DescendantNodes().OfType<LiteralExpressionSyntax>() {
        let v = lit.Token.ValueText
        if looksLikeEsharp(v) { return v }
    }
    return ""
}

// The diagnostic code a negative test asserts, if any ("" otherwise).
func diagCode(m: MethodDeclarationSyntax) -> string {
    for lit in m.DescendantNodes().OfType<LiteralExpressionSyntax>() {
        let v = lit.Token.ValueText
        if isDiagCode(v) { return v }
    }
    return ""
}

// Parse a runnable claim from an `Assert.Equal(expected, Invoke/Run(...))` invocation.
// Fills entry/args/expected on `fact` and returns true on the dominant shapes:
//   Invoke(asm, "Test", "method", args...)   — typeName + method, then args
//   Run(asm, "Test", "method", args...)
//   EsHarness.Run(src, "method", args...)    — source expr, method, then args
// Returns false, leaving `fact` untouched, when the assert does not have exactly two
// arguments or no inner call yields a valid entry name.
func tryRunnable(assertInv: InvocationExpressionSyntax, fact: Fact) -> bool {
    let assertArgs = argTexts(assertInv)
    if assertArgs.Count != 2 { return false }
    // The inner Invoke/Run call lives among the assert's descendants.
    for inner in assertInv.DescendantNodes().OfType<InvocationExpressionSyntax>() {
        let callee = inner.Expression.ToString()
        if callee.EndsWith("Invoke") || callee.EndsWith("Run") {
            let ia = argTexts(inner)
            if ia.Count < 2 { continue }
            var entry = ""
            let candidateArgs = List<string>()
            // typeName-shape when the second arg is the literal "Test".
            if ia.Count >= 3 && ia[1] == "\"Test\"" {
                entry = unquote(ia[2])
                var i = 3
                while i < ia.Count {
                    candidateArgs.Add(ia[i])
                    i += 1
                }
            } else {
                // Run(src, "method", args...) shape.
                entry = unquote(ia[1])
                var i = 2
                while i < ia.Count {
                    candidateArgs.Add(ia[i])
                    i += 1
                }
            }
            // Guard: a mis-parsed entry (object initializer, `new`, a fragment with
            // braces/spaces) is not a valid method name. Skip it — the fact still ships
            // its source as compile-only ("unknown"), never a garbled runnable claim.
            if !isIdentifier(entry) { continue }
            fact.entry = entry
            // The caller runs this once per Assert.Equal on the same fact. Reset the list
            // so args from an earlier assert do not pile up next to this assert's
            // entry/expected.
            fact.args.Clear()
            for a in candidateArgs { fact.args.Add(a) }
            fact.expected = assertArgs[0]
            fact.kind = "runnable"
            return true
        }
    }
    return false
}

// Extract every [Fact]/[Theory] in one C# host file into Fact records.
// Methods with no embedded E# program are skipped. When a method has several matching
// asserts, the claim recorded is the one from the last assert that parsed.
func extractFile(path: string) -> List<Fact> {
    let facts = List<Fact>()
    let text = System.IO.File.ReadAllText(path)
    let tree = CSharpSyntaxTree.ParseText(text)
    let root = tree.GetRoot()
    let fileName = System.IO.Path.GetFileName(path)
    for m in root.DescendantNodes().OfType<MethodDeclarationSyntax>() {
        if !isTestMethod(m) { continue }
        let src = primarySource(m)
        if src == "" { continue }   // no embedded E# program — metadata/reflection test
        let fact = Fact { file: fileName, method: m.Identifier.ValueText, source: src, topic: classifyTopic(fileName, src) }
        // Runnable: an Assert.Equal whose second argument runs the program.
        var matched = false
        for inv in m.DescendantNodes().OfType<InvocationExpressionSyntax>() {
            if inv.Expression.ToString() == "Assert.Equal" {
                if tryRunnable(inv, fact) { matched = true }
            }
        }
        // Negative: asserts a diagnostic code instead of a value.
        if !matched {
            let code = diagCode(m)
            if code != "" {
                fact.kind = "negative"
                fact.diag = code
            }
        }
        facts.Add(fact)
    }
    return facts
}

// Read every hand-authored `.es` under `authoredDir` into "authored" Facts — the
// curated, idiomatic tier. Topic is content-classified so they also surface under their
// feature's topic; they are written to `corpus/authored/` regardless (see esPathOf).
// A missing directory yields an empty list.
func ingestAuthored(authoredDir: string) -> List<Fact> {
    let facts = List<Fact>()
    if !System.IO.Directory.Exists(authoredDir) { return facts }
    for path in System.IO.Directory.GetFiles(authoredDir, "*.es") {
        let src = System.IO.File.ReadAllText(path)
        let name = System.IO.Path.GetFileName(path)
        let fact = Fact { file: name, method: "", source: src, kind: "authored", topic: classifyTopic(name, src) }
        facts.Add(fact)
    }
    return facts
}

// Read a multi-file program's `.es` files (one Fact per file, kind "program") — the
// flagship "first real E# program". Written to `corpus/programs/<program>/`.
// Names in `fileNames` that do not exist under `srcDir` are skipped.
func ingestProgram(programName: string, srcDir: string, fileNames: List<string>) -> List<Fact> {
    let facts = List<Fact>()
    for fn in fileNames {
        let path = System.IO.Path.Combine(srcDir, fn)
        if !System.IO.File.Exists(path) { continue }
        let src = System.IO.File.ReadAllText(path)
        let fact = Fact { file: fn, method: "", source: src, kind: "program", program: programName, topic: "programs" }
        facts.Add(fact)
    }
    return facts
}

// ── programs/extract_corpus/main.es ──
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: main.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// extract_corpus — the E#-written tool that lifts the verified `.es` corpus out of the
// test suite (Stage 1, Roslyn) and independently re-verifies each example through the E#
// compiler (Stage 2), then writes corpus/examples + authored + the flagship program, plus
// manifest.json, corpus.jsonl (training payload), and coverage.md.
// Dogfood: the toolchain
// that ships E# runs on E#.
//
// Stage 1 (extract.es) C# host files -> per-[Fact] Fact records via Roslyn
// Stage 2 (verify.es) each example recompiled through the E# IL backend
// dedup (emit.es) collapse identical sources, preserve provenance
// output (emit.es) corpus/{examples,authored,programs} + manifest + jsonl + coverage
func main() {
    // Paths are relative to the esharp repo root (run the tool from there), or pass
    // absolute positional argv overrides for CI:
    //   extract_corpus <testDir> <corpusDir> <toolDir>
    var testDir = "tests/Esharp.Tests"
    var corpusDir = "corpus"
    var toolDir = "tools/extract_corpus"
    // Overrides are read from argv[1] onward.
    let argv = System.Environment.GetCommandLineArgs()
    if argv.Length > 1 { testDir = argv[1] }
    if argv.Length > 2 { corpusDir = argv[2] }
    if argv.Length > 3 { toolDir = argv[3] }

    // Stage 1 — extract from C# host files.
    let files = System.IO.Directory.GetFiles(testDir, "*.cs")
    let extracted = List<Fact>()
    for f in files {
        for fact in extractFile(f) { extracted.Add(fact) }
    }
    Console.WriteLine("Stage 1: extracted {extracted.Count} examples from {files.Length} host files.")

    // Stage 2 — re-verify each through the E# compiler.
    Console.WriteLine("Stage 2: re-verifying extracted examples (this recompiles every one)...")
    verifyAll(extracted)

    // Dedup the extracted bulk (authored + program are curated/unique, kept as-is).
    let canon = dedup(extracted)
    Console.WriteLine("Dedup: {extracted.Count} -> {canon.Count} canonical.")

    // Artisanal hand-authored examples.
    let authoredDir = System.IO.Path.Combine(toolDir, "authored")
    let authored = ingestAuthored(authoredDir)
    verifyAll(authored)
    Console.WriteLine("Authored: {authored.Count} artisanal examples.")

    // Flagship program: extract_corpus's own source — the first real E# program.
    let programFiles = List<string>()
    programFiles.Add("extract.es")
    programFiles.Add("emit.es")
    programFiles.Add("verify.es")
    programFiles.Add("main.es")
    let program = ingestProgram("extract_corpus", toolDir, programFiles)
    verifyAll(program)
    Console.WriteLine("Program: {program.Count} flagship files.")

    // Combine and write.
    let all = List<Fact>()
    for fact in canon { all.Add(fact) }
    for fact in authored { all.Add(fact) }
    for fact in program { all.Add(fact) }
    writeCorpus(all, corpusDir)
    writeManifest(all, corpusDir)
    writeJsonl(all, corpusDir)
    writeCoverage(all, corpusDir, files.Length)
    // Featured copies are written last: writeCorpus wipes the featured/ tree.
    applyFeatured(all, corpusDir)

    // Summary: kind counts cover the deduplicated extracted set only; the verified
    // count covers everything written.
    var runnable = 0
    var negative = 0
    var unknown = 0
    for fact in canon {
        if fact.kind == "runnable" {
            runnable += 1
        } else if fact.kind == "negative" {
            negative += 1
        } else {
            unknown += 1
        }
    }
    var verified = 0
    for fact in all {
        if fact.verified { verified += 1 }
    }
    Console.WriteLine("=== extract_corpus complete ===")
    Console.WriteLine(" total: {all.Count}")
    Console.WriteLine(" runnable {runnable} negative {negative} unknown {unknown} authored {authored.Count} program {program.Count}")
    Console.WriteLine(" re-verified: {verified}/{all.Count}")
    Console.WriteLine(" written to: {corpusDir}")
}

// ── programs/extract_corpus/verify.es ──
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: verify.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// Stage 2 — Re-verify: each extracted example is recompiled through the E# IL backend
// (the source-of-truth pipeline EsHarness uses) so every published example is provably
// green, decoupled from xUnit. This is the cleanest dogfood: E# invoking the E# compiler.
//
// Per-file `using`s scope these Esharp.* imports to this file only — extract.es imports
// Roslyn under the same namespace with no collision (per-file import scoping).
using "Esharp.Compiler.Parsing"
using "Esharp.Compiler.Binding"
using "Esharp.Compiler.Diagnostics"
using "Esharp.ILEmit"

// True when `source` parses, binds, and emits verifiable IL with zero errors — the same
// parse -> bind -> emit(verify) pipeline the test harness runs. A corpus example that
// passes here is provably compilable independent of the test suite.
//
// The three stages run in order and the first Error-severity diagnostic from any of them
// returns false. The emitted assembly goes to a fixed file name in the system temp
// directory.
func verifyCompiles(source: string) -> bool {
    let parser = Parser(source, "corpus.es")
    let unit = parser.ParseCompilationUnit()
    // Only hard parse ERRORS disqualify an example — a warning (e.g. a deprecation
    // notice) still compiles and runs. Mirror the binder/emit error filtering below.
    for d in parser.Diagnostics {
        if d.Severity == DiagnosticSeverity.Error { return false }
    }
    let binder = Binder()
    let bound = binder.Bind(unit)
    for d in binder.Diagnostics {
        if d.Severity == DiagnosticSeverity.Error { return false }
    }
    let tmp = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "corpus_verify.dll")
    // Pass implicitUsings=true explicitly (final arg). Omitting this trailing optional
    // makes E# zero-fill it to default(bool)=false rather than honoring C#'s default
    // (true), which disables the implicit BCL-namespace search and leaves unqualified
    // types like `List()` unresolved. See tickets/compiler-gaps-corpus-extractor.md.
    let emitDiags = ILEmitter.EmitToFile(bound, "corpus_verify", tmp, false, nil, true, true)
    for d in emitDiags {
        if d.Severity == DiagnosticSeverity.Error { return false }
    }
    return true
}

// True when binding `source` reports the expected diagnostic code — a negative example
// is "verified" when it still produces the error its [Fact] asserted.
func verifyDiagnostic(source: string, code: string) -> bool {
    let parser = Parser(source, "corpus.es")
    let unit = parser.ParseCompilationUnit()
    let binder = Binder()
    // Bind for its diagnostics only; the bound result is not used.
    binder.Bind(unit)
    // The code is looked up as a substring of each diagnostic's message text.
    // NOTE(review): only binder diagnostics are searched — confirm whether a code reported
    // by the parser or the emitter should also count.
    for d in binder.Diagnostics {
        if d.Message.Contains(code) { return true }
    }
    return false
}

// Set `verified` on each fact by re-running it through the compiler: runnable/unknown
// examples must compile clean; negative examples must still surface their diagnostic.
func verifyAll(facts: List<Fact>) {
    for fact in facts {
        try {
            if fact.kind == "negative" {
                fact.verified = verifyDiagnostic(fact.source, fact.diag)
            } else {
                fact.verified = verifyCompiles(fact.source)
            }
        } catch {
            // A malformed extraction that throws inside the compiler is simply
            // unverified — never abort the whole run.
            fact.verified = false
        }
    }
}