Skip to content

extract_corpus

← all programs · 4 files · raw source ↓

extract_corpus__emit

programs program

Part of a flagship E# program

// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: emit.es   topic: programs   status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// Corpus output: write each example to a standalone `.es` file (by topic, or to authored/
// programs), plus a structured manifest, a JSONL training payload, and a coverage report.
// Featured curation is applied post-hoc from an external `featured.txt` — tests are never
// touched. A literal `{` anywhere in a string is mis-lexed as an interpolation-hole start
// (known compiler gap), so JSON braces are emitted as char codes (123='{', 125='}') and
// banners/ids are built with `+` rather than interpolation holes that contain `{`.

// ---- Topic classification -------------------------------------------------

// Taxonomy bucket from the host test file name (fast path).
//
// Checks run top to bottom and the first match wins, so a specific marker must be
// tested before any broader marker it contains. "FunctionPointers" contains "Pointer",
// so it is checked first; otherwise the "function-pointers" bucket could never be
// returned. Returns "core" when no marker matches.
func topicOf(file: string) -> string {
    if file.Contains("FunctionPointers") { return "function-pointers" }
    if file.Contains("HeapPointer") || file.Contains("Refs") || file.Contains("Pointer") {
        return "pointers"
    }
    if file.Contains("Inheritance") { return "inheritance" }
    if file.Contains("Async") { return "async" }
    if file.Contains("Delegates") || file.Contains("Events") { return "delegates-events" }
    if file.Contains("Const") { return "const" }
    if file.Contains("Embedding") { return "embedding" }
    if file.Contains("StaticFunc") { return "static-func" }
    if file.Contains("Result") || file.Contains("Combinator") { return "result" }
    if file.Contains("FieldDefaults") { return "field-defaults" }
    if file.Contains("Interop") || file.Contains("External") { return "interop" }
    if file.Contains("New") { return "allocation" }
    if file.Contains("TaskScope") || file.Contains("Concurrency") { return "concurrency" }
    if file.Contains("DataContract") { return "data" }
    return "core"
}

// True when an ordinary "..." string in `source` contains an interpolation hole
// (a `{` immediately followed by a letter).
//
// Scans once, tracking whether the cursor is inside a string literal. A backslash
// inside a string escapes the next character, so `\"` does not close the literal;
// without that, one escaped quote flips the in/out state for the rest of the scan.
func hasInterpolation(source: string) -> bool {
    // Backslash by char code, mirroring how this file emits braces by code.
    let backslash = Convert.ToChar(92)
    var inStr = false
    var i = 0
    while i < source.Length {
        let c = source[i]
        if inStr {
            if c == backslash {
                // Skip the escape and the character it protects.
                i += 2
                continue
            }
            if c == '"' {
                inStr = false
            } else if c == '{' && i + 1 < source.Length && char.IsLetter(source[i + 1]) {
                return true
            }
        } else if c == '"' {
            inStr = true
        }
        i += 1
    }
    return false
}

// Detects a pointer type `*T` in `source`: a '*' whose very next character is an
// uppercase letter. Walks adjacent character pairs and stops at the first hit.
func hasStarType(source: string) -> bool {
    var pos = 1
    while pos < source.Length {
        // Look at the pair (pos - 1, pos).
        if char.IsUpper(source[pos]) && source[pos - 1] == '*' { return true }
        pos += 1
    }
    return false
}

// Content-based bucket for examples the filename heuristic left in "core". Buckets are
// tried in precedence order (most distinctive feature first); within a bucket any one
// marker is enough. Falls through to "core" when nothing matches.
func topicOfContent(source: string) -> string {
    // async
    if source.Contains("task func") || source.Contains("await ") { return "async" }
    if source.Contains("async ") || source.Contains("Job<") { return "async" }
    if source.Contains("chan<") { return "async" }

    // inheritance
    if source.Contains("open ref data") || source.Contains("abstract ref data") { return "inheritance" }
    if source.Contains("virtual func") || source.Contains("abstract func") { return "inheritance" }
    if source.Contains(": base(") { return "inheritance" }

    // choice
    if source.Contains("choice ") || source.Contains("match ") { return "choice" }

    // pointers
    if hasStarType(source) || source.Contains("HeapPointer") { return "pointers" }
    if source.Contains("StackAlloc") || source.Contains("HeapAlloc") { return "pointers" }

    // enum
    if source.Contains("enum ") { return "enum" }

    // result
    if source.Contains("Result<") || source.Contains("ok(") { return "result" }
    if source.Contains("error(") { return "result" }

    // delegates-events
    if source.Contains("delegate func") || source.Contains("event ") { return "delegates-events" }
    if source.Contains("raise ") || source.Contains("&(") { return "delegates-events" }

    // generics
    if source.Contains("<T>") || source.Contains("<T,") || source.Contains("<T ") { return "generics" }
    if source.Contains("<TKey") || source.Contains("<TValue") { return "generics" }
    if source.Contains("<TResult") || source.Contains("<U>") { return "generics" }

    // interpolation
    if hasInterpolation(source) { return "interpolation" }

    // static-func
    if source.Contains("static func ") { return "static-func" }

    // interop
    if source.Contains("using \"System") || source.Contains("using \"Microsoft") { return "interop" }
    if source.Contains("StringBuilder") || source.Contains("Dictionary<") { return "interop" }

    // data
    if source.Contains("data ") { return "data" }

    return "core"
}

// Final topic for an example: the filename bucket when it is anything other than
// "core", otherwise whatever the content classifier decides.
func classifyTopic(file: string, source: string) -> string {
    let byName = topicOf(file)
    if byName != "core" {
        return byName
    }
    return topicOfContent(source)
}

// ---- Identity / banner / paths --------------------------------------------

// Drop a trailing ".cs" or ".es" (both three characters long); any other name is
// returned unchanged.
func stripExt(file: string) -> string {
    let hasKnownExt = file.EndsWith(".es") || file.EndsWith(".cs")
    if !hasKnownExt {
        return file
    }
    return file.Substring(0, file.Length - 3)
}

// Stable provenance id, computed from primitives only.
//   program file:  `<program>__<file-without-ext>`
//   anything else: `<file-without-ext>` plus `__<method>` when a method is given
// It stays a receiver-free function so it can be used on `let`-bound Fact locals (a
// promoted method on a `let`-local `ref data` receiver currently mis-emits — see
// tickets/compiler-gaps).
func computeId(program: string, file: string, method: string) -> string {
    let stem = stripExt(file)
    if program.Length > 0 {
        return program + "__" + stem
    }
    if method.Length == 0 {
        return stem
    }
    return stem + "__" + method
}

// Promoted convenience so callers can write `fact.idOf()`; forwards the fact's
// program/file/method to `computeId`. Safe on loop-variable / parameter receivers.
func idOf(fact: Fact) -> string {
    let id = computeId(fact.program, fact.file, fact.method)
    return id
}

// Path of the example's `.es` file relative to the corpus dir:
//   program  -> programs/<program>/<file>   (original file name kept)
//   authored -> authored/<id>.es
//   other    -> examples/<topic>/<id>.es
func esPathOf(fact: Fact) -> string {
    let kind = fact.kind
    if kind == "program" {
        return "programs/" + fact.program + "/" + fact.file
    }
    var folder = "examples/" + fact.topic + "/"
    if kind == "authored" {
        folder = "authored/"
    }
    return folder + fact.idOf() + ".es"
}

// Header banner prepended to every written `.es` file. Three comment lines followed by
// a blank line: the E# identity line (which also disambiguates .es from ECMAScript),
// a provenance/topic/status line, and a claim line chosen by `fact.kind`. Everything is
// joined with `+` so an `expected` value containing `{` never trips interpolation-hole
// lexing.
func banner(fact: Fact) -> string {
    // Provenance: the file, qualified by the test method when there is one.
    var origin = fact.file
    if fact.method.Length > 0 {
        origin = origin + "::" + fact.method
    }

    var state = "unverified"
    if fact.verified {
        state = "verified"
    }

    // Claim line; kinds not listed here keep the compile-only wording.
    var claimLine = "// compiles cleanly (no auto-run claim was extracted)"
    if fact.kind == "program" {
        claimLine = "// part of extract_corpus — the first real E# program (multi-file, dogfood)"
    } else if fact.kind == "authored" {
        claimLine = "// hand-authored, idiomatic E# — verified through the E# compiler"
    } else if fact.kind == "negative" {
        claimLine = "// verified behavior: reports diagnostic " + fact.diag
    } else if fact.kind == "runnable" {
        claimLine = "// verified behavior: Test." + fact.entry + "(...) == " + fact.expected
    }

    let identityLine = "// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).\n"
    let provenanceLine = "// provenance: " + origin + "   topic: " + fact.topic + "   status: " + state + "\n"
    return identityLine + provenanceLine + claimLine + "\n\n"
}

// Escape `s` so it can sit between the double quotes of a JSON string literal.
//
// Backslash is escaped first so the backslashes introduced by later replacements are
// not escaped again. Carriage returns are dropped and newlines/tabs get their short
// escapes. JSON also forbids every other raw control character inside a string, so any
// that remain are written as `\u00XX`; otherwise one stray control character in a
// source or expected value would make manifest.json / corpus.jsonl unparseable.
func jsonEscape(s: string) -> string {
    var r = s
    r = r.Replace("\\", "\\\\")
    r = r.Replace("\"", "\\\"")
    r = r.Replace("\r", "")
    r = r.Replace("\n", "\\n")
    r = r.Replace("\t", "\\t")

    // Second pass: escape whatever control characters are left.
    let sb = StringBuilder()
    var i = 0
    while i < r.Length {
        let c = r[i]
        if char.IsControl(c) {
            sb.Append("\\u")
            sb.Append(Convert.ToInt32(c).ToString("x4"))
        } else {
            sb.Append(c)
        }
        i += 1
    }
    return sb.ToString()
}

// Append `value` to `sb` as a JSON string literal: opening quote, escaped text, closing
// quote. Done in separate pieces because an interpolation hole holding a NESTED call
// (`{jsonEscape(f.idOf())}`, parens depth > 1) currently mis-parses.
func appendStr(sb: StringBuilder, value: string) {
    let escaped = jsonEscape(value)
    sb.Append("\"")
    sb.Append(escaped)
    sb.Append("\"")
}

// ---- Dedup ----------------------------------------------------------------

// Dedup content key. Deliberately conservative: only line endings are unified (CRLF and
// lone CR both become LF) and outer whitespace is trimmed. Comments are kept, so only
// genuinely identical programs collapse.
func normalizeBody(source: string) -> string {
    let unixEol = source.Replace("\r\n", "\n").Replace("\r", "\n")
    return unixEol.Trim()
}

// Canonical preference between a candidate and the current best of a dedup group.
// Returns true when the candidate should replace the current one:
//   1. a non-core topic beats core,
//   2. then verified beats unverified,
//   3. then the strictly shorter id wins (a tie keeps the current one).
// It takes primitives rather than two Facts because a promoted method with two
// `ref data` params currently mis-emits the receiver — see tickets/compiler-gaps.
func betterCanonical(candTopic: string, candVerified: bool, candIdLen: int, curTopic: string, curVerified: bool, curIdLen: int) -> bool {
    let candIsCore = candTopic == "core"
    let curIsCore = curTopic == "core"
    if candIsCore && !curIsCore { return false }
    if curIsCore && !candIsCore { return true }

    if candVerified && !curVerified { return true }
    if curVerified && !candVerified { return false }

    return candIdLen < curIdLen
}

// Collapse identical-source facts to one canonical each, preserving every absorbed
// id in `origins` and the group size in `duplicateCount`. No silent drops:
// Σ duplicateCount == input count, and every input id lands in some `origins`.
//
// Exactly one fact is emitted per group. Two facts in a group can share an id (the id
// carries only file and method), so pass 2 records which groups have already emitted
// their canonical. Without that guard both same-id facts would be emitted and the
// Σ duplicateCount invariant would break.
//
// Implementation note: every Fact member is touched only through a *loop variable*.
// All maps hold strings / List<string> / bool — never a Fact — because member access on
// a ref-data value pulled out of a generic collection currently mis-emits (the receiver
// is loaded by address). See tickets/compiler-gaps-corpus-extractor.md.
func dedup(facts: List<Fact>) -> List<Fact> {
    // Pass 1 — group by normalized source. Track all ids per key (the future
    // `origins`), and the chosen-canonical's identity as primitives.
    let idsByKey = Dictionary<string, List<string>>()    // key -> every id in the group
    let bestId = Dictionary<string, string>()            // key -> chosen canonical id
    let bestTopic = Dictionary<string, string>()
    let bestVer = Dictionary<string, bool>()
    let bestLen = Dictionary<string, int>()
    for fact in facts {
        let key = normalizeBody(fact.source)
        let id = computeId(fact.program, fact.file, fact.method)
        if !idsByKey.ContainsKey(key) {
            // First member of the group is the provisional canonical.
            idsByKey[key] = List<string>()
            bestId[key] = id
            bestTopic[key] = fact.topic
            bestVer[key] = fact.verified
            bestLen[key] = id.Length
        } else if betterCanonical(fact.topic, fact.verified, id.Length, bestTopic[key], bestVer[key], bestLen[key]) {
            bestId[key] = id
            bestTopic[key] = fact.topic
            bestVer[key] = fact.verified
            bestLen[key] = id.Length
        }
        idsByKey[key].Add(id)
    }

    // Pass 2 — emit the canonical fact of each group (the first one whose id is
    // `bestId`), stamping its provenance; mark the rest non-canonical. Both happen on
    // loop vars.
    let emitted = Dictionary<string, bool>()             // key -> canonical already emitted
    let result = List<Fact>()
    for fact in facts {
        let key = normalizeBody(fact.source)
        let id = computeId(fact.program, fact.file, fact.method)
        if id == bestId[key] && !emitted.ContainsKey(key) {
            emitted[key] = true
            fact.origins = idsByKey[key]
            fact.duplicateCount = idsByKey[key].Count
            fact.canonical = true
            result.Add(fact)
        } else {
            fact.canonical = false
        }
    }
    return result
}

// ---- Writers --------------------------------------------------------------

// Remove the generated example trees (examples/, authored/, programs/, featured/) under
// `corpusDir` so a re-run cannot leave stale files behind (a renamed topic, a removed
// test, a different dedup choice). Nothing else is deleted: manifest.json /
// corpus.jsonl / coverage.md are overwritten in place by their writers, and
// featured.txt (curation input) is left alone.
func cleanOutput(corpusDir: string) {
    let generated = List<string>()
    generated.Add("examples")
    generated.Add("authored")
    generated.Add("programs")
    generated.Add("featured")
    for name in generated {
        let target = System.IO.Path.Combine(corpusDir, name)
        if !System.IO.Directory.Exists(target) {
            continue
        }
        // `true` = recursive delete.
        System.IO.Directory.Delete(target, true)
    }
}

// Wipe the generated trees, then write each fact as banner + source to its `.es` path
// (examples/<topic>/, authored/, or programs/<p>/), creating parent directories first.
func writeCorpus(facts: List<Fact>, corpusDir: string) {
    cleanOutput(corpusDir)
    for fact in facts {
        let target = System.IO.Path.Combine(corpusDir, fact.esPathOf())
        System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(target))
        let text = fact.banner() + fact.source
        System.IO.File.WriteAllText(target, text)
    }
}

// Write manifest.json: a JSON array with one metadata object per example. The JSON is
// built by hand; source text is not included (esPath points at the `.es` file). Object
// braces are appended by char code (123 = '{', 125 = '}') because a literal `{` inside
// a string is mis-lexed as an interpolation hole.
func writeManifest(facts: List<Fact>, corpusDir: string) {
    let json = StringBuilder()
    json.Append("[\n")
    var needComma = false
    for fact in facts {
        // Separator goes before every entry except the first.
        if needComma {
            json.Append(",\n")
        }
        needComma = true

        var verifiedText = "false"
        if fact.verified {
            verifiedText = "true"
        }
        let copies = fact.duplicateCount

        json.Append("  ")
        json.Append(Convert.ToChar(123))
        json.Append("\"id\":")
        appendStr(json, fact.idOf())
        json.Append(",\"topic\":")
        appendStr(json, fact.topic)
        json.Append(",\"kind\":")
        appendStr(json, fact.kind)
        json.Append(",\"program\":")
        appendStr(json, fact.program)
        json.Append(",\"esPath\":")
        appendStr(json, fact.esPathOf())
        json.Append(",\"entry\":")
        appendStr(json, fact.entry)
        json.Append(",\"expected\":")
        appendStr(json, fact.expected)
        json.Append(",\"diag\":")
        appendStr(json, fact.diag)
        json.Append(",\"verified\":")
        json.Append(verifiedText)
        json.Append(",\"duplicateCount\":")
        json.Append(Convert.ToString(copies))

        // args: array of the literal argument texts.
        json.Append(",\"args\":[")
        var needArgComma = false
        for arg in fact.args {
            if needArgComma {
                json.Append(",")
            }
            needArgComma = true
            appendStr(json, arg)
        }
        json.Append("]")

        // origins: array of every id absorbed into this example.
        json.Append(",\"origins\":[")
        var needOriginComma = false
        for origin in fact.origins {
            if needOriginComma {
                json.Append(",")
            }
            needOriginComma = true
            appendStr(json, origin)
        }
        json.Append("]")
        json.Append(Convert.ToChar(125))
    }
    json.Append("\n]\n")
    let target = System.IO.Path.Combine(corpusDir, "manifest.json")
    System.IO.File.WriteAllText(target, json.ToString())
}

// Write corpus.jsonl: one JSON object per line carrying the example's metadata and its
// source INLINE. This is the HuggingFace-ready training payload; consumers load it
// directly. Braces are appended by char code (123 = '{', 125 = '}') to avoid the
// literal-`{` lexer gap.
func writeJsonl(facts: List<Fact>, corpusDir: string) {
    let lines = StringBuilder()
    for fact in facts {
        var verifiedText = "false"
        if fact.verified {
            verifiedText = "true"
        }
        let copies = fact.duplicateCount

        lines.Append(Convert.ToChar(123))
        lines.Append("\"id\":")
        appendStr(lines, fact.idOf())
        lines.Append(",\"topic\":")
        appendStr(lines, fact.topic)
        lines.Append(",\"kind\":")
        appendStr(lines, fact.kind)
        lines.Append(",\"program\":")
        appendStr(lines, fact.program)
        lines.Append(",\"entry\":")
        appendStr(lines, fact.entry)
        lines.Append(",\"expected\":")
        appendStr(lines, fact.expected)
        lines.Append(",\"diag\":")
        appendStr(lines, fact.diag)
        lines.Append(",\"verified\":")
        lines.Append(verifiedText)
        lines.Append(",\"duplicateCount\":")
        lines.Append(Convert.ToString(copies))
        lines.Append(",\"source\":")
        appendStr(lines, fact.source)
        lines.Append(Convert.ToChar(125))
        lines.Append("\n")
    }
    let target = System.IO.Path.Combine(corpusDir, "corpus.jsonl")
    System.IO.File.WriteAllText(target, lines.ToString())
}

// Write coverage.md: a markdown table of totals per kind bucket, the verified count,
// and the dedup summary. No silent drops: facts whose kind is none of the four named
// kinds land in the "unknown" row, and collapsed duplicates are counted.
//
// The counter names are referenced by the interpolation holes in the table rows below,
// so they must keep these exact names.
func writeCoverage(facts: List<Fact>, corpusDir: string, fileCount: int) {
    var groups = 0
    var redundant = 0
    var verified = 0
    var program = 0
    var authored = 0
    var unknown = 0
    var negative = 0
    var runnable = 0
    for fact in facts {
        let kind = fact.kind
        if kind == "program" {
            program += 1
        } else if kind == "authored" {
            authored += 1
        } else if kind == "negative" {
            negative += 1
        } else if kind == "runnable" {
            runnable += 1
        } else {
            unknown += 1
        }

        if fact.verified {
            verified += 1
        }

        // A group of size k > 1 absorbed k - 1 redundant examples.
        let absorbed = fact.duplicateCount - 1
        if absorbed > 0 {
            groups += 1
            redundant += absorbed
        }
    }

    let sb = StringBuilder()
    sb.Append("# E# corpus — extraction & verification coverage\n\n")
    sb.Append("Generated by `tools/extract_corpus` (written in E#).\n\n")
    sb.Append("| metric | count |\n")
    sb.Append("|---|---|\n")
    sb.Append("| host files scanned | {fileCount} |\n")
    sb.Append("| canonical examples | {facts.Count} |\n")
    sb.Append("| runnable (value claim) | {runnable} |\n")
    sb.Append("| negative (diagnostic) | {negative} |\n")
    sb.Append("| unknown (compile-only) | {unknown} |\n")
    sb.Append("| authored (artisanal) | {authored} |\n")
    sb.Append("| program (flagship) | {program} |\n")
    sb.Append("| re-verified through the E# compiler | {verified} |\n")
    sb.Append("| duplicate groups collapsed | {groups} |\n")
    sb.Append("| redundant files removed | {redundant} |\n")
    let target = System.IO.Path.Combine(corpusDir, "coverage.md")
    System.IO.File.WriteAllText(target, sb.ToString())
}

// Featured curation, applied after the corpus is written. Reads `featured.txt` from
// the corpus dir (one id per line; blank lines and lines starting with `#` are
// skipped) and writes banner + source of every fact whose id is listed into
// `featured/<id>.es`. Does nothing when the list file is missing or has no ids. The
// test files are never touched.
func applyFeatured(facts: List<Fact>, corpusDir: string) {
    let featuredList = System.IO.Path.Combine(corpusDir, "featured.txt")
    if !System.IO.File.Exists(featuredList) {
        return
    }

    let picks = List<string>()
    for raw in System.IO.File.ReadAllLines(featuredList) {
        let entry = raw.Trim()
        if entry.Length == 0 {
            continue
        }
        if entry.StartsWith("#") {
            continue
        }
        picks.Add(entry)
    }
    if picks.Count == 0 {
        return
    }

    let outDir = System.IO.Path.Combine(corpusDir, "featured")
    System.IO.Directory.CreateDirectory(outDir)
    for fact in facts {
        let id = fact.idOf()
        if !picks.Contains(id) {
            continue
        }
        let target = System.IO.Path.Combine(outDir, id + ".es")
        System.IO.File.WriteAllText(target, fact.banner() + fact.source)
    }
}

extract_corpus__extract

programs program

Part of a flagship E# program

// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: extract.es   topic: programs   status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// Stage 1 — Extract: walk the test corpus's C# host files via Roslyn and lift the
// embedded `.es` programs + their behavioral claims out of each `[Fact]`. This is
// the interop-aggressive half of the dogfood: E# consuming Roslyn's generic,
// type-pattern-heavy API. LINQ (`OfType<T>`) resolves from the implicit System.Linq
// import; the Roslyn syntax types come from the two imports below. The walk is
// deliberately string-centric (`.ToString()` on nodes) rather than type-pattern
// matching, so it leans on Roslyn's enumerables, not `is`-patterns over CLR types.
using "Microsoft.CodeAnalysis"
using "Microsoft.CodeAnalysis.CSharp"
using "Microsoft.CodeAnalysis.CSharp.Syntax"

// One corpus example: an `.es` program plus the behavioral claim made about it.
// Extracted test examples, hand-authored files and flagship-program files all use this
// record; `kind` is the classification bucket. The banner shows provenance as
// `file::method`, while the stable id built by `computeId` is
// `<file-without-ext>[__<method>]` (or `<program>__<file-without-ext>` for programs).
ref data Fact {
    var file: string = ""                // host C# file name, or the `.es` file name for authored/program facts
    var method: string = ""              // test method name; "" for authored/program facts
    var source: string = ""              // the E# program text
    var kind: string = "unknown"        // "runnable" | "negative" | "unknown" | "authored" | "program"
    var entry: string = ""               // E# function to invoke (runnable)
    var args: List<string> = List<string>()  // literal arg texts (runnable)
    var expected: string = ""            // literal expected text (runnable)
    var diag: string = ""                // expected diagnostic code (negative)
    var verified: bool = false           // re-verified through the E# compiler (Stage 2)
    var topic: string = "core"           // taxonomy bucket (file + content)
    var origins: List<string> = List<string>()  // ids of every fact whose normalized source is identical (set by dedup)
    var duplicateCount: int = 1          // how many facts carried this exact source (set by dedup)
    var canonical: bool = true           // false for absorbed duplicates (not written)
    var program: string = ""             // non-empty for multi-file flagship-program files (kind "program")
}

// True when any attribute found among the method declaration's descendant nodes is
// named exactly `Fact` or `Theory` (the xUnit test markers).
func isTestMethod(m: MethodDeclarationSyntax) -> bool {
    for attr in m.DescendantNodes().OfType<AttributeSyntax>() {
        let attrName = attr.Name.ToString()
        if attrName == "Theory" { return true }
        if attrName == "Fact" { return true }
    }
    return false
}

// Heuristic: a string literal counts as an embedded E# program when it contains at
// least one of the structural keywords `namespace `, `func `, `data ` or `choice `.
func looksLikeEsharp(s: string) -> bool {
    if s.Contains("namespace ") { return true }
    if s.Contains("func ") { return true }
    if s.Contains("data ") { return true }
    return s.Contains("choice ")
}

// A diagnostic-code literal such as "ES2151" or "ES3012": exactly six characters,
// the letters `E` and `S` followed by four digits.
func isDiagCode(s: string) -> bool {
    // Short-circuit keeps the index reads behind the length check.
    if s.Length != 6 || s[0] != 'E' || s[1] != 'S' { return false }
    var pos = 5
    while pos >= 2 {
        if !char.IsDigit(s[pos]) { return false }
        pos -= 1
    }
    return true
}

// Remove one layer of surrounding double quotes from an argument's source text. Text
// that is shorter than two characters, or not quoted at both ends, comes back as-is.
func unquote(s: string) -> string {
    let n = s.Length
    if n < 2 { return s }
    if s[0] != '"' || s[n - 1] != '"' { return s }
    return s.Substring(1, n - 2)
}

// Split a call's argument text at top-level commas (ignoring commas nested inside
// (), [], <>, {}, ordinary "..." strings, or """...""" raw strings). Works off the
// invocation's source text so it never touches Roslyn's `SeparatedSyntaxList<T>` struct
// (a known interop gap). Brace + raw-string awareness is what keeps object initializers
// like `new object?[] { "/users/{id}", x }` and multi-line raw-string args from
// mis-splitting at an interior comma.
//
// Inside an ordinary string a backslash escapes the next character, so `\"` does not
// end the literal. C# verbatim literals (`@"..."`, also `@$"..."`) are the exception:
// there a backslash is plain text and a quote is escaped by doubling, which the
// close-then-reopen toggle already handles.
//
// Returns the trimmed pieces in order; an empty `inner` yields an empty list.
// Known limit: `<` / `>` used as comparison operators or in `=>` still shift `depth`.
func splitTopLevel(inner: string) -> List<string> {
    let result = List<string>()
    // Backslash by char code, mirroring how this file emits braces by code.
    let backslash = Convert.ToChar(92)
    var depth = 0
    var inStr = false       // inside an ordinary "..." literal
    var verbatim = false    // the current ordinary literal is a C# verbatim string
    var inRaw = false       // inside a """...""" raw-string literal
    var start = 0
    var i = 0
    while i < inner.Length {
        let c = inner[i]
        if inRaw {
            // Only a closing triple-quote exits a raw string; everything else is inert.
            if c == '"' && i + 2 < inner.Length && inner[i + 1] == '"' && inner[i + 2] == '"' {
                inRaw = false
                i += 3
                continue
            }
        } else if inStr {
            if c == backslash && !verbatim {
                // Escape sequence: skip the backslash and the character it protects.
                i += 2
                continue
            }
            if c == '"' { inStr = false }
        } else if c == '"' && i + 2 < inner.Length && inner[i + 1] == '"' && inner[i + 2] == '"' {
            inRaw = true
            i += 3
            continue
        } else if c == '"' {
            inStr = true
            // Verbatim when the quote is prefixed by `@` or `@$`.
            verbatim = i > 0 && inner[i - 1] == '@'
            if i > 1 && inner[i - 1] == '$' && inner[i - 2] == '@' { verbatim = true }
        } else if c == '(' || c == '[' || c == '<' || c == '{' {
            depth += 1
        } else if c == ')' || c == ']' || c == '>' || c == '}' {
            depth -= 1
        } else if c == ',' && depth == 0 {
            result.Add(inner.Substring(start, i - start).Trim())
            start = i + 1
        }
        i += 1
    }
    // Whatever follows the last top-level comma is the final argument.
    if start < inner.Length {
        result.Add(inner.Substring(start, inner.Length - start).Trim())
    }
    return result
}

// An E# identifier: [A-Za-z_][A-Za-z0-9_]* (specification/lexical.md). Used to reject
// mis-parsed `entry` values (object initializers, `new`, fragments with spaces/braces)
// before a fact is allowed to claim a runnable behavior.
//
// Implemented as: every character is a letter, digit or underscore, and the first one
// is not a digit. Letter/digit tests go through `char.IsLetter` / `char.IsDigit`.
func isIdentifier(s: string) -> bool {
    if s.Length == 0 { return false }
    var pos = 0
    while pos < s.Length {
        let ch = s[pos]
        let wordChar = char.IsLetter(ch) || char.IsDigit(ch) || ch == '_'
        if !wordChar { return false }
        pos += 1
    }
    return !char.IsDigit(s[0])
}

// Text between the first `(` and the last `)` of a call expression's source, e.g.
// `Invoke(asm, "Test", "sumTo", 10)` -> `asm, "Test", "sumTo", 10`. Returns "" when
// there is no `(`, or no `)` after it.
func innerArgs(callText: string) -> string {
    let first = callText.IndexOf('(')
    if first < 0 { return "" }
    let last = callText.LastIndexOf(')')
    if last <= first { return "" }
    return callText.Substring(first + 1, last - first - 1)
}

// Source text of each argument of an invocation, obtained by re-parsing the
// invocation's own text: take what is inside the outer parentheses, then split it at
// top-level commas.
func argTexts(inv: InvocationExpressionSyntax) -> List<string> {
    let callText = inv.ToString()
    let inside = innerArgs(callText)
    return splitTopLevel(inside)
}

// Value text of the first literal in the test method that looks like an E# program
// (most tests carry exactly one); "" when the method has none.
func primarySource(m: MethodDeclarationSyntax) -> string {
    for literal in m.DescendantNodes().OfType<LiteralExpressionSyntax>() {
        let text = literal.Token.ValueText
        if !looksLikeEsharp(text) {
            continue
        }
        return text
    }
    return ""
}

// Value text of the first literal in the test method that has the shape of a
// diagnostic code (see `isDiagCode`); "" when the method asserts none.
func diagCode(m: MethodDeclarationSyntax) -> string {
    for literal in m.DescendantNodes().OfType<LiteralExpressionSyntax>() {
        let text = literal.Token.ValueText
        if !isDiagCode(text) {
            continue
        }
        return text
    }
    return ""
}

// Parse a runnable claim from an `Assert.Equal(expected, Invoke/Run(...))` invocation.
// Fills entry/args/expected on `fact`, sets its kind to "runnable" and returns true on
// the dominant shapes:
//   Invoke(asm, "Test", "method", args...)   — typeName + method, then args
//   Run(asm, "Test", "method", args...)
//   EsHarness.Run(src, "method", args...)     — source expr, method, then args
// Returns false (leaving `fact` untouched) when the assert does not have exactly two
// arguments or no inner call yields a valid method name.
//
// `fact.args` is reset before it is filled. The caller may invoke this once per
// Assert.Equal in a method; appending without a reset would glue the args of several
// asserts together while `entry` and `expected` describe only the last one.
func tryRunnable(assertInv: InvocationExpressionSyntax, fact: Fact) -> bool {
    let assertArgs = argTexts(assertInv)
    if assertArgs.Count != 2 { return false }

    // The inner Invoke/Run call lives among the assert's descendants.
    for inner in assertInv.DescendantNodes().OfType<InvocationExpressionSyntax>() {
        let callee = inner.Expression.ToString()
        if callee.EndsWith("Invoke") || callee.EndsWith("Run") {
            let ia = argTexts(inner)
            if ia.Count < 2 { continue }

            // Position of the method-name argument: index 2 in the typeName shape
            // (second arg is the literal "Test"), index 1 in the Run(src, "method", ...)
            // shape. The call's own arguments follow it.
            var entryAt = 1
            if ia.Count >= 3 && ia[1] == "\"Test\"" {
                entryAt = 2
            }
            let entry = unquote(ia[entryAt])

            // Guard: a mis-parsed entry (object initializer, `new`, a fragment with
            // braces/spaces) is not a valid method name. Skip it — the fact still ships
            // its source as compile-only ("unknown"), never a garbled runnable claim.
            if !isIdentifier(entry) { continue }

            fact.entry = entry
            fact.args.Clear()
            var i = entryAt + 1
            while i < ia.Count {
                fact.args.Add(ia[i])
                i += 1
            }
            fact.expected = assertArgs[0]
            fact.kind = "runnable"
            return true
        }
    }
    return false
}

// Parse one C# host file with Roslyn and turn every [Fact]/[Theory] method that embeds
// an E# program into a Fact record. Each fact starts as "unknown" (compile-only) and
// is upgraded to "runnable" when an Assert.Equal claim parses, or otherwise to
// "negative" when the method contains a diagnostic-code literal.
func extractFile(path: string) -> List<Fact> {
    let fileName = System.IO.Path.GetFileName(path)
    let root = CSharpSyntaxTree.ParseText(System.IO.File.ReadAllText(path)).GetRoot()
    let found = List<Fact>()

    for m in root.DescendantNodes().OfType<MethodDeclarationSyntax>() {
        if !isTestMethod(m) { continue }

        // No embedded E# program — a metadata/reflection test; nothing to extract.
        let src = primarySource(m)
        if src == "" { continue }

        let fact = Fact { file: fileName, method: m.Identifier.ValueText, source: src, topic: classifyTopic(fileName, src) }

        // Runnable: an Assert.Equal whose second argument runs the program. Every
        // Assert.Equal in the method is tried.
        var runnable = false
        for inv in m.DescendantNodes().OfType<InvocationExpressionSyntax>() {
            if inv.Expression.ToString() != "Assert.Equal" { continue }
            if tryRunnable(inv, fact) {
                runnable = true
            }
        }

        // Negative: asserts a diagnostic code instead of a value.
        if !runnable {
            let code = diagCode(m)
            if code != "" {
                fact.kind = "negative"
                fact.diag = code
            }
        }

        found.Add(fact)
    }
    return found
}

// Load every `*.es` file directly under `authoredDir` as an "authored" Fact — the
// curated, idiomatic tier. The topic comes from the normal classifier so these also
// surface under their feature's topic; they are written to `corpus/authored/`
// regardless (see esPathOf). A missing directory yields an empty list.
func ingestAuthored(authoredDir: string) -> List<Fact> {
    let loaded = List<Fact>()
    if System.IO.Directory.Exists(authoredDir) {
        for esFile in System.IO.Directory.GetFiles(authoredDir, "*.es") {
            let text = System.IO.File.ReadAllText(esFile)
            let baseName = System.IO.Path.GetFileName(esFile)
            loaded.Add(Fact { file: baseName, method: "", source: text, kind: "authored", topic: classifyTopic(baseName, text) })
        }
    }
    return loaded
}

// Load the listed `.es` files of a multi-file program from `srcDir`, one Fact per file
// with kind "program" and topic "programs" — the flagship "first real E# program".
// They are written to `corpus/programs/<program>/<file>`. Listed files that do not
// exist are skipped.
func ingestProgram(programName: string, srcDir: string, fileNames: List<string>) -> List<Fact> {
    let loaded = List<Fact>()
    for fileName in fileNames {
        let fullPath = System.IO.Path.Combine(srcDir, fileName)
        if System.IO.File.Exists(fullPath) {
            let text = System.IO.File.ReadAllText(fullPath)
            loaded.Add(Fact { file: fileName, method: "", source: text, kind: "program", program: programName, topic: "programs" })
        }
    }
    return loaded
}

extract_corpus__main

programs program

Part of a flagship E# program

// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: main.es   topic: programs   status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// extract_corpus — the E#-written tool that lifts the verified `.es` corpus out of the
// test suite (Stage 1, Roslyn) and independently re-verifies each example through the E#
// compiler (Stage 2), then writes corpus/examples + authored + the flagship program, plus
// manifest.json, corpus.jsonl (training payload), and coverage.md. Dogfood: the toolchain
// that ships E# runs on E#.
//
//   Stage 1 (extract.es)  C# host files -> per-[Fact] Fact records via Roslyn
//   Stage 2 (verify.es)   each example recompiled through the E# IL backend
//   dedup   (emit.es)     collapse identical sources, preserve provenance
//   output  (emit.es)     corpus/{examples,authored,programs} + manifest + jsonl + coverage
func main() {
    // Defaults assume the working directory is the esharp repo root. CI can replace
    // them with absolute paths, given positionally:
    //   extract_corpus <testDir> <corpusDir> <toolDir>
    var hostDir = "tests/Esharp.Tests"
    var corpusDir = "corpus"
    var toolRoot = "tools/extract_corpus"

    // Overrides are read from index 1 onward; each one is optional.
    let args = System.Environment.GetCommandLineArgs()
    if args.Length > 1 {
        hostDir = args[1]
    }
    if args.Length > 2 {
        corpusDir = args[2]
    }
    if args.Length > 3 {
        toolRoot = args[3]
    }

    // Stage 1 — pull every example out of the C# host files.
    let files = System.IO.Directory.GetFiles(hostDir, "*.cs")
    let extracted = List<Fact>()
    for hostFile in files {
        for found in extractFile(hostFile) {
            extracted.Add(found)
        }
    }
    Console.WriteLine("Stage 1: extracted {extracted.Count} examples from {files.Length} host files.")

    // Stage 2 — recompile each extracted example through the E# compiler.
    Console.WriteLine("Stage 2: re-verifying extracted examples (this recompiles every one)...")
    verifyAll(extracted)

    // Only the extracted bulk is deduplicated; authored and program facts are added
    // below exactly as ingested.
    let canon = dedup(extracted)
    Console.WriteLine("Dedup: {extracted.Count} -> {canon.Count} canonical.")

    // Hand-authored examples live in the tool's `authored` subdirectory.
    let authored = ingestAuthored(System.IO.Path.Combine(toolRoot, "authored"))
    verifyAll(authored)
    Console.WriteLine("Authored: {authored.Count} artisanal examples.")

    // Flagship program: this tool's own four source files.
    // NOTE(review): verifyAll checks each file on its own; confirm whether files that
    // call functions defined in sibling files can pass verification in isolation.
    let sources = List<string>()
    sources.Add("extract.es")
    sources.Add("emit.es")
    sources.Add("verify.es")
    sources.Add("main.es")
    let program = ingestProgram("extract_corpus", toolRoot, sources)
    verifyAll(program)
    Console.WriteLine("Program: {program.Count} flagship files.")

    // Concatenate the three groups in a fixed order: canonical, authored, program.
    let all = List<Fact>()
    for entry in canon {
        all.Add(entry)
    }
    for entry in authored {
        all.Add(entry)
    }
    for entry in program {
        all.Add(entry)
    }

    // Emit every output artifact, then apply featured curation last.
    writeCorpus(all, corpusDir)
    writeManifest(all, corpusDir)
    writeJsonl(all, corpusDir)
    writeCoverage(all, corpusDir, files.Length)
    applyFeatured(all, corpusDir)

    // Tally the canonical (extracted) facts by kind; anything that is neither
    // "negative" nor "runnable" is counted as unknown.
    var runnable = 0
    var negative = 0
    var unknown = 0
    for entry in canon {
        if entry.kind == "negative" {
            negative += 1
        } else if entry.kind == "runnable" {
            runnable += 1
        } else {
            unknown += 1
        }
    }

    // The verified count spans all three groups, not just the canonical ones.
    var verified = 0
    for entry in all {
        if entry.verified {
            verified += 1
        }
    }

    Console.WriteLine("=== extract_corpus complete ===")
    Console.WriteLine("  total:       {all.Count}")
    Console.WriteLine("  runnable {runnable}  negative {negative}  unknown {unknown}  authored {authored.Count}  program {program.Count}")
    Console.WriteLine("  re-verified: {verified}/{all.Count}")
    Console.WriteLine("  written to:  {corpusDir}")
}

extract_corpus__verify

programs program

Part of a flagship E# program

// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: verify.es   topic: programs   status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)

namespace ExtractCorpus

// Stage 2 — Re-verify: each extracted example is recompiled through the E# IL backend
// (the source-of-truth pipeline EsHarness uses) so every published example is provably
// green, decoupled from xUnit. This is the cleanest dogfood: E# invoking the E# compiler.
//
// Per-file `using`s scope these Esharp.* imports to this file only — extract.es imports
// Roslyn under the same namespace with no collision (per-file import scoping).
using "Esharp.Compiler.Parsing"
using "Esharp.Compiler.Binding"
using "Esharp.Compiler.Diagnostics"
using "Esharp.ILEmit"

// Compile check for a single example: returns true only if `source` gets through
// parse -> bind -> emit without any Error-severity diagnostic at any of the three
// stages. Diagnostics of other severities (e.g. warnings) never fail the check. The
// emitted assembly is written to a fixed file name in the system temp directory.
func verifyCompiles(source: string) -> bool {
    // Stage: parse. A warning such as a deprecation notice still compiles and runs,
    // so only hard errors disqualify the example here.
    let syntaxParser = Parser(source, "corpus.es")
    let compilationUnit = syntaxParser.ParseCompilationUnit()
    for diagnostic in syntaxParser.Diagnostics {
        if diagnostic.Severity == DiagnosticSeverity.Error {
            return false
        }
    }

    // Stage: bind, filtered the same way.
    let semanticBinder = Binder()
    let boundTree = semanticBinder.Bind(compilationUnit)
    for diagnostic in semanticBinder.Diagnostics {
        if diagnostic.Severity == DiagnosticSeverity.Error {
            return false
        }
    }

    // Stage: emit. The final argument is implicitUsings and is passed as true on
    // purpose: leaving a trailing optional off makes E# zero-fill it to
    // default(bool)=false instead of honoring C#'s default (true), which switches off
    // the implicit BCL-namespace search and leaves unqualified types like `List<int>()`
    // unresolved. See tickets/compiler-gaps-corpus-extractor.md.
    let outputPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "corpus_verify.dll")
    let emitResults = ILEmitter.EmitToFile(boundTree, "corpus_verify", outputPath, false, nil, true, true)
    for diagnostic in emitResults {
        if diagnostic.Severity == DiagnosticSeverity.Error {
            return false
        }
    }
    return true
}

// True when binding `source` reports the expected diagnostic code — a negative example
// is "verified" when it still produces the error its [Fact] asserted.
//
//   source  E# text of the negative example
//   code    diagnostic code searched for inside each binder diagnostic's message
//
// Returns false when `code` is nil or empty. `Contains("")` is true for every string,
// so without this guard any binder diagnostic at all would count as a match and the
// example would be marked verified without the asserted error ever being checked.
// Only binder diagnostics are searched; parser diagnostics are not consulted.
func verifyDiagnostic(source: string, code: string) -> bool {
    if System.String.IsNullOrEmpty(code) {
        return false
    }
    let parser = Parser(source, "corpus.es")
    let unit = parser.ParseCompilationUnit()
    let binder = Binder()
    // Bind is run for its diagnostics; the bound result itself is not needed.
    binder.Bind(unit)
    for d in binder.Diagnostics {
        if d.Message.Contains(code) {
            return true
        }
    }
    return false
}

// Re-run every fact through the compiler and record the outcome in its `verified`
// field. Facts of kind "negative" pass when their expected diagnostic (`fact.diag`)
// is still reported; every other kind passes when the source compiles clean.
func verifyAll(facts: List<Fact>) {
    for fact in facts {
        var passed = false
        try {
            if fact.kind == "negative" {
                passed = verifyDiagnostic(fact.source, fact.diag)
            } else {
                passed = verifyCompiles(fact.source)
            }
        } catch {
            // An exception thrown from inside the compiler (e.g. on a malformed
            // extraction) just leaves this fact unverified; the run carries on with
            // the next one.
            passed = false
        }
        fact.verified = passed
    }
}