extract_corpus
← all programs · 4 files · raw source ↓
Part of a flagship E# program
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: emit.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)
namespace ExtractCorpus
// Corpus output: write each example to a standalone `.es` file (by topic, or to authored/
// programs), plus a structured manifest, a JSONL training payload, and a coverage report.
// Featured curation is applied post-hoc from an external `featured.txt` — tests are never
// touched. A literal `{` anywhere in a string is mis-lexed as an interpolation-hole start
// (known compiler gap), so JSON braces are emitted as char codes (123='{', 125='}') and
// banners/ids are built with `+` rather than interpolation holes that contain `{`.
// ---- Topic classification -------------------------------------------------
// Taxonomy bucket from the host test file name (fast path).
func topicOf(file: string) -> string {
    // Maps a host test file name to its taxonomy bucket by substring match. Rules are
    // checked in order and the first hit wins; "core" is returned when nothing matches.
    //
    // "FunctionPointers" contains "Pointer", so it has to be tested BEFORE the generic
    // pointer rule — otherwise the "function-pointers" bucket can never be produced.
    if file.Contains("FunctionPointers") { return "function-pointers" }
    if file.Contains("HeapPointer") || file.Contains("Refs") || file.Contains("Pointer") {
        return "pointers"
    }
    if file.Contains("Inheritance") { return "inheritance" }
    if file.Contains("Async") { return "async" }
    if file.Contains("Delegates") || file.Contains("Events") { return "delegates-events" }
    if file.Contains("Const") { return "const" }
    if file.Contains("Embedding") { return "embedding" }
    if file.Contains("StaticFunc") { return "static-func" }
    if file.Contains("Result") || file.Contains("Combinator") { return "result" }
    if file.Contains("FieldDefaults") { return "field-defaults" }
    if file.Contains("Interop") || file.Contains("External") { return "interop" }
    if file.Contains("New") { return "allocation" }
    if file.Contains("TaskScope") || file.Contains("Concurrency") { return "concurrency" }
    if file.Contains("DataContract") { return "data" }
    return "core"
}
// True when an ordinary "..." string in `source` contains an interpolation hole ({letter).
func hasInterpolation(source: string) -> bool {
    // Returns true when an ordinary "..." string literal in `source` contains a '{'
    // immediately followed by a letter (the start of an interpolation hole).
    //
    // Two lexical details keep the in-string state honest:
    //   * inside a string, a backslash escapes the next character, so `\"` does not
    //     terminate the literal;
    //   * outside a string, the char literal '"' does not open one.
    // Special characters are built from their char codes (92 = backslash, 39 = apostrophe).
    let backslash = Convert.ToChar(92)
    let apostrophe = Convert.ToChar(39)
    var inStr = false
    var i = 0
    while i < source.Length {
        let c = source[i]
        if inStr {
            if c == backslash {
                // Skip the escape introducer and the character it escapes.
                i += 2
                continue
            }
            if c == '"' {
                inStr = false
            } else if c == '{' && i + 1 < source.Length && char.IsLetter(source[i + 1]) {
                return true
            }
        } else if c == apostrophe && i + 2 < source.Length && source[i + 1] == '"' && source[i + 2] == apostrophe {
            // A quote character written as a char literal: step over all three chars.
            i += 3
            continue
        } else if c == '"' {
            inStr = true
        }
        i += 1
    }
    return false
}
// True when `source` uses a pointer type `*T` ('*' directly followed by an uppercase letter).
func hasStarType(source: string) -> bool {
    // Walks every adjacent character pair and reports true at the first '*' that is
    // directly followed by an uppercase letter; false when no such pair exists.
    var pos = 1
    while pos < source.Length {
        let isStar = source[pos - 1] == '*'
        if isStar && char.IsUpper(source[pos]) {
            return true
        }
        pos += 1
    }
    return false
}
// Content-based bucket for examples the filename heuristic dropped into "core". Order is
// precedence: the most distinctive feature wins.
func topicOfContent(source: string) -> string {
    // Classifies an example by the text of its source. Each rule is evaluated only when
    // every earlier rule missed, so the order below is the precedence; "core" is the
    // fallback when no rule matches.
    let usesAsync = source.Contains("task func") || source.Contains("await ") || source.Contains("async ") || source.Contains("Job<") || source.Contains("chan<")
    if usesAsync { return "async" }

    let usesInheritance = source.Contains("open ref data") || source.Contains("abstract ref data") || source.Contains("virtual func") || source.Contains("abstract func") || source.Contains(": base(")
    if usesInheritance { return "inheritance" }

    let usesChoice = source.Contains("choice ") || source.Contains("match ")
    if usesChoice { return "choice" }

    let usesPointers = hasStarType(source) || source.Contains("HeapPointer") || source.Contains("StackAlloc") || source.Contains("HeapAlloc")
    if usesPointers { return "pointers" }

    if source.Contains("enum ") { return "enum" }

    let usesResult = source.Contains("Result<") || source.Contains("ok(") || source.Contains("error(")
    if usesResult { return "result" }

    let usesDelegates = source.Contains("delegate func") || source.Contains("event ") || source.Contains("raise ") || source.Contains("&(")
    if usesDelegates { return "delegates-events" }

    let usesGenerics = source.Contains("<T>") || source.Contains("<T,") || source.Contains("<T ") || source.Contains("<TKey") || source.Contains("<TValue") || source.Contains("<TResult") || source.Contains("<U>")
    if usesGenerics { return "generics" }

    if hasInterpolation(source) { return "interpolation" }

    if source.Contains("static func ") { return "static-func" }

    let usesInterop = source.Contains("using \"System") || source.Contains("using \"Microsoft") || source.Contains("StringBuilder") || source.Contains("Dictionary<")
    if usesInterop { return "interop" }

    if source.Contains("data ") { return "data" }

    return "core"
}
// Final topic: filename heuristic first, content classification for the "core" residue.
func classifyTopic(file: string, source: string) -> string {
    // The file-name bucket wins whenever it is anything other than "core"; only the
    // "core" residue is re-classified from the source text.
    let byName = topicOf(file)
    if byName != "core" {
        return byName
    }
    return topicOfContent(source)
}
// ---- Identity / banner / paths --------------------------------------------
// Strip a trailing ".cs" or ".es" extension.
func stripExt(file: string) -> string {
    // Drops a trailing ".cs" or ".es" (three characters); any other name is returned as-is.
    let hasKnownExt = file.EndsWith(".cs") || file.EndsWith(".es")
    if !hasKnownExt {
        return file
    }
    let stemLength = file.Length - 3
    return file.Substring(0, stemLength)
}
// Stable provenance id, as a free function over primitives. Program files:
// `<program>__<file-without-ext>`. Otherwise `<file-without-ext>[__<method>]`. Kept
// receiver-free so it can be called on `let`-bound Fact locals (a promoted method on a
// `let`-local `ref data` receiver currently mis-emits — see tickets/compiler-gaps).
func computeId(program: string, file: string, method: string) -> string {
    // Builds the provenance id from primitives:
    //   program set   -> "<program>__<stem>"
    //   method set    -> "<stem>__<method>"
    //   neither       -> "<stem>"
    // where <stem> is `file` without its .cs/.es extension.
    let stem = stripExt(file)
    if program.Length > 0 {
        return program + "__" + stem
    }
    if method.Length == 0 {
        return stem
    }
    return stem + "__" + method
}
// Promoted convenience: `fact.idOf()`. Safe on loop-variable / parameter receivers.
func idOf(fact: Fact) -> string {
    // Thin wrapper: feeds the fact's program / file / method to computeId.
    let id = computeId(fact.program, fact.file, fact.method)
    return id
}
// Relative path of the example's `.es` within the corpus dir.
func esPathOf(fact: Fact) -> string {
    // Corpus-relative location of the fact's .es file:
    //   kind "program"  -> programs/<program>/<file>
    //   kind "authored" -> authored/<id>.es
    //   anything else   -> examples/<topic>/<id>.es
    if fact.kind == "program" {
        return "programs/" + fact.program + "/" + fact.file
    }
    let leaf = fact.idOf() + ".es"
    if fact.kind == "authored" {
        return "authored/" + leaf
    }
    return "examples/" + fact.topic + "/" + leaf
}
// Per-`.es` header banner: an E#-identity line (doubles as the .es-vs-ECMAScript
// disambiguator), provenance, and the verified behavior. Built with `+` so an `expected`
// value containing `{` never trips interpolation-hole lexing.
func banner(fact: Fact) -> string {
    // Assembles the three-line header written above each example, followed by a blank
    // line: the E# identity line, a provenance/topic/status line, and a claim line chosen
    // by the fact's kind. Everything is joined with `+`.
    var prov = fact.file
    if fact.method.Length > 0 {
        prov = prov + "::" + fact.method
    }

    var status = "verified"
    if !fact.verified {
        status = "unverified"
    }

    var claim = ""
    if fact.kind == "runnable" {
        claim = "// verified behavior: Test." + fact.entry + "(...) == " + fact.expected
    } else if fact.kind == "negative" {
        claim = "// verified behavior: reports diagnostic " + fact.diag
    } else if fact.kind == "authored" {
        claim = "// hand-authored, idiomatic E# — verified through the E# compiler"
    } else if fact.kind == "program" {
        claim = "// part of extract_corpus — the first real E# program (multi-file, dogfood)"
    } else {
        // Any other kind makes no behavioral claim.
        claim = "// compiles cleanly (no auto-run claim was extracted)"
    }

    let identityLine = "// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).\n"
    let provenanceLine = "// provenance: " + prov + " topic: " + fact.topic + " status: " + status + "\n"
    return identityLine + provenanceLine + claim + "\n\n"
}
func jsonEscape(s: string) -> string {
    // Escapes `s` for use inside a JSON string literal (the surrounding quotes are the
    // caller's job). Backslash and double quote are escaped, carriage returns are dropped,
    // and newline / tab become \n and \t.
    //
    // JSON forbids raw control characters (U+0000..U+001F) inside strings, so any that
    // remain after the replacements above are rewritten as \u00XX escapes. Backslash is
    // handled first so the escapes added afterwards are not doubled.
    var r = s
    r = r.Replace("\\", "\\\\")
    r = r.Replace("\"", "\\\"")
    r = r.Replace("\r", "")
    r = r.Replace("\n", "\\n")
    r = r.Replace("\t", "\\t")
    // CR, LF and TAB are already gone at this point, so the loop only ever rewrites the
    // other control characters.
    var code = 0
    while code < 32 {
        let raw = Convert.ToString(Convert.ToChar(code))
        if r.Contains(raw) {
            r = r.Replace(raw, "\\u" + code.ToString("x4"))
        }
        code += 1
    }
    return r
}
// Append a JSON string literal (quotes + escaped value), piece-by-piece — a hole with a
// NESTED call (`{jsonEscape(f.idOf())}`, parens depth > 1) currently mis-parses.
func appendStr(sb: StringBuilder, value: string) {
    // Appends `value` to `sb` as a quoted JSON string. The escaped text is bound to a
    // local first so no call is nested inside another expression.
    let escaped = jsonEscape(value)
    sb.Append("\"" + escaped + "\"")
}
// ---- Dedup ----------------------------------------------------------------
// Conservative content key: normalize line endings + outer whitespace only (do not strip
// comments) so we only collapse genuinely identical programs.
func normalizeBody(source: string) -> string {
    // Dedup key: CRLF and lone CR both become LF, then leading/trailing whitespace is
    // trimmed. Nothing inside the body is otherwise altered.
    let unixLines = source.Replace("\r\n", "\n").Replace("\r", "\n")
    return unixLines.Trim()
}
// Canonical preference (inlined in dedup to avoid a two-`ref data`-param promoted
// method, which currently mis-emits the receiver — see tickets/compiler-gaps):
// a non-core topic beats core, then verified beats unverified, then the shorter id.
func betterCanonical(candTopic: string, candVerified: bool, candIdLen: int, curTopic: string, curVerified: bool, curIdLen: int) -> bool {
    // True when the candidate should replace the current canonical. Criteria, in order:
    // a non-"core" topic beats "core"; verified beats unverified; otherwise the strictly
    // shorter id wins (equal lengths keep the current one).
    let candIsCore = candTopic == "core"
    let curIsCore = curTopic == "core"
    if candIsCore && !curIsCore { return false }
    if !candIsCore && curIsCore { return true }
    if candVerified && !curVerified { return true }
    if !candVerified && curVerified { return false }
    return candIdLen < curIdLen
}
// Collapse identical-source facts to one canonical each, preserving every absorbed
// `file::method` in `origins` and the group size in `duplicateCount`. No silent drops:
// Σ duplicateCount == input count, and every input id lands in some `origins`.
//
// Implementation note: every Fact member is touched only through a *loop variable*.
// All maps hold strings / List<string> / bool — never a Fact — because member access on
// a ref-data value pulled out of a generic collection currently mis-emits (the receiver
// is loaded by address). See tickets/compiler-gaps-corpus-extractor.md.
func dedup(facts: List<Fact>) -> List<Fact> {
    // Collapses facts with identical normalized source into one canonical fact per group
    // and returns only the canonicals. Each canonical receives the group's ids in
    // `origins` and the group size in `duplicateCount`; every other fact is marked
    // `canonical = false`.
    //
    // Exactly ONE fact is emitted per group, even when two members of a group share the
    // same id (e.g. same-named test methods in one host file). Without that guard both
    // would be emitted and the sum of duplicateCount would exceed the input count.

    // Pass 1 — group by normalized source. Track all ids per key (the future
    // `origins`), and the chosen-canonical's identity as primitives.
    let idsByKey = Dictionary<string, List<string>>() // key -> every id in the group
    let bestId = Dictionary<string, string>() // key -> chosen canonical id
    let bestTopic = Dictionary<string, string>()
    let bestVer = Dictionary<string, bool>()
    let bestLen = Dictionary<string, int>()
    for fact in facts {
        let key = normalizeBody(fact.source)
        let id = computeId(fact.program, fact.file, fact.method)
        if !idsByKey.ContainsKey(key) {
            // First member of a new group becomes the provisional canonical.
            idsByKey[key] = List<string>()
            bestId[key] = id
            bestTopic[key] = fact.topic
            bestVer[key] = fact.verified
            bestLen[key] = id.Length
        } else if betterCanonical(fact.topic, fact.verified, id.Length, bestTopic[key], bestVer[key], bestLen[key]) {
            bestId[key] = id
            bestTopic[key] = fact.topic
            bestVer[key] = fact.verified
            bestLen[key] = id.Length
        }
        idsByKey[key].Add(id)
    }
    // Pass 2 — emit the canonical fact of each group (the first one whose id is
    // `bestId`), stamping its provenance; mark the rest non-canonical. Both happen on
    // loop vars. `emitted` records which groups already produced their canonical.
    let emitted = Dictionary<string, bool>()
    let result = List<Fact>()
    for fact in facts {
        let key = normalizeBody(fact.source)
        let id = computeId(fact.program, fact.file, fact.method)
        if id == bestId[key] && !emitted.ContainsKey(key) {
            emitted[key] = true
            fact.origins = idsByKey[key]
            fact.duplicateCount = idsByKey[key].Count
            fact.canonical = true
            result.Add(fact)
        } else {
            fact.canonical = false
        }
    }
    return result
}
// ---- Writers --------------------------------------------------------------
// Delete the generated example trees so a re-run never leaves stale files (a renamed
// topic, a removed test, a different dedup choice) behind in the published corpus.
// Only the generated subdirs are touched; manifest.json / corpus.jsonl / coverage.md are
// overwritten in place, and featured.txt (curation input) is left alone.
func cleanOutput(corpusDir: string) {
    // Recursively deletes the four generated subdirectories of `corpusDir` when they
    // exist. Files directly inside `corpusDir` are not touched here.
    let generated = List<string>()
    generated.Add("examples")
    generated.Add("authored")
    generated.Add("programs")
    generated.Add("featured")
    for name in generated {
        let target = System.IO.Path.Combine(corpusDir, name)
        if !System.IO.Directory.Exists(target) {
            continue
        }
        System.IO.Directory.Delete(target, true)
    }
}
// Write every example to its `.es` path (examples/<topic>/, authored/, or programs/<p>/).
func writeCorpus(facts: List<Fact>, corpusDir: string) {
    // Clears the generated trees, then writes each fact's banner + source to its
    // corpus-relative .es path, creating the parent directory first.
    cleanOutput(corpusDir)
    for fact in facts {
        let relative = fact.esPathOf()
        let target = System.IO.Path.Combine(corpusDir, relative)
        System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(target))
        let text = fact.banner() + fact.source
        System.IO.File.WriteAllText(target, text)
    }
}
// Write manifest.json — one entry per example, hand-rolled so source stays in the .es
// files (esPath points at them) and only metadata lands here.
func writeManifest(facts: List<Fact>, corpusDir: string) {
    // Serializes one metadata object per fact into a JSON array and writes it to
    // <corpusDir>/manifest.json. The source text itself is not included; `esPath` points
    // at the example file. Braces are appended as chars built from their codes
    // (123 / 125) so no string literal in this function contains a brace.
    let openBrace = Convert.ToChar(123)
    let closeBrace = Convert.ToChar(125)
    let json = StringBuilder()
    json.Append("[\n")
    var written = 0
    for fact in facts {
        // Entries are separated by ",\n"; nothing precedes the first one.
        if written > 0 {
            json.Append(",\n")
        }
        written += 1
        let dupes = fact.duplicateCount
        json.Append(" ")
        json.Append(openBrace)
        json.Append("\"id\":")
        appendStr(json, fact.idOf())
        json.Append(",\"topic\":")
        appendStr(json, fact.topic)
        json.Append(",\"kind\":")
        appendStr(json, fact.kind)
        json.Append(",\"program\":")
        appendStr(json, fact.program)
        json.Append(",\"esPath\":")
        appendStr(json, fact.esPathOf())
        json.Append(",\"entry\":")
        appendStr(json, fact.entry)
        json.Append(",\"expected\":")
        appendStr(json, fact.expected)
        json.Append(",\"diag\":")
        appendStr(json, fact.diag)
        json.Append(",\"verified\":")
        if fact.verified {
            json.Append("true")
        } else {
            json.Append("false")
        }
        json.Append(",\"duplicateCount\":")
        json.Append(Convert.ToString(dupes))
        json.Append(",\"args\":[")
        var argsWritten = 0
        for a in fact.args {
            if argsWritten > 0 {
                json.Append(",")
            }
            argsWritten += 1
            appendStr(json, a)
        }
        json.Append("],\"origins\":[")
        var originsWritten = 0
        for o in fact.origins {
            if originsWritten > 0 {
                json.Append(",")
            }
            originsWritten += 1
            appendStr(json, o)
        }
        json.Append("]")
        json.Append(closeBrace)
    }
    json.Append("\n]\n")
    let target = System.IO.Path.Combine(corpusDir, "manifest.json")
    System.IO.File.WriteAllText(target, json.ToString())
}
// Write corpus.jsonl — one JSON object per line with INLINE source + metadata. This is
// the HuggingFace-ready training payload; consumers load it directly.
func writeJsonl(facts: List<Fact>, corpusDir: string) {
    // Writes <corpusDir>/corpus.jsonl: one JSON object per line holding the fact's
    // metadata and its full source inline. Braces are appended as chars built from their
    // codes (123 / 125) so no string literal in this function contains a brace.
    let openBrace = Convert.ToChar(123)
    let closeBrace = Convert.ToChar(125)
    let lines = StringBuilder()
    for fact in facts {
        let dupes = fact.duplicateCount
        lines.Append(openBrace)
        lines.Append("\"id\":")
        appendStr(lines, fact.idOf())
        lines.Append(",\"topic\":")
        appendStr(lines, fact.topic)
        lines.Append(",\"kind\":")
        appendStr(lines, fact.kind)
        lines.Append(",\"program\":")
        appendStr(lines, fact.program)
        lines.Append(",\"entry\":")
        appendStr(lines, fact.entry)
        lines.Append(",\"expected\":")
        appendStr(lines, fact.expected)
        lines.Append(",\"diag\":")
        appendStr(lines, fact.diag)
        lines.Append(",\"verified\":")
        if fact.verified {
            lines.Append("true")
        } else {
            lines.Append("false")
        }
        lines.Append(",\"duplicateCount\":")
        lines.Append(Convert.ToString(dupes))
        lines.Append(",\"source\":")
        appendStr(lines, fact.source)
        lines.Append(closeBrace)
        lines.Append("\n")
    }
    let target = System.IO.Path.Combine(corpusDir, "corpus.jsonl")
    System.IO.File.WriteAllText(target, lines.ToString())
}
// Write coverage.md — totals per bucket, verification rate, and dedup summary. No silent
// drops: the unknown bucket and the collapsed-duplicate counts are reported.
func writeCoverage(facts: List<Fact>, corpusDir: string, fileCount: int) {
    // Tallies the facts by kind (anything that is not runnable / negative / authored /
    // program lands in `unknown`), counts verified facts and collapsed duplicates, and
    // writes the totals as a markdown table to <corpusDir>/coverage.md.
    // The counter names are referenced by the interpolation holes in the table rows.
    var verified = 0
    var groups = 0
    var redundant = 0
    var runnable = 0
    var negative = 0
    var authored = 0
    var program = 0
    var unknown = 0
    for fact in facts {
        if fact.verified {
            verified += 1
        }
        // A fact with duplicateCount > 1 stands for one collapsed group; the members
        // beyond the first are the redundant ones.
        let extra = fact.duplicateCount - 1
        if extra > 0 {
            groups += 1
            redundant += extra
        }
        if fact.kind == "runnable" {
            runnable += 1
        } else if fact.kind == "negative" {
            negative += 1
        } else if fact.kind == "authored" {
            authored += 1
        } else if fact.kind == "program" {
            program += 1
        } else {
            unknown += 1
        }
    }
    let report = StringBuilder()
    report.Append("# E# corpus — extraction & verification coverage\n\n")
    report.Append("Generated by `tools/extract_corpus` (written in E#).\n\n")
    report.Append("| metric | count |\n")
    report.Append("|---|---|\n")
    report.Append("| host files scanned | {fileCount} |\n")
    report.Append("| canonical examples | {facts.Count} |\n")
    report.Append("| runnable (value claim) | {runnable} |\n")
    report.Append("| negative (diagnostic) | {negative} |\n")
    report.Append("| unknown (compile-only) | {unknown} |\n")
    report.Append("| authored (artisanal) | {authored} |\n")
    report.Append("| program (flagship) | {program} |\n")
    report.Append("| re-verified through the E# compiler | {verified} |\n")
    report.Append("| duplicate groups collapsed | {groups} |\n")
    report.Append("| redundant files removed | {redundant} |\n")
    let target = System.IO.Path.Combine(corpusDir, "coverage.md")
    System.IO.File.WriteAllText(target, report.ToString())
}
// Featured curation, applied post-hoc: read `featured.txt` (one id per line) and copy the
// named examples into `featured/`. The test files are never touched.
func applyFeatured(facts: List<Fact>, corpusDir: string) {
    // Reads <corpusDir>/featured.txt (one id per line; blank lines and lines starting
    // with '#' are skipped) and writes banner + source of every fact whose id is listed
    // to <corpusDir>/featured/<id>.es. Does nothing when the list file is missing or
    // yields no ids.
    let listPath = System.IO.Path.Combine(corpusDir, "featured.txt")
    if !System.IO.File.Exists(listPath) {
        return
    }
    let picks = List<string>()
    for raw in System.IO.File.ReadAllLines(listPath) {
        let entry = raw.Trim()
        if entry.Length == 0 || entry.StartsWith("#") {
            continue
        }
        picks.Add(entry)
    }
    if picks.Count == 0 {
        return
    }
    let featuredDir = System.IO.Path.Combine(corpusDir, "featured")
    System.IO.Directory.CreateDirectory(featuredDir)
    for fact in facts {
        let id = fact.idOf()
        if !picks.Contains(id) {
            continue
        }
        let target = System.IO.Path.Combine(featuredDir, id + ".es")
        System.IO.File.WriteAllText(target, fact.banner() + fact.source)
    }
}
Part of a flagship E# program
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: extract.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)
namespace ExtractCorpus
// Stage 1 — Extract: walk the test corpus's C# host files via Roslyn and lift the
// embedded `.es` programs + their behavioral claims out of each `[Fact]`. This is
// the interop-aggressive half of the dogfood: E# consuming Roslyn's generic,
// type-pattern-heavy API. LINQ (`OfType<T>`) resolves from the implicit System.Linq
// import; the Roslyn syntax types come from the two imports below. The walk is
// deliberately string-centric (`.ToString()` on nodes) rather than type-pattern
// matching, so it leans on Roslyn's enumerables, not `is`-patterns over CLR types.
using "Microsoft.CodeAnalysis"
using "Microsoft.CodeAnalysis.CSharp"
using "Microsoft.CodeAnalysis.CSharp.Syntax"
// One extracted example: an `.es` program plus the behavioral claim its [Fact] made.
// The stable id is `<file-without-ext>__<method>` (see computeId); the banner's provenance
// line shows `file::method`. `kind` is the classification bucket.
ref data Fact {
// Record for one corpus example. Extraction fills file/method/source/topic and, when a
// claim is recognized, kind/entry/args/expected or diag; verification sets `verified`;
// dedup sets origins/duplicateCount/canonical.
var file: string = ""
var method: string = ""
var source: string = ""
var kind: string = "unknown" // "runnable" | "negative" | "unknown" | "authored" | "program"
var entry: string = "" // E# function to invoke (runnable)
var args: List<string> = List<string>() // literal arg texts (runnable)
var expected: string = "" // literal expected text (runnable)
var diag: string = "" // expected diagnostic code (negative)
var verified: bool = false // re-verified through the E# compiler (Stage 2)
var topic: string = "core" // taxonomy bucket (file + content)
var origins: List<string> = List<string>() // ids of every example whose source is identical (set by dedup)
var duplicateCount: int = 1 // how many test methods carried this exact source
var canonical: bool = true // false for absorbed duplicates (not written)
var program: string = "" // non-empty for multi-file flagship-program files (kind "program")
}
// True when a method declaration carries an xUnit [Fact] or [Theory] attribute.
func isTestMethod(m: MethodDeclarationSyntax) -> bool {
    // True as soon as any AttributeSyntax among the method's descendant nodes is named
    // exactly "Fact" or "Theory".
    for attr in m.DescendantNodes().OfType<AttributeSyntax>() {
        let attrName = attr.Name.ToString()
        if attrName != "Fact" && attrName != "Theory" {
            continue
        }
        return true
    }
    return false
}
// Heuristic: a string literal is an embedded E# program if it carries E# structure.
func looksLikeEsharp(s: string) -> bool {
    // A literal counts as E# source when it contains any of these structural keywords
    // (each followed by a space).
    if s.Contains("namespace ") { return true }
    if s.Contains("func ") { return true }
    if s.Contains("data ") { return true }
    return s.Contains("choice ")
}
// "ES2151" / "ES3012" etc. — a diagnostic-code literal (E + S + four digits).
func isDiagCode(s: string) -> bool {
    // Accepts exactly six characters: 'E', 'S', then four digits.
    if s.Length != 6 { return false }
    let prefixOk = s[0] == 'E' && s[1] == 'S'
    if !prefixOk { return false }
    // Check positions 5 down to 2; any non-digit rejects the string.
    var pos = 5
    while pos >= 2 {
        if !char.IsDigit(s[pos]) { return false }
        pos -= 1
    }
    return true
}
// Strip a single layer of surrounding double quotes from an argument's source text.
func unquote(s: string) -> string {
    // Removes one pair of enclosing double quotes; text that is shorter than two
    // characters or not quoted on both ends is returned unchanged.
    if s.Length < 2 { return s }
    let last = s.Length - 1
    if s[0] != '"' || s[last] != '"' { return s }
    return s.Substring(1, last - 1)
}
// Split a call's argument text at top-level commas (ignoring commas nested inside
// (), [], <>, {}, ordinary "..." strings, or """...""" raw strings). Works off the
// invocation's source text so it never touches Roslyn's `SeparatedSyntaxList<T>` struct
// (a known interop gap). Brace + raw-string awareness is what keeps object initializers
// like `new object?[] { "/users/{id}", x }` and multi-line raw-string args from
// mis-splitting at an interior comma.
func splitTopLevel(inner: string) -> List<string> {
    // Splits `inner` at commas that sit at bracket depth 0 and outside string literals,
    // returning the trimmed pieces. A trailing empty piece is not emitted.
    //
    // Lexical rules applied while scanning:
    //   * """...""" raw strings: inert until the closing triple quote;
    //   * ordinary "..." strings: a backslash escapes the next character, so `\"` does
    //     not end the string — except in verbatim strings (@"..." / $@"..."), where
    //     backslash is a plain character;
    //   * ( [ < { open a nesting level and ) ] > } close one, but the '>' of an arrow
    //     (`=>` or `->`) is not a bracket, and depth never drops below zero so a stray
    //     closer cannot hide later top-level commas.
    // '<' used as a comparison operator is still counted as a bracket (heuristic limit).
    let backslash = Convert.ToChar(92)
    let result = List<string>()
    var depth = 0
    var inStr = false // inside an ordinary "..." literal
    var verbatim = false // the current ordinary literal is @"..." (no backslash escapes)
    var inRaw = false // inside a """...""" raw-string literal
    var start = 0
    var i = 0
    while i < inner.Length {
        let c = inner[i]
        // Is this '>' the tail of `=>` / `->`?
        var arrow = false
        if c == '>' && i > 0 {
            arrow = inner[i - 1] == '=' || inner[i - 1] == '-'
        }
        if inRaw {
            // Only a closing triple-quote exits a raw string; everything else is inert.
            if c == '"' && i + 2 < inner.Length && inner[i + 1] == '"' && inner[i + 2] == '"' {
                inRaw = false
                i += 3
                continue
            }
        } else if inStr {
            if c == backslash && !verbatim {
                // Skip the escape introducer and the escaped character together.
                i += 2
                continue
            }
            if c == '"' { inStr = false }
        } else if c == '"' && i + 2 < inner.Length && inner[i + 1] == '"' && inner[i + 2] == '"' {
            inRaw = true
            i += 3
            continue
        } else if c == '"' {
            inStr = true
            verbatim = false
            if i > 0 && inner[i - 1] == '@' { verbatim = true }
            if i > 1 && inner[i - 1] == '$' && inner[i - 2] == '@' { verbatim = true }
        } else if c == '(' || c == '[' || c == '<' || c == '{' {
            depth += 1
        } else if c == ')' || c == ']' || c == '>' || c == '}' {
            if !arrow && depth > 0 {
                depth -= 1
            }
        } else if c == ',' && depth == 0 {
            result.Add(inner.Substring(start, i - start).Trim())
            start = i + 1
        }
        i += 1
    }
    if start < inner.Length {
        result.Add(inner.Substring(start, inner.Length - start).Trim())
    }
    return result
}
// An E# identifier: [A-Za-z_][A-Za-z0-9_]* (specification/lexical.md). Used to reject
// mis-parsed `entry` values (object initializers, `new`, fragments with spaces/braces)
// before a fact is allowed to claim a runnable behavior.
func isIdentifier(s: string) -> bool {
    // Non-empty; first character is a letter or '_'; every later character is a letter,
    // a digit, or '_'.
    if s.Length == 0 { return false }
    let head = s[0]
    let headOk = head == '_' || char.IsLetter(head)
    if !headOk { return false }
    var pos = s.Length - 1
    while pos >= 1 {
        let ch = s[pos]
        let bodyOk = ch == '_' || char.IsLetter(ch) || char.IsDigit(ch)
        if !bodyOk { return false }
        pos -= 1
    }
    return true
}
// The argument-list text inside the outermost parentheses of a call expression's
// source, e.g. `Invoke(asm, "Test", "sumTo", 10)` -> `asm, "Test", "sumTo", 10`.
func innerArgs(callText: string) -> string {
    // Text between the first '(' and the last ')' of `callText`; "" when there is no
    // '(' or the last ')' does not come after it.
    let first = callText.IndexOf('(')
    if first < 0 { return "" }
    let last = callText.LastIndexOf(')')
    if last <= first { return "" }
    let length = last - first - 1
    return callText.Substring(first + 1, length)
}
// The source-text of each argument of an invocation, parsed from its source text.
func argTexts(inv: InvocationExpressionSyntax) -> List<string> {
    // Renders the invocation to text, takes what lies inside its outermost parentheses,
    // and splits that at top-level commas.
    let callText = inv.ToString()
    let argumentText = innerArgs(callText)
    return splitTopLevel(argumentText)
}
// The first embedded E# program in a test method (most tests carry exactly one).
func primarySource(m: MethodDeclarationSyntax) -> string {
    // Value text of the first literal in the method that looks like E# source, or ""
    // when none does.
    for literal in m.DescendantNodes().OfType<LiteralExpressionSyntax>() {
        let text = literal.Token.ValueText
        if !looksLikeEsharp(text) {
            continue
        }
        return text
    }
    return ""
}
// The diagnostic code a negative test asserts, if any ("" otherwise).
func diagCode(m: MethodDeclarationSyntax) -> string {
    // Value text of the first literal in the method that is shaped like a diagnostic
    // code, or "" when there is none.
    for literal in m.DescendantNodes().OfType<LiteralExpressionSyntax>() {
        let text = literal.Token.ValueText
        if !isDiagCode(text) {
            continue
        }
        return text
    }
    return ""
}
// Parse a runnable claim from an `Assert.Equal(expected, Invoke/Run(...))` invocation.
// Fills entry/args/expected on `fact` and returns true on the dominant shapes:
// Invoke(asm, "Test", "method", args...) — typeName + method, then args
// Run(asm, "Test", "method", args...)
// EsHarness.Run(src, "method", args...) — source expr, method, then args
func tryRunnable(assertInv: InvocationExpressionSyntax, fact: Fact) -> bool {
    // Tries to read a runnable claim out of a two-argument assert invocation. On success
    // it sets entry / args / expected / kind on `fact` and returns true; otherwise `fact`
    // is left untouched and false is returned.
    //
    // Recognized inner call shapes (callee text ends with "Invoke" or "Run"):
    //   (x, "Test", "method", args...)  — second argument is the literal "Test"
    //   (src, "method", args...)        — otherwise
    //
    // `fact.args` is cleared before it is filled. A test method can contain several
    // matching asserts and this function is called once per assert on the same fact;
    // without the reset, args from earlier asserts would pile up next to the entry and
    // expected value of the last one.
    let assertArgs = argTexts(assertInv)
    if assertArgs.Count != 2 { return false }
    // The inner Invoke/Run call lives among the assert's descendants.
    for inner in assertInv.DescendantNodes().OfType<InvocationExpressionSyntax>() {
        let callee = inner.Expression.ToString()
        if !callee.EndsWith("Invoke") && !callee.EndsWith("Run") { continue }
        let ia = argTexts(inner)
        if ia.Count < 2 { continue }
        // Position of the method-name argument: 2 for the "Test" shape, else 1.
        var entryIndex = 1
        if ia.Count >= 3 && ia[1] == "\"Test\"" {
            entryIndex = 2
        }
        let entry = unquote(ia[entryIndex])
        // Guard: a mis-parsed entry (object initializer, `new`, a fragment with
        // braces/spaces) is not a valid method name. Skip it — the fact still ships
        // its source as compile-only ("unknown"), never a garbled runnable claim.
        if !isIdentifier(entry) { continue }
        fact.entry = entry
        fact.args.Clear()
        var i = entryIndex + 1
        while i < ia.Count {
            fact.args.Add(ia[i])
            i += 1
        }
        fact.expected = assertArgs[0]
        fact.kind = "runnable"
        return true
    }
    return false
}
// Extract every [Fact]/[Theory] in one C# host file into Fact records.
func extractFile(path: string) -> List<Fact> {
    // Parses one C# host file and returns a Fact for every test method that embeds an
    // E# program. A fact becomes "runnable" when any of its Assert.Equal calls parses as
    // a runnable claim, "negative" when instead a diagnostic-code literal is found, and
    // otherwise keeps its default kind.
    let found = List<Fact>()
    let text = System.IO.File.ReadAllText(path)
    let root = CSharpSyntaxTree.ParseText(text).GetRoot()
    let fileName = System.IO.Path.GetFileName(path)
    for m in root.DescendantNodes().OfType<MethodDeclarationSyntax>() {
        if !isTestMethod(m) { continue }
        let src = primarySource(m)
        // No embedded E# program — a metadata/reflection test; nothing to extract.
        if src == "" { continue }
        let fact = Fact { file: fileName, method: m.Identifier.ValueText, source: src, topic: classifyTopic(fileName, src) }
        // Runnable: an Assert.Equal whose second argument runs the program.
        var sawRunnable = false
        for inv in m.DescendantNodes().OfType<InvocationExpressionSyntax>() {
            if inv.Expression.ToString() != "Assert.Equal" { continue }
            if tryRunnable(inv, fact) {
                sawRunnable = true
            }
        }
        // Negative: asserts a diagnostic code instead of a value.
        if !sawRunnable {
            let code = diagCode(m)
            if code != "" {
                fact.kind = "negative"
                fact.diag = code
            }
        }
        found.Add(fact)
    }
    return found
}
// Read every hand-authored `.es` under `authoredDir` into "authored" Facts — the
// curated, idiomatic tier. Topic is content-classified so they also surface under their
// feature's topic; they are written to `corpus/authored/` regardless (see esPathOf).
func ingestAuthored(authoredDir: string) -> List<Fact> {
    // Turns every `*.es` file directly under `authoredDir` into a Fact of kind
    // "authored" (empty method, topic from classifyTopic). Returns an empty list when the
    // directory does not exist.
    //
    // Directory.GetFiles does not guarantee any ordering, so the paths are sorted
    // ordinally to make the resulting fact order reproducible across runs and platforms.
    let facts = List<Fact>()
    if !System.IO.Directory.Exists(authoredDir) {
        return facts
    }
    let paths = System.IO.Directory.GetFiles(authoredDir, "*.es")
    System.Array.Sort(paths, System.StringComparer.Ordinal)
    for path in paths {
        let src = System.IO.File.ReadAllText(path)
        let name = System.IO.Path.GetFileName(path)
        let fact = Fact { file: name, method: "", source: src, kind: "authored", topic: classifyTopic(name, src) }
        facts.Add(fact)
    }
    return facts
}
// Read a multi-file program's `.es` files (one Fact per file, kind "program") — the
// flagship "first real E# program". Written to `corpus/programs/<program>/<file>`.
func ingestProgram(programName: string, srcDir: string, fileNames: List<string>) -> List<Fact> {
    // One Fact of kind "program" (topic "programs") per listed file that exists under
    // `srcDir`, in list order. Listed files that are missing are skipped.
    let collected = List<Fact>()
    for name in fileNames {
        let full = System.IO.Path.Combine(srcDir, name)
        if System.IO.File.Exists(full) {
            let text = System.IO.File.ReadAllText(full)
            let fact = Fact { file: name, method: "", source: text, kind: "program", program: programName, topic: "programs" }
            collected.Add(fact)
        }
    }
    return collected
}
Part of a flagship E# program
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: main.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)
namespace ExtractCorpus
// extract_corpus — the E#-written tool that lifts the verified `.es` corpus out of the
// test suite (Stage 1, Roslyn) and independently re-verifies each example through the E#
// compiler (Stage 2), then writes corpus/examples + authored + the flagship program, plus
// manifest.json, corpus.jsonl (training payload), and coverage.md. Dogfood: the toolchain
// that ships E# runs on E#.
//
// Stage 1 (extract.es) C# host files -> per-[Fact] Fact records via Roslyn
// Stage 2 (verify.es) each example recompiled through the E# IL backend
// dedup (emit.es) collapse identical sources, preserve provenance
// output (emit.es) corpus/{examples,authored,programs} + manifest + jsonl + coverage
func main() {
    // Entry point: extract -> verify -> dedup -> ingest authored + program files ->
    // write corpus, manifest, jsonl, coverage, featured -> print a summary.
    //
    // Paths are relative to the esharp repo root (run the tool from there), or pass
    // absolute positional argv overrides for CI:
    // extract_corpus <testDir> <corpusDir> <toolDir>
    var testDir = "tests/Esharp.Tests"
    var corpusDir = "corpus"
    var toolDir = "tools/extract_corpus"
    // argv[0] is the executable itself, so overrides start at index 1.
    let argv = System.Environment.GetCommandLineArgs()
    if argv.Length > 1 { testDir = argv[1] }
    if argv.Length > 2 { corpusDir = argv[2] }
    if argv.Length > 3 { toolDir = argv[3] }
    // Stage 1 — extract from C# host files.
    // Directory.GetFiles gives no ordering guarantee. dedup keeps the FIRST fact on a
    // tie and the manifest follows input order, so the host files are sorted ordinally
    // to keep the published corpus identical across runs and platforms.
    let files = System.IO.Directory.GetFiles(testDir, "*.cs")
    System.Array.Sort(files, System.StringComparer.Ordinal)
    let extracted = List<Fact>()
    for f in files {
        for fact in extractFile(f) {
            extracted.Add(fact)
        }
    }
    Console.WriteLine("Stage 1: extracted {extracted.Count} examples from {files.Length} host files.")
    // Stage 2 — re-verify each through the E# compiler.
    Console.WriteLine("Stage 2: re-verifying extracted examples (this recompiles every one)...")
    verifyAll(extracted)
    // Dedup the extracted bulk (authored + program are curated/unique, kept as-is).
    let canon = dedup(extracted)
    Console.WriteLine("Dedup: {extracted.Count} -> {canon.Count} canonical.")
    // Artisanal hand-authored examples.
    let authoredDir = System.IO.Path.Combine(toolDir, "authored")
    let authored = ingestAuthored(authoredDir)
    verifyAll(authored)
    Console.WriteLine("Authored: {authored.Count} artisanal examples.")
    // Flagship program: extract_corpus's own source — the first real E# program.
    let programFiles = List<string>()
    programFiles.Add("extract.es")
    programFiles.Add("emit.es")
    programFiles.Add("verify.es")
    programFiles.Add("main.es")
    let program = ingestProgram("extract_corpus", toolDir, programFiles)
    verifyAll(program)
    Console.WriteLine("Program: {program.Count} flagship files.")
    // Combine and write: canonical extracted facts first, then authored, then program.
    let all = List<Fact>()
    for fact in canon {
        all.Add(fact)
    }
    for fact in authored {
        all.Add(fact)
    }
    for fact in program {
        all.Add(fact)
    }
    writeCorpus(all, corpusDir)
    writeManifest(all, corpusDir)
    writeJsonl(all, corpusDir)
    writeCoverage(all, corpusDir, files.Length)
    applyFeatured(all, corpusDir)
    // Summary: kind counts cover the canonical extracted facts only; the verified count
    // covers everything that was written.
    var runnable = 0
    var negative = 0
    var unknown = 0
    for fact in canon {
        if fact.kind == "runnable" {
            runnable += 1
        } else if fact.kind == "negative" {
            negative += 1
        } else {
            unknown += 1
        }
    }
    var verified = 0
    for fact in all {
        if fact.verified {
            verified += 1
        }
    }
    Console.WriteLine("=== extract_corpus complete ===")
    Console.WriteLine(" total: {all.Count}")
    Console.WriteLine(" runnable {runnable} negative {negative} unknown {unknown} authored {authored.Count} program {program.Count}")
    Console.WriteLine(" re-verified: {verified}/{all.Count}")
    Console.WriteLine(" written to: {corpusDir}")
}
Part of a flagship E# program
// E# — a verified example from the E# language corpus (CLR language; .es, not ECMAScript).
// provenance: verify.es topic: programs status: unverified
// part of extract_corpus — the first real E# program (multi-file, dogfood)
namespace ExtractCorpus
// Stage 2 — Re-verify: each extracted example is recompiled through the E# IL backend
// (the source-of-truth pipeline EsHarness uses) so every published example is provably
// green, decoupled from xUnit. This is the cleanest dogfood: E# invoking the E# compiler.
//
// Per-file `using`s scope these Esharp.* imports to this file only — extract.es imports
// Roslyn under the same namespace with no collision (per-file import scoping).
using "Esharp.Compiler.Parsing"
using "Esharp.Compiler.Binding"
using "Esharp.Compiler.Diagnostics"
using "Esharp.ILEmit"
// True when `source` parses, binds, and emits verifiable IL with zero errors — the same
// parse -> bind -> emit(verify) pipeline the test harness runs. A corpus example that
// passes here is provably compilable independent of the test suite.
func verifyCompiles(source: string) -> bool {
    // Runs `source` through parse -> bind -> emit and returns true only when no stage
    // reports a diagnostic of severity Error. Warnings do not disqualify an example.
    // The assembly emitted to the temp directory is removed afterwards (best effort) so
    // a run does not leave `corpus_verify.dll` behind.
    let parser = Parser(source, "corpus.es")
    let unit = parser.ParseCompilationUnit()
    // Only hard parse ERRORS disqualify an example — a warning (e.g. a deprecation
    // notice) still compiles and runs. Mirror the binder/emit error filtering below.
    for d in parser.Diagnostics {
        if d.Severity == DiagnosticSeverity.Error {
            return false
        }
    }
    let binder = Binder()
    let bound = binder.Bind(unit)
    for d in binder.Diagnostics {
        if d.Severity == DiagnosticSeverity.Error {
            return false
        }
    }
    let tmp = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "corpus_verify.dll")
    // Pass implicitUsings=true explicitly (final arg). Omitting this trailing optional
    // makes E# zero-fill it to default(bool)=false rather than honoring C#'s default
    // (true), which disables the implicit BCL-namespace search and leaves unqualified
    // types like `List<int>()` unresolved. See tickets/compiler-gaps-corpus-extractor.md.
    let emitDiags = ILEmitter.EmitToFile(bound, "corpus_verify", tmp, false, nil, true, true)
    var ok = true
    for d in emitDiags {
        if d.Severity == DiagnosticSeverity.Error {
            ok = false
        }
    }
    // Cleanup must never change the verdict: if the file cannot be deleted (for example
    // because it is still in use), the result computed above is returned as-is.
    try {
        if System.IO.File.Exists(tmp) {
            System.IO.File.Delete(tmp)
        }
    } catch {
        return ok
    }
    return ok
}
// True when binding `source` reports the expected diagnostic code — a negative example
// is "verified" when it still produces the error its [Fact] asserted.
func verifyDiagnostic(source: string, code: string) -> bool {
    // True when compiling `source` produces a diagnostic whose message contains `code`.
    // Parser diagnostics are checked as well as binder diagnostics, so a negative
    // example whose expected error is raised during parsing can still be verified.
    // NOTE(review): assumes parser diagnostics expose `Message` like binder diagnostics
    // do (both already expose `Severity`) — confirm against the compiler API.
    //
    // An empty `code` never verifies: every message "contains" the empty string, which
    // would otherwise mark any example with at least one diagnostic as verified.
    if code.Length == 0 {
        return false
    }
    let parser = Parser(source, "corpus.es")
    let unit = parser.ParseCompilationUnit()
    for d in parser.Diagnostics {
        if d.Message.Contains(code) {
            return true
        }
    }
    let binder = Binder()
    binder.Bind(unit)
    for d in binder.Diagnostics {
        if d.Message.Contains(code) {
            return true
        }
    }
    return false
}
// Set `verified` on each fact by re-running it through the compiler: runnable/unknown
// examples must compile clean; negative examples must still surface their diagnostic.
func verifyAll(facts: List<Fact>) {
    // Sets `verified` on every fact: "negative" facts are checked for their diagnostic,
    // all other kinds must compile cleanly. Any exception raised while checking a fact
    // marks that fact unverified and the loop carries on with the next one.
    for fact in facts {
        try {
            if fact.kind != "negative" {
                fact.verified = verifyCompiles(fact.source)
            } else {
                fact.verified = verifyDiagnostic(fact.source, fact.diag)
            }
        } catch {
            // A malformed extraction that throws inside the compiler is simply
            // unverified — never abort the whole run.
            fact.verified = false
        }
    }
}