diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index 1356472..0000000 --- a/Cargo.lock +++ /dev/null @@ -1,285 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 4 - -[[package]] -name = "adler2" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" - -[[package]] -name = "bitflags" -version = "2.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" - -[[package]] -name = "cfg-if" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" - -[[package]] -name = "crc32fast" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "equivalent" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - -[[package]] -name = "fast-strip-ansi" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3086ffd0a7160f58f988c74173a002e255da505a114e2f5425acb1eaab2b8ac" -dependencies = [ - "vt-push-parser", -] - -[[package]] -name = "flate2" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - -[[package]] -name = "hashbrown" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" - -[[package]] -name = "hex" -version = 
"0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "indexmap" -version = "2.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" -dependencies = [ - "equivalent", - "hashbrown", -] - -[[package]] -name = "inshellah" -version = "0.1.0" -dependencies = [ - "fast-strip-ansi", - "flate2", - "libc", - "nom", - "parking_lot", - "serde_json", -] - -[[package]] -name = "itoa" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" - -[[package]] -name = "libc" -version = "0.2.186" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" - -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - -[[package]] -name = "memchr" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" - -[[package]] -name = "miniz_oxide" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" -dependencies = [ - "adler2", - "simd-adler32", -] - -[[package]] -name = "nom" -version = "8.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" -dependencies = [ - "memchr", -] - -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - -[[package]] -name = "proc-macro2" -version = "1.0.106" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.149" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" -dependencies = [ - "indexmap", - "itoa", - "memchr", - "serde", - "serde_core", - "zmij", -] - -[[package]] -name = "simd-adler32" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" - -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - -[[package]] -name = "syn" -version = "2.0.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "unicode-ident" -version = "1.0.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" - -[[package]] -name = "vt-push-parser" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdbf39d53c5a50cad8119d9cde929ecd208764e8d8d1626486b8929cbcd5f0e7" -dependencies = [ - "hex", - "smallvec", -] - -[[package]] -name = "windows-link" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" - -[[package]] -name = "zmij" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml deleted file mode 100644 
index 1319992..0000000 --- a/Cargo.toml +++ /dev/null @@ -1,12 +0,0 @@ -[package] -name = "inshellah" -version = "0.1.1" -edition = "2024" - -[dependencies] -fast-strip-ansi = "0.13.1" -flate2 = "1.1.9" -libc = "0.2.186" -nom = "8.0.0" -parking_lot = "0.12.5" -serde_json = { version = "1.0.149", features = ["preserve_order"] } diff --git a/README.md b/README.md index 70983bd..3d3e66d 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,7 @@ completer. see `doc/` for details: -- [building and installing](doc/building.md) — cargo, nix, post-install setup -- [nushell integration](doc/nushell-integration.md) — setup, the pipeline, the completer -- [nixos module](doc/nixos.md) — automatic build-time indexing + module options +- [building and installing](doc/building.md) — compilation, arch/debian/fedora, opam, nix +- [nushell integration](doc/nushell-integration.md) — setup, usage, examples +- [nixos module](doc/nixos.md) — automatic build-time indexing - [runtime completions](doc/runtime-completions.md) — on-the-fly caching via the completer -- [benchmarks](doc/benchmarks.md) — wall-time and indexed-count numbers diff --git a/bin/.ocamlformat b/bin/.ocamlformat new file mode 100644 index 0000000..e69de29 diff --git a/bin/dune b/bin/dune new file mode 100644 index 0000000..4bb8309 --- /dev/null +++ b/bin/dune @@ -0,0 +1,4 @@ +(executable + (public_name inshellah) + (name main) + (libraries inshellah)) diff --git a/bin/main.ml b/bin/main.ml new file mode 100644 index 0000000..b72a456 --- /dev/null +++ b/bin/main.ml @@ -0,0 +1,1403 @@ +(* main.ml — cli entry point for inshellah, a nushell completions engine. + * + * inshellah generates nushell "extern" definitions for external commands by + * parsing their manpages and --help output. it has two main modes: + * + * 1. indexing (batch): scan a prefix directory's bin/ and share/man/, + * extract completions for every binary, and write them to a cache dir. + * this is typically run once per nix profile or system update. 
+ * + * 2. completing (interactive): given a command and its current arguments, + * look up the cached data and return JSON completion candidates for + * nushell's custom completer protocol. + * + * the indexing pipeline for each binary: + * a. classify the binary (skip? try --help? try native completions?) + * b. if the tool has native nushell completion support, run --help and + * discover subcommands containing "complet", then try them with "nushell" + * c. otherwise, run the tool with --help/-h and parse the output + * d. recursively resolve subcommands (depth-limited to 5) + * e. after binaries, parse manpages for any commands not yet covered + * + * parallelism: indexing forks per binary, and subcommand resolution forks + * per subcommand. results are marshaled back via pipes. this gives good + * throughput on multi-core systems while keeping the code simple (no threads, + * no async runtime — just unix fork/pipe/waitpid). + *) + +open Inshellah.Parser +open Inshellah.Manpage +open Inshellah.Nushell +open Inshellah.Store + +module SSet = Set.Make(String) + +(* print usage and exit. called when no valid subcommand is given. *) +let usage () = + Printf.eprintf + {|inshellah - nushell completions engine + +Usage: + inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] + Index completions into a directory of JSON/nu files. + PREFIX is a directory containing bin/ and share/man/. + Default dir: $XDG_CACHE_HOME/inshellah + --ignore FILE skip listed commands entirely + --help-only FILE skip manpages for listed commands, use --help instead + inshellah complete CMD [ARGS...] [--dir PATH[:PATH...]] + Nushell custom completer. Outputs JSON completion candidates. + Falls back to --help resolution if command is not indexed. + --dir takes colon-separated paths. The first path is the writable + user cache; additional paths are read-only system directories. + Manpages are found via sibling share/man of system dir paths. 
+ inshellah query CMD [--dir PATH[:PATH...]] + Print stored completion data for CMD. + inshellah dump [--dir PATH[:PATH...]] + List indexed commands. + inshellah manpage FILE Parse a manpage and emit nushell extern + inshellah manpage-dir DIR Batch-process manpages under DIR + inshellah completions Generate nushell completions for inshellah + +|}; + exit 1 + +(* manpage sections that contain command documentation. + * section 1 = user commands, section 8 = system administration commands. *) +let command_sections = [1; 8] + +(* simple substring search using Str *) +let contains_str haystack needle = + try ignore (Str.search_forward (Str.regexp_string needle) haystack 0); true + with Not_found -> false + +(* heuristic to detect whether text is valid nushell source code. + * checks for common nushell declaration keywords. the length > 20 + * check avoids false positives on short error messages. *) +let is_nushell_source text = + String.length text > 20 + && (contains_str text "export extern" + || contains_str text "export def" + || (contains_str text "module " && contains_str text "export")) + +(* extract command name from a manpage filename. + * "ls.1.gz" -> strip .gz -> "ls.1" -> chop extension -> "ls" *) +let cmd_name_of_manpage path = + let base = Filename.basename path in + let base = + if Filename.check_suffix base ".gz" then Filename.chop_suffix base ".gz" + else base in + try Filename.chop_extension base with Invalid_argument _ -> base + +(* sanitized environment for child processes. + * strips display-related variables (DISPLAY, WAYLAND_DISPLAY, etc.) to prevent + * gui tools from trying to open windows when we run them with --help. + * without this, some tools would pop up dialogs or hang waiting for a + * display connection. 
*) +let safe_env = lazy ( + Array.of_list ( + List.filter (fun var -> + not (String.starts_with ~prefix:"DISPLAY=" var + || String.starts_with ~prefix:"WAYLAND_DISPLAY=" var + || String.starts_with ~prefix:"DBUS_SESSION_BUS_ADDRESS=" var + || String.starts_with ~prefix:"XAUTHORITY=" var)) + (Array.to_list (Unix.environment ())))) + +(* non-blocking drain of a pipe fd into a buffer. safe to call repeatedly; + * reads whatever is available without blocking. used by all fork-pipe sites + * to keep pipes drained so children never block on write. *) +let drain_fd rd buf = + let chunk = Bytes.create 8192 in + let continue = ref true in + while !continue do + match Unix.select [rd] [] [] 0.0 with + | (_ :: _, _, _) -> + (try + let bytes_read = Unix.read rd chunk 0 8192 in + if bytes_read = 0 then continue := false + else Buffer.add_subbytes buf chunk 0 bytes_read + with Unix.Unix_error _ -> continue := false) + | _ -> continue := false + done + +(* run a command with a timeout, capturing its stdout+stderr. + * forks a child process, redirects stdin from /dev/null, and merges + * stdout+stderr onto a pipe. reads from the pipe with select() polling + * until either the child exits or the deadline is reached. + * + * the child is run in /tmp to prevent tools that create side-effect files + * from polluting the user's working directory. we chdir to /tmp before + * fork and restore after. + * + * the select timeout is capped at 0.05s per iteration to ensure we check + * the deadline frequently even when no data is available. + * + * returns none if the process couldn't be started, produced no output, + * or was killed due to timeout. 
*) +let run_cmd args timeout_ms = + (* an empty argv has no program to exec; bail out before any fd is opened + * or the cwd is changed (List.hd would otherwise raise Failure mid-setup). *) + if args = [] then None else + let (rd, wr) = Unix.pipe () in + let devnull = Unix.openfile "/dev/null" [Unix.O_RDONLY] 0 in + let argv = Array.of_list args in + (* run subprocesses in /tmp so commands that write side-effect files + * don't pollute the working directory *) + let saved_cwd = Sys.getcwd () in + Sys.chdir "/tmp"; + let pid = + (* on spawn failure only yield -1 here. wr, devnull and rd are each closed + * exactly once below; closing them in this handler as well made the + * later Unix.close calls raise EBADF instead of returning None. *) + try Unix.create_process_env (List.hd args) argv + (Lazy.force safe_env) devnull wr wr + with Unix.Unix_error _ -> -1 in + Sys.chdir saved_cwd; + Unix.close wr; Unix.close devnull; + if pid < 0 then (Unix.close rd; None) + else begin + let buf = Buffer.create 4096 in + let deadline = Unix.gettimeofday () +. (float_of_int timeout_ms /. 1000.0) in + let chunk = Bytes.create 8192 in + (* alive stays true until the deadline expires; a read of 0 bytes (eof) + * leaves the loop via Exit with alive still true *) + let alive = ref true in + (try while !alive do + let remaining = deadline -. Unix.gettimeofday () in + if remaining <= 0.0 then alive := false + else match Unix.select [rd] [] [] (min remaining 0.05) with + | (_ :: _, _, _) -> + let bytes_read = Unix.read rd chunk 0 8192 in + if bytes_read = 0 then raise Exit + else Buffer.add_subbytes buf chunk 0 bytes_read + | _ -> () + done with Exit -> ()); + Unix.close rd; + (* deadline hit: kill the child before reaping it *) + if not !alive then begin + (try Unix.kill pid Sys.sigkill with Unix.Unix_error _ -> ()); + ignore (Unix.waitpid [] pid) + end else + ignore (Unix.waitpid [] pid); + if Buffer.length buf > 0 then Some (Buffer.contents buf) else None + end + +(* check if a path is a regular file with at least one execute bit set *) +let is_executable path = + try let st = Unix.stat path in + st.st_kind = Unix.S_REG && st.st_perm land 0o111 <> 0 + with Unix.Unix_error _ -> false + +(* check if a file is a script by looking for a #! shebang. + * follows symlinks via realpath before reading. *) +let is_script path = + try + let real = Unix.realpath path in + let ic = open_in_bin real in + let has_shebang = + try let b = Bytes.create 2 in + really_input ic b 0 2; + Bytes.get b 0 = '#' && Bytes.get b 1 = '!' 
+ with End_of_file -> false in + close_in ic; + has_shebang + with _ -> false + +(* scan an elf binary for string needles without loading the entire file. + * reads the file in 64kb chunks, searching each chunk for the needle strings. + * uses a sliding window (carry) of max_needle bytes between chunks to handle + * needles that span chunk boundaries. + * + * on read failure (e.g. if the path resolves to something unreadable), all + * needles are marked as found. this is a conservative fallback — we'd rather + * try --help on an unreadable binary than skip it. + * + * the inner loop is a manual byte-by-byte comparison rather than using + * String.contains or Str for performance — this runs on every binary + * in the prefix, so it needs to be fast. *) +let elf_scan path needles = + let found = Hashtbl.create 4 in + let remaining () = List.filter (fun needle -> not (Hashtbl.mem found needle)) needles in + (try + let real = Unix.realpath path in + let ic = open_in_bin real in + let magic = Bytes.create 4 in + really_input ic magic 0 4; + if Bytes.get magic 0 = '\x7f' && Bytes.get magic 1 = 'E' + && Bytes.get magic 2 = 'L' && Bytes.get magic 3 = 'F' then begin + let max_needle = List.fold_left (fun m needle -> max m (String.length needle)) 0 needles in + let chunk_size = 65536 in + let buf = Bytes.create (chunk_size + max_needle) in + let carry = ref 0 in + let eof = ref false in + while not !eof && remaining () <> [] do + let bytes_read = (try input ic buf !carry chunk_size with End_of_file -> 0) in + if bytes_read = 0 then eof := true + else begin + let total = !carry + bytes_read in + List.iter (fun needle -> + if not (Hashtbl.mem found needle) then begin + let nlen = String.length needle in + let pos = ref 0 in + while !pos <= total - nlen do + if Bytes.get buf !pos = needle.[0] then begin + let matched = ref true in + for j = 1 to nlen - 1 do + if Bytes.get buf (!pos + j) <> needle.[j] then matched := false + done; + if !matched then (Hashtbl.replace found needle 
true; pos := total) + else incr pos + end else incr pos + done + end + ) (remaining ()); + let new_carry = min max_needle total in + Bytes.blit buf (total - new_carry) buf 0 new_carry; + carry := new_carry + end + done + end; + close_in ic + with _ -> + List.iter (fun needle -> Hashtbl.replace found needle true) needles); + found + +(* detect nix-generated c wrapper binaries and extract the real binary path. + * nix's makeCWrapper creates small c programs that set up the environment + * and exec the real binary. these wrappers won't contain "-h" or "complet" + * in their own binary (they're just wrappers), so elf_scan would say "skip". + * this function reads the wrapper file itself to find the actual /nix/store/.../bin/... + * target path, so we can try --help on the real binary instead. + * + * caps the read at 64kb to avoid accidentally reading a large non-wrapper + * binary into memory. returns None when the file is larger than the cap, + * does not mention makeCWrapper, yields no existing target path, or any + * step raises. *) +let nix_wrapper_target path = + try + let real = Unix.realpath path in + let ic = open_in_bin real in + (* Fun.protect closes the channel on every path, including when + * in_channel_length or the read raises; such a failure used to leak it. *) + let contents = + Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () -> + let size = in_channel_length ic in + if size > 65536 then None + else Some (really_input_string ic size)) in + begin match contents with + | None -> None + | Some contents -> + if not (contains_str contents "makeCWrapper") then None + else + let re = Str.regexp "/nix/store/[a-z0-9]+-[^' \n\r\x00]+/bin/[a-zA-Z0-9._-]+" in + try ignore (Str.search_forward re contents 0); + let target = Str.matched_string contents in + if Sys.file_exists target then Some target else None + with Not_found -> None + end + with _ -> None + +(* detect nix bash/sh wrapper scripts that exec a real binary. + * nix sometimes generates small shell scripts (e.g. to set env vars like + * XDG_CONFIG_HOME) that exec the real binary. these look like: + * #!/nix/store/.../bash -e + * export FOO=... + * exec -a "$0" "/nix/store/.../bin/.foo-wrapped" "$@" + * we extract the exec target path and resolve through it. 
*) +let nix_script_wrapper_target path = + try + let real = Unix.realpath path in + let ic = open_in real in + (* read at most 4096 bytes' worth of script. Fun.protect closes the channel + * on every path, including when in_channel_length or the read raises; + * such a failure used to leak it. *) + let contents = + Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () -> + let size = in_channel_length ic in + if size > 4096 then None + else Some (really_input_string ic size)) in + begin match contents with + | None -> None + | Some contents -> + if not (contains_str contents "exec") then None + else + let re = Str.regexp "exec[ \t]+\\(-a[ \t]+\"\\$0\"[ \t]+\\)?\"?\\(/nix/store/[a-z0-9]+-[^\" \t\n]+/bin/[a-zA-Z0-9._-]+\\)\"?" in + try ignore (Str.search_forward re contents 0); + (* group 2 is the /nix/store/.../bin/... path; group 1 is the optional -a "$0" *) + let target = Str.matched_group 2 contents in + let target = Unix.realpath target in + if Sys.file_exists target then Some target else None + with Not_found -> None + end + with _ -> None + +(* heuristic filter for binary names that should never be indexed. + * skips: empty names, "-", dotfiles, libraries (lib-prefix), daemon wrappers + * (suffixes -daemon, -wrapped), shared objects (.so suffix), and names with no + * alphanumeric characters (e.g. punctuation-only names). *) +let skip_name name = + String.length name = 0 || name = "-" || name.[0] = '.' + || String.starts_with ~prefix:"lib" name + || String.ends_with ~suffix:"-daemon" name + || String.ends_with ~suffix:"-wrapped" name + || String.ends_with ~suffix:".so" name + || not (String.exists (fun c -> (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) name) + +(* classification result for a binary. + * Skip — don't index this binary at all + * Try_help — only try --help (scripts, binaries without "completion" string) + * Try_native_and_help — try native nushell completion first, fall back to --help *) +type bin_class = Skip | Try_help | Try_native_and_help + +(* classify an elf binary path for indexing. 
*) +let classify_elf path = + let scan = elf_scan path ["-h"; "complet"] in + if Hashtbl.mem scan "complet" then Try_native_and_help + else if Hashtbl.mem scan "-h" then Try_help + else Skip + +(* classify a binary to decide the indexing strategy. + * decision tree: + * 1. nushell builtin or bad name -> Skip + * 2. not executable -> Skip + * 3. script (has shebang) -> resolve through nix script wrapper if possible, + * otherwise Try_help + * 4. elf binary containing "complet" -> Try_native_and_help + * 5. elf binary containing "-h" -> Try_help + * 6. nix c wrapper -> Try_help (the wrapper itself is just an exec shim) + * 7. otherwise -> Skip (binary has no help infrastructure) *) +let classify_binary bindir name = + if is_nushell_builtin name || skip_name name then Skip + else + let path = Filename.concat bindir name in + if not (is_executable path) then Skip + else if is_script path then + match nix_script_wrapper_target path with + | Some target -> + let cls = classify_elf target in + if cls <> Skip then cls else Try_help + | None -> Try_help + else + let cls = classify_elf path in + if cls <> Skip then cls + else if nix_wrapper_target path <> None then Try_help + else Skip + +(* detect available cpu cores by counting "processor" lines in /proc/cpuinfo. + * falls back to 4 if /proc/cpuinfo can't be read (e.g. on non-linux). *) +let num_cores () = + try + let ic = open_in "/proc/cpuinfo" in + let count = ref 0 in + (try while true do + if String.starts_with ~prefix:"processor" (input_line ic) then incr count + done with End_of_file -> ()); + close_in ic; max 1 !count + with _ -> 4 + +(* extract words from text that contain any of the given substrings. + * words are sequences of [a-zA-Z0-9_-] optionally prefixed with --. + * returns a deduplicated list. 
*) +let extract_matching_words text needles = + let len = String.length text in + (* uses the file-level SSet (Set.Make(String)); a local re-instantiation + * only shadowed it *) + let words = ref SSet.empty in + let i = ref 0 in + while !i < len do + (* skip forward to the next possible word start: a letter or '-' *) + while !i < len && not (text.[!i] >= 'a' && text.[!i] <= 'z' + || text.[!i] >= 'A' && text.[!i] <= 'Z' + || text.[!i] = '-') do + incr i + done; + let start = !i in + (* consume the word body: letters, digits, '-' and '_' *) + while !i < len && (text.[!i] >= 'a' && text.[!i] <= 'z' + || text.[!i] >= 'A' && text.[!i] <= 'Z' + || text.[!i] >= '0' && text.[!i] <= '9' + || text.[!i] = '-' || text.[!i] = '_') do + incr i + done; + if !i > start then begin + let word = String.sub text start (!i - start) in + (* needles are matched against the lowercased word, but the word is + * stored with its original casing *) + let lower = String.lowercase_ascii word in + if List.exists (fun needle -> contains_str lower needle) needles then + words := SSet.add word !words + end + done; + SSet.elements !words + +(* try to get native nushell completions from a binary. + * runs --help, scans the output for words containing completion-related + * substrings ("complet"), then tries each match as a subcommand or flag + * with "nushell" as the argument. + * + * this catches arbitrary patterns (completions, generate-completions, + * shell-completion, gen-completions, etc.) without maintaining a hardcoded + * list. the worst case is a few failed attempts before falling back to + * manpage/--help parsing. 
*) +let try_native_completion bin_path = + let help_text = match run_cmd [bin_path; "--help"] 500 with + | Some t -> t | None -> "" in + if help_text = "" then None + else + let candidates = extract_matching_words help_text ["complet"] in + List.find_map (fun word -> + let attempts = + if String.starts_with ~prefix:"--" word then + [[bin_path; word; "nushell"]] + else + [[bin_path; word; "nushell"]; + [bin_path; "--" ^ word; "nushell"]] + in + List.find_map (fun args -> + match run_cmd args 500 with + | Some text when is_nushell_source text -> Some text + | _ -> None + ) attempts + ) candidates + +(* parse a manpage file, extracting the command name, its flags/subcommands, + * and any clap-style per-subcommand sections. + * returns none for nushell builtins or failed parses. *) +let parse_manpage_for_command file = + let contents = read_manpage_file file in + let fallback = cmd_name_of_manpage file in + (* the filename encodes the command boundary: "git-stash" = 2 words. + * use this to clamp the synopsis-extracted name, which can be too greedy + * when the synopsis lists subcommand variants. 
*) + let max_words = List.length (String.split_on_char '-' fallback) in + let clamp_cmd name = + let words = String.split_on_char ' ' name in + if List.length words > max_words then + String.concat " " (List.filteri (fun i _ -> i < max_words) words) + else name in + let cmd = match extract_synopsis_command contents with + | Some name -> clamp_cmd name | None -> fallback in + if is_nushell_builtin cmd then None + else + let result = parse_manpage_string contents in + let sub_sections = extract_subcommand_sections contents in + let result = if sub_sections <> [] then + { result with subcommands = List.map (fun (name, desc, _) -> + { name; desc }) sub_sections } + else result in + let subs = List.map (fun (name, _desc, r) -> + (cmd ^ " " ^ name, r)) sub_sections in + Some (cmd, result, subs) + +(* "inshellah manpage FILE" — parse one manpage and print the nushell extern *) +let cmd_manpage file = + match parse_manpage_for_command file with + | Some (cmd, result, _) when result.entries <> [] -> + print_string (generate_extern cmd result) + | _ -> () + +(* "inshellah manpage-dir DIR" — batch-process all manpages under a directory *) +let cmd_manpage_dir dir = + List.iter (fun section -> + let subdir = Filename.concat dir (Printf.sprintf "man%d" section) in + if is_dir subdir then + Array.iter (fun file -> + (try cmd_manpage (Filename.concat subdir file) with _ -> ()) + ) (Sys.readdir subdir) + ) command_sections + +(* detect rendered manpage output — when --help delegates to man(1), the + * output starts with a header line like "GIT-STASH(1) ... GIT-STASH(1)". + * we check if the first non-blank line matches that pattern. 
*) +let is_rendered_manpage text = + let lines = String.split_on_char '\n' text in + let first_line = List.find_opt (fun l -> String.trim l <> "") lines in + match first_line with + | None -> false + | Some line -> + let trimmed = String.trim line in + (* look for WORD(DIGIT) at the start of the line *) + try + let paren = String.index trimmed '(' in + paren > 0 + && paren + 2 < String.length trimmed + && trimmed.[paren + 1] >= '0' && trimmed.[paren + 1] <= '9' + && trimmed.[paren + 2] = ')' + with Not_found -> false + +(* find the raw manpage file for a hyphenated command name like "git-stash". + * first checks the provided man directories directly, then falls back to + * man -w for on-the-fly resolution when no man dirs are known. *) +let find_manpage_path mandirs hyphenated_name = + let try_dirs () = + List.find_map (fun mandir -> + List.find_map (fun section -> + let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in + List.find_map (fun ext -> + let path = Filename.concat subdir + (Printf.sprintf "%s.%d%s" hyphenated_name section ext) in + if Sys.file_exists path then Some path else None + ) [""; ".gz"] + ) command_sections + ) mandirs in + match try_dirs () with + | Some _ as found -> found + | None -> + (* fallback to man -w when no man dirs provided or file not found *) + match run_cmd ["man"; "-w"; hyphenated_name] 200 with + | Some raw -> + let path = String.trim raw in + if Sys.file_exists path then Some path else None + | None -> None + +(* when --help output is a rendered manpage, find and parse the raw manpage + * source instead. returns the main result plus any sub-section results + * (e.g. "git stash push" flags parsed from the git-stash manpage). 
*) +let try_manpage_fallback mandirs cmd_name = + match find_manpage_path mandirs cmd_name with + | None -> None + | Some path -> + match parse_manpage_for_command path with + | None -> None + | Some (_, result, subs) when result.entries = [] && subs = [] -> None + | Some (_, result, subs) -> Some (result, subs) + +(* safety limit: don't accumulate more than 500 subcommand resolution results + * per binary. prevents runaway recursion on tools with enormous subcommand trees. *) +let max_resolve_results = 500 + +(* safe wrapper around parse_manpage_for_command that catches all exceptions *) +let process_manpage file = + try + match parse_manpage_for_command file with + | Some (cmd, result, subs) when result.entries <> [] || subs <> [] -> + Some (cmd, result, subs) + | _ -> None + with _ -> None + +(* collect the set of command names that have manpages in a given man directory. + * used during indexing to skip --help for commands that will be handled by + * the manpage parsing phase instead (manpages are more reliable than --help). *) +let manpaged_commands mandir = + List.fold_left (fun acc section -> + let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in + if is_dir subdir then + Array.fold_left (fun acc f -> SSet.add (cmd_name_of_manpage f) acc) + acc (Sys.readdir subdir) + else acc + ) SSet.empty command_sections + +(* parallel structured help resolver — recursively resolves a command and + * all its subcommands by running --help on each, forking a child process + * per subcommand for parallelism. + * + * the resolver works as a breadth-first queue: + * 1. start with the root command in the queue + * 2. fork a child for each queued item (up to num_cores concurrent) + * 3. the child runs --help, parses the output, marshals the result via pipe + * 4. the parent collects results and enqueues discovered subcommands + * 5. 
repeat until queue is empty and all children have finished + * + * depth is limited to 5 levels and total results to max_resolve_results + * to prevent runaway recursion on pathological command trees. + * + * the child process detects "self-listing" — when a subcommand's --help + * lists itself as a subcommand (e.g. "git help" listing "help" as a + * subcommand of itself). this would cause infinite recursion, so such + * results are discarded. + * + * children close all pipe fds from other pending children immediately + * after fork to prevent fd leaks. the parent drains pipes regularly to + * prevent children from blocking on full pipe buffers. *) +let help_resolve_par ?(timeout=200) ?(mandirs=[]) cmd rest name = + let max_jobs = num_cores () in + let queue = Queue.create () in + Queue.push (rest, name, 0) queue; + let results = ref [] in + (* pending: (pid, rd, buf, cmd_args, cmd_name, depth) *) + let pending = ref [] in + let collect rd buf cmd_args cmd_name depth = + drain_fd rd buf; + (try Unix.close rd with _ -> ()); + let data = Buffer.contents buf in + let result : (help_result * subcommand list * (string * help_result) list) option = + if String.length data > 0 then + try Marshal.from_string data 0 with _ -> None + else None in + match result with + | None -> () + | Some (r, subs, extras) -> + let at_limit = depth >= 5 || List.length !results >= max_resolve_results in + results := (cmd_name, r) :: !results; + (* extras are fully-parsed sub-results from manpage sub-sections — + * add them directly without enqueueing for further resolution *) + List.iter (fun (sub_name, sub_r) -> + if not (List.exists (fun (existing, _) -> existing = sub_name) !results) then + results := (sub_name, sub_r) :: !results + ) extras; + if not at_limit then + (* only enqueue subcommands that weren't already covered by extras *) + let extra_names = List.map fst extras in + List.iter (fun (sc : subcommand) -> + let full = cmd_name ^ " " ^ sc.name in + if not (List.exists (fun 
existing -> existing = full) extra_names) then + Queue.push (cmd_args @ [sc.name], full, depth + 1) queue + ) subs in + let reap () = + pending := List.filter (fun (pid, rd, buf, cmd_args, cmd_name, depth) -> + drain_fd rd buf; + match Unix.waitpid [Unix.WNOHANG] pid with + | (0, _) -> true + | _ -> collect rd buf cmd_args cmd_name depth; false + | exception Unix.Unix_error (Unix.ECHILD, _, _) -> + (try Unix.close rd with _ -> ()); false + ) !pending in + let wait_for_slot () = + while List.length !pending >= max_jobs do + reap (); + if List.length !pending >= max_jobs then begin + let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in + ignore (Unix.select fds [] [] 0.05) + end + done in + while not (Queue.is_empty queue) || !pending <> [] do + while not (Queue.is_empty queue) do + let (cmd_args, cmd_name, depth) = Queue.pop queue in + wait_for_slot (); + let (rd, wr) = Unix.pipe () in + let pid = Unix.fork () in + if pid = 0 then begin + Unix.close rd; + List.iter (fun (_, prd, _, _, _, _) -> + try Unix.close prd with _ -> ()) !pending; + let result = + let text = match run_cmd (cmd :: cmd_args @ ["--help"]) timeout with + | Some _ as r -> r + | None -> run_cmd (cmd :: cmd_args @ ["-h"]) timeout in + match text with + | None -> None + | Some text -> + (* check for rendered manpage first — when --help delegates to + * man(1), the raw groff source has richer structure than the + * rendered text. parse_help would partially succeed on rendered + * manpage output (extracting flags from OPTIONS) but miss + * subcommands from the COMMANDS section. 
*) + if is_rendered_manpage text then + let base = Filename.basename cmd in + let hyphenated = String.concat "-" (base :: cmd_args) in + match try_manpage_fallback mandirs hyphenated with + | Some (r, subs) -> + let at_limit = depth >= 5 in + let extra = List.map (fun (sub_name, sub_r) -> + (cmd_name ^ " " ^ sub_name, sub_r)) subs in + let enqueue_subs = if at_limit then [] else r.subcommands in + Some (r, enqueue_subs, extra) + | None -> + (* manpage file not found — fall back to parsing rendered text *) + (match parse_help text with + | Error _ -> None + | Ok r when r.entries = [] && r.subcommands = [] && r.positionals = [] -> None + | Ok r -> + let self_listed = match cmd_args with + | [] -> false + | _ -> + let leaf = List.nth cmd_args (List.length cmd_args - 1) in + List.exists (fun (sc : subcommand) -> sc.name = leaf) r.subcommands in + if self_listed then + Some ({ entries = []; subcommands = []; positionals = []; + description = "" }, [], []) + else + let at_limit = depth >= 5 in + let subs = if at_limit then [] else r.subcommands in + Some (r, subs, [])) + else + match parse_help text with + | Error _ -> None + | Ok r when r.entries = [] && r.subcommands = [] && r.positionals = [] -> None + | Ok r -> + let self_listed = match cmd_args with + | [] -> false + | _ -> + let leaf = List.nth cmd_args (List.length cmd_args - 1) in + List.exists (fun (sc : subcommand) -> sc.name = leaf) r.subcommands in + if self_listed then + (* the subcommand's --help returned the parent's help text + * (it lists itself as a subcommand). cache a leaf stub so the + * completer knows this is a leaf node, not a parent with + * further subcommands. 
*) + Some ({ entries = []; subcommands = []; positionals = []; + description = "" }, [], []) + else + let at_limit = depth >= 5 in + let subs = if at_limit then [] else r.subcommands in + Some (r, subs, []) in + let oc = Unix.out_channel_of_descr wr in + Marshal.to_channel oc (result : (help_result * subcommand list * (string * help_result) list) option) []; + close_out oc; + exit 0 + end else begin + Unix.close wr; + pending := (pid, rd, Buffer.create 4096, cmd_args, cmd_name, depth) :: !pending + end + done; + if !pending <> [] then begin + reap (); + if !pending <> [] && Queue.is_empty queue then begin + let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in + ignore (Unix.select fds [] [] 0.05) + end + end + done; + List.rev !results + +(* "inshellah index" — the main indexing command. + * processes all binaries and manpages in the given prefix directories, + * writing completion data to the cache dir. + * + * the pipeline has two phases: + * + * phase 1 (binaries): fork one child per binary. each child: + * - tries native nushell completions (if classified as Try_native_and_help) + * - falls back to help_resolve_par (which itself forks per subcommand) + * - marshals the result back via pipe as a tagged variant: + * `Native of string — raw nushell source + * `Parsed of (string * help_result) list — parsed flag data + * `None — nothing useful extracted + * + * phase 2 (manpages): sequentially parse manpages for commands not yet + * covered by phase 1. manpages are more reliable than --help for many + * gnu tools, but slower to process. + * + * commands on the ignorelist are skipped entirely. commands on the + * help_only list skip manpage parsing and only use --help. commands + * with manpages skip --help in phase 1 (they'll be handled in phase 2). + * + * the done_cmds set tracks which commands have already been indexed to + * prevent duplicates across phases and across multiple prefix directories. 
*) + +(* known privilege-escalation wrappers — defined here (before cmd_index and + * cmd_complete) because both need the list: cmd_index writes @complete + * external stubs, and cmd_complete strips the wrapper to find the real command. *) +let elevation_commands = + ["sudo"; "run0"; "doas"; "pkexec"; "su"; "calife"; "sux"; "sudoedit"; + "please"; "super"; "priv"] + +let cmd_index bindirs mandirs ignorelist help_only dir = + ensure_dir dir; + let done_cmds = ref SSet.empty in + let result_count = ref 0 in + let index_bindir bindir mandir = + if not (is_dir bindir) then + Printf.eprintf "skipping %s (not found)\n" bindir + else begin + let bins = Sys.readdir bindir in + Array.sort String.compare bins; + let manpaged = if is_dir mandir + then manpaged_commands mandir else SSet.empty in + let max_jobs = num_cores () in + let classified = Array.map (fun name -> + if SSet.mem name ignorelist then (name, Skip) + else if SSet.mem name help_only then (name, classify_binary bindir name) + else if SSet.mem name manpaged then (name, Skip) + else (name, classify_binary bindir name) + ) bins in + let pending = ref [] in + let process_result name rd buf = + drain_fd rd buf; + (try Unix.close rd with _ -> ()); + let data = Buffer.contents buf in + if String.length data > 0 then begin + let result : [`Native of string | `Parsed of (string * help_result) list | `None] = + try Marshal.from_string data 0 with _ -> `None in + (match result with + | `Native src -> + write_native ~dir name src; + incr result_count + | `Parsed pairs -> + List.iter (fun (cmd_name, r) -> + if not (SSet.mem cmd_name !done_cmds) then begin + write_result ~dir ~source:"help" cmd_name r; + done_cmds := SSet.add cmd_name !done_cmds; + incr result_count + end + ) pairs + | `None -> ()) + end; + done_cmds := SSet.add name !done_cmds in + let reap () = + pending := List.filter (fun (pid, rd, buf, name) -> + drain_fd rd buf; + match Unix.waitpid [Unix.WNOHANG] pid with + | (0, _) -> true + | _ -> + process_result 
name rd buf; + false + | exception Unix.Unix_error (Unix.ECHILD, _, _) -> + (try Unix.close rd with _ -> ()); false + ) !pending in + let wait_for_slot () = + while List.length !pending >= max_jobs do + reap (); + if List.length !pending >= max_jobs then begin + let fds = List.map (fun (_, rd, _, _) -> rd) !pending in + ignore (Unix.select fds [] [] 0.05) + end + done in + Array.iter (fun (name, classification) -> + match classification with + | Skip -> () + | Try_help | Try_native_and_help -> + wait_for_slot (); + let (rd, wr) = Unix.pipe () in + let pid = Unix.fork () in + if pid = 0 then begin + Unix.close rd; + List.iter (fun (_, prd, _, _) -> + try Unix.close prd with _ -> ()) !pending; + let result = + try + let path = Filename.concat bindir name in + let native = match classification with + | Try_native_and_help -> + (match try_native_completion path with + | Some src -> Some src | None -> None) + | _ -> None in + match native with + | Some src -> `Native src + | None -> + let pairs = help_resolve_par ~timeout:200 ~mandirs path [] name in + if pairs <> [] then `Parsed pairs else `None + with _ -> `None in + let oc = Unix.out_channel_of_descr wr in + Marshal.to_channel oc + (result : [`Native of string | `Parsed of (string * help_result) list | `None]) []; + close_out oc; + exit 0 + end else begin + Unix.close wr; + pending := (pid, rd, Buffer.create 4096, name) :: !pending + end + ) classified; + while !pending <> [] do + reap (); + if !pending <> [] then begin + let fds = List.map (fun (_, rd, _, _) -> rd) !pending in + ignore (Unix.select fds [] [] 0.05) + end + done; + (* phase 2: manpages *) + if is_dir mandir then + List.iter (fun section -> + let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in + if is_dir subdir then begin + let files = Sys.readdir subdir in + (* sort by filename length first, then alphabetically. + * this ensures parent manpages (e.g. 
nix-env.1.gz) are + * processed before subpage manpages (nix-env-install.1.gz) + * so the parent's data isn't overwritten by a subpage + * whose synopsis also extracts the parent command name. *) + Array.sort (fun a b -> + let la = String.length a and lb = String.length b in + if la <> lb then compare la lb + else String.compare a b) files; + Array.iter (fun file -> + let base_cmd = cmd_name_of_manpage file in + if SSet.mem base_cmd help_only then () + else match process_manpage (Filename.concat subdir file) with + | None -> () + | Some (cmd, result, subs) -> + if not (SSet.mem cmd !done_cmds) then begin + write_result ~dir ~source:"manpage" cmd result; + done_cmds := SSet.add cmd !done_cmds; + incr result_count + end else if cmd <> base_cmd then + (* a subpage manpage (e.g. nix-env-install.1) extracted + * a command name that was already indexed (e.g. "nix-env"). + * warn so the user can investigate. *) + Printf.eprintf "warning: %s extracted cmd \"%s\" (already indexed), skipping\n" + file cmd; + List.iter (fun (sub_cmd, sub_result) -> + if not (SSet.mem sub_cmd !done_cmds) then begin + write_result ~dir ~source:"manpage" sub_cmd sub_result; + done_cmds := SSet.add sub_cmd !done_cmds; + incr result_count + end + ) subs; + (* for COMMANDS section subcommands (e.g. systemctl start/stop), + * write leaf stubs so the completer treats them as leaf nodes + * rather than falling back to the parent's flags/subcommands. + * only when there are no clap-style sub-sections (subs = []), + * meaning the subcommands came from the COMMANDS section. + * deliberately not added to done_cmds — if a per-subcommand + * manpage exists (e.g. docker-start.1), it will overwrite the stub. 
*) + if subs = [] then + List.iter (fun (sc : subcommand) -> + let sub_cmd = cmd ^ " " ^ sc.name in + if not (SSet.mem sub_cmd !done_cmds) then + write_result ~dir ~source:"manpage" sub_cmd + { entries = []; subcommands = []; positionals = []; + description = sc.desc } + ) result.subcommands + ) files + end + ) command_sections + end in + List.iter2 index_bindir bindirs mandirs; + (* write @complete external stubs for elevation commands (sudo, doas, etc.) + * so nushell routes their completions through the external completer. + * without this, nushell hardcodes sudo/doas to show command-name completion + * and never calls the external completer for their own flags. *) + List.iter (fun cmd -> + let json_path = Filename.concat dir (filename_of_command cmd ^ ".json") in + if Sys.file_exists json_path then + write_native ~dir cmd + (Printf.sprintf "@complete external\nextern \"%s\" []\n" cmd) + ) elevation_commands; + Printf.printf "indexed %d commands into %s\n" !result_count dir + +(* "inshellah dump" — list all indexed commands with their source type *) +let cmd_dump dirs = + let cmds = all_commands dirs in + Printf.printf "%d commands\n" (List.length cmds); + List.iter (fun cmd -> + let src = match file_type_of dirs cmd with + | Some label -> label | None -> "?" in + Printf.printf " %-40s [%s]\n" cmd src + ) cmds + +(* search $PATH for an executable with the given name. + * used during completion to find binaries for on-the-fly resolution. *) +let find_in_path name = + try + Sys.getenv "PATH" + |> String.split_on_char ':' + |> List.find_map (fun dir -> + let p = Filename.concat dir name in + if is_executable p then Some p else None) + with Not_found -> None + +(* resolve a command's completions on-the-fly and cache the results. + * called during "complete" when a command isn't in the index. + * runs help_resolve_par and writes results to the user's cache dir. 
*) +let resolve_and_cache ~dir ~mandirs name path = + let pairs = help_resolve_par ~timeout:200 ~mandirs path [] name in + if pairs <> [] then begin + ensure_dir dir; + List.iter (fun (cmd_name, r) -> write_result ~dir cmd_name r) pairs; + Some pairs + end else None + +(* format a single completion candidate as JSON for nushell's completer protocol *) +let completion_json value desc = + Printf.sprintf "{\"value\":\"%s\",\"description\":\"%s\"}" + (escape_json value) (escape_json desc) + +(* fuzzy matching: returns a score > 0 if needle is a subsequence of haystack. + * higher scores = better match. scoring tiers: + * - exact match: 1000 + * - prefix match: 900 + length bonus (how much of the haystack is covered) + * - subsequence: base 10 per char + bonuses for: + * - word boundary alignment (50): matching at '-', '_', or camelCase transitions + * - consecutive matches (20): matching adjacent characters + * + * this drives the completion candidate ranking. users typing "ser" should see + * "--server" ranked above "--preserve" even though both contain "ser" as a + * subsequence. the word-boundary bonus achieves this. 
*) +let fuzzy_score needle haystack = + let needle_len = String.length needle and haystack_len = String.length haystack in + if needle_len = 0 then 1 + else if needle_len > haystack_len then 0 + else if needle = haystack then 1000 + else + let needle_lc = String.lowercase_ascii needle + and haystack_lc = String.lowercase_ascii haystack in + if String.starts_with ~prefix:needle_lc haystack_lc then + 900 + (needle_len * 100 / haystack_len) + else + let is_boundary hay_idx = + hay_idx = 0 || haystack.[hay_idx - 1] = '-' || haystack.[hay_idx - 1] = '_' + || (haystack.[hay_idx - 1] >= 'a' && haystack.[hay_idx - 1] <= 'z' + && haystack.[hay_idx] >= 'A' && haystack.[hay_idx] <= 'Z') in + (* walk haystack matching needle chars as a subsequence *) + let needle_idx, score, _, _ = + String.fold_left (fun (needle_idx, score, hay_idx, prev_match) c -> + if needle_idx >= needle_len then (needle_idx, score, hay_idx + 1, prev_match) + else if c = needle_lc.[needle_idx] then + let bonus = (if is_boundary hay_idx then 50 else 10) + + (if prev_match = hay_idx - 1 then 20 else 0) in + (needle_idx + 1, score + bonus, hay_idx + 1, hay_idx) + else (needle_idx, score, hay_idx + 1, prev_match) + ) (0, 0, 0, -1) haystack_lc in + if needle_idx = needle_len then score else 0 + +(* scan past the elevation command's flags and arguments to find the real + * command. is_command checks whether a token names a known command. + * returns Some (real_cmd :: args) or None if no command was found. *) +let find_real_command is_command args = + let rec scan = function + | [] -> None + | "--" :: rest -> Some rest + | arg :: rest when String.length arg > 0 && arg.[0] = '-' -> + scan rest + | arg :: _ as cmd_and_rest when is_command arg -> + Some cmd_and_rest + | _ :: rest -> scan rest + in + scan args + +(* "inshellah complete CMD [ARGS...]" — the nushell custom completer. + * this is the hot path — called every time the user presses tab in nushell. + * + * the completion logic: + * 1. 
try to find the command (or longest subcommand prefix) in the store + * 2. if not found, try on-the-fly resolution (find in $PATH, run --help, cache) + * 3. score all candidate completions against the partial input using fuzzy_score + * 4. output scored candidates as a JSON array + * + * subcommand resolution: the lookup tries longest prefix first. + * for "git add --", it first looks for "git add", then "git". + * this ensures subcommand-specific flags are shown. + * + * nushell sends a trailing empty token when the cursor is after a space + * ("git add "). in this case all_tokens includes the empty string. + * when the last token is non-empty, the user is still typing it, so we use + * it as the fuzzy filter. when empty, we show all candidates. + * + * if only a parent command matched (e.g. "git" matched but not "git add"), + * we suppress subcommand suggestions and only show flags. this prevents + * showing sibling subcommands when the user has already committed to a + * specific subcommand path. + * + * file completions: nushell's external completer protocol is either/or — + * you either return custom candidates or fall back to native file completions + * (via null), but can't mix both. we return null (triggering nushell's native + * file completer with colors, sorting, quoting) when: + * - the user is at a leaf command (no subcommands) and not mid-flag + * - or we have no candidates at all + * this ensures file completions appear with full nushell UX. when the user + * IS typing a flag (partial starts with "-"), we return our flag candidates. *) +let cmd_complete spans user_dir system_dirs mandirs = + (* system dirs are searched first — they're built at index time from + * manpages and are authoritative. user dir is an on-the-fly cache + * that should only be used as fallback for commands not in any system dir. 
*) + let dirs = system_dirs @ [user_dir] in + (* if the command line starts with a privilege-escalation wrapper, scan past + * it to find the real command. we identify the command by checking the store + * and $PATH — this avoids needing per-command option tables which are fragile + * across different implementations. if no real command is found, fall back to + * completing the elevation command itself. *) + let spans = match spans with + | cmd :: rest when List.mem cmd elevation_commands -> + let is_command name = + name <> "" && (lookup dirs name <> None || find_in_path name <> None) + in + (match find_real_command is_command rest with + | Some (_ :: _ as real_spans) -> real_spans + | _ -> spans) + | _ -> spans in + match spans with + | [] -> print_string "null\n" + | cmd_name :: rest -> + (* try longest prefix match: "git add" before "git" *) + let find_result tokens = + let num_tokens = List.length tokens in + List.init num_tokens Fun.id |> List.find_map (fun drop -> + let prefix = List.filteri (fun i _ -> i < num_tokens - drop) tokens in + match prefix with + | [] -> None + | _ -> + let try_name = String.concat " " prefix in + match lookup dirs try_name with + | Some r -> Some (try_name, r, List.length prefix) + | None -> None) in + (* strip flag tokens (--user, -a, etc.) from intermediate positions. + * flags are not part of the subcommand path and should not affect + * lookup. e.g. "systemctl --user start" should look up "systemctl start". + * the last token (partial) is NOT stripped — it may be a flag the + * user is typing (e.g. "--u") which needs fuzzy matching. 
*) + let strip_intermediate_flags tokens = + match List.rev tokens with + | last :: rev_rest -> + List.filter (fun t -> + String.length t = 0 || t.[0] <> '-') (List.rev rev_rest) + @ [last] + | [] -> [] in + let all_tokens = strip_intermediate_flags (cmd_name :: rest) in + let last_token = match rest with + | [] -> "" | _ -> List.nth rest (List.length rest - 1) in + (* only treat the last token as a completed subcommand when nushell + * sends a trailing empty token (cursor is after a space). + * otherwise the user is still typing and we treat it as partial. *) + let lookup_tokens = if last_token = "" then all_tokens + else match all_tokens with + | _ :: _ -> List.rev (List.tl (List.rev all_tokens)) + | _ -> [cmd_name] in + let resolve tokens partial = + match find_result tokens with + | Some _ as found -> (found, partial) + | None -> (None, partial) in + let found, partial = resolve lookup_tokens last_token in + (* try on-the-fly resolution when no match or only a parent matched *) + let lookup_depth = List.length lookup_tokens in + let result, partial = match found with + | Some (_, _, depth) when depth >= lookup_depth - 1 -> + (* exact or near-exact match — use it *) + (found, partial) + | _ -> + (* no match, or only a parent matched — try on-the-fly resolution *) + (match find_in_path cmd_name with + | Some path -> + (* derive sibling share/man from the binary's location. + * e.g. /nix/store/.../bin/foo → /nix/store/.../share/man + * this lets on-the-fly resolution find manpages for commands + * not in the indexed prefixes. also resolves through nix + * wrappers to find the real binary's manpage location. *) + let mandir_of_bin p = + let bindir = Filename.dirname p in + let prefix = Filename.dirname bindir in + Filename.concat (Filename.concat prefix "share") "man" in + let bin_mandirs = + let direct = mandir_of_bin path in + (* also check the canonical path after resolving symlinks. + * e.g. 
/run/current-system/sw/bin/foo is a symlink to + * /nix/store/xxx/bin/foo — check /nix/store/xxx/share/man *) + let via_realpath = + try let real = Unix.realpath path in + if real <> path then [mandir_of_bin real] else [] + with Unix.Unix_error _ -> [] in + let via_wrapper = + match nix_script_wrapper_target path with + | Some target -> [mandir_of_bin target] + | None -> + match nix_wrapper_target path with + | Some target -> [mandir_of_bin target] + | None -> [] in + List.filter is_dir (direct :: via_realpath @ via_wrapper) in + let all_mandirs = bin_mandirs @ mandirs in + (match resolve_and_cache ~dir:user_dir ~mandirs:all_mandirs cmd_name path with + | Some _pairs -> resolve lookup_tokens last_token + | None -> (found, partial)) + | None -> (found, partial)) in + let candidates = match result with + | None -> [] + | Some (_matched_name, r, depth) -> + (* when the match is shallower than requested, the user already + * typed a subcommand beyond the matched level — don't show + * sibling subcommands, only flags *) + let sub_candidates = if depth < lookup_depth - 1 then [] else + let subs = match r.subcommands with + | _ :: _ -> r.subcommands + | [] -> subcommands_of dirs _matched_name in + List.filter_map (fun (subcommand : subcommand) -> + let score = fuzzy_score partial subcommand.name in + if score > 0 then Some (score, completion_json subcommand.name subcommand.desc) else None + ) subs in + (* build flag completion candidates from the entry list. 
+ * for flags with both short and long forms (Both), we pick which form + * to display based on what the user is currently typing: + * - if the partial input matches the short flag better, show the short + * flag as the value and note the long form in the description + * - otherwise (including empty partial), prefer the long flag and note + * the short form in the description + * + * parameter names are appended to descriptions in angle brackets for + * mandatory params and square brackets for optional ones, matching the + * conventions users expect from cli help text. *) + let flag_candidates = List.filter_map (fun (entry : entry) -> + let base_desc = match entry.param with + | Some (Mandatory p) -> if entry.desc <> "" then entry.desc ^ " <" ^ p ^ ">" else "<" ^ p ^ ">" + | Some (Optional p) -> if entry.desc <> "" then entry.desc ^ " [" ^ p ^ "]" else "[" ^ p ^ "]" + | None -> entry.desc in + let flag, desc = match entry.switch with + | Long l -> ("--" ^ l, base_desc) + | Short c -> (Printf.sprintf "-%c" c, base_desc) + | Both (c, l) -> + (* score the partial against both forms to decide which to present. + * e.g. typing "-s" scores higher against "-s" than "--squeeze-blank", + * so we show "-s (aka --squeeze-blank)". when the partial is empty or + * matches the long form better, we default to the long form. 
*) + let long_flag = "--" ^ l in + let short_flag = Printf.sprintf "-%c" c in + let long_score = fuzzy_score partial long_flag in + let short_score = fuzzy_score partial short_flag in + if short_score > long_score then + (short_flag, Printf.sprintf "(aka %s) %s" long_flag base_desc) + else + (long_flag, Printf.sprintf "(aka %s) %s" short_flag base_desc) in + let score = fuzzy_score partial flag in + if score > 0 then Some (score, completion_json flag desc) else None + ) r.entries in + let scored = sub_candidates @ flag_candidates in + List.sort (fun (a, _) (b, _) -> compare b a) scored + |> List.map snd in + (* determine whether to return our candidates or fall back to nushell's + * native file completer (via null). nushell's protocol is either/or: + * returning candidates suppresses file completions, returning null + * enables them with full nushell UX (colors, sorting, quoting). + * + * we return null when: + * - we have no candidates at all (unknown command, no match) + * - the user is at a leaf command and not typing a flag — this is + * the position where file arguments are expected, so hand off to + * nushell's native file completer for the best experience *) + let typing_flag = String.length partial > 0 && partial.[0] = '-' in + let has_subcommands = match result with + | Some (matched_name, r, _) -> + r.subcommands <> [] || subcommands_of dirs matched_name <> [] + | None -> false in + let want_files = (not typing_flag) && (not has_subcommands) in + if want_files then print_string "null\n" + else if candidates = [] then print_string "null\n" + else Printf.printf "[%s]\n" (String.concat "," candidates) + +(* "inshellah query CMD" — print the raw stored data for a command *) +let cmd_query cmd dirs = + match lookup_raw dirs cmd with + | None -> + Printf.eprintf "not found: %s\n" cmd; exit 1 + | Some data -> + print_string data; print_newline () + +(* load a newline-separated list of command names to ignore. 
+ * blank lines and lines starting with '#' are skipped. *) +let load_ignorelist path = + try + In_channel.with_open_text path In_channel.input_all + |> String.split_on_char '\n' + |> List.filter_map (fun line -> + let line = String.trim line in + if String.length line > 0 && line.[0] <> '#' then Some line else None) + |> SSet.of_list + with _ -> SSet.empty + +(* parse "index" subcommand arguments: prefix dirs + optional --dir, --ignore, --help-only. + * uses a fold over the argument list, accumulating prefixes and option values. *) +let parse_index_args args = + let (prefixes, dir, ignore, help_only, _) = + List.fold_left (fun (prefixes, dir, ignore, help_only, pending) arg -> + match pending with + | Some "--dir" -> (prefixes, arg, ignore, help_only, None) + | Some "--ignore" -> (prefixes, dir, SSet.union ignore (load_ignorelist arg), help_only, None) + | Some "--help-only" -> (prefixes, dir, ignore, SSet.union help_only (load_ignorelist arg), None) + | Some _ -> (prefixes, dir, ignore, help_only, None) + | None -> + match arg with + | "--dir" | "--ignore" | "--help-only" -> (prefixes, dir, ignore, help_only, Some arg) + | _ -> (arg :: prefixes, dir, ignore, help_only, None) + ) ([], default_store_path (), SSet.empty, SSet.empty, None) args in + (List.rev prefixes, dir, ignore, help_only) + +(* derive the sibling man directory from a store directory path. + * e.g. "/run/current-system/sw/share/inshellah" -> "/run/current-system/sw/share/man" *) +let man_dir_of_system_dir path = + Filename.concat (Filename.dirname path) "man" + +(* parse common --dir arguments for complete/query/dump commands. + * --dir takes a colon-separated list of paths. the first path is the writable + * user cache dir; additional paths are read-only system directories. + * man directories are derived from system dir paths as siblings + * (share/inshellah -> share/man). uses a fold over the argument list. 
*) +let parse_dir_args args = + let (dir_value, rest_args, _) = + List.fold_left (fun (dir_value, rest_args, pending) arg -> + match pending with + | Some "--dir" -> (Some arg, rest_args, None) + | Some _ -> (dir_value, rest_args, None) + | None -> + match arg with + | "--dir" -> (dir_value, rest_args, Some arg) + | _ -> (dir_value, arg :: rest_args, None) + ) (None, [], None) args in + let (user_dir, system_dirs) = match dir_value with + | None -> (default_store_path (), []) + | Some v -> + match String.split_on_char ':' v with + | [] -> (default_store_path (), []) + | first :: rest -> (first, rest) in + (user_dir, system_dirs, List.rev rest_args) + +(* "inshellah completions nushell" — emit native nushell extern for inshellah itself *) +let cmd_completions_nushell () = + let result = { + entries = []; + subcommands = []; + positionals = []; + description = "nushell completions engine"; + } in + let index_result = { + entries = [ + { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "output directory for cached completions" }; + { switch = Long "ignore"; param = Some (Mandatory "FILE"); desc = "skip listed commands entirely" }; + { switch = Long "help-only"; param = Some (Mandatory "FILE"); desc = "skip manpages for listed commands, use --help instead" }; + ]; + subcommands = []; + positionals = [ + { pos_name = "prefix"; optional = false; variadic = true }; + ]; + description = "index completions from prefix directories"; + } in + let complete_result = { + entries = [ + { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "colon-separated cache paths" }; + ]; + subcommands = []; + positionals = [ + { pos_name = "cmd"; optional = false; variadic = false }; + { pos_name = "args"; optional = true; variadic = true }; + ]; + description = "nushell custom completer, outputs JSON candidates"; + } in + let query_result = { + entries = [ + { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "colon-separated cache paths" }; + ]; + 
subcommands = []; + positionals = [ + { pos_name = "cmd"; optional = false; variadic = false }; + ]; + description = "print stored completion data for a command"; + } in + let dump_result = { + entries = [ + { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "colon-separated cache paths" }; + ]; + subcommands = []; + positionals = []; + description = "list indexed commands"; + } in + let manpage_result = { + entries = []; + subcommands = []; + positionals = [ + { pos_name = "file"; optional = false; variadic = false }; + ]; + description = "parse a manpage and emit nushell extern"; + } in + let manpage_dir_result = { + entries = []; + subcommands = []; + positionals = [ + { pos_name = "dir"; optional = false; variadic = false }; + ]; + description = "batch-process manpages under a directory"; + } in + let completions_result = { + entries = []; + subcommands = []; + positionals = []; + description = "generate nushell completions for inshellah"; + } in + print_string (generate_extern "inshellah" result); + print_string (generate_extern "inshellah index" index_result); + print_string (generate_extern "inshellah complete" complete_result); + print_string (generate_extern "inshellah query" query_result); + print_string (generate_extern "inshellah dump" dump_result); + print_string (generate_extern "inshellah manpage" manpage_result); + print_string (generate_extern "inshellah manpage-dir" manpage_dir_result); + print_string (generate_extern "inshellah completions" completions_result) + +(* --- entry point --- + * dispatch on the first argument to the appropriate subcommand handler. 
*)
+let () =
+  (* drop argv[0]; what remains is the subcommand followed by its arguments *)
+  let argv = List.tl (Array.to_list Sys.argv) in
+  match argv with
+  | "index" :: index_args ->
+    let (prefixes, store_dir, ignorelist, help_only) = parse_index_args index_args in
+    if prefixes = [] then begin
+      Printf.eprintf "error: index requires at least one prefix dir\n";
+      exit 1
+    end;
+    (* every prefix contributes its bin/ and share/man/ subdirectories *)
+    let under sub = List.map (fun prefix -> Filename.concat prefix sub) prefixes in
+    let bindirs = under "bin" in
+    let mandirs = under "share/man" in
+    cmd_index bindirs mandirs ignorelist help_only store_dir
+  | "complete" :: complete_args ->
+    let (user_dir, system_dirs, spans) = parse_dir_args complete_args in
+    (* keep only the man directories that actually exist *)
+    let existing_man_dir system_dir =
+      let candidate = man_dir_of_system_dir system_dir in
+      if is_dir candidate then Some candidate else None in
+    let man_dirs = List.filter_map existing_man_dir system_dirs in
+    cmd_complete spans user_dir system_dirs man_dirs
+  | "query" :: query_args ->
+    (* query takes exactly one positional: the command name *)
+    (match parse_dir_args query_args with
+     | (user_dir, system_dirs, [cmd]) -> cmd_query cmd (user_dir :: system_dirs)
+     | _ -> Printf.eprintf "error: query CMD [--dir PATH[:PATH...]]\n"; exit 1)
+  | "dump" :: dump_args ->
+    let (user_dir, system_dirs, _) = parse_dir_args dump_args in
+    cmd_dump (user_dir :: system_dirs)
+  | ["manpage"; file] -> cmd_manpage file
+  | ["manpage-dir"; dir] -> cmd_manpage_dir dir
+  | ["completions"] -> cmd_completions_nushell ()
+  | _ -> usage ()
diff --git a/doc/building.md b/doc/building.md index 0a2598d..de685d7 100644 --- a/doc/building.md +++ b/doc/building.md @@ -1,77 +1,141 @@ # building and installing -inshellah is a rust crate. it builds with stock cargo on any platform -rust supports. +## dependencies -## with nix +inshellah is written in OCaml and uses dune as its build system. 
+ +build dependencies: +- **OCaml** >= 5.0 +- **dune** >= 3.20 +- **angstrom** — parser combinator library +- **angstrom-unix** — unix extensions for angstrom +- **camlzip** — gzip decompression for reading compressed manpages +- **str** — regular expressions (ships with OCaml) +- **unix** — process/file operations (ships with OCaml) + +runtime dependencies: +- **man** (optional) — used as a fallback to locate manpages during + on-the-fly completion resolution. not needed if system directories + are provided via `--dir` (manpages are found via sibling `share/man`). + +## building with nix (recommended) + +if you have nix installed: ```sh nix build ``` -binary is at `./result/bin/inshellah`. +the binary is at `./result/bin/inshellah`. -development shell: +for development with a shell containing all dependencies: ```sh nix develop -cargo build --release -cargo test +dune build +dune test ``` -## with cargo +## building from source with opam -requires rust >= 1.85 (edition 2024). +install dependencies via opam: ```sh -cargo build --release -cargo test -sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah +opam install dune angstrom angstrom-unix camlzip +``` + +build and test: + +```sh +dune build +dune test +``` + +install into the opam switch: + +```sh +dune install +``` + +## building from source without opam + +if your distribution packages the OCaml libraries directly, install +them through your package manager, then build with dune: + +```sh +dune build +``` + +the binary is at `_build/default/bin/main.exe`. 
copy it to your +`$PATH`: + +```sh +install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah ``` ## arch linux +install OCaml and dune from the official repos, and the remaining +libraries from the AUR or via opam: + ```sh -sudo pacman -S rust -cargo build --release -sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah +# system packages +sudo pacman -S ocaml dune + +# ocaml libraries (via opam) +opam init # if not already initialized +eval $(opam env) +opam install angstrom angstrom-unix camlzip + +# build +dune build +dune test + +# install +sudo install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah ``` ## debian / ubuntu ```sh -sudo apt install cargo rustc -# or: rustup install stable -cargo build --release -sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah +sudo apt install ocaml opam +opam init +eval $(opam env) +opam install dune angstrom angstrom-unix camlzip + +dune build +sudo install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah ``` ## fedora ```sh -sudo dnf install cargo rust -cargo build --release -sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah +sudo dnf install ocaml opam +opam init +eval $(opam env) +opam install dune angstrom angstrom-unix camlzip + +dune build +sudo install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah ``` ## post-install setup -index completions from your system prefix(es): +after installing the binary, index completions from your system +prefix(es): ```sh # typical linux system inshellah index /usr /usr/local -# more workers / different timeout -inshellah index /usr /usr/local --workers 16 --timeout-ms 500 - # check what was indexed inshellah dump ``` -wire up the nushell completer in `~/.config/nushell/config.nu`: +then wire up the nushell completer: ```nu +# ~/.config/nushell/config.nu $env.config.completions.external = { enable: true completer: {|spans| @@ -81,28 +145,19 @@ $env.config.completions.external = { } ``` 
-see [nushell-integration.md](nushell-integration.md) for full -completer details and [runtime-completions.md](runtime-completions.md) -for on-the-fly resolution of commands not covered by the upfront -index. +see [nushell-integration.md](nushell-integration.md) for full details +on the completer, and [runtime-completions.md](runtime-completions.md) +for on-the-fly resolution of commands not covered by the index. ## re-indexing after package changes +the index is a static cache — it doesn't update automatically when you +install or remove packages. re-run `inshellah index` after significant +package changes: + ```sh inshellah index /usr /usr/local ``` -on nixos, the system index regenerates on every `nixos-rebuild`. see -[nixos.md](nixos.md). - -## development - -```sh -cargo build # debug build, faster compile -cargo test # full test suite -cargo clippy --release -``` - -a `man` binary is useful at runtime as a fallback for locating -manpages outside the indexed prefixes — not required for indexing -itself. +on nixos, the system index regenerates on every `nixos-rebuild` +automatically. see [nixos.md](nixos.md) for details. diff --git a/doc/nixos.md b/doc/nixos.md index 05b9d5a..5d74690 100644 --- a/doc/nixos.md +++ b/doc/nixos.md @@ -1,47 +1,105 @@ # nixos integration -inshellah provides a nixos module that indexes nushell completions for -every installed package at system build time, and a wrapped binary -that knows where to find the result. +inshellah provides a nixos module that automatically indexes nushell +completions for all installed packages at system build time. ## enabling ```nix -# flake.nix outputs: +# in your flake.nix outputs: { nixosConfigurations.myhost = nixpkgs.lib.nixosSystem { modules = [ inshellah.nixosModules.default - { programs.inshellah.enable = true; } + { + programs.inshellah.enable = true; + } ]; }; } ``` -or importing directly: +or if importing the module directly: ```nix # configuration.nix { pkgs, ... 
}: { - imports = [ ./path/to/inshellah-rs/nix/module.nix ]; - programs.inshellah.enable = true; + imports = [ ./path/to/inshellah/nix/module.nix ]; + programs.inshellah = { + enable = true; + package = pkgs.inshellah; # or your local build + }; } ``` -after rebuilding, completions are immediately available — no extra -nushell config needed if you paste in the snippet (see below). +## what happens at build time -## what the module does +the module hooks into `environment.extraSetup`, which runs during the +system profile build (the `buildEnv` that creates `/run/current-system/sw`). +at that point, all system packages are merged, so `$out/bin` contains every +executable and `$out/share/man` contains every manpage. -- installs the inshellah binary, wrapped so the system completion path - is found automatically. -- runs `inshellah index "$out"` during the system profile build, - producing one file per command under `$out/share/inshellah/`. -- drops a nushell autoload shim into `/share/nushell/vendor/autoload/` - that overrides nushell's hard-coded sudo/doas bypass so completions - fire through inshellah even for elevated commands. -- exposes a `snippet` option carrying the full external-completer - config — see "using the completer" below. +inshellah runs a single command: + +``` +inshellah index "$out" --dir $out/share/inshellah +``` + +this executes a three-phase pipeline: + +### phase 1: native completion detection (parallel) + +for each executable, inshellah scans the elf binary for the string +`completion`. if found, it probes common patterns like +`CMD completions nushell` to see if the program can generate its own +nushell completions. native output is used verbatim — these are always +higher quality than parsed completions. + +programs like `niri`, and any clap/cobra tool with nushell support, +are handled this way. 
+ +### phase 2: manpage parsing (sequential) + +for commands not covered by phase 1, inshellah parses manpages from +man1 (user commands) and man8 (sysadmin commands). it handles: + +- gnu `.TP` style (coreutils, help2man) +- `.IP` style (curl, hand-written) +- `.PP`+`.RS`/`.RE` style (git, docbook) +- nix3 bullet+hyperlink style (`nix run`, `nix build`, etc.) +- mdoc (bsd) format +- deroff fallback for unusual formats + +synopsis sections are parsed to detect subcommands: `git-commit.1` +generates `export extern "git commit"`, not `export extern "git-commit"`. + +### phase 3: --help fallback (parallel) + +remaining executables without manpages get `--help` (or `-h`) called +with a 200ms timeout. elf binaries are pre-scanned for the `-h` string +to skip those that don't support help flags. shell scripts are run +directly (they're fast). execution is parallelized to available cores. + +when `--help` produces rendered manpage output instead of plain help +text (e.g. `git stash --help` delegates to `man`), the raw manpage +source is located and parsed with the groff parser for richer results. + +### output + +each command gets its own file in `/share/inshellah` under the system +profile. native generators produce `.nu` files; parsed results produce +`.json` files. the `complete` command reads both formats. + +nushell built-in commands (ls, cd, cp, mv, etc.) are excluded since +nushell provides its own completions. 
+ +### performance + +on a typical nixos system (~950 executables, ~1600 manpages): +- total time: ~4-10 seconds +- native gzip decompression (camlzip, no process spawning) +- parallel --help with core-scaled forking +- elf string scanning to skip ~15% of binaries ## module options @@ -52,11 +110,12 @@ programs.inshellah = { # the inshellah package (set automatically by the flake module) package = pkgs.inshellah; - # subdirectory of the system profile holding the index files + # where to place indexed completion files under the system profile # default: "/share/inshellah" completionsPath = "/share/inshellah"; # additional read-only completion directories to search + # these are appended to the --dir path alongside the system completions extraDirs = [ "/etc/profiles/per-user/alice/share/inshellah" ]; # commands to skip entirely during indexing @@ -64,65 +123,41 @@ programs.inshellah = { # commands to skip manpage parsing for (uses --help instead) helpOnlyCommands = [ "nix" ]; - - # per-subprocess timeout in ms during indexing (null = built-in - # default of 200ms) - timeoutMs = null; - - # worker-thread count for the parallel scrape - workers = null; }; ``` ## using the completer -the module's `snippet` option holds a complete external-completer -config. drop it into nushell: +the flake module sets a read-only `snippet` option containing the nushell +config needed to wire up the completer. you can access it via +`config.programs.inshellah.snippet` and paste it into your nushell config, +or source it from a file generated by your nixos config. -```nix -# generate a config file from the snippet -environment.etc."nushell/inshellah.nu".text = config.programs.inshellah.snippet; -``` - -then in your nushell config: +the snippet sets up the external completer. 
the wrapper installed by +the module has the system completion paths hardcoded, so no flags are +needed: ```nu -source /etc/nushell/inshellah.nu +let inshellah_complete = {|spans| + inshellah complete ...$spans | from json +} +$env.config.completions.external = { + enable: true + max_results: 100 + completer: $inshellah_complete +} ``` -or copy the snippet directly into `~/.config/nushell/config.nu` if you -prefer to manage it by hand: +## home manager and other user-level package managers -```nu -# (the snippet is many lines — copy it from `nix eval` of the option, -# or use the environment.etc approach above) -$env.config.completions.external = { ... } -``` +the nixos module only indexes packages installed at the system level +(those that end up in `/run/current-system/sw`). if you use home-manager, +nix-env, or another user-level package manager, those binaries and +manpages live elsewhere — typically under `/etc/profiles/per-user/` +or `~/.nix-profile`. -the snippet provides both static lookups against the system index and -runtime fallbacks for cases the static index can't cover: - -| command | dynamic source | -|---|---| -| `nix` | flake refs via `NIX_GET_COMPLETIONS`, with optional `meta.description` | -| `systemctl` / `journalctl` | unit names from `list-units` | -| `coredumpctl` | units + pids | -| `loginctl` | users / sessions | -| `machinectl` / `networkctl` | machines / links | -| `ssh` / `scp` / `sftp` | hostnames from ssh config + known_hosts | -| `docker` / `podman` | containers + image refs by subcommand | -| `kubectl` | resource names from the live cluster | -| `git` | refs + worktree paths | -| `npm` / `pnpm` / `yarn` | scripts from package.json | -| `make` / `just` | targets / recipes | -| `cargo` | workspace targets behind `--bin` / `--example` / etc. | -| `kill` / `pkill` | pid+comm pairs | - -## home manager and user-level package managers - -the system module only indexes packages installed system-wide. 
for -home-manager or per-user nix profiles, run `inshellah index` against -those prefixes separately: +to get completions for user-installed packages, run `inshellah index` +against those prefixes separately: ```sh # home-manager / per-user profile @@ -132,28 +167,35 @@ inshellah index /etc/profiles/per-user/$USER inshellah index ~/.nix-profile ``` -this indexes into `$XDG_CACHE_HOME/inshellah`, which the completer -searches automatically. to automate via home-manager: +this indexes into the default user cache (`$XDG_CACHE_HOME/inshellah`), +which the completer searches automatically. you can re-run this after +installing new packages, or add it to a home-manager activation script. + +if you want to automate this in home-manager: ```nix +# home.nix home.activation.inshellah-index = lib.hm.dag.entryAfter [ "writeBoundary" ] '' ${pkgs.inshellah}/bin/inshellah index /etc/profiles/per-user/$USER 2>/dev/null || true ''; ``` +the completer will then search both the system index and the user +cache, so completions from both sources are available. + ## troubleshooting -**completions not appearing**: check that the system index exists -(`ls /run/current-system/sw/share/inshellah/`) and that the completer -is configured. +**completions not appearing**: ensure the completer is configured in +your nushell config (see above). check that the system index exists: +`ls /run/current-system/sw/share/inshellah/`. **missing completions for a specific command**: check if it's a nushell -built-in (`help commands | where name == "thecommand"`) — built-ins -are excluded. +built-in (`help commands | where name == "thecommand"`). built-ins are +excluded because nushell serves its own completions for them. -**stale completions after update**: the index regenerates on every -`nixos-rebuild`. if a command changed its flags, rebuild. +**stale completions after update**: completions regenerate on every +`nixos-rebuild`. if a command changed its flags, rebuild to pick up +the changes. 
-**build-time errors**: indexing failures are non-fatal. check -`journalctl` for the build log if completions are missing for a -specific command. +**build-time errors**: indexing failures are non-fatal (`|| true`). +check `journalctl` for the build log if completions are missing. diff --git a/doc/nushell-integration.md b/doc/nushell-integration.md index 88bebcc..68ea5f8 100644 --- a/doc/nushell-integration.md +++ b/doc/nushell-integration.md @@ -1,28 +1,150 @@ # using inshellah completions in nushell -inshellah indexes completions for the commands in your `$PATH` and -serves them to nushell's external completer. indexed data is stored as -`.json` and `.nu` files that the `complete` command reads at -tab-completion time. +inshellah indexes completions from three sources (in priority order): +1. **native generators** — programs that can emit nushell completions directly +2. **manpages** — groff/troff/mdoc manpage parsing +3. **`--help` output** — parsing help text as a fallback + +indexed data is stored as `.json` and `.nu` files in a directory that the +`complete` command reads from at tab-completion time. ## quick start index completions from a system prefix: ```sh -# from a prefix containing bin/ and share/man/ +# index from a prefix containing bin/ and share/man/ inshellah index /usr -# multiple prefixes +# index from multiple prefixes inshellah index /usr /usr/local -# custom directory +# store in a custom directory inshellah index /usr --dir ~/my-completions ``` -then wire up the completer in `~/.config/nushell/config.nu`: +parse a single manpage: + +```sh +inshellah manpage /usr/share/man/man1/git.1.gz +``` + +batch-process all manpages under a directory (man1 and man8): + +```sh +inshellah manpage-dir /usr/share/man +``` + +## commands + +``` +inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] + index completions into a directory of json/nu files. + PREFIX is a directory containing bin/ and share/man/. 
+ default dir: $XDG_CACHE_HOME/inshellah + --ignore FILE skip listed commands entirely + --help-only FILE skip manpages for listed commands, use --help instead + +inshellah complete CMD [ARGS...] [--dir PATH[:PATH...]] + nushell custom completer. outputs json completion candidates. + falls back to --help resolution if command is not indexed. + --dir takes colon-separated paths. the first path is the writable + user cache; additional paths are read-only system directories. + manpages are found via sibling share/man of system dir paths. + +inshellah query CMD [--dir PATH[:PATH...]] + print stored completion data for CMD. + +inshellah dump [--dir PATH[:PATH...]] + list indexed commands. + +inshellah manpage FILE + parse a manpage and emit nushell extern block. + +inshellah manpage-dir DIR + batch-process manpages under DIR (man1 and man8 sections). +``` + +## the index pipeline + +the `index` command runs a three-phase pipeline over all executables +in each `PREFIX/bin`: + +### phase 1: native completion detection (parallel) + +for each executable, inshellah scans the elf binary for the string +`completion`. if found, it probes common patterns like +`CMD completions nushell` to see if the program can generate its own +nushell completions. native output is used verbatim — these are always +higher quality than parsed completions. + +programs like `niri`, and any clap/cobra tool with nushell support, +are handled this way. + +### phase 2: manpage parsing (sequential) + +for commands not covered by phase 1, inshellah parses manpages from +man1 (user commands) and man8 (sysadmin commands). it handles: + +- gnu `.TP` style (coreutils, help2man) +- `.IP` style (curl, hand-written) +- `.PP`+`.RS`/`.RE` style (git, docbook) +- nix3 bullet+hyperlink style (`nix run`, `nix build`, etc.) 
+- mdoc (bsd) format +- deroff fallback for unusual formats + +synopsis sections are parsed to detect subcommands: `git-commit.1` +generates `export extern "git commit"`, not `export extern "git-commit"`. + +### phase 3: --help fallback (parallel) + +remaining executables without manpages get `--help` (or `-h`) called +with a 200ms timeout. elf binaries are pre-scanned for the `-h` string +to skip those that don't support help flags. shell scripts are run +directly (they're fast). execution is parallelized to available cores. + +subcommands are recursively resolved — if `--help` output lists +subcommands, inshellah runs `CMD SUBCMD --help` for each. + +when a `--help` invocation produces rendered manpage output (some +commands like `git stash` delegate `--help` to `man`), inshellah +detects this and locates the raw manpage source to parse with the +groff parser instead. this yields richer results (subcommands, +structured flag sections) than parsing the rendered text. + +### output + +each command gets its own file in the index directory. native generators +produce `.nu` files; parsed results produce `.json` files. the `complete` +command reads both formats. + +nushell built-in commands (ls, cd, cp, mv, etc.) are excluded since +nushell provides its own completions. + +### performance + +on a typical nixos system (~950 executables, ~1600 manpages): +- total time: ~4-10 seconds +- native gzip decompression (camlzip, no process spawning) +- parallel --help with core-scaled forking +- elf string scanning to skip ~15% of binaries + +## the completer + +the `complete` command is designed to be wired into nushell as an +external completer. it reads from the directories specified via `--dir` +(colon-separated), performs fuzzy matching, and outputs json completion +candidates. the first path is the writable user cache; additional paths +are read-only system directories. 
+ +if a command is not indexed, `complete` falls back to on-the-fly +`--help` resolution — it runs the command's help, caches the result +in the user directory, and returns completions immediately. + +### setting up the completer ```nu +# ~/.config/nushell/config.nu $env.config.completions.external = { enable: true completer: {|spans| @@ -32,62 +154,27 @@ $env.config.completions.external = { } ``` -that's it. tab-completion now works for every command indexed. +with the nixos module, use the provided `snippet` option value (see +[nixos.md](nixos.md)) which points at the system index automatically. -## commands +## nixos module -``` -inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] - [--workers N] [--timeout-ms N] - index completions into a directory of json/nu files. - PREFIX is a directory containing bin/ and share/man/. - default dir: $XDG_CACHE_HOME/inshellah - --ignore FILE skip listed commands entirely - --help-only FILE skip manpages for listed commands, use --help instead - --workers N worker-thread count - --timeout-ms N per-subprocess timeout in ms (default: 200) +enable automatic completion indexing at system build time: -inshellah complete CMD [ARGS...] [--dir PATH[:PATH...]] [--timeout-ms N] - nushell custom completer. outputs JSON completion candidates. - falls back to on-the-fly --help resolution if a command isn't - indexed yet — the result is cached and subsequent presses are - instant. - --dir takes colon-separated paths. the first path is the writable - user cache; additional paths are read-only system directories. - -inshellah query CMD [--dir PATH[:PATH...]] - print stored completion data for CMD. - -inshellah dump [--dir PATH[:PATH...]] - list indexed commands. - -inshellah manpage FILE - parse a manpage and emit a nushell extern block. - -inshellah manpage-dir DIR - batch-process manpages under DIR (man1 and man8 sections). 
+```nix +{ + imports = [ ./path/to/inshellah/nix/module.nix ]; + programs.inshellah.enable = true; +} ``` -## what gets handled +this runs `inshellah index` during the system profile build. see +[nixos.md](nixos.md) for full details. -- **sources**: native nushell completion generators (clap/cobra tools - that can emit completions themselves), manpages in section 1 and 8, - `--help` and `-h` output. -- **groff styles**: gnu `.TP` (coreutils, help2man), `.IP` (curl, - hand-written), `.PP`+`.RS`/`.RE` (git, docbook), nix3 bullet - (`nix run`, `nix build`), mdoc (BSD), plus a deroff fallback. -- **subcommand naming**: `git-commit.1` produces `git commit`, not - `git-commit`. clap-style per-subcommand manpages get one file each. -- **synopsis-only flags**: flags declared in a manpage SYNOPSIS but - missing from the body (e.g. nix-env's `--profile`, most of sed's - interface) are picked up too. -- **elevation wrappers**: `sudo`, `doas`, `pkexec`, `su`, `run0` are - stripped before lookup, including when the real target is given as - an absolute path. -- **exclusions**: nushell built-ins (ls, cd, mv, etc.) are skipped — - nushell serves its own completions for those. +## what gets generated -## extern blocks (manpage / manpage-dir) +the `manpage` and `manpage-dir` commands emit nushell `extern` blocks +with flags, parameter types, and descriptions: ```nu export extern "rg" [ @@ -99,12 +186,9 @@ export extern "rg" [ ] ``` -these are produced by `inshellah manpage` / `inshellah manpage-dir` and -can be source'd directly in your nushell config if you prefer that to -the json completer flow. +subcommand manpages (e.g. `git-commit.1`) are detected via synopsis +parsing and generate the correct nushell name (`git commit` not +`git-commit`). -## nixos - -`programs.inshellah.enable = true` will index at system build time and -ship a richer completer with runtime fallbacks (live cluster queries, -git/ssh/docker/k8s lookups, etc.). see [nixos.md](nixos.md). 
+nushell built-in commands (ls, cd, mv, etc.) are excluded since nushell +provides its own completions for these. diff --git a/doc/runtime-completions.md b/doc/runtime-completions.md index 3e0ee84..7b58e48 100644 --- a/doc/runtime-completions.md +++ b/doc/runtime-completions.md @@ -1,31 +1,30 @@ # runtime completion resolution -when a command isn't in the static index yet, `inshellah complete` -runs `--help` (or `-h`) on the binary, caches the result in the user -directory, and returns completions immediately. tab-completion just -works for tools installed outside the indexed prefixes — via cargo, -pip, npm, go, etc. +the `complete` command has built-in on-the-fly resolution: when a command +is not found in the index, it falls back to running `--help`, caches the +result, and returns completions immediately. this means commands installed +outside the system profile (via cargo, pip, npm, go, etc.) get completions +on first tab-press with no manual setup. ## how it works -typing `docker compose up --`: +when you type `docker compose up --`: 1. nushell calls `inshellah complete docker compose up --` -2. inshellah looks up the longest matching prefix in the index +2. inshellah looks up the index for the longest matching prefix 3. if found, it fuzzy-matches flags and subcommands against the partial input 4. if not found, it locates the binary in `$PATH`, runs `--help`, recursively resolves subcommands, caches the results in the user - directory (`$XDG_CACHE_HOME/inshellah`), and returns completions + directory (`$XDG_CACHE_HOME/inshellah`), and returns completions. + if `--help` produces rendered manpage output, the raw manpage source + is located and parsed instead for richer results -all subsequent completions for that command are served from cache. - -elevation wrappers (`sudo`, `doas`, `pkexec`, `su`, `run0`) are -stripped before lookup: `sudo docker compose up --` resolves against -`docker`, not `sudo`. absolute paths after the wrapper are recognised -too. 
+all subsequent completions for that command are instant (served from cache). ## setup +the completer works with no extra configuration beyond the basic setup: + ```nu # ~/.config/nushell/config.nu $env.config.completions.external = { @@ -37,8 +36,18 @@ $env.config.completions.external = { } ``` -with the nixos module, no extra config is needed beyond enabling the -module — the wrapper has the system paths baked in. +with the nixos module, the installed wrapper has the system paths +hardcoded — no extra flags needed. the same snippet works: + +```nu +$env.config.completions.external = { + enable: true + completer: {|spans| + inshellah complete ...$spans + | from json + } +} +``` to manually specify system dirs, use colon-separated `--dir`: @@ -52,15 +61,25 @@ $env.config.completions.external = { } ``` -paths after the first in `--dir` are read-only system dirs. +system directories (paths after the first in `--dir`) enable +manpage-based fallback: when a command's `--help` delegates to `man`, +the completer looks for the raw manpage in the sibling `share/man` +directory (e.g. `share/inshellah` → `share/man`). if no system dirs +are given, it falls back to `man -w` to locate the manpage. + +or use the `snippet` option provided by the flake module (see +[nixos.md](nixos.md)). ## cache management +the user cache lives at `$XDG_CACHE_HOME/inshellah` (typically +`~/.cache/inshellah`). 
+ ```sh # list cached commands inshellah dump -# view stored data for a command +# view cached data for a command inshellah query docker # clear cache diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..4d29412 --- /dev/null +++ b/dune-project @@ -0,0 +1,28 @@ +(lang dune 3.20) + +(name inshellah) + +(generate_opam_files true) + +(source + (github username/reponame)) + +(authors "atagen ") + +(maintainers "atagen ") + +(license GPL-3.0-or-later) + +(package + (name inshellah) + (synopsis "Nushell completions generator") + (description + "Inshellah parses manpages and --help switches to generate completions for nushell.") + (depends + ocaml + dune + angstrom + angstrom-unix + camlzip) + (tags + (shell completions nushell parser angstrom))) diff --git a/flake.lock b/flake.lock index 8c7ac0c..3adb309 100644 --- a/flake.lock +++ b/flake.lock @@ -2,16 +2,16 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1773821835, - "narHash": "sha256-TJ3lSQtW0E2JrznGVm8hOQGVpXjJyXY2guAxku2O9A4=", + "lastModified": 1773385838, + "narHash": "sha256-ylF2AGl08seexxlLvMqj3jd+yZq56W9zicwe51mp0Pw=", "owner": "nixos", "repo": "nixpkgs", - "rev": "b40629efe5d6ec48dd1efba650c797ddbd39ace0", + "rev": "fef542e7a88eec2b698389e6279464fd479926b6", "type": "github" }, "original": { "owner": "nixos", - "ref": "nixos-unstable", + "ref": "nixpkgs-unstable", "repo": "nixpkgs", "type": "github" } diff --git a/flake.nix b/flake.nix index 351a4ee..6b05775 100644 --- a/flake.nix +++ b/flake.nix @@ -1,36 +1,48 @@ { - - inputs.nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; + inputs.nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; outputs = { self, nixpkgs }: let forAllSystems = - f: nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed (sys: f nixpkgs.legacyPackages.${sys}); + f: + nixpkgs.lib.genAttrs [ "x86_64-linux" "aarch64-linux" ] ( + system: f (import nixpkgs { inherit system; }) + ); in { devShells = forAllSystems (pkgs: { default = pkgs.mkShell { - 
packages = with pkgs; [ - rustc - cargo - rustfmt - rust-analyzer - clippy + packages = with pkgs.ocamlPackages; [ + dune_3 + ocaml + angstrom + angstrom-unix + camlzip + ppx_inline_test + ocaml-lsp + ocamlformat + ocamlformat-rpc-lib + utop ]; }; }); packages = forAllSystems (pkgs: { - default = pkgs.rustPlatform.buildRustPackage { + default = pkgs.ocamlPackages.buildDunePackage { pname = "inshellah"; - version = "0.1.1"; + version = "0.1"; src = pkgs.lib.cleanSource ./.; - cargoLock.lockFile = ./Cargo.lock; - meta = { - description = "nushell completion indexer"; - mainProgram = "inshellah"; - }; + nativeBuildInputs = [ pkgs.git ]; + buildInputs = with pkgs.ocamlPackages; [ + dune_3 + ocaml + angstrom + angstrom-unix + camlzip + ]; + + meta.mainProgram = "inshellah"; }; }); @@ -45,20 +57,16 @@ imports = [ ./nix/module.nix ]; programs.inshellah.package = self.packages.${pkgs.stdenv.hostPlatform.system}.default; programs.inshellah.snippet = '' - let inshellah_complete = { |spans| + let inshellah_complete = { |spans| let completions = (^inshellah complete ...$spans) | from json - let span_len = ($spans | length) - let last_span = if $span_len > 0 { $spans | last } else { "" } - let prev_span = if $span_len >= 2 { $spans | get ($span_len - 2) } else { "" } - let sub = if $span_len >= 2 { $spans | get 1 } else { "" } - # dynamic completions — fire only when the static index has nothing. 
- let additional = if ($completions == null and $span_len > 0) { + # dynamic completions + let additional = if ($completions == null and ($spans | length) > 0) { match $spans.0 { "nix" => { - $env.NIX_GET_COMPLETIONS = $span_len - 1 + $env.NIX_GET_COMPLETIONS = ($spans | length) - 1 let nix_output = $spans | run-external $in | split row -r '\n' | str trim | skip 1 let entries = if (($nix_output | length) < 6 and - $last_span =~ "[a-zA-Z][a-zA-Z0-9_-]*#[a-zA-Z][a-zA-Z0-9_-]*") { + ($spans | last) =~ "[a-zA-Z][a-zA-Z0-9_-]*#[a-zA-Z][a-zA-Z0-9_-]*") { hide-env NIX_GET_COMPLETIONS $env.NIX_ALLOW_UNFREE = 1 $env.NIX_ALLOW_BROKEN = 1 @@ -70,14 +78,17 @@ } } } else { - $nix_output | each { |e| { value: $e, description: "" } } + $nix_output | each { |e| + { value: $e, description: "" } + } } $entries } "systemctl" => { - if $span_len < 3 { null } else { + if ($spans | length) < 3 { null } else { + let kw = $spans | last let scope = if ("--user" in $spans) { [--user] } else { [] } - ^systemctl ...$scope list-units --all --no-pager --plain --full --no-legend $"($last_span)*" + ^systemctl ...$scope list-units --all --no-pager --plain --full --no-legend $"($kw)*" | lines | each { |l| let parsed = $l | parse -r '(?P\S+)\s+\S+\s+\S+\s+\S+\s+(?P.*)' @@ -87,228 +98,6 @@ } | compact } } - "journalctl" => { - # unit-name completion after --unit / -u - if ($prev_span == "--unit" or $prev_span == "-u") { - let scope = if ("--user-unit" in $spans or "--user" in $spans) { [--user] } else { [] } - ^systemctl ...$scope list-units --all --no-pager --plain --full --no-legend $"($last_span)*" - | lines - | each { |l| - let parsed = $l | parse -r '(?P\S+)\s+\S+\s+\S+\s+\S+\s+(?P.*)' - if ($parsed | length) > 0 { - {value: $parsed.0.unit, description: ($parsed.0.desc | str trim)} - } - } | compact - } else { null } - } - "coredumpctl" => { - # unit names (after dump/info/debug/list verbs) and pids - let unit_verbs = ["dump" "info" "debug" "list"] - if (($sub in $unit_verbs) and $span_len >= 3) 
{ - let units = (try { - ^systemctl list-units --all --no-pager --plain --full --no-legend $"($last_span)*" - | lines - | each { |l| - let p = $l | parse -r '(?P\S+)\s+\S+\s+\S+\s+\S+\s+(?P.*)' - if ($p | length) > 0 { { value: $p.0.unit, description: ($p.0.desc | str trim) } } - } | compact - } catch { [] }) - let pids = (try { - ^coredumpctl list --no-pager --no-legend - | lines - | each { |l| - let p = $l | split row -r '\s+' - if ($p | length) >= 5 { { value: $p.4, description: $"PID ($p.4) ($p | get 9? | default "")" } } - } | compact - } catch { [] }) - $units | append $pids - } else { null } - } - "loginctl" => { - # user / session names for loginctl ... - let user_verbs = ["user-status" "show-user" "enable-linger" "disable-linger" "kill-user" "terminate-user"] - let session_verbs = ["session-status" "show-session" "activate" "lock-session" "unlock-session" "terminate-session" "kill-session"] - if (($sub in $user_verbs) and $span_len >= 3) { - try { - ^loginctl list-users --no-pager --no-legend - | lines | each { |l| - let p = $l | str trim | split row -r '\s+' - if ($p | length) >= 2 { { value: $p.1, description: $"UID ($p.0)" } } - } | compact - } catch { null } - } else if (($sub in $session_verbs) and $span_len >= 3) { - try { - ^loginctl list-sessions --no-pager --no-legend - | lines | each { |l| - let p = $l | str trim | split row -r '\s+' - if ($p | length) >= 3 { { value: $p.0, description: $"user ($p.2)" } } - } | compact - } catch { null } - } else { null } - } - "machinectl" => { - let machine_verbs = ["status" "show" "start" "login" "shell" "enable" "disable" "poweroff" "reboot" "terminate" "kill" "bind" "copy-to" "copy-from"] - if (($sub in $machine_verbs) and $span_len >= 3) { - try { - ^machinectl list --no-pager --no-legend - | lines | each { |l| - let p = $l | str trim | split row -r '\s+' - if ($p | length) >= 1 { { value: $p.0, description: ($p | get 1? 
| default "") } } - } | compact - } catch { null } - } else { null } - } - "networkctl" => { - let link_verbs = ["status" "show" "up" "down" "renew" "forcerenew" "reconfigure" "delete"] - if (($sub in $link_verbs) and $span_len >= 3) { - try { - ^networkctl list --no-pager --no-legend - | lines | each { |l| - let p = $l | str trim | split row -r '\s+' - if ($p | length) >= 4 { { value: $p.1, description: $"($p.2) ($p.3)" } } - } | compact - } catch { null } - } else { null } - } - "hostnamectl" | "timedatectl" | "localectl" => { - # mostly fixed verb sets — let the static index handle these. - # left here as an explicit no-op for documentation. - null - } - "ssh" | "scp" | "sftp" => { - # hostnames from ~/.ssh/config + ~/.ssh/known_hosts - let cfg_hosts = (try { - open ~/.ssh/config | lines | each { |l| - let m = $l | parse -r '(?i)^\s*Host\s+(?P.+)$' - if ($m | length) > 0 { $m.0.h | split row -r '\s+' } else { [] } - } | flatten | where { |h| not ($h | str contains '*') and not ($h | is-empty) } - } catch { [] }) - let known = (try { - open ~/.ssh/known_hosts | lines | each { |l| - ($l | split row -r '\s+' | get 0? 
| default "") | split row ',' - } | flatten | where { |h| (not ($h | is-empty)) and (not ($h | str starts-with '|')) and (not ($h | str starts-with '[')) } - } catch { [] }) - $cfg_hosts | append $known | uniq | each { |h| { value: $h, description: "" } } - } - "docker" | "podman" => { - let need_container = ["exec" "logs" "inspect" "start" "stop" "restart" "rm" "kill" "attach" "cp" "top" "wait" "pause" "unpause" "port" "commit" "diff" "export"] - let need_image = ["run" "rmi" "tag" "push" "pull" "history" "save" "create"] - if ($sub in $need_container) { - try { - ^($spans.0) ps -a --format '{{.Names}}\t{{.Image}}' - | lines | each { |l| - let p = $l | split row "\t" - if ($p | length) >= 2 { { value: $p.0, description: $p.1 } } - } | compact - } catch { null } - } else if ($sub in $need_image) { - try { - ^($spans.0) images --format '{{.Repository}}:{{.Tag}}\t{{.Size}}' - | lines | each { |l| - let p = $l | split row "\t" - if (($p | length) >= 2) and (not ($p.0 | str ends-with ':')) { - { value: $p.0, description: $p.1 } - } - } | compact - } catch { null } - } else { null } - } - "kubectl" => { - let need_resource = ["get" "describe" "delete" "edit" "logs" "exec" "port-forward" "rollout" "scale" "annotate" "label"] - if (($sub in $need_resource) and $span_len >= 4) { - let kind = $spans | get 2 - try { - ^kubectl get $kind --no-headers -o "custom-columns=NAME:.metadata.name" - | lines | str trim - | where { |n| not ($n | is-empty) } - | each { |n| { value: $n, description: $kind } } - } catch { null } - } else { null } - } - "git" => { - let need_ref = ["checkout" "switch" "merge" "rebase" "branch" "log" "diff" "show" "reset" "cherry-pick" "revert" "tag" "push" "pull" "blame" "bisect"] - if (($sub == "worktree") and $span_len >= 3) { - try { - ^git worktree list --porcelain - | lines - | each { |l| - let m = $l | parse -r '^worktree\s+(?P

.+)$' - if ($m | length) > 0 { { value: $m.0.p, description: "" } } - } | compact - } catch { null } - } else if (($sub in $need_ref) and $span_len >= 3) { - try { - ^git for-each-ref --format='%(refname:short)%09%(objecttype)%09%(contents:subject)' refs/heads refs/remotes refs/tags - | lines - | each { |l| - let p = $l | split row "\t" - if ($p | length) >= 3 { { value: $p.0, description: $p.2 } } - } | compact - } catch { null } - } else { null } - } - "npm" | "pnpm" | "yarn" => { - # script names from the nearest package.json - let wants = (($spans.0 == "yarn") and $span_len >= 2) or ($sub == "run") or ($sub == "run-script") - if $wants { - try { - open package.json | get scripts? | default {} | transpose name cmd - | each { |row| { value: $row.name, description: $row.cmd } } - } catch { null } - } else { null } - } - "make" => { - # Makefile targets — line-leading identifier followed by ':' - try { - open Makefile | lines - | each { |l| - let m = $l | parse -r '^(?P[A-Za-z0-9_./-]+)\s*:' - if (($m | length) > 0) and (not ($m.0.t | str starts-with '.')) { - { value: $m.0.t, description: "" } - } - } | compact | uniq-by value - } catch { null } - } - "just" => { - # just recipes via `just --list` (handles justfile / Justfile / .justfile) - try { - ^just --list --unsorted - | lines | skip 1 - | each { |l| - let m = $l | parse -r '^\s+(?P[A-Za-z0-9_-]+)(?:\s+\S.*)?(?:\s*#\s*(?P.*))?$' - if ($m | length) > 0 { - { value: $m.0.t, description: ($m.0.d? 
| default "") } - } - } | compact - } catch { null } - } - "cargo" => { - let need_target = ["run" "test" "build" "bench" "check" "doc" "install"] - let target_flags = ["--bin" "-p" "--package" "--example" "--test" "--bench"] - if (($sub in $need_target) and ($prev_span in $target_flags)) { - try { - ^cargo metadata --no-deps --format-version 1 - | from json | get packages | each { |pkg| - $pkg.targets | each { |t| { value: $t.name, description: ($t.kind | str join ",") } } - } | flatten | uniq-by value - } catch { null } - } else { null } - } - "kill" | "pkill" => { - try { - ^ps -eo pid,comm --no-headers - | lines - | each { |l| - let parts = $l | str trim | split row -r '\s+' - if ($parts | length) >= 2 { - let pid = $parts | get 0 - let comm = $parts | skip 1 | str join " " - if ($spans.0 == "kill") { { value: $pid, description: $comm } } - else { { value: $comm, description: $pid } } - } - } | compact - } catch { null } - } _ => { null } } } else { null } diff --git a/inshellah.opam b/inshellah.opam new file mode 100644 index 0000000..9888aa7 --- /dev/null +++ b/inshellah.opam @@ -0,0 +1,35 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +synopsis: "Nushell completions generator" +description: + "Inshellah parses manpages and --help switches to generate completions for nushell." 
+maintainer: ["atagen "] +authors: ["atagen "] +license: "GPL-3.0-or-later" +tags: ["shell" "completions" "nushell" "parser" "angstrom"] +homepage: "https://github.com/username/reponame" +bug-reports: "https://github.com/username/reponame/issues" +depends: [ + "ocaml" + "dune" {>= "3.20"} + "angstrom" + "angstrom-unix" + "camlzip" + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] +] +dev-repo: "git+https://github.com/username/reponame.git" +x-maintenance-intent: ["(latest)"] diff --git a/lib/.ocamlformat b/lib/.ocamlformat new file mode 100644 index 0000000..e69de29 diff --git a/lib/dune b/lib/dune new file mode 100644 index 0000000..38defe1 --- /dev/null +++ b/lib/dune @@ -0,0 +1,3 @@ +(library + (name inshellah) + (libraries angstrom angstrom-unix camlzip str unix)) diff --git a/lib/manpage.ml b/lib/manpage.ml new file mode 100644 index 0000000..5415fac --- /dev/null +++ b/lib/manpage.ml @@ -0,0 +1,1145 @@ +(* manpage.ml — parse unix manpages (groff/mdoc format) into help_result. + * + * manpages are written in roff/groff markup — a decades-old typesetting language + * used by man(1). this module strips the formatting and extracts structured data + * (flags, subcommands, positionals) from the raw groff source. + * + * there are two major manpage macro packages: + * - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP + * - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El + * + * this module handles both, auto-detecting the format by checking for .Sh macros. 
+ *
+ * for groff manpages, flag extraction uses multiple "strategies" that target
+ * different common formatting patterns:
+ * - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man)
+ * - strategy_ip: .IP indented paragraphs (curl, hand-written)
+ * - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook)
+ * - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks
+ * - strategy_deroff: fallback — strip all groff, feed to help text parser
+ *
+ * the module tries all applicable strategies and picks the one that extracts
+ * the most flag entries, on the theory that more results = better match.
+ *
+ * key peculiarities:
+ * - groff has an enormous escape syntax (font changes, named characters,
+ * size changes, color, string variables, etc.) — strip_groff_escapes
+ * handles the common cases but is not exhaustive
+ * - font escapes like \fI (italic) need to insert spaces at word boundaries
+ * to prevent flag names from fusing with their parameter names
+ * - the strategies share the angstrom-based switch_parser from parser.ml
+ * for parsing the actual flag syntax out of the stripped text
+ *)
+
+open Parser
+
+(* --- shared helpers for imperative string scanning ---
+ * many groff parsing routines use an imperative cursor (ref int) walking
+ * through a string. these helpers factor out common scanning patterns. *)
+
+(* move the cursor forward to the next occurrence of delim.
+ * source is the string being scanned and len its length; pos is the mutable
+ * cursor. afterwards pos indexes the delimiter itself, or equals len when the
+ * delimiter never occurs. a cursor that already sits at or beyond len is
+ * left untouched. *)
+let skip_to_char source len pos delim =
+  let rec scan index =
+    if index < len && source.[index] <> delim then scan (index + 1)
+    else index
+  in
+  pos := scan !pos
+
+(* translate a groff named character escape to its text equivalent.
+ * groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for
+ * left/right quotes, "em"/"en" for dashes. returns None for unknown names.
*)
+let named_char_of = function
+  | "aq" | "oq" | "cq" -> Some '\'' (* apostrophe, opening/closing single quote *)
+  | "lq" | "Lq" -> Some '\x22' (* left double quote *)
+  | "rq" | "Rq" -> Some '\x22' (* right double quote *)
+  | "dq" -> Some '\x22' (* neutral double quote *)
+  | "em" | "en" | "hy" | "mi" -> Some '-' (* dashes, hyphen, minus sign *)
+  | "ha" -> Some '^' (* ascii circumflex *)
+  | "ti" -> Some '~' (* ascii tilde *)
+  | _ -> None
+
+(* skip a groff reference that uses one of three sub-forms:
+ * single char — e.g. \*X or \nX
+ * ( + 2 chars — e.g. \*(XX or \n(XX
+ * [ to ] — e.g. \*[name] or \n[name]
+ * used for \* (string variable) and \n (number register) escapes.
+ * advances pos past the consumed characters. pos may end up beyond len when
+ * the input is truncated; callers only compare it against len. *)
+let skip_groff_reference source len pos =
+  if !pos < len then begin
+    if source.[!pos] = '(' then
+      pos := !pos + 3 (* skip past '(' + two-character name *)
+    else if source.[!pos] = '[' then begin
+      incr pos;
+      skip_to_char source len pos ']';
+      if !pos < len then incr pos
+    end else
+      incr pos
+  end
+
+(* --- groff escape/formatting stripper ---
+ * groff escapes start with backslash and use various continuation syntaxes.
+ * this function strips them, replacing named characters (like \(aq for
+ * apostrophe) with their text equivalents and discarding formatting directives.
+ *
+ * takes the raw line (or macro argument) and returns a new string; the input
+ * is never modified. a lone backslash at the very end of the input is copied
+ * through unchanged because there is no escape character after it. *)
+
+let strip_groff_escapes source =
+  let buffer = Buffer.create (String.length source) in
+  let len = String.length source in
+  let pos = ref 0 in
+  let prev_char = ref '\000' in
+  (* emit a character into the output buffer and track it as previous *)
+  let put char_val = Buffer.add_char buffer char_val; prev_char := char_val in
+  let is_alnum char_val =
+    (char_val >= 'a' && char_val <= 'z')
+    || (char_val >= 'A' && char_val <= 'Z')
+    || (char_val >= '0' && char_val <= '9')
+  in
+  while !pos < len do
+    if source.[!pos] = '\\' && !pos + 1 < len then begin
+      let next = source.[!pos + 1] in
+      match next with
+      | 'f' ->
+        (* font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *)
+        if !pos + 2 < len then begin
+          let font_char = source.[!pos + 2] in
+          (* insert space before italic font to preserve word boundaries
+             e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" *)
+          if font_char = 'I' && is_alnum !prev_char then put ' ';
+          if font_char = '(' then
+            pos := !pos + 5 (* \f(XX — two-character font name *)
+          else if font_char = '[' then begin
+            pos := !pos + 3;
+            skip_to_char source len pos ']';
+            if !pos < len then incr pos
+          end else
+            pos := !pos + 3 (* \fX — single-character font selector *)
+        end else
+          pos := !pos + 2
+      | '-' ->
+        (* escaped hyphen-minus — emit a plain hyphen *)
+        put '-';
+        pos := !pos + 2
+      | '&' | '/' | ',' ->
+        (* zero-width characters — discard without output *)
+        pos := !pos + 2
+      | '(' ->
+        (* two-char named character: \(aq, \(lq, \(rq, etc. *)
+        if !pos + 3 < len then begin
+          let name = String.sub source (!pos + 2) 2 in
+          (match named_char_of name with
+           | Some char_val -> put char_val
+           | None -> ());
+          pos := !pos + 4
+        end else
+          pos := !pos + 2
+      | '[' ->
+        (* bracketed named character: \[aq], \[lq], etc. *)
+        pos := !pos + 2;
+        let start = !pos in
+        skip_to_char source len pos ']';
+        if !pos < len then begin
+          let name = String.sub source start (!pos - start) in
+          (match named_char_of name with
+           | Some char_val -> put char_val
+           | None -> ());
+          incr pos
+        end
+      | 's' ->
+        (* size escape: \sN, \s+N, \s-N — skip the numeric argument *)
+        pos := !pos + 2;
+        if !pos < len && (source.[!pos] = '+' || source.[!pos] = '-') then incr pos;
+        if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos;
+        if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos
+      | 'm' ->
+        (* color escape: \m[...] — skip the bracketed color name *)
+        pos := !pos + 2;
+        if !pos < len && source.[!pos] = '[' then begin
+          incr pos;
+          skip_to_char source len pos ']';
+          if !pos < len then incr pos
+        end
+      | 'X' ->
+        (* device control: \X'...' — skip the single-quoted payload *)
+        pos := !pos + 2;
+        if !pos < len && source.[!pos] = '\'' then begin
+          incr pos;
+          skip_to_char source len pos '\'';
+          if !pos < len then incr pos
+        end
+      | '*' ->
+        (* string variable: \*X or \*(XX or \*[...] — skip the reference *)
+        pos := !pos + 2;
+        skip_groff_reference source len pos
+      | 'n' ->
+        (* number register: \nX or \n(XX or \n[...] — skip the reference *)
+        pos := !pos + 2;
+        skip_groff_reference source len pos
+      | 'e' ->
+        (* escaped backslash literal *)
+        put '\\';
+        pos := !pos + 2
+      | '\\' ->
+        (* double backslash — emit one *)
+        put '\\';
+        pos := !pos + 2
+      | ' ' | '~' | '0' ->
+        (* escaped space, unbreakable space and digit-width space all occupy
+           horizontal room, so emit a regular space to keep words apart *)
+        put ' ';
+        pos := !pos + 2
+      | _ ->
+        (* unknown escape — skip the two-character sequence *)
+        pos := !pos + 2
+    end else begin
+      put source.[!pos];
+      incr pos
+    end
+  done;
+  Buffer.contents buffer
+
+(* strip inline macro formatting: .BI, .BR, .IR, etc.
+ * these macros alternate between fonts for their arguments, e.g.:
+ * .BI "--output" "FILE"
+ * becomes "--outputFILE" (arguments concatenated without spaces).
+ *
+ * quoted strings are kept together (quotes stripped, inner spaces kept), but
+ * unquoted spaces are consumed. this matches groff's actual rendering of these
+ * macros,
+ * where alternating-font arguments are concatenated.
*)
+let strip_inline_macro_args text =
+  let buffer = Buffer.create (String.length text) in
+  let len = String.length text in
+  let pos = ref 0 in
+  while !pos < len do
+    if text.[!pos] = '"' then begin
+      (* quoted argument — copy characters up to the closing quote *)
+      incr pos;
+      while !pos < len && text.[!pos] <> '"' do
+        Buffer.add_char buffer text.[!pos];
+        incr pos
+      done;
+      if !pos < len then incr pos
+    end else if text.[!pos] = ' ' || text.[!pos] = '\t' then begin
+      (* unquoted whitespace — skip (arguments are concatenated) *)
+      incr pos
+    end else begin
+      (* regular character — copy to output *)
+      Buffer.add_char buffer text.[!pos];
+      incr pos
+    end
+  done;
+  Buffer.contents buffer
+
+(* convenience: strip escapes and trim whitespace *)
+let strip_groff line =
+  let text = strip_groff_escapes line in
+  String.trim text
+
+(* --- line classification ---
+ * every line in a manpage is classified as one of four types.
+ * this classification drives all subsequent parsing — strategies
+ * pattern-match on sequences of classified lines. *)
+
+type groff_line =
+  | Macro of string * string (* macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") *)
+  | Text of string (* plain text after groff stripping *)
+  | Blank (* empty line *)
+  | Comment (* groff comment: .backslash-quote or backslash-quote *)
+
+(* classify a single line of manpage source.
+ * macro lines start with '.' or '\'' (groff alternate control char).
+ * the macro name is split from its arguments at the first space or tab,
+ * whichever comes first.
+ * an argument string that is one single double-quoted argument is unquoted.
+ * lines with several quoted arguments keep their quotes, because
+ * strip_inline_macro_args needs them balanced to find argument boundaries.
+ * anything else is escape-stripped and becomes Text, or Blank when nothing
+ * is left after stripping. *)
+let classify_line line =
+  let len = String.length line in
+  if len = 0 then Blank
+  else if len >= 2 && line.[0] = '.' && line.[1] = '\\' && (len < 3 || line.[2] = '"') then
+    Comment
+  else if len >= 3 && line.[0] = '\\' && line.[1] = '"' then
+    Comment
+  else if line.[0] = '.' || line.[0] = '\'' then begin
+    (* macro line — drop the control character and surrounding blanks *)
+    let rest = String.trim (String.sub line 1 (len - 1)) in
+    let rest_len = String.length rest in
+    (* index of the earliest space or tab; looking for a space first and a
+       tab only as a fallback would glue a tab-separated first argument onto
+       the macro name whenever a later space exists *)
+    let rec first_blank index =
+      if index >= rest_len then None
+      else
+        match rest.[index] with
+        | ' ' | '\t' -> Some index
+        | _ -> first_blank (index + 1)
+    in
+    match first_blank 0 with
+    | Some split_at ->
+      let name = String.sub rest 0 split_at in
+      let args = String.trim (String.sub rest (split_at + 1) (rest_len - split_at - 1)) in
+      (* strip surrounding quotes only when they delimit one argument *)
+      let args =
+        let alen = String.length args in
+        if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"'
+           && not (String.contains (String.sub args 1 (alen - 2)) '"')
+        then String.sub args 1 (alen - 2)
+        else args
+      in
+      Macro (name, args)
+    | None ->
+      Macro (rest, "")
+  end else begin
+    let stripped = strip_groff line in
+    if String.length stripped = 0 then Blank
+    else Text stripped
+  end
+
+(* refined comment detection — the base classify_line may miss some comment
+ * forms, so this wrapper checks more carefully before falling through to
+ * the general classifier. *)
+let is_comment_line line =
+  let len = String.length line in
+  (len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"')
+  || (len >= 2 && line.[0] = '\\' && line.[1] = '"')
+
+(* public classifier: comment check first, then the general rules above *)
+let classify_line line =
+  if is_comment_line line then Comment
+  else classify_line line
+
+(* --- section extraction ---
+ * manpages are divided into sections by .SH macros. the OPTIONS section
+ * contains the flag definitions we want. if there's no OPTIONS section,
+ * we fall back to DESCRIPTION (some simple tools put flags there).
+ *
+ * old-style nix manpages (nix-build, nix-env-install, etc.) split flags
+ * across multiple .SH sections with option-like names: e.g. "Options" for
+ * command-specific flags and "Common Options" for flags shared by all nix
+ * commands.
collecting only the first such section misses the majority of
+ * flags, so we collect and concatenate all option-like sections. *)
+
+let extract_options_section lines =
+  let classified = List.map classify_line lines in
+  (* split a line list at the next .SH header: returns the lines before it
+   * and the remainder, which still begins with that header (or is empty). *)
+  let take_section items =
+    let rec go collected = function
+      | [] -> (List.rev collected, [])
+      | (Macro ("SH", _) :: _) as remaining -> (List.rev collected, remaining)
+      | item :: tail -> go (item :: collected) tail
+    in
+    go [] items
+  in
+  (* a heading counts as option-like when its upper-cased, trimmed title
+   * contains OPTION anywhere: OPTIONS, COMMON OPTIONS, GLOBAL OPTIONS, ... *)
+  let option_word = Str.regexp_string "OPTION" in
+  let is_options_section title =
+    let upper = String.uppercase_ascii (String.trim title) in
+    match Str.search_forward option_word upper 0 with
+    | _ -> true
+    | exception Not_found -> false
+  in
+  (* concatenate the bodies of every option-like section, in document order.
+   * once something has been gathered, each further section is preceded by an
+   * empty .SH marker so that collectors which stop at section boundaries do
+   * not run a description from one section into the next. *)
+  let rec gather merged = function
+    | [] -> merged
+    | Macro ("SH", title) :: tail when is_options_section title ->
+      let (body, remaining) = take_section tail in
+      let merged =
+        match merged with
+        | [] -> body
+        | _ -> merged @ (Macro ("SH", "") :: body)
+      in
+      gather merged remaining
+    | _ :: tail -> gather merged tail
+  in
+  (* body of the first DESCRIPTION section, or nothing when there is none *)
+  let rec description_section = function
+    | [] -> []
+    | Macro ("SH", title) :: tail
+      when String.uppercase_ascii (String.trim title) = "DESCRIPTION" ->
+      fst (take_section tail)
+    | _ :: tail -> description_section tail
+  in
+  match gather [] classified with
+  | [] -> description_section classified
+  | merged -> merged
+
+(* --- strategy-based entry extraction ---
+ * rather than a single monolithic parser, we use multiple "strategies" that
+ * each target a specific groff formatting pattern. this is necessary because
+ * manpage authors use very different macro combinations for the same purpose.
+ *
+ * the shared building blocks:
+ * - collect_text_lines: gather consecutive Text lines into one description string
+ * - parse_tag_to_entry: run the angstrom switch parser on a tag string to
+ * extract the flag definition. this reuses the same parser that handles
+ * --help output, giving consistent extraction across both sources.
+ * - tag_of_macro: extract the "tag" text from formatting macros like .B, .BI, etc.
+ *)
+
+(* consume the leading run of Text lines.
+ * acc holds fragments gathered so far, newest first. returns the fragments
+ * joined with single spaces in reading order, together with the unconsumed
+ * tail that starts at the first non-Text line. *)
+let rec collect_text_lines lines acc =
+  match lines with
+  | Text fragment :: remaining -> collect_text_lines remaining (fragment :: acc)
+  | _ -> (String.concat " " (List.rev acc), lines)
+
+(* attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry.
+ * uses the angstrom switch_parser + param_parser from parser.ml.
+ * returns None if the tag doesn't look like a flag definition.
*)
+let parse_tag_to_entry tag desc =
+  let cleaned = String.trim (strip_groff_escapes tag) in
+  (* a switch followed by its parameter, parsed as one unit *)
+  let flag_parser =
+    Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser
+  in
+  match Angstrom.parse_string ~consume:Angstrom.Consume.Prefix flag_parser cleaned with
+  | Ok (switch, param) -> Some { switch; param; desc }
+  | Error _ -> None
+
+(* turn the arguments of a font macro into plain tag text.
+ * .B and .I carry a single argument, so their spacing is kept as written;
+ * every other macro is treated as alternating-font and has its arguments
+ * concatenated first. escapes are stripped and the result trimmed either way. *)
+let tag_of_macro name args =
+  let flattened =
+    match name with
+    | "B" | "I" -> args
+    | _ -> strip_inline_macro_args args
+  in
+  String.trim (strip_groff_escapes flattened)
+
+(* strategy a: .TP style (most common — gnu coreutils, help2man).
+ * a .TP opens a tagged paragraph: the line right after it is the tag (the
+ * flag name) and the Text lines after that form the description. the tag is
+ * either plain text or one of the font macros .B, .I, .BI, .BR, .IR.
+ *
+ * example groff:
+ * .TP
+ * \fB\-v\fR, \fB\-\-verbose\fR
+ * increase verbosity
+ *
+ * a .TP followed by anything else is skipped, and scanning resumes at the
+ * line directly after the .TP. tags that do not parse as a flag are dropped. *)
+let strategy_tp lines =
+  (* the tag text and the lines after it, when the line following a .TP has
+   * one of the recognised shapes *)
+  let tag_after_tp = function
+    | Text tag :: following -> Some (tag, following)
+    | Macro (("B" | "I" | "BI" | "BR" | "IR") as macro_name, args) :: following ->
+      Some (tag_of_macro macro_name args, following)
+    | _ -> None
+  in
+  let rec walk acc = function
+    | [] -> List.rev acc
+    | Macro ("TP", _) :: rest ->
+      (match tag_after_tp rest with
+       | Some (tag, following) ->
+         let (desc, remaining) = collect_text_lines following [] in
+         let acc =
+           match parse_tag_to_entry tag desc with
+           | Some entry -> entry :: acc
+           | None -> acc
+         in
+         walk acc remaining
+       | None -> walk acc rest)
+    | _ :: rest -> walk acc rest
+  in
+  walk [] lines
+
+(* strategy b: .IP style (curl, hand-written manpages).
+ * .IP takes an inline tag argument: .IP "-v, --verbose"
+ * the description follows as text lines.
simpler than .TP because
+ * the tag is on the macro line itself. *)
+let strategy_ip lines =
+  let rec walk acc = function
+    | [] -> List.rev acc
+    | Macro ("IP", raw_tag) :: rest ->
+      let tag = strip_groff_escapes raw_tag in
+      let (desc, remaining) = collect_text_lines rest [] in
+      (match parse_tag_to_entry tag desc with
+       | Some entry -> walk (entry :: acc) remaining
+       | None -> walk acc remaining)
+    | _ :: rest -> walk acc rest
+  in
+  walk [] lines
+
+(* strategy c: .PP + .RS/.RE style (git, docbook-generated manpages).
+ * an entry is a .PP immediately followed by a plain-text tag line. the
+ * description is any Text directly after the tag plus the Text found inside
+ * the .RS ... .RE block that follows. a .PP not followed by a Text line is
+ * ignored. *)
+let strategy_pp_rs lines =
+  let join parts = String.concat " " (List.rev parts) in
+  (* before the indent block: gather Text until .RS opens the block; any
+   * other line ends the description and is handed back unconsumed *)
+  let rec before_rs parts = function
+    | Macro ("RS", _) :: tail -> inside_rs parts tail
+    | Text text :: tail -> before_rs (text :: parts) tail
+    | remaining -> (join parts, remaining)
+  (* inside the indent block: gather Text until .RE closes it. a .PP or .SH
+   * also ends the block but is left in place for the caller; every other
+   * line is skipped *)
+  and inside_rs parts = function
+    | Macro ("RE", _) :: tail -> (join parts, tail)
+    | Text text :: tail -> inside_rs (text :: parts) tail
+    | (Macro (("PP" | "SH"), _) :: _) as remaining -> (join parts, remaining)
+    | _ :: tail -> inside_rs parts tail
+    | [] -> (join parts, [])
+  in
+  let rec walk acc = function
+    | [] -> List.rev acc
+    | Macro ("PP", _) :: Text tag :: after_tag ->
+      let (desc, remaining) = before_rs [] after_tag in
+      (match parse_tag_to_entry tag desc with
+       | Some entry -> walk (entry :: acc) remaining
+       | None -> walk acc remaining)
+    | _ :: rest -> walk acc rest
+  in
+  walk [] lines
+
+(* strategy d: deroff fallback — strip all groff markup, then feed the
+ * resulting plain
text through the --help parser from parser.ml.
+ * this is the last resort when no structured macro pattern is recognized.
+ * it works surprisingly well for simple manpages but may miss entries
+ * in heavily formatted ones. *)
+let strategy_deroff_lines lines =
+  (* plain-text rendering of one classified line, or None when the line
+   * contributes nothing: Text passes through, font macros are flattened and
+   * escape-stripped, Blank becomes an empty line, the rest is dropped *)
+  let plain_line = function
+    | Text text -> Some text
+    | Macro (("BI" | "BR" | "IR" | "B" | "I"), args) ->
+      Some (strip_groff_escapes (strip_inline_macro_args args))
+    | Blank -> Some ""
+    | _ -> None
+  in
+  (* every rendered line is newline-terminated, the last one included *)
+  let text =
+    lines
+    |> List.filter_map plain_line
+    |> List.map (fun rendered -> rendered ^ "\n")
+    |> String.concat ""
+  in
+  match parse_help text with
+  | Ok result -> result.entries
+  | Error _ -> []
+
+(* strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks.
+ * nix's manpages use .IP with bullet markers for flag entries, interleaved
+ * with .UR/.UE hyperlink macros. the flag tag is in text lines after the
+ * bullet .IP, and the description follows a non-bullet .IP marker.
+ *
+ * nix manpages nest .RS/.RE blocks inside descriptions for sub-examples.
+ * the skip_rs helper tracks nesting depth to skip these without losing
+ * the rest of the description.
*)
+let strategy_nix lines =
+  (* a bullet .IP has non-empty args (the bullet marker) *)
+  let is_bullet_ip args =
+    String.length (String.trim args) > 0
+  in
+  (* walk the section; acc holds the entries found so far, newest first *)
+  let rec walk lines acc =
+    match lines with
+    | [] -> List.rev acc
+    | Macro ("IP", args) :: rest when is_bullet_ip args ->
+      (* collect tag: skip .UR/.UE macros, collect Text lines.
+       * stops at the first line that is neither, leaving it unconsumed *)
+      let rec collect_tag lines parts =
+        match lines with
+        | Macro ("UR", _) :: rest2 -> collect_tag rest2 parts
+        | Macro ("UE", _) :: rest2 -> collect_tag rest2 parts
+        | Text text :: rest2 -> collect_tag rest2 (text :: parts)
+        | _ -> (String.concat " " (List.rev parts), lines)
+      in
+      let (tag, rest2) = collect_tag rest [] in
+      (* collect description after the description .IP marker.
+       * when the tag is not directly followed by a non-bullet .IP the
+       * description is empty and no further line is consumed *)
+      let rec collect_desc lines parts =
+        match lines with
+        | Macro ("IP", dargs) :: rest3 when not (is_bullet_ip dargs) ->
+          collect_desc_text rest3 parts
+        | _ -> (String.concat " " (List.rev parts), lines)
+      (* gather description Text. arm order matters: the bullet .IP case
+       * must be tested before the generic .IP case, and the specific macro
+       * cases before the catch-all Macro case *)
+      and collect_desc_text lines parts =
+        match lines with
+        | Text text :: rest3 -> collect_desc_text rest3 (text :: parts)
+        | Macro ("IP", args2) :: _ when is_bullet_ip args2 ->
+          (* next bullet entry — stop collecting *)
+          (String.concat " " (List.rev parts), lines)
+        | Macro (("SS" | "SH"), _) :: _ ->
+          (* section boundary — stop collecting *)
+          (String.concat " " (List.rev parts), lines)
+        | Macro ("RS", _) :: rest3 ->
+          (* nested block — its contents are skipped, not added to parts *)
+          skip_rs rest3 parts 1
+        | Macro ("IP", _) :: rest3 ->
+          (* non-bullet .IP = continuation paragraph *)
+          collect_desc_text rest3 parts
+        | Macro _ :: rest3 -> collect_desc_text rest3 parts
+        | Blank :: rest3 -> collect_desc_text rest3 parts
+        | Comment :: rest3 -> collect_desc_text rest3 parts
+        | [] -> (String.concat " " (List.rev parts), [])
+      (* discard lines until the .RE that closes the outermost open .RS.
+       * depth counts the currently open .RS blocks. if the input ends
+       * before the block is closed, everything up to the end is discarded *)
+      and skip_rs lines parts depth =
+        match lines with
+        | Macro ("RE", _) :: rest3 ->
+          if depth <= 1 then collect_desc_text rest3 parts
+          else skip_rs rest3 parts (depth - 1)
+        | Macro ("RS", _) :: rest3 -> skip_rs rest3 parts (depth + 1)
+        | _ :: rest3 -> skip_rs rest3 parts depth
+        | [] -> (String.concat " " (List.rev parts), [])
+      in
+      let (desc, rest3) = collect_desc rest2 [] in
+      let entry = parse_tag_to_entry tag desc in
+      (* a tag that does not parse as a flag is dropped; scanning resumes
+       * after its description either way *)
+      walk rest3 (match entry with Some e -> e :: acc | None -> acc)
+    | _ :: rest -> walk rest acc
+  in
+  walk lines []
+
+(* count occurrences of a specific macro in the section.
+ * name is compared for exact equality with each macro name; lines that are
+ * not macros never count.
+ * used by extract_entries to decide which strategies are worth trying. *)
+let count_macro name lines =
+  List.fold_left (fun count line ->
+    match line with Macro (macro_name, _) when macro_name = name -> count + 1 | _ -> count
+  ) 0 lines
+
+(* auto-detect and try strategies, return the one with most entries.
+ * first counts macros to determine which strategies are applicable,
+ * then runs all applicable ones and picks the winner by entry count.
+ * if no specialized strategy produces results, falls back to deroff.
+ *
+ * this "try everything, pick the best" approach is intentional.
+ * manpage formatting is too varied and inconsistent to reliably detect the
+ * format from macro counts alone. running multiple strategies and comparing
+ * results is more robust.
*)
+let extract_entries lines =
+  let tp = count_macro "TP" lines
+  and ip = count_macro "IP" lines
+  and pp = count_macro "PP" lines
+  and rs = count_macro "RS" lines
+  and ur = count_macro "UR" lines in
+  (* build a list of (label, entries) for each applicable strategy. the label
+   * only identifies the strategy for the reader; it is not part of the result *)
+  let specialized = List.filter_map Fun.id [
+    (if tp > 0 then Some ("TP", strategy_tp lines) else None);
+    (if ip > 0 then Some ("IP", strategy_ip lines) else None);
+    (if pp > 0 && rs > 0 then Some ("PP+RS", strategy_pp_rs lines) else None);
+    (if ur > 0 && ip > 0 then Some ("nix", strategy_nix lines) else None);
+  ] in
+  (* filter to strategies that found at least one entry, fall back to deroff *)
+  let candidates = match List.filter (fun (_, entries) -> entries <> []) specialized with
+    | [] -> [("deroff", strategy_deroff_lines lines)]
+    | filtered -> filtered
+  in
+  (* pick the strategy with the most entries. on a tie the later candidate
+   * wins (>=), so among equally productive strategies the one listed last is
+   * returned. the accumulator keeps each label paired with its own entries,
+   * and compare_lengths stops at the end of the shorter list instead of
+   * measuring both lists in full on every step *)
+  List.fold_left (fun ((_, best) as current) ((_, entries) as candidate) ->
+    if List.compare_lengths entries best >= 0 then candidate else current
+  ) ("none", []) candidates |> snd
+
+(* --- NAME section description extraction ---
+ * the NAME section in manpages follows the convention:
+ * "command \- short description"
+ * we extract the part after "\-" as the command's description.
+ * handles both "\-" (groff) and " - " (plain text) separators. 
*)
+
+let extract_name_description contents =
+  (* keep a fragment only when it is non-empty *)
+  let push fragment acc = if fragment = "" then acc else fragment :: acc in
+  (* join the fragments (collected newest first) and return what follows the
+   * first separator. NAME lines look like: "git-add \- Add file contents to the index" *)
+  let finish fragments =
+    let full = String.trim (String.concat " " (List.rev fragments)) in
+    let sep = Str.regexp {| *\\- *\| +- +|} in
+    match Str.bounded_split sep full 2 with
+    | [_; desc] -> Some (String.trim desc)
+    | _ -> None
+  in
+  (* gather the NAME section body up to the next .SH or the end of input *)
+  let rec collect acc = function
+    | [] | Macro ("SH", _) :: _ -> finish acc
+    | Text text :: rest -> collect (text :: acc) rest
+    | Macro (("B" | "BI" | "BR" | "I" | "IR"), args) :: rest ->
+      let text = String.trim (strip_groff_escapes (strip_inline_macro_args args)) in
+      collect (push text acc) rest
+    | Macro ("Nm", args) :: rest ->
+      collect (push (String.trim (strip_groff_escapes args)) acc) rest
+    | Macro ("Nd", args) :: rest ->
+      (* mdoc keeps the description in .Nd; prefix it with the groff separator *)
+      let text = String.trim (strip_groff_escapes args) in
+      collect (if text = "" then acc else ("\\- " ^ text) :: acc) rest
+    | _ :: rest -> collect acc rest
+  in
+  (* locate the first .SH NAME header *)
+  let rec find = function
+    | [] -> None
+    | Macro ("SH", args) :: rest
+      when String.uppercase_ascii (String.trim args) = "NAME" -> collect [] rest
+    | _ :: rest -> find rest
+  in
+  String.split_on_char '\n' contents |> List.map classify_line |> find
+
+(* --- SYNOPSIS command name extraction ---
+ * the SYNOPSIS section shows how to invoke the command:
+ * .SH SYNOPSIS
+ * .B git add
+ * [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI\fR...]
+ *
+ * the command name is the run of leading "word" tokens on the first usable
+ * line; it ends at the first token that looks like an argument
+ * (starts with [, <, -, etc.). *)
+
+let extract_synopsis_command_lines lines =
+  (* italic text (\fI...\fR) marks a parameter/placeholder in groff, not a
+   * command word. rewrite it as <...> before classification strips the font
+   * info: '<' is in the stop set of extract_cmd, so "nix-env \fIoperation\fR"
+   * yields "nix-env" rather than "nix-env operation". *)
+  let italic = Str.regexp {|\\fI\([^\\]*\)\\f[RP]|} in
+  let classified =
+    lines
+    |> List.map (fun line -> Str.global_replace italic {|<\1>|} line)
+    |> List.map classify_line
+  in
+  let is_cmd_char = function
+    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true
+    | _ -> false
+  in
+  (* a token opening with one of these characters begins the argument list *)
+  let starts_argument word =
+    word <> "" &&
+    (match word.[0] with
+     | '[' | '-' | '<' | '(' | '{' -> true
+     | _ -> false)
+  in
+  (* leading tokens that look like command name parts *)
+  let rec take = function
+    | word :: rest
+      when not (starts_argument word) && String.for_all is_cmd_char word ->
+      word :: take rest
+    | _ -> []
+  in
+  let extract_cmd line =
+    let words =
+      String.split_on_char ' ' (String.trim line)
+      |> List.filter (fun word -> word <> "") in
+    match take words with
+    | [] -> None
+    | cmd -> Some (String.concat " " cmd)
+  in
+  (* only the first Text or bold-macro line of the section is examined *)
+  let rec collect = function
+    | [] | Macro ("SH", _) :: _ -> None
+    | Text text :: _ ->
+      (match String.trim text with
+       | "" -> None
+       | text -> extract_cmd text)
+    | Macro (("B" | "BI" | "BR"), args) :: _ ->
+      (match strip_inline_macro_args args |> strip_groff_escapes |> String.trim with
+       | "" -> None
+       | text -> extract_cmd text)
+    | _ :: rest -> collect rest
+  in
+  let rec find = function
+    | [] -> None
+    | Macro ("SH", args) :: rest
+      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" -> collect rest
+    | _ :: rest -> find rest
+  in
+  find classified
+
+(* same as extract_synopsis_command_lines, starting from the raw contents *)
+let extract_synopsis_command contents =
+  extract_synopsis_command_lines (String.split_on_char '\n' contents)
+
+(* --- SYNOPSIS positional extraction ---
+ * 
extract positional arguments from the SYNOPSIS section by collecting
+ * all text/formatting macro lines, joining them, skipping the command
+ * name prefix, then running parse_usage_args from parser.ml on the remainder. *)
+
+let extract_synopsis_positionals_lines lines =
+  (* keep a fragment only when it is non-empty *)
+  let push fragment acc = if fragment = "" then acc else fragment :: acc in
+  (* join the fragments (collected newest first), drop the command name
+   * prefix and parse what is left as usage arguments *)
+  let finish fragments =
+    match String.trim (String.concat " " (List.rev fragments)) with
+    | "" -> []
+    | full ->
+      let cmd_end = skip_command_prefix full in
+      parse_usage_args (String.sub full cmd_end (String.length full - cmd_end))
+  in
+  (* the synopsis ends at the next .SH, .SS or .br, or at the end of input *)
+  let rec collect acc = function
+    | [] | Macro (("SH" | "SS" | "br"), _) :: _ -> finish acc
+    | Text text :: rest ->
+      collect (push (String.trim (strip_groff_escapes text)) acc) rest
+    | Macro (("B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI"), args) :: rest ->
+      let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
+      collect (push text acc) rest
+    | _ :: rest -> collect acc rest
+  in
+  (* locate the first .SH SYNOPSIS header; no header means no positionals *)
+  let rec find = function
+    | [] -> []
+    | Macro ("SH", args) :: rest
+      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" -> collect [] rest
+    | _ :: rest -> find rest
+  in
+  find (List.map classify_line lines)
+
+(* --- mdoc (bsd) format support ---
+ * mdoc is the bsd manpage macro package. it uses semantic macros rather than
+ * presentation macros:
+ * .Fl v -> flag: -v
+ * .Ar file -> argument: file
+ * .Op ... -> optional: [...]
+ * .Bl/.It/.El -> list begin/item/end
+ * .Sh -> section header (note lowercase 'h', vs groff's .SH)
+ *
+ * the parser walks through classified lines looking for .Bl (list begin)
+ * blocks containing .It (items) with .Fl (flag) entries. 
*)
+
+(* a page is treated as mdoc when any of its lines classifies as an .Sh macro
+ * (groff man pages use the upper-case .SH instead) *)
+let is_mdoc lines =
+  List.exists (fun line ->
+    match classify_line line with Macro ("Sh", _) -> true | _ -> false
+  ) lines
+
+(* extract renderable text from an mdoc line, skipping structural macros.
+ * Text lines are returned with groff escapes stripped; for any other macro
+ * the escaped-stripped, trimmed arguments are returned unless they are empty.
+ * everything else yields None. *)
+let mdoc_text_of line =
+  match line with
+  | Text text -> Some (strip_groff_escapes text)
+  | Macro (macro_name, args) ->
+    (match macro_name with
+     | "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt"
+     | "Oo" | "Oc" | "Op" -> None
+     | _ ->
+       let text = strip_groff_escapes args |> String.trim in
+       if text = "" then None else Some text)
+  | _ -> None
+
+(* parse an mdoc .It (list item) line that contains flag definitions.
+ * mdoc .It lines look like: ".It Fl v Ar file"
+ * where Fl = flag, Ar = argument. we extract the flag name and parameter.
+ *
+ * only handles single-char short flags and long flags starting with '-'.
+ * mdoc's .Fl macro automatically prepends '-', so "Fl v" means "-v"
+ * and "Fl -verbose" means "--verbose".
+ *
+ * args is the text after ".It". the "Ns" (no-space) token is dropped before
+ * matching. a parameter is recognised only as "Ar name" in third and fourth
+ * position. returns None when the item does not start with a usable Fl. the
+ * returned entry has an empty description. *)
+let parse_mdoc_it args =
+  let words = String.split_on_char ' ' args
+    |> List.filter (fun word -> word <> "" && word <> "Ns") in
+  let param = match words with
+    | _ :: _ :: "Ar" :: param_name :: _ -> Some (Mandatory param_name)
+    | _ -> None
+  in
+  match words with
+  | "Fl" :: char_str :: _ when String.length char_str = 1 && is_alphanumeric char_str.[0] ->
+    Some { switch = Short char_str.[0]; param; desc = "" }
+  | "Fl" :: name :: _ when String.length name > 1 && name.[0] = '-' ->
+    Some { switch = Long (String.sub name 1 (String.length name - 1)); param; desc = "" }
+  | _ -> None
+
+(* extract a positional argument from an mdoc line (.Ar or .Op Ar).
+ * args is the macro's argument text: "file ..." for ".Ar file ...", or
+ * "Ar file ..." for ".Op Ar file ...". the name is lower-cased, names shorter
+ * than two characters are rejected, and the positional is variadic when a
+ * "..." token appears anywhere in the arguments. *)
+let positional_of_mdoc_line optional args =
+  let words = String.split_on_char ' ' args
+    |> List.filter (fun word -> word <> "") in
+  (* for ".Op Ar file" the text after .Op still starts with the Ar macro name.
+   * skip it, otherwise every optional positional would be named "ar" *)
+  let words = match words with
+    | "Ar" :: rest -> rest
+    | _ -> words
+  in
+  match words with
+  | name :: _ when String.length name >= 2 ->
+    Some { pos_name = String.lowercase_ascii name;
+           optional; variadic = List.mem "..." words }
+  | _ -> None
+
+(* parse an entire mdoc-format manpage. 
+ * walks through all classified lines looking for:
+ * 1. .Bl/.It/.El list blocks containing flag definitions
+ * 2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar)
+ *
+ * the scan function handles nested .Bl blocks — if the first .It in a .Bl
+ * starts with .Fl (a flag), the entire list is parsed as options. otherwise
+ * the list is skipped (it might be an example list or a description list).
+ *
+ * takes the raw lines (they are classified here). the result carries the
+ * flag entries in document order, the positionals deduplicated by name with
+ * the first occurrence kept, no subcommands and an empty description.
+ *
+ * NOTE(review): both skipping and option-list parsing stop at the first .El
+ * they meet, so a list nested inside another one ends the outer list early —
+ * confirm this is acceptable. *)
+let parse_mdoc_lines lines =
+  let classified = List.map classify_line lines in
+  (* skip lines until the matching .El closing tag; returns what follows it,
+   * or nothing when no .El is left *)
+  let rec skip_to_el = function
+    | [] -> []
+    | Macro ("El", _) :: rest -> rest
+    | _ :: rest -> skip_to_el rest
+  in
+  (* collect description text lines until the next structural macro
+   * (.It, .El, .Sh, .Ss), which is left unconsumed. acc is newest first *)
+  let rec collect_desc acc = function
+    | [] -> (acc, [])
+    | (Macro ("It", _) | Macro ("El", _)
+      | Macro ("Sh", _) | Macro ("Ss", _)) :: _ as rest -> (acc, rest)
+    | line :: rest ->
+      collect_desc (match mdoc_text_of line with Some text -> text :: acc | None -> acc) rest
+  in
+  (* convenience: collect desc and join into a trimmed string *)
+  let desc_of rest =
+    let parts, rest = collect_desc [] rest in
+    (String.concat " " (List.rev parts) |> String.trim, rest)
+  in
+  (* parse a single .It entry: extract flag, collect description. the
+   * description lines are consumed even when the item is not a flag *)
+  let parse_it args rest entries =
+    let desc, rest = desc_of rest in
+    let entries = match parse_mdoc_it args with
+      | Some entry -> { entry with desc } :: entries
+      | None -> entries
+    in
+    (entries, rest)
+  in
+  (* parse all .It entries within a .Bl/.El option list *)
+  let rec parse_option_list entries = function
+    | [] -> (entries, [])
+    | Macro ("El", _) :: rest -> (entries, rest)
+    | Macro ("It", args) :: rest ->
+      let entries, rest = parse_it args rest entries in
+      parse_option_list entries rest
+    | _ :: rest -> parse_option_list entries rest
+  in
+  (* main scan: walk through all lines, collecting flags and positionals.
+   * both accumulators are built newest first *)
+  let rec scan entries positionals = function
+    | [] -> (entries, positionals)
+    | Macro ("Bl", _) :: Macro ("It", it_args) :: rest ->
+      (* peek at first .It to decide if this is a flag list. the .It must
+       * directly follow the .Bl for the list to be considered at all *)
+      let words = String.split_on_char ' ' it_args
+        |> List.filter (fun word -> word <> "") in
+      if (match words with "Fl" :: _ -> true | _ -> false) then
+        let entries, rest = parse_it it_args rest entries in
+        let entries, rest = parse_option_list entries rest in
+        scan entries positionals rest
+      else
+        scan entries positionals (skip_to_el rest)
+    | Macro ("Bl", _) :: rest -> scan entries positionals (skip_to_el rest)
+    | Macro ("Sh", args) :: rest
+      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" ->
+      let positionals, rest = parse_synopsis positionals rest in
+      scan entries positionals rest
+    | _ :: rest -> scan entries positionals rest
+  and parse_synopsis positionals = function (* runs until the next .Sh, which is left for scan *)
+    | [] -> (positionals, [])
+    | Macro ("Sh", _) :: _ as rest -> (positionals, rest)
+    | Macro ("Ar", args) :: rest ->
+      let positionals = match positional_of_mdoc_line false args with
+        | Some p -> p :: positionals | None -> positionals in
+      parse_synopsis positionals rest
+    | Macro ("Op", args) :: rest ->
+      let words = String.split_on_char ' ' args
+        |> List.filter (fun word -> word <> "") in
+      let positionals = match words with
+        | "Ar" :: _ -> (* args is passed on unchanged, i.e. still beginning with the Ar token *)
+          (match positional_of_mdoc_line true args with
+           | Some p -> p :: positionals | None -> positionals)
+        | _ -> positionals in
+      parse_synopsis positionals rest
+    | _ :: rest -> parse_synopsis positionals rest
+  in
+  let entries, positionals = scan [] [] classified in
+  (* deduplicate positionals by name, preserving order *)
+  let positionals =
+    List.rev positionals
+    |> List.fold_left (fun (seen, acc) p ->
+      if List.mem p.pos_name seen then (seen, acc)
+      else (p.pos_name :: seen, p :: acc)
+    ) ([], [])
+    |> snd |> List.rev
+  in
+  { entries = List.rev entries; subcommands = []; positionals; description = "" }
+
+(* --- COMMANDS section subcommand extraction ---
+ * some manpages (notably systemctl) have a 
dedicated COMMANDS section
+ * listing subcommands with descriptions. these use .PP + bold name +
+ * .RS/.RE blocks:
+ *   .PP
+ *   \fBstart\fR \fIUNIT\fR...
+ *   .RS 4
+ *   Start (activate) one or more units.
+ *   .RE
+ *
+ * we extract the bold command name and first sentence of description. *)
+
+let extract_commands_section lines =
+  (* does this .SH title name a commands section? *)
+  let is_commands_header title =
+    match String.uppercase_ascii (String.trim title) with
+    | "COMMANDS" | "COMMAND" -> true
+    | _ -> false
+  in
+  (* the body of a section: every line up to, and excluding, the next .SH *)
+  let rec section_body acc = function
+    | [] | Macro ("SH", _) :: _ -> List.rev acc
+    | line :: rest -> section_body (line :: acc) rest
+  in
+  (* gather the bodies of all COMMANDS/COMMAND sections, in document order *)
+  let rec gather bodies = function
+    | [] -> List.rev bodies
+    | Macro ("SH", title) :: rest when is_commands_header title ->
+      gather (section_body [] rest :: bodies) rest
+    | _ :: rest -> gather bodies rest
+  in
+  lines |> List.map classify_line |> gather [] |> List.concat
+
+(* extract subcommand name from a bold groff text like
+ * "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units"
+ *
+ * validates that the extracted name looks like a subcommand: lowercase,
+ * at least 2 chars, no leading dash. falls back to stripping all groff
+ * and taking the first word if no \fB...\fR wrapper is found. 
*)
+let extract_bold_command_name text =
+  let trimmed = String.trim text in
+  let len = String.length trimmed in
+  (* check whether a string looks like a valid subcommand name *)
+  let is_valid_subcmd name =
+    String.length name >= 2
+    && name.[0] <> '-'
+    && String.for_all (fun char_val ->
+      (char_val >= 'a' && char_val <= 'z')
+      || (char_val >= '0' && char_val <= '9')
+      || char_val = '-' || char_val = '_'
+    ) name
+  in
+  (* strip the groff escapes of a bold fragment and validate what is left *)
+  let candidate fragment =
+    let name = strip_groff_escapes ("\\fB" ^ fragment ^ "\\fR") |> String.trim in
+    if is_valid_subcmd name then Some name else None
+  in
+  (* look for \fB...\fR at the start *)
+  if len >= 4
+     && trimmed.[0] = '\\' && trimmed.[1] = 'f' && trimmed.[2] = 'B' then begin
+    let start = 3 in
+    (* index of the next font escape (\fR, \fP, \fI, ...) at or after i, or
+     * len when the bold run is never closed. the run must not be cut at the
+     * first backslash: names such as list\-units contain escapes themselves *)
+    let rec font_escape_from i =
+      if i + 1 >= len then len
+      else if trimmed.[i] = '\\' && trimmed.[i + 1] = 'f' then i
+      else font_escape_from (i + 1)
+    in
+    let bold_end = font_escape_from start in
+    let whole = String.sub trimmed start (bold_end - start) in
+    match candidate whole with
+    | Some _ as found -> found
+    | None ->
+      (* fall back to the text before the first backslash, so every input
+       * that produced a name before this change still produces the same one *)
+      (match String.split_on_char '\\' whole with
+       | prefix :: _ -> candidate prefix
+       | [] -> None)
+  end else begin
+    (* try already-stripped text — take the first word *)
+    let stripped = strip_groff_escapes trimmed in
+    let first_word = match String.split_on_char ' ' stripped with
+      | word :: _ -> word | [] -> "" in
+    if is_valid_subcmd first_word then Some first_word else None
+  end
+
+(* walk through commands section lines, extracting subcommand name+description
+ * pairs from .PP + Text + .RS/.RE blocks.
+ *
+ * lines are classified lines. only a Text line that directly follows a .PP
+ * is considered as a tag, and only when extract_bold_command_name accepts it.
+ * the stored description is the text before the first '.' of the collected
+ * description. subcommands are returned in document order. *)
+let extract_subcommands_from_commands lines =
+  let rec walk lines acc =
+    match lines with
+    | [] -> List.rev acc
+    | Macro ("PP", _) :: rest ->
+      begin match rest with
+      | Text tag :: rest2 ->
+        (* check if this is a subcommand (bold name, not a flag) *)
+        begin match extract_bold_command_name tag with
+        | Some name ->
+          (* collect description from .RS/.RE block. Text lines before the
+           * .RS are collected as well; any other line ends the description *)
+          let rec collect_desc lines desc_acc =
+            match lines with
+            | Macro ("RS", _) :: rest3 ->
+              collect_in_rs rest3 desc_acc
+            | Text text :: rest3 ->
+              collect_desc rest3 (text :: desc_acc)
+            | _ -> (String.concat " " (List.rev desc_acc), lines)
+          (* inside the block: stop at .RE (consumed) or at the next .PP, .SH
+           * or .SS (left for the caller); other macros are ignored *)
+          and collect_in_rs lines desc_acc =
+            match lines with
+            | Macro ("RE", _) :: rest3 ->
+              (String.concat " " (List.rev desc_acc), rest3)
+            | Text text :: rest3 ->
+              collect_in_rs rest3 (text :: desc_acc)
+            | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ | Macro ("SS", _) :: _ ->
+              (String.concat " " (List.rev desc_acc), lines)
+            | _ :: rest3 -> collect_in_rs rest3 desc_acc
+            | [] -> (String.concat " " (List.rev desc_acc), [])
+          in
+          let (desc, rest3) = collect_desc rest2 [] in
+          let desc = String.trim desc in
+          (* take first sentence as description *)
+          let short_desc = match String.split_on_char '.' desc with
+            | first :: _ when String.length first > 0 -> String.trim first
+            | _ -> desc in
+          let sc : subcommand = { name; desc = short_desc } in
+          walk rest3 (sc :: acc)
+        | None -> walk rest2 acc
+        end
+      | _ -> walk rest acc
+      end
+    | _ :: rest -> walk rest acc
+  in
+  walk lines []
+
+(* --- top-level api --- *)
+
+(* parse a manpage from its raw (not yet classified) lines.
+ * auto-detects mdoc vs groff format. for groff, runs the multi-strategy
+ * extraction pipeline: extract OPTIONS section -> try all strategies ->
+ * pick best -> extract SYNOPSIS positionals -> extract COMMANDS subcommands.
+ * the description field is always left empty here. *)
+let parse_manpage_lines lines =
+  if is_mdoc lines then
+    parse_mdoc_lines lines
+  else begin
+    let options_section = extract_options_section lines in
+    let entries = extract_entries options_section in
+    let positionals = extract_synopsis_positionals_lines lines in
+    let commands_section = extract_commands_section lines in
+    let subcommands = extract_subcommands_from_commands commands_section in
+    { entries; subcommands; positionals; description = "" }
+  end
+
+(* parse a manpage from its raw string contents.
+ * splits into lines, parses, then extracts the NAME section description. 
*)
+let parse_manpage_string contents =
+  let lines = String.split_on_char '\n' contents in
+  let result = parse_manpage_lines lines in
+  let description = match extract_name_description contents with
+    | Some desc -> desc | None -> "" in (* no usable NAME section: empty description *)
+  { result with description }
+
+(* --- clap-style SUBCOMMAND section extraction ---
+ * manpages generated by clap (rust's cli arg parser) put each subcommand
+ * under its own .SH SUBCOMMAND header with a Usage: line giving the name.
+ * this is unusual — most tools list subcommands under a single COMMANDS section.
+ *
+ * we collect all .SH SUBCOMMAND/SUBCOMMANDS sections, find the Usage: line
+ * in each to get the subcommand name, then extract flag entries from the
+ * section body. returns triples of (name, description, help_result).
+ *
+ * sections without a Usage: line are dropped. the name is the first word
+ * after "Usage: ". the description is made of the Text lines that come
+ * before the Usage: line, joined with spaces, with groff escapes and
+ * backticks removed; the same string is stored in the help_result. entries
+ * are extracted from the whole section body, subcommands and positionals of
+ * the help_result are always empty. *)
+let extract_subcommand_sections contents =
+  let lines = String.split_on_char '\n' contents in
+  let classified = List.map classify_line lines in
+  (* split into sections at .SH boundaries, keeping only SUBCOMMAND(S) sections.
+   * current_name is Some only while inside such a section; current_lines is
+   * that section's body, newest first *)
+  let rec collect_sections acc current_name current_lines = function
+    | [] ->
+      let acc = match current_name with
+        | Some section_name -> (section_name, List.rev current_lines) :: acc
+        | None -> acc in
+      List.rev acc
+    | Macro ("SH", args) :: rest ->
+      let acc = match current_name with
+        | Some section_name -> (section_name, List.rev current_lines) :: acc
+        | None -> acc in
+      let name = String.uppercase_ascii (String.trim args) in
+      if name = "SUBCOMMAND" || name = "SUBCOMMANDS" then
+        collect_sections acc (Some name) [] rest
+      else
+        collect_sections acc None [] rest
+    | line :: rest ->
+      collect_sections acc current_name (line :: current_lines) rest
+  in
+  let sections = collect_sections [] None [] classified in
+  (* for each SUBCOMMAND section, extract name from Usage: line and parse entries *)
+  let usage_re = Str.regexp {|Usage: \([a-zA-Z0-9_-]+\)|} in
+  let matches_usage text = (* matched_group reads the state left by the search just before it *)
+    try ignore (Str.search_forward usage_re text 0); Some (Str.matched_group 1 text)
+    with Not_found -> None in
+  List.filter_map (fun (_header, section_lines) ->
+    (* scan section lines for the Usage: line to get the subcommand name.
+     * once the name is known the remaining lines are passed over untouched *)
+    let name, desc_lines =
+      List.fold_left (fun (name, desc_lines) line ->
+        match name with
+        | Some _ -> (name, desc_lines)
+        | None ->
+          match line with
+          | Text text ->
+            (match matches_usage text with
+             | Some _ as found -> (found, desc_lines)
+             | None -> (None, text :: desc_lines))
+          | Macro (("TP" | "B" | "BI" | "BR"), args) -> (* searched for Usage:, never added to the description *)
+            let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
+            (matches_usage text, desc_lines)
+          | _ -> (None, desc_lines)
+      ) (None, []) section_lines in
+    match name with
+    | None -> None
+    | Some subcmd_name ->
+      let entries = extract_entries section_lines in
+      let desc = String.concat " " (List.rev desc_lines)
+        |> strip_groff_escapes |> String.trim in
+      (* strip backtick-quoted words *)
+      let desc = Str.global_replace (Str.regexp "`\\([^`]*\\)`") "\\1" desc in
+      Some (subcmd_name, desc, { entries; subcommands = []; positionals = []; description = desc })
+  ) sections
+
+(* read a manpage file from disk. handles .gz compressed files (the common
+ * case — most installed manpages are gzipped) using the Gzip library.
+ * plain text files are read directly. 
*)
+let read_manpage_file path =
+  if Filename.check_suffix path ".gz" then begin
+    let ic = Gzip.open_in path in
+    (* release the handle on every exit path, including a decompression error
+     * raised half-way through the file *)
+    Fun.protect ~finally:(fun () -> Gzip.close_in ic) (fun () ->
+      let buffer = Buffer.create 8192 in
+      let chunk = Bytes.create 8192 in
+      (* a read of 0 bytes signals the end of the stream *)
+      let rec drain () =
+        match Gzip.input ic chunk 0 (Bytes.length chunk) with
+        | 0 -> ()
+        | bytes_read ->
+          Buffer.add_subbytes buffer chunk 0 bytes_read;
+          drain ()
+      in
+      (try drain () with End_of_file -> ());
+      Buffer.contents buffer)
+  end else begin
+    (* binary mode: in_channel_length counts bytes on disk, which only matches
+     * what can be read back when no newline translation takes place *)
+    let ic = open_in_bin path in
+    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
+      really_input_string ic (in_channel_length ic))
+  end
+
+(* convenience: read + parse a manpage file in one step *)
+let parse_manpage_file path =
+  read_manpage_file path |> parse_manpage_string
diff --git a/lib/nushell.ml b/lib/nushell.ml
new file mode 100644
index 0000000..b5e4d4f
--- /dev/null
+++ b/lib/nushell.ml
@@ -0,0 +1,253 @@
+(* nushell.ml — generate nushell extern definitions from parsed help data.
+ *
+ * this module is the code generation backend. it takes a help_result (from
+ * the parser or manpage modules) and produces nushell source code that
+ * defines `extern` declarations — nushell's mechanism for teaching the shell
+ * about external commands' flags and subcommands so it can offer completions.
+ *
+ * it also maintains a list of nushell's built-in commands to avoid generating
+ * extern definitions that would shadow them. 
+ *
+ * key responsibilities:
+ * - deduplicating flag entries (same flag from multiple help sources)
+ * - mapping parameter names to nushell types (path, int, string)
+ * - formatting flags in nushell syntax: --flag(-f): type # description
+ * - handling positional arguments with nushell's ordering constraints
+ * - escaping special characters for nushell string literals
+ *)
+
+open Parser
+
+module SSet = Set.Make(String) (* string sets: builtin names, flag keys already emitted *)
+module SMap = Map.Make(String) (* flag key -> best entry seen for that key *)
+module CSet = Set.Make(Char) (* short-flag letters covered by a short+long entry *)
+
+(* nushell built-in commands and keywords — we must never generate `extern`
+ * definitions for these because it would shadow nushell's own implementations.
+ * this list is maintained manually and should be updated with new nushell releases.
+ * names are grouped by first letter; lookups go through builtin_set and
+ * compare whole names exactly. *)
+let nushell_builtins = [
+  "alias"; "all"; "ansi"; "any"; "append"; "ast"; "attr";
+  "bits"; "break"; "bytes";
+  "cal"; "cd"; "char"; "chunk-by"; "chunks"; "clear"; "collect";
+  "columns"; "commandline"; "compact"; "complete"; "config"; "const";
+  "continue"; "cp";
+  "date"; "debug"; "decode"; "def"; "default"; "describe"; "detect";
+  "do"; "drop"; "du";
+  "each"; "echo"; "encode"; "enumerate"; "error"; "every"; "exec";
+  "exit"; "explain"; "explore"; "export"; "export-env"; "extern";
+  "fill"; "filter"; "find"; "first"; "flatten"; "for"; "format"; "from";
+  "generate"; "get"; "glob"; "grid"; "group-by";
+  "hash"; "headers"; "help"; "hide"; "hide-env"; "histogram";
+  "history"; "http";
+  "if"; "ignore"; "input"; "insert"; "inspect"; "interleave"; "into";
+  "is-admin"; "is-empty"; "is-not-empty"; "is-terminal"; "items";
+  "job"; "join";
+  "keybindings"; "kill";
+  "last"; "length"; "let"; "let-env"; "lines"; "load-env"; "loop"; "ls";
+  "match"; "math"; "merge"; "metadata"; "mkdir"; "mktemp"; "module";
+  "move"; "mut"; "mv";
+  "nu-check"; "nu-highlight";
+  "open"; "overlay";
+  "panic"; "par-each"; "parse"; "path"; "plugin"; "port"; "prepend"; "print"; "ps";
+  "query";
+  "random"; "reduce"; "reject"; "rename"; "return"; "reverse"; "rm";
+  "roll"; "rotate"; "run-external";
+  "save"; "schema"; "scope"; "select"; "seq"; "shuffle"; "skip"; "sleep";
+  "slice"; "sort"; "sort-by"; "source"; "source-env"; "split"; "start";
+  "stor"; "str"; "sys";
+  "table"; "take"; "tee"; "term"; "timeit"; "to"; "touch"; "transpose";
+  "try"; "tutor";
+  "ulimit"; "umask"; "uname"; "uniq"; "uniq-by"; "unlet"; "update";
+  "upsert"; "url"; "use";
+  "values"; "version"; "view";
+  "watch"; "where"; "which"; "while"; "whoami"; "window"; "with-env"; "wrap";
+  "zip";
+]
+
+(* lazily constructed set for fast membership checks against builtins.
+ * the set is built from nushell_builtins on the first force and reused
+ * afterwards. *)
+let builtin_set = lazy (SSet.of_list nushell_builtins)
+
+(* returns true if the given command name collides with a nushell built-in.
+ * cmd is matched as a whole, case-sensitively; multi-word names such as
+ * "git add" therefore never match. *)
+let is_nushell_builtin cmd =
+  SSet.mem cmd (Lazy.force builtin_set)
+
+(* deduplicate flag entries that refer to the same flag.
+ * when the same flag appears multiple times (e.g. from overlapping manpage
+ * sections or repeated help text), we keep the "best" version using a score:
+ * - both short+long form present: +10 (most informative)
+ * - has a parameter: +5
+ * - description length bonus: up to +5
+ *
+ * after deduplication by long name, we also remove standalone short flags
+ * whose letter is already covered by a Both(short, long) entry. this prevents
+ * emitting both "-v" and "--verbose(-v)" which nushell would reject as a
+ * duplicate. the filtering preserves original ordering from the help text. 
*)
+let dedup_entries entries =
+  (* canonical key of an entry: "-c" for a lone short flag, "--name" for
+   * anything that has a long form *)
+  let key_of entry =
+    match entry.switch with
+    | Short c -> "-" ^ String.make 1 c
+    | Long name | Both (_, name) -> "--" ^ name
+  in
+  (* quality score used to choose between entries sharing a key *)
+  let score entry =
+    let form = match entry.switch with Both _ -> 10 | _ -> 0 in
+    let param = match entry.param with None -> 0 | Some _ -> 5 in
+    form + param + min 5 (String.length entry.desc / 10)
+  in
+  (* remember an entry unless an at least equally good one holds its key *)
+  let keep_better table entry =
+    let key = key_of entry in
+    match SMap.find_opt key table with
+    | Some prev when score prev >= score entry -> table
+    | Some _ | None -> SMap.add key entry table
+  in
+  let best = List.fold_left keep_better SMap.empty entries in
+  (* short letters that already appear in a kept short+long entry *)
+  let covered =
+    SMap.fold (fun _ entry letters ->
+      match entry.switch with
+      | Both (c, _) -> CSet.add c letters
+      | _ -> letters
+    ) best CSet.empty
+  in
+  (* walk the input in its original order: the first occurrence of a key
+   * emits the best entry for that key, repeated keys and covered lone shorts
+   * emit nothing *)
+  let rec emit seen acc = function
+    | [] -> List.rev acc
+    | entry :: rest ->
+      let key = key_of entry in
+      let covered_short =
+        match entry.switch with
+        | Short c -> CSet.mem c covered
+        | _ -> false
+      in
+      if SSet.mem key seen || covered_short then emit seen acc rest
+      else emit (SSet.add key seen) (SMap.find key best :: acc) rest
+  in
+  emit SSet.empty [] entries
+
+(* map parameter names to nushell types.
+ * nushell's `extern` declarations use typed parameters, so we infer the type
+ * from the parameter name. file/path-related names become "path" (enables
+ * path completion), numeric names become "int", everything else is "string". 
*)
+let nushell_type_of_param param =
+  let path_names =
+    [ "FILE"; "file"; "PATH"; "path"; "DIR"; "dir"; "DIRECTORY";
+      "FILENAME"; "PATTERNFILE" ]
+  and int_names =
+    [ "NUM"; "N"; "COUNT"; "NUMBER"; "int"; "INT"; "COLS"; "WIDTH";
+      "LINES"; "DEPTH"; "depth" ]
+  in
+  if List.mem param path_names then "path"
+  else if List.mem param int_names then "int"
+  else "string"
+
+(* make a string safe for a nushell double-quoted literal: every double quote
+ * and every backslash gets a backslash in front of it. strings containing
+ * neither are returned as they are. *)
+let escape_nu s =
+  let needs_escape = function
+    | '"' | '\\' -> true
+    | _ -> false
+  in
+  if not (String.exists needs_escape s) then s
+  else begin
+    let buf = Buffer.create (String.length s + 4) in
+    String.iter (fun c ->
+      if needs_escape c then Buffer.add_char buf '\\';
+      Buffer.add_char buf c
+    ) s;
+    Buffer.contents buf
+  end
+
+(* render one flag entry as a line of a nushell `extern` parameter list.
+ * the line is made of the flag name (nushell writes a combined short+long
+ * flag as --long(-s)), then ": type" when the flag takes a parameter, then,
+ * for a non-empty description, padding up to column 40 (always at least one
+ * space) followed by "# " and the description. *)
+let format_flag entry =
+  let name =
+    match entry.switch with
+    | Short short_char -> Printf.sprintf "-%c" short_char
+    | Long long_name -> Printf.sprintf "--%s" long_name
+    | Both (short_char, long_name) -> Printf.sprintf "--%s(-%c)" long_name short_char
+  in
+  let typed =
+    match entry.param with
+    | None -> ""
+    | Some (Mandatory p) | Some (Optional p) -> ": " ^ nushell_type_of_param p
+  in
+  let flag = " " ^ name ^ typed in
+  match entry.desc with
+  | "" -> flag
+  | desc ->
+    let padding = String.make (max 1 (40 - String.length flag)) ' ' in
+    flag ^ padding ^ "# " ^ desc
+
+(* format a positional argument as a nushell `extern` parameter line.
+ * nushell syntax: "...name: type" for variadic, "name?: type" for optional.
+ * hyphens in names are converted to underscores since nushell identifiers
+ * cannot contain hyphens. 
*)
+let format_positional positional =
+  let ident =
+    String.map (fun c -> if c = '-' then '_' else c) positional.pos_name in
+  (* variadic wins over optional: a rest parameter never carries the "?" *)
+  let decorated =
+    if positional.variadic then "..." ^ ident
+    else if positional.optional then ident ^ "?"
+    else ident
+  in
+  (* the type is looked up from the upper-cased original name *)
+  let typ = nushell_type_of_param (String.uppercase_ascii positional.pos_name) in
+  " " ^ decorated ^ ": " ^ typ
+
+(* make a positional list acceptable to nushell:
+ *   1. a required positional must not follow an optional one
+ *   2. only one variadic ("rest") parameter may be declared
+ *
+ * a required positional that comes after an optional one is quietly turned
+ * into an optional; a variadic one counts as optional for that purpose.
+ * every variadic parameter after the first is dropped. the list is processed
+ * once, front to back, and the order of the survivors is kept. *)
+let fixup_positionals positionals =
+  let rec go seen_optional seen_variadic acc = function
+    | [] -> List.rev acc
+    | positional :: rest when positional.variadic ->
+      if seen_variadic then go seen_optional seen_variadic acc rest
+      else go true true (positional :: acc) rest
+    | positional :: rest when seen_optional ->
+      go true seen_variadic ({ positional with optional = true } :: acc) rest
+    | positional :: rest ->
+      go positional.optional seen_variadic (positional :: acc) rest
+  in
+  go false false [] positionals
+
+(* generate the full nushell `extern` block for a command. 
+ * produces output like:
+ *   export extern "git add" [
+ *     ...pathspec?: path
+ *     --verbose(-v) # be verbose
+ *     --dry-run(-n) # dry run
+ *   ]
+ *
+ * subcommands that weren't resolved into their own full definitions get
+ * stub `extern` blocks with just a comment containing their description:
+ *   export extern "git stash" [ # stash changes
+ *   ]
+ *)
+let extern_of cmd_name result =
+  let quoted = escape_nu cmd_name in
+  let out = Buffer.create 256 in
+  let add_line text = Buffer.add_string out text; Buffer.add_char out '\n' in
+  (* main block: header, positionals first, then flags, then the closing bracket *)
+  Buffer.add_string out (Printf.sprintf "export extern \"%s\" [\n" quoted);
+  List.iter (fun p -> add_line (format_positional p)) (fixup_positionals result.positionals);
+  List.iter (fun e -> add_line (format_flag e)) (dedup_entries result.entries);
+  Buffer.add_string out "]\n";
+  (* one stub block per listed subcommand, each preceded by a blank line *)
+  List.iter (fun (sub : subcommand) ->
+    Buffer.add_string out
+      (Printf.sprintf "\nexport extern \"%s %s\" [ # %s\n]\n"
+         quoted (escape_nu sub.name) (escape_nu sub.desc))
+  ) result.subcommands;
+  Buffer.contents out
+
+(* public name for extern_of — the entry point callers are expected to use *)
+let generate_extern cmd_name result = extern_of cmd_name result
+
+(* build a nushell `module` name from a command name.
+ * letters, digits, '-' and '_' are kept; every other character becomes a
+ * hyphen, and "-completions" is appended.
+ * e.g. "git" becomes "git-completions", "docker-compose" becomes
+ * "docker-compose-completions". *)
+let module_name_of cmd_name =
+  let keep = function
+    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' -> true
+    | _ -> false
+  in
+  String.map (fun ch -> if keep ch then ch else '-') cmd_name ^ "-completions"
+
+(* generate a complete nushell `module` wrapping the `extern`.
+ * output: "module git-completions { ... }\n\nuse git-completions *\n"
+ * the `use` at the end makes the `extern` immediately available in scope.
*)
+let generate_module cmd_name result =
+  let mod_name = module_name_of cmd_name in
+  (* the module name is used twice: in the `module` header and in the trailing `use` *)
+  Printf.sprintf "module %s {\n%s}\n\nuse %s *\n" mod_name (extern_of cmd_name result) mod_name
+
+(* convenience wrapper: generate an `extern` from just a list of entries
+ * (no subcommands, positionals, or description). used when we only have
+ * flag data and nothing else. *)
+let generate_extern_from_entries cmd_name entries =
+  (* every field other than the flag entries is left empty *)
+  generate_extern cmd_name { entries; subcommands = []; positionals = []; description = "" }
diff --git a/lib/parser.ml b/lib/parser.ml
new file mode 100644
index 0000000..f20aae5
--- /dev/null
+++ b/lib/parser.ml
@@ -0,0 +1,814 @@
+(* parser.ml — parse --help output into structured flag/subcommand/positional data.
+ *
+ * this module is the core of inshellah's help-text understanding. it takes the
+ * raw text that a cli tool prints when you run `cmd --help` and extracts:
+ * - flag entries (short/long switches with optional parameters and descriptions)
+ * - subcommand listings (name + description pairs)
+ * - positional arguments (from usage lines)
+ *
+ * the parser is built on Angstrom (a monadic parser combinator library) for the
+ * structured flag/subcommand extraction, with hand-rolled imperative parsers for
+ * usage-line positional extraction (where the format is too varied for clean
+ * combinator composition).
+ *
+ * key design decisions:
+ * - the Angstrom parser runs in prefix-consume mode — it doesn't need to parse
+ *   the entire input, just extract what it can recognize. unrecognized lines are
+ *   skipped via skip_non_option_line.
+ * - multi-line descriptions are handled via indentation-based continuation:
+ *   lines indented 8+ spaces that don't start with '-' are folded into the
+ *   previous entry's description.
+ * - subcommand detection uses a heuristic: lines with a name followed by 2+
+ *   spaces then a description, where the name is at least 2 chars.
section
+ *   headers (like "arguments:") toggle whether name-description pairs are
+ *   treated as subcommands or positionals.
+ * - positional extraction has two paths: usage-line parsing (the common case)
+ *   and CLI11's explicit "positionals:" section format.
+ *)
+
+open Angstrom
+
+(* strip ansi escape sequences and osc hyperlinks from --help output.
+ * many modern cli tools emit colored/styled output even when piped,
+ * so we need to clean this before parsing. handles:
+ * - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc.
+ * - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc.
+ * - other two-byte esc+char sequences
+ *
+ * an esc that is the very last byte of the input has no following byte to
+ * classify, so it is copied to the output unchanged. *)
+let strip_ansi s =
+  let buf = Buffer.create (String.length s) in
+  let len = String.length s in
+  let pos = ref 0 in
+  while !pos < len do
+    (* only treat esc as a sequence introducer when a following byte exists *)
+    if !pos + 1 < len && Char.code s.[!pos] = 0x1b then begin
+      let next = s.[!pos + 1] in
+      if next = '[' then begin
+        (* csi sequence: esc [ ... final_byte *)
+        pos := !pos + 2;
+        (* skip bytes until one in the range '@'..'~', which ends the sequence *)
+        while !pos < len && not (s.[!pos] >= '@' && s.[!pos] <= '~') do incr pos done;
+        (* step over the final byte itself, unless the input ended first *)
+        if !pos < len then incr pos
+      end else if next = ']' then begin
+        (* osc sequence: esc ] ... (terminated by bel or esc \) *)
+        pos := !pos + 2;
+        let terminated = ref false in
+        (* an unterminated osc sequence swallows the rest of the input *)
+        while !pos < len && not !terminated do
+          if s.[!pos] = '\x07' then
+            (incr pos; terminated := true)
+          else if !pos + 1 < len && Char.code s.[!pos] = 0x1b && s.[!pos + 1] = '\\' then
+            (pos := !pos + 2; terminated := true)
+          else
+            incr pos
+        done
+      end else begin
+        (* other esc sequence, skip esc + one char *)
+        pos := !pos + 2
+      end
+    end else begin
+      Buffer.add_char buf s.[!pos];
+      incr pos
+    end
+  done;
+  Buffer.contents buf
+
+(* --- character class predicates ---
+ * used throughout the Angstrom parsers to classify characters.
+ * separated out for readability and reuse. *)
+
+(* horizontal whitespace only: newlines are deliberately not included *)
+let is_whitespace = function ' ' | '\t' -> true | _ -> false
+
+(* ascii letters and digits *)
+let is_alphanumeric = function
+  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' -> true
+  | _ -> false
+
+(* characters allowed inside parameter names like FILE, output-dir, etc. *)
+let is_param_char = function
+  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '_' | '-' -> true
+  | _ -> false
+
+(* used to detect ALL_CAPS parameter names like FILE, TIME_STYLE *)
+let is_upper_or_underscore = function
+  | 'A' .. 'Z' | '_' -> true
+  | _ -> false
+
+(* characters allowed in long flag names (--foo-bar, --enable-feature2) *)
+let is_long_char = function
+  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '-' -> true
+  | _ -> false
+
+(* --- core types ---
+ * these types represent the structured output of parsing a help text.
+ * they are shared across the entire codebase (nushell codegen, store, manpage parser).
+ *
+ * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose).
+ * the both variant keeps the pair together so nushell can emit "--verbose(-v)".
+ *
+ * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values.
+ *
+ * entry: one complete flag definition — its switch form, optional parameter, and
+ * the description text (potentially multi-line, already joined).
+ *
+ * subcommand: a subcommand name paired with its one-line description.
+ *
+ * positional: a positional argument name plus its optional/variadic flags.
+ *
+ * help_result: the complete parsed output for a single command. *)
+type switch = Short of char | Long of string | Both of char * string
+type param = Mandatory of string | Optional of string
+type entry = { switch : switch; param : param option; desc : string }
+type subcommand = { name : string; desc : string }
+type positional = { pos_name : string; optional : bool; variadic : bool }
+type help_result = { entries : entry list; subcommands : subcommand list; positionals : positional list; description : string }
+
+(* --- low-level Angstrom combinators ---
+ * building blocks for all the parsers below. *)
+
+(* consume horizontal whitespace (spaces and tabs) without crossing lines *)
+let inline_ws = skip_while (function ' ' | '\t' -> true | _ -> false)
+(* end of line — matches either a newline or end of input.
+ * this is the permissive version used in most places. *)
+let eol = end_of_line <|> end_of_input
+(* strict end of line — must consume an actual newline character.
+ * used in skip_non_option_line so we don't accidentally match eof
+ * and consume it when we shouldn't. *)
+let eol_strict = end_of_line
+
+(* --- switch and parameter parsers ---
+ * parse the flag name portion of an option line, e.g. "-v", "--verbose" *)
+
+(* a single dash followed by exactly one letter or digit; yields that character *)
+let short_switch = char '-' *> satisfy is_alphanumeric
+(* two dashes followed by one or more long-flag characters; yields the name without the dashes *)
+let long_switch = string "--" *> take_while1 is_long_char
+(* a comma plus any horizontal whitespace after it *)
+let comma = char ',' *> inline_ws
+
+(* parameter parsers — handle the various syntaxes tools use to indicate
+ * that a flag takes a value. the formats are surprisingly diverse:
+ *   --output=FILE      (eq_man_param — mandatory, common in gnu tools)
+ *   --color[=WHEN]     (eq_opt_param — optional with = syntax)
+ *   --depth DEPTH      (space_upper_param — space-separated ALL_CAPS)
+ *   --file <FILE>      (space_angle_param — angle brackets)
+ *   --file [<FILE>]    (space_opt_angle_param — optional angle brackets)
+ *   --format string    (space_type_param — go/cobra lowercase type word)
+ *)
+let eq_opt_param =
+  string "[=" *> take_while1 is_param_char <* char ']' >>| fun a -> Optional a
+
+let eq_man_param =
+  char '=' *> take_while1 is_param_char >>| fun a -> Mandatory a
+
+(* space-separated ALL_CAPS param: e.g. " FILE", " TIME_STYLE".
+ * peek ahead and check the first char is uppercase, then validate
+ * the entire word is ALL_CAPS. prevents false positives where a
+ * description word like "Do" or "Set" immediately follows the flag name.
+ * digits are allowed (e.g. "SHA256") but lowercase chars disqualify.
*)
+let space_upper_param =
+  char ' ' *> peek_char_fail >>= fun c ->
+  if is_upper_or_underscore c then
+    (* the word is read with is_param_char (which admits lowercase and '-'),
+     * then rejected below unless every char is upper, underscore or digit *)
+    take_while1 is_param_char >>= fun name ->
+    if String.length name >= 1 && String.for_all (fun c -> is_upper_or_underscore c || c >= '0' && c <= '9') name then
+      return (Mandatory name)
+    else
+      fail "not an all-caps param"
+  else
+    fail "not an uppercase param"
+
+(* angle-bracket param: e.g. "<file>", "<name>".
+ * everything between '<' and '>' is taken as the name. *)
+let angle_param =
+  char '<' *> take_while1 (fun c -> c <> '>') <* char '>' >>| fun name ->
+  Mandatory name
+
+(* space + angle bracket param *)
+let space_angle_param =
+  char ' ' *> angle_param
+
+(* optional angle bracket param: [<file>] *)
+let opt_angle_param =
+  char '[' *> char '<' *> take_while1 (fun c -> c <> '>') <* char '>' <* char ']'
+  >>| fun name -> Optional name
+
+(* space + optional angle bracket param *)
+let space_opt_angle_param =
+  char ' ' *> opt_angle_param
+
+(* go/cobra style: space + lowercase type word like "string", "list", "int".
+ * capped at 10 chars to avoid consuming description words.
+ * go's flag libraries commonly emit "--timeout duration" or "--name string"
+ * where the type name is a short lowercase word. longer words are almost
+ * certainly the start of a description, not a type annotation. *)
+let space_type_param =
+  char ' ' *> peek_char_fail >>= fun c ->
+  if c >= 'a' && c <= 'z' then
+    (* only a run of lowercase ascii letters counts as a type word *)
+    take_while1 (fun c -> c >= 'a' && c <= 'z') >>= fun name ->
+    if String.length name <= 10 then
+      return (Mandatory name)
+    else
+      fail "too long for type param"
+  else
+    fail "not a lowercase type param"
+
+(* try each parameter format in order of specificity. the ordering matters:
+ * eq_opt_param must come before eq_man_param because "[=WHEN]" would otherwise
+ * partially match as "=WHEN" then fail on the trailing "]". similarly,
+ * space_opt_angle_param before space_angle_param to catch "[<file>]" before "<file>".
*)
+let param_parser =
+  (* a flag without any recognisable parameter yields None rather than failing *)
+  option None
+    (choice
+       [ eq_opt_param; eq_man_param;
+         space_opt_angle_param; space_angle_param;
+         space_upper_param; space_type_param ]
+     >>| fun a -> Some a)
+
+(* switch parser — handles the various ways help text presents flag names.
+ * formats handled (in order of attempt):
+ *   -a, --all   (short + comma + long — gnu style)
+ *   -a --all    (short + space + long — some tools omit the comma)
+ *   --all / -a  (long + slash + short — rare but seen in some tools)
+ *   -a          (short only)
+ *   --all       (long only)
+ *
+ * the ordering is critical because choice returns the first alternative that
+ * succeeds. the short-only parser would succeed on the "-a" prefix of
+ * "-a, --all", so the combined parsers must be tried before it. *)
+let switch_parser =
+  choice
+    [
+      (short_switch >>= fun s ->
+       comma *> long_switch >>| fun l -> Both (s, l));
+      (short_switch >>= fun s ->
+       char ' ' *> long_switch >>| fun l -> Both (s, l));
+      (long_switch >>= fun l ->
+       inline_ws *> char '/' *> inline_ws *>
+       short_switch >>| fun s -> Both (s, l));
+      (short_switch >>| fun s -> Short s);
+      (long_switch >>| fun l -> Long l);
+    ]
+
+(* --- description parsing with multi-line continuation ---
+ * descriptions in help text often wrap across multiple lines. the convention
+ * is that continuation lines are deeply indented (8+ spaces) and don't start
+ * with '-' (which would indicate a new flag entry). we peek ahead to check
+ * indentation without consuming, then decide whether to fold the line in. *)
+
+(* take the rest of the line as text (does not consume the newline itself) *)
+let rest_of_line = take_till (fun c -> c = '\n' || c = '\r')
+
+(* check if a line is a continuation line: deeply indented, doesn't start with '-'.
+ * tabs count as 8 spaces to match typical terminal rendering.
+ * the 8-space threshold was chosen empirically — most help formatters indent
+ * descriptions at least this much, while flag lines are indented 2-4 spaces.
*)
+let continuation_line =
+  (* fails at end of input, since at least one byte must be available to peek *)
+  peek_string 1 >>= fun _ ->
+  (* must start with significant whitespace (8+ spaces or tab) *)
+  let count_indent s =
+    let indent = ref 0 in
+    let pos = ref 0 in
+    while !pos < String.length s do
+      (match s.[!pos] with
+       | ' ' -> incr indent
+       | '\t' -> indent := !indent + 8
+       (* first non-blank char: jump to the end to leave the loop *)
+       | _ -> pos := String.length s);
+      incr pos
+    done;
+    !indent
+  in
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    (* peek ahead to see indentation level *)
+    peek_string (min avail 80) >>= fun preview ->
+    let indent = count_indent preview in
+    (* NOTE(review): preview can span several lines and String.trim also strips
+     * newlines, so for a whitespace-only line the dash test below looks at the
+     * first visible character of a later line — confirm this is intended *)
+    let trimmed = String.trim preview in
+    let starts_with_dash =
+      String.length trimmed > 0 && trimmed.[0] = '-'
+    in
+    if indent >= 8 && not starts_with_dash then
+      (* this is a continuation line — consume whitespace + text *)
+      inline_ws *> rest_of_line <* eol
+    else
+      fail "not a continuation line"
+
+(* parse description text: first line (after switch+param) plus any continuation lines.
+ * blank continuation lines are filtered out, and all lines are trimmed and joined
+ * with spaces into a single string. *)
+let description =
+  inline_ws *> rest_of_line <* eol >>= fun first_line ->
+  many continuation_line >>| fun cont_lines ->
+  let all = first_line :: cont_lines in
+  (* an empty first line is dropped here too, so the result may be "" *)
+  let all = List.filter (fun s -> String.length (String.trim s) > 0) all in
+  String.concat " " (List.map String.trim all)
+
+(* description that appears on a separate line below the flag.
+ * this handles the clap (rust) "long" help format where flags and descriptions
+ * are on separate lines:
+ *   --verbose
+ *       increase verbosity
+ * here there's no inline description — just deeply-indented continuation lines.
*)
+let description_below =
+  (* many1: at least one continuation line is required for this parser to succeed *)
+  many1 continuation_line >>| fun lines ->
+  let lines = List.filter (fun s -> String.length (String.trim s) > 0) lines in
+  String.concat " " (List.map String.trim lines)
+
+(* --- line classification for skipping ---
+ * the parser needs to skip lines it doesn't understand (section headers,
+ * blank lines, description paragraphs not attached to a flag, etc.)
+ * without consuming lines that are flag entries. *)
+
+(* peek ahead to check if the current line looks like a flag entry.
+ * an option line starts with whitespace then '-'.
+ * nothing is consumed: only peek_string and available are used. *)
+let at_option_line =
+  peek_string 1 >>= fun _ ->
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    peek_string (min avail 40) >>= fun preview ->
+    (* NOTE(review): String.trim also strips newlines, so on a blank line this
+     * inspects the first visible character of a following line — confirm *)
+    let s = String.trim preview in
+    if String.length s > 0 && s.[0] = '-' then return ()
+    else fail "not an option line"
+
+(* skip a non-option line (section header, blank, description-only, etc.).
+ * uses eol_strict (not eol) so it won't match at eof — this prevents the
+ * parser from infinitely skipping at the end of input. if the line looks
+ * like an option line (at_option_line succeeds), we deliberately fail so
+ * that the entry parser gets a chance at it instead.
+ *
+ * NOTE(review): Angstrom's <|> backtracks when its left side fails, so after
+ * the deliberate failure the right-hand alternative still appears to run and
+ * skip the line. the guard therefore looks like a no-op; in help_parser the
+ * entry parser is tried first anyway. confirm before relying on the guard. *)
+let skip_non_option_line =
+  (at_option_line *> fail "this is an option line")
+  <|> (rest_of_line *> eol_strict *> return ())
+
+(* --- entry parsing --- *)
+
+(* parse a single flag entry: leading whitespace, then switch+param, then description.
+ * the description can appear on the same line (inline) or on the next line (below).
+ * if there's no description at all, we accept an empty string.
+ * the (eol *> description_below) branch handles the clap long-help format.
*)
+let entry =
+  inline_ws *>
+  (* inner lift2 pairs the switch with its optional parameter; outer lift2
+   * combines that pair with the description into an entry record *)
+  lift2 (fun (sw, param) desc -> { switch = sw; param; desc })
+    (lift2 (fun a b -> (a, b)) switch_parser param_parser)
+    (description <|> (eol *> (description_below <|> return "")))
+
+(* --- subcommand parsing ---
+ * subcommand lines in help text follow the pattern:
+ *   "  name   description"
+ * where the name and description are separated by 2+ spaces.
+ * some tools also include argument placeholders between name and description:
+ *   "  start UNIT...   start one or more units"
+ *   "  list [PATTERN]  list matching units"
+ *)
+
+(* characters allowed in a subcommand name: letters, digits, '-' and '_' *)
+let is_subcommand_char = function
+  | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' -> true
+  | _ -> false
+
+(* skip argument placeholders like UNIT..., [PATTERN...|PID...], <file>
+ * that appear between the subcommand name and the description.
+ * only consumes single-space gaps — the two-space gap before the
+ * description is left for the main parser to use as the delimiter.
+ *
+ * this is a recursive (fix-point) parser that peeks ahead to distinguish
+ * single-space argument gaps from the double-space description separator.
+ * it accepts tokens that start with [, <, or are ALL_CAPS (with dots/pipes/
+ * commas for variadic syntax).
*)
+let skip_arg_placeholders =
+  fix (fun self ->
+    (* peek ahead: single space followed by arg-like token *)
+    available >>= fun avail ->
+    (* fewer than two bytes left: nothing that could be " X", so stop *)
+    if avail < 2 then return ()
+    else
+      peek_string (min avail 2) >>= fun peek_two ->
+      if String.length peek_two >= 2 && peek_two.[0] = ' ' && peek_two.[1] <> ' ' then
+        (* single space — could be an arg placeholder *)
+        let next = peek_two.[1] in
+        if next = '[' || next = '<'
+           || (next >= 'A' && next <= 'Z') then
+          (* peek the full token to check if it's ALL_CAPS/brackets *)
+          peek_string (min avail 80) >>= fun preview ->
+          (* extract the token after the single space *)
+          let tok_start = 1 in
+          let token_end = ref tok_start in
+          (* the token runs up to the next space, line break, or end of preview *)
+          while !token_end < String.length preview
+                && preview.[!token_end] <> ' '
+                && preview.[!token_end] <> '\n'
+                && preview.[!token_end] <> '\r' do
+            incr token_end
+          done;
+          let tok = String.sub preview tok_start (!token_end - tok_start) in
+          (* accept as placeholder if it starts with [ or < or is ALL_CAPS
+             (possibly with dots, pipes, dashes).
+             tok is never empty here: its first char is the non-space `next` *)
+          let is_placeholder =
+            tok.[0] = '[' || tok.[0] = '<'
+            || String.for_all (fun c ->
+                 (c >= 'A' && c <= 'Z') || c = '_' || c = '-'
+                 || c = '.' || c = '|' || c = ',' || (c >= '0' && c <= '9')
+               ) tok
+          in
+          if is_placeholder then
+            (* consume the single space plus the token, then look for another *)
+            advance (1 + String.length tok) *> self
+          else return ()
+        else return ()
+      else return ())
+
+(* parse a subcommand entry line.
+ * requires: name >= 2 chars, followed by 2+ spaces, then description.
+ * the name is lowercased for consistent lookup.
+ *
+ * if the description starts with "- " (a dash-space prefix), it's stripped.
+ * some tools format their subcommand lists as:
+ *   "  add   - add a new item"
+ * where the "- " is decorative, not part of the description.
*)
+let subcommand_entry =
+  inline_ws *>
+  take_while1 is_subcommand_char >>= fun name ->
+  if String.length name < 2 then fail "subcommand name too short"
+  else
+    skip_arg_placeholders *>
+    (* two literal spaces are required; any further blanks are absorbed by inline_ws *)
+    char ' ' *> char ' ' *> inline_ws *>
+    rest_of_line <* eol >>| fun desc ->
+    { name = String.lowercase_ascii name;
+      desc = let trimmed = String.trim desc in
+        (* drop a decorative leading "- " and re-trim what is left *)
+        if String.length trimmed >= 2 && trimmed.[0] = '-' && trimmed.[1] = ' ' then
+          String.trim (String.sub trimmed 2 (String.length trimmed - 2))
+        else trimmed }
+
+(* --- section header detection ---
+ * section headers are critical for disambiguating subcommands from positional
+ * arguments. lines like "commands:" introduce subcommand sections, while
+ * "arguments:" or "positionals:" introduce argument sections where the same
+ * name+description format should not be treated as subcommands. *)
+
+(* detect section names that introduce positional argument listings.
+ * the check is case-insensitive and strips trailing colons. *)
+let is_arg_section s =
+  let lc = String.lowercase_ascii (String.trim s) in
+  (* only a single trailing ':' is removed, then the remainder is re-trimmed *)
+  let base = if String.ends_with ~suffix:":" lc
+    then String.sub lc 0 (String.length lc - 1) |> String.trim
+    else lc in
+  base = "arguments" || base = "args" || base = "positionals"
+  || base = "positional arguments"
+
+(* a section header: left-aligned (or lightly indented, <= 4 spaces) text
+ * ending with ':', not starting with '-'. must be consumed before
+ * subcommand_entry in the choice combinator, otherwise "commands:" would
+ * be parsed as a subcommand named "commands" with description ":".
+ *
+ * returns a bool indicating whether this is an argument section (true)
+ * or some other section (false). this drives the subcommand filtering logic
+ * in help_parser — entries under argument sections are excluded from the
+ * subcommand list.
*)
+let section_header =
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    peek_string (min avail 80) >>= fun preview ->
+    (* extract just the first line from the preview *)
+    let first_line = match String.index_opt preview '\n' with
+      | Some pos -> String.sub preview 0 pos
+      | None -> preview in
+    let trimmed = String.trim first_line in
+    let len = String.length trimmed in
+    (* indent counts leading blanks; a tab counts as one here *)
+    let indent = let pos = ref 0 in
+      while !pos < String.length first_line && (first_line.[!pos] = ' ' || first_line.[!pos] = '\t') do incr pos done;
+      !pos in
+    if len >= 2 && trimmed.[len - 1] = ':' && trimmed.[0] <> '-' && indent <= 4 then
+      (* eol_strict: a header on the last line without a newline is not consumed *)
+      rest_of_line <* eol_strict >>| fun line -> is_arg_section line
+    else fail "not a section header"
+
+(* --- top-level parser ---
+ * the main help parser: walks through all lines, trying each line as one of:
+ *   1. a flag entry (starts with whitespace + '-')
+ *   2. a section header (left-aligned text ending with ':')
+ *   3. a subcommand line (name + 2+ spaces + description)
+ *   4. anything else — skip
+ *
+ * the choice ordering matters: entries are tried first (highest priority),
+ * then section headers (must beat subcommand_entry to avoid misparse),
+ * then subcommands, then skip as fallback.
+ *
+ * after collecting all items, two post-processing steps happen:
+ * - subcommands under argument sections are excluded (tracked via
+ *   a running in_arg_sec boolean toggled by section headers)
+ * - duplicate subcommand names are deduplicated, keeping the entry
+ *   with the longer description (heuristic: more info = better)
+ *
+ * positionals are not extracted here — they come from the usage line
+ * parser (extract_usage_positionals) or CLI11's explicit section parser
+ * (extract_cli11_positionals), applied later in parse_help.
*)
+let help_parser =
+  let open Angstrom in
+  (* the fix-point argument is unused: the parser does not recurse into itself *)
+  fix (fun _self ->
+    (* each alternative tags its result so the item list can be split afterwards *)
+    let try_entry =
+      entry >>| fun e -> `Entry e
+    in
+    let try_section =
+      section_header >>| fun is_arg -> `Section is_arg
+    in
+    let try_subcommand =
+      subcommand_entry >>| fun sc -> `Subcommand sc
+    in
+    let try_skip =
+      skip_non_option_line >>| fun () -> `Skip
+    in
+    many (choice [ try_entry; try_section; try_subcommand; try_skip ]) >>| fun items ->
+    let entries = List.filter_map (function `Entry e -> Some e | _ -> None) items in
+    let subcommands =
+      (* pass 1: keep subcommands that are not under an argument section;
+       * every section header resets the in_arg_sec flag to its own value *)
+      List.fold_left (fun (in_arg_sec, acc) item ->
+        match item with
+        | `Section is_arg -> (is_arg, acc)
+        | `Subcommand sc when not in_arg_sec -> (in_arg_sec, sc :: acc)
+        | _ -> (in_arg_sec, acc)
+      ) (false, []) items
+      |> snd |> List.rev
+      (* pass 2: dedupe by name using an association list; on a tie in
+       * description length the earlier entry is kept *)
+      |> List.fold_left (fun acc sc ->
+        match List.assoc_opt sc.name acc with
+        | Some prev when String.length prev.desc >= String.length sc.desc -> acc
+        | _ -> (sc.name, sc) :: List.remove_assoc sc.name acc
+      ) []
+      |> List.rev_map snd
+    in
+    { entries; subcommands; positionals = []; description = "" })
+
+(* --- usage line parsing ---
+ * usage lines look like: "usage: git add [OPTIONS] [--] [<pathspec>...]"
+ * to extract positional arguments, we first need to skip past the command
+ * name prefix ("git add") to reach the argument portion.
+ *
+ * skip_command_prefix walks word-by-word, treating each space-separated
+ * token as part of the command name as long as it:
+ * - is made of "word chars" (alphanumeric, hyphen, underscore, slash, dot)
+ * - contains at least one lowercase letter (to distinguish from ALL_CAPS
+ *   positional names like FILE)
+ * - doesn't start with [, <, (, {, or - (which indicate arguments, not
+ *   command name components)
+ *
+ * this is an imperative index-walking parser rather than using Angstrom,
+ * because usage lines are a single string (not line-oriented) and the format
+ * is too varied for clean combinator composition.
*)
+let skip_command_prefix s =
+  let n = String.length s in
+  let word_char = function
+    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' -> true
+    | _ -> false
+  in
+  (* index of the first non-blank character at or after i *)
+  let rec past_blanks i =
+    if i < n && (s.[i] = ' ' || s.[i] = '\t') then past_blanks (i + 1) else i
+  in
+  (* index just past the run of word characters starting at i *)
+  let rec past_word i =
+    if i < n && word_char s.[i] then past_word (i + 1) else i
+  in
+  (* does s[i..j) contain at least one lowercase ascii letter? *)
+  let rec has_lowercase i j =
+    i < j && ((s.[i] >= 'a' && s.[i] <= 'z') || has_lowercase (i + 1) j)
+  in
+  (* returns the offset where the argument portion begins *)
+  let rec scan i =
+    let start = past_blanks i in
+    if start >= n then start
+    else
+      match s.[start] with
+      (* argument-style openers end the command prefix *)
+      | '[' | '<' | '(' | '{' | '-' -> start
+      | c when word_char c ->
+        let stop = past_word start in
+        (* a word with no lowercase letter is an ALL_CAPS positional: stop before it *)
+        if has_lowercase start stop then scan stop else start
+      | _ -> start
+  in
+  scan 0
+
+(* parse the argument portion of a usage line into positional definitions.
+ * handles these syntactic forms:
+ *   <file>      - mandatory positional
+ *   [file]      - optional positional
+ *   FILE        - mandatory positional (ALL_CAPS convention)
+ *   <file>...   - variadic (also handles utf-8 ellipsis)
+ *   [file...]   - optional variadic
+ *   curly-brace alternatives - skipped, not a positional
+ *   -flag       - flags (skipped)
+ *
+ * certain ALL_CAPS names are skipped because they're not real positionals —
+ * "OPTIONS", "FLAGS", etc. are section labels that sometimes appear in usage
+ * lines for readability.
+ *
+ * deduplication at the end ensures we don't emit the same positional twice
+ * (can happen when usage lines are reformatted or repeated).
*)
+let parse_usage_args s =
+  let len = String.length s in
+  let pos = ref 0 in
+  let positionals = ref [] in
+  let skip_ws () =
+    while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in
+  let is_pos_char c =
+    (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') in
+  (* detect trailing dots or utf-8 ellipsis indicating variadic args.
+   * leading blanks are consumed even when no dots follow. *)
+  let read_dots () =
+    skip_ws ();
+    if !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' then
+      (pos := !pos + 3; true)
+    else if !pos + 2 < len && s.[!pos] = '\xe2' && s.[!pos+1] = '\x80' && s.[!pos+2] = '\xa6' then
+      (pos := !pos + 3; true) (* utf-8 ellipsis *)
+    else false
+  in
+  (* names that are section labels, not actual positional arguments *)
+  let is_skip name =
+    let u = String.uppercase_ascii name in
+    u = "OPTIONS" || u = "OPTION" || u = "FLAGS" || u = "FLAG"
+  in
+  (* validate that a name contains only alphanumeric, underscore, hyphen chars *)
+  let is_clean_name name =
+    String.length name >= 2
+    && String.for_all (fun c ->
+         (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+         || (c >= '0' && c <= '9') || c = '_' || c = '-') name
+  in
+  let is_letter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in
+  (* skip {A|c|d|...} alternative blocks — not positional arguments *)
+  let skip_braces () =
+    if !pos < len && s.[!pos] = '{' then begin
+      let depth = ref 1 in
+      incr pos;
+      while !pos < len && !depth > 0 do
+        if s.[!pos] = '{' then incr depth
+        else if s.[!pos] = '}' then decr depth;
+        incr pos
+      done;
+      ignore (read_dots ());
+      true
+    end else false
+  in
+  while !pos < len do
+    skip_ws ();
+    if !pos >= len then ()
+    else if skip_braces () then ()
+    else match s.[!pos] with
+      | '[' ->
+        (* optional positional: [name] or [<name>] or [name...] *)
+        incr pos;
+        let start = !pos in
+        let depth = ref 1 in
+        while !pos < len && !depth > 0 do
+          if s.[!pos] = '[' then incr depth
+          else if s.[!pos] = ']' then decr depth;
+          incr pos
+        done;
+        (* when the bracket was closed, !pos sits one past the closing bracket;
+         * when the input ended first there is no closing bracket to exclude,
+         * so the last character of the name must be kept *)
+        let bracket_end = if !depth = 0 then !pos - 1 else !pos in
+        let inner = String.sub s start (max 0 (bracket_end - start)) |> String.trim in
+        let inner, has_inner_dots =
+          let without suffix =
+            String.sub inner 0 (String.length inner - String.length suffix) |> String.trim
+          in
+          if String.ends_with ~suffix:"..." inner then (without "...", true)
+          (* the utf-8 ellipsis is accepted inside brackets as well, matching read_dots *)
+          else if String.ends_with ~suffix:"\xe2\x80\xa6" inner then (without "\xe2\x80\xa6", true)
+          else (inner, false)
+        in
+        let variadic = has_inner_dots || read_dots () in
+        if String.length inner > 0
+           && inner.[0] <> '-'
+           && (is_letter inner.[0] || inner.[0] = '<') then begin
+          let name =
+            if inner.[0] = '<' then
+              (* take the text between the angle brackets; a missing '>' means up to the end *)
+              let e = try String.index inner '>' with Not_found -> String.length inner in
+              String.sub inner 1 (e - 1)
+            else inner
+          in
+          if is_clean_name name && not (is_skip name) then
+            positionals := { pos_name = String.lowercase_ascii name;
+                             optional = true; variadic } :: !positionals
+        end
+      | '<' ->
+        (* mandatory positional in angle brackets: <name> *)
+        incr pos;
+        let start = !pos in
+        while !pos < len && s.[!pos] <> '>' do incr pos done;
+        let name = String.sub s start (!pos - start) in
+        if !pos < len then incr pos;
+        let variadic = read_dots () in
+        if is_clean_name name && not (is_skip name) then
+          positionals := { pos_name = String.lowercase_ascii name;
+                           optional = false; variadic } :: !positionals
+      | '-' ->
+        (* flag — skip entirely, not a positional *)
+        while !pos < len && s.[!pos] <> ' ' && s.[!pos] <> '\t' && s.[!pos] <> ']' do incr pos done
+      | c when c >= 'A' && c <= 'Z' ->
+        (* ALL_CAPS positional name *)
+        let start = !pos in
+        while !pos < len && is_pos_char s.[!pos] do incr pos done;
+        let name = String.sub s start (!pos - start) in
+        let variadic = read_dots () in
+        if String.length name >= 2
+           && String.for_all (fun c ->
+                (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9')
+              ) name
+           && not (is_skip name) then
+          positionals := { pos_name = String.lowercase_ascii name;
+                           optional = false; variadic } :: !positionals
+      | _ ->
+        incr pos
+  done;
+  (* deduplicate positionals by name, keeping the first occurrence *)
+  List.rev !positionals
+  |> List.fold_left (fun (seen, acc) p ->
+    if List.mem p.pos_name seen then (seen, acc)
+    else (p.pos_name :: seen, p :: acc)
+  ) ([], [])
+  |> snd |> List.rev
+
+(* find the "usage:" line in the help text and extract positionals from it.
+ * searches line-by-line for a line starting with "usage:" (case-insensitive).
+ * handles both inline usage ("usage: cmd [OPTIONS] FILE") and the clap style
+ * where the actual usage is on the next line:
+ *   USAGE:
+ *       cmd [OPTIONS] FILE
+ *
+ * also handles the bare "usage" header (no colon) followed by a next line.
+ * returns an empty list when no usage line is found. *)
+let extract_usage_positionals text =
+  let lines = String.split_on_char '\n' text in
+  let lines_arr = Array.of_list lines in
+  let len = Array.length lines_arr in
+  (* search through lines for the first usage header and return the usage content *)
+  let find_usage_line () =
+    let check_line idx =
+      let trimmed = String.trim lines_arr.(idx) in
+      let trimmed_len = String.length trimmed in
+      let lc = String.lowercase_ascii trimmed in
+      if trimmed_len >= 6 && String.sub lc 0 6 = "usage:" then begin
+        let after = String.sub trimmed 6 (trimmed_len - 6) |> String.trim in
+        if String.length after > 0 then Some after
+        else if idx + 1 < len then
+          (* clap style: USAGE:\n cmd [OPTIONS] PATTERN *)
+          let next = String.trim lines_arr.(idx + 1) in
+          if String.length next > 0 then Some next else None
+        else None
+      end else if lc = "usage" then begin
+        if idx + 1 < len then
+          let next = String.trim lines_arr.(idx + 1) in
+          if String.length next > 0 then Some next else None
+        else None
+      end else None
+    in
+    (* use List.find_map over the index range to find the first matching line *)
+    List.find_map check_line (List.init len Fun.id)
+  in
+  match find_usage_line () with
+  | None -> []
+  | Some usage ->
+    (* drop the command-name prefix so only the argument portion is parsed *)
+    let cmd_end = skip_command_prefix usage in
+    let args = String.sub usage cmd_end (String.length usage - cmd_end) in
+    parse_usage_args args
+
+(* extract positionals from CLI11's explicit "POSITIONALS:" section.
+ * CLI11 (a c++ arg parsing library) emits a dedicated section:
+ *   Positionals:
+ *     name TEXT    description here
+ *     count INT    another description
+ *
+ * this is preferred over usage-line extraction when present because it
+ * provides more accurate type information. the parser looks for the
+ * section header, then reads indented lines until a blank or unindented
+ * line signals the end. type words (TEXT, INT, FLOAT, etc.) between the
+ * name and description are skipped.
+ * every positional found here is reported as required (optional = false). *)
+let extract_cli11_positionals text =
+  let lines = String.split_on_char '\n' text in
+  (* parse a single indented positional line into a positional record *)
+  let parse_one s =
+    let len = String.length s in
+    let pos = ref 0 in
+    let is_name_char c =
+      (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+      || (c >= '0' && c <= '9') || c = '_' || c = '-' in
+    while !pos < len && is_name_char s.[!pos] do incr pos done;
+    (* names shorter than two characters are rejected *)
+    if !pos < 2 then None
+    else
+      let name = String.sub s 0 !pos in
+      while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done;
+      (* skip type word: TEXT, INT, FLOAT, ENUM, BOOLEAN, etc. *)
+      while !pos < len && s.[!pos] >= 'A' && s.[!pos] <= 'Z' do incr pos done;
+      while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done;
+      let variadic = !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' in
+      Some { pos_name = String.lowercase_ascii name; optional = false; variadic }
+  in
+  (* parse consecutive indented lines under the section header *)
+  let rec parse_lines lines acc =
+    match lines with
+    | [] -> List.rev acc
+    | line :: rest ->
+      let len = String.length line in
+      if len = 0 || (line.[0] <> ' ' && line.[0] <> '\t') then
+        List.rev acc
+      else
+        let trimmed = String.trim line in
+        if String.length trimmed = 0 then List.rev acc
+        else match parse_one trimmed with
+          | Some p -> parse_lines rest (p :: acc)
+          | None -> parse_lines rest acc
+  in
+  (* scan lines for the positionals section header, then parse the body *)
+  let rec find_section = function
+    | [] -> []
+    | line :: rest ->
+      let trimmed = String.trim line in
+      if trimmed = "POSITIONALS:" || trimmed = "Positionals:" then
+        parse_lines rest []
+      else
+        find_section rest
+  in
+  find_section lines
+
+(* top-level entry point: parse a --help text string into a help_result.
+ * steps:
+ *   1. strip ansi escapes (colors, hyperlinks, etc.)
+ *   2. run the Angstrom help_parser for flags and subcommands
+ *   3. extract positionals via CLI11 format (preferred) or usage line (fallback)
+ *   4. merge positionals into the result
+ * uses Angstrom's prefix-consume mode — we don't need to parse every byte. *)
+let parse_help txt =
+  let clean = strip_ansi txt in
+  match Angstrom.parse_string ~consume:Consume.Prefix help_parser clean with
+  | Ok result ->
+    let cli11 = extract_cli11_positionals clean in
+    let usage = extract_usage_positionals clean in
+    (* the CLI11 section wins whenever it produced anything at all *)
+    let positionals = if cli11 <> [] then cli11 else usage in
+    Ok { result with positionals }
+  | Error msg -> Error msg
diff --git a/lib/store.ml b/lib/store.ml
new file mode 100644
index 0000000..2466c81
--- /dev/null
+++ b/lib/store.ml
@@ -0,0 +1,670 @@
+(* store.ml — filesystem-backed cache of parsed completion data.
+ *
+ * this module handles persistence of completion data to disk.
each command's
 * help_result is serialized to JSON and stored as a file in a cache directory
 * (default: $XDG_CACHE_HOME/inshellah). commands with native nushell completions
 * are stored as .nu files instead.
 *
 * the store also provides lookup, listing, and subcommand discovery by
 * scanning filenames in the cache directory.
 *
 * file naming convention:
 * - spaces in command names become underscores (e.g. "git add" -> "git_add.json")
 * - subcommands of a parent share the prefix (e.g. "git_add.json", "git_commit.json")
 * - .json files contain serialized help_result
 * - .nu files contain native nushell extern source code
 *
 * the module includes a minimal hand-rolled JSON parser/serializer because
 * we only need to handle our own output format (no need for a full JSON library).
 *)

open Parser

(* get the default store path: $XDG_CACHE_HOME/inshellah, falling back to
 * ~/.cache/inshellah.
 *
 * per the XDG base directory spec, XDG_CACHE_HOME is ignored when it is
 * unset, empty, or a relative path. honouring an empty value would place
 * the cache at ./inshellah in whatever directory the process runs from.
 *
 * raises Not_found when the fallback is needed and HOME is not set. *)
let default_store_path () =
  let cache =
    match Sys.getenv_opt "XDG_CACHE_HOME" with
    | Some dir when dir <> "" && not (Filename.is_relative dir) -> dir
    | Some _ | None -> Filename.concat (Sys.getenv "HOME") ".cache"
  in
  Filename.concat cache "inshellah"

(* recursively create directories along a path (equivalent to mkdir -p).
 * splits the path into components and folds over them, accumulating
 * the current directory prefix and creating each level if missing.
*) +let ensure_dir dir = + let sep = Filename.dir_sep in + let parts = String.split_on_char sep.[0] dir in + (* determine the starting prefix: absolute paths begin with "/" *) + let start = if String.length dir > 0 && dir.[0] = sep.[0] then sep else "" in + let _final = + List.fold_left (fun current part -> + if part = "" then current + else begin + let next = if current = sep then sep ^ part + else if current = "" then part + else current ^ sep ^ part in + (if not (Sys.file_exists next) then Unix.mkdir next 0o755); + next + end + ) start parts + in + () + +(* convert command name to safe filename: spaces become underscores, + * non-alphanumeric chars become hyphens. + * e.g. "git add" -> "git_add", "docker-compose" -> "docker-compose" *) +let filename_of_command cmd = + String.map (function + | ' ' -> '_' + | ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.') as char_val -> char_val + | _ -> '-') cmd + +(* inverse of filename_of_command: underscores back to spaces. + * note: this is lossy — original underscores in command names + * (e.g. "my_tool") would be converted to spaces. in practice this + * doesn't matter because tools with underscores in names are rare, + * and subcommands use space-separated naming. *) +let command_of_filename base_name = + String.map (function '_' -> ' ' | char_val -> char_val) base_name + +(* --- JSON serialization of help_result --- + * hand-rolled JSON emitters. we don't use a JSON library because: + * 1. the schema is fixed and simple — we only serialize our own types + * 2. avoiding dependencies keeps the binary small + * 3. printf-style emission is fast and straightforward for our types *) + +(* escape a string for JSON: quotes, backslashes, and control characters. + * control chars below 0x20 are emitted as \uXXXX unicode escapes. 
*) +let escape_json contents = + let buf = Buffer.create (String.length contents + 4) in + String.iter (fun char_val -> match char_val with + | '"' -> Buffer.add_string buf "\\\"" + | '\\' -> Buffer.add_string buf "\\\\" + | '\n' -> Buffer.add_string buf "\\n" + | '\t' -> Buffer.add_string buf "\\t" + | '\r' -> Buffer.add_string buf "\\r" + | c when Char.code c < 0x20 -> + Buffer.add_string buf (Printf.sprintf "\\u%04x" (Char.code c)) + | c -> Buffer.add_char buf c + ) contents; + Buffer.contents buf + +(* wrap a string in quotes after escaping for JSON *) +let json_string text = Printf.sprintf "\"%s\"" (escape_json text) + +(* the literal null value for JSON output *) +let json_null = "null" + +(* serialize a switch (short flag, long flag, or both) to JSON *) +let json_switch_of = function + | Short char_val -> + Printf.sprintf "{\"type\":\"short\",\"char\":%s}" (json_string (String.make 1 char_val)) + | Long name -> + Printf.sprintf "{\"type\":\"long\",\"name\":%s}" (json_string name) + | Both (char_val, name) -> + Printf.sprintf "{\"type\":\"both\",\"char\":%s,\"name\":%s}" + (json_string (String.make 1 char_val)) (json_string name) + +(* serialize a parameter spec (mandatory, optional, or absent) to JSON *) +let json_param_of = function + | None -> json_null + | Some (Mandatory name) -> + Printf.sprintf "{\"kind\":\"mandatory\",\"name\":%s}" (json_string name) + | Some (Optional name) -> + Printf.sprintf "{\"kind\":\"optional\",\"name\":%s}" (json_string name) + +(* serialize a single flag entry (switch + param + description) to JSON *) +let json_entry_of entry = + Printf.sprintf "{\"switch\":%s,\"param\":%s,\"desc\":%s}" + (json_switch_of entry.switch) (json_param_of entry.param) (json_string entry.desc) + +(* serialize a subcommand (name + description) to JSON *) +let json_subcommand_of sc = + Printf.sprintf "{\"name\":%s,\"desc\":%s}" (json_string sc.name) (json_string sc.desc) + +(* serialize a positional argument to JSON *) +let json_positional_of p = + 
Printf.sprintf "{\"name\":%s,\"optional\":%b,\"variadic\":%b}" + (json_string p.pos_name) p.optional p.variadic + +(* serialize a list of items to a JSON array using the given formatter *) +let json_list formatter items = + "[" ^ String.concat "," (List.map formatter items) ^ "]" + +(* serialize an entire help_result to a JSON object string *) +let json_of_help_result ?(source="help") result = + Printf.sprintf "{\"source\":%s,\"description\":%s,\"entries\":%s,\"subcommands\":%s,\"positionals\":%s}" + (json_string source) + (json_string result.description) + (json_list json_entry_of result.entries) + (json_list json_subcommand_of result.subcommands) + (json_list json_positional_of result.positionals) + +(* --- JSON deserialization --- + * minimal hand-rolled recursive-descent JSON parser. only handles the subset + * we emit: strings, booleans, nulls, arrays, and objects. no number parsing + * (we don't emit numbers). this is intentionally minimal — we only read back + * our own serialized format, so robustness against arbitrary JSON is not needed. + * + * note: the \u escape handler does basic UTF-8 encoding for code points + * up to 0xFFFF but doesn't handle surrogate pairs. this is fine for our use + * case since we only escape control characters below 0x20. 
*) + +type json = + | Jnull + | Jbool of bool + | Jstring of string + | Jarray of json list + | Jobject of (string * json) list + +(* JSON accessor helpers — return sensible defaults for missing/wrong types *) +let json_get key = function + | Jobject pairs -> (try List.assoc key pairs with Not_found -> Jnull) + | _ -> Jnull + +(* extract a string from a JSON value, defaulting to empty string *) +let json_to_string = function Jstring text -> text | _ -> "" + +(* extract a boolean from a JSON value, defaulting to false *) +let json_to_bool = function Jbool value -> value | _ -> false + +(* extract a list from a JSON array value, defaulting to empty list *) +let json_to_list = function Jarray items -> items | _ -> [] + +exception Json_error of string + +(* imperative recursive-descent JSON parser. + * uses a mutable position ref to walk through the string. + * note: boolean/null parsing just advances a fixed number of chars + * without validating the actual characters — safe because we only read + * our own output, but would be incorrect for arbitrary JSON. 
*) +let parse_json contents = + let len = String.length contents in + let pos = ref 0 in + (* peek at the current character without consuming it *) + let peek () = if !pos < len then contents.[!pos] else '\x00' in + (* advance the position by one character *) + let advance () = incr pos in + (* skip over any whitespace characters at current position *) + let skip_ws () = + while !pos < len && (contents.[!pos] = ' ' || contents.[!pos] = '\t' + || contents.[!pos] = '\n' || contents.[!pos] = '\r') do + advance () + done in + (* skip whitespace then consume the expected character, or raise *) + let expect char_val = + skip_ws (); + if peek () <> char_val then + raise (Json_error (Printf.sprintf "expected '%c' at %d" char_val !pos)); + advance () in + (* mutually recursive parsers for each JSON value type *) + let rec parse_value () = + skip_ws (); + match peek () with + | '"' -> Jstring (parse_string ()) + | '{' -> parse_object () + | '[' -> parse_array () + | 'n' -> advance (); advance (); advance (); advance (); Jnull + | 't' -> advance (); advance (); advance (); advance (); Jbool true + | 'f' -> + advance (); advance (); advance (); advance (); advance (); Jbool false + | char_val -> + raise (Json_error (Printf.sprintf "unexpected '%c' at %d" char_val !pos)) + (* parse a quoted string value, handling escape sequences *) + and parse_string () = + expect '"'; + let buf = Buffer.create 32 in + while peek () <> '"' do + if peek () = '\\' then begin + advance (); + (match peek () with + | '"' -> Buffer.add_char buf '"' + | '\\' -> Buffer.add_char buf '\\' + | 'n' -> Buffer.add_char buf '\n' + | 't' -> Buffer.add_char buf '\t' + | 'r' -> Buffer.add_char buf '\r' + | 'u' -> + (* handle \uXXXX unicode escapes with basic UTF-8 encoding *) + advance (); + let hex = String.sub contents !pos 4 in + pos := !pos + 3; + let code = int_of_string ("0x" ^ hex) in + if code < 128 then Buffer.add_char buf (Char.chr code) + else begin + if code < 0x800 then begin + Buffer.add_char buf 
(Char.chr (0xc0 lor (code lsr 6))); + Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f))) + end else begin + Buffer.add_char buf (Char.chr (0xe0 lor (code lsr 12))); + Buffer.add_char buf (Char.chr (0x80 lor ((code lsr 6) land 0x3f))); + Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f))) + end + end + | char_val -> Buffer.add_char buf char_val); + advance () + end else begin + Buffer.add_char buf (peek ()); + advance () + end + done; + advance (); (* consume closing quote *) + Buffer.contents buf + (* parse a JSON object: { "key": value, ... } *) + and parse_object () = + expect '{'; + skip_ws (); + if peek () = '}' then (advance (); Jobject []) + else begin + let pairs = ref [] in + let more = ref true in + while !more do + skip_ws (); + let key = parse_string () in + expect ':'; + let value = parse_value () in + pairs := (key, value) :: !pairs; + skip_ws (); + if peek () = ',' then advance () + else more := false + done; + expect '}'; + Jobject (List.rev !pairs) + end + (* parse a JSON array: [ value, value, ... ] *) + and parse_array () = + expect '['; + skip_ws (); + if peek () = ']' then (advance (); Jarray []) + else begin + let items = ref [] in + let more = ref true in + while !more do + let value = parse_value () in + items := value :: !items; + skip_ws (); + if peek () = ',' then advance () + else more := false + done; + expect ']'; + Jarray (List.rev !items) + end + in + parse_value () + +(* --- JSON to OCaml type converters --- + * these reconstruct our parser types from their JSON representations. + * they mirror the json_*_of serializers above. 
*) + +(* reconstruct a switch value from its JSON representation *) +let switch_of_json json_node = + match json_to_string (json_get "type" json_node) with + | "short" -> + let char_str = json_to_string (json_get "char" json_node) in + Short (if String.length char_str > 0 then char_str.[0] else '?') + | "long" -> Long (json_to_string (json_get "name" json_node)) + | "both" -> + let char_str = json_to_string (json_get "char" json_node) in + Both ((if String.length char_str > 0 then char_str.[0] else '?'), + json_to_string (json_get "name" json_node)) + | _ -> Long "?" + +(* reconstruct a parameter spec from its JSON representation *) +let param_of_json = function + | Jnull -> None + | json_node -> + let name = json_to_string (json_get "name" json_node) in + (match json_to_string (json_get "kind" json_node) with + | "mandatory" -> Some (Mandatory name) + | "optional" -> Some (Optional name) + | _ -> None) + +(* reconstruct a flag entry from its JSON representation *) +let entry_of_json json_node = + { switch = switch_of_json (json_get "switch" json_node); + param = param_of_json (json_get "param" json_node); + desc = json_to_string (json_get "desc" json_node) } + +(* reconstruct a subcommand from its JSON representation *) +let subcommand_of_json json_node = + { name = json_to_string (json_get "name" json_node); + desc = json_to_string (json_get "desc" json_node) } + +(* reconstruct a positional argument from its JSON representation *) +let positional_of_json json_node = + { pos_name = json_to_string (json_get "name" json_node); + optional = json_to_bool (json_get "optional" json_node); + variadic = json_to_bool (json_get "variadic" json_node) } + +(* reconstruct a full help_result from its JSON representation *) +let help_result_of_json json_node = + { entries = List.map entry_of_json (json_to_list (json_get "entries" json_node)); + subcommands = List.map subcommand_of_json (json_to_list (json_get "subcommands" json_node)); + positionals = List.map 
positional_of_json (json_to_list (json_get "positionals" json_node)); + description = json_to_string (json_get "description" json_node) } + +(* --- filesystem operations --- *) + +(* write a string to a file, overwriting any existing content *) +let write_file path contents = + let oc = open_out path in + output_string oc contents; + close_out oc + +(* read an entire file into a string, returning None on any error *) +let read_file path = + try + let ic = open_in path in + let size = in_channel_length ic in + let contents = Bytes.create size in + really_input ic contents 0 size; + close_in ic; + Some (Bytes.to_string contents) + with _ -> None + +(* write a parsed help_result to the store as JSON *) +let write_result ~dir ?(source="help") command result = + let path = Filename.concat dir (filename_of_command command ^ ".json") in + write_file path (json_of_help_result ~source result) + +(* write native nushell completion source to the store as a .nu file *) +let write_native ~dir command data = + let path = Filename.concat dir (filename_of_command command ^ ".nu") in + write_file path data + +(* check whether a path exists and is a directory *) +let is_dir path = Sys.file_exists path && Sys.is_directory path + +(* look for a command's data file across multiple store directories. + * checks JSON first, then .nu. returns the first match found. + * directories are searched in order (user dir before system dirs). *) +let find_file dirs command = + let base_name = filename_of_command command in + List.find_map (fun directory -> + let json_path = Filename.concat directory (base_name ^ ".json") in + if Sys.file_exists json_path then Some json_path + else + let nu_path = Filename.concat directory (base_name ^ ".nu") in + if Sys.file_exists nu_path then Some nu_path + else None + ) dirs + +(* parse a nushell .nu file to extract a help_result for a specific command. + * .nu files contain `export extern "cmd" [ ... ]` blocks with flag definitions. 
+ * this parser extracts flags, positionals, subcommands, and descriptions + * from the nushell extern syntax so the completer can use native completions. + * + * nushell extern parameter syntax: + * --flag(-s): type # description → Both(s, "flag") with param + * --flag: type # description → Long "flag" with param + * --flag # description → Long "flag" no param + * -s # description → Short 's' + * name: type # description → positional + * name?: type → optional positional + * ...name: type → variadic positional + *) +let parse_nu_completions target_cmd contents = + let lines = String.split_on_char '\n' contents in + (* extract the description comment preceding an export extern block *) + let current_desc = ref "" in + (* collect all extern blocks: (cmd_name, entries, positionals, description) *) + let blocks = ref [] in + let in_block = ref false in + let block_cmd = ref "" in + let block_entries = ref [] in + let block_positionals = ref [] in + let block_desc = ref "" in + let finish_block () = + if !in_block then begin + blocks := (!block_cmd, List.rev !block_entries, + List.rev !block_positionals, !block_desc) :: !blocks; + in_block := false + end in + List.iter (fun line -> + let trimmed = String.trim line in + if not !in_block then begin + (* look for description comments and export extern lines *) + if String.length trimmed > 2 && trimmed.[0] = '#' && trimmed.[1] = ' ' then + current_desc := String.trim (String.sub trimmed 2 (String.length trimmed - 2)) + else if String.length trimmed > 15 + && (try ignore (Str.search_forward + (Str.regexp_string "export extern") trimmed 0); true + with Not_found -> false) then begin + (* extract command name from: export extern "cmd name" [ or export extern cmd [ *) + let re_quoted = Str.regexp {|export extern "\([^"]*\)"|} in + let re_bare = Str.regexp {|export extern \([a-zA-Z0-9_-]+\)|} in + let cmd_opt = + if try ignore (Str.search_forward re_quoted trimmed 0); true + with Not_found -> false + then Some 
(Str.matched_group 1 trimmed) + else if try ignore (Str.search_forward re_bare trimmed 0); true + with Not_found -> false + then Some (Str.matched_group 1 trimmed) + else None in + if cmd_opt <> None then begin + let cmd = match cmd_opt with Some c -> c | None -> "" in + in_block := true; + block_cmd := cmd; + block_entries := []; + block_positionals := []; + block_desc := !current_desc; + current_desc := "" + end + end else + current_desc := "" + end else begin + (* inside an extern block — parse flag/positional lines *) + if String.length trimmed > 0 && trimmed.[0] = ']' then + finish_block () + else begin + (* extract description from # comment *) + let param_part, desc = + match String.split_on_char '#' trimmed with + | before :: rest -> + (String.trim before, + String.trim (String.concat "#" rest)) + | _ -> (trimmed, "") + in + if String.length param_part > 1 then begin + if param_part.[0] = '-' && param_part.[1] = '-' then begin + (* long flag: --flag(-s): type or --flag: type or --flag *) + let re_both = Str.regexp {|--\([a-zA-Z0-9-]+\)(-\([a-zA-Z0-9]\))\(: *\([a-zA-Z]+\)\)?|} in + let re_long = Str.regexp {|--\([a-zA-Z0-9-]+\)\(: *\([a-zA-Z]+\)\)?|} in + if try ignore (Str.search_forward re_both param_part 0); true + with Not_found -> false then begin + let long = Str.matched_group 1 param_part in + let short = (Str.matched_group 2 param_part).[0] in + let param = try Some (Mandatory (Str.matched_group 4 param_part)) + with Not_found | Invalid_argument _ -> None in + block_entries := { switch = Both (short, long); param; desc } :: !block_entries + end else if try ignore (Str.search_forward re_long param_part 0); true + with Not_found -> false then begin + let long = Str.matched_group 1 param_part in + let param = try Some (Mandatory (Str.matched_group 3 param_part)) + with Not_found | Invalid_argument _ -> None in + block_entries := { switch = Long long; param; desc } :: !block_entries + end + end else if param_part.[0] = '-' then begin + (* short flag: -s 
*) + if String.length param_part >= 2 then + let c = param_part.[1] in + if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') then + block_entries := { switch = Short c; param = None; desc } :: !block_entries + end else begin + (* positional: name: type or name?: type or ...name: type *) + let variadic = String.starts_with ~prefix:"..." param_part in + let part = if variadic then String.sub param_part 3 (String.length param_part - 3) + else param_part in + let optional = try let q = String.index part '?' in q > 0 + with Not_found -> false in + let name = match String.index_opt part ':' with + | Some i -> String.trim (String.sub part 0 i) + | None -> match String.index_opt part '?' with + | Some i -> String.trim (String.sub part 0 i) + | None -> String.trim part in + let name = String.map (function '-' -> '_' | c -> c) name in + if String.length name > 0 && name.[0] <> '-' then + block_positionals := { pos_name = name; optional = optional || variadic; + variadic } :: !block_positionals + end + end + end + end + ) lines; + finish_block (); + let blocks = List.rev !blocks in + (* find the block matching the target command *) + let target = target_cmd in + match List.find_opt (fun (cmd, _, _, _) -> cmd = target) blocks with + | Some (_, entries, positionals, description) -> + (* collect subcommands from other blocks that are children of this command *) + let prefix = target ^ " " in + let subcommands = List.filter_map (fun (cmd, _, _, desc) -> + if String.starts_with ~prefix cmd then + let sub_name = String.sub cmd (String.length prefix) + (String.length cmd - String.length prefix) in + (* only immediate subcommands (no further spaces) *) + if not (String.contains sub_name ' ') && String.length sub_name > 0 + then Some { name = sub_name; desc } + else None + else None + ) blocks in + { entries; subcommands; positionals; description } + | None -> + (* target not found — return empty result *) + { entries = []; subcommands = []; positionals = []; description = "" } + +(* 
look up a command and deserialize its help_result.
 * searches for .json files first, then falls back to .nu files
 * (parsing the nushell extern syntax to extract completion data).
 * for subcommands like "rbw get", also checks the parent's .nu file
 * (e.g. rbw.nu) since clap-generated .nu files contain all extern
 * blocks in a single file. *)
let lookup dirs command =
  let base_name = filename_of_command command in
  (* root command's file stem, used for subcommand lookups:
   * "rbw get" -> rbw.nu, searched for the "rbw get" extern block *)
  let parent_base =
    Option.map
      (fun i -> filename_of_command (String.sub command 0 i))
      (String.index_opt command ' ') in
  (* any exception raised while decoding counts as "nothing found here" *)
  let guarded decode = try decode () with _ -> None in
  let in_dir directory stem ext = Filename.concat directory (stem ^ ext) in
  (* last resort: the parent's .nu file, accepted only if it yields something *)
  let from_parent directory =
    match parent_base with
    | None -> None
    | Some pb ->
      (match read_file (in_dir directory pb ".nu") with
       | None -> None
       | Some data ->
         guarded (fun () ->
           let r = parse_nu_completions command data in
           if r.entries <> [] || r.subcommands <> [] || r.positionals <> []
           then Some r else None))
  in
  let probe directory =
    match read_file (in_dir directory base_name ".json") with
    | Some data ->
      guarded (fun () -> Some (help_result_of_json (parse_json data)))
    | None ->
      (match read_file (in_dir directory base_name ".nu") with
       | Some data ->
         guarded (fun () -> Some (parse_nu_completions command data))
       | None -> from_parent directory)
  in
  List.find_map probe dirs

(* look up a command's raw data (JSON or .nu source) without parsing.
 * used by the "query" command to dump stored data as-is.
*) +let lookup_raw dirs command = + let base_name = filename_of_command command in + List.find_map (fun directory -> + let json_path = Filename.concat directory (base_name ^ ".json") in + match read_file json_path with + | Some _ as result -> result + | None -> + let nu_path = Filename.concat directory (base_name ^ ".nu") in + read_file nu_path + ) dirs + +(* strip known extensions (.json or .nu) from a filename, returning None + * if the filename has neither extension *) +let chop_extension filename = + if Filename.check_suffix filename ".json" then Some (Filename.chop_suffix filename ".json") + else if Filename.check_suffix filename ".nu" then Some (Filename.chop_suffix filename ".nu") + else None + +(* discover subcommands of a command by scanning filenames in the store. + * looks for files whose names start with the command's filename + "_" + * (e.g. for "git", finds "git_add.json", "git_commit.json", etc.) + * + * only returns immediate subcommands (no nested underscores beyond the prefix). + * tries to extract description from the JSON "description" field if available. + * + * note: this filesystem-based discovery is used as a fallback when the + * command's own help_result doesn't list subcommands. it enables completion + * for subcommands that were indexed from separate manpages or help runs. 
*) +let subcommands_of dirs command = + let prefix = filename_of_command command ^ "_" in + let prefix_len = String.length prefix in + let module SMap = Map.Make(String) in + let subs = List.fold_left (fun subs directory -> + if is_dir directory then + Array.fold_left (fun subs filename -> + if not (String.starts_with ~prefix filename) then subs + else + let is_json = Filename.check_suffix filename ".json" in + match chop_extension filename with + | None -> subs + | Some base_name -> + let rest = String.sub base_name prefix_len (String.length base_name - prefix_len) in + (* skip nested subcommands and empty names *) + if String.contains rest '_' || String.length rest = 0 then subs + else if SMap.mem rest subs then subs + else + (* try to read the description from the JSON file *) + let desc = if is_json then + match read_file (Filename.concat directory filename) with + | Some data -> + (try json_to_string (json_get "description" (parse_json data)) + with _ -> "") + | None -> "" + else "" in + SMap.add rest { name = rest; desc } subs + ) subs (Sys.readdir directory) + else subs + ) SMap.empty dirs in + SMap.fold (fun _ sc acc -> sc :: acc) subs [] |> List.rev + +(* list all indexed commands across all store directories. + * returns a sorted, deduplicated list of command names. *) +let all_commands dirs = + let module SSet = Set.Make(String) in + List.fold_left (fun cmds directory -> + if is_dir directory then + Array.fold_left (fun cmds filename -> + match chop_extension filename with + | Some base_name -> SSet.add (command_of_filename base_name) cmds + | None -> cmds + ) cmds (Sys.readdir directory) + else cmds + ) SSet.empty dirs + |> SSet.elements + +(* determine how a command was indexed: "help", "manpage", "native", etc. + * for JSON files, reads the "source" field. for .nu files, returns "native". + * used by the "dump" command to show provenance. 
*) +let file_type_of dirs command = + let base_name = filename_of_command command in + List.find_map (fun directory -> + let json_path = Filename.concat directory (base_name ^ ".json") in + if Sys.file_exists json_path then + (match read_file json_path with + | Some data -> + (try Some (json_to_string (json_get "source" (parse_json data))) + with _ -> Some "json") + | None -> Some "json") + else + let nu_path = Filename.concat directory (base_name ^ ".nu") in + if Sys.file_exists nu_path then Some "native" + else None + ) dirs diff --git a/nix/module.nix b/nix/module.nix index a5d74b5..04ae7f2 100644 --- a/nix/module.nix +++ b/nix/module.nix @@ -10,7 +10,7 @@ # # Usage: # { pkgs, ... }: { -# imports = [ ./path/to/inshellah-rs/nix/module.nix ]; +# imports = [ ./path/to/inshellah/nix/module.nix ]; # programs.inshellah.enable = true; # } @@ -72,26 +72,6 @@ in ''; }; - timeoutMs = lib.mkOption { - type = lib.types.nullOr lib.types.int; - default = null; - example = 200; - description = '' - per-subprocess timeout in milliseconds. when null the binary's - compiled-in default is used (currently 200ms). - ''; - }; - - workers = lib.mkOption { - type = lib.types.nullOr lib.types.int; - default = null; - example = 8; - description = '' - worker thread count for the parallel scrape pool. when null, - `std::thread::available_parallelism` is used. 
- ''; - }; - snippet = lib.mkOption { type = lib.types.str; readOnly = true; @@ -129,14 +109,12 @@ in lib.concatStringsSep "\n" cfg.helpOnlyCommands ); helpOnlyFlag = lib.optionalString (cfg.helpOnlyCommands != [ ]) " --help-only ${helpOnlyFile}"; - timeoutFlag = lib.optionalString (cfg.timeoutMs != null) " --timeout-ms ${toString cfg.timeoutMs}"; - workersFlag = lib.optionalString (cfg.workers != null) " --workers ${toString cfg.workers}"; in '' mkdir -p ${destDir} if [ -d "$out/bin" ] && [ -d "$out/share/man" ]; then - ${inshellah} index "$out" --dir ${destDir}${ignoreFlag}${helpOnlyFlag}${timeoutFlag}${workersFlag} \ + ${inshellah} index "$out" --dir ${destDir}${ignoreFlag}${helpOnlyFlag} \ 2>/dev/null || true fi diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 2256bee..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub mod parsers; -pub mod pool; -pub mod store; -pub mod types; diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 62c82f1..0000000 --- a/src/main.rs +++ /dev/null @@ -1,1659 +0,0 @@ -//! inshellah CLI. -//! -//! subcommands: -//! index PREFIX... scan PREFIX/bin and PREFIX/share/man, write JSON cache -//! manpage FILE parse a single manpage, emit nushell extern -//! manpage-dir DIR batch-process manpages under DIR -//! complete CMD ARG... nushell external completer; reads the cache, -//! falls back to on-the-fly --help if uncached -//! query CMD print stored data for CMD -//! dump list indexed commands -//! 
completions emit nushell completion definitions for inshellah itself - -use std::collections::HashSet; -use std::fs; -use std::io::Read; -use std::path::{Path, PathBuf}; -use std::process::{Command, Stdio}; -use std::sync::Arc; -use std::time::{Duration, Instant}; - -use parking_lot::Mutex; - -use inshellah::parsers::help::help_parser; -use inshellah::parsers::manpage::{ - ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch, - extract_synopsis_command, parse_manpage_string, parse_manpage_with_subs, - read_manpage_file, -}; -use inshellah::parsers::nushell::{generate_extern, generate_module, is_nushell_builtin}; -use inshellah::pool::{ScrapePool, Submitter}; -use inshellah::store::{ - all_commands, default_store_path, ensure_dir, file_type_of, filename_of_command, lookup, - lookup_raw, subcommands_of, write_native, write_result, -}; - -const COMMAND_SECTIONS: &[u8] = &[1, 8]; - -/// per-subprocess timeout default when --timeout-ms isn't passed. -/// empirically tuned so that a slow-to-print binary doesn't block the -/// pool, while fast-responding ones (the vast majority) print their -/// --help well inside the window. with `n` parallel workers a 200ms -/// ceiling means the worst-case waste from an unresponsive binary is -/// `200ms / n_workers` of wall time. -const DEFAULT_TIMEOUT_MS: u64 = 200; - -fn usage() { - eprintln!( - "inshellah - nushell completions engine - -Usage: - inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] - [--timeout-ms N] [--workers N] - Index completions into a directory of JSON/nu files. - PREFIX is a directory containing bin/ and share/man/. - Default dir: $XDG_CACHE_HOME/inshellah - --ignore FILE skip listed commands entirely - --help-only FILE skip manpages for listed commands, use --help instead - --timeout-ms N per-subprocess timeout in milliseconds (default 200) - --workers N parallel scrape workers (default: cpu count) - inshellah complete CMD [ARGS...] 
[--dir PATH[:PATH...]] [--timeout-ms N] - Nushell custom completer. Outputs JSON completion candidates. - Falls back to --help resolution if command is not indexed. - --dir takes colon-separated paths. The first path is the writable - user cache; additional paths are read-only system directories. - inshellah query CMD [--dir PATH[:PATH...]] - Print stored completion data for CMD. - inshellah dump [--dir PATH[:PATH...]] - List indexed commands. - inshellah manpage FILE Parse a manpage and emit nushell extern - inshellah manpage-dir DIR Batch-process manpages under DIR - inshellah completions Generate nushell completions for inshellah -" - ); -} - -// --- subprocess management --- - -/// sanitized env: strip display-related variables to prevent gui tools from -/// popping up windows when run with --help. cached once per process — -/// `vars_os` walks the whole env every call, which adds up across thousands -/// of spawns. -fn safe_env_vars() -> &'static [(std::ffi::OsString, std::ffi::OsString)] { - static CACHE: std::sync::OnceLock> = - std::sync::OnceLock::new(); - CACHE.get_or_init(|| { - std::env::vars_os() - .filter(|(k, _)| { - let s = k.to_string_lossy(); - !(s == "DISPLAY" - || s == "WAYLAND_DISPLAY" - || s == "DBUS_SESSION_BUS_ADDRESS" - || s == "XAUTHORITY") - }) - .collect() - }) -} - -/// run a command with a timeout, capturing stdout+stderr merged. -/// returns None if the process couldn't be started, produced no output, -/// or was killed due to timeout. -/// -/// uses `poll(2)` on the pipe fds directly from the calling thread — no -/// reader threads, no try_wait polling loop. we block in the kernel for -/// either data (POLLIN), peer-close (POLLHUP), or the timeout deadline, -/// so the cost per subprocess is roughly one syscall per data chunk -/// plus the spawn itself. 
-/// -/// unix process groups still apply: the child is its own pgid leader, so -/// on timeout we killpg(pgid, SIGKILL) and the whole tree (wrapper -/// scripts, forked grandchildren) dies, closing the pipe writers and -/// letting our reads finish cleanly. -fn run_cmd(args: &[String], timeout_ms: u64) -> Option { - use std::io::Read; - use std::os::fd::AsRawFd; - use std::os::unix::process::CommandExt; - - if args.is_empty() { - return None; - } - let mut cmd = Command::new(&args[0]); - cmd.args(&args[1..]); - cmd.stdin(Stdio::null()); - cmd.stdout(Stdio::piped()); - cmd.stderr(Stdio::piped()); - cmd.env_clear(); - for (k, v) in safe_env_vars() { - cmd.env(k, v); - } - cmd.current_dir("/tmp"); - cmd.process_group(0); - - let mut child = cmd.spawn().ok()?; - let pgid = child.id() as i32; - let mut stdout = child.stdout.take()?; - let mut stderr = child.stderr.take()?; - let stdout_fd = stdout.as_raw_fd(); - let stderr_fd = stderr.as_raw_fd(); - - // both pipe fds must be non-blocking so poll-then-read can drain - // everything available without blocking on the next chunk. 
- unsafe { - for fd in [stdout_fd, stderr_fd] { - let flags = libc::fcntl(fd, libc::F_GETFL); - libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK); - } - } - - let deadline = Instant::now() + Duration::from_millis(timeout_ms); - let mut buf: Vec = Vec::with_capacity(4096); - let mut chunk = [0u8; 4096]; - let mut stdout_open = true; - let mut stderr_open = true; - let mut timed_out = false; - - while stdout_open || stderr_open { - let now = Instant::now(); - if now >= deadline { - timed_out = true; - break; - } - let remaining_ms = (deadline - now).as_millis().min(i32::MAX as u128) as i32; - - let mut fds = [ - libc::pollfd { - fd: if stdout_open { stdout_fd } else { -1 }, - events: libc::POLLIN, - revents: 0, - }, - libc::pollfd { - fd: if stderr_open { stderr_fd } else { -1 }, - events: libc::POLLIN, - revents: 0, - }, - ]; - let n = unsafe { libc::poll(fds.as_mut_ptr(), fds.len() as libc::nfds_t, remaining_ms) }; - if n < 0 { - // EINTR — retry. anything else: bail and let the child reap below. - if std::io::Error::last_os_error().kind() == std::io::ErrorKind::Interrupted { - continue; - } - break; - } - if n == 0 { - // poll itself returned without events — deadline check at top - // of next iter will catch it. - continue; - } - - // drain whichever fds are ready until EAGAIN or EOF. 
- for (i, pfd) in fds.iter().enumerate() { - if pfd.revents == 0 { - continue; - } - let (reader, open): (&mut dyn Read, &mut bool) = if i == 0 { - (&mut stdout as &mut dyn Read, &mut stdout_open) - } else { - (&mut stderr as &mut dyn Read, &mut stderr_open) - }; - loop { - match reader.read(&mut chunk) { - Ok(0) => { - *open = false; - break; - } - Ok(read) => buf.extend_from_slice(&chunk[..read]), - Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => break, - Err(_) => { - *open = false; - break; - } - } - } - if pfd.revents & (libc::POLLHUP | libc::POLLERR) != 0 { - *open = false; - } - } - } - - if timed_out { - unsafe { - libc::killpg(pgid, libc::SIGKILL); - } - } - let _ = child.wait(); - - if buf.is_empty() { - None - } else { - Some(String::from_utf8_lossy(&buf).into_owned()) - } -} - -// --- file classification --- - -fn is_executable(path: &Path) -> bool { - use std::os::unix::fs::PermissionsExt; - fs::metadata(path) - .map(|m| m.is_file() && (m.permissions().mode() & 0o111) != 0) - .unwrap_or(false) -} - -fn is_script(path: &Path) -> bool { - let real = match fs::canonicalize(path) { - Ok(p) => p, - Err(_) => return false, - }; - let Ok(mut f) = fs::File::open(&real) else { - return false; - }; - let mut buf = [0u8; 2]; - f.read_exact(&mut buf).map(|_| &buf == b"#!").unwrap_or(false) -} - -/// skip filenames that aren't real commands (e.g. doc/locale paths). -fn skip_name(name: &str) -> bool { - name.starts_with('.') - || name.ends_with(".so") - || name.ends_with(".a") - || name.ends_with(".la") - || name.contains('/') -} - -// --- ELF scanning --- - -/// scan an ELF binary (or any file) for string needles. returns the set of -/// needles that appeared. on read failure all needles are reported found -/// (conservative — we'd rather try --help than skip). 
-fn elf_scan(path: &Path, needles: &[&str]) -> HashSet { - let mut found: HashSet = HashSet::new(); - let real = match fs::canonicalize(path) { - Ok(p) => p, - Err(_) => { - for n in needles { - found.insert((*n).to_string()); - } - return found; - } - }; - let Ok(mut f) = fs::File::open(&real) else { - for n in needles { - found.insert((*n).to_string()); - } - return found; - }; - let mut magic = [0u8; 4]; - if f.read_exact(&mut magic).is_err() { - return found; - } - if magic != [0x7f, b'E', b'L', b'F'] { - // not ELF — return empty so caller decides - return found; - } - let max_needle = needles.iter().map(|s| s.len()).max().unwrap_or(0); - let chunk_size = 65536usize; - let mut buf = vec![0u8; chunk_size + max_needle]; - let mut carry = 0usize; - let needles_b: Vec<&[u8]> = needles.iter().map(|s| s.as_bytes()).collect(); - loop { - let n = match f.read(&mut buf[carry..carry + chunk_size]) { - Ok(n) => n, - Err(_) => 0, - }; - if n == 0 { - break; - } - let total = carry + n; - for (i, needle) in needles_b.iter().enumerate() { - let key = needles[i]; - if found.contains(key) { - continue; - } - if needle.len() > total { - continue; - } - let win = &buf[..total]; - if win - .windows(needle.len()) - .any(|w| w == *needle) - { - found.insert(key.to_string()); - } - } - if found.len() == needles.len() { - break; - } - let new_carry = max_needle.min(total); - buf.copy_within(total - new_carry..total, 0); - carry = new_carry; - } - found -} - -// --- nix wrapper detection --- - -fn read_to_string_capped(path: &Path, cap: usize) -> Option { - let real = fs::canonicalize(path).ok()?; - let md = fs::metadata(&real).ok()?; - if md.len() as usize > cap { - return None; - } - fs::read_to_string(&real).ok() -} - -/// detect nix-generated c wrappers; return the real binary path. 
-fn nix_wrapper_target(path: &Path) -> Option { - let contents = read_to_string_capped(path, 65536)?; - if !contents.contains("makeCWrapper") { - return None; - } - // pattern: /nix/store/-/bin/ - extract_nix_bin_path(&contents) -} - -/// detect nix-generated bash/sh wrappers. -fn nix_script_wrapper_target(path: &Path) -> Option { - let contents = read_to_string_capped(path, 4096)?; - if !contents.starts_with("#!") { - return None; - } - if !contents.contains("/nix/store/") { - return None; - } - if !(contents.contains("exec ") || contents.contains("exec\t")) { - return None; - } - extract_nix_bin_path(&contents) -} - -fn extract_nix_bin_path(contents: &str) -> Option { - let needle = "/nix/store/"; - let bytes = contents.as_bytes(); - let mut idx = 0; - while let Some(rel) = contents[idx..].find(needle) { - let start = idx + rel; - // find end of the path (whitespace, quote, or null) - let mut end = start + needle.len(); - while end < bytes.len() { - let b = bytes[end]; - if b == b' ' - || b == b'\t' - || b == b'\n' - || b == b'\r' - || b == b'"' - || b == b'\'' - || b == 0 - { - break; - } - end += 1; - } - let candidate = &contents[start..end]; - if candidate.contains("/bin/") { - let path = PathBuf::from(candidate); - if path.exists() { - return Some(path); - } - } - idx = end; - } - None -} - -// --- binary classification --- - -#[derive(Debug, Clone, PartialEq, Eq)] -enum Classify { - /// can try --help - TryHelp, - /// the tool likely speaks the "nushell" completion subcommand - HasNativeCompletions, - /// skip — doesn't look like a CLI we can extract from - Skip, -} - -/// classify an ELF binary by scanning for help/completion needles. 
-fn classify_elf(path: &Path) -> Classify { - let found = elf_scan(path, &["-h", "--help", "complet"]); - if found.contains("complet") { - Classify::HasNativeCompletions - } else if found.contains("-h") || found.contains("--help") { - Classify::TryHelp - } else { - Classify::Skip - } -} - -/// classify a binary by its actual nature: script, ELF, or nix wrapper. -fn classify_binary(_bindir: &Path, full: &Path) -> Classify { - if is_script(full) { - return Classify::TryHelp; - } - if let Some(target) = nix_wrapper_target(full) { - return classify_elf(&target); - } - if let Some(target) = nix_script_wrapper_target(full) { - return classify_elf(&target); - } - classify_elf(full) -} - -// --- help text extraction --- - -/// try `--help`, then `-h`, returning the first non-empty output (with -/// ANSI escapes stripped). each attempt gets the same per-call timeout. -/// we deliberately skip the third historical `help`-subcommand variant: -/// if neither flag yielded usable text, a positional `help` is unlikely -/// to do anything different and the extra spawn dominates indexing cost. -fn try_help(bin: &Path, timeout_ms: u64) -> Option { - let bin_s = bin.to_string_lossy().to_string(); - for variant in [&["--help"][..], &["-h"][..]] { - let mut args = vec![bin_s.clone()]; - args.extend(variant.iter().map(|s| s.to_string())); - if let Some(out) = run_cmd(&args, timeout_ms) { - let cleaned = fast_strip_ansi::strip_ansi_string(&out); - if !cleaned.trim().is_empty() { - return Some(cleaned.to_string()); - } - } - } - None -} - -/// detect text that looks like a rendered manpage (first non-blank line -/// matches `WORD(N)`). when --help delegates to man(1), the raw groff -/// source has richer structure than the rendered output, so we prefer -/// re-parsing the manpage file directly. 
-fn is_rendered_manpage(text: &str) -> bool { - let first = text.lines().find(|l| !l.trim().is_empty()); - let Some(line) = first else { return false }; - let trimmed = line.trim(); - let bytes = trimmed.as_bytes(); - if let Some(paren) = trimmed.find('(') { - if paren > 0 && paren + 2 < bytes.len() { - return bytes[paren + 1].is_ascii_digit() && bytes[paren + 2] == b')'; - } - } - false -} - -fn is_nushell_source(text: &str) -> bool { - text.len() > 20 - && (text.contains("export extern") - || text.contains("export def") - || (text.contains("module ") && text.contains("export"))) -} - -/// look for words that contain a known needle within the text (used to -/// find subcommand names that might be a native-completion command). -fn extract_matching_words(text: &str, needles: &[&str]) -> Vec { - let mut out: Vec = Vec::new(); - let mut seen: HashSet = HashSet::new(); - for token in text.split(|c: char| c.is_whitespace() || c == ',' || c == '|') { - let word = token.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_'); - if word.len() < 2 || word.starts_with('-') { - continue; - } - for needle in needles { - if word.contains(needle) && !seen.contains(word) { - seen.insert(word.to_string()); - out.push(word.to_string()); - break; - } - } - } - out -} - -/// try to get native nushell completions from a binary that supports them. 
-fn try_native_completion(bin: &Path, timeout_ms: u64) -> Option { - let help_text = try_help(bin, timeout_ms)?; - // look for words like "completion", "completions" — typical subcommand - let candidates = extract_matching_words(&help_text, &["complet"]); - let bin_s = bin.to_string_lossy().to_string(); - for sub in &candidates { - for args_form in [ - vec![bin_s.clone(), sub.clone(), "nushell".to_string()], - vec![bin_s.clone(), sub.clone(), "--shell".to_string(), "nushell".to_string()], - vec![bin_s.clone(), sub.clone(), "--shell=nushell".to_string()], - ] { - if let Some(out) = run_cmd(&args_form, timeout_ms) { - let cleaned = fast_strip_ansi::strip_ansi_string(&out); - if is_nushell_source(&cleaned) { - return Some(cleaned.to_string()); - } - } - } - } - None -} - -// --- subcommand recursion --- - -const MAX_RESOLVE_RESULTS: usize = 500; -const MAX_RECURSE_DEPTH: u32 = 5; - -fn parse_help_text(text: &str) -> ManpageResult { - let cleaned: String = fast_strip_ansi::strip_ansi_string(text).into_owned(); - match help_parser(&cleaned) { - Ok((_, r)) => (&r).into(), - Err(_) => ManpageResult::default(), - } -} - -/// recursively resolve subcommands, returning a vec of (cmd_path, result) -/// where cmd_path is the full "git stash apply" form. used by the -/// dynamic-resolve path in `cmd_complete`; the batch indexer uses the -/// pool instead, which expresses this same BFS shape with workers. 
-fn help_resolve( - bin: &Path, - cmd: &str, - depth: u32, - timeout_ms: u64, - acc: &mut Vec<(String, ManpageResult)>, -) { - if acc.len() >= MAX_RESOLVE_RESULTS { - return; - } - let Some(help_text) = try_help(bin, timeout_ms) else { - return; - }; - let result = parse_help_text(&help_text); - acc.push((cmd.to_string(), result)); - let initial_subs: Vec = acc - .last() - .map(|(_, r)| { - r.subcommands - .iter() - .map(|sc| sc.name.clone()) - .filter(|n| n.len() >= 2 && !n.starts_with('-')) - .collect() - }) - .unwrap_or_default(); - let bin_s = bin.to_string_lossy().to_string(); - for sub in initial_subs { - recurse_subcommand(&bin_s, cmd, &[sub.clone()], depth + 1, timeout_ms, acc); - } -} - -fn recurse_subcommand( - bin_s: &str, - base_cmd: &str, - sub_args: &[String], - depth: u32, - timeout_ms: u64, - acc: &mut Vec<(String, ManpageResult)>, -) { - if acc.len() >= MAX_RESOLVE_RESULTS || depth > MAX_RECURSE_DEPTH { - return; - } - let full_cmd = format!("{base_cmd} {}", sub_args.join(" ")); - let Some(text) = try_help_args(bin_s, sub_args, timeout_ms) else { - return; - }; - let result = parse_help_text(&text); - if result.entries.is_empty() - && result.subcommands.is_empty() - && result.positionals.is_empty() - { - return; - } - if let Some(leaf) = sub_args.last() { - let self_listed = result - .subcommands - .iter() - .any(|sc| sc.name.eq_ignore_ascii_case(leaf)); - if self_listed { - return; - } - } - let inner_subs: Vec = result - .subcommands - .iter() - .map(|sc| sc.name.clone()) - .filter(|n| n.len() >= 2 && !n.starts_with('-') && n != "help") - .collect(); - acc.push((full_cmd, result)); - for sub in inner_subs { - if acc.len() >= MAX_RESOLVE_RESULTS { - break; - } - let mut next = sub_args.to_vec(); - next.push(sub); - recurse_subcommand(bin_s, base_cmd, &next, depth + 1, timeout_ms, acc); - } -} - -/// try `bin sub_path... --help` first, then `... -h` if --help came back -/// empty or "No manual entry…". used by deep subcommand recursion. 
-fn try_help_args(bin_s: &str, sub_args: &[String], timeout_ms: u64) -> Option { - let mut primary_args: Vec = vec![bin_s.to_string()]; - primary_args.extend(sub_args.iter().cloned()); - primary_args.push("--help".to_string()); - let primary = run_cmd(&primary_args, timeout_ms); - let primary_text = primary - .as_deref() - .map(|s| fast_strip_ansi::strip_ansi_string(s).into_owned()); - let primary_useful = primary_text - .as_ref() - .map(|t| { - let trimmed = t.trim(); - !trimmed.is_empty() - && !trimmed.starts_with("No manual entry") - && !trimmed.starts_with("man:") - }) - .unwrap_or(false); - if primary_useful { - return primary_text; - } - let mut fallback_args: Vec = vec![bin_s.to_string()]; - fallback_args.extend(sub_args.iter().cloned()); - fallback_args.push("-h".to_string()); - if let Some(out) = run_cmd(&fallback_args, timeout_ms) { - let cleaned = fast_strip_ansi::strip_ansi_string(&out).into_owned(); - if !cleaned.trim().is_empty() { - return Some(cleaned); - } - } - primary_text -} - -/// convenience wrapper for the previous single-sub call sites -/// (the dynamic-resolve path in cmd_complete). 
-fn try_help_subcommand(bin_s: &str, sub: &str, timeout_ms: u64) -> Option { - try_help_args(bin_s, &[sub.to_string()], timeout_ms) -} - -// --- manpage handling --- - -fn cmd_name_of_manpage(path: &Path) -> String { - let mut base = path - .file_name() - .and_then(|s| s.to_str()) - .unwrap_or("") - .to_string(); - if base.ends_with(".gz") { - base.truncate(base.len() - 3); - } - // strip section suffix: "ls.1" -> "ls" - if let Some(dot) = base.rfind('.') { - base.truncate(dot); - } - base -} - -fn find_manpage_path(mandirs: &[PathBuf], hyphenated: &str) -> Option { - for mandir in mandirs { - for section in COMMAND_SECTIONS { - let secdir = mandir.join(format!("man{section}")); - for ext in ["", ".gz"] { - let path = secdir.join(format!("{hyphenated}.{section}{ext}")); - if path.is_file() { - return Some(path); - } - } - } - } - None -} - -/// derive the command name a manpage documents. the SYNOPSIS section -/// is authoritative because manpage filenames are ambiguous — -/// "btrfs-check.8" could mean either a standalone binary `btrfs-check` -/// or the subcommand `btrfs check`. we clamp to the number of -/// hyphen-separated parts in the filename to prevent synopsis lines -/// like "btrfs check [options] " from absorbing the device -/// placeholder into the command name. -fn resolve_manpage_cmd_name(file: &Path, contents: &str) -> String { - let fallback = cmd_name_of_manpage(file); - let max_words = fallback.matches('-').count() + 1; - match extract_synopsis_command(contents) { - Some(name) => { - let words: Vec<&str> = name.split(' ').filter(|w| !w.is_empty()).collect(); - if words.len() > max_words { - words[..max_words].join(" ") - } else { - name - } - } - None => fallback, - } -} - -/// process a manpage and return (cmd_name, main_result, per-subcommand results). -/// the sub_results come from clap-style `.SH SUBCOMMAND` sections — each is -/// a self-contained command with its own flags. 
-fn process_manpage( - file: &Path, -) -> Option<(String, ManpageResult, Vec<(String, ManpageResult)>)> { - let contents = read_manpage_file(file).ok()?; - let (result, sub_sections) = parse_manpage_with_subs(&contents); - if result.entries.is_empty() && result.subcommands.is_empty() && sub_sections.is_empty() { - return None; - } - let name = resolve_manpage_cmd_name(file, &contents); - if name.is_empty() { - return None; - } - // namespace the sub-section names under the resolved cmd name: - // e.g. nh's SUBCOMMAND "os" becomes the stored command "nh os". - let subs: Vec<(String, ManpageResult)> = sub_sections - .into_iter() - .map(|(sub_name, sub_result)| (format!("{name} {sub_name}"), sub_result)) - .collect(); - Some((name, result, subs)) -} - -/// collect the set of command names that have a manpage in section 1 or 8. -/// used during indexing to skip --help for commands the manpage phase -/// will handle anyway — manpages are more reliable than --help output. -fn manpaged_commands(mandirs: &[PathBuf]) -> HashSet { - let mut out = HashSet::new(); - for mandir in mandirs { - for section in COMMAND_SECTIONS { - let secdir = mandir.join(format!("man{section}")); - if let Ok(entries) = fs::read_dir(&secdir) { - for entry in entries.flatten() { - out.insert(cmd_name_of_manpage(&entry.path())); - } - } - } - } - out -} - -fn list_manpages(mandirs: &[PathBuf]) -> Vec { - let mut out = Vec::new(); - for mandir in mandirs { - for section in COMMAND_SECTIONS { - let secdir = mandir.join(format!("man{section}")); - if let Ok(entries) = fs::read_dir(&secdir) { - for entry in entries.flatten() { - out.push(entry.path()); - } - } - } - } - out -} - -// --- index command --- - -fn load_ignorelist(path: &Path) -> HashSet { - let mut out = HashSet::new(); - if let Ok(contents) = fs::read_to_string(path) { - for line in contents.lines() { - let line = line.trim(); - if !line.is_empty() && !line.starts_with('#') { - out.insert(line.to_string()); - } - } - } - out -} - -fn 
list_binaries(bindirs: &[PathBuf]) -> Vec<(String, PathBuf)> { - let mut all: Vec<(String, PathBuf)> = Vec::new(); - let mut seen: HashSet = HashSet::new(); - for bd in bindirs { - let Ok(entries) = fs::read_dir(bd) else { - continue; - }; - for entry in entries.flatten() { - let path = entry.path(); - let Some(name) = path.file_name().and_then(|s| s.to_str()) else { - continue; - }; - if skip_name(name) || is_nushell_builtin(name) { - continue; - } - if !is_executable(&path) { - continue; - } - if seen.insert(name.to_string()) { - all.push((name.to_string(), path)); - } - } - } - all.sort_by(|a, b| a.0.cmp(&b.0)); - all -} - -/// shared state passed to every pool worker. nothing inside mutates -/// except `indexed`, which is wrapped in a parking_lot::Mutex. -struct ScrapeCtx { - cache_dir: PathBuf, - mandirs: Vec, - indexed: Mutex>, - timeout_ms: u64, -} - -#[derive(Debug)] -struct PoolJob { - bin_path: PathBuf, - /// the binary's basename — e.g. "git". stays constant across the - /// whole recursion tree for this binary. - base_cmd: String, - /// chain of subcommand tokens past the base. empty for the - /// top-level scrape, ["clone"] for `git clone`, ["stash","apply"] - /// for `git stash apply`. - sub_args: Vec, - depth: u32, -} - -impl PoolJob { - fn full_cmd(&self) -> String { - if self.sub_args.is_empty() { - self.base_cmd.clone() - } else { - format!("{} {}", self.base_cmd, self.sub_args.join(" ")) - } - } -} - -/// per-job handler called by every worker. populates the cache + enqueues -/// child jobs (one per discovered subcommand) onto the same pool. -fn process_pool_job(ctx: &ScrapeCtx, job: PoolJob, submit: &Submitter) { - let full_cmd = job.full_cmd(); - if ctx.indexed.lock().contains(&full_cmd) { - return; - } - - // for the top-level scrape, also handle native-completion classification - // before falling through to --help. recursive sub-probes are always - // TryHelp-shaped — sub-commands don't ship their own "extern" payloads. 
- let bin_s = job.bin_path.to_string_lossy().to_string(); - if job.sub_args.is_empty() { - let class = classify_binary(&job.bin_path, &job.bin_path); - if matches!(class, Classify::Skip) { - return; - } - if matches!(class, Classify::HasNativeCompletions) - && let Some(nu) = try_native_completion(&job.bin_path, ctx.timeout_ms) - { - let _ = write_native(&ctx.cache_dir, &full_cmd, &nu); - ctx.indexed.lock().insert(full_cmd); - return; - } - } - - // scrape help text - let text = if job.sub_args.is_empty() { - try_help(&job.bin_path, ctx.timeout_ms) - } else { - try_help_args(&bin_s, &job.sub_args, ctx.timeout_ms) - }; - let Some(text) = text else { return }; - - // top-level: if --help is just a rendered manpage, prefer the source. - if job.sub_args.is_empty() - && is_rendered_manpage(&text) - && let Some(mp_path) = find_manpage_path(&ctx.mandirs, &job.base_cmd) - && let Ok(contents) = read_manpage_file(&mp_path) - { - let mp_result = parse_manpage_string(&contents); - if !mp_result.entries.is_empty() || !mp_result.subcommands.is_empty() { - let _ = write_result(&ctx.cache_dir, &full_cmd, "manpage", &mp_result); - ctx.indexed.lock().insert(full_cmd); - return; - } - } - - let result = parse_help_text(&text); - if result.entries.is_empty() - && result.subcommands.is_empty() - && result.positionals.is_empty() - { - return; - } - - // self-listing detection for sub-probes: if the leaf token shows up in - // the result's subcommand list, the binary probably echoed the parent - // help (didn't recognize the token). discard. - if let Some(leaf) = job.sub_args.last() { - if result - .subcommands - .iter() - .any(|sc| sc.name.eq_ignore_ascii_case(leaf)) - { - return; - } - } - - let _ = write_result(&ctx.cache_dir, &full_cmd, "help", &result); - ctx.indexed.lock().insert(full_cmd); - - // matches the sequential recurse_subcommand depth check (`depth > MAX`), - // not `>=`, so we get 6 levels (0..=5) of recursion. 
without this we - // were cutting off the last layer of deep clap trees like jay. - if job.depth > MAX_RECURSE_DEPTH { - return; - } - for sc in &result.subcommands { - if sc.name.len() < 2 || sc.name.starts_with('-') || sc.name == "help" { - continue; - } - let mut next = job.sub_args.clone(); - next.push(sc.name.clone()); - submit.submit(PoolJob { - bin_path: job.bin_path.clone(), - base_cmd: job.base_cmd.clone(), - sub_args: next, - depth: job.depth + 1, - }); - } -} - -fn cmd_index( - bindirs: &[PathBuf], - mandirs: &[PathBuf], - ignorelist: &HashSet, - help_only: &HashSet, - dir: &Path, - timeout_ms: u64, - num_workers: usize, -) -> std::io::Result<()> { - ensure_dir(dir)?; - let binaries = list_binaries(bindirs); - let manpaged = manpaged_commands(mandirs); - - // phase 1: parallel scrape of every eligible binary via the BFS pool. - // shared state lives in an Arc; the `indexed` set is the - // one mutable bit and uses parking_lot::Mutex. - let ctx = Arc::new(ScrapeCtx { - cache_dir: dir.to_path_buf(), - mandirs: mandirs.to_vec(), - indexed: Mutex::new(HashSet::new()), - timeout_ms, - }); - let pool = ScrapePool::new(num_workers, { - let ctx = ctx.clone(); - move |job: PoolJob, submit: &Submitter| { - process_pool_job(&ctx, job, submit); - } - }); - for (name, path) in &binaries { - if ignorelist.contains(name) { - continue; - } - // prefer manpages over --help: if this binary has a manpage in the same - // prefix, defer to the manpage phase below unless --help-only forces - // the help path. - if !help_only.contains(name) && manpaged.contains(name) { - continue; - } - pool.submit(PoolJob { - bin_path: path.clone(), - base_cmd: name.clone(), - sub_args: Vec::new(), - depth: 0, - }); - } - pool.wait(); - // unwrap the indexed set back out for phase 2 — by this point no - // workers are alive so the Arc has only one strong reference. 
- let mut indexed: HashSet = - Arc::try_unwrap(ctx).ok().map(|c| c.indexed.into_inner()).unwrap_or_default(); - - // process manpages for commands not yet indexed (unless they're in help-only). - // shorter filenames sort first so parent manpages (e.g. nix-env.1) are - // processed before subpage manpages (nix-env-install.1). - let mut manpages = list_manpages(mandirs); - manpages.sort_by(|a, b| { - let alen = a.file_name().map(|s| s.len()).unwrap_or(0); - let blen = b.file_name().map(|s| s.len()).unwrap_or(0); - alen.cmp(&blen).then_with(|| a.cmp(b)) - }); - for manpage_path in manpages { - let Some((name, result, sub_sections)) = process_manpage(&manpage_path) else { - continue; - }; - let base_cmd = cmd_name_of_manpage(&manpage_path); - if indexed.contains(&name) { - if name != base_cmd { - eprintln!( - "warning: {} extracted cmd \"{}\" (already indexed), skipping", - manpage_path - .file_name() - .and_then(|s| s.to_str()) - .unwrap_or(""), - name - ); - } - continue; - } - if help_only.contains(&name) { - continue; - } - if is_nushell_builtin(&name) { - continue; - } - // clap-style SUBCOMMAND sections produce real, fully-populated - // sub-files (each with its own flags + positionals); they take - // priority over COMMANDS-section leaf stubs. - write_result(dir, &name, "manpage", &result)?; - indexed.insert(name.clone()); - for (sub_cmd, sub_result) in &sub_sections { - if indexed.contains(sub_cmd) { - continue; - } - write_result(dir, sub_cmd, "manpage", sub_result)?; - indexed.insert(sub_cmd.clone()); - } - // for COMMANDS-section subcommands that aren't already covered by - // a SUBCOMMAND section (or a per-subcommand manpage), write a - // description-only stub so the completer treats them as leaves. - // a real per-subcommand manpage processed later will overwrite the - // stub since we deliberately don't add it to `indexed`. 
- if sub_sections.is_empty() { - for sc in &result.subcommands { - let sub_cmd = format!("{name} {}", sc.name); - if indexed.contains(&sub_cmd) { - continue; - } - let stub = ManpageResult { - entries: Vec::new(), - subcommands: Vec::new(), - positionals: Default::default(), - description: sc.desc.clone(), - }; - write_result(dir, &sub_cmd, "manpage", &stub)?; - } - } - } - - println!("indexed {} commands into {}", indexed.len(), dir.display()); - Ok(()) -} - -// --- manpage subcommand --- - -fn cmd_manpage(file: &Path) -> std::io::Result<()> { - let contents = read_manpage_file(file)?; - let result = parse_manpage_string(&contents); - let name = cmd_name_of_manpage(file); - print!("{}", generate_extern(&name, &result)); - Ok(()) -} - -fn cmd_manpage_dir(dir: &Path) -> std::io::Result<()> { - for section in COMMAND_SECTIONS { - let secdir = dir.join(format!("man{section}")); - let Ok(entries) = fs::read_dir(&secdir) else { - continue; - }; - for entry in entries.flatten() { - let path = entry.path(); - if let Ok(contents) = read_manpage_file(&path) { - let result = parse_manpage_string(&contents); - if result.entries.is_empty() && result.subcommands.is_empty() { - continue; - } - let name = cmd_name_of_manpage(&path); - if name.is_empty() { - continue; - } - print!("{}", generate_extern(&name, &result)); - } - } - } - Ok(()) -} - -// --- query / dump / complete --- - -fn cmd_query(cmd: &str, dirs: &[PathBuf]) -> std::io::Result<()> { - match lookup_raw(dirs, cmd) { - Some(data) => { - print!("{data}"); - Ok(()) - } - None => { - eprintln!("not found: {cmd}"); - std::process::exit(1); - } - } -} - -fn cmd_dump(dirs: &[PathBuf]) { - let cmds = all_commands(dirs); - println!("{} commands", cmds.len()); - for cmd in &cmds { - let src = file_type_of(dirs, cmd).unwrap_or_else(|| "?".to_string()); - println!("{src:>8} {cmd}"); - } -} - -/// look up a command's path in $PATH. 
-fn find_in_path(name: &str) -> Option { - let path_var = std::env::var("PATH").ok()?; - for dir in path_var.split(':') { - let candidate = Path::new(dir).join(name); - if is_executable(&candidate) { - return Some(candidate); - } - } - None -} - -/// compute a simple "needle in haystack" fuzzy score. zero means no match. -/// substring match scores higher than subsequence, prefix match scores -/// higher than mid-string, contiguous chars score better than gaps. -fn fuzzy_score(needle: &str, haystack: &str) -> i32 { - if needle.is_empty() { - return 1; - } - let n = needle.to_ascii_lowercase(); - let h = haystack.to_ascii_lowercase(); - if h == n { - return 1000; - } - if let Some(idx) = h.find(&n) { - let base = 500 - (idx as i32 * 10); - return base.max(50); - } - // subsequence match (each char in needle appears in order) - let mut hi = h.chars(); - let mut matched = 0; - for c in n.chars() { - let mut found = false; - for hc in hi.by_ref() { - if hc == c { - matched += 1; - found = true; - break; - } - } - if !found { - return 0; - } - } - if matched == n.chars().count() { 10 } else { 0 } -} - -fn json_escape(s: &str) -> String { - let mut out = String::with_capacity(s.len() + 2); - for c in s.chars() { - match c { - '"' => out.push_str("\\\""), - '\\' => out.push_str("\\\\"), - '\n' => out.push_str("\\n"), - '\r' => out.push_str("\\r"), - '\t' => out.push_str("\\t"), - c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)), - c => out.push(c), - } - } - out -} - -fn completion_json(value: &str, desc: &str) -> String { - format!( - r#"{{"value":"{}","description":"{}"}}"#, - json_escape(value), - json_escape(desc) - ) -} - -/// dynamically scrape --help for a command not in the cache, write the result -/// into the user store, and return its parsed form. discovered subcommands -/// are also written. 
-fn resolve_and_cache( - user_dir: &Path, - mandirs: &[PathBuf], - cmd_name: &str, - path: &Path, - timeout_ms: u64, -) -> Option { - if matches!(classify_binary(path, path), Classify::HasNativeCompletions) - && let Some(nu) = try_native_completion(path, timeout_ms) - { - let _ = write_native(user_dir, cmd_name, &nu); - } - let text = try_help(path, timeout_ms)?; - if is_rendered_manpage(&text) - && let Some(mp_path) = find_manpage_path(mandirs, cmd_name) - && let Ok(contents) = read_manpage_file(&mp_path) - { - let result = parse_manpage_string(&contents); - if !result.entries.is_empty() || !result.subcommands.is_empty() { - let _ = write_result(user_dir, cmd_name, "manpage", &result); - return Some(result); - } - } - let parsed = parse_help_text(&text); - let _ = write_result(user_dir, cmd_name, "help", &parsed); - let mut sub_acc: Vec<(String, ManpageResult)> = Vec::new(); - help_resolve(path, cmd_name, 1, timeout_ms, &mut sub_acc); - for (cmd, r) in sub_acc.into_iter().skip(1) { - let _ = write_result(user_dir, &cmd, "help", &r); - } - Some(parsed) -} - -const ELEVATION_COMMANDS: &[&str] = &[ - "sudo", "doas", "pkexec", "su", "run0", -]; - -fn cmd_complete( - spans: &[String], - user_dir: &Path, - system_dirs: &[PathBuf], - mandirs: &[PathBuf], - timeout_ms: u64, -) { - let mut dirs: Vec = system_dirs.to_vec(); - dirs.push(user_dir.to_path_buf()); - - // skip past elevation wrappers (sudo, doas) to find the real command - let spans: Vec = match spans.first() { - Some(first) if ELEVATION_COMMANDS.contains(&first.as_str()) => { - let rest = &spans[1..]; - let real = rest.iter().position(|s| { - !s.is_empty() - && !s.starts_with('-') - && (lookup(&dirs, s).is_some() || find_in_path(s).is_some()) - }); - match real { - Some(idx) => rest[idx..].to_vec(), - None => spans.to_vec(), - } - } - _ => spans.to_vec(), - }; - - if spans.is_empty() { - println!("null"); - return; - } - - let cmd_name = spans[0].clone(); - let rest: Vec = spans[1..].to_vec(); - - // strip 
intermediate flag tokens — they aren't part of subcommand path - let mut tokens: Vec = vec![cmd_name.clone()]; - if !rest.is_empty() { - let (last, leading) = rest.split_last().unwrap(); - for t in leading { - if !t.starts_with('-') || t.is_empty() { - tokens.push(t.clone()); - } - } - tokens.push(last.clone()); - } - - let last_token = rest.last().cloned().unwrap_or_default(); - // lookup tokens exclude the partial unless the user has typed a trailing space - let lookup_tokens: Vec = if last_token.is_empty() { - tokens.clone() - } else if tokens.len() > 1 { - tokens[..tokens.len() - 1].to_vec() - } else { - vec![cmd_name.clone()] - }; - - // try longest-prefix match: "git stash apply" → "git stash" → "git" - let find_result = |toks: &[String]| -> Option<(String, ManpageResult, usize)> { - let n = toks.len(); - for drop in 0..n { - let prefix = &toks[..n - drop]; - if prefix.is_empty() { - continue; - } - let name = prefix.join(" "); - if let Some(r) = lookup(&dirs, &name) { - return Some((name, r, prefix.len())); - } - } - None - }; - - let mut found = find_result(&lookup_tokens); - - // dynamic resolve: if nothing matches or only a parent matched, try --help - let lookup_depth = lookup_tokens.len(); - let need_resolve = match &found { - Some((_, _, depth)) => *depth < lookup_depth.saturating_sub(1), - None => true, - }; - if need_resolve - && let Some(path) = find_in_path(&cmd_name) - { - // build extended mandirs from the binary's own prefix as well - let mut all_mandirs = mandirs.to_vec(); - if let Some(parent) = path.parent() - && let Some(prefix) = parent.parent() - { - let share_man = prefix.join("share/man"); - if share_man.is_dir() { - all_mandirs.push(share_man); - } - } - if resolve_and_cache(user_dir, &all_mandirs, &cmd_name, &path, timeout_ms).is_some() { - found = find_result(&lookup_tokens); - } - } - - let candidates: Vec = match &found { - None => Vec::new(), - Some((matched_name, r, depth)) => { - let mut scored: Vec<(i32, String)> = Vec::new(); - 
// subcommand candidates (skip if match is too shallow) - if *depth >= lookup_depth.saturating_sub(1) { - let subs: Vec = if !r.subcommands.is_empty() { - r.subcommands.clone() - } else { - subcommands_of(&dirs, matched_name) - }; - for sc in &subs { - let s = fuzzy_score(&last_token, &sc.name); - if s > 0 { - scored.push((s, completion_json(&sc.name, &sc.desc))); - } - } - } - // flag candidates - for e in &r.entries { - let base_desc = match &e.param { - Some(OwnedParam::Mandatory(p)) => { - if e.desc.is_empty() { - format!("<{p}>") - } else { - format!("{} <{p}>", e.desc) - } - } - Some(OwnedParam::Optional(p)) => { - if e.desc.is_empty() { - format!("[{p}]") - } else { - format!("{} [{p}]", e.desc) - } - } - None => e.desc.clone(), - }; - let (flag, desc) = match &e.switch { - OwnedSwitch::Long(l) => (format!("--{l}"), base_desc), - OwnedSwitch::Short(c) => (format!("-{c}"), base_desc), - OwnedSwitch::Both(c, l) => { - let long_flag = format!("--{l}"); - let short_flag = format!("-{c}"); - let ls = fuzzy_score(&last_token, &long_flag); - let ss = fuzzy_score(&last_token, &short_flag); - if ss > ls { - (short_flag, format!("(aka {long_flag}) {base_desc}")) - } else { - (long_flag.clone(), format!("(aka {short_flag}) {base_desc}")) - } - } - }; - let s = fuzzy_score(&last_token, &flag); - if s > 0 { - scored.push((s, completion_json(&flag, &desc))); - } - } - scored.sort_by(|a, b| b.0.cmp(&a.0)); - scored.into_iter().map(|(_, json)| json).collect() - } - }; - - // protocol: null = hand off to nushell's file completer; [...] = our candidates - let typing_flag = last_token.starts_with('-') && !last_token.is_empty(); - let has_subs = match &found { - Some((matched_name, r, _)) => { - !r.subcommands.is_empty() || !subcommands_of(&dirs, matched_name).is_empty() - } - None => false, - }; - // hand off to file completer only when the user's prefix is empty - // or doesn't match anything. 
when the user typed a specific token - // that happens to match a flag, prefer showing the match. - let want_files = - !typing_flag && !has_subs && (last_token.is_empty() || candidates.is_empty()); - if want_files || candidates.is_empty() { - println!("null"); - } else { - println!("[{}]", candidates.join(",")); - } -} - -// --- completions self-emission --- - -fn cmd_completions() { - // emit completions for inshellah itself. - let mut entries: Vec = Vec::new(); - entries.push(ManpageEntry { - switch: OwnedSwitch::Both('h', "help".to_string()), - param: None, - desc: "show help".to_string(), - }); - let subs = ["index", "manpage", "manpage-dir", "completions"]; - let mut subcommands = Vec::new(); - for s in subs { - subcommands.push(ManpageSubcommand { - name: s.to_string(), - desc: String::new(), - }); - } - let result = ManpageResult { - entries, - subcommands, - positionals: Default::default(), - description: "nushell completions engine".to_string(), - }; - print!("{}", generate_module("inshellah", &result)); -} - -// --- argument parsing --- - -struct IndexArgs { - prefixes: Vec, - dir: Option, - ignore: Option, - help_only: Option, - timeout_ms: u64, - workers: usize, -} - -fn parse_index_args(args: &[String]) -> IndexArgs { - let mut out = IndexArgs { - prefixes: Vec::new(), - dir: None, - ignore: None, - help_only: None, - timeout_ms: DEFAULT_TIMEOUT_MS, - workers: default_workers(), - }; - let mut i = 0; - while i < args.len() { - match args[i].as_str() { - "--dir" => { - i += 1; - if i < args.len() { - out.dir = Some(PathBuf::from(&args[i])); - } - } - "--ignore" => { - i += 1; - if i < args.len() { - out.ignore = Some(PathBuf::from(&args[i])); - } - } - "--help-only" => { - i += 1; - if i < args.len() { - out.help_only = Some(PathBuf::from(&args[i])); - } - } - "--timeout-ms" => { - i += 1; - if i < args.len() - && let Ok(n) = args[i].parse::() - { - out.timeout_ms = n; - } - } - "--workers" => { - i += 1; - if i < args.len() - && let Ok(n) = 
args[i].parse::() - { - out.workers = n.max(1); - } - } - other => { - out.prefixes.push(PathBuf::from(other)); - } - } - i += 1; - } - out -} - -/// best-effort thread count default: `available_parallelism` (1.59+), else 4. -fn default_workers() -> usize { - std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(4) -} - -fn man_dir_of_prefix(prefix: &Path) -> PathBuf { - prefix.join("share/man") -} - -/// parse --dir PATH[:PATH...], optional --timeout-ms N, plus any -/// positional args. when --dir isn't supplied, returns the default cache -/// dir as the single entry. -fn parse_dir_args(args: &[String]) -> (Vec, Vec, u64) { - let mut positional = Vec::new(); - let mut dirs: Option> = None; - let mut timeout_ms = DEFAULT_TIMEOUT_MS; - let mut i = 0; - while i < args.len() { - match args[i].as_str() { - "--dir" => { - i += 1; - if i < args.len() { - dirs = Some(args[i].split(':').map(PathBuf::from).collect()); - } - } - "--timeout-ms" => { - i += 1; - if i < args.len() - && let Ok(n) = args[i].parse::() - { - timeout_ms = n; - } - } - _ => { - positional.push(args[i].clone()); - } - } - i += 1; - } - let dirs = dirs.unwrap_or_else(|| vec![default_store_path()]); - (positional, dirs, timeout_ms) -} - -fn main() { - let args: Vec = std::env::args().collect(); - if args.len() < 2 { - usage(); - std::process::exit(1); - } - match args[1].as_str() { - "index" => { - let parsed = parse_index_args(&args[2..]); - if parsed.prefixes.is_empty() { - eprintln!("error: index requires at least one PREFIX"); - std::process::exit(1); - } - let dir = parsed.dir.unwrap_or_else(default_store_path); - let ignorelist = parsed - .ignore - .as_deref() - .map(load_ignorelist) - .unwrap_or_default(); - let help_only = parsed - .help_only - .as_deref() - .map(load_ignorelist) - .unwrap_or_default(); - let bindirs: Vec = - parsed.prefixes.iter().map(|p| p.join("bin")).collect(); - let mandirs: Vec = - parsed.prefixes.iter().map(|p| man_dir_of_prefix(p)).collect(); - if let 
Err(e) = cmd_index( - &bindirs, - &mandirs, - &ignorelist, - &help_only, - &dir, - parsed.timeout_ms, - parsed.workers, - ) { - eprintln!("index failed: {e}"); - std::process::exit(1); - } - } - "manpage" => { - if args.len() < 3 { - eprintln!("error: manpage requires a FILE argument"); - std::process::exit(1); - } - if let Err(e) = cmd_manpage(Path::new(&args[2])) { - eprintln!("manpage failed: {e}"); - std::process::exit(1); - } - } - "manpage-dir" => { - if args.len() < 3 { - eprintln!("error: manpage-dir requires a DIR argument"); - std::process::exit(1); - } - if let Err(e) = cmd_manpage_dir(Path::new(&args[2])) { - eprintln!("manpage-dir failed: {e}"); - std::process::exit(1); - } - } - "complete" => { - let (positional, dirs, timeout_ms) = parse_dir_args(&args[2..]); - // first dir is the writable user cache; rest are read-only system dirs - let (user_dir, system_dirs): (PathBuf, Vec) = match dirs.split_first() { - Some((first, rest)) => (first.clone(), rest.to_vec()), - None => (default_store_path(), Vec::new()), - }; - // mandirs default to share/man siblings of each system dir - let mandirs: Vec = system_dirs - .iter() - .filter_map(|d| d.parent().map(|p| p.join("share/man"))) - .filter(|p| p.is_dir()) - .collect(); - cmd_complete(&positional, &user_dir, &system_dirs, &mandirs, timeout_ms); - } - "query" => { - let (positional, dirs, _timeout_ms) = parse_dir_args(&args[2..]); - if positional.is_empty() { - eprintln!("error: query requires a CMD argument"); - std::process::exit(1); - } - let cmd = positional.join(" "); - if let Err(e) = cmd_query(&cmd, &dirs) { - eprintln!("query failed: {e}"); - std::process::exit(1); - } - } - "dump" => { - let (_, dirs, _timeout_ms) = parse_dir_args(&args[2..]); - cmd_dump(&dirs); - } - "completions" => cmd_completions(), - "--help" | "-h" | "help" => usage(), - other => { - eprintln!("unknown subcommand: {other}"); - usage(); - std::process::exit(1); - } - } - // make warning go away - let _ = filename_of_command; -} 
diff --git a/src/parsers/help.rs b/src/parsers/help.rs deleted file mode 100644 index ee55c63..0000000 --- a/src/parsers/help.rs +++ /dev/null @@ -1,144 +0,0 @@ -mod description; -mod helpers; -mod options; -mod positionals; -mod subcommands; - -pub use options::{param_parser, parse_usage_flags, switch_parser}; -pub use positionals::{ - extract_cli11_positionals, extract_usage_positionals, parse_usage_args, - skip_command_name, -}; - -use std::collections::HashMap; - -use crate::{ - parsers::help::{ - description::description, - helpers::{eol, get_indent, rest_of_line}, - subcommands::subcommand_entry, - }, - types::*, -}; -use nom::{ - IResult, Parser, - branch::alt, - bytes::complete::tag_no_case, - character::complete::{char, line_ending, space0}, - combinator::{opt, peek, rest, value, verify}, - multi::many0, - sequence::{delimited, terminated}, -}; - -use crate::make_parser; - -/// parse a single flag entry: indent + switch + optional param + description. -make_parser!(entry -> OptionEntry<'_>, - ( - space0, - (switch_parser, opt(param_parser)), - description, - ) - => |(_, (switch, param), (first, cont)) - : (_, (Switch<'a>, Option>), (&'a str, Vec<&'a str>))| - { - let mut desc: Vec<&str> = Vec::with_capacity(1 + cont.len()); - if !first.trim().is_empty() { desc.push(first); } - desc.extend(cont.into_iter().filter(|l| !l.trim().is_empty())); - OptionEntry { switch, param, desc } - } -); - -enum ParseResult<'a> { - OptionEntry(OptionEntry<'a>), - SectionHeader, - Subcommand(Subcommand<'a>), - NonOptionLine, -} - -/// fallback: consume the current line + line_ending unconditionally. used -/// after entry / section_header / subcommand_entry have all failed. 
-make_parser!(skip_non_option_line -> (), - value((), terminated(rest_of_line, line_ending))); - -make_parser!(is_arg_section -> (), - value((), - alt(( - tag_no_case("positional arguments"), - tag_no_case("arguments"), - tag_no_case("positionals"), - tag_no_case("args"), - )), - ) -); - -make_parser!(section_header -> (), - value((), - delimited( - verify(space0, |ss: &str| get_indent(ss).1 <= 4), - is_arg_section, - (char(':'), rest_of_line, eol) - ) - ) -); - -/// dedup raw subcommands by case-insensitive name, keeping the entry with -/// the longest description. preserves first-seen ordering. -fn dedup_subcommands<'a>(raw: Vec>) -> Vec> { - let mut by_name: HashMap> = HashMap::new(); - let mut order: Vec = Vec::new(); - for sc in raw { - let key = sc.name.to_ascii_lowercase(); - match by_name.get(&key) { - Some(prev) if prev.desc.len() >= sc.desc.len() => {} - _ => { - if !by_name.contains_key(&key) { - order.push(key.clone()); - } - by_name.insert(key, sc); - } - } - } - order.into_iter().map(|k| by_name.remove(&k).unwrap()).collect() -} - -/// build the final HelpResult from the parse outputs + a copy of the -/// original input (for whole-input positional extraction). -fn build_help_result<'a>(original: &'a str, results: Vec>) -> HelpResult<'a> { - let mut entries = Vec::new(); - let mut raw_subcommands: Vec> = Vec::new(); - for res in results { - use ParseResult::*; - // TODO: track in_arg_sec to filter subcommands under positional sections - match res { - OptionEntry(e) => entries.push(e), - Subcommand(e) => raw_subcommands.push(e), - _ => (), - } - } - let subcommands = dedup_subcommands(raw_subcommands); - // cli11 positional section takes priority over the usage-line scan - // when both are present — cli11 carries types and optionality. 
- let positionals = match extract_cli11_positionals(original) { - Ok((_, p)) if !p.is_empty() => p, - _ => extract_usage_positionals(original).map(|(_, p)| p).unwrap_or_default(), - }; - HelpResult { entries, subcommands, positionals, desc: "" } -} - -/// top-level help parser. `peek(rest)` captures the original input slice -/// so build_help_result can run the positional extractors over the whole -/// thing while many0 still parses from the same position. -make_parser!(pub help_parser -> HelpResult<'a>, - ( - peek(rest), - many0(alt(( - entry.map(ParseResult::OptionEntry), - section_header.map(|_| ParseResult::SectionHeader), - subcommand_entry.map(ParseResult::Subcommand), - skip_non_option_line.map(|_| ParseResult::NonOptionLine), - ))), - ) - => |(original, results): (&'a str, Vec>)| - build_help_result(original, results) -); diff --git a/src/parsers/help/description.rs b/src/parsers/help/description.rs deleted file mode 100644 index 7645c1e..0000000 --- a/src/parsers/help/description.rs +++ /dev/null @@ -1,42 +0,0 @@ -use nom::{ - IResult, Parser, - character::complete::space0, - combinator::verify, - multi::many0, - sequence::{preceded, terminated}, -}; - -use crate::make_parser; -use crate::parsers::help::helpers::{at_least_indent, eol, rest_of_line}; - -/// continuation line: an indented (≥8 visual cols), non-flag-shaped line -/// belonging to the previous flag's description. blank-but-indented lines -/// are accepted (content = ""), filtered out by the caller's join. -make_parser!(continuation_line -> &str, - verify( - preceded( - // assert ≥8 visual cols of leading horizontal whitespace - // without consuming — space0 inside `rest_of_line`'s preceded - // will eat them next. - at_least_indent(8), - terminated(preceded(space0, rest_of_line), eol) - ), - // reject lines whose first non-space char is '-' — that's a new - // flag entry, not a continuation of the previous one. 
- |content: &&str| !content.starts_with('-') - ) -); - -/// description: the line of text after the switch+param, plus any -/// continuation lines. always succeeds — first line may be empty (when -/// the switch is followed immediately by a newline, "clap long" style). -make_parser!(pub description -> (&'a str, Vec<&'a str>), - ( - terminated(preceded(space0, rest_of_line), eol), - many0(continuation_line), - )); - -/// description that appears entirely on continuation lines below the switch. -/// kept exported for callers that want it explicitly (none currently). -make_parser!(pub description_below -> Vec<&'a str>, - many0(continuation_line)); diff --git a/src/parsers/help/helpers.rs b/src/parsers/help/helpers.rs deleted file mode 100644 index 9213637..0000000 --- a/src/parsers/help/helpers.rs +++ /dev/null @@ -1,100 +0,0 @@ -use nom::{ - AsChar, IResult, Parser, - branch::alt, - bytes::complete::take_till, - character::complete::line_ending, - combinator::eof, -}; -#[allow(unused_imports)] -use nom::{bytes::complete::take_while, combinator::peek, combinator::verify}; - -#[macro_export] -macro_rules! make_parser { - (pub $name:ident -> $out:ty, $parser:expr => $wrap:expr) => { - pub fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { - let (rem, val) = $parser.parse(s)?; - Ok((rem, $wrap(val))) - } - }; - (pub $name:ident -> $out:ty, $parser:expr) => { - pub fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { - $parser.parse(s) - } - }; - ($name:ident -> $out:ty, $parser:expr => $wrap:expr) => { - fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { - let (rem, val) = $parser.parse(s)?; - Ok((rem, $wrap(val))) - } - }; - ($name:ident -> $out:ty, $parser:expr) => { - fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { - $parser.parse(s) - } - }; -} - -#[macro_export] -macro_rules! 
make_predicate { - (pub $name:ident, |$c:ident| $($body:tt)*) => { - pub fn $name($c: char) -> bool { $($body)* } - }; - ($name:ident, |$c:ident| $($body:tt)*) => { - fn $name($c: char) -> bool { $($body)* } - }; -} - -make_predicate!(pub is_option_char, |c| c.is_alphanumeric() || matches!(c, '-' | '_')); - -make_parser!(pub rest_of_line -> &str, - take_till(|c: char| c.is_newline()) -); - -/// end of line — matches either a newline or end of input. -/// permissive version used in most line-consuming parsers. -make_parser!(pub eol -> &str, alt((line_ending, eof))); - -/// compute the visual indent of a leading whitespace run. -/// spaces count 1, tabs count 8 (typical terminal default). -pub fn visual_indent(s: &str) -> u8 { - s.chars().fold(0u8, |acc, c| { - acc.saturating_add(match c { - ' ' => 1, - '\t' => 8, - _ => 0, - }) - }) -} - -/// nom-shaped check that the input begins with at least `min` visual -/// columns of horizontal whitespace (spaces or tabs). doesn't consume — -/// pair with `space0`/`take_while` to actually eat the indent. -pub fn at_least_indent<'a>( - min: u8, -) -> impl Parser<&'a str, Output = &'a str, Error = nom::error::Error<&'a str>> { - verify( - peek(take_while(|c: char| c == ' ' || c == '\t')), - move |s: &str| visual_indent(s) >= min, - ) -} - -/// legacy helper: returns (byte index of first non-space, visual indent). -/// used by callers that still need the byte index. 
-pub fn get_indent(s: &str) -> (usize, u8) { - let mut traversed = 0; - let mut indent = 0; - for (i, c) in s.char_indices() { - let incr = match c { - ' ' => 1, - '\t' => 8, - _ => 0, - }; - if incr == 0 { - traversed = i; - break; - } else { - indent += incr; - } - } - (traversed, indent) -} diff --git a/src/parsers/help/options.rs b/src/parsers/help/options.rs deleted file mode 100644 index e2f4faa..0000000 --- a/src/parsers/help/options.rs +++ /dev/null @@ -1,171 +0,0 @@ -use crate::make_parser; -use crate::parsers::help::helpers::is_option_char; -use crate::types::*; - -use nom::bytes::complete::{take_till, take_till1}; -use nom::character::complete::space0; -use nom::combinator::{map, opt}; -use nom::multi::many0; -use nom::sequence::separated_pair; -use nom::{ - IResult, Parser, - branch::alt, - bytes::complete::{tag, take_while1}, - character::complete::{char, satisfy}, - combinator::{value, verify}, - sequence::{delimited, preceded}, -}; - -make_parser!(short_switch -> char, - preceded(char('-'), satisfy(|c| c.is_alphanumeric()))); - -make_parser!(long_switch -> &str, - preceded(tag("--"), take_while1(is_option_char))); - -make_parser!(comma -> (), - value((), preceded(char(','), space0))); - -make_parser!(eq_optional_param -> Param<'a>, - delimited(tag("[="), take_while1(is_option_char), char(']')) => Param::Optional); - -make_parser!(eq_mandatory_param -> Param<'a>, - preceded(char('='), take_while1(is_option_char)) => Param::Mandatory); - -// take a wide alphanumeric/_/- token then verify the WHOLE thing looks -// like an ALL_CAPS-style param name. taking only uppercase chars would -// match just "N" of " Needs: ..." and leave "eeds:..." as desc, so we -// widen, then reject anything that doesn't pass the all-caps check. 
-make_parser!(spaced_uppercase_param -> Param<'a>, - preceded( - char(' '), - verify( - take_while1(|c: char| - c.is_ascii_alphabetic() || c.is_ascii_digit() || c == '_' || c == '-' - ), - |s: &str| { - let first = match s.chars().next() { Some(c) => c, None => return false }; - if !(first.is_ascii_uppercase() || first == '_') { return false; } - s.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_') - } - ) - ) => Param::Mandatory); - -make_parser!(spaced_angle_param -> Param<'a>, - preceded(char(' '), delimited(char('<'), take_till1(|c| c == '>'), char('>'))) => Param::Mandatory); - -make_parser!(spaced_opt_angle_param -> Param<'a>, - preceded(char(' '), delimited(char('<'), - delimited(char('['), take_while1(|c| c != ']'), char(']')), - char('>'))) => Param::Optional); - -// take the full lowercase token then verify it's <=10 chars. a -// take_while_m_n with a 10-char cap would leave a partial match — e.g. -// "--foo nanoseconds" would extract param "nanosecond" and leave "s" as -// the description. a word longer than 10 chars is almost certainly the -// start of the description, not a type annotation. -make_parser!(spaced_type_param -> Param<'a>, - preceded( - char(' '), - verify( - take_while1(|c: char| c.is_ascii_lowercase()), - |s: &str| s.len() <= 10 - ) - ) => Param::Mandatory -); - -make_parser!(pub param_parser -> Param<'a>, alt(( - eq_optional_param, - eq_mandatory_param, - spaced_opt_angle_param, - spaced_angle_param, - spaced_uppercase_param, - spaced_type_param, -))); - -macro_rules! 
switch_pair { - ($name:ident, $left:expr, $sep:expr, $right:expr => |$a:ident, $b:ident| $body:expr) => { - fn $name<'a>(s: &'a str) -> IResult<&'a str, Switch<'a>> { - use nom::sequence::separated_pair; - let (rem, ($a, $b)) = separated_pair($left, $sep, $right).parse(s)?; - Ok((rem, $body)) - } - }; -} - -switch_pair!(short_comma_long, - short_switch, comma, long_switch => |s, l| Switch::Both(s, l)); - -switch_pair!(short_space_long, - short_switch, char(' '), long_switch => |s, l| Switch::Both(s, l)); - -make_parser!(slash_sep -> (), - value((), delimited(space0, char('/'), space0))); - -switch_pair!(long_slash_short, - long_switch, slash_sep, short_switch => |l, s| Switch::Both(s, l)); - -make_parser!(short_as_switch -> Switch<'a>, short_switch => Switch::Short); -make_parser!(long_as_switch -> Switch<'a>, long_switch => Switch::Long); - -make_parser!(pub switch_parser -> Switch<'a>, - alt(( - short_comma_long, - short_space_long, - long_slash_short, - short_as_switch, - long_as_switch, - )) -); - -// `{--long | -s}` — manpage SYNOPSIS-line switch pair. nix-env's -// synopsis is the canonical case: `[{--file | -f} path] [{--profile | -// -p} path]`. emits Switch::Both with the long name. -make_parser!(brace_pipe_long_short -> Switch<'a>, - separated_pair(long_switch, (space0, char('|'), space0), short_switch) - => |(l, s): (&'a str, char)| Switch::Both(s, l) -); - -make_parser!(brace_pipe_short_long -> Switch<'a>, - separated_pair(short_switch, (space0, char('|'), space0), long_switch) - => |(s, l): (char, &'a str)| Switch::Both(s, l) -); - -make_parser!(brace_pipe_switch -> Switch<'a>, - delimited( - (char('{'), space0), - alt((brace_pipe_long_short, brace_pipe_short_long)), - (space0, char('}')) - ) -); - -make_parser!(usage_switch_parser -> Switch<'a>, - alt((brace_pipe_switch, switch_parser)) -); - -// consume any chars except `]`. used to swallow trailing tokens inside a -// flag bracket — e.g. 
`[--option name value]` keeps switch=Long("option") -// and param=Mandatory("name"), discarding ` value` before the closing `]`. -make_parser!(take_till_bracket -> &str, take_till(|c: char| c == ']')); - -// `[ [param] ]` inside the SYNOPSIS line. -make_parser!(flag_in_bracket -> (Switch<'a>, Option>), - delimited( - (char('['), space0), - (usage_switch_parser, opt(param_parser)), - (take_till_bracket, char(']')) - ) -); - -/// walk the joined SYNOPSIS-line text, collecting every flag-bracketed -/// switch + its first param. non-flag tokens (positional brackets, -/// command name, ellipses) are skipped one char at a time. -make_parser!(pub parse_usage_flags -> Vec<(Switch<'a>, Option>)>, - many0(alt(( - map(flag_in_bracket, Some), - // `value(None, ...)` requires `None: Clone` which forces Clone - // on Switch/Param; `map(..., |_| None)` doesn't. - map(satisfy(|c| c != '\n' && c != '\r'), |_| None), - ))) - => |v: Vec, Option>)>>| - v.into_iter().flatten().collect() -); diff --git a/src/parsers/help/positionals.rs b/src/parsers/help/positionals.rs deleted file mode 100644 index d92dcb0..0000000 --- a/src/parsers/help/positionals.rs +++ /dev/null @@ -1,389 +0,0 @@ -use std::collections::HashMap; - -use crate::parsers::help::helpers::{get_indent, rest_of_line}; -use crate::types::{HelpResult, Positional}; -use crate::{make_parser, make_predicate}; -use nom::branch::alt; -use nom::bytes::complete::{tag, tag_no_case, take_till, take_till1, take_while, take_while1}; -use nom::character::complete::{char, line_ending, newline, satisfy, space0, space1}; -use nom::combinator::{map, not, opt, peek, recognize, value, verify}; -use nom::multi::many0; -use nom::sequence::{delimited, preceded, terminated}; -use nom::{AsChar, IResult, Parser}; - -#[derive(Clone)] -enum PositionalParse<'a> { - Curly, - Flag, - Skip, - Mandatory(&'a str), - Optional(&'a str), - ManVariadic(&'a str), - OptVariadic(&'a str), -} - -make_predicate!(is_word_char, |c| c.is_alphanumeric() - || matches!(c, 
'-' | '_' | '/' | '.')); - -make_predicate!(is_pos_char, |c| c.is_ascii_uppercase() - || c.is_numeric() - || matches!(c, '_' | '-')); - -make_parser!(section_label -> (), - value((), alt(( - tag_no_case("options"), - tag_no_case("option"), - tag_no_case("flags"), - tag_no_case("flag") - ))) -); - -make_parser!(ellipses -> (), - value((), - alt((tag("..."), tag("\u{2026}"))) - ) -); - -make_parser!(braces -> PositionalParse, - value(PositionalParse::Curly, delimited(char('{'), take_till1(|c| c == '}'), char('}'))) -); - -// FIXME should this be a take_while is_option_char? -// why tf do we have a ']' condition -make_parser!(flag -> PositionalParse, - value(PositionalParse::Flag, preceded(char('-'), take_till1(|c: char| c.is_space() || c == ']'))) -); - -fn check_positional(s: &str) -> bool { - let s = s.trim(); - if s.is_empty() { - return false; - } - // reject names starting with '-' — these are flag tokens accidentally - // captured by the bracket parser, e.g. "[--at-operation]" in jj's - // synopsis. without this guard every `[--flag]` token would be - // recorded as a positional named "--flag". - if s.starts_with('-') { - return false; - } - if section_label.parse(s).is_ok() { - return false; - } - let upper = s.to_ascii_uppercase(); - if matches!(upper.as_str(), "OPTIONS" | "OPTION" | "FLAGS" | "FLAG") { - return false; - } - s.chars().all(|c| c.is_alphanumeric() || matches!(c, '-' | '_' | '/' | '.')) -} - -/// recognize a balanced `[...]` block, tolerating ONE level of nested -/// brackets inside. expressed entirely via nom combinators: -/// -/// `[` + many0(alt((nested_bracket_block, non_bracket_char))) + `]` -/// -/// nested_bracket_block is `[ chars_until_] ]`, which means we accept a -/// single inner `[...]` correctly but not arbitrarily-deep nesting — -/// manpages don't go deeper than two levels. -/// returns the inner content (everything between the outer brackets). 
-make_parser!(balanced_bracket_inner -> &str, - recognize(delimited( - char('['), - many0(alt(( - recognize((char('['), take_till(|c: char| c == ']'), char(']'))), - recognize(satisfy(|c: char| c != ']' && c != '[')), - ))), - char(']'), - )) - => |whole: &'a str| &whole[1..whole.len() - 1] -); - -/// extract a positional name from already-trimmed bracket-inner content. -/// returns the name slice and a flag indicating whether the bracket inner -/// carried a trailing `...` (in-bracket variadic marker). -fn parse_bracket_inner_name(inner: &str) -> Option<(&str, bool)> { - let inner = inner.trim(); - // strip trailing "..." for in-bracket variadic. - let (rest, has_dots) = if let Some(stripped) = inner.strip_suffix("...") { - (stripped.trim_end(), true) - } else if let Some(stripped) = inner.strip_suffix('\u{2026}') { - (stripped.trim_end(), true) - } else { - (inner, false) - }; - let name = if let Some(after_lt) = rest.strip_prefix('<') { - // angle-bracket name: take everything up to the matching '>' - let end = after_lt.find('>')?; - &after_lt[..end] - } else { - // bare name: take leading word - let end = rest - .find(|c: char| c.is_whitespace() || c == '[' || c == ']') - .unwrap_or(rest.len()); - if end == 0 { - return None; - } - &rest[..end] - }; - Some((name, has_dots)) -} - -/// extract a balanced `[...]` block and decompose its inner content into -/// (name, has-inner-`...` flag). `map_opt` turns a `None` from -/// `parse_bracket_inner_name` into a nom parse error. -make_parser!(opt_bracket_name -> (&'a str, bool), - nom::combinator::map_opt(balanced_bracket_inner, parse_bracket_inner_name) -); - -make_parser!( - opt_positional -> PositionalParse, - verify( - // tuple parser: (name + in-bracket variadic, post-bracket ellipsis). - // matches "[name]", "[name...]", "[name ...]", "[name] ...", - // "[]", and one-level nests like "[ [...]]". 
- (opt_bracket_name, opt(ellipses)), - |((name, _), _): &((&'a str, bool), Option<()>)| check_positional(name) - ) => |((name, has_inner_dots), post_dots): ((&'a str, bool), Option<()>)| { - if has_inner_dots || post_dots.is_some() { - PositionalParse::OptVariadic(name) - } else { - PositionalParse::Optional(name) - } - } -); - -make_parser!(man_positional -> PositionalParse, - verify( - ( - delimited( - char('<'), - ( - take_till1(|c| c == '.' || c == '\u{2026}' || c == '>'), - opt(ellipses) - ), - char('>') - ), - opt(ellipses) - ), - |((ss, _), _)| check_positional(ss) - ) => |((p, v), v1): ((&'a str, Option<()>), Option<()>)| - if v.is_some() || v1.is_some() { PositionalParse::ManVariadic(p) } - else { PositionalParse::Mandatory(p) } -); - -make_parser!(allcaps_positional -> PositionalParse, - verify( - ( - preceded( - peek( - satisfy(|c: char| c.is_ascii_uppercase()) - ), - take_while1(is_pos_char) - ), - opt( - alt(( - tag("..."), - tag("\u{2026}")) - ) - ) - ), - |(ss, _): &(&str, _)| check_positional(ss) - ) => |(p, v): (&'a str, Option<&'a str>)| - if v.is_some() { PositionalParse::ManVariadic(p) } else { PositionalParse::Mandatory(p) } -); - -fn caseless_insert<'a>(k: &'a str, v: Positional, acc: &mut HashMap<&'a str, Positional>) { - let dupe = acc.keys().any(|ik| ik.eq_ignore_ascii_case(k)); - if !dupe { - acc.insert(k, v); - } -} - -// parse_usage_args runs on a single logical usage line. SKIP refuses to -// cross a newline boundary so many0 stops at end-of-line — without this -// the parser would happily wander into the OPTIONS section and treat -// every `--flag ` angle-bracket parameter as a positional. -// -// the inner positional terminator uses peek(line_ending) instead of -// consuming the newline, so the trailing `opt(line_ending)` in the -// outer delimited eats it cleanly and we never advance past the usage -// line. 
-make_parser!(pub parse_usage_args -> HashMap<&'a str, Positional>, - (delimited( - space0, - many0( - alt(( - map( - ( - terminated( - alt(( - braces, - opt_positional, - man_positional, - flag, - allcaps_positional, - )), - alt(( - space1, - value("", peek(line_ending)), - value("", peek(nom::combinator::eof)), - )) - ), - // catch "[section] ..." patterns where the ellipsis is - // on the *next* token, separated by whitespace. - opt(terminated( - alt((tag("..."), tag("\u{2026}"))), - alt(( - space1, - value("", peek(line_ending)), - value("", peek(nom::combinator::eof)), - )) - )) - ), - |(positional, trailing): (PositionalParse<'a>, Option<_>)| { - if trailing.is_none() { positional } - else { - match positional { - PositionalParse::Optional(n) => PositionalParse::OptVariadic(n), - PositionalParse::Mandatory(n) => PositionalParse::ManVariadic(n), - other => other, - } - } - } - ), - // SKIP must NOT consume a newline. without this, many0 keeps - // iterating past the usage line into OPTIONS-section flag - // syntax and over-extracts positionals. 
- value(PositionalParse::Skip, satisfy(|c: char| c != '\n' && c != '\r')), - )) - ), - opt((space0, line_ending)) - )) => |p: Vec>| - p.into_iter().fold(HashMap::new(), |mut acc, parse| - { - match parse { - PositionalParse::Curly => (), - PositionalParse::Flag => (), - PositionalParse::Skip => (), - PositionalParse::OptVariadic(arg) => caseless_insert(arg, Positional { - optional: true, - variadic: true - }, &mut acc), - PositionalParse::ManVariadic(arg) => caseless_insert(arg, Positional { - optional: false, - variadic: true - }, &mut acc), - PositionalParse::Optional(arg) => caseless_insert(arg, Positional { - optional: true, - variadic: false, - }, &mut acc), - PositionalParse::Mandatory(arg) => caseless_insert(arg, Positional { - optional: false, - variadic: false - }, &mut acc), - } - acc - }) -); - -make_parser!(pub skip_command_name -> (), - value((), preceded(space0, - many0( - ( - verify( - preceded(not(char('-')), take_while1(is_word_char)), - |ss: &str| ss.chars().any(|c| matches!(c, 'a'..='z')) - ), - space1 - ) - ) - )) -); - -make_parser!(find_usage_line -> (), - value((), preceded( - space0, - terminated( - tag_no_case("usage"), - // accept any of: - // "Usage:" — inline form with colon - // "Usage args" — inline form, space follows the word - // "USAGE\n cmd args" — clap-style header on its own line - alt( - ( - value((), char(':')), - value((), peek(line_ending)), - value((), peek(satisfy(|c: char| c == ' ' || c == '\t'))), - ) - ) - ) - )) -); - -make_parser!(pub extract_usage_positionals -> HashMap<&'a str, Positional>, - preceded( - many0(preceded(not(find_usage_line), (rest_of_line, line_ending))), - preceded( - (find_usage_line, space0, opt(line_ending), space0, skip_command_name), - parse_usage_args - ) - ) -); - -make_predicate!(is_cli11_name_char, |c| c.is_alphanumeric() - || matches!(c, '_' | '-')); - -make_parser!(cli11_section_header -> (), - value((), - delimited( - space0, - alt((tag("POSITIONALS:"), tag("Positionals:"))), - 
(rest_of_line, opt(line_ending)) - ) - ) -); - -make_parser!(cli11_pos_line -> (&'a str, bool), - preceded( - verify(space0, |ss: &str| !ss.is_empty()), - terminated( - ( - verify(take_while1(is_cli11_name_char), |s: &str| s.len() >= 2), - preceded( - (space0, take_while(|c: char| c.is_ascii_uppercase()), space0), - opt(tag("...")) - ) - ), - (rest_of_line, opt(line_ending)) - ) - ) => |(name, variadic): (&'a str, Option<_>)| (name, variadic.is_some()) -); - -make_parser!(parse_cli11_body -> HashMap<&'a str, Positional>, - many0(cli11_pos_line) => |entries: Vec<(&'a str, bool)>| - entries.into_iter().fold(HashMap::new(), |mut acc, (name, variadic)| { - caseless_insert(name, Positional { optional: false, variadic }, &mut acc); - acc - }) -); - -make_parser!(pub extract_cli11_positionals -> HashMap<&'a str, Positional>, - preceded( - many0(preceded(not(cli11_section_header), (rest_of_line, line_ending))), - preceded(cli11_section_header, parse_cli11_body) - ) -); - -/// use ansi strip first -fn parse_positionals(s: &str) -> Result, ()> { - let cli11 = extract_cli11_positionals(&s); - let usage = extract_usage_positionals(&s); - if let Ok((_, c11)) = cli11 - && !c11.is_empty() - { - Ok(c11) - } else if let Ok((_, u)) = usage { - Ok(u) - } else { - Err(()) - } -} diff --git a/src/parsers/help/subcommands.rs b/src/parsers/help/subcommands.rs deleted file mode 100644 index 7e46412..0000000 --- a/src/parsers/help/subcommands.rs +++ /dev/null @@ -1,83 +0,0 @@ -use nom::{ - AsChar, IResult, Parser, - branch::alt, - bytes::complete::{tag, take_till, take_while1}, - character::complete::{char, satisfy, space0}, - combinator::{not, value, verify}, - multi::many0, - sequence::{delimited, preceded, terminated}, -}; - -use crate::make_parser; -use crate::parsers::help::helpers::{eol, is_option_char}; -use crate::types::Subcommand; - -fn is_placeholder(c: char) -> bool { - match c { - _ if c.is_alphanumeric() => true, - '_' | '-' | '.' 
| '|' | ',' => true, - _ => false, - } -} - -/// chars allowed inside a bare (unbracketed) placeholder token, e.g. -/// "FILE", "PATTERN...", "A|B". excludes lowercase letters so mixed-case -/// description words like "NixOS" or "Home-manager" don't get swallowed -/// as placeholders. -fn is_bare_placeholder_char(c: char) -> bool { - matches!(c, 'A'..='Z' | '0'..='9' | '_' | '-' | '.' | '|' | ',') -} - -make_parser!( - skip_arg_placeholders -> (), - value( - (), - many0(preceded( - // peek ahead one char (don't consume) so the per-branch parser can - // see the full token. needed because the bare ALL_CAPS branch must - // verify the *entire* token before deciding to consume. - char(' '), - alt(( - // <...> bracketed placeholder - delimited(char('<'), take_while1(is_placeholder), char('>')), - // [...] optional bracketed placeholder - delimited(char('['), take_while1(is_placeholder), char(']')), - // bare ALL_CAPS placeholder — first char must be uppercase or - // a digit (allows e.g. "N", "M2"), and the whole token must - // be uppercase-friendly. rejects "NixOS"-style mixed-case so - // descriptions don't get swallowed. - verify( - take_while1(is_bare_placeholder_char), - |s: &str| { - let first = s.chars().next().unwrap(); - first.is_ascii_uppercase() || first.is_ascii_digit() - } - ), - )), - )), - ) -); - -/// parse a subcommand entry: leading whitespace, then a name (2+ option -/// chars, not starting with '-'), optional argument placeholders, exactly -/// two spaces, optional padding, then the description text and eol. -make_parser!(pub subcommand_entry -> Subcommand<'_>, - ( - preceded( - space0, - verify( - preceded(not(char('-')), take_while1(is_option_char)), - |n: &str| n.len() >= 2, - ), - ), - skip_arg_placeholders, - tag(" "), - space0, - terminated(take_till(|c: char| c.is_newline()), eol), - ) => |(name, _, _, _, desc): (&'a str, _, _, _, &'a str)| { - // some help formats prefix desc with "- " (manpage-style); strip it. 
- let d = desc.trim_start(); - let desc = d.strip_prefix("- ").map(|s| s.trim_start()).unwrap_or(d); - Subcommand { name, desc } - } -); diff --git a/src/parsers/manpage.rs b/src/parsers/manpage.rs deleted file mode 100644 index 0058335..0000000 --- a/src/parsers/manpage.rs +++ /dev/null @@ -1,328 +0,0 @@ -//! parse unix manpages (groff/mdoc format) into a structured result. -//! -//! manpages are written in roff/groff markup — a decades-old typesetting language -//! used by man(1). this module strips the formatting and extracts structured data -//! (flags, subcommands, positionals) from the raw groff source. -//! -//! there are two major manpage macro packages: -//! - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP -//! - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El -//! -//! this module handles both, auto-detecting the format by checking for .Sh macros. -//! -//! for groff manpages, flag extraction uses multiple "strategies" that target -//! different common formatting patterns: -//! - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man) -//! - strategy_ip: .IP indented paragraphs (curl, hand-written) -//! - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook) -//! - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks -//! - strategy_deroff: fallback — strip all groff, feed to help text parser -//! -//! the module tries all applicable strategies and picks the one that extracts -//! the most flag entries, on the theory that more results = better match. 
- -mod commands; -mod groff; -mod mdoc; -mod sections; -mod strategies; - -use std::collections::HashMap; -use std::io::{self, Read}; -use std::path::Path; - -use crate::types::{HelpResult, OptionEntry, Param, Positional, Subcommand, Switch}; - -pub use self::groff::{GroffLine, classify_line, strip_groff_escapes}; -pub use self::sections::{extract_subcommand_sections, extract_synopsis_command}; - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum OwnedSwitch { - Short(char), - Long(String), - Both(char, String), -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum OwnedParam { - Mandatory(String), - Optional(String), -} - -#[derive(Debug, Clone)] -pub struct ManpageEntry { - pub switch: OwnedSwitch, - pub param: Option, - pub desc: String, -} - -#[derive(Debug, Clone)] -pub struct ManpageSubcommand { - pub name: String, - pub desc: String, -} - -#[derive(Debug, Clone, Default)] -pub struct ManpageResult { - pub entries: Vec, - pub subcommands: Vec, - pub positionals: HashMap, - pub description: String, -} - -impl From<&Switch<'_>> for OwnedSwitch { - fn from(s: &Switch<'_>) -> Self { - match s { - Switch::Short(c) => OwnedSwitch::Short(*c), - Switch::Long(l) => OwnedSwitch::Long((*l).to_string()), - Switch::Both(c, l) => OwnedSwitch::Both(*c, (*l).to_string()), - } - } -} - -impl From<&Param<'_>> for OwnedParam { - fn from(p: &Param<'_>) -> Self { - match p { - Param::Mandatory(s) => OwnedParam::Mandatory((*s).to_string()), - Param::Optional(s) => OwnedParam::Optional((*s).to_string()), - } - } -} - -impl From<&OptionEntry<'_>> for ManpageEntry { - fn from(e: &OptionEntry<'_>) -> Self { - let desc: String = e - .desc - .iter() - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - .collect::>() - .join(" "); - ManpageEntry { - switch: (&e.switch).into(), - param: e.param.as_ref().map(Into::into), - desc, - } - } -} - -impl From<&Subcommand<'_>> for ManpageSubcommand { - fn from(sc: &Subcommand<'_>) -> Self { - // lowercase the subcommand name here so (a) file naming 
is - // consistent (meat_yum.json vs meat_YUM.json) and (b) recursive - // --help probes use the lowercase form, which is what most real - // CLIs accept — even tools like meat that DISPLAY uppercase - // names in their help text dispatch on the lowercased argument. - ManpageSubcommand { - name: sc.name.to_ascii_lowercase(), - desc: sc.desc.to_string(), - } - } -} - -impl From<&HelpResult<'_>> for ManpageResult { - fn from(r: &HelpResult<'_>) -> Self { - ManpageResult { - entries: r.entries.iter().map(Into::into).collect(), - subcommands: r.subcommands.iter().map(Into::into).collect(), - // positional names are stored lowercased so output is - // stable across the various places we extract them from - // (synopsis, usage, cli11 sections). - positionals: r - .positionals - .iter() - .map(|(k, v)| (k.to_ascii_lowercase(), v.clone())) - .collect(), - description: r.desc.to_string(), - } - } -} - -/// parse a manpage from its classified lines. -/// auto-detects mdoc vs groff format. for groff, runs the multi-strategy -/// extraction pipeline. -pub fn parse_manpage_lines(lines: &[GroffLine]) -> ManpageResult { - if mdoc::is_mdoc(lines) { - mdoc::parse_mdoc_lines(lines) - } else { - let options_section = sections::extract_options_section(lines); - let mut entries = strategies::extract_entries(&options_section); - // merge SYNOPSIS-only flags (nix-env's `[{--profile | -p} path]` - // pattern, where the flag is declared in the synopsis but never - // listed as an entry in the OPTIONS body). body entries take - // precedence on duplicate names — they carry the descriptions. 
- let synopsis_flags = sections::extract_synopsis_flags(lines); - if !synopsis_flags.is_empty() { - let have_long: std::collections::HashSet = entries - .iter() - .filter_map(|e| match &e.switch { - OwnedSwitch::Long(l) | OwnedSwitch::Both(_, l) => { - Some(l.to_ascii_lowercase()) - } - _ => None, - }) - .collect(); - let have_short: std::collections::HashSet = entries - .iter() - .filter_map(|e| match &e.switch { - OwnedSwitch::Short(c) | OwnedSwitch::Both(c, _) => Some(*c), - _ => None, - }) - .collect(); - for e in synopsis_flags { - let dup = match &e.switch { - OwnedSwitch::Long(l) => have_long.contains(&l.to_ascii_lowercase()), - OwnedSwitch::Short(c) => have_short.contains(c), - OwnedSwitch::Both(c, l) => { - have_short.contains(c) || have_long.contains(&l.to_ascii_lowercase()) - } - }; - if !dup { - entries.push(e); - } - } - } - let positionals = sections::extract_synopsis_positionals(lines); - let commands_section = sections::extract_commands_section(lines); - let subcommands = commands::extract_subcommands_from_commands(&commands_section); - ManpageResult { - entries, - subcommands, - positionals, - description: String::new(), - } - } -} - -/// parse a manpage from its raw string contents. -/// splits into lines, parses, then extracts the NAME section description. -pub fn parse_manpage_string(contents: &str) -> ManpageResult { - let lines: Vec = contents.split('\n').map(classify_line).collect(); - let mut result = parse_manpage_lines(&lines); - if let Some(desc) = sections::extract_name_description(&lines) { - result.description = desc; - } - result -} - -/// parse a manpage and also pull out clap-style `.SH SUBCOMMAND` sections -/// as separate per-subcommand results. each subcommand section in a -/// clap-generated manpage is its own command with its own flags; the -/// parent's subcommand list is populated from their names. 
-/// -/// returns (main_result, sub_results) where each sub_result has -/// name=full_command ("nh os"), desc, and its own ManpageResult. -pub fn parse_manpage_with_subs( - contents: &str, -) -> (ManpageResult, Vec<(String, ManpageResult)>) { - let lines: Vec = contents.split('\n').map(classify_line).collect(); - let mut result = parse_manpage_lines(&lines); - if let Some(desc) = sections::extract_name_description(&lines) { - result.description = desc; - } - let sub_sections = sections::extract_subcommand_sections(&lines); - if !sub_sections.is_empty() { - // overwrite subcommands with the SUBCOMMAND-section names — - // these are the authoritative list for clap-generated manpages. - result.subcommands = sub_sections - .iter() - .map(|(name, desc, _)| ManpageSubcommand { - name: name.to_ascii_lowercase(), - desc: desc.clone(), - }) - .collect(); - } - // each SUBCOMMAND section body is parsed via the same strategy-picker - // as the top-level OPTIONS section — clap puts flag definitions - // directly under the .SH SUBCOMMAND header with no inner .SH wrapping, - // so parse_manpage_lines (which looks for a child OPTIONS section) - // would come back empty. - let subs: Vec<(String, ManpageResult)> = sub_sections - .into_iter() - .map(|(name, desc, lines)| { - let entries = strategies::extract_entries(&lines); - let sub_result = ManpageResult { - entries, - subcommands: Vec::new(), - positionals: Default::default(), - description: desc, - }; - (name, sub_result) - }) - .collect(); - (result, subs) -} - -/// read a manpage file from disk. handles .gz compressed files (the common -/// case — most installed manpages are gzipped). plain text files are read directly. 
-pub fn read_manpage_file>(path: P) -> io::Result { - let path = path.as_ref(); - let bytes = std::fs::read(path)?; - if path.extension().and_then(|e| e.to_str()) == Some("gz") { - let mut decoder = flate2::read::GzDecoder::new(&bytes[..]); - let mut out = String::new(); - decoder.read_to_string(&mut out)?; - Ok(out) - } else { - String::from_utf8(bytes) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) - } -} - -/// read + parse a manpage file in one step. -pub fn parse_manpage_file>(path: P) -> io::Result { - let contents = read_manpage_file(path)?; - Ok(parse_manpage_string(&contents)) -} - -#[cfg(test)] -mod tests { - use super::*; - - const TP_MANPAGE: &str = r#".TH FOO 1 "2024" "1.0" "User Commands" -.SH NAME -foo \- a synthetic test command -.SH SYNOPSIS -.B foo -[\fIOPTIONS\fR] [output] -.SH OPTIONS -.TP -\fB\-v\fR, \fB\-\-verbose\fR -increase output verbosity -.TP -\fB\-o\fR \fIFILE\fR, \fB\-\-output\fR=\fIFILE\fR -write to FILE -.TP -\fB\-h\fR, \fB\-\-help\fR -show this help and exit -"#; - - #[test] - fn tp_strategy_extracts_flags() { - let r = parse_manpage_string(TP_MANPAGE); - assert_eq!(r.entries.len(), 3, "expected 3 entries, got {:?}", r.entries); - assert_eq!(r.description, "a synthetic test command"); - assert!(matches!( - r.entries[0].switch, - OwnedSwitch::Both('v', ref l) if l == "verbose" - )); - assert!(matches!( - r.entries[2].switch, - OwnedSwitch::Both('h', ref l) if l == "help" - )); - assert!(r.entries[0].desc.contains("verbosity")); - } - - #[test] - fn mdoc_format_detected() { - let src = ".Sh NAME\n.Nm test\n.Nd a test\n.Sh DESCRIPTION\nstuff\n"; - let lines: Vec = src.split('\n').map(classify_line).collect(); - assert!(mdoc::is_mdoc(&lines)); - } - - #[test] - fn groff_escapes_stripped() { - let stripped = groff::strip_groff_escapes("\\fB\\-v\\fR \\fIfile\\fR"); - assert_eq!(stripped.trim(), "-v file"); - } -} diff --git a/src/parsers/manpage/commands.rs b/src/parsers/manpage/commands.rs deleted file mode 100644 index 
72e98de..0000000 --- a/src/parsers/manpage/commands.rs +++ /dev/null @@ -1,132 +0,0 @@ -//! COMMANDS section subcommand extraction. -//! -//! some manpages (notably systemctl) have a dedicated COMMANDS section -//! listing subcommands with descriptions. these use .PP + bold name + -//! .RS/.RE blocks: -//! .PP -//! \fBstart\fR \fIUNIT\fR... -//! .RS 4 -//! Start (activate) one or more units. -//! .RE - -use crate::parsers::manpage::ManpageSubcommand; -use crate::parsers::manpage::groff::{GroffLine, strip_groff_escapes}; - -/// validate that the extracted name looks like a subcommand: lowercase, -/// at least 2 chars, no leading dash. -fn is_valid_subcmd(name: &str) -> bool { - name.len() >= 2 - && !name.starts_with('-') - && name - .chars() - .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-' || c == '_') -} - -/// extract subcommand name from a bold groff text like -/// "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units" -fn extract_bold_command_name(text: &str) -> Option { - let trimmed = text.trim(); - if trimmed.len() >= 4 && trimmed.starts_with("\\fB") { - // look for \fB...\fR at the start: find the next '\\' and take - // the segment between \fB and there. - let after = &trimmed[3..]; - let segment_end = after.find('\\').unwrap_or(after.len()); - let name_part = &after[..segment_end]; - let reconstructed = format!("\\fB{name_part}\\fR"); - let name = strip_groff_escapes(&reconstructed).trim().to_string(); - if is_valid_subcmd(&name) { - return Some(name); - } - return None; - } - // fallback: take the first whitespace-delimited word of the stripped text - let stripped = strip_groff_escapes(trimmed); - let first_word = stripped.split_whitespace().next().unwrap_or(""); - if is_valid_subcmd(first_word) { - Some(first_word.to_string()) - } else { - None - } -} - -/// walk through commands section lines, extracting subcommand name+description -/// pairs from .PP + Text + .RS/.RE blocks. 
-pub fn extract_subcommands_from_commands(lines: &[GroffLine]) -> Vec { - let mut out = Vec::new(); - let mut i = 0; - while i < lines.len() { - if let GroffLine::Macro { name, .. } = &lines[i] - && name == "PP" - { - i += 1; - if i >= lines.len() { - continue; - } - if let GroffLine::Text(tag) = &lines[i] { - let tag = tag.clone(); - if let Some(name) = extract_bold_command_name(&tag) { - let (desc, new_i) = collect_subcmd_desc(lines, i + 1); - let short_desc = first_sentence(&desc); - out.push(ManpageSubcommand { - name: name.to_ascii_lowercase(), - desc: short_desc, - }); - i = new_i; - continue; - } - i += 1; - } - } else { - i += 1; - } - } - out -} - -/// collect the description for a subcommand entry. handles .RS/.RE blocks -/// and stops at the next .PP/.SH/.SS boundary. -fn collect_subcmd_desc(lines: &[GroffLine], start: usize) -> (String, usize) { - let mut acc: Vec = Vec::new(); - let mut i = start; - while i < lines.len() { - match &lines[i] { - GroffLine::Macro { name, .. } if name == "RS" => { - i += 1; - // inside .RS — collect until .RE or boundary - while i < lines.len() { - match &lines[i] { - GroffLine::Macro { name, .. } if name == "RE" => { - return (acc.join(" "), i + 1); - } - GroffLine::Text(t) => { - acc.push(t.clone()); - i += 1; - } - GroffLine::Macro { name, .. } - if name == "PP" || name == "SH" || name == "SS" => - { - return (acc.join(" "), i); - } - _ => i += 1, - } - } - return (acc.join(" "), i); - } - GroffLine::Text(t) => { - acc.push(t.clone()); - i += 1; - } - _ => return (acc.join(" "), i), - } - } - (acc.join(" "), i) -} - -/// take the first sentence (up to '.') as the description. 
-fn first_sentence(s: &str) -> String { - let s = s.trim(); - match s.find('.') { - Some(idx) if idx > 0 => s[..idx].trim().to_string(), - _ => s.to_string(), - } -} diff --git a/src/parsers/manpage/groff.rs b/src/parsers/manpage/groff.rs deleted file mode 100644 index 0f8b550..0000000 --- a/src/parsers/manpage/groff.rs +++ /dev/null @@ -1,373 +0,0 @@ -//! groff escape/formatting stripping and line classification. -//! -//! groff escapes start with backslash and use various continuation syntaxes. -//! we strip them, replacing named characters (like \(aq for apostrophe) with -//! their text equivalents and discarding formatting directives. -//! -//! also exports `make_macro_walker!`, the manpage-side analogue of the -//! help parser's `make_parser!`. all of our strategy_* functions are -//! "scan lines, on each .MACRO_NAME run a handler, advance, accumulate" -//! — this macro factors out the loop scaffolding so each strategy reduces -//! to its specific extraction logic. - -/// walk a `&[GroffLine]` slice, and on each macro whose name matches -/// `$mname`, invoke the body with `(lines, i, args)` where: -/// - `lines` is the full slice (for slicing further bodies) -/// - `i` is the current index of the matched macro -/// - `args` is the macro's argument string (by reference) -/// -/// the body returns `Option<(T, usize)>`. `Some((value, new_i))` pushes -/// `value` and advances the cursor to `new_i` (typically computed as -/// `lines.len() - rest.len()` after `collect_text_lines`). `None` -/// advances by one line and keeps scanning. -/// -/// matches the help-parser pattern `make_parser!(name -> T, parser => wrap)`: -/// the macro hides the loop scaffolding, the handler expresses the actual -/// extraction logic. -#[macro_export] -macro_rules! 
make_macro_walker { - (pub $name:ident -> Vec<$t:ty>, on macro $mname:expr => - |$lines:ident, $i:ident, $args:ident| $body:expr) => { - pub fn $name(lines_input: &[$crate::parsers::manpage::GroffLine]) -> Vec<$t> { - let mut out = Vec::new(); - let mut cursor = 0; - let $lines: &[$crate::parsers::manpage::GroffLine] = lines_input; - while cursor < $lines.len() { - if let $crate::parsers::manpage::GroffLine::Macro { - name: macro_name, args: $args - } = &$lines[cursor] - { - if macro_name == $mname { - let $i = cursor; - // wrap the handler body in an IIFE so an early - // `return None` inside the handler returns from the - // closure, not from the surrounding strategy function. - let result: Option<($t, usize)> = (|| $body)(); - if let Some((value, new_i)) = result { - out.push(value); - cursor = new_i; - continue; - } - } - } - cursor += 1; - } - out - } - }; -} - -/// every line in a manpage is classified as one of four types. -/// this classification drives all subsequent parsing — strategies -/// pattern-match on sequences of classified lines. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum GroffLine { - /// macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") - Macro { name: String, args: String }, - /// plain text after groff stripping - Text(String), - /// empty line - Blank, - /// groff comment: .backslash-quote or backslash-quote - Comment, -} - -/// translate a groff named character escape to its text equivalent. -/// groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for -/// left/right quotes, "em"/"en" for dashes. -fn named_char_of(name: &str) -> Option { - match name { - "aq" => Some('\''), - "lq" | "Lq" | "rq" | "Rq" => Some('"'), - "em" | "en" => Some('-'), - _ => None, - } -} - -fn is_alnum(c: u8) -> bool { - c.is_ascii_alphanumeric() -} - -/// strip groff escape sequences, replacing named characters with text -/// equivalents and discarding formatting directives. 
-pub fn strip_groff_escapes(source: &str) -> String { - let bytes = source.as_bytes(); - let len = bytes.len(); - let mut buffer = String::with_capacity(len); - let mut pos = 0; - let mut prev_char: u8 = 0; - - while pos < len { - if bytes[pos] == b'\\' && pos + 1 < len { - let next = bytes[pos + 1]; - match next { - b'f' => { - // font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] - if pos + 2 < len { - let font_char = bytes[pos + 2]; - // insert space before italic font to preserve word boundaries - // e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" - if font_char == b'I' && is_alnum(prev_char) { - buffer.push(' '); - prev_char = b' '; - } - if font_char == b'(' { - pos += 5; // \f(XX — two-character font name - } else if font_char == b'[' { - pos += 3; - skip_to_byte(bytes, len, &mut pos, b']'); - if pos < len { - pos += 1; - } - } else { - pos += 3; // \fX — single-character font selector - } - } else { - pos += 2; - } - } - b'-' => { - // escaped hyphen-minus — emit a plain hyphen - buffer.push('-'); - prev_char = b'-'; - pos += 2; - } - b'&' | b'/' | b',' => { - // zero-width characters — discard without output - pos += 2; - } - b'(' => { - // two-char named character: \(aq, \(lq, \(rq, etc. - if pos + 3 < len { - let name = &source[pos + 2..pos + 4]; - if let Some(c) = named_char_of(name) { - buffer.push(c); - prev_char = c as u8; - } - pos += 4; - } else { - pos += 2; - } - } - b'[' => { - // bracketed named character: \[aq], \[lq], etc. 
- pos += 2; - let start = pos; - skip_to_byte(bytes, len, &mut pos, b']'); - if pos < len { - let name = &source[start..pos]; - if let Some(c) = named_char_of(name) { - buffer.push(c); - prev_char = c as u8; - } - pos += 1; - } - } - b's' => { - // size escape: \sN, \s+N, \s-N — skip the numeric argument - pos += 2; - if pos < len && (bytes[pos] == b'+' || bytes[pos] == b'-') { - pos += 1; - } - if pos < len && bytes[pos].is_ascii_digit() { - pos += 1; - } - if pos < len && bytes[pos].is_ascii_digit() { - pos += 1; - } - } - b'm' => { - // color escape: \m[...] — skip the bracketed color name - pos += 2; - if pos < len && bytes[pos] == b'[' { - pos += 1; - skip_to_byte(bytes, len, &mut pos, b']'); - if pos < len { - pos += 1; - } - } - } - b'X' => { - // device control: \X'...' — skip the single-quoted payload - pos += 2; - if pos < len && bytes[pos] == b'\'' { - pos += 1; - skip_to_byte(bytes, len, &mut pos, b'\''); - if pos < len { - pos += 1; - } - } - } - b'*' => { - // string variable: \*X or \*(XX or \*[...] — skip the reference - pos += 2; - skip_groff_reference(bytes, len, &mut pos); - } - b'n' => { - // number register: \nX or \n(XX or \n[...] 
— skip the reference - pos += 2; - skip_groff_reference(bytes, len, &mut pos); - } - b'e' => { - // escaped backslash literal - buffer.push('\\'); - prev_char = b'\\'; - pos += 2; - } - b'\\' => { - // double backslash — emit one - buffer.push('\\'); - prev_char = b'\\'; - pos += 2; - } - b' ' => { - // escaped space — emit a regular space - buffer.push(' '); - prev_char = b' '; - pos += 2; - } - _ => { - // unknown escape — skip the two-character sequence - pos += 2; - } - } - } else { - // copy a full utf-8 char from source to buffer - let c = source[pos..].chars().next().unwrap(); - buffer.push(c); - prev_char = if c.is_ascii() { c as u8 } else { 0 }; - pos += c.len_utf8(); - } - } - buffer -} - -fn skip_to_byte(bytes: &[u8], len: usize, pos: &mut usize, delim: u8) { - while *pos < len && bytes[*pos] != delim { - *pos += 1; - } -} - -/// skip a groff reference that uses one of three sub-forms: -/// single char — e.g. \*X or \nX -/// ( + 2 chars — e.g. \*(XX or \n(XX -/// [ to ] — e.g. \*[name] or \n[name] -fn skip_groff_reference(bytes: &[u8], len: usize, pos: &mut usize) { - if *pos < len { - if bytes[*pos] == b'(' { - *pos += 3; // skip past '(' + two-character name - } else if bytes[*pos] == b'[' { - *pos += 1; - skip_to_byte(bytes, len, pos, b']'); - if *pos < len { - *pos += 1; - } - } else { - *pos += 1; - } - } -} - -/// strip inline macro formatting: .BI, .BR, .IR, etc. -/// these macros alternate between fonts for their arguments, e.g.: -/// .BI "--output " "FILE" -/// becomes "--outputFILE" (arguments concatenated without spaces). -/// -/// quoted strings are kept together (quotes stripped), but unquoted spaces -/// are consumed. this matches groff's actual rendering of these macros. 
-pub fn strip_inline_macro_args(text: &str) -> String { - let bytes = text.as_bytes(); - let len = bytes.len(); - let mut buffer = String::with_capacity(len); - let mut pos = 0; - while pos < len { - if bytes[pos] == b'"' { - // quoted argument — copy characters up to the closing quote - pos += 1; - while pos < len && bytes[pos] != b'"' { - let c = text[pos..].chars().next().unwrap(); - buffer.push(c); - pos += c.len_utf8(); - } - if pos < len { - pos += 1; - } - } else if bytes[pos] == b' ' || bytes[pos] == b'\t' { - // unquoted whitespace — skip (arguments are concatenated) - pos += 1; - } else { - let c = text[pos..].chars().next().unwrap(); - buffer.push(c); - pos += c.len_utf8(); - } - } - buffer -} - -/// strip escapes and trim whitespace. -pub fn strip_groff(line: &str) -> String { - strip_groff_escapes(line).trim().to_string() -} - -/// refined comment detection — the base classify_line may miss some comment -/// forms, so this wrapper checks more carefully before falling through. -fn is_comment_line(line: &str) -> bool { - let bytes = line.as_bytes(); - let len = bytes.len(); - (len >= 3 && bytes[0] == b'.' && bytes[1] == b'\\' && bytes[2] == b'"') - || (len >= 2 && bytes[0] == b'\\' && bytes[1] == b'"') -} - -/// classify a single line of manpage source. -/// macro lines start with '.' or '\'' (groff alternate control char). -/// the macro name is split from its arguments at the first space/tab. -/// arguments wrapped in double quotes are unquoted. -pub fn classify_line(line: &str) -> GroffLine { - if is_comment_line(line) { - return GroffLine::Comment; - } - let len = line.len(); - if len == 0 { - return GroffLine::Blank; - } - let bytes = line.as_bytes(); - // base classify also flags dot-backslash forms as comments - if len >= 2 && bytes[0] == b'.' && bytes[1] == b'\\' && (len < 3 || bytes[2] == b'"') { - return GroffLine::Comment; - } - if len >= 3 && bytes[0] == b'\\' && bytes[1] == b'"' { - return GroffLine::Comment; - } - if bytes[0] == b'.' 
|| bytes[0] == b'\'' { - // macro line — extract macro name and arguments - let rest = line[1..].trim(); - let split_at = rest.find(|c: char| c == ' ' || c == '\t'); - match split_at { - Some(idx) => { - let name = rest[..idx].to_string(); - let args = rest[idx + 1..].trim(); - // strip surrounding quotes from arguments - let args = if args.len() >= 2 - && args.starts_with('"') - && args.ends_with('"') - { - args[1..args.len() - 1].to_string() - } else { - args.to_string() - }; - GroffLine::Macro { name, args } - } - None => GroffLine::Macro { - name: rest.to_string(), - args: String::new(), - }, - } - } else { - let stripped = strip_groff(line); - if stripped.is_empty() { - GroffLine::Blank - } else { - GroffLine::Text(stripped) - } - } -} diff --git a/src/parsers/manpage/mdoc.rs b/src/parsers/manpage/mdoc.rs deleted file mode 100644 index 2cd88ef..0000000 --- a/src/parsers/manpage/mdoc.rs +++ /dev/null @@ -1,242 +0,0 @@ -//! BSD mdoc format support. -//! -//! mdoc is the bsd manpage macro package. it uses semantic macros rather than -//! presentation macros: -//! .Fl v -> flag: -v -//! .Ar file -> argument: file -//! .Op ... -> optional: [...] -//! .Bl/.It/.El -> list begin/item/end -//! .Sh -> section header (note lowercase 'h', vs groff's .SH) - -use std::collections::HashMap; - -use crate::parsers::manpage::groff::{GroffLine, strip_groff_escapes}; -use crate::parsers::manpage::{ - ManpageEntry, ManpageResult, OwnedParam, OwnedSwitch, -}; -use crate::types::Positional; - -/// detect mdoc format by looking for any .Sh macro. -pub fn is_mdoc(lines: &[GroffLine]) -> bool { - lines.iter().any(|l| matches!(l, GroffLine::Macro { name, .. } if name == "Sh")) -} - -/// extract renderable text from an mdoc line, skipping structural macros. 
-fn mdoc_text_of(line: &GroffLine) -> Option { - match line { - GroffLine::Text(t) => Some(strip_groff_escapes(t)), - GroffLine::Macro { name, args } => match name.as_str() { - "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt" | "Oo" | "Oc" | "Op" => { - None - } - _ => { - let text = strip_groff_escapes(args); - let text = text.trim(); - if text.is_empty() { - None - } else { - Some(text.to_string()) - } - } - }, - _ => None, - } -} - -/// parse an mdoc .It (list item) line that contains flag definitions. -/// mdoc .It lines look like: ".It Fl v Ar file" -/// where Fl = flag, Ar = argument. -fn parse_mdoc_it(args: &str) -> Option { - let words: Vec<&str> = args - .split(' ') - .filter(|w| !w.is_empty() && *w != "Ns") - .collect(); - let param = match words.as_slice() { - [_, _, "Ar", name, ..] => Some(OwnedParam::Mandatory(name.to_string())), - _ => None, - }; - match words.as_slice() { - ["Fl", ch, ..] if ch.len() == 1 && ch.chars().next().unwrap().is_ascii_alphanumeric() => { - Some(ManpageEntry { - switch: OwnedSwitch::Short(ch.chars().next().unwrap()), - param, - desc: String::new(), - }) - } - ["Fl", name, ..] if name.len() > 1 && name.starts_with('-') => Some(ManpageEntry { - switch: OwnedSwitch::Long(name[1..].to_string()), - param, - desc: String::new(), - }), - _ => None, - } -} - -/// extract a positional argument from an mdoc line (.Ar or .Op Ar). -fn positional_of_mdoc_line(args: &str) -> Option<(String, bool)> { - let words: Vec<&str> = args.split(' ').filter(|w| !w.is_empty()).collect(); - let variadic = words.iter().any(|w| *w == "..."); - match words.first() { - Some(name) if name.len() >= 2 => { - Some((name.to_ascii_lowercase(), variadic)) - } - _ => None, - } -} - -/// parse an entire mdoc-format manpage. -/// walks through all classified lines looking for: -/// 1. .Bl/.It/.El list blocks containing flag definitions -/// 2. 
.Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar) -pub fn parse_mdoc_lines(lines: &[GroffLine]) -> ManpageResult { - // collect description for an entry — until next structural macro - fn desc_of(lines: &[GroffLine], start: usize) -> (String, usize) { - let mut acc: Vec = Vec::new(); - let mut i = start; - while i < lines.len() { - if let GroffLine::Macro { name, .. } = &lines[i] - && matches!(name.as_str(), "It" | "El" | "Sh" | "Ss") - { - break; - } - if let Some(t) = mdoc_text_of(&lines[i]) { - acc.push(t); - } - i += 1; - } - (acc.join(" ").trim().to_string(), i) - } - - fn skip_to_el(lines: &[GroffLine], start: usize) -> usize { - let mut i = start; - while i < lines.len() { - if let GroffLine::Macro { name, .. } = &lines[i] - && name == "El" - { - return i + 1; - } - i += 1; - } - i - } - - /// parse a single .It entry: extract flag, collect description. - fn parse_it( - args: &str, - lines: &[GroffLine], - start: usize, - entries: &mut Vec, - ) -> usize { - let (desc, new_start) = desc_of(lines, start); - if let Some(mut entry) = parse_mdoc_it(args) { - entry.desc = desc; - entries.push(entry); - } - new_start - } - - /// parse all .It entries within a .Bl/.El option list. - fn parse_option_list( - entries: &mut Vec, - lines: &[GroffLine], - start: usize, - ) -> usize { - let mut i = start; - while i < lines.len() { - match &lines[i] { - GroffLine::Macro { name, .. } if name == "El" => return i + 1, - GroffLine::Macro { name, args } if name == "It" => { - i = parse_it(args, lines, i + 1, entries); - } - _ => i += 1, - } - } - i - } - - fn parse_synopsis( - positionals: &mut Vec<(String, bool, bool)>, - lines: &[GroffLine], - start: usize, - ) -> usize { - let mut i = start; - while i < lines.len() { - match &lines[i] { - GroffLine::Macro { name, .. 
} if name == "Sh" => return i, - GroffLine::Macro { name, args } if name == "Ar" => { - if let Some((n, v)) = positional_of_mdoc_line(args) { - positionals.push((n, false, v)); - } - i += 1; - } - GroffLine::Macro { name, args } if name == "Op" => { - let words: Vec<&str> = - args.split(' ').filter(|w| !w.is_empty()).collect(); - if matches!(words.first(), Some(&"Ar")) { - let rest = if args.len() > 3 { &args[3..] } else { "" }; - if let Some((n, v)) = positional_of_mdoc_line(rest) { - positionals.push((n, true, v)); - } - } - i += 1; - } - _ => i += 1, - } - } - i - } - - let mut entries: Vec = Vec::new(); - let mut positionals: Vec<(String, bool, bool)> = Vec::new(); - let mut i = 0; - while i < lines.len() { - // .Bl + .It header sequence — peek at first .It to decide if this is a flag list - if let GroffLine::Macro { name: n1, .. } = &lines[i] - && n1 == "Bl" - { - let j = i + 1; - if j < lines.len() - && let GroffLine::Macro { name: n2, args: it_args } = &lines[j] - && n2 == "It" - { - let words: Vec<&str> = - it_args.split(' ').filter(|w| !w.is_empty()).collect(); - if matches!(words.first(), Some(&"Fl")) { - let k = parse_it(it_args, lines, j + 1, &mut entries); - i = parse_option_list(&mut entries, lines, k); - continue; - } else { - i = skip_to_el(lines, j + 1); - continue; - } - } - i = skip_to_el(lines, j); - continue; - } - if let GroffLine::Macro { name, args } = &lines[i] - && name == "Sh" - && args.trim().to_ascii_uppercase() == "SYNOPSIS" - { - i = parse_synopsis(&mut positionals, lines, i + 1); - continue; - } - i += 1; - } - - // deduplicate positionals by name, preserving first-seen order - let mut seen: Vec = Vec::new(); - let mut deduped: HashMap = HashMap::new(); - for (name, optional, variadic) in positionals { - if !seen.contains(&name) { - seen.push(name.clone()); - deduped.insert(name, Positional { optional, variadic }); - } - } - - ManpageResult { - entries, - subcommands: Vec::new(), - positionals: deduped, - description: String::new(), 
- } -} diff --git a/src/parsers/manpage/sections.rs b/src/parsers/manpage/sections.rs deleted file mode 100644 index 312a94e..0000000 --- a/src/parsers/manpage/sections.rs +++ /dev/null @@ -1,539 +0,0 @@ -//! section extraction from manpages. -//! -//! manpages are divided into sections by .SH macros. we extract OPTIONS, -//! NAME, SYNOPSIS, and COMMANDS sections for their specific content. - -use std::collections::HashMap; - -use nom::{Parser, sequence::preceded}; - -use crate::parsers::help::{parse_usage_args, parse_usage_flags, skip_command_name}; -use crate::parsers::manpage::groff::{ - GroffLine, strip_groff_escapes, strip_inline_macro_args, -}; -use crate::parsers::manpage::{ManpageEntry, OwnedParam, OwnedSwitch}; -use crate::types::{Param, Positional, Switch}; - -fn is_options_section(name: &str) -> bool { - let upper = name.trim().to_ascii_uppercase(); - upper == "OPTIONS" || upper.contains("OPTION") -} - -/// extract the lines from the OPTIONS section(s). collects from all -/// option-like .SH sections and concatenates them (handles the nix pattern -/// of "Options" and "Common Options" being separate sections). -/// falls back to DESCRIPTION if no OPTIONS section exists. -pub fn extract_options_section(lines: &[GroffLine]) -> Vec { - let mut acc: Vec = Vec::new(); - let mut i = 0; - while i < lines.len() { - if let GroffLine::Macro { name, args } = &lines[i] - && name == "SH" - && is_options_section(args) - { - i += 1; - // synthetic separator between concatenated sections so that - // collect_desc_text (which stops on SH/SS) does not let descriptions - // bleed between sections. - if !acc.is_empty() { - acc.push(GroffLine::Macro { - name: "SH".to_string(), - args: String::new(), - }); - } - while i < lines.len() { - if let GroffLine::Macro { name, .. 
} = &lines[i] - && name == "SH" - { - break; - } - acc.push(lines[i].clone()); - i += 1; - } - } else { - i += 1; - } - } - if !acc.is_empty() { - return acc; - } - // fallback: DESCRIPTION section - let mut i = 0; - while i < lines.len() { - if let GroffLine::Macro { name, args } = &lines[i] - && name == "SH" - && args.trim().to_ascii_uppercase() == "DESCRIPTION" - { - i += 1; - let mut desc_acc: Vec = Vec::new(); - while i < lines.len() { - if let GroffLine::Macro { name, .. } = &lines[i] - && name == "SH" - { - break; - } - desc_acc.push(lines[i].clone()); - i += 1; - } - return desc_acc; - } - i += 1; - } - Vec::new() -} - -/// the NAME section follows the convention "command \- short description". -/// extract the part after "\-" as the command's description. -/// handles both "\-" (groff) and " - " (plain text) separators. -pub fn extract_name_description(lines: &[GroffLine]) -> Option { - let mut i = 0; - while i < lines.len() { - if let GroffLine::Macro { name, args } = &lines[i] - && name == "SH" - && args.trim().to_ascii_uppercase() == "NAME" - { - i += 1; - let mut acc: Vec = Vec::new(); - while i < lines.len() { - if let GroffLine::Macro { name, .. 
} = &lines[i] - && name == "SH" - { - break; - } - match &lines[i] { - GroffLine::Text(t) => acc.push(t.clone()), - GroffLine::Macro { name, args } - if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR") => - { - let text = strip_groff_escapes(&strip_inline_macro_args(args)); - let text = text.trim(); - if !text.is_empty() { - acc.push(text.to_string()); - } - } - GroffLine::Macro { name, args } if name == "Nm" => { - let text = strip_groff_escapes(args); - let text = text.trim(); - if !text.is_empty() { - acc.push(text.to_string()); - } - } - GroffLine::Macro { name, args } if name == "Nd" => { - let text = strip_groff_escapes(args); - let text = text.trim(); - if !text.is_empty() { - acc.push(format!("\\- {text}")); - } - } - _ => (), - } - i += 1; - } - let full = acc.join(" ").trim().to_string(); - return split_name_separator(&full); - } - i += 1; - } - None -} - -/// split a NAME line on either "\-" (groff) or " - " (plain). -/// returns the part after the separator, trimmed. -fn split_name_separator(full: &str) -> Option { - // search for either marker - let groff_idx = find_padded(full, "\\-"); - let dash_idx = find_padded(full, " - "); - let idx = match (groff_idx, dash_idx) { - (Some(a), Some(b)) => Some(a.min(b)), - (Some(a), None) => Some(a), - (None, Some(b)) => Some(b), - (None, None) => None, - }?; - // skip past the matched separator - let after = if full[idx..].starts_with("\\-") { - &full[idx + 2..] - } else { - &full[idx + 3..] - }; - let desc = after.trim().to_string(); - if desc.is_empty() { None } else { Some(desc) } -} - -/// find a marker preceded and followed by optional surrounding space. -/// approximated by a simple substring search — accepts spaces on either -/// side without enforcing how many. -fn find_padded(s: &str, needle: &str) -> Option { - s.find(needle) -} - -/// extract the command name from the SYNOPSIS section. 
-/// -/// the SYNOPSIS section shows how to invoke the command: -/// .SH SYNOPSIS -/// .B git add -/// [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI\fR...] -/// -/// we extract the command name by taking consecutive "word" tokens until -/// we hit something that looks like an argument (starts with [, <, -, etc.). -pub fn extract_synopsis_command(contents: &str) -> Option { - // pre-replace italic text (\fI...\fR) with angle-bracketed placeholders - // before classification strips the font info. italic in groff indicates - // a parameter/placeholder (e.g. \fIoperation\fR), not a command word. - // the angle brackets cause extract_cmd to stop at these tokens since - // '<' is in its stop set. - let preprocessed: Vec = contents - .split('\n') - .map(replace_italic_with_angles) - .collect(); - let classified: Vec = preprocessed - .iter() - .map(|line| crate::parsers::manpage::groff::classify_line(line)) - .collect(); - let mut i = 0; - while i < classified.len() { - if let GroffLine::Macro { name, args } = &classified[i] - && name == "SH" - && args.trim().to_ascii_uppercase() == "SYNOPSIS" - { - i += 1; - while i < classified.len() { - match &classified[i] { - GroffLine::Macro { name, .. } if name == "SH" => return None, - GroffLine::Text(text) => { - let trimmed = text.trim(); - return if trimmed.is_empty() { - None - } else { - extract_cmd(trimmed) - }; - } - GroffLine::Macro { name, args } - if matches!(name.as_str(), "B" | "BI" | "BR") => - { - let text = strip_groff_escapes(&strip_inline_macro_args(args)); - let trimmed = text.trim(); - if !trimmed.is_empty() { - return extract_cmd(trimmed); - } - i += 1; - } - _ => i += 1, - } - } - return None; - } - i += 1; - } - None -} - -/// replace \fI...\f[RP] sequences with <...> so italic params are seen as -/// non-word tokens by extract_cmd. 
-fn replace_italic_with_angles(line: &str) -> String { - let bytes = line.as_bytes(); - let len = bytes.len(); - let mut out = String::with_capacity(len); - let mut i = 0; - while i < len { - // byte-compare to avoid panicking on non-ASCII char boundaries - if i + 3 <= len && &bytes[i..i + 3] == b"\\fI" { - // find closing \fR or \fP — scan to next '\\' - let inner_start = i + 3; - let mut j = inner_start; - while j < len && bytes[j] != b'\\' { - j += 1; - } - if j + 3 <= len - && bytes[j] == b'\\' - && bytes[j + 1] == b'f' - && (bytes[j + 2] == b'R' || bytes[j + 2] == b'P') - { - out.push('<'); - out.push_str(&line[inner_start..j]); - out.push('>'); - i = j + 3; - continue; - } - } - let c = line[i..].chars().next().unwrap(); - out.push(c); - i += c.len_utf8(); - } - out -} - -/// extract the command name from a synopsis line by taking leading word tokens. -fn extract_cmd(line: &str) -> Option { - let words: Vec<&str> = line.split(' ').filter(|w| !w.is_empty()).collect(); - let is_cmd_char = |c: char| { - c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.') - }; - let mut taken: Vec<&str> = Vec::new(); - for word in words { - let first = word.chars().next().unwrap(); - if matches!(first, '[' | '-' | '<' | '(' | '{') { - break; - } - if word.chars().all(is_cmd_char) { - taken.push(word); - } else { - break; - } - } - if taken.is_empty() { - None - } else { - Some(taken.join(" ")) - } -} - -/// extract the lines that form the SYNOPSIS section. -fn extract_synopsis_section(lines: &[GroffLine]) -> Vec { - let mut i = 0; - while i < lines.len() { - if let GroffLine::Macro { name, args } = &lines[i] - && name == "SH" - && args.trim().to_ascii_uppercase() == "SYNOPSIS" - { - i += 1; - let mut acc = Vec::new(); - while i < lines.len() { - if let GroffLine::Macro { name, .. 
} = &lines[i] - && name == "SH" - { - break; - } - acc.push(lines[i].clone()); - i += 1; - } - return acc; - } - i += 1; - } - Vec::new() -} - -/// extract positional arguments from the SYNOPSIS section. -/// joins all text/formatting macro lines via `join_synopsis_text`, then -/// skips the command name prefix and runs `parse_usage_args` on the rest. -pub fn extract_synopsis_positionals(lines: &[GroffLine]) -> HashMap { - let full = join_synopsis_text(lines); - if full.is_empty() { - return HashMap::new(); - } - let result: nom::IResult<&str, HashMap<&str, Positional>> = - preceded(skip_command_name, parse_usage_args).parse(&full); - match result { - Ok((_, map)) => map - .into_iter() - .map(|(k, v)| (k.to_ascii_lowercase(), v)) - .collect(), - Err(_) => HashMap::new(), - } -} - -/// join the SYNOPSIS section into a single line of plain text, stripping -/// groff escapes and inline font macros. shared by both the positional -/// and flag extractors so they see identical input. -fn join_synopsis_text(lines: &[GroffLine]) -> String { - let section = extract_synopsis_section(lines); - let mut acc: Vec = Vec::new(); - for line in section { - match line { - GroffLine::Macro { name, .. 
} if name == "SS" || name == "br" => break, - GroffLine::Text(t) => { - let text = strip_groff_escapes(&t).trim().to_string(); - if !text.is_empty() { - acc.push(text); - } - } - GroffLine::Macro { name, args } - if matches!( - name.as_str(), - "B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI" - ) => - { - let text = strip_groff_escapes(&strip_inline_macro_args(&args)); - let text = text.trim(); - if !text.is_empty() { - acc.push(text.to_string()); - } - } - _ => (), - } - } - acc.join(" ").trim().to_string() -} - -fn to_owned_switch(s: Switch<'_>) -> OwnedSwitch { - match s { - Switch::Short(c) => OwnedSwitch::Short(c), - Switch::Long(l) => OwnedSwitch::Long(l.to_string()), - Switch::Both(c, l) => OwnedSwitch::Both(c, l.to_string()), - } -} - -fn to_owned_param(p: Param<'_>) -> OwnedParam { - match p { - Param::Mandatory(s) => OwnedParam::Mandatory(s.to_string()), - Param::Optional(s) => OwnedParam::Optional(s.to_string()), - } -} - -/// extract flag-tagged entries from the SYNOPSIS line. some manpages -/// (notably nix-env, sed) declare flags only in the synopsis and never -/// repeat them as entries in the OPTIONS body, so the body-only pass -/// misses them. we join the synopsis text the same way the positional -/// extractor does, then run `parse_usage_flags` over every bracketed -/// switch+param. callers merge with body entries; body wins on duplicate -/// flag names since body descriptions are richer. 
-pub fn extract_synopsis_flags(lines: &[GroffLine]) -> Vec { - let full = join_synopsis_text(lines); - if full.is_empty() { - return Vec::new(); - } - let result: nom::IResult<&str, Vec<(Switch<'_>, Option>)>> = - preceded(skip_command_name, parse_usage_flags).parse(&full); - match result { - Ok((_, pairs)) => pairs - .into_iter() - .map(|(switch, param)| ManpageEntry { - switch: to_owned_switch(switch), - param: param.map(to_owned_param), - desc: String::new(), - }) - .collect(), - Err(_) => Vec::new(), - } -} - -fn is_commands_section(name: &str) -> bool { - let upper = name.trim().to_ascii_uppercase(); - upper == "COMMANDS" || upper == "COMMAND" -} - -/// find all COMMANDS/.COMMAND sections and collect their lines. -pub fn extract_commands_section(lines: &[GroffLine]) -> Vec { - let mut acc: Vec = Vec::new(); - let mut i = 0; - while i < lines.len() { - if let GroffLine::Macro { name, args } = &lines[i] - && name == "SH" - && is_commands_section(args) - { - i += 1; - while i < lines.len() { - if let GroffLine::Macro { name, .. } = &lines[i] - && name == "SH" - { - break; - } - acc.push(lines[i].clone()); - i += 1; - } - } else { - i += 1; - } - } - acc -} - -/// extract SUBCOMMAND-style sections (clap-generated manpages put each -/// subcommand under its own .SH SUBCOMMAND header with a Usage: line). -/// returns triples of (name, description, lines) so the caller can re-parse -/// each section as its own help_result. 
-pub fn extract_subcommand_sections( - lines: &[GroffLine], -) -> Vec<(String, String, Vec)> { - // split into sections at .SH boundaries, keeping only SUBCOMMAND(S) ones - let mut sections: Vec> = Vec::new(); - let mut current_name: Option = None; - let mut current: Vec = Vec::new(); - for line in lines { - if let GroffLine::Macro { name, args } = line - && name == "SH" - { - if current_name.is_some() { - sections.push(std::mem::take(&mut current)); - } - let n = args.trim().to_ascii_uppercase(); - if n == "SUBCOMMAND" || n == "SUBCOMMANDS" { - current_name = Some(n); - } else { - current_name = None; - } - continue; - } - if current_name.is_some() { - current.push(line.clone()); - } - } - if current_name.is_some() { - sections.push(current); - } - - let mut out = Vec::new(); - for section in sections { - // scan section lines for the Usage: line to get the subcommand name - let mut subcmd_name: Option = None; - let mut desc_lines: Vec = Vec::new(); - for line in §ion { - if subcmd_name.is_some() { - break; - } - match line { - GroffLine::Text(t) => match find_usage_name(t) { - Some(name) => subcmd_name = Some(name), - None => desc_lines.push(t.clone()), - }, - GroffLine::Macro { name, args } - if matches!(name.as_str(), "TP" | "B" | "BI" | "BR") => - { - let text = strip_groff_escapes(&strip_inline_macro_args(args)); - let text = text.trim(); - subcmd_name = find_usage_name(text); - } - _ => (), - } - } - if let Some(name) = subcmd_name { - let desc_raw = desc_lines.join(" "); - let desc = strip_groff_escapes(&desc_raw).trim().to_string(); - let desc = strip_backtick_words(&desc); - out.push((name, desc, section)); - } - } - out -} - -/// look for "Usage: NAME" and return NAME if found. -/// NAME contains alphanumeric, underscore, or dash. 
-fn find_usage_name(text: &str) -> Option { - const MARKER: &str = "Usage: "; - let idx = text.find(MARKER)?; - let after = &text[idx + MARKER.len()..]; - let end = after - .find(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '-')) - .unwrap_or(after.len()); - if end == 0 { None } else { Some(after[..end].to_string()) } -} - -/// strip backtick-quoted words: `word` -> word. -fn strip_backtick_words(s: &str) -> String { - let mut out = String::with_capacity(s.len()); - let mut i = 0; - let bytes = s.as_bytes(); - while i < bytes.len() { - if bytes[i] == b'`' { - if let Some(end) = s[i + 1..].find('`') { - out.push_str(&s[i + 1..i + 1 + end]); - i += end + 2; - continue; - } - } - let c = s[i..].chars().next().unwrap(); - out.push(c); - i += c.len_utf8(); - } - out -} diff --git a/src/parsers/manpage/strategies.rs b/src/parsers/manpage/strategies.rs deleted file mode 100644 index 0eac0e9..0000000 --- a/src/parsers/manpage/strategies.rs +++ /dev/null @@ -1,346 +0,0 @@ -//! strategy-based entry extraction. -//! -//! rather than a single monolithic parser, we use multiple "strategies" that -//! each target a specific groff formatting pattern. this is necessary because -//! manpage authors use very different macro combinations for the same purpose. - -use nom::{Parser, combinator::opt}; - -use crate::make_macro_walker; -use crate::parsers::help::{help_parser, param_parser, switch_parser}; -use crate::parsers::manpage::groff::{ - GroffLine, strip_groff_escapes, strip_inline_macro_args, -}; -use crate::parsers::manpage::{ManpageEntry, OwnedParam, OwnedSwitch}; -use crate::types::{Param, Switch}; - -/// collect consecutive text lines, joining them with spaces. -/// returns (collected, remaining). 
-fn collect_text_lines(lines: &[GroffLine]) -> (String, &[GroffLine]) { - let mut acc: Vec<&str> = Vec::new(); - let mut i = 0; - while i < lines.len() { - match &lines[i] { - GroffLine::Text(t) => acc.push(t), - _ => break, - } - i += 1; - } - (acc.join(" "), &lines[i..]) -} - -fn to_owned_switch(s: Switch<'_>) -> OwnedSwitch { - match s { - Switch::Short(c) => OwnedSwitch::Short(c), - Switch::Long(l) => OwnedSwitch::Long(l.to_string()), - Switch::Both(c, l) => OwnedSwitch::Both(c, l.to_string()), - } -} - -fn to_owned_param(p: Param<'_>) -> OwnedParam { - match p { - Param::Mandatory(s) => OwnedParam::Mandatory(s.to_string()), - Param::Optional(s) => OwnedParam::Optional(s.to_string()), - } -} - -/// attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry. -/// uses the nom switch_parser + param_parser from the help module. -/// returns None if the tag doesn't look like a flag definition. -pub fn parse_tag_to_entry(tag: &str, desc: String) -> Option { - let tag = strip_groff_escapes(tag); - let tag = tag.trim(); - let result: nom::IResult<&str, (Switch<'_>, Option>)> = - (switch_parser, opt(param_parser)).parse(tag); - match result { - Ok((_, (switch, param))) => Some(ManpageEntry { - switch: to_owned_switch(switch), - param: param.map(to_owned_param), - desc, - }), - Err(_) => None, - } -} - -/// extract tag text from a macro line. -/// .B and .I preserve spaces (single argument); .BI, .BR, .IR alternate -/// fonts and concatenate arguments. -pub fn tag_of_macro(name: &str, args: &str) -> String { - match name { - "B" | "I" => strip_groff_escapes(args).trim().to_string(), - _ => strip_groff_escapes(&strip_inline_macro_args(args)) - .trim() - .to_string(), - } -} - -/// strategy a: .TP style (most common — gnu coreutils, help2man). -/// .TP introduces a tagged paragraph: the next line is the "tag" (flag name) -/// and subsequent text lines are the description. the tag can be plain text -/// or wrapped in a formatting macro (.B, .BI, etc.). 
-make_macro_walker!(pub strategy_tp -> Vec, on macro "TP" => - |lines, i, _args| { - if i + 1 >= lines.len() { None } - else { - let (tag, body_start) = match &lines[i + 1] { - GroffLine::Text(tag) => (tag.clone(), i + 2), - GroffLine::Macro { name, args } - if matches!(name.as_str(), "B" | "I" | "BI" | "BR" | "IR") => - { - (tag_of_macro(name, args), i + 2) - } - _ => return None, - }; - let (desc, rest) = collect_text_lines(&lines[body_start..]); - let new_i = lines.len() - rest.len(); - parse_tag_to_entry(&tag, desc).map(|e| (e, new_i)) - } - } -); - -/// strategy b: .IP style (curl, hand-written manpages). -/// .IP takes an inline tag argument: .IP "-v, --verbose" -/// the description follows as text lines. -make_macro_walker!(pub strategy_ip -> Vec, on macro "IP" => - |lines, i, args| { - let tag = strip_groff_escapes(args); - let (desc, rest) = collect_text_lines(&lines[i + 1..]); - let new_i = lines.len() - rest.len(); - parse_tag_to_entry(&tag, desc).map(|e| (e, new_i)) - } -); - -/// strategy c: .PP + .RS/.RE style (git, docbook-generated manpages). -/// flag entries are introduced by .PP (paragraph), with the flag name as -/// plain text, followed by a .RS (indent) block containing the description, -/// closed by .RE (de-indent). -make_macro_walker!(pub strategy_pp_rs -> Vec, on macro "PP" => - |lines, i, _args| { - if i + 1 >= lines.len() { return None; } - if let GroffLine::Text(tag) = &lines[i + 1] { - let (desc, new_i) = collect_pp_rs_desc(lines, i + 2); - parse_tag_to_entry(tag, desc).map(|e| (e, new_i)) - } else { - None - } - } -); - -fn collect_pp_rs_desc(lines: &[GroffLine], start: usize) -> (String, usize) { - let mut acc: Vec = Vec::new(); - let mut i = start; - // outer: look for .RS marker or text - while i < lines.len() { - match &lines[i] { - GroffLine::Macro { name, .. } if name == "RS" => { - i += 1; - // inside .RS — collect until .RE or boundary macro - while i < lines.len() { - match &lines[i] { - GroffLine::Macro { name, .. 
} if name == "RE" => { - return (acc.join(" "), i + 1); - } - GroffLine::Text(t) => { - acc.push(t.clone()); - i += 1; - } - GroffLine::Macro { name, .. } if name == "PP" || name == "SH" => { - return (acc.join(" "), i); - } - _ => i += 1, - } - } - return (acc.join(" "), i); - } - GroffLine::Text(t) => { - acc.push(t.clone()); - i += 1; - } - _ => return (acc.join(" "), i), - } - } - (acc.join(" "), i) -} - -/// strategy d: deroff fallback — strip all groff markup, then feed the -/// resulting plain text through the help parser. -pub fn strategy_deroff(lines: &[GroffLine]) -> Vec { - let mut buffer = String::with_capacity(256); - for line in lines { - match line { - GroffLine::Text(text) => { - buffer.push_str(text); - buffer.push('\n'); - } - GroffLine::Macro { name, args } - if matches!(name.as_str(), "BI" | "BR" | "IR" | "B" | "I") => - { - let text = strip_groff_escapes(&strip_inline_macro_args(args)); - buffer.push_str(&text); - buffer.push('\n'); - } - GroffLine::Blank => buffer.push('\n'), - _ => (), - } - } - match help_parser(&buffer) { - Ok((_, result)) => result - .entries - .into_iter() - .map(|e| ManpageEntry { - switch: to_owned_switch(e.switch), - param: e.param.map(to_owned_param), - desc: e.desc.join(" "), - }) - .collect(), - Err(_) => Vec::new(), - } -} - -fn is_bullet_ip(args: &str) -> bool { - !args.trim().is_empty() -} - -/// strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks. -/// nix's manpages use .IP with bullet markers for flag entries, interleaved -/// with .UR/.UE hyperlink macros. the flag tag is in text lines after the -/// bullet .IP, and the description follows a non-bullet .IP marker. -make_macro_walker!(pub strategy_nix -> Vec, on macro "IP" => - |lines, i, args| { - if !is_bullet_ip(args) { return None; } - // collect tag: skip .UR/.UE macros, gather Text lines - let mut tag_idx = i + 1; - let mut tag_parts: Vec = Vec::new(); - while tag_idx < lines.len() { - match &lines[tag_idx] { - GroffLine::Macro { name, .. 
} if name == "UR" || name == "UE" => { - tag_idx += 1; - } - GroffLine::Text(t) => { - tag_parts.push(t.clone()); - tag_idx += 1; - } - _ => break, - } - } - let tag = tag_parts.join(" "); - let (desc, new_i) = collect_nix_desc(lines, tag_idx); - parse_tag_to_entry(&tag, desc).map(|e| (e, new_i)) - } -); - -fn collect_nix_desc(lines: &[GroffLine], start: usize) -> (String, usize) { - if start >= lines.len() { - return (String::new(), start); - } - let mut i = start; - // require non-bullet .IP marker for description - if let GroffLine::Macro { name, args } = &lines[i] - && name == "IP" - && args.trim().is_empty() - { - i += 1; - } else { - return (String::new(), start); - } - let mut parts: Vec = Vec::new(); - while i < lines.len() { - match &lines[i] { - GroffLine::Text(t) => { - parts.push(t.clone()); - i += 1; - } - GroffLine::Macro { name, args } if name == "IP" => { - if !args.trim().is_empty() { - // next bullet entry — stop - return (parts.join(" "), i); - } - // non-bullet .IP = continuation paragraph - i += 1; - } - GroffLine::Macro { name, .. } if name == "SS" || name == "SH" => { - return (parts.join(" "), i); - } - GroffLine::Macro { name, .. } if name == "RS" => { - i = skip_rs(lines, i + 1, 1); - } - GroffLine::Macro { .. } => { - i += 1; - } - GroffLine::Blank | GroffLine::Comment => { - i += 1; - } - } - } - (parts.join(" "), i) -} - -fn skip_rs(lines: &[GroffLine], start: usize, mut depth: usize) -> usize { - let mut i = start; - while i < lines.len() { - if let GroffLine::Macro { name, .. } = &lines[i] { - if name == "RE" { - depth -= 1; - if depth == 0 { - return i + 1; - } - } else if name == "RS" { - depth += 1; - } - } - i += 1; - } - i -} - -/// count occurrences of a specific macro in the section. -fn count_macro(name: &str, lines: &[GroffLine]) -> usize { - lines - .iter() - .filter(|line| matches!(line, GroffLine::Macro { name: n, .. } if n == name)) - .count() -} - -/// auto-detect and try strategies, return the one with most entries. 
-/// first counts macros to determine which strategies are applicable, -/// then runs all applicable ones and picks the winner by entry count. -/// if no specialized strategy produces results, falls back to deroff. -pub fn extract_entries(lines: &[GroffLine]) -> Vec { - let tp = count_macro("TP", lines); - let ip = count_macro("IP", lines); - let pp = count_macro("PP", lines); - let rs = count_macro("RS", lines); - let ur = count_macro("UR", lines); - - let mut specialized: Vec<(&str, Vec)> = Vec::new(); - if tp > 0 { - specialized.push(("TP", strategy_tp(lines))); - } - if ip > 0 { - specialized.push(("IP", strategy_ip(lines))); - } - if pp > 0 && rs > 0 { - specialized.push(("PP+RS", strategy_pp_rs(lines))); - } - if ur > 0 && ip > 0 { - specialized.push(("nix", strategy_nix(lines))); - } - let candidates: Vec<(&str, Vec)> = { - let filtered: Vec<_> = specialized - .into_iter() - .filter(|(_, e)| !e.is_empty()) - .collect(); - if filtered.is_empty() { - vec![("deroff", strategy_deroff(lines))] - } else { - filtered - } - }; - let mut best: Vec = Vec::new(); - for (_, entries) in candidates { - if entries.len() >= best.len() { - best = entries; - } - } - best -} diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs deleted file mode 100644 index 1f8090a..0000000 --- a/src/parsers/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -pub mod help; -pub mod manpage; -pub mod nushell; diff --git a/src/parsers/nushell.rs b/src/parsers/nushell.rs deleted file mode 100644 index 967cc3b..0000000 --- a/src/parsers/nushell.rs +++ /dev/null @@ -1,342 +0,0 @@ -//! generate nushell `extern` definitions from parsed help data. -//! -//! this module is the code generation backend. it takes a [`ManpageResult`] -//! (from the help or manpage parsers) and produces nushell source that defines -//! `extern` declarations — nushell's mechanism for teaching the shell about -//! external commands' flags and subcommands so it can offer completions. -//! -//! key responsibilities: -//! 
- deduplicating flag entries (same flag from multiple help sources) -//! - mapping parameter names to nushell types (path, int, string) -//! - formatting flags in nushell syntax: --flag(-f): type # description -//! - handling positional arguments with nushell's ordering constraints -//! - escaping special characters for nushell string literals - -use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; -use std::sync::OnceLock; - -use crate::parsers::manpage::{ - ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch, -}; -use crate::types::Positional; - -/// nushell built-in commands and keywords — we must never generate `extern` -/// definitions for these because it would shadow nushell's own implementations. -/// maintained manually and should be updated with new nushell releases. -pub const NUSHELL_BUILTINS: &[&str] = &[ - "alias", "all", "ansi", "any", "append", "ast", "attr", - "bits", "break", "bytes", - "cal", "cd", "char", "chunk-by", "chunks", "clear", "collect", - "columns", "commandline", "compact", "complete", "config", "const", - "continue", "cp", - "date", "debug", "decode", "def", "default", "describe", "detect", - "do", "drop", "du", - "each", "echo", "encode", "enumerate", "error", "every", "exec", - "exit", "explain", "explore", "export", "export-env", "extern", - "fill", "filter", "find", "first", "flatten", "for", "format", "from", - "generate", "get", "glob", "grid", "group-by", - "hash", "headers", "help", "hide", "hide-env", "histogram", - "history", "http", - "if", "ignore", "input", "insert", "inspect", "interleave", "into", - "is-admin", "is-empty", "is-not-empty", "is-terminal", "items", - "job", "join", - "keybindings", "kill", - "last", "length", "let", "let-env", "lines", "load-env", "loop", "ls", - "match", "math", "merge", "metadata", "mkdir", "mktemp", "module", - "move", "mut", "mv", - "nu-check", "nu-highlight", - "open", "overlay", - "panic", "par-each", "parse", "path", "plugin", "port", "prepend", 
"print", "ps", - "query", - "random", "reduce", "reject", "rename", "return", "reverse", "rm", - "roll", "rotate", "run-external", - "save", "schema", "scope", "select", "seq", "shuffle", "skip", "sleep", - "slice", "sort", "sort-by", "source", "source-env", "split", "start", - "stor", "str", "sys", - "table", "take", "tee", "term", "timeit", "to", "touch", "transpose", - "try", "tutor", - "ulimit", "umask", "uname", "uniq", "uniq-by", "unlet", "update", - "upsert", "url", "use", - "values", "version", "view", - "watch", "where", "which", "while", "whoami", "window", "with-env", "wrap", - "zip", -]; - -fn builtin_set() -> &'static HashSet<&'static str> { - static SET: OnceLock> = OnceLock::new(); - SET.get_or_init(|| NUSHELL_BUILTINS.iter().copied().collect()) -} - -/// returns true if the given command name collides with a nushell built-in. -pub fn is_nushell_builtin(cmd: &str) -> bool { - builtin_set().contains(cmd) -} - -/// map parameter names to nushell types. -/// nushell's `extern` declarations use typed parameters, so we infer the type -/// from the parameter name. file/path-related names become "path" (enables -/// path completion), numeric names become "int", everything else is "string". -pub fn nushell_type_of_param(name: &str) -> &'static str { - match name { - "FILE" | "file" | "PATH" | "path" | "DIR" | "dir" | "DIRECTORY" - | "FILENAME" | "PATTERNFILE" => "path", - "NUM" | "N" | "COUNT" | "NUMBER" | "int" | "INT" | "COLS" | "WIDTH" - | "LINES" | "DEPTH" | "depth" => "int", - _ => "string", - } -} - -/// escape a string for use inside nushell double-quoted string literals. -/// only double quotes and backslashes need escaping in nushell's syntax. 
-pub fn escape_nu(s: &str) -> Cow<'_, str> { - if !s.contains('"') && !s.contains('\\') { - Cow::Borrowed(s) - } else { - let mut buf = String::with_capacity(s.len() + 4); - for c in s.chars() { - match c { - '"' => buf.push_str("\\\""), - '\\' => buf.push_str("\\\\"), - c => buf.push(c), - } - } - Cow::Owned(buf) - } -} - -fn entry_key(e: &ManpageEntry) -> String { - match &e.switch { - OwnedSwitch::Short(c) => format!("-{c}"), - OwnedSwitch::Long(l) | OwnedSwitch::Both(_, l) => format!("--{l}"), - } -} - -fn entry_score(e: &ManpageEntry) -> i32 { - let switch_bonus = if matches!(e.switch, OwnedSwitch::Both(_, _)) { 10 } else { 0 }; - let param_bonus = if e.param.is_some() { 5 } else { 0 }; - let desc_bonus = (e.desc.len() / 10).min(5) as i32; - switch_bonus + param_bonus + desc_bonus -} - -/// deduplicate flag entries that refer to the same flag. -/// -/// when the same flag appears multiple times (e.g. from overlapping manpage -/// sections or repeated help text), we keep the "best" version using a score: -/// - both short+long form present: +10 (most informative) -/// - has a parameter: +5 -/// - description length bonus: up to +5 -/// -/// after deduplication by long name, we also remove standalone short flags -/// whose letter is already covered by a Both(short, long) entry. this prevents -/// emitting both "-v" and "--verbose(-v)" which nushell would reject as a -/// duplicate. the filtering preserves original ordering from the help text. 
-pub fn dedup_entries(entries: &[ManpageEntry]) -> Vec { - let mut best: HashMap = HashMap::new(); - for e in entries { - let key = entry_key(e); - match best.get(&key) { - Some(prev) if entry_score(prev) >= entry_score(e) => {} - _ => { - best.insert(key, e); - } - } - } - let mut covered: HashSet = HashSet::new(); - for e in best.values() { - if let OwnedSwitch::Both(c, _) = &e.switch { - covered.insert(*c); - } - } - let mut seen: HashSet = HashSet::new(); - let mut out: Vec = Vec::new(); - for e in entries { - let key = entry_key(e); - if seen.contains(&key) { - continue; - } - if let OwnedSwitch::Short(c) = &e.switch - && covered.contains(c) - { - continue; - } - seen.insert(key.clone()); - out.push((*best.get(&key).unwrap()).clone()); - } - out -} - -/// format a single flag entry as a nushell `extern` parameter line. -/// output examples: -/// " --verbose(-v) # increase verbosity" -/// " --output(-o): path # write output to file" -/// " -n: int # number of results" -/// -/// the description is right-padded to column 40 with a "# " comment prefix. -pub fn format_flag(entry: &ManpageEntry) -> String { - let name = match &entry.switch { - OwnedSwitch::Both(c, l) => format!("--{l}(-{c})"), - OwnedSwitch::Long(l) => format!("--{l}"), - OwnedSwitch::Short(c) => format!("-{c}"), - }; - let typed = match &entry.param { - Some(OwnedParam::Mandatory(p)) | Some(OwnedParam::Optional(p)) => { - format!(": {}", nushell_type_of_param(p)) - } - None => String::new(), - }; - let flag = format!(" {name}{typed}"); - if entry.desc.is_empty() { - flag - } else { - let pad_len = 40usize.saturating_sub(flag.len()).max(1); - format!("{flag}{}# {}", " ".repeat(pad_len), entry.desc) - } -} - -/// format a positional argument as a nushell `extern` parameter line. -/// nushell syntax: "...name: type" for variadic, "name?: type" for optional. -/// hyphens in names are converted to underscores since nushell identifiers -/// cannot contain hyphens. 
-pub fn format_positional(name: &str, p: &Positional) -> String { - let name_underscored: String = - name.chars().map(|c| if c == '-' { '_' } else { c }).collect(); - let prefix = if p.variadic { "..." } else { "" }; - let suffix = if p.optional && !p.variadic { "?" } else { "" }; - let typ = nushell_type_of_param(&name.to_ascii_uppercase()); - format!(" {prefix}{name_underscored}{suffix}: {typ}") -} - -/// enforce nushell's positional argument ordering rules: -/// 1. no required positional may follow an optional one -/// 2. at most one variadic ("rest") parameter is allowed -/// -/// if a required positional appears after an optional one, it is silently -/// promoted to optional. duplicate variadic params are dropped. -pub fn fixup_positionals( - positionals: Vec<(String, Positional)>, -) -> Vec<(String, Positional)> { - let mut seen_optional = false; - let mut seen_variadic = false; - let mut out = Vec::with_capacity(positionals.len()); - for (name, mut p) in positionals { - if p.variadic { - if seen_variadic { - continue; - } - seen_variadic = true; - seen_optional = true; - out.push((name, p)); - } else if seen_optional { - p.optional = true; - out.push((name, p)); - } else { - seen_optional = p.optional; - out.push((name, p)); - } - } - out -} - -/// derive a nushell `module` name from a command name. -/// replaces non-alphanumeric characters with hyphens and appends "-completions". -pub fn module_name_of(cmd_name: &str) -> String { - let mut s: String = cmd_name - .chars() - .map(|c| { - if c.is_ascii_alphanumeric() || c == '-' || c == '_' { - c - } else { - '-' - } - }) - .collect(); - s.push_str("-completions"); - s -} - -/// stable-sorted view of the positionals HashMap as a Vec. -/// HashMap iteration order is unspecified; nushell needs a deterministic -/// emission order, so we sort by name. 
-fn sorted_positionals(positionals: &HashMap) -> Vec<(String, Positional)> { - let mut v: Vec<(String, Positional)> = positionals - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - v.sort_by(|a, b| a.0.cmp(&b.0)); - v -} - -/// generate the full nushell `extern` block for a command. -/// -/// produces output like: -/// export extern "git add" [ -/// ...pathspec?: path -/// --verbose(-v) # be verbose -/// --dry-run(-n) # dry run -/// ] -/// -/// subcommands that weren't resolved into their own full definitions get -/// stub `extern` blocks with just a comment containing their description: -/// export extern "git stash" [ # stash changes -/// ] -pub fn generate_extern(cmd_name: &str, result: &ManpageResult) -> String { - let entries = dedup_entries(&result.entries); - let escaped_name = escape_nu(cmd_name); - let positionals = fixup_positionals(sorted_positionals(&result.positionals)); - - let mut out = String::new(); - out.push_str(&format!("export extern \"{escaped_name}\" [\n")); - for (name, p) in &positionals { - out.push_str(&format_positional(name, p)); - out.push('\n'); - } - for entry in &entries { - out.push_str(&format_flag(entry)); - out.push('\n'); - } - out.push_str("]\n"); - - for sc in &result.subcommands { - out.push_str(&format!( - "\nexport extern \"{} {}\" [ # {}\n]\n", - escaped_name, - escape_nu(&sc.name), - escape_nu(&sc.desc) - )); - } - out -} - -/// generate a complete nushell `module` wrapping the `extern`. -/// output: "module git-completions { ... }\n\nuse git-completions *\n" -/// the `use` at the end makes the `extern` immediately available in scope. -pub fn generate_module(cmd_name: &str, result: &ManpageResult) -> String { - let mod_name = module_name_of(cmd_name); - format!( - "module {mod_name} {{\n{}}}\n\nuse {mod_name} *\n", - generate_extern(cmd_name, result) - ) -} - -/// convenience wrapper: generate an `extern` from just a list of entries. 
-pub fn generate_extern_from_entries(cmd_name: &str, entries: Vec) -> String { - generate_extern( - cmd_name, - &ManpageResult { - entries, - subcommands: Vec::new(), - positionals: HashMap::new(), - description: String::new(), - }, - ) -} - -/// stub subcommand entry used when extracting subcommands from a parsed -/// help result for nushell output. -pub fn manpage_subcommand_from(name: &str, desc: &str) -> ManpageSubcommand { - ManpageSubcommand { - name: name.to_string(), - desc: desc.to_string(), - } -} diff --git a/src/pool.rs b/src/pool.rs deleted file mode 100644 index da34a7e..0000000 --- a/src/pool.rs +++ /dev/null @@ -1,200 +0,0 @@ -//! BFS-queue worker pool for parallel subprocess scraping. -//! -//! workers pull jobs from a shared queue and call a user-supplied -//! handler; the handler gets a `Submitter` to push newly-discovered -//! child jobs back onto the same queue. when the in-flight count -//! reaches zero the pool shuts down and `wait` returns. -//! -//! the queue-back design is deliberate: command-help trees are uneven -//! (one binary has 30 subs, another has 1). queue-back keeps every -//! worker fed; spawn-in-place would leave cores idle on lopsided trees. -//! -//! synchronization: `parking_lot::Mutex>` for the queue and -//! `parking_lot::Condvar` to park workers when the queue is empty, plus -//! `AtomicUsize` for in-flight count and `AtomicBool` for shutdown. -//! parking_lot gives no-poison locks (no `Result` noise on every -//! `lock()`) and a single-syscall fast path in the uncontended case. - -use std::collections::VecDeque; -use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::thread::{self, JoinHandle}; - -use parking_lot::{Condvar, Mutex}; - -/// shared state held behind an `Arc` by every worker and by the -/// submitter handles handed to the per-job handler. -struct Inner { - queue: Mutex>, - notify: Condvar, - /// jobs created but not yet completed. 
counts both queued and - /// in-progress jobs. when this hits 0 the pool shuts down. - in_flight: AtomicUsize, - /// set once in_flight reaches 0. signals workers to wake from the - /// condvar and exit. - shutdown: AtomicBool, -} - -impl Inner { - fn submit(&self, job: J) { - self.in_flight.fetch_add(1, Ordering::SeqCst); - self.queue.lock().push_back(job); - self.notify.notify_one(); - } - - fn next(&self) -> Option { - let mut q = self.queue.lock(); - loop { - if let Some(job) = q.pop_front() { - return Some(job); - } - if self.shutdown.load(Ordering::SeqCst) { - return None; - } - self.notify.wait(&mut q); - } - } - - fn complete(&self) { - if self.in_flight.fetch_sub(1, Ordering::SeqCst) == 1 { - // we were the last in-flight job — initiate shutdown so other - // workers parked on the condvar wake up and exit. - self.shutdown.store(true, Ordering::SeqCst); - self.notify.notify_all(); - } - } -} - -/// cheap-to-clone handle that lets a job handler enqueue further jobs. -/// passed by reference to the handler closure. -pub struct Submitter { - inner: Arc>, -} - -impl Clone for Submitter { - fn clone(&self) -> Self { - Submitter { - inner: self.inner.clone(), - } - } -} - -impl Submitter { - pub fn submit(&self, job: J) { - self.inner.submit(job); - } -} - -/// BFS-queue worker pool. each worker pulls a job, calls the handler -/// (which may submit further jobs via the passed `Submitter`), then marks -/// the job complete. when in-flight reaches zero the pool shuts down and -/// `wait` returns. -pub struct ScrapePool { - inner: Arc>, - workers: Vec>, -} - -impl ScrapePool { - /// spawn `num_workers` threads that run `handler` on each job pulled - /// from the queue. the handler receives the job by value and a - /// `&Submitter` for enqueuing children. 
- pub fn new(num_workers: usize, handler: F) -> Self - where - F: Fn(J, &Submitter) + Send + Sync + 'static, - { - let inner = Arc::new(Inner { - queue: Mutex::new(VecDeque::new()), - notify: Condvar::new(), - in_flight: AtomicUsize::new(0), - shutdown: AtomicBool::new(false), - }); - let handler = Arc::new(handler); - let workers = (0..num_workers.max(1)) - .map(|_| { - let inner = inner.clone(); - let handler = handler.clone(); - thread::spawn(move || { - let submitter = Submitter { - inner: inner.clone(), - }; - while let Some(job) = inner.next() { - handler(job, &submitter); - inner.complete(); - } - }) - }) - .collect(); - ScrapePool { inner, workers } - } - - /// submit a top-level job. typically called by the orchestrating - /// thread before `wait`; handlers should use `Submitter::submit`. - pub fn submit(&self, job: J) { - self.inner.submit(job); - } - - /// block until all jobs (initial + transitively discovered) have - /// completed, then join every worker thread. - pub fn wait(self) { - // if no jobs were ever submitted, workers would block forever on - // the condvar. signal shutdown so they exit cleanly. - if self.inner.in_flight.load(Ordering::SeqCst) == 0 { - self.inner.shutdown.store(true, Ordering::SeqCst); - self.inner.notify.notify_all(); - } - for w in self.workers { - let _ = w.join(); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn flat_jobs_processed_once_each() { - let collected: Arc>> = Arc::new(Mutex::new(Vec::new())); - let pool = ScrapePool::new(4, { - let collected = collected.clone(); - move |n: u32, _: &Submitter| { - collected.lock().push(n); - } - }); - for i in 0..100u32 { - pool.submit(i); - } - pool.wait(); - let mut got = collected.lock().clone(); - got.sort(); - assert_eq!(got, (0..100).collect::>()); - } - - #[test] - fn discovered_children_processed_to_completion() { - // BFS expansion: every odd number under 10 spawns its successor. 
- let collected: Arc>> = Arc::new(Mutex::new(Vec::new())); - let pool = ScrapePool::new(2, { - let collected = collected.clone(); - move |n: u32, sub: &Submitter| { - collected.lock().push(n); - if n < 10 && n % 2 == 1 { - sub.submit(n + 1); - } - } - }); - for i in [1u32, 3, 5, 7, 9] { - pool.submit(i); - } - pool.wait(); - let mut got = collected.lock().clone(); - got.sort(); - assert_eq!(got, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - } - - #[test] - fn wait_with_no_jobs_returns_immediately() { - let pool: ScrapePool<()> = ScrapePool::new(2, |_, _| {}); - pool.wait(); - } -} diff --git a/src/store.rs b/src/store.rs deleted file mode 100644 index 59a372e..0000000 --- a/src/store.rs +++ /dev/null @@ -1,626 +0,0 @@ -//! filesystem store for parsed completion data. -//! -//! write side: serialize ManpageResult to JSON, derive sanitised -//! filenames from command names ("git add" → git_add.json). -//! -//! read side: look up a command by name across the user cache + system -//! dirs, deserialize JSON or parse a .nu extern blob back into a result. - -use std::collections::HashMap; -use std::fs; -use std::io; -use std::path::{Path, PathBuf}; - -use serde_json::Value; - -use crate::parsers::manpage::{ - ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch, -}; -use crate::types::Positional; - -/// default cache directory: $XDG_CACHE_HOME/inshellah, falling back to -/// $HOME/.cache/inshellah. -pub fn default_store_path() -> PathBuf { - if let Ok(xdg) = std::env::var("XDG_CACHE_HOME") { - if !xdg.is_empty() { - return PathBuf::from(xdg).join("inshellah"); - } - } - if let Ok(home) = std::env::var("HOME") { - return PathBuf::from(home).join(".cache/inshellah"); - } - PathBuf::from(".cache/inshellah") -} - -/// create directory and all parents. -pub fn ensure_dir(dir: &Path) -> io::Result<()> { - fs::create_dir_all(dir) -} - -/// derive a safe filename from a command name. -/// spaces in subcommand names ("git add") become "_" ("git_add"). 
-/// any other non-filesystem-safe characters are also replaced. -pub fn filename_of_command(cmd: &str) -> String { - cmd.chars() - .map(|c| match c { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.' => c, - ' ' => '_', - _ => '_', - }) - .collect() -} - -/// reverse: a filename "git_add" produces command name "git add". -/// underscores are flipped to spaces unconditionally — names that -/// genuinely contained an underscore round-trip as spaces, which is -/// acceptable since the read side is only used for display. -pub fn command_of_filename(base: &str) -> String { - base.replace('_', " ") -} - -fn escape_json(s: &str) -> String { - let mut out = String::with_capacity(s.len() + 2); - for c in s.chars() { - match c { - '"' => out.push_str("\\\""), - '\\' => out.push_str("\\\\"), - '\n' => out.push_str("\\n"), - '\r' => out.push_str("\\r"), - '\t' => out.push_str("\\t"), - '\x08' => out.push_str("\\b"), - '\x0c' => out.push_str("\\f"), - c if (c as u32) < 0x20 => { - out.push_str(&format!("\\u{:04x}", c as u32)); - } - c => out.push(c), - } - } - out -} - -fn json_string(s: &str) -> String { - format!("\"{}\"", escape_json(s)) -} - -fn json_switch(s: &OwnedSwitch) -> String { - match s { - OwnedSwitch::Short(c) => { - format!(r#"{{"type":"short","char":{}}}"#, json_string(&c.to_string())) - } - OwnedSwitch::Long(l) => { - format!(r#"{{"type":"long","name":{}}}"#, json_string(l)) - } - OwnedSwitch::Both(c, l) => format!( - r#"{{"type":"both","char":{},"name":{}}}"#, - json_string(&c.to_string()), - json_string(l) - ), - } -} - -fn json_param(p: &Option) -> String { - match p { - None => "null".to_string(), - Some(OwnedParam::Mandatory(n)) => { - format!(r#"{{"kind":"mandatory","name":{}}}"#, json_string(n)) - } - Some(OwnedParam::Optional(n)) => { - format!(r#"{{"kind":"optional","name":{}}}"#, json_string(n)) - } - } -} - -fn json_entry(e: &ManpageEntry) -> String { - format!( - r#"{{"switch":{},"param":{},"desc":{}}}"#, - json_switch(&e.switch), - 
json_param(&e.param), - json_string(&e.desc) - ) -} - -fn json_subcommand(sc: &ManpageSubcommand) -> String { - format!( - r#"{{"name":{},"desc":{}}}"#, - json_string(&sc.name), - json_string(&sc.desc) - ) -} - -fn json_positional(name: &str, p: &Positional) -> String { - format!( - r#"{{"name":{},"optional":{},"variadic":{}}}"#, - json_string(name), - p.optional, - p.variadic - ) -} - -fn json_list String>(items: &[T], f: F) -> String { - let parts: Vec = items.iter().map(|i| f(i)).collect(); - format!("[{}]", parts.join(",")) -} - -/// serialize a ManpageResult to JSON: -/// {"source":..., "description":..., "entries":[...], -/// "subcommands":[...], "positionals":[...]} -pub fn json_of_result(source: &str, result: &ManpageResult) -> String { - let entries = json_list(&result.entries, json_entry); - let subcommands = json_list(&result.subcommands, json_subcommand); - // sort positionals by name for deterministic output - let mut pos_vec: Vec<(&String, &Positional)> = result.positionals.iter().collect(); - pos_vec.sort_by(|a, b| a.0.cmp(b.0)); - let positionals_parts: Vec = pos_vec - .iter() - .map(|(name, p)| json_positional(name, p)) - .collect(); - let positionals = format!("[{}]", positionals_parts.join(",")); - format!( - r#"{{"source":{},"description":{},"entries":{},"subcommands":{},"positionals":{}}}"#, - json_string(source), - json_string(&result.description), - entries, - subcommands, - positionals, - ) -} - -pub fn write_file(path: &Path, contents: &str) -> io::Result<()> { - if let Some(parent) = path.parent() { - fs::create_dir_all(parent)?; - } - fs::write(path, contents) -} - -/// write the parsed result for `command` into `dir` as JSON. 
-pub fn write_result( - dir: &Path, - command: &str, - source: &str, - result: &ManpageResult, -) -> io::Result<()> { - let path = dir.join(format!("{}.json", filename_of_command(command))); - write_file(&path, &json_of_result(source, result)) -} - -/// write a native-nushell completion blob (the binary supplied its own). -pub fn write_native(dir: &Path, command: &str, data: &str) -> io::Result<()> { - let path = dir.join(format!("{}.nu", filename_of_command(command))); - write_file(&path, data) -} - -// --- read side --- - -fn read_file(path: &Path) -> Option { - fs::read_to_string(path).ok() -} - -fn switch_from_json(v: &Value) -> Option { - let t = v.get("type")?.as_str()?; - match t { - "short" => { - let c = v.get("char")?.as_str()?.chars().next()?; - Some(OwnedSwitch::Short(c)) - } - "long" => Some(OwnedSwitch::Long(v.get("name")?.as_str()?.to_string())), - "both" => { - let c = v.get("char")?.as_str()?.chars().next()?; - let n = v.get("name")?.as_str()?.to_string(); - Some(OwnedSwitch::Both(c, n)) - } - _ => None, - } -} - -fn param_from_json(v: &Value) -> Option { - if v.is_null() { - return None; - } - let kind = v.get("kind")?.as_str()?; - let name = v.get("name")?.as_str()?.to_string(); - Some(match kind { - "mandatory" => OwnedParam::Mandatory(name), - "optional" => OwnedParam::Optional(name), - _ => return None, - }) -} - -fn entry_from_json(v: &Value) -> Option { - let switch = switch_from_json(v.get("switch")?)?; - let param = v.get("param").and_then(param_from_json); - let desc = v - .get("desc") - .and_then(|d| d.as_str()) - .unwrap_or("") - .to_string(); - Some(ManpageEntry { switch, param, desc }) -} - -fn subcommand_from_json(v: &Value) -> Option { - let name = v.get("name")?.as_str()?.to_string(); - let desc = v - .get("desc") - .and_then(|d| d.as_str()) - .unwrap_or("") - .to_string(); - Some(ManpageSubcommand { name, desc }) -} - -fn positional_from_json(v: &Value) -> Option<(String, Positional)> { - let name = 
v.get("name")?.as_str()?.to_string(); - let optional = v - .get("optional") - .and_then(|x| x.as_bool()) - .unwrap_or(false); - let variadic = v - .get("variadic") - .and_then(|x| x.as_bool()) - .unwrap_or(false); - Some((name, Positional { optional, variadic })) -} - -/// deserialize a JSON cache entry into ManpageResult. -pub fn result_from_json(v: &Value) -> ManpageResult { - let description = v - .get("description") - .and_then(|d| d.as_str()) - .unwrap_or("") - .to_string(); - let entries = v - .get("entries") - .and_then(|x| x.as_array()) - .map(|arr| arr.iter().filter_map(entry_from_json).collect()) - .unwrap_or_default(); - let subcommands = v - .get("subcommands") - .and_then(|x| x.as_array()) - .map(|arr| arr.iter().filter_map(subcommand_from_json).collect()) - .unwrap_or_default(); - let positionals = v - .get("positionals") - .and_then(|x| x.as_array()) - .map(|arr| arr.iter().filter_map(positional_from_json).collect()) - .unwrap_or_default(); - ManpageResult { - entries, - subcommands, - positionals, - description, - } -} - -/// parse nushell `export extern` blocks out of a .nu source file. -/// -/// returns the help_result that matches `target_cmd` — its entries, -/// positionals, and any other extern blocks under it (`cmd sub`) are -/// folded into the subcommands list. 
-pub fn parse_nu_completions(target_cmd: &str, contents: &str) -> ManpageResult { - let mut blocks: Vec = Vec::new(); - let mut current_desc = String::new(); - let mut in_block = false; - let mut block = NuBlock::default(); - - for line in contents.split('\n') { - let trimmed = line.trim(); - if !in_block { - if trimmed.starts_with("# ") { - current_desc = trimmed[2..].trim().to_string(); - } else if trimmed.contains("export extern") - && let Some(cmd) = extract_extern_name(trimmed) - { - in_block = true; - block = NuBlock { - cmd, - description: std::mem::take(&mut current_desc), - ..Default::default() - }; - } else { - current_desc.clear(); - } - } else if trimmed.starts_with(']') { - blocks.push(std::mem::take(&mut block)); - in_block = false; - } else { - let (param_part, desc) = match trimmed.find('#') { - Some(idx) => (trimmed[..idx].trim(), trimmed[idx + 1..].trim()), - None => (trimmed, ""), - }; - parse_nu_param_line_into(param_part, desc, &mut block); - } - } - if in_block { - blocks.push(block); - } - - // find the block matching target_cmd - let Some(matched) = blocks.iter().find(|b| b.cmd == target_cmd) else { - return ManpageResult::default(); - }; - - // collect immediate subcommands from other blocks ("target sub" pattern) - let prefix = format!("{target_cmd} "); - let mut subcommands: Vec = Vec::new(); - for b in &blocks { - if let Some(suffix) = b.cmd.strip_prefix(&prefix) - && !suffix.contains(' ') - && !suffix.is_empty() - { - subcommands.push(ManpageSubcommand { - name: suffix.to_string(), - desc: b.description.clone(), - }); - } - } - - ManpageResult { - entries: matched.entries.clone(), - subcommands, - positionals: matched.positionals.clone(), - description: matched.description.clone(), - } -} - -fn extract_extern_name(line: &str) -> Option { - let idx = line.find("export extern")?; - let after = line[idx + "export extern".len()..].trim_start(); - if let Some(rest) = after.strip_prefix('"') { - let end = rest.find('"')?; - 
Some(rest[..end].to_string()) - } else { - let end = after - .find(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '-')) - .unwrap_or(after.len()); - if end == 0 { - None - } else { - Some(after[..end].to_string()) - } - } -} - -fn parse_nu_param_line_into(param_part: &str, desc: &str, block: &mut NuBlock) { - if param_part.len() < 2 { - return; - } - if param_part.starts_with("--") { - // long flag: --name(-c): type or --name: type or --name - let after = ¶m_part[2..]; - let (name, rest) = split_at_non_name_char(after); - if name.is_empty() { - return; - } - let mut short: Option = None; - let mut rest = rest; - if let Some(after_open) = rest.strip_prefix("(-") { - if let Some(c) = after_open.chars().next() { - if after_open[c.len_utf8()..].starts_with(')') { - short = Some(c); - rest = &after_open[c.len_utf8() + 1..]; - } - } - } - let param = parse_type_suffix(rest); - let switch = match short { - Some(c) => OwnedSwitch::Both(c, name.to_string()), - None => OwnedSwitch::Long(name.to_string()), - }; - block.entries.push(ManpageEntry { - switch, - param, - desc: desc.to_string(), - }); - } else if param_part.starts_with('-') { - // short flag: -c - if let Some(c) = param_part.chars().nth(1) - && c.is_ascii_alphanumeric() - { - block.entries.push(ManpageEntry { - switch: OwnedSwitch::Short(c), - param: None, - desc: desc.to_string(), - }); - } - } else { - // positional: name: type or name?: type or ...name: type - let variadic = param_part.starts_with("..."); - let after_prefix = if variadic { ¶m_part[3..] 
} else { param_part }; - let optional = after_prefix.contains('?'); - let name_end = after_prefix - .find(|c| c == ':' || c == '?') - .unwrap_or(after_prefix.len()); - let name = after_prefix[..name_end].trim(); - let name: String = name.chars().map(|c| if c == '-' { '_' } else { c }).collect(); - if !name.is_empty() && !name.starts_with('-') { - block.positionals.insert( - name, - Positional { - optional: optional || variadic, - variadic, - }, - ); - } - } -} - -fn split_at_non_name_char(s: &str) -> (&str, &str) { - let end = s - .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-')) - .unwrap_or(s.len()); - (&s[..end], &s[end..]) -} - -/// parse a `: type` suffix into an OwnedParam (always Mandatory since the -/// nushell extern syntax doesn't distinguish optional-with-default). -fn parse_type_suffix(s: &str) -> Option { - let s = s.trim_start(); - let s = s.strip_prefix(':')?; - let s = s.trim_start(); - let end = s - .find(|c: char| !c.is_ascii_alphabetic()) - .unwrap_or(s.len()); - if end == 0 { - None - } else { - Some(OwnedParam::Mandatory(s[..end].to_string())) - } -} - -#[derive(Default)] -struct NuBlock { - cmd: String, - entries: Vec, - positionals: HashMap, - description: String, -} - -/// look up a command's parsed result. searches JSON files first, then .nu -/// files, then the parent's .nu file for subcommand lookups (clap-generated -/// .nu files contain all extern blocks in a single file). 
-pub fn lookup(dirs: &[PathBuf], command: &str) -> Option { - let base_name = filename_of_command(command); - let parent_base = command - .find(' ') - .map(|i| filename_of_command(&command[..i])); - for directory in dirs { - let json_path = directory.join(format!("{base_name}.json")); - if let Some(data) = read_file(&json_path) - && let Ok(v) = serde_json::from_str::(&data) - { - return Some(result_from_json(&v)); - } - let nu_path = directory.join(format!("{base_name}.nu")); - if let Some(data) = read_file(&nu_path) { - return Some(parse_nu_completions(command, &data)); - } - // try parent's .nu file for subcommand lookups - if let Some(pb) = &parent_base { - let parent_nu = directory.join(format!("{pb}.nu")); - if let Some(data) = read_file(&parent_nu) { - let r = parse_nu_completions(command, &data); - if !r.entries.is_empty() - || !r.subcommands.is_empty() - || !r.positionals.is_empty() - { - return Some(r); - } - } - } - } - None -} - -/// look up a command's raw stored data (JSON or .nu source). -pub fn lookup_raw(dirs: &[PathBuf], command: &str) -> Option { - let base_name = filename_of_command(command); - for directory in dirs { - let json_path = directory.join(format!("{base_name}.json")); - if let Some(data) = read_file(&json_path) { - return Some(data); - } - let nu_path = directory.join(format!("{base_name}.nu")); - if let Some(data) = read_file(&nu_path) { - return Some(data); - } - } - None -} - -fn chop_extension(filename: &str) -> Option<&str> { - filename - .strip_suffix(".json") - .or_else(|| filename.strip_suffix(".nu")) -} - -/// list all indexed commands across all store directories. -/// returns a sorted, deduplicated list of command names. 
-pub fn all_commands(dirs: &[PathBuf]) -> Vec { - let mut out: std::collections::BTreeSet = std::collections::BTreeSet::new(); - for directory in dirs { - let Ok(entries) = fs::read_dir(directory) else { - continue; - }; - for entry in entries.flatten() { - if let Some(name) = entry.file_name().to_str() - && let Some(base) = chop_extension(name) - { - out.insert(command_of_filename(base)); - } - } - } - out.into_iter().collect() -} - -/// discover subcommands of a command by scanning filenames in the store -/// (e.g. for "git", finds "git_add.json", "git_log.json"). -pub fn subcommands_of(dirs: &[PathBuf], command: &str) -> Vec { - let prefix = format!("{}_", filename_of_command(command)); - let mut seen: HashMap = HashMap::new(); - for directory in dirs { - let Ok(entries) = fs::read_dir(directory) else { - continue; - }; - for entry in entries.flatten() { - let Some(filename) = entry.file_name().to_str().map(|s| s.to_string()) else { - continue; - }; - if !filename.starts_with(&prefix) { - continue; - } - let is_json = filename.ends_with(".json"); - let Some(base) = chop_extension(&filename) else { - continue; - }; - let rest = &base[prefix.len()..]; - if rest.is_empty() || rest.contains('_') { - continue; - } - if seen.contains_key(rest) { - continue; - } - let desc = if is_json { - read_file(&entry.path()) - .and_then(|d| serde_json::from_str::(&d).ok()) - .and_then(|v| { - v.get("description") - .and_then(|x| x.as_str()) - .map(|s| s.to_string()) - }) - .unwrap_or_default() - } else { - String::new() - }; - seen.insert( - rest.to_string(), - ManpageSubcommand { - name: rest.to_string(), - desc, - }, - ); - } - } - let mut out: Vec = seen.into_values().collect(); - out.sort_by(|a, b| a.name.cmp(&b.name)); - out -} - -/// determine how a command was indexed: "help", "manpage", "native", etc. -/// for JSON files, returns the "source" field. for .nu files, returns "native". 
-pub fn file_type_of(dirs: &[PathBuf], command: &str) -> Option { - let base = filename_of_command(command); - for directory in dirs { - let json_path = directory.join(format!("{base}.json")); - if json_path.exists() { - return Some( - read_file(&json_path) - .and_then(|d| serde_json::from_str::(&d).ok()) - .and_then(|v| v.get("source").and_then(|x| x.as_str()).map(String::from)) - .unwrap_or_else(|| "json".to_string()), - ); - } - let nu_path = directory.join(format!("{base}.nu")); - if nu_path.exists() { - return Some("native".to_string()); - } - } - None -} diff --git a/src/types.rs b/src/types.rs deleted file mode 100644 index 335138c..0000000 --- a/src/types.rs +++ /dev/null @@ -1,36 +0,0 @@ -use std::collections::HashMap; - -pub enum Switch<'a> { - Short(char), - Long(&'a str), - Both(char, &'a str), -} - -pub enum Param<'a> { - Mandatory(&'a str), - Optional(&'a str), -} - -pub struct OptionEntry<'a> { - pub switch: Switch<'a>, - pub param: Option>, - pub desc: Vec<&'a str>, -} - -pub struct Subcommand<'a> { - pub name: &'a str, - pub desc: &'a str, -} - -#[derive(Debug, Clone)] -pub struct Positional { - pub optional: bool, - pub variadic: bool, -} - -pub struct HelpResult<'a> { - pub entries: Vec>, - pub subcommands: Vec>, - pub positionals: HashMap<&'a str, Positional>, - pub desc: &'a str, -} diff --git a/test/dune b/test/dune new file mode 100644 index 0000000..d54a2fb --- /dev/null +++ b/test/dune @@ -0,0 +1,3 @@ +(test + (name test_inshellah) + (libraries inshellah str)) diff --git a/test/test_inshellah.ml b/test/test_inshellah.ml new file mode 100644 index 0000000..8f7b25e --- /dev/null +++ b/test/test_inshellah.ml @@ -0,0 +1,610 @@ +open Inshellah.Parser +open Inshellah.Manpage +open Inshellah.Nushell + +let failures = ref 0 +let passes = ref 0 + +let check name condition = + if condition then begin + incr passes; + Printf.printf " PASS: %s\n" name + end else begin + incr failures; + Printf.printf " FAIL: %s\n" name + end + +let parse txt = + match 
parse_help txt with + | Ok r -> r + | Error msg -> failwith (Printf.sprintf "parse_help failed: %s" msg) + +(* --- Help parser tests --- *) + +let test_gnu_basic () = + Printf.printf "\n== GNU basic flags ==\n"; + let r = parse " -a, --all do not ignore entries starting with .\n" in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "both switch" (e.switch = Both ('a', "all")); + check "no param" (e.param = None); + check "desc" (String.length e.desc > 0) + +let test_gnu_eq_param () = + Printf.printf "\n== GNU = param ==\n"; + let r = parse " --block-size=SIZE scale sizes by SIZE\n" in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "long switch" (e.switch = Long "block-size"); + check "mandatory param" (e.param = Some (Mandatory "SIZE")) + +let test_gnu_opt_param () = + Printf.printf "\n== GNU optional param ==\n"; + let r = parse " --color[=WHEN] color the output WHEN\n" in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "long switch" (e.switch = Long "color"); + check "optional param" (e.param = Some (Optional "WHEN")) + +let test_underscore_param () = + Printf.printf "\n== Underscore in param (TIME_STYLE) ==\n"; + let r = parse " --time-style=TIME_STYLE time/date format\n" in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "param with underscore" (e.param = Some (Mandatory "TIME_STYLE")) + +let test_short_only () = + Printf.printf "\n== Short-only flag ==\n"; + let r = parse " -v verbose output\n" in + check "one entry" (List.length r.entries = 1); + check "short switch" ((List.hd r.entries).switch = Short 'v') + +let test_long_only () = + Printf.printf "\n== Long-only flag ==\n"; + let r = parse " --help display help\n" in + check "one entry" (List.length r.entries = 1); + check "long switch" ((List.hd r.entries).switch = Long "help") + +let test_multiline_desc () = + Printf.printf "\n== Multi-line 
description ==\n"; + let r = parse {| --block-size=SIZE with -l, scale sizes by SIZE when printing them; + e.g., '--block-size=M'; see SIZE format below +|} in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "desc includes continuation" (String.length e.desc > 50) + +let test_multiple_entries () = + Printf.printf "\n== Multiple entries ==\n"; + let r = parse {| -a, --all do not ignore entries starting with . + -A, --almost-all do not list implied . and .. + --author with -l, print the author of each file +|} in + check "three entries" (List.length r.entries = 3) + +let test_clap_short_sections () = + Printf.printf "\n== Clap short with section headers ==\n"; + let r = parse {|INPUT OPTIONS: + -e, --regexp=PATTERN A pattern to search for. + -f, --file=PATTERNFILE Search for patterns from the given file. +SEARCH OPTIONS: + -s, --case-sensitive Search case sensitively. +|} in + check "three entries" (List.length r.entries = 3); + let e = List.hd r.entries in + check "first is regexp" (e.switch = Both ('e', "regexp")); + check "first has param" (e.param = Some (Mandatory "PATTERN")) + +let test_clap_long_style () = + Printf.printf "\n== Clap long style (desc below flag) ==\n"; + let r = parse {| -H, --hidden + Include hidden directories and files. + + --no-ignore + Do not respect ignore files. +|} in + check "two entries" (List.length r.entries = 2); + let e = List.hd r.entries in + check "hidden switch" (e.switch = Both ('H', "hidden")); + check "desc below" (String.length e.desc > 0) + +let test_clap_long_angle_param () = + Printf.printf "\n== Clap long angle bracket param ==\n"; + let r = parse {| --nonprintable-notation + Set notation for non-printable characters. 
+|} in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "long switch" (e.switch = Long "nonprintable-notation"); + check "angle param" (e.param = Some (Mandatory "notation")) + +let test_space_upper_param () = + Printf.printf "\n== Space-separated ALL_CAPS param ==\n"; + let r = parse " -f, --foo FOO foo help\n" in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "switch" (e.switch = Both ('f', "foo")); + check "space param" (e.param = Some (Mandatory "FOO")) + +let test_go_cobra_flags () = + Printf.printf "\n== Go/Cobra flags ==\n"; + let r = parse {|Flags: + -D, --debug Enable debug mode + -H, --host string Daemon socket to connect to + -v, --version Print version information +|} in + check "three flag entries" (List.length r.entries = 3); + (* Check the host flag has a type param *) + let host = List.nth r.entries 1 in + check "host switch" (host.switch = Both ('H', "host")); + check "host type param" (host.param = Some (Mandatory "string")) + +let test_go_cobra_subcommands () = + Printf.printf "\n== Go/Cobra subcommands ==\n"; + let r = parse {|Common Commands: + run Create and run a new container from an image + exec Execute a command in a running container + build Build an image from a Dockerfile +|} in + check "has subcommands" (List.length r.subcommands > 0) + +let test_busybox_tab () = + Printf.printf "\n== Busybox tab-indented ==\n"; + let r = parse "\t-1\tOne column output\n\t-a\tInclude names starting with .\n" in + check "two entries" (List.length r.entries = 2); + check "first is -1" ((List.hd r.entries).switch = Short '1') + +let test_no_debug_prints () = + Printf.printf "\n== No debug side effects ==\n"; + (* The old parser had print_endline at module load time. + If we got here without "opt param is running" on stdout, we're good. 
*) + check "no debug prints" true + +(* --- Manpage parser tests --- *) + +let test_manpage_tp_style () = + Printf.printf "\n== Manpage .TP style ==\n"; + let groff = {|.SH OPTIONS +.TP +\fB\-a\fR, \fB\-\-all\fR +do not ignore entries starting with . +.TP +\fB\-A\fR, \fB\-\-almost\-all\fR +do not list implied . and .. +.TP +\fB\-\-block\-size\fR=\fISIZE\fR +with \fB\-l\fR, scale sizes by SIZE +.SH AUTHOR +Written by someone. +|} in + let result = parse_manpage_string groff in + check "three entries" (List.length result.entries = 3); + if List.length result.entries >= 1 then begin + let e = List.hd result.entries in + check "first is -a/--all" (e.switch = Both ('a', "all")); + check "first desc" (String.length e.desc > 0) + end; + if List.length result.entries >= 3 then begin + let e = List.nth result.entries 2 in + check "block-size switch" (e.switch = Long "block-size"); + check "block-size param" (e.param = Some (Mandatory "SIZE")) + end + +let test_manpage_ip_style () = + Printf.printf "\n== Manpage .IP style ==\n"; + let groff = {|.SH OPTIONS +.IP "\fB\-k\fR, \fB\-\-insecure\fR" +Allow insecure connections. +.IP "\fB\-o\fR, \fB\-\-output\fR \fIfile\fR" +Write output to file. 
+.SH SEE ALSO +|} in + let result = parse_manpage_string groff in + check "two entries" (List.length result.entries = 2); + if List.length result.entries >= 1 then begin + let e = List.hd result.entries in + check "first is -k/--insecure" (e.switch = Both ('k', "insecure")) + end + +let test_manpage_groff_stripping () = + Printf.printf "\n== Groff escape stripping ==\n"; + let s = strip_groff_escapes {|\fB\-\-color\fR[=\fIWHEN\fR]|} in + check "font escapes removed" (not (String.contains s 'f' && String.contains s 'B')); + check "dashes converted" (String.contains s '-'); + let s2 = strip_groff_escapes {|\(aqhello\(aq|} in + check "aq -> quote" (String.contains s2 '\'') + +let test_manpage_empty_options () = + Printf.printf "\n== Manpage with no OPTIONS section ==\n"; + let groff = {|.SH NAME +foo \- does stuff +.SH DESCRIPTION +Does stuff. +|} in + let result = parse_manpage_string groff in + check "no entries" (List.length result.entries = 0) + +let test_slash_switch_separator () = + Printf.printf "\n== Slash switch separator (--long / -s) ==\n"; + let r = parse " --verbose / -v Increase verbosity\n" in + check "one entry" (List.length r.entries = 1); + let e = List.hd r.entries in + check "both switch" (e.switch = Both ('v', "verbose")); + check "no param" (e.param = None); + check "desc" (e.desc = "Increase verbosity") + +let test_manpage_nix3_style () = + Printf.printf "\n== Manpage nix3 style ==\n"; + let groff = {|.SH Options +.SS Logging-related options +.IP "\(bu" 3 +.UR #opt-verbose +\f(CR--verbose\fR +.UE +/ \f(CR-v\fR +.IP +Increase the logging verbosity level. +.IP "\(bu" 3 +.UR #opt-quiet +\f(CR--quiet\fR +.UE +.IP +Decrease the logging verbosity level. 
+.SH SEE ALSO +|} in + let result = parse_manpage_string groff in + check "two entries" (List.length result.entries = 2); + if List.length result.entries >= 1 then begin + let e = List.hd result.entries in + check "verbose is Both" (e.switch = Both ('v', "verbose")); + check "verbose desc" (String.length e.desc > 0) + end; + if List.length result.entries >= 2 then begin + let e = List.nth result.entries 1 in + check "quiet is Long" (e.switch = Long "quiet"); + check "quiet desc" (String.length e.desc > 0) + end + +let test_manpage_nix3_with_params () = + Printf.printf "\n== Manpage nix3 with params ==\n"; + let groff = {|.SH Options +.IP "\(bu" 3 +.UR #opt-arg +\f(CR--arg\fR +.UE +\fIname\fR \fIexpr\fR +.IP +Pass the value as the argument name to Nix functions. +.IP "\(bu" 3 +.UR #opt-include +\f(CR--include\fR +.UE +/ \f(CR-I\fR \fIpath\fR +.IP +Add path to search path entries. +.IP +This option may be given multiple times. +.SH SEE ALSO +|} in + let result = parse_manpage_string groff in + check "two entries" (List.length result.entries = 2); + if List.length result.entries >= 1 then begin + let e = List.hd result.entries in + check "arg is Long" (e.switch = Long "arg"); + check "arg has param" (e.param <> None) + end; + if List.length result.entries >= 2 then begin + let e = List.nth result.entries 1 in + check "include is Both" (e.switch = Both ('I', "include")); + check "include has path param" (e.param = Some (Mandatory "path")) + end + +let test_synopsis_subcommand () = + Printf.printf "\n== SYNOPSIS subcommand detection ==\n"; + let groff = {|.SH "SYNOPSIS" +.sp +.nf +\fBgit\fR \fBcommit\fR [\fB\-a\fR | \fB\-\-interactive\fR] +.fi +.SH "DESCRIPTION" +|} in + let cmd = extract_synopsis_command groff in + check "detected git commit" (cmd = Some "git commit") + +let test_synopsis_standalone () = + Printf.printf "\n== SYNOPSIS standalone command ==\n"; + let groff = {|.SH Synopsis +.LP +\f(CRnix-build\fR [\fIpaths\fR] +.SH Description +|} in + let cmd = 
extract_synopsis_command groff in + check "detected nix-build" (cmd = Some "nix-build") + +let test_synopsis_nix3 () = + Printf.printf "\n== SYNOPSIS nix3 subcommand ==\n"; + let groff = {|.SH Synopsis +.LP +\f(CRnix run\fR [\fIoption\fR] \fIinstallable\fR +.SH Description +|} in + let cmd = extract_synopsis_command groff in + check "detected nix run" (cmd = Some "nix run") + +(* --- Nushell generation tests --- *) + +let contains s sub = + try + let _ = Str.search_forward (Str.regexp_string sub) s 0 in true + with Not_found -> false + +let test_nushell_basic () = + Printf.printf "\n== Nushell basic extern ==\n"; + let r = parse " -a, --all do not ignore entries starting with .\n" in + let nu = generate_extern "ls" r in + check "has extern" (contains nu "export extern \"ls\""); + check "has --all(-a)" (contains nu "--all(-a)"); + check "has comment" (contains nu "# do not ignore") + +let test_nushell_param_types () = + Printf.printf "\n== Nushell param type mapping ==\n"; + let r = parse {| -w, --width=COLS set output width + --block-size=SIZE scale sizes + -o, --output FILE output file +|} in + let nu = generate_extern "ls" r in + check "COLS -> int" (contains nu "--width(-w): int"); + check "SIZE -> string" (contains nu "--block-size: string"); + check "FILE -> path" (contains nu "--output(-o): path") + +let test_nushell_subcommands () = + Printf.printf "\n== Nushell subcommands ==\n"; + let r = parse {|Common Commands: + run Create and run a new container + exec Execute a command + +Flags: + -D, --debug Enable debug mode +|} in + let nu = generate_extern "docker" r in + check "has main extern" (contains nu "export extern \"docker\""); + check "has --debug" (contains nu "--debug(-D)"); + check "has run subcommand" (contains nu "export extern \"docker run\""); + check "has exec subcommand" (contains nu "export extern \"docker exec\"") + +let test_nushell_from_manpage () = + Printf.printf "\n== Nushell from manpage ==\n"; + let groff = {|.SH OPTIONS +.TP 
+\fB\-a\fR, \fB\-\-all\fR +do not ignore entries starting with . +.TP +\fB\-\-block\-size\fR=\fISIZE\fR +scale sizes by SIZE +.SH AUTHOR +|} in + let result = parse_manpage_string groff in + let nu = generate_extern "ls" result in + check "has extern" (contains nu "export extern \"ls\""); + check "has --all(-a)" (contains nu "--all(-a)"); + check "has --block-size" (contains nu "--block-size: string") + +let test_nushell_module () = + Printf.printf "\n== Nushell module wrapper ==\n"; + let r = parse " -v, --verbose verbose output\n" in + let nu = generate_module "myapp" r in + check "has module" (contains nu "module myapp-completions"); + check "has extern inside" (contains nu "export extern \"myapp\""); + check "has flag" (contains nu "--verbose(-v)") + +let test_dedup_entries () = + Printf.printf "\n== Deduplication ==\n"; + let r = parse {| -v, --verbose verbose output + --verbose verbose mode + -v be verbose +|} in + let nu = generate_extern "test" r in + (* Count occurrences of --verbose *) + let count = + let re = Str.regexp_string "--verbose" in + let n = ref 0 in + let i = ref 0 in + (try while true do + let _ = Str.search_forward re nu !i in + incr n; i := Str.match_end () + done with Not_found -> ()); + !n + in + check "verbose appears once" (count = 1); + check "best version kept (Both)" (contains nu "--verbose(-v)") + +let test_dedup_manpage () = + Printf.printf "\n== Dedup from manpage ==\n"; + let groff = {|.SH OPTIONS +.TP +\fB\-v\fR, \fB\-\-verbose\fR +Be verbose. +.SH DESCRIPTION +Use \fB\-v\fR for verbose output. +Use \fB\-\-verbose\fR to see more. 
+|} in + let result = parse_manpage_string groff in + let nu = generate_extern "test" result in + check "has --verbose(-v)" (contains nu "--verbose(-v)"); + (* Should not have standalone -v or duplicate --verbose *) + let lines = String.split_on_char '\n' nu in + let verbose_lines = List.filter (fun l -> contains l "verbose") lines in + check "only one verbose line" (List.length verbose_lines = 1) + +let test_commands_section_subcommands () = + Printf.printf "\n== COMMANDS section subcommand extraction ==\n"; + (* manpages like systemctl have a COMMANDS section with bold command names + * inside .PP + .RS/.RE blocks. these should be extracted as subcommands + * and treated as leaf nodes (no entries of their own). *) + let groff = {|.SH OPTIONS +.TP +\fB\-\-user\fR +Talk to the service manager of the calling user. +.TP +\fB\-\-system\fR +Talk to the service manager of the system. +.SH COMMANDS +.PP +\fBstart\fR \fIUNIT\fR\&... +.RS 4 +Start (activate) one or more units. +.RE +.PP +\fBstop\fR \fIUNIT\fR\&... +.RS 4 +Stop (deactivate) one or more units. +.RE +.PP +\fBreload\fR \fIUNIT\fR\&... +.RS 4 +Asks all units to reload their configuration. 
+.RE +.SH SEE ALSO +|} in + let result = parse_manpage_string groff in + check "has options entries" (List.length result.entries = 2); + check "has subcommands" (List.length result.subcommands = 3); + let sc_names = List.map (fun (sc : subcommand) -> sc.name) result.subcommands in + check "has start" (List.mem "start" sc_names); + check "has stop" (List.mem "stop" sc_names); + check "has reload" (List.mem "reload" sc_names); + (* verify subcommand descriptions are extracted *) + let start_sc = List.find (fun (sc : subcommand) -> sc.name = "start") result.subcommands in + check "start has desc" (String.length start_sc.desc > 0) + +let test_self_listing_detection () = + Printf.printf "\n== Self-listing subcommand detection ==\n"; + (* when a subcommand's --help shows the parent's help text, + * the subcommand name appears in its own subcommand list. + * the parser should detect this — tested via parse_help. *) + let help_text = {|systemctl [OPTIONS...] COMMAND ... + +Unit Commands: + start UNIT... Start (activate) one or more units + stop UNIT... Stop (deactivate) one or more units + status [PATTERN...] Show runtime status + +Options: + --user Talk to the user service manager + --system Talk to the system service manager +|} in + let r = parse help_text in + let has_start = List.exists (fun (sc : subcommand) -> sc.name = "start") r.subcommands in + check "detected start as subcommand" has_start; + (* the self-listing logic (in main.ml) would check: is "start" in r.subcommands? + * here we just verify the parser extracts it correctly. 
*) + check "has entries too" (List.length r.entries >= 2) + +let test_nu_file_parsing () = + Printf.printf "\n== .nu file parsing ==\n"; + let nu_source = {|module completions { + + # Unofficial CLI tool + export extern mytool [ + --help(-h) # Print help + --version(-V) # Print version + ] + + # List all items + export extern "mytool list" [ + --raw # Output as JSON + --format(-f): string # Output format + --help(-h) # Print help + name?: string # Filter by name + ] + +} + +use completions * +|} in + let r = Inshellah.Store.parse_nu_completions "mytool" nu_source in + check "has entries" (List.length r.entries = 2); + check "has subcommands" (List.length r.subcommands >= 1); + let list_sc = List.find_opt (fun (sc : subcommand) -> sc.name = "list") r.subcommands in + check "has list subcommand" (list_sc <> None); + check "description" (r.description = "Unofficial CLI tool"); + (* test subcommand lookup *) + let r2 = Inshellah.Store.parse_nu_completions "mytool list" nu_source in + check "list has entries" (List.length r2.entries = 3); + let has_format = List.exists (fun (e : entry) -> + e.switch = Both ('f', "format")) r2.entries in + check "list has --format(-f)" has_format; + check "list has positional" (List.length r2.positionals >= 1) + +let test_italic_synopsis () = + Printf.printf "\n== Italic in SYNOPSIS ==\n"; + let groff = {|.SH Synopsis +.LP +\f(CRnix-env\fR \fIoperation\fR [\fIoptions\fR] [\fIarguments…\fR] +.SH Description +|} in + let cmd = extract_synopsis_command groff in + check "no phantom operation" (cmd = Some "nix-env") + +let test_font_boundary_spacing () = + Printf.printf "\n== Font boundary spacing ==\n"; + (* \fB--max-results\fR\fIcount\fR should become "--max-results count" *) + let s = strip_groff_escapes {|\fB\-\-max\-results\fR\fIcount\fR|} in + check "has space before param" (contains s "--max-results count"); + (* \fB--color\fR[=\fIWHEN\fR] should NOT insert space before = *) + let s2 = strip_groff_escapes 
{|\fB\-\-color\fR[=\fIWHEN\fR]|} in + check "no space before =" (contains s2 "--color[=WHEN]") + +let () = + Printf.printf "Running help parser tests...\n"; + test_gnu_basic (); + test_gnu_eq_param (); + test_gnu_opt_param (); + test_underscore_param (); + test_short_only (); + test_long_only (); + test_multiline_desc (); + test_multiple_entries (); + test_clap_short_sections (); + test_clap_long_style (); + test_clap_long_angle_param (); + test_space_upper_param (); + test_go_cobra_flags (); + test_go_cobra_subcommands (); + test_busybox_tab (); + test_no_debug_prints (); + + Printf.printf "\nRunning manpage parser tests...\n"; + test_manpage_tp_style (); + test_manpage_ip_style (); + test_manpage_groff_stripping (); + test_manpage_empty_options (); + test_slash_switch_separator (); + test_manpage_nix3_style (); + test_manpage_nix3_with_params (); + test_synopsis_subcommand (); + test_synopsis_standalone (); + test_synopsis_nix3 (); + + Printf.printf "\nRunning nushell generation tests...\n"; + test_nushell_basic (); + test_nushell_param_types (); + test_nushell_subcommands (); + test_nushell_from_manpage (); + test_nushell_module (); + + Printf.printf "\nRunning dedup and font tests...\n"; + test_dedup_entries (); + test_dedup_manpage (); + test_font_boundary_spacing (); + + Printf.printf "\nRunning COMMANDS section tests...\n"; + test_commands_section_subcommands (); + test_self_listing_detection (); + + Printf.printf "\nRunning .nu and synopsis tests...\n"; + test_nu_file_parsing (); + test_italic_synopsis (); + + Printf.printf "\n=== Results: %d passed, %d failed ===\n" !passes !failures; + if !failures > 0 then exit 1 diff --git a/tests/git_clone_fix.rs b/tests/git_clone_fix.rs deleted file mode 100644 index 38443d2..0000000 --- a/tests/git_clone_fix.rs +++ /dev/null @@ -1,23 +0,0 @@ -use inshellah::parsers::help::help_parser; - -#[test] -fn parser_recovers_past_no_bracket_long_form() { - // git clone -h produces lines like `--[no-]progress` that 
switch_parser - // can't parse. previously the help parser got stuck on these because - // skip_non_option_line refused to skip option-looking lines. now it falls - // through to skip, letting the parser continue to the next real entry. - let text = r#"usage: git clone [] [--] [

] - - -v, --[no-]verbose be more verbose - -q, --[no-]quiet be more quiet - --[no-]progress force progress reporting - --[no-]reject-shallow don't clone shallow repository - -n, --no-checkout don't create a checkout - --checkout opposite of --no-checkout - -s, --[no-]shared setup as shared repository -"#; - let (_, r) = help_parser(text).expect("parse"); - // before the fix: only 2 entries (-v, -q) before the parser got stuck. - // after: -v, -q, -n/--no-checkout, --checkout, -s, plus any others. - assert!(r.entries.len() >= 4, "expected ≥4 entries, got {}", r.entries.len()); -} diff --git a/tests/ports.rs b/tests/ports.rs deleted file mode 100644 index 3392679..0000000 --- a/tests/ports.rs +++ /dev/null @@ -1,517 +0,0 @@ -//! Tests ported from ../inshellah/test/test_inshellah.ml. -//! -//! Covers the help parser, manpage parser, groff stripping, and nushell -//! generation. The single .nu store parser test (`test_nu_file_parsing`) is -//! not included — it requires porting store.ml first. 
- -use inshellah::parsers::help::help_parser; -use inshellah::parsers::manpage::{ - ManpageResult, OwnedParam, OwnedSwitch, extract_synopsis_command, - parse_manpage_string, strip_groff_escapes, -}; -use inshellah::parsers::nushell::{generate_extern, generate_module}; -use inshellah::store::parse_nu_completions; -use inshellah::types::{HelpResult, Param, Switch}; - -fn parse(txt: &str) -> HelpResult<'_> { - match help_parser(txt) { - Ok((_, r)) => r, - Err(e) => panic!("parse_help failed: {e:?}"), - } -} - -// --- Help parser tests --- - -#[test] -fn gnu_basic() { - let r = parse(" -a, --all do not ignore entries starting with .\n"); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Both('a', l) if *l == "all")); - assert!(e.param.is_none()); - assert!(!e.desc.is_empty()); -} - -#[test] -fn gnu_eq_param() { - let r = parse(" --block-size=SIZE scale sizes by SIZE\n"); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Long(l) if *l == "block-size")); - assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "SIZE")); -} - -#[test] -fn gnu_opt_param() { - let r = parse(" --color[=WHEN] color the output WHEN\n"); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Long(l) if *l == "color")); - assert!(matches!(&e.param, Some(Param::Optional(p)) if *p == "WHEN")); -} - -#[test] -fn underscore_param() { - let r = parse(" --time-style=TIME_STYLE time/date format\n"); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "TIME_STYLE")); -} - -#[test] -fn short_only() { - let r = parse(" -v verbose output\n"); - assert_eq!(r.entries.len(), 1); - assert!(matches!(r.entries[0].switch, Switch::Short('v'))); -} - -#[test] -fn long_only() { - let r = parse(" --help display help\n"); - assert_eq!(r.entries.len(), 1); - assert!(matches!(&r.entries[0].switch, 
Switch::Long(l) if *l == "help")); -} - -#[test] -fn multiline_desc() { - let txt = " --block-size=SIZE with -l, scale sizes by SIZE when printing them;\n e.g., '--block-size=M'; see SIZE format below\n"; - let r = parse(txt); - assert_eq!(r.entries.len(), 1); - let combined: String = r.entries[0].desc.join(" "); - assert!(combined.len() > 50, "desc was: {combined}"); -} - -#[test] -fn multiple_entries() { - let txt = " -a, --all do not ignore entries starting with .\n -A, --almost-all do not list implied . and ..\n --author with -l, print the author of each file\n"; - let r = parse(txt); - assert_eq!(r.entries.len(), 3); -} - -#[test] -fn clap_short_sections() { - let txt = "INPUT OPTIONS:\n -e, --regexp=PATTERN A pattern to search for.\n -f, --file=PATTERNFILE Search for patterns from the given file.\nSEARCH OPTIONS:\n -s, --case-sensitive Search case sensitively.\n"; - let r = parse(txt); - assert_eq!(r.entries.len(), 3); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Both('e', l) if *l == "regexp")); - assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "PATTERN")); -} - -#[test] -fn clap_long_style() { - let txt = " -H, --hidden\n Include hidden directories and files.\n\n --no-ignore\n Do not respect ignore files.\n"; - let r = parse(txt); - assert_eq!(r.entries.len(), 2); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Both('H', l) if *l == "hidden")); - assert!(!e.desc.is_empty()); -} - -#[test] -fn clap_long_angle_param() { - let txt = " --nonprintable-notation \n Set notation for non-printable characters.\n"; - let r = parse(txt); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Long(l) if *l == "nonprintable-notation")); - assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "notation")); -} - -#[test] -fn space_upper_param() { - let r = parse(" -f, --foo FOO foo help\n"); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - 
assert!(matches!(&e.switch, Switch::Both('f', l) if *l == "foo")); - assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "FOO")); -} - -#[test] -fn go_cobra_flags() { - let txt = "Flags:\n -D, --debug Enable debug mode\n -H, --host string Daemon socket to connect to\n -v, --version Print version information\n"; - let r = parse(txt); - assert_eq!(r.entries.len(), 3); - let host = &r.entries[1]; - assert!(matches!(&host.switch, Switch::Both('H', l) if *l == "host")); - assert!(matches!(&host.param, Some(Param::Mandatory(p)) if *p == "string")); -} - -#[test] -fn go_cobra_subcommands() { - let txt = "Common Commands:\n run Create and run a new container from an image\n exec Execute a command in a running container\n build Build an image from a Dockerfile\n"; - let r = parse(txt); - assert!(!r.subcommands.is_empty(), "expected subcommands, got: {:?}", r.subcommands.len()); -} - -#[test] -fn busybox_tab() { - let r = parse("\t-1\tOne column output\n\t-a\tInclude names starting with .\n"); - assert_eq!(r.entries.len(), 2); - assert!(matches!(r.entries[0].switch, Switch::Short('1'))); -} - -#[test] -fn no_debug_prints() { - // the old ocaml parser had print_endline at module load time; this test - // documents that no such side effects exist in the rust port. - let _ = parse(" -v verbose\n"); -} - -#[test] -fn slash_switch_separator() { - let r = parse(" --verbose / -v Increase verbosity\n"); - assert_eq!(r.entries.len(), 1); - let e = &r.entries[0]; - assert!(matches!(&e.switch, Switch::Both('v', l) if *l == "verbose")); - assert!(e.param.is_none()); - let combined: String = e.desc.join(" "); - assert_eq!(combined.trim(), "Increase verbosity"); -} - -// --- Manpage parser tests --- - -#[test] -fn manpage_tp_style() { - let groff = r#".SH OPTIONS -.TP -\fB\-a\fR, \fB\-\-all\fR -do not ignore entries starting with . -.TP -\fB\-A\fR, \fB\-\-almost\-all\fR -do not list implied . and .. 
-.TP -\fB\-\-block\-size\fR=\fISIZE\fR -with \fB\-l\fR, scale sizes by SIZE -.SH AUTHOR -Written by someone. -"#; - let r = parse_manpage_string(groff); - assert_eq!(r.entries.len(), 3, "entries: {:?}", r.entries); - assert!(matches!(&r.entries[0].switch, OwnedSwitch::Both('a', l) if l == "all")); - assert!(!r.entries[0].desc.is_empty()); - assert!(matches!(&r.entries[2].switch, OwnedSwitch::Long(l) if l == "block-size")); - assert!(matches!(&r.entries[2].param, Some(OwnedParam::Mandatory(p)) if p == "SIZE")); -} - -#[test] -fn manpage_ip_style() { - let groff = r#".SH OPTIONS -.IP "\fB\-k\fR, \fB\-\-insecure\fR" -Allow insecure connections. -.IP "\fB\-o\fR, \fB\-\-output\fR \fIfile\fR" -Write output to file. -.SH SEE ALSO -"#; - let r = parse_manpage_string(groff); - assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); - assert!(matches!(&r.entries[0].switch, OwnedSwitch::Both('k', l) if l == "insecure")); -} - -#[test] -fn manpage_groff_stripping() { - let s = strip_groff_escapes(r#"\fB\-\-color\fR[=\fIWHEN\fR]"#); - // font escapes removed - assert!(!(s.contains('f') && s.contains('B') && s.contains('\\'))); - // dashes converted - assert!(s.contains('-')); - let s2 = strip_groff_escapes(r#"\(aqhello\(aq"#); - assert!(s2.contains('\''), "expected apostrophe in: {s2}"); -} - -#[test] -fn manpage_empty_options() { - let groff = ".SH NAME\nfoo \\- does stuff\n.SH DESCRIPTION\nDoes stuff.\n"; - let r = parse_manpage_string(groff); - assert_eq!(r.entries.len(), 0); -} - -#[test] -fn manpage_nix3_style() { - let groff = r#".SH Options -.SS Logging-related options -.IP "\(bu" 3 -.UR #opt-verbose -\f(CR--verbose\fR -.UE -/ \f(CR-v\fR -.IP -Increase the logging verbosity level. -.IP "\(bu" 3 -.UR #opt-quiet -\f(CR--quiet\fR -.UE -.IP -Decrease the logging verbosity level. 
-.SH SEE ALSO -"#; - let r = parse_manpage_string(groff); - assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); - assert!(matches!(&r.entries[0].switch, OwnedSwitch::Both('v', l) if l == "verbose")); - assert!(!r.entries[0].desc.is_empty()); - assert!(matches!(&r.entries[1].switch, OwnedSwitch::Long(l) if l == "quiet")); - assert!(!r.entries[1].desc.is_empty()); -} - -#[test] -fn manpage_nix3_with_params() { - let groff = r#".SH Options -.IP "\(bu" 3 -.UR #opt-arg -\f(CR--arg\fR -.UE -\fIname\fR \fIexpr\fR -.IP -Pass the value as the argument name to Nix functions. -.IP "\(bu" 3 -.UR #opt-include -\f(CR--include\fR -.UE -/ \f(CR-I\fR \fIpath\fR -.IP -Add path to search path entries. -.IP -This option may be given multiple times. -.SH SEE ALSO -"#; - let r = parse_manpage_string(groff); - assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); - assert!(matches!(&r.entries[0].switch, OwnedSwitch::Long(l) if l == "arg")); - assert!(r.entries[0].param.is_some()); - assert!(matches!(&r.entries[1].switch, OwnedSwitch::Both('I', l) if l == "include")); - assert!(matches!(&r.entries[1].param, Some(OwnedParam::Mandatory(p)) if p == "path")); -} - -#[test] -fn synopsis_subcommand() { - let groff = r#".SH "SYNOPSIS" -.sp -.nf -\fBgit\fR \fBcommit\fR [\fB\-a\fR | \fB\-\-interactive\fR] -.fi -.SH "DESCRIPTION" -"#; - let cmd = extract_synopsis_command(groff); - assert_eq!(cmd.as_deref(), Some("git commit")); -} - -#[test] -fn synopsis_standalone() { - let groff = ".SH Synopsis\n.LP\n\\f(CRnix-build\\fR [\\fIpaths\\fR]\n.SH Description\n"; - let cmd = extract_synopsis_command(groff); - assert_eq!(cmd.as_deref(), Some("nix-build")); -} - -#[test] -fn synopsis_nix3() { - let groff = ".SH Synopsis\n.LP\n\\f(CRnix run\\fR [\\fIoption\\fR] \\fIinstallable\\fR\n.SH Description\n"; - let cmd = extract_synopsis_command(groff); - assert_eq!(cmd.as_deref(), Some("nix run")); -} - -#[test] -fn italic_synopsis() { - let groff = ".SH Synopsis\n.LP\n\\f(CRnix-env\\fR 
\\fIoperation\\fR [\\fIoptions\\fR] [\\fIarguments…\\fR]\n.SH Description\n"; - let cmd = extract_synopsis_command(groff); - assert_eq!(cmd.as_deref(), Some("nix-env")); -} - -// --- Font/dedup tests (only the font-spacing one is portable) --- - -#[test] -fn font_boundary_spacing() { - // \fB--max-results\fR\fIcount\fR should become "--max-results count" - let s = strip_groff_escapes(r#"\fB\-\-max\-results\fR\fIcount\fR"#); - assert!(s.contains("--max-results count"), "got: {s}"); - // \fB--color\fR[=\fIWHEN\fR] should NOT insert space before = - let s2 = strip_groff_escapes(r#"\fB\-\-color\fR[=\fIWHEN\fR]"#); - assert!(s2.contains("--color[=WHEN]"), "got: {s2}"); -} - -// --- COMMANDS section tests --- - -#[test] -fn commands_section_subcommands() { - let groff = r#".SH OPTIONS -.TP -\fB\-\-user\fR -Talk to the service manager of the calling user. -.TP -\fB\-\-system\fR -Talk to the service manager of the system. -.SH COMMANDS -.PP -\fBstart\fR \fIUNIT\fR\&... -.RS 4 -Start (activate) one or more units. -.RE -.PP -\fBstop\fR \fIUNIT\fR\&... -.RS 4 -Stop (deactivate) one or more units. -.RE -.PP -\fBreload\fR \fIUNIT\fR\&... -.RS 4 -Asks all units to reload their configuration. 
-.RE -.SH SEE ALSO -"#; - let r = parse_manpage_string(groff); - assert_eq!(r.entries.len(), 2, "options entries: {:?}", r.entries); - assert_eq!(r.subcommands.len(), 3, "subcommands: {:?}", r.subcommands); - let names: Vec<&str> = r.subcommands.iter().map(|sc| sc.name.as_str()).collect(); - assert!(names.contains(&"start")); - assert!(names.contains(&"stop")); - assert!(names.contains(&"reload")); - let start_sc = r.subcommands.iter().find(|sc| sc.name == "start").unwrap(); - assert!(!start_sc.desc.is_empty()); -} - -// --- Nushell generation tests --- - -fn to_owned_result(r: &HelpResult<'_>) -> ManpageResult { - r.into() -} - -#[test] -fn nushell_basic() { - let r = parse(" -a, --all do not ignore entries starting with .\n"); - let nu = generate_extern("ls", &to_owned_result(&r)); - assert!(nu.contains("export extern \"ls\""), "nu = {nu}"); - assert!(nu.contains("--all(-a)"), "nu = {nu}"); - assert!(nu.contains("# do not ignore"), "nu = {nu}"); -} - -#[test] -fn nushell_param_types() { - let txt = " -w, --width=COLS set output width\n --block-size=SIZE scale sizes\n -o, --output FILE output file\n"; - let r = parse(txt); - let nu = generate_extern("ls", &to_owned_result(&r)); - assert!(nu.contains("--width(-w): int"), "nu = {nu}"); - assert!(nu.contains("--block-size: string"), "nu = {nu}"); - assert!(nu.contains("--output(-o): path"), "nu = {nu}"); -} - -#[test] -fn nushell_subcommands() { - let txt = "Common Commands:\n run Create and run a new container\n exec Execute a command\n\nFlags:\n -D, --debug Enable debug mode\n"; - let r = parse(txt); - let nu = generate_extern("docker", &to_owned_result(&r)); - assert!(nu.contains("export extern \"docker\""), "nu = {nu}"); - assert!(nu.contains("--debug(-D)"), "nu = {nu}"); - assert!(nu.contains("export extern \"docker run\""), "nu = {nu}"); - assert!(nu.contains("export extern \"docker exec\""), "nu = {nu}"); -} - -#[test] -fn nushell_from_manpage() { - let groff = r#".SH OPTIONS -.TP -\fB\-a\fR, \fB\-\-all\fR -do 
not ignore entries starting with . -.TP -\fB\-\-block\-size\fR=\fISIZE\fR -scale sizes by SIZE -.SH AUTHOR -"#; - let result = parse_manpage_string(groff); - let nu = generate_extern("ls", &result); - assert!(nu.contains("export extern \"ls\""), "nu = {nu}"); - assert!(nu.contains("--all(-a)"), "nu = {nu}"); - assert!(nu.contains("--block-size: string"), "nu = {nu}"); -} - -#[test] -fn nushell_module() { - let r = parse(" -v, --verbose verbose output\n"); - let nu = generate_module("myapp", &to_owned_result(&r)); - assert!(nu.contains("module myapp-completions"), "nu = {nu}"); - assert!(nu.contains("export extern \"myapp\""), "nu = {nu}"); - assert!(nu.contains("--verbose(-v)"), "nu = {nu}"); -} - -#[test] -fn dedup_entries_help() { - let txt = " -v, --verbose verbose output\n --verbose verbose mode\n -v be verbose\n"; - let r = parse(txt); - let nu = generate_extern("test", &to_owned_result(&r)); - let count = nu.matches("--verbose").count(); - assert_eq!(count, 1, "expected --verbose to appear once, nu = {nu}"); - assert!(nu.contains("--verbose(-v)"), "nu = {nu}"); -} - -#[test] -fn dedup_manpage_entries() { - let groff = r#".SH OPTIONS -.TP -\fB\-v\fR, \fB\-\-verbose\fR -Be verbose. -.SH DESCRIPTION -Use \fB\-v\fR for verbose output. -Use \fB\-\-verbose\fR to see more. 
-"#; - let result = parse_manpage_string(groff); - let nu = generate_extern("test", &result); - assert!(nu.contains("--verbose(-v)"), "nu = {nu}"); - let verbose_lines: Vec<&str> = - nu.lines().filter(|l| l.contains("verbose")).collect(); - assert_eq!(verbose_lines.len(), 1, "expected 1 verbose line, got: {verbose_lines:?}"); -} - -#[test] -fn nu_file_parsing() { - let nu_source = r#"module completions { - - # Unofficial CLI tool - export extern mytool [ - --help(-h) # Print help - --version(-V) # Print version - ] - - # List all items - export extern "mytool list" [ - --raw # Output as JSON - --format(-f): string # Output format - --help(-h) # Print help - name?: string # Filter by name - ] - -} - -use completions * -"#; - let r = parse_nu_completions("mytool", nu_source); - assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); - assert!(!r.subcommands.is_empty(), "subcommands: {:?}", r.subcommands); - assert!(r.subcommands.iter().any(|sc| sc.name == "list")); - assert_eq!(r.description, "Unofficial CLI tool"); - - let r2 = parse_nu_completions("mytool list", nu_source); - assert_eq!(r2.entries.len(), 3, "list entries: {:?}", r2.entries); - let has_format = r2.entries.iter().any( - |e| matches!(&e.switch, OwnedSwitch::Both('f', l) if l == "format"), - ); - assert!(has_format, "list should have --format(-f): {:?}", r2.entries); - assert!(!r2.positionals.is_empty(), "list should have a positional"); -} - -#[test] -fn self_listing_detection() { - let txt = r#"systemctl [OPTIONS...] COMMAND ... - -Unit Commands: - start UNIT... Start (activate) one or more units - stop UNIT... Stop (deactivate) one or more units - status [PATTERN...] 
Show runtime status - -Options: - --user Talk to the user service manager - --system Talk to the system service manager -"#; - let r = parse(txt); - let has_start = r.subcommands.iter().any(|sc| sc.name == "start"); - assert!(has_start, "expected start in subcommands: {:?}", - r.subcommands.iter().map(|sc| sc.name).collect::>()); - assert!(r.entries.len() >= 2); -}