From 259d13b6244e34e16f61c5c70d1086165f44da2e Mon Sep 17 00:00:00 2001 From: psteinroe Date: Tue, 1 Jul 2025 09:02:44 +0200 Subject: [PATCH 1/8] refactor: parser --- Cargo.lock | 265 +++++- Cargo.toml | 7 + crates/pgt_lexer_new/Cargo.toml | 19 + crates/pgt_lexer_new/README.md | 1 + crates/pgt_lexer_new/src/cursor.rs | 65 ++ crates/pgt_lexer_new/src/lib.rs | 805 +++++++++++++++++++ crates/pgt_lexer_new/src/token.rs | 166 ++++ crates/pgt_parser/Cargo.toml | 21 + crates/pgt_parser/README.md | 1 + crates/pgt_parser/src/codegen/mod.rs | 1 + crates/pgt_parser/src/codegen/syntax_kind.rs | 1 + crates/pgt_parser/src/lexed_str.rs | 265 ++++++ crates/pgt_parser/src/lib.rs | 4 + crates/pgt_parser_codegen/Cargo.toml | 26 + crates/pgt_parser_codegen/README.md | 1 + crates/pgt_parser_codegen/build.rs | 51 ++ crates/pgt_parser_codegen/src/keywords.rs | 203 +++++ crates/pgt_parser_codegen/src/lib.rs | 9 + crates/pgt_parser_codegen/src/syntax_kind.rs | 123 +++ 19 files changed, 2032 insertions(+), 2 deletions(-) create mode 100644 crates/pgt_lexer_new/Cargo.toml create mode 100644 crates/pgt_lexer_new/README.md create mode 100644 crates/pgt_lexer_new/src/cursor.rs create mode 100644 crates/pgt_lexer_new/src/lib.rs create mode 100644 crates/pgt_lexer_new/src/token.rs create mode 100644 crates/pgt_parser/Cargo.toml create mode 100644 crates/pgt_parser/README.md create mode 100644 crates/pgt_parser/src/codegen/mod.rs create mode 100644 crates/pgt_parser/src/codegen/syntax_kind.rs create mode 100644 crates/pgt_parser/src/lexed_str.rs create mode 100644 crates/pgt_parser/src/lib.rs create mode 100644 crates/pgt_parser_codegen/Cargo.toml create mode 100644 crates/pgt_parser_codegen/README.md create mode 100644 crates/pgt_parser_codegen/build.rs create mode 100644 crates/pgt_parser_codegen/src/keywords.rs create mode 100644 crates/pgt_parser_codegen/src/lib.rs create mode 100644 crates/pgt_parser_codegen/src/syntax_kind.rs diff --git a/Cargo.lock b/Cargo.lock index 41f807d1..27244be7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -334,6 +334,12 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "bindgen" version = "0.66.1" @@ -747,9 +753,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.3" +version = "1.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" +checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" dependencies = [ "shlex", ] @@ -1217,6 +1223,26 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "enum-iterator" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c280b9e6b3ae19e152d8e31cf47f18389781e119d4013a2a2bb0180e5facc635" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" +dependencies = [ + "proc-macro2", + 
"quote", + "syn 2.0.90", +] + [[package]] name = "enumflags2" version = "0.7.11" @@ -1363,6 +1389,12 @@ dependencies = [ "spin", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2113,6 +2145,40 @@ dependencies = [ "value-bag", ] +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.5", + "rustc_version", + "syn 2.0.90", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + [[package]] name = "lsp-types" version = "0.94.1" @@ -2160,6 +2226,28 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miette" +version = "7.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f98efec8807c63c752b5bd61f862c165c115b0a35685bdcfd9238c7aeb592b7" +dependencies = [ + "cfg-if", + "miette-derive", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "7.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db5b29714e950dbb20d5e6f74f9dcec4edbcc1067bb7f8ed198c097b8c1a818b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "mimalloc" version = "0.1.43" @@ -2710,6 +2798,13 @@ dependencies = [ "quote", ] +[[package]] +name = "pgt_lexer_new" +version = "0.0.0" +dependencies = [ + "insta", +] + [[package]] name = "pgt_lsp" version = "0.0.0" @@ -2748,6 +2843,29 @@ dependencies = [ "quote", ] +[[package]] +name = "pgt_parser" +version = "0.0.0" +dependencies = [ + "insta", + "pgt_lexer_new", + "pgt_parser_codegen", +] + +[[package]] +name = "pgt_parser_codegen" +version = "0.0.0" +dependencies = [ + "anyhow", + "convert_case", + "enum-iterator", + "proc-macro2", + "prost-reflect", + "protox", + "quote", + "ureq", +] + [[package]] name = "pgt_query_ext" version = "0.0.0" @@ -3194,6 +3312,18 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "prost-reflect" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37587d5a8a1b3dc9863403d084fc2254b91ab75a702207098837950767e2260b" +dependencies = [ + "logos", + "miette", + "prost", + "prost-types", +] + [[package]] name = "prost-types" version = "0.13.5" @@ -3239,6 +3369,33 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "protox" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "424c2bd294b69c49b949f3619362bc3c5d28298cd1163b6d1a62df37c16461aa" +dependencies = [ + "bytes", + "miette", + "prost", + "prost-reflect", + "prost-types", + "protox-parse", + "thiserror 2.0.6", +] + +[[package]] +name = "protox-parse" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57927f9dbeeffcce7192404deee6157a640cbb3fe8ac11eabbe571565949ab75" 
+dependencies = [ + "logos", + "miette", + "prost-types", + "thiserror 2.0.6", +] + [[package]] name = "pulldown-cmark" version = "0.12.2" @@ -3405,6 +3562,20 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rsa" version = "0.9.7" @@ -3458,6 +3629,15 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.37.28" @@ -3485,6 +3665,41 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "rustls" +version = "0.23.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.20" @@ -3539,6 +3754,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.215" @@ -4647,6 +4868,28 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "url", + "webpki-roots 0.26.11", +] + [[package]] name = "url" version = "2.5.4" @@ -4844,6 +5087,24 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + 
"webpki-roots 1.0.1", +] + +[[package]] +name = "webpki-roots" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8782dd5a41a24eed3a4f40b606249b3e236ca61adf1f25ea4d45c73de122b502" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.2" diff --git a/Cargo.toml b/Cargo.toml index fe00d7ca..fff06a7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ biome_rowan = "0.5.7" biome_string_case = "0.5.8" bpaf = { version = "0.9.15", features = ["derive"] } crossbeam = "0.8.4" +enum-iterator = "2.1.0" enumflags2 = "0.7.11" ignore = "0.4.23" indexmap = { version = "2.6.0", features = ["serde"] } @@ -43,6 +44,9 @@ slotmap = "1.0.7" smallvec = { version = "1.13.2", features = ["union", "const_new", "serde"] } strum = { version = "0.27.1", features = ["derive"] } # this will use tokio if available, otherwise async-std +convert_case = "0.6.0" +prost-reflect = "0.15.3" +protox = "0.8.0" sqlx = { version = "0.8.2", features = ["runtime-tokio", "runtime-async-std", "postgres", "json"] } syn = "1.0.109" termcolor = "1.4.1" @@ -69,8 +73,11 @@ pgt_flags = { path = "./crates/pgt_flags", version = "0.0.0" } pgt_fs = { path = "./crates/pgt_fs", version = "0.0.0" } pgt_lexer = { path = "./crates/pgt_lexer", version = "0.0.0" } pgt_lexer_codegen = { path = "./crates/pgt_lexer_codegen", version = "0.0.0" } +pgt_lexer_new = { path = "./crates/pgt_lexer_new", version = "0.0.0" } pgt_lsp = { path = "./crates/pgt_lsp", version = "0.0.0" } pgt_markup = { path = "./crates/pgt_markup", version = "0.0.0" } +pgt_parser = { path = "./crates/pgt_parser", version = "0.0.0" } +pgt_parser_codegen = { path = "./crates/pgt_parser_codegen", version = "0.0.0" } pgt_query_ext = { path = "./crates/pgt_query_ext", version = "0.0.0" } pgt_query_ext_codegen = { path = "./crates/pgt_query_ext_codegen", version = "0.0.0" } pgt_query_proto_parser = { path = "./crates/pgt_query_proto_parser", version = "0.0.0" } diff --git a/crates/pgt_lexer_new/Cargo.toml b/crates/pgt_lexer_new/Cargo.toml new file mode 100644 index 00000000..80ded7ec --- /dev/null +++ b/crates/pgt_lexer_new/Cargo.toml @@ -0,0 +1,19 @@ +[package] +authors.workspace = true +categories.workspace = true +description = "" +edition.workspace = true +homepage.workspace = true +keywords.workspace = true +license.workspace = true +name = "pgt_lexer_new" +repository.workspace = true +version = "0.0.0" + + +[dependencies] + +[dev-dependencies] +insta.workspace = true + +[lib] diff --git a/crates/pgt_lexer_new/README.md b/crates/pgt_lexer_new/README.md new file mode 100644 index 00000000..8fc21d34 --- /dev/null +++ b/crates/pgt_lexer_new/README.md @@ -0,0 +1 @@ +Heavily inspired by and copied from [squawk_lexer](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_lexer). Thanks for making all the hard work MIT-licensed! diff --git a/crates/pgt_lexer_new/src/cursor.rs b/crates/pgt_lexer_new/src/cursor.rs new file mode 100644 index 00000000..c70c5feb --- /dev/null +++ b/crates/pgt_lexer_new/src/cursor.rs @@ -0,0 +1,65 @@ +use std::str::Chars; + +/// Peekable iterator over a char sequence. +/// +/// Next characters can be peeked via `first` method, +/// and position can be shifted forward via `bump` method. +/// based on: +/// - +/// - +/// +pub(crate) struct Cursor<'a> { + /// Iterator over chars. Slightly faster than a &str. 
+ chars: Chars<'a>, + len_remaining: usize, +} + +pub(crate) const EOF_CHAR: char = '\0'; + +impl<'a> Cursor<'a> { + pub(crate) fn new(input: &'a str) -> Cursor<'a> { + Cursor { + len_remaining: input.len(), + chars: input.chars(), + } + } + + /// Peeks the next symbol from the input stream without consuming it. + /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, + /// it should be checked with `is_eof` method. + pub(crate) fn first(&self) -> char { + // `.next()` optimizes better than `.nth(0)` + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Checks if there is nothing more to consume. + pub(crate) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Returns amount of already consumed symbols. + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 + } + + /// Resets the number of bytes consumed to 0. + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); + } + + /// Moves to the next character. + pub(crate) fn bump(&mut self) -> Option { + let c = self.chars.next()?; + Some(c) + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } +} diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs new file mode 100644 index 00000000..d590eaab --- /dev/null +++ b/crates/pgt_lexer_new/src/lib.rs @@ -0,0 +1,805 @@ +mod cursor; +mod token; +use cursor::{Cursor, EOF_CHAR}; +pub use token::{Base, LiteralKind, Token, TokenKind}; + +// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346 +// ident_start [A-Za-z\200-\377_] +const fn is_ident_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{FF}') +} + +// ident_cont [A-Za-z\200-\377_0-9\$] +const fn is_ident_cont(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{FF}') +} + +// whitespace +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128 +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229 + +const fn is_space(c: char) -> bool { + matches!( + c, ' ' // space + ) +} + +const fn is_tab(c: char) -> bool { + matches!( + c, '\t' // tab + ) +} + +const fn is_newline(c: char) -> bool { + matches!( + c, '\n' // newline + ) +} + +const fn is_carriage_return(c: char) -> bool { + matches!( + c, '\r' // carriage return + ) +} + +const fn is_vertical_tab(c: char) -> bool { + matches!( + c, '\u{000B}' // vertical tab + ) +} + +const fn is_form_feed(c: char) -> bool { + matches!( + c, '\u{000C}' // form feed + ) +} + +impl Cursor<'_> { + // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339 + pub(crate) fn advance_token(&mut self) -> Token { + let Some(first_char) = self.bump() else { + return Token::new(TokenKind::Eof, 0); + }; + let token_kind = match first_char { + // Slash, comment or block comment. 
+            '/' => match self.first() {
+                '*' => self.block_comment(),
+                _ => TokenKind::Slash,
+            },
+            '-' => match self.first() {
+                '-' => self.line_comment(),
+                _ => TokenKind::Minus,
+            },
+
+            c if is_space(c) => {
+                self.eat_while(is_space);
+                TokenKind::Space
+            }
+
+            c if is_tab(c) => {
+                self.eat_while(is_tab);
+                TokenKind::Tab
+            }
+
+            c if is_newline(c) => {
+                self.eat_while(is_newline);
+                TokenKind::Newline
+            }
+
+            c if is_carriage_return(c) => {
+                self.eat_while(is_carriage_return);
+                TokenKind::CarriageReturn
+            }
+
+            c if is_vertical_tab(c) => {
+                self.eat_while(is_vertical_tab);
+                TokenKind::VerticalTab
+            }
+
+            c if is_form_feed(c) => {
+                self.eat_while(is_form_feed);
+                TokenKind::FormFeed
+            }
+
+            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
+            'u' | 'U' => match self.first() {
+                '&' => {
+                    self.bump();
+                    self.prefixed_string(
+                        |terminated| LiteralKind::UnicodeEscStr { terminated },
+                        true,
+                    )
+                }
+                _ => self.ident_or_unknown_prefix(),
+            },
+
+            // escaped strings
+            'e' | 'E' => {
+                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
+            }
+
+            // bit string
+            'b' | 'B' => {
+                self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
+            }
+
+            // hexadecimal byte string
+            'x' | 'X' => {
+                self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
+            }
+
+            // Identifier (this should be checked after the other variants that can
+            // start as identifiers).
+            c if is_ident_start(c) => self.ident(),
+
+            // Numeric literal.
+            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+            c @ '0'..='9' => {
+                let literal_kind = self.number(c);
+                TokenKind::Literal { kind: literal_kind }
+            }
+            '.' => match self.first() {
+                '0'..='9' => {
+                    let literal_kind = self.number('.');
+                    TokenKind::Literal { kind: literal_kind }
+                }
+                _ => TokenKind::Dot,
+            },
+            // One-symbol tokens.
+            ';' => TokenKind::Semi,
+            ',' => TokenKind::Comma,
+            '(' => TokenKind::OpenParen,
+            ')' => TokenKind::CloseParen,
+            '[' => TokenKind::OpenBracket,
+            ']' => TokenKind::CloseBracket,
+            '@' => TokenKind::At,
+            '#' => TokenKind::Pound,
+            '~' => TokenKind::Tilde,
+            '?' => TokenKind::Question,
+            ':' => TokenKind::Colon,
+            '$' => {
+                // Dollar quoted strings
+                if is_ident_start(self.first()) || self.first() == '$' {
+                    self.dollar_quoted_string()
+                } else {
+                    // Parameters
+                    while self.first().is_ascii_digit() {
+                        self.bump();
+                    }
+                    TokenKind::PositionalParam
+                }
+            }
+            '`' => TokenKind::Backtick,
+            '=' => TokenKind::Eq,
+            '!' => TokenKind::Bang,
+            '<' => TokenKind::Lt,
+            '>' => TokenKind::Gt,
+            '&' => TokenKind::And,
+            '|' => TokenKind::Or,
+            '+' => TokenKind::Plus,
+            '*' => TokenKind::Star,
+            '^' => TokenKind::Caret,
+            '%' => TokenKind::Percent,
+
+            // String literal
+            '\'' => {
+                let terminated = self.single_quoted_string();
+                let kind = LiteralKind::Str { terminated };
+                TokenKind::Literal { kind }
+            }
+
+            // Quoted identifiers
+            '"' => {
+                let terminated = self.double_quoted_string();
+                TokenKind::QuotedIdent { terminated }
+            }
+            _ => TokenKind::Unknown,
+        };
+        let res = Token::new(token_kind, self.pos_within_token());
+        self.reset_pos_within_token();
+        res
+    }
+    pub(crate) fn ident(&mut self) -> TokenKind {
+        self.eat_while(is_ident_cont);
+        TokenKind::Ident
+    }
+
+    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(is_ident_cont);
+        // Known prefixes must have been handled earlier.
So if + // we see a prefix here, it is definitely an unknown prefix. + match self.first() { + '#' | '"' | '\'' => TokenKind::UnknownPrefix, + _ => TokenKind::Ident, + } + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227 + // comment ("--"{non_newline}*) + pub(crate) fn line_comment(&mut self) -> TokenKind { + self.bump(); + + self.eat_while(|c| c != '\n'); + TokenKind::LineComment + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344 + pub(crate) fn block_comment(&mut self) -> TokenKind { + self.bump(); + + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.first() == '*' => { + self.bump(); + depth += 1; + } + '*' if self.first() == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; + } + } + _ => (), + } + } + + TokenKind::BlockComment { + terminated: depth == 0, + } + } + + fn prefixed_string( + &mut self, + mk_kind: fn(bool) -> LiteralKind, + allows_double: bool, + ) -> TokenKind { + match self.first() { + '\'' => { + self.bump(); + let terminated = self.single_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + '"' if allows_double => { + self.bump(); + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => self.ident_or_unknown_prefix(), + } + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base. + match self.first() { + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403 + 'b' | 'B' => { + base = Base::Binary; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402 + 'o' | 'O' => { + base = Base::Octal; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401 + 'x' | 'X' => { + base = Base::Hexadecimal; + self.bump(); + if !self.eat_hexadecimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // Not a base prefix; consume additional digits. + '0'..='9' | '_' => { + self.eat_decimal_digits(); + } + + // Also not a base prefix; nothing more to do here. + '.' | 'e' | 'E' => {} + + // Just a 0. + _ => { + return LiteralKind::Int { + base, + empty_int: false, + }; + } + } + } else { + // No base prefix, parse number in the usual way. + self.eat_decimal_digits(); + }; + + match self.first() { + '.' 
=> {
+                // might have stuff after the ., and if it does, it needs to start
+                // with a number
+                self.bump();
+                let mut empty_exponent = false;
+                if self.first().is_ascii_digit() {
+                    self.eat_decimal_digits();
+                    match self.first() {
+                        'e' | 'E' => {
+                            self.bump();
+                            empty_exponent = !self.eat_float_exponent();
+                        }
+                        _ => (),
+                    }
+                } else {
+                    match self.first() {
+                        'e' | 'E' => {
+                            self.bump();
+                            empty_exponent = !self.eat_float_exponent();
+                        }
+                        _ => (),
+                    }
+                }
+                LiteralKind::Float {
+                    base,
+                    empty_exponent,
+                }
+            }
+            'e' | 'E' => {
+                self.bump();
+                let empty_exponent = !self.eat_float_exponent();
+                LiteralKind::Float {
+                    base,
+                    empty_exponent,
+                }
+            }
+            _ => LiteralKind::Int {
+                base,
+                empty_int: false,
+            },
+        }
+    }
+
+    fn single_quoted_string(&mut self) -> bool {
+        // Parse until either quotes are terminated or error is detected.
+        loop {
+            match self.first() {
+                // Quotes might be terminated.
+                '\'' => {
+                    self.bump();
+
+                    match self.first() {
+                        // encountered an escaped quote ''
+                        '\'' => {
+                            self.bump();
+                        }
+                        // encountered terminating quote
+                        _ => return true,
+                    }
+                }
+                // End of file, stop parsing.
+                EOF_CHAR if self.is_eof() => break,
+                // Skip the character.
+                _ => {
+                    self.bump();
+                }
+            }
+        }
+        // String was not terminated.
+        false
+    }
+
+    /// Eats double-quoted string and returns true
+    /// if string is terminated.
+    fn double_quoted_string(&mut self) -> bool {
+        while let Some(c) = self.bump() {
+            match c {
+                '"' if self.first() == '"' => {
+                    // Bump again to skip escaped character.
+                    self.bump();
+                }
+                '"' => {
+                    return true;
+                }
+                _ => (),
+            }
+        }
+        // End of file reached.
+        false
+    }
+
+    // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
+    fn dollar_quoted_string(&mut self) -> TokenKind {
+        // Get the start sequence of the dollar quote, i.e., 'foo' in
+        // $foo$hello$foo$
+        let mut start = vec![];
+        while let Some(c) = self.bump() {
+            match c {
+                '$' => {
+                    self.bump();
+                    break;
+                }
+                _ => {
+                    start.push(c);
+                }
+            }
+        }
+
+        // we have a dollar quoted string delimited with `$$`
+        if start.is_empty() {
+            loop {
+                self.eat_while(|c| c != '$');
+                if self.is_eof() {
+                    return TokenKind::Literal {
+                        kind: LiteralKind::DollarQuotedString { terminated: false },
+                    };
+                }
+                // eat $
+                self.bump();
+                if self.first() == '$' {
+                    self.bump();
+                    return TokenKind::Literal {
+                        kind: LiteralKind::DollarQuotedString { terminated: true },
+                    };
+                }
+            }
+        } else {
+            loop {
+                self.eat_while(|c| c != start[0]);
+                if self.is_eof() {
+                    return TokenKind::Literal {
+                        kind: LiteralKind::DollarQuotedString { terminated: false },
+                    };
+                }
+
+                // might be the start of our start/end sequence
+                let mut match_count = 0;
+                for start_char in &start {
+                    if self.first() == *start_char {
+                        self.bump();
+                        match_count += 1;
+                    } else {
+                        self.bump();
+                        break;
+                    }
+                }
+
+                // closing '$'
+                let terminated = match_count == start.len();
+                if self.first() == '$' && terminated {
+                    self.bump();
+                    return TokenKind::Literal {
+                        kind: LiteralKind::DollarQuotedString { terminated },
+                    };
+                }
+            }
+        }
+    }
+
+    fn eat_decimal_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.first() {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    fn eat_hexadecimal_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.first() {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' | 'a'..='f' | 'A'..='F' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+ } + + /// Eats the float exponent. Returns true if at least one digit was met, + /// and returns false otherwise. + fn eat_float_exponent(&mut self) -> bool { + if self.first() == '-' || self.first() == '+' { + self.bump(); + } + self.eat_decimal_digits() + } +} + +/// Creates an iterator that produces tokens from the input string. +pub fn tokenize(input: &str) -> impl Iterator + '_ { + let mut cursor = Cursor::new(input); + std::iter::from_fn(move || { + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { + Some(token) + } else { + None + } + }) +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use super::*; + use insta::assert_debug_snapshot; + + struct TokenDebug<'a> { + content: &'a str, + token: Token, + } + impl fmt::Debug for TokenDebug<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} @ {:?}", self.content, self.token.kind) + } + } + + impl<'a> TokenDebug<'a> { + fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> { + TokenDebug { + token, + content: &input[start as usize..(start + token.len) as usize], + } + } + } + + fn lex(input: &str) -> Vec { + let mut tokens = vec![]; + let mut start = 0; + + for token in tokenize(input) { + let length = token.len; + tokens.push(TokenDebug::new(token, input, start)); + start += length; + } + tokens + } + #[test] + fn lex_statement() { + let result = lex("select 1;"); + assert_debug_snapshot!(result); + } + + #[test] + fn block_comment() { + let result = lex(r#" +/* + * foo + * bar +*/"#); + assert_debug_snapshot!(result); + } + + #[test] + fn block_comment_unterminated() { + let result = lex(r#" +/* + * foo + * bar + /* +*/"#); + assert_debug_snapshot!(result); + } + + #[test] + fn line_comment() { + let result = lex(r#" +-- foooooooooooo bar buzz +"#); + assert_debug_snapshot!(result); + } + + #[test] + fn line_comment_whitespace() { + assert_debug_snapshot!(lex(r#" +select 'Hello' -- This is a comment +' World';"#)) + } + + #[test] + fn dollar_quoting() { + assert_debug_snapshot!(lex(r#" +$$Dianne's horse$$ +$SomeTag$Dianne's horse$SomeTag$ + +-- with dollar inside and matching tags +$foo$hello$world$bar$ +"#)) + } + + #[test] + fn dollar_strings_part2() { + assert_debug_snapshot!(lex(r#" +DO $doblock$ +end +$doblock$;"#)) + } + + #[test] + fn dollar_quote_mismatch_tags_simple() { + assert_debug_snapshot!(lex(r#" +-- dollar quoting with mismatched tags +$foo$hello world$bar$ +"#)); + } + + #[test] + fn dollar_quote_mismatch_tags_complex() { + assert_debug_snapshot!(lex(r#" +-- with dollar inside but mismatched tags +$foo$hello$world$bar$ +"#)); + } + + #[test] + fn numeric() { + assert_debug_snapshot!(lex(r#" +42 +3.5 +4. 
+.001 +.123e10 +5e2 +1.925e-3 +1e-10 +1e+10 +1e10 +4664.E+5 +"#)) + } + + #[test] + fn numeric_non_decimal() { + assert_debug_snapshot!(lex(r#" +0b100101 +0B10011001 +0o273 +0O755 +0x42f +0XFFFF +"#)) + } + + #[test] + fn numeric_with_seperators() { + assert_debug_snapshot!(lex(r#" +1_500_000_000 +0b10001000_00000000 +0o_1_755 +0xFFFF_FFFF +1.618_034 +"#)) + } + + #[test] + fn select_with_period() { + assert_debug_snapshot!(lex(r#" +select public.users; +"#)) + } + + #[test] + fn bitstring() { + assert_debug_snapshot!(lex(r#" +B'1001' +b'1001' +X'1FF' +x'1FF' +"#)) + } + + #[test] + fn string() { + assert_debug_snapshot!(lex(r#" +'Dianne''s horse' + +select 'foo '' +bar'; + +select 'foooo' + 'bar'; + + +'foo \\ \n \tbar' + +'forgot to close the string +"#)) + } + + #[test] + fn params() { + assert_debug_snapshot!(lex(r#" +select $1 + $2; + +select $1123123123123; + +select $; +"#)) + } + + #[test] + fn string_with_escapes() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE + + assert_debug_snapshot!(lex(r#" +E'foo' + +e'bar' + +e'\b\f\n\r\t' + +e'\0\11\777' + +e'\x0\x11\xFF' + +e'\uAAAA \UFFFFFFFF' + +"#)) + } + + #[test] + fn string_unicode_escape() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + + assert_debug_snapshot!(lex(r#" +U&"d\0061t\+000061" + +U&"\0441\043B\043E\043D" + +u&'\0441\043B' + +U&"d!0061t!+000061" UESCAPE '!' +"#)) + } + + #[test] + fn quoted_ident() { + assert_debug_snapshot!(lex(r#" +"hello &1 -world"; + + +"hello-world +"#)) + } + + #[test] + fn quoted_ident_with_escape_quote() { + assert_debug_snapshot!(lex(r#" +"foo "" bar" +"#)) + } +} diff --git a/crates/pgt_lexer_new/src/token.rs b/crates/pgt_lexer_new/src/token.rs new file mode 100644 index 00000000..d286f2c3 --- /dev/null +++ b/crates/pgt_lexer_new/src/token.rs @@ -0,0 +1,166 @@ +// based on: https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/lib.rs#L58 +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum TokenKind { + /// Used when there's an error of some sort while lexing. + Unknown, + /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid + /// suffix, but may be present here on string and float literals. Users of + /// this type will need to check for and reject that case. + /// + /// See [`LiteralKind`] for more details. + Literal { + kind: LiteralKind, + }, + /// Whitespace characters. + Space, + Tab, + Newline, + CarriageReturn, + VerticalTab, + FormFeed, + /// Identifier + /// + /// case-sensitive + Ident, + /// `;` + Semi, + /// End of file + Eof, + /// `/` + Slash, + /// `-- foo` + LineComment, + /// ``` + /// /* + /// foo + /// */ + /// ``` + BlockComment { + terminated: bool, + }, + /// `-` + Minus, + /// `:` + Colon, + /// `.` + Dot, + /// `=` + Eq, + /// `>` + Gt, + /// `&` + And, + /// `<` + Lt, + /// `!` + Bang, + /// `+` + Plus, + /// `~` + Tilde, + /// `#` + Pound, + /// `?` + Question, + /// `|` + Or, + /// `%` + Percent, + /// `^` + Caret, + /// `*` + Star, + /// `` ` `` + Backtick, + /// `@` + At, + /// `]` + CloseBracket, + /// `[` + OpenBracket, + /// `)` + CloseParen, + /// `(` + OpenParen, + /// `,` + Comma, + /// Error case that we need to report later on. 
+    UnknownPrefix,
+    /// Positional Parameter, e.g., `$1`
+    ///
+    /// see:
+    PositionalParam,
+    /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;`
+    ///
+    /// These are case-sensitive, unlike [`TokenKind::Ident`]
+    ///
+    /// see:
+    QuotedIdent {
+        terminated: bool,
+    },
+}
+
+/// Parsed token.
+/// It doesn't contain information about data that has been parsed,
+/// only the type of the token and its size.
+#[derive(Debug, Clone, Copy)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub len: u32,
+}
+
+impl Token {
+    pub(crate) fn new(kind: TokenKind, len: u32) -> Token {
+        Token { kind, len }
+    }
+}
+
+/// Base of numeric literal encoding according to its prefix.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Base {
+    /// Literal starts with "0b".
+    Binary = 2,
+    /// Literal starts with "0o".
+    Octal = 8,
+    /// Literal doesn't contain a prefix.
+    Decimal = 10,
+    /// Literal starts with "0x".
+    Hexadecimal = 16,
+}
+
+// Enum representing the literal types supported by the lexer.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+    /// Integer Numeric, e.g., `42`
+    ///
+    /// see:
+    Int { base: Base, empty_int: bool },
+    /// Float Numeric, e.g., `1.925e-3`
+    ///
+    /// see:
+    Float { base: Base, empty_exponent: bool },
+    /// String, e.g., `'foo'`
+    ///
+    /// see:
+    Str { terminated: bool },
+    /// Hexadecimal Bit String, e.g., `X'1FF'`
+    ///
+    /// see:
+    ByteStr { terminated: bool },
+    /// Bit String, e.g., `B'1001'`
+    ///
+    /// see:
+    BitStr { terminated: bool },
+    /// Dollar Quoted String, e.g., `$$Dianne's horse$$`
+    ///
+    /// see:
+    DollarQuotedString { terminated: bool },
+    /// Unicode Escape String, e.g., `U&'d\0061t\+000061'`
+    ///
+    /// see:
+    UnicodeEscStr { terminated: bool },
+    /// Escape String, e.g., `E'foo'`
+    ///
+    /// see:
+    EscStr { terminated: bool },
+}
diff --git a/crates/pgt_parser/Cargo.toml b/crates/pgt_parser/Cargo.toml
new file mode 100644
index 00000000..75cd0f89
--- /dev/null
+++ b/crates/pgt_parser/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+authors.workspace = true
+categories.workspace = true
+description = ""
+edition.workspace = true
+homepage.workspace = true
+keywords.workspace = true
+license.workspace = true
+name = "pgt_parser"
+repository.workspace = true
+version = "0.0.0"
+
+
+[dependencies]
+pgt_lexer_new.workspace = true
+pgt_parser_codegen.workspace = true
+
+[dev-dependencies]
+insta.workspace = true
+
+[lib]
diff --git a/crates/pgt_parser/README.md b/crates/pgt_parser/README.md
new file mode 100644
index 00000000..57bdaa34
--- /dev/null
+++ b/crates/pgt_parser/README.md
@@ -0,0 +1 @@
+Heavily inspired by and copied from [squawk_parser](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_parser). Thanks for making all the hard work MIT-licensed!
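(Editorial aside, not part of the patch: the `terminated` flags threaded through `TokenKind` and `LiteralKind` above are the hooks a downstream consumer uses to diagnose unterminated constructs. A minimal sketch of such a consumer follows, assuming only the public API this patch adds, namely `tokenize`, `Token`, `TokenKind`, and `LiteralKind` re-exported from `pgt_lexer_new`; the helper name `unterminated_spans` is illustrative, not from the patch.)

```rust
use pgt_lexer_new::{tokenize, LiteralKind, TokenKind};

// Sketch: collect the byte ranges of tokens whose `terminated` flag is false.
// Illustrative helper, not part of the patch.
fn unterminated_spans(input: &str) -> Vec<std::ops::Range<usize>> {
    let mut out = Vec::new();
    let mut start = 0usize;
    for token in tokenize(input) {
        let end = start + token.len as usize;
        let unterminated = match token.kind {
            TokenKind::BlockComment { terminated } => !terminated,
            TokenKind::QuotedIdent { terminated } => !terminated,
            TokenKind::Literal { kind } => match kind {
                LiteralKind::Str { terminated }
                | LiteralKind::ByteStr { terminated }
                | LiteralKind::BitStr { terminated }
                | LiteralKind::DollarQuotedString { terminated }
                | LiteralKind::UnicodeEscStr { terminated }
                | LiteralKind::EscStr { terminated } => !terminated,
                _ => false, // Int and Float carry no termination flag
            },
            _ => false,
        };
        if unterminated {
            out.push(start..end);
        }
        start = end;
    }
    out
}
```

Running it over `select 'oops` would yield the byte range of the unterminated string literal, which is exactly the information `LexedStr` later turns into lex errors.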
diff --git a/crates/pgt_parser/src/codegen/mod.rs b/crates/pgt_parser/src/codegen/mod.rs new file mode 100644 index 00000000..c4e67bc5 --- /dev/null +++ b/crates/pgt_parser/src/codegen/mod.rs @@ -0,0 +1 @@ +pub mod syntax_kind; diff --git a/crates/pgt_parser/src/codegen/syntax_kind.rs b/crates/pgt_parser/src/codegen/syntax_kind.rs new file mode 100644 index 00000000..12c5718a --- /dev/null +++ b/crates/pgt_parser/src/codegen/syntax_kind.rs @@ -0,0 +1 @@ +pgt_parser_codegen::syntax_kind_codegen!(); diff --git a/crates/pgt_parser/src/lexed_str.rs b/crates/pgt_parser/src/lexed_str.rs new file mode 100644 index 00000000..c20f555b --- /dev/null +++ b/crates/pgt_parser/src/lexed_str.rs @@ -0,0 +1,265 @@ +// based on https://github.com/rust-lang/rust-analyzer/blob/d8887c0758bbd2d5f752d5bd405d4491e90e7ed6/crates/parser/src/lexed_str.rs + +use std::ops; + +use pgt_lexer_new::tokenize; + +use crate::SyntaxKind; + +pub struct LexedStr<'a> { + text: &'a str, + kind: Vec, + start: Vec, + error: Vec, +} + +struct LexError { + msg: String, + token: u32, +} + +impl<'a> LexedStr<'a> { + pub fn new(text: &'a str) -> LexedStr<'a> { + let mut conv = Converter::new(text); + + for token in tokenize(&text[conv.offset..]) { + let token_text = &text[conv.offset..][..token.len as usize]; + + conv.extend_token(&token.kind, token_text); + } + + conv.finalize_with_eof() + } + + pub(crate) fn len(&self) -> usize { + self.kind.len() - 1 + } + + pub(crate) fn kind(&self, i: usize) -> SyntaxKind { + assert!(i < self.len()); + self.kind[i] + } + + pub(crate) fn text(&self, i: usize) -> &str { + self.range_text(i..i + 1) + } + + pub(crate) fn range_text(&self, r: ops::Range) -> &str { + assert!(r.start < r.end && r.end <= self.len()); + let lo = self.start[r.start] as usize; + let hi = self.start[r.end] as usize; + &self.text[lo..hi] + } + + // Naming is hard. + pub fn text_range(&self, i: usize) -> ops::Range { + assert!(i < self.len()); + let lo = self.start[i] as usize; + let hi = self.start[i + 1] as usize; + lo..hi + } + pub fn text_start(&self, i: usize) -> usize { + assert!(i <= self.len()); + self.start[i] as usize + } + + pub fn errors(&self) -> impl Iterator + '_ { + self.error + .iter() + .map(|it| (it.token as usize, it.msg.as_str())) + } + + fn push(&mut self, kind: SyntaxKind, offset: usize) { + self.kind.push(kind); + self.start.push(offset as u32); + } +} + +struct Converter<'a> { + res: LexedStr<'a>, + offset: usize, +} + +impl<'a> Converter<'a> { + fn new(text: &'a str) -> Self { + Self { + res: LexedStr { + text, + kind: Vec::new(), + start: Vec::new(), + error: Vec::new(), + }, + offset: 0, + } + } + + fn finalize_with_eof(mut self) -> LexedStr<'a> { + self.res.push(SyntaxKind::EOF, self.offset); + self.res + } + + fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) { + self.res.push(kind, self.offset); + self.offset += len; + + if let Some(err) = err { + let token = self.res.len() as u32; + let msg = err.to_owned(); + self.res.error.push(LexError { msg, token }); + } + } + + fn extend_token(&mut self, kind: &pgt_lexer_new::TokenKind, token_text: &str) { + // A note on an intended tradeoff: + // We drop some useful information here (see patterns with double dots `..`) + // Storing that info in `SyntaxKind` is not possible due to its layout requirements of + // being `u16` that come from `rowan::SyntaxKind`. 
+ let mut err = ""; + + let syntax_kind = { + match kind { + pgt_lexer_new::TokenKind::LineComment => SyntaxKind::COMMENT, + pgt_lexer_new::TokenKind::BlockComment { terminated } => { + if !terminated { + err = "Missing trailing `*/` symbols to terminate the block comment"; + } + SyntaxKind::COMMENT + } + + // whitespace + pgt_lexer_new::TokenKind::Space => SyntaxKind::SPACE, + pgt_lexer_new::TokenKind::Tab => SyntaxKind::TAB, + pgt_lexer_new::TokenKind::Newline => SyntaxKind::NEWLINE, + pgt_lexer_new::TokenKind::CarriageReturn => SyntaxKind::CARRIAGE_RETURN, + pgt_lexer_new::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB, + pgt_lexer_new::TokenKind::FormFeed => SyntaxKind::FORM_FEED, + pgt_lexer_new::TokenKind::Ident => { + // TODO: check for max identifier length + // + // see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS + // The system uses no more than NAMEDATALEN-1 bytes of an + // identifier; longer names can be written in commands, but + // they will be truncated. By default, NAMEDATALEN is 64 so + // the maximum identifier length is 63 bytes. If this limit + // is problematic, it can be raised by changing the + // NAMEDATALEN constant in src/include/pg_config_manual.h. + // see: https://github.com/postgres/postgres/blob/e032e4c7ddd0e1f7865b246ec18944365d4f8614/src/include/pg_config_manual.h#L29 + SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT) + } + pgt_lexer_new::TokenKind::Literal { kind, .. } => { + self.extend_literal(token_text.len(), kind); + return; + } + pgt_lexer_new::TokenKind::Semi => SyntaxKind::SEMICOLON, + pgt_lexer_new::TokenKind::Comma => SyntaxKind::COMMA, + pgt_lexer_new::TokenKind::Dot => SyntaxKind::DOT, + pgt_lexer_new::TokenKind::OpenParen => SyntaxKind::L_PAREN, + pgt_lexer_new::TokenKind::CloseParen => SyntaxKind::R_PAREN, + pgt_lexer_new::TokenKind::OpenBracket => SyntaxKind::L_BRACK, + pgt_lexer_new::TokenKind::CloseBracket => SyntaxKind::R_BRACK, + pgt_lexer_new::TokenKind::At => SyntaxKind::AT, + pgt_lexer_new::TokenKind::Pound => SyntaxKind::POUND, + pgt_lexer_new::TokenKind::Tilde => SyntaxKind::TILDE, + pgt_lexer_new::TokenKind::Question => SyntaxKind::QUESTION, + pgt_lexer_new::TokenKind::Colon => SyntaxKind::COLON, + pgt_lexer_new::TokenKind::Eq => SyntaxKind::EQ, + pgt_lexer_new::TokenKind::Bang => SyntaxKind::BANG, + pgt_lexer_new::TokenKind::Lt => SyntaxKind::L_ANGLE, + pgt_lexer_new::TokenKind::Gt => SyntaxKind::R_ANGLE, + pgt_lexer_new::TokenKind::Minus => SyntaxKind::MINUS, + pgt_lexer_new::TokenKind::And => SyntaxKind::AMP, + pgt_lexer_new::TokenKind::Or => SyntaxKind::PIPE, + pgt_lexer_new::TokenKind::Plus => SyntaxKind::PLUS, + pgt_lexer_new::TokenKind::Star => SyntaxKind::STAR, + pgt_lexer_new::TokenKind::Slash => SyntaxKind::SLASH, + pgt_lexer_new::TokenKind::Caret => SyntaxKind::CARET, + pgt_lexer_new::TokenKind::Percent => SyntaxKind::PERCENT, + pgt_lexer_new::TokenKind::Unknown => SyntaxKind::ERROR, + pgt_lexer_new::TokenKind::UnknownPrefix => { + err = "unknown literal prefix"; + SyntaxKind::IDENT + } + pgt_lexer_new::TokenKind::Eof => SyntaxKind::EOF, + pgt_lexer_new::TokenKind::Backtick => SyntaxKind::BACKTICK, + pgt_lexer_new::TokenKind::PositionalParam => SyntaxKind::POSITIONAL_PARAM, + pgt_lexer_new::TokenKind::QuotedIdent { terminated } => { + if !terminated { + err = "Missing trailing \" to terminate the quoted identifier" + } + SyntaxKind::IDENT + } + } + }; + + let err = if err.is_empty() { None } else { Some(err) }; + self.push(syntax_kind, token_text.len(), err); + 
}
+
+    fn extend_literal(&mut self, len: usize, kind: &pgt_lexer_new::LiteralKind) {
+        let mut err = "";
+
+        let syntax_kind = match *kind {
+            pgt_lexer_new::LiteralKind::Int { empty_int, base: _ } => {
+                if empty_int {
+                    err = "Missing digits after the integer base prefix";
+                }
+                SyntaxKind::INT_NUMBER
+            }
+            pgt_lexer_new::LiteralKind::Float {
+                empty_exponent,
+                base: _,
+            } => {
+                if empty_exponent {
+                    err = "Missing digits after the exponent symbol";
+                }
+                SyntaxKind::FLOAT_NUMBER
+            }
+            pgt_lexer_new::LiteralKind::Str { terminated } => {
+                if !terminated {
+                    err = "Missing trailing `'` symbol to terminate the string literal";
+                }
+                // TODO: rust-analyzer checks for un-escaped strings, we should too
+                SyntaxKind::STRING
+            }
+            pgt_lexer_new::LiteralKind::ByteStr { terminated } => {
+                if !terminated {
+                    err = "Missing trailing `'` symbol to terminate the hex bit string literal";
+                }
+                // TODO: rust-analyzer checks for un-escaped strings, we should too
+                SyntaxKind::BYTE_STRING
+            }
+            pgt_lexer_new::LiteralKind::BitStr { terminated } => {
+                if !terminated {
+                    err = "Missing trailing `'` symbol to terminate the bit string literal";
+                }
+                // TODO: rust-analyzer checks for un-escaped strings, we should too
+                SyntaxKind::BIT_STRING
+            }
+            pgt_lexer_new::LiteralKind::DollarQuotedString { terminated } => {
+                if !terminated {
+                    // TODO: we could be fancier and say the ending string we're looking for
+                    err = "Unterminated dollar quoted string literal";
+                }
+                // TODO: rust-analyzer checks for un-escaped strings, we should too
+                SyntaxKind::DOLLAR_QUOTED_STRING
+            }
+            pgt_lexer_new::LiteralKind::UnicodeEscStr { terminated } => {
+                if !terminated {
+                    err = "Missing trailing `'` symbol to terminate the unicode escape string literal";
+                }
+                // TODO: rust-analyzer checks for un-escaped strings, we should too
+                SyntaxKind::BYTE_STRING
+            }
+            pgt_lexer_new::LiteralKind::EscStr { terminated } => {
+                if !terminated {
+                    err = "Missing trailing `'` symbol to terminate the escape string literal";
+                }
+                // TODO: rust-analyzer checks for un-escaped strings, we should too
+                SyntaxKind::ESC_STRING
+            }
+        };
+
+        let err = if err.is_empty() { None } else { Some(err) };
+        self.push(syntax_kind, len, err);
+    }
+}
diff --git a/crates/pgt_parser/src/lib.rs b/crates/pgt_parser/src/lib.rs
new file mode 100644
index 00000000..d1c34c1b
--- /dev/null
+++ b/crates/pgt_parser/src/lib.rs
@@ -0,0 +1,4 @@
+mod codegen;
+mod lexed_str;
+
+pub use crate::codegen::syntax_kind::SyntaxKind;
diff --git a/crates/pgt_parser_codegen/Cargo.toml b/crates/pgt_parser_codegen/Cargo.toml
new file mode 100644
index 00000000..157f342b
--- /dev/null
+++ b/crates/pgt_parser_codegen/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+authors.workspace = true
+categories.workspace = true
+description = ""
+edition.workspace = true
+homepage.workspace = true
+keywords.workspace = true
+license.workspace = true
+name = "pgt_parser_codegen"
+repository.workspace = true
+version = "0.0.0"
+
+[dependencies]
+anyhow = { workspace = true }
+convert_case = { workspace = true }
+enum-iterator = { workspace = true }
+proc-macro2.workspace = true
+prost-reflect = { workspace = true }
+protox = { workspace = true }
+quote.workspace = true
+
+[build-dependencies]
+ureq = "2.9"
+
+[lib]
+proc-macro = true
diff --git a/crates/pgt_parser_codegen/README.md b/crates/pgt_parser_codegen/README.md
new file mode 100644
index 00000000..57bdaa34
--- /dev/null
+++ b/crates/pgt_parser_codegen/README.md
@@ -0,0 +1 @@
+Heavily inspired by and copied from
[squawk_parser](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_parser). Thanks for making all the hard work MIT-licensed! diff --git a/crates/pgt_parser_codegen/build.rs b/crates/pgt_parser_codegen/build.rs new file mode 100644 index 00000000..3cd71002 --- /dev/null +++ b/crates/pgt_parser_codegen/build.rs @@ -0,0 +1,51 @@ +use std::env; +use std::fs; +use std::io::Write; +use std::path::PathBuf; + +// TODO make this selectable via feature flags +static LIBPG_QUERY_TAG: &str = "17-6.1.0"; + +/// Downloads the `kwlist.h` file from the specified version of `libpg_query` +fn main() -> Result<(), Box> { + let version = LIBPG_QUERY_TAG.to_string(); + + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + let vendor_dir = out_dir.join("vendor"); + let libpg_query_dir = vendor_dir.join("libpg_query").join(&version); + let kwlist_path = libpg_query_dir.join("kwlist.h"); + let stamp_file = libpg_query_dir.join(".stamp"); + + if !stamp_file.exists() { + println!( + "cargo:warning=Downloading kwlist.h for libpg_query {}", + version + ); + + fs::create_dir_all(&libpg_query_dir)?; + + let proto_url = format!( + "https://raw.githubusercontent.com/pganalyze/libpg_query/{}/src/postgres/include/parser/kwlist.h", + version + ); + + let response = ureq::get(&proto_url).call()?; + let content = response.into_string()?; + + let mut file = fs::File::create(&kwlist_path)?; + file.write_all(content.as_bytes())?; + + fs::File::create(&stamp_file)?; + + println!("cargo:warning=Successfully downloaded kwlist.h"); + } + + println!( + "cargo:rustc-env=PG_QUERY_KWLIST_PATH={}", + kwlist_path.display() + ); + + println!("cargo:rerun-if-changed={}", stamp_file.display()); + + Ok(()) +} diff --git a/crates/pgt_parser_codegen/src/keywords.rs b/crates/pgt_parser_codegen/src/keywords.rs new file mode 100644 index 00000000..5d865974 --- /dev/null +++ b/crates/pgt_parser_codegen/src/keywords.rs @@ -0,0 +1,203 @@ +// from https://github.com/sbdchd/squawk/blob/ac9f90c3b2be8d2c46fd5454eb48975afd268dbe/crates/xtask/src/keywords.rs +use anyhow::{Context, Ok, Result}; +use enum_iterator::{Sequence, all}; +use std::{ + collections::{HashMap, HashSet}, + path, +}; + +struct KeywordMeta { + pub(crate) category: KeywordCategory, + pub(crate) label: KeywordLabel, +} + +enum KeywordLabel { + As, + Bare, +} + +/// related: +/// - [postgres/src/backend/utils/adt/misc.c](https://github.com/postgres/postgres/blob/08691ea958c2646b6aadefff878539eb0b860bb0/src/backend/utils/adt/misc.c#L452-L467/) +/// - [postgres docs: sql keywords appendix](https://www.postgresql.org/docs/17/sql-keywords-appendix.html) +/// +/// The header file isn't enough though because `json_scalar` can be a function +/// name, but `between` cannot be +/// +/// The Postgres parser special cases certain calls like `json_scalar`: +/// +/// +/// | Category | Column | Table | Function | Type | +/// |--------------|--------|-------|----------|------| +/// | Unreserved | Y | Y | Y | Y | +/// | Reserved | N | N | N | N | +/// | ColName | Y | Y | N | Y | +/// | TypeFuncName | N | N | Y | Y | +/// +#[derive(Clone, Copy)] +enum KeywordCategory { + Unreserved, + Reserved, + ColName, + TypeFuncName, +} + +#[derive(Sequence, PartialEq)] +enum KWType { + ColumnTable, + Type, +} + +impl std::fmt::Display for KWType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + KWType::ColumnTable => "COLUMN_OR_TABLE_KEYWORDS", + KWType::Type => "TYPE_KEYWORDS", + }) + } +} + +fn keyword_allowed(cat: 
KeywordCategory, kw_type: KWType) -> bool {
+    match cat {
+        KeywordCategory::Unreserved => match kw_type {
+            KWType::ColumnTable => true,
+            KWType::Type => true,
+        },
+        KeywordCategory::Reserved => match kw_type {
+            KWType::ColumnTable => false,
+            KWType::Type => false,
+        },
+        KeywordCategory::ColName => match kw_type {
+            KWType::ColumnTable => true,
+            KWType::Type => true,
+        },
+        KeywordCategory::TypeFuncName => match kw_type {
+            KWType::ColumnTable => false,
+            KWType::Type => true,
+        },
+    }
+}
+
+fn parse_header() -> Result<HashMap<String, KeywordMeta>> {
+    // use the environment variable set by the build script to locate the kwlist.h file
+    let kwlist_file = path::PathBuf::from(env!("PG_QUERY_KWLIST_PATH"));
+    let data = std::fs::read_to_string(kwlist_file).context("Failed to read kwlist.h")?;
+
+    let mut keywords = HashMap::new();
+
+    for line in data.lines() {
+        if line.starts_with("PG_KEYWORD") {
+            let line = line
+                .split(&['(', ')'])
+                .nth(1)
+                .context("Invalid kwlist.h structure")?;
+
+            let row_items: Vec<&str> = line.split(',').collect();
+
+            match row_items[..] {
+                [name, _value, category, is_bare_label] => {
+                    let label = match is_bare_label.trim() {
+                        "AS_LABEL" => KeywordLabel::As,
+                        "BARE_LABEL" => KeywordLabel::Bare,
+                        unexpected => anyhow::bail!("Unexpected label: {}", unexpected),
+                    };
+
+                    let category = match category.trim() {
+                        "UNRESERVED_KEYWORD" => KeywordCategory::Unreserved,
+                        "RESERVED_KEYWORD" => KeywordCategory::Reserved,
+                        "COL_NAME_KEYWORD" => KeywordCategory::ColName,
+                        "TYPE_FUNC_NAME_KEYWORD" => KeywordCategory::TypeFuncName,
+                        unexpected => anyhow::bail!("Unexpected category: {}", unexpected),
+                    };
+
+                    let meta = KeywordMeta { category, label };
+                    let name = name.trim().replace('\"', "");
+                    keywords.insert(name, meta);
+                }
+                _ => anyhow::bail!("Problem reading kwlist.h row"),
+            }
+        }
+    }
+
+    Ok(keywords)
+}
+
+pub(crate) struct KeywordKinds {
+    pub(crate) all_keywords: Vec<String>,
+    pub(crate) bare_label_keywords: Vec<String>,
+    pub(crate) unreserved_keywords: Vec<String>,
+    pub(crate) reserved_keywords: Vec<String>,
+    pub(crate) col_table_keywords: Vec<String>,
+    pub(crate) type_keywords: Vec<String>,
+}
+
+pub(crate) fn keyword_kinds() -> Result<KeywordKinds> {
+    let keywords = parse_header()?;
+    let mut bare_label_keywords = keywords
+        .iter()
+        .filter(|(_key, value)| match value.label {
+            KeywordLabel::As => false,
+            KeywordLabel::Bare => true,
+        })
+        .map(|(key, _value)| key.to_owned())
+        .collect::<Vec<String>>();
+    bare_label_keywords.sort();
+
+    let mut unreserved_keywords = keywords
+        .iter()
+        .filter(|(_key, value)| matches!(value.category, KeywordCategory::Unreserved))
+        .map(|(key, _value)| key.to_owned())
+        .collect::<Vec<String>>();
+    unreserved_keywords.sort();
+
+    let mut reserved_keywords = keywords
+        .iter()
+        .filter(|(_key, value)| matches!(value.category, KeywordCategory::Reserved))
+        .map(|(key, _value)| key.to_owned())
+        .collect::<Vec<String>>();
+    reserved_keywords.sort();
+
+    let mut all_keywords = keywords
+        .keys()
+        .map(|key| key.to_owned())
+        .collect::<Vec<String>>();
+    all_keywords.sort();
+
+    let mut col_table_tokens = HashSet::new();
+    let mut type_tokens = HashSet::new();
+    for (key, meta) in &keywords {
+        for variant in all::<KWType>() {
+            match variant {
+                KWType::ColumnTable => {
+                    if keyword_allowed(meta.category, variant) {
+                        col_table_tokens.insert(key);
+                    }
+                }
+                KWType::Type => {
+                    if keyword_allowed(meta.category, variant) {
+                        type_tokens.insert(key);
+                    }
+                }
+            }
+        }
+    }
+
+    let mut col_table_keywords = col_table_tokens
+        .iter()
+        .map(|x| x.to_string())
+        .collect::<Vec<String>>();
+    col_table_keywords.sort();
+    let mut type_keywords = type_tokens
+        .iter()
+        .map(|x| x.to_string())
+        .collect::<Vec<String>>();
+    type_keywords.sort();
+
+    Ok(KeywordKinds {
+        all_keywords,
+        bare_label_keywords,
+        unreserved_keywords,
+        reserved_keywords,
+        col_table_keywords,
+        type_keywords,
+    })
+}
diff --git a/crates/pgt_parser_codegen/src/lib.rs b/crates/pgt_parser_codegen/src/lib.rs
new file mode 100644
index 00000000..b620b6a6
--- /dev/null
+++ b/crates/pgt_parser_codegen/src/lib.rs
@@ -0,0 +1,9 @@
+mod keywords;
+mod syntax_kind;
+
+use syntax_kind::syntax_kind_mod;
+
+#[proc_macro]
+pub fn syntax_kind_codegen(_input: proc_macro::TokenStream) -> proc_macro::TokenStream {
+    syntax_kind_mod().into()
+}
diff --git a/crates/pgt_parser_codegen/src/syntax_kind.rs b/crates/pgt_parser_codegen/src/syntax_kind.rs
new file mode 100644
index 00000000..66f28a0b
--- /dev/null
+++ b/crates/pgt_parser_codegen/src/syntax_kind.rs
@@ -0,0 +1,123 @@
+use convert_case::{Case, Casing};
+use proc_macro2::TokenStream;
+use quote::{format_ident, quote};
+
+use crate::keywords::{KeywordKinds, keyword_kinds};
+
+const WHITESPACE: &[&str] = &[
+    "SPACE",           // " "
+    "TAB",             // "\t"
+    "NEWLINE",         // "\n"
+    "CARRIAGE_RETURN", // "\r"
+    "VERTICAL_TAB",    // "\x0B"
+    "FORM_FEED",       // "\x0C"
+];
+
+const PUNCT: &[(&str, &str)] = &[
+    ("$", "DOLLAR"),
+    (";", "SEMICOLON"),
+    (",", "COMMA"),
+    ("(", "L_PAREN"),
+    (")", "R_PAREN"),
+    ("[", "L_BRACK"),
+    ("]", "R_BRACK"),
+    ("<", "L_ANGLE"),
+    (">", "R_ANGLE"),
+    ("@", "AT"),
+    ("#", "POUND"),
+    ("~", "TILDE"),
+    ("?", "QUESTION"),
+    ("&", "AMP"),
+    ("|", "PIPE"),
+    ("+", "PLUS"),
+    ("*", "STAR"),
+    ("/", "SLASH"),
+    ("^", "CARET"),
+    ("%", "PERCENT"),
+    ("_", "UNDERSCORE"),
+    (".", "DOT"),
+    (":", "COLON"),
+    ("=", "EQ"),
+    ("!", "BANG"),
+    ("-", "MINUS"),
+    ("`", "BACKTICK"),
+];
+
+const EXTRA: &[&str] = &["POSITIONAL_PARAM", "ERROR", "COMMENT", "EOF"];
+
+const LITERALS: &[&str] = &[
+    "BIT_STRING",
+    "BYTE_STRING",
+    "DOLLAR_QUOTED_STRING",
+    "ESC_STRING",
+    "FLOAT_NUMBER",
+    "INT_NUMBER",
+    "NULL",
+    "STRING",
+    "IDENT",
+];
+
+pub fn syntax_kind_mod() -> proc_macro2::TokenStream {
+    let keywords = keyword_kinds().expect("Failed to get keyword kinds");
+
+    let KeywordKinds { all_keywords, .. } = keywords;
+
+    let mut enum_variants: Vec<TokenStream> = Vec::new();
+    let mut from_kw_match_arms: Vec<TokenStream> = Vec::new();
+
+    // collect keywords
+    for kw in &all_keywords {
+        if kw.to_uppercase().contains("WHITESPACE") {
+            continue; // Skip whitespace as it is handled separately
+        }
+
+        let kind_ident = format_ident!("{}_KW", kw.to_case(Case::UpperSnake));
+
+        enum_variants.push(quote! { #kind_ident });
+        from_kw_match_arms.push(quote! {
+            #kw => Some(SyntaxKind::#kind_ident)
+        });
+    }
+
+    // collect extra keywords
+    EXTRA.iter().for_each(|&name| {
+        let variant_name = format_ident!("{}", name);
+        enum_variants.push(quote! { #variant_name });
+    });
+
+    // collect whitespace variants
+    WHITESPACE.iter().for_each(|&name| {
+        let variant_name = format_ident!("{}", name);
+        enum_variants.push(quote! { #variant_name });
+    });
+
+    // collect punctuations
+    PUNCT.iter().for_each(|&(_ascii_name, variant)| {
+        let variant_name = format_ident!("{}", variant);
+        enum_variants.push(quote! { #variant_name });
+    });
+
+    // collect literals
+    LITERALS.iter().for_each(|&name| {
+        let variant_name = format_ident!("{}", name);
+        enum_variants.push(quote! { #variant_name });
+    });
+
+    quote! {
+        #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
+        #[repr(u16)]
+        pub enum SyntaxKind {
+            #(#enum_variants),*,
+        }
+
+        impl SyntaxKind {
+            pub(crate) fn from_keyword(ident: &str) -> Option<SyntaxKind> {
+                let lower_ident = ident.to_ascii_lowercase();
+                match lower_ident.as_str() {
+                    #(#from_kw_match_arms),*,
+                    _ => None
+                }
+            }
+        }
+    }
+}

From fb1594cd242ff8bb2409faf3b5bbdfd3c9243c6e Mon Sep 17 00:00:00 2001
From: psteinroe
Date: Thu, 3 Jul 2025 08:03:04 +0200
Subject: [PATCH 2/8] progress

---
 Cargo.lock                                    |  46 +-
 Cargo.toml                                    |   4 +-
 crates/pgt_lexer_new/Cargo.toml               |   2 +
 crates/pgt_lexer_new/README.md                |   2 +-
 .../src/codegen/mod.rs                        |   0
 .../pgt_lexer_new/src/codegen/syntax_kind.rs  |   1 +
 .../src/lexed_str.rs                          | 102 +--
 crates/pgt_lexer_new/src/lib.rs               | 807 +-----------------
 .../Cargo.toml                                |   2 +-
 .../README.md                                 |   0
 .../build.rs                                  |   0
 .../src/keywords.rs                           |   0
 .../src/lib.rs                                |   0
 .../src/syntax_kind.rs                        |   0
 crates/pgt_parser/src/codegen/syntax_kind.rs  |   1 -
 crates/pgt_parser/src/lib.rs                  |   4 -
 crates/pgt_parser_codegen/README.md           |   1 -
 .../{pgt_parser => pgt_tokenizer}/Cargo.toml  |   4 +-
 crates/pgt_tokenizer/README.md                |   1 +
 .../src/cursor.rs                             |   0
 crates/pgt_tokenizer/src/lib.rs               | 805 +++++++++++++++++
 .../src/token.rs                              |   0
 22 files changed, 891 insertions(+), 891 deletions(-)
 rename crates/{pgt_parser => pgt_lexer_new}/src/codegen/mod.rs (100%)
 create mode 100644 crates/pgt_lexer_new/src/codegen/syntax_kind.rs
 rename crates/{pgt_parser => pgt_lexer_new}/src/lexed_str.rs (70%)
 rename crates/{pgt_parser_codegen => pgt_lexer_new_codegen}/Cargo.toml (93%)
 rename crates/{pgt_parser => pgt_lexer_new_codegen}/README.md (100%)
 rename crates/{pgt_parser_codegen => pgt_lexer_new_codegen}/build.rs (100%)
 rename crates/{pgt_parser_codegen => pgt_lexer_new_codegen}/src/keywords.rs (100%)
 rename crates/{pgt_parser_codegen => pgt_lexer_new_codegen}/src/lib.rs (100%)
 rename crates/{pgt_parser_codegen => pgt_lexer_new_codegen}/src/syntax_kind.rs (100%)
 delete mode 100644 crates/pgt_parser/src/codegen/syntax_kind.rs
 delete mode 100644 crates/pgt_parser/src/lib.rs
 delete mode 100644 crates/pgt_parser_codegen/README.md
 rename crates/{pgt_parser => pgt_tokenizer}/Cargo.toml (76%)
 create mode 100644 crates/pgt_tokenizer/README.md
 rename crates/{pgt_lexer_new => pgt_tokenizer}/src/cursor.rs (100%)
 create mode 100644 crates/pgt_tokenizer/src/lib.rs
 rename crates/{pgt_lexer_new => pgt_tokenizer}/src/token.rs (100%)

diff --git a/Cargo.lock b/Cargo.lock
index 27244be7..7d1853fb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2803,6 +2803,22 @@ name = "pgt_lexer_new"
 version = "0.0.0"
 dependencies = [
  "insta",
+ "pgt_lexer_new_codegen",
+ "pgt_tokenizer",
+]
+
+[[package]]
+name = "pgt_lexer_new_codegen"
+version = "0.0.0"
+dependencies = [
+ "anyhow",
+ "convert_case",
+ "enum-iterator",
+ "proc-macro2",
+ "prost-reflect",
+ "protox",
+ "quote",
+ "ureq",
 ]
 
 [[package]]
@@ -2843,29 +2859,6 @@ dependencies = [
  "quote",
 ]
 
-[[package]]
-name = "pgt_parser"
-version = "0.0.0"
-dependencies = [
- "insta",
- "pgt_lexer_new",
- "pgt_parser_codegen",
-]
-
-[[package]]
-name = "pgt_parser_codegen"
-version = "0.0.0"
-dependencies = [
- "anyhow",
- "convert_case",
- "enum-iterator",
- "proc-macro2",
- "prost-reflect",
- "protox",
- "quote",
- "ureq",
-]
-
 [[package]]
 name = "pgt_query_ext"
 version = "0.0.0"
@@ -2969,6 +2962,13 @@ dependencies = [
  "static_assertions",
 ]
 
+[[package]]
+name = "pgt_tokenizer"
+version = "0.0.0"
+dependencies = [
+ "insta",
+]
+
 [[package]]
 name = "pgt_treesitter_queries"
 version = "0.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index fff06a7f..b18dc2a3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -74,10 +74,9 @@ pgt_fs = { path = "./crates/pgt_fs", version = "0.0.0" }
 pgt_lexer = { path = "./crates/pgt_lexer", version = "0.0.0" }
 pgt_lexer_codegen = { path = "./crates/pgt_lexer_codegen", version = "0.0.0" }
 pgt_lexer_new = { path = "./crates/pgt_lexer_new", version = "0.0.0" }
+pgt_lexer_new_codegen = { path = "./crates/pgt_lexer_new_codegen", version = "0.0.0" }
 pgt_lsp = { path = "./crates/pgt_lsp", version = "0.0.0" }
 pgt_markup = { path = "./crates/pgt_markup", version = "0.0.0" }
-pgt_parser = { path = "./crates/pgt_parser", version = "0.0.0" }
-pgt_parser_codegen = { path = "./crates/pgt_parser_codegen", version = "0.0.0" }
 pgt_query_ext = { path = "./crates/pgt_query_ext", version = "0.0.0" }
 pgt_query_ext_codegen = { path = "./crates/pgt_query_ext_codegen", version = "0.0.0" }
 pgt_query_proto_parser = { path = "./crates/pgt_query_proto_parser", version = "0.0.0" }
@@ -85,6 +84,7 @@ pgt_schema_cache = { path = "./crates/pgt_schema_cache", version = "0.
 pgt_statement_splitter = { path = "./crates/pgt_statement_splitter", version = "0.0.0" }
 pgt_text_edit = { path = "./crates/pgt_text_edit", version = "0.0.0" }
 pgt_text_size = { path = "./crates/pgt_text_size", version = "0.0.0" }
+pgt_tokenizer = { path = "./crates/pgt_tokenizer", version = "0.0.0" }
 pgt_treesitter_queries = { path = "./crates/pgt_treesitter_queries", version = "0.0.0" }
 pgt_typecheck = { path = "./crates/pgt_typecheck", version = "0.0.0" }
 pgt_workspace = { path = "./crates/pgt_workspace", version = "0.0.0" }
diff --git a/crates/pgt_lexer_new/Cargo.toml b/crates/pgt_lexer_new/Cargo.toml
index 80ded7ec..0ea86475 100644
--- a/crates/pgt_lexer_new/Cargo.toml
+++ b/crates/pgt_lexer_new/Cargo.toml
@@ -12,6 +12,8 @@ version = "0.0.0"
 
 
 [dependencies]
+pgt_lexer_new_codegen.workspace = true
+pgt_tokenizer.workspace = true
 
 [dev-dependencies]
 insta.workspace = true
diff --git a/crates/pgt_lexer_new/README.md b/crates/pgt_lexer_new/README.md
index 8fc21d34..57bdaa34 100644
--- a/crates/pgt_lexer_new/README.md
+++ b/crates/pgt_lexer_new/README.md
@@ -1 +1 @@
-Heavily inspired by and copied from [squawk_lexer](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_lexer). Thanks for making all the hard work MIT-licensed!
+Heavily inspired by and copied from [squawk_parser](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_parser). Thanks for making all the hard work MIT-licensed!
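
[Editor's aside: the keyword codegen in patch 1 above ultimately reduces to a small amount of string surgery on PostgreSQL's kwlist.h. Below is a minimal, self-contained sketch of the split logic that parse_header() uses; the row format matches the real header, but the row here is a hard-coded sample rather than being read via PG_QUERY_KWLIST_PATH.]

    fn main() {
        // One illustrative kwlist.h row (hard-coded sample, not the real file):
        let row = r#"PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD, BARE_LABEL)"#;

        // Mirror parse_header(): take the text between '(' and ')', then split on ','.
        let inner = row.split(&['(', ')']).nth(1).expect("invalid kwlist.h row");
        let fields: Vec<&str> = inner.split(',').map(str::trim).collect();

        let name = fields[0].replace('"', "");
        assert_eq!(name, "abort");
        // fields[2] maps to KeywordCategory, fields[3] to KeywordLabel.
        println!("keyword `{}`: category={}, label={}", name, fields[2], fields[3]);
    }
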
diff --git a/crates/pgt_parser/src/codegen/mod.rs b/crates/pgt_lexer_new/src/codegen/mod.rs similarity index 100% rename from crates/pgt_parser/src/codegen/mod.rs rename to crates/pgt_lexer_new/src/codegen/mod.rs diff --git a/crates/pgt_lexer_new/src/codegen/syntax_kind.rs b/crates/pgt_lexer_new/src/codegen/syntax_kind.rs new file mode 100644 index 00000000..6b3d2317 --- /dev/null +++ b/crates/pgt_lexer_new/src/codegen/syntax_kind.rs @@ -0,0 +1 @@ +pgt_lexer_new_codegen::syntax_kind_codegen!(); diff --git a/crates/pgt_parser/src/lexed_str.rs b/crates/pgt_lexer_new/src/lexed_str.rs similarity index 70% rename from crates/pgt_parser/src/lexed_str.rs rename to crates/pgt_lexer_new/src/lexed_str.rs index c20f555b..6bbeec90 100644 --- a/crates/pgt_parser/src/lexed_str.rs +++ b/crates/pgt_lexer_new/src/lexed_str.rs @@ -2,7 +2,7 @@ use std::ops; -use pgt_lexer_new::tokenize; +use pgt_tokenizer::tokenize; use crate::SyntaxKind; @@ -109,7 +109,7 @@ impl<'a> Converter<'a> { } } - fn extend_token(&mut self, kind: &pgt_lexer_new::TokenKind, token_text: &str) { + fn extend_token(&mut self, kind: &pgt_tokenizer::TokenKind, token_text: &str) { // A note on an intended tradeoff: // We drop some useful information here (see patterns with double dots `..`) // Storing that info in `SyntaxKind` is not possible due to its layout requirements of @@ -118,8 +118,8 @@ impl<'a> Converter<'a> { let syntax_kind = { match kind { - pgt_lexer_new::TokenKind::LineComment => SyntaxKind::COMMENT, - pgt_lexer_new::TokenKind::BlockComment { terminated } => { + pgt_tokenizer::TokenKind::LineComment => SyntaxKind::COMMENT, + pgt_tokenizer::TokenKind::BlockComment { terminated } => { if !terminated { err = "Missing trailing `*/` symbols to terminate the block comment"; } @@ -127,13 +127,13 @@ impl<'a> Converter<'a> { } // whitespace - pgt_lexer_new::TokenKind::Space => SyntaxKind::SPACE, - pgt_lexer_new::TokenKind::Tab => SyntaxKind::TAB, - pgt_lexer_new::TokenKind::Newline => SyntaxKind::NEWLINE, - pgt_lexer_new::TokenKind::CarriageReturn => SyntaxKind::CARRIAGE_RETURN, - pgt_lexer_new::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB, - pgt_lexer_new::TokenKind::FormFeed => SyntaxKind::FORM_FEED, - pgt_lexer_new::TokenKind::Ident => { + pgt_tokenizer::TokenKind::Space => SyntaxKind::SPACE, + pgt_tokenizer::TokenKind::Tab => SyntaxKind::TAB, + pgt_tokenizer::TokenKind::Newline => SyntaxKind::NEWLINE, + pgt_tokenizer::TokenKind::CarriageReturn => SyntaxKind::CARRIAGE_RETURN, + pgt_tokenizer::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB, + pgt_tokenizer::TokenKind::FormFeed => SyntaxKind::FORM_FEED, + pgt_tokenizer::TokenKind::Ident => { // TODO: check for max identifier length // // see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS @@ -146,43 +146,43 @@ impl<'a> Converter<'a> { // see: https://github.com/postgres/postgres/blob/e032e4c7ddd0e1f7865b246ec18944365d4f8614/src/include/pg_config_manual.h#L29 SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT) } - pgt_lexer_new::TokenKind::Literal { kind, .. } => { + pgt_tokenizer::TokenKind::Literal { kind, .. 
} => { self.extend_literal(token_text.len(), kind); return; } - pgt_lexer_new::TokenKind::Semi => SyntaxKind::SEMICOLON, - pgt_lexer_new::TokenKind::Comma => SyntaxKind::COMMA, - pgt_lexer_new::TokenKind::Dot => SyntaxKind::DOT, - pgt_lexer_new::TokenKind::OpenParen => SyntaxKind::L_PAREN, - pgt_lexer_new::TokenKind::CloseParen => SyntaxKind::R_PAREN, - pgt_lexer_new::TokenKind::OpenBracket => SyntaxKind::L_BRACK, - pgt_lexer_new::TokenKind::CloseBracket => SyntaxKind::R_BRACK, - pgt_lexer_new::TokenKind::At => SyntaxKind::AT, - pgt_lexer_new::TokenKind::Pound => SyntaxKind::POUND, - pgt_lexer_new::TokenKind::Tilde => SyntaxKind::TILDE, - pgt_lexer_new::TokenKind::Question => SyntaxKind::QUESTION, - pgt_lexer_new::TokenKind::Colon => SyntaxKind::COLON, - pgt_lexer_new::TokenKind::Eq => SyntaxKind::EQ, - pgt_lexer_new::TokenKind::Bang => SyntaxKind::BANG, - pgt_lexer_new::TokenKind::Lt => SyntaxKind::L_ANGLE, - pgt_lexer_new::TokenKind::Gt => SyntaxKind::R_ANGLE, - pgt_lexer_new::TokenKind::Minus => SyntaxKind::MINUS, - pgt_lexer_new::TokenKind::And => SyntaxKind::AMP, - pgt_lexer_new::TokenKind::Or => SyntaxKind::PIPE, - pgt_lexer_new::TokenKind::Plus => SyntaxKind::PLUS, - pgt_lexer_new::TokenKind::Star => SyntaxKind::STAR, - pgt_lexer_new::TokenKind::Slash => SyntaxKind::SLASH, - pgt_lexer_new::TokenKind::Caret => SyntaxKind::CARET, - pgt_lexer_new::TokenKind::Percent => SyntaxKind::PERCENT, - pgt_lexer_new::TokenKind::Unknown => SyntaxKind::ERROR, - pgt_lexer_new::TokenKind::UnknownPrefix => { + pgt_tokenizer::TokenKind::Semi => SyntaxKind::SEMICOLON, + pgt_tokenizer::TokenKind::Comma => SyntaxKind::COMMA, + pgt_tokenizer::TokenKind::Dot => SyntaxKind::DOT, + pgt_tokenizer::TokenKind::OpenParen => SyntaxKind::L_PAREN, + pgt_tokenizer::TokenKind::CloseParen => SyntaxKind::R_PAREN, + pgt_tokenizer::TokenKind::OpenBracket => SyntaxKind::L_BRACK, + pgt_tokenizer::TokenKind::CloseBracket => SyntaxKind::R_BRACK, + pgt_tokenizer::TokenKind::At => SyntaxKind::AT, + pgt_tokenizer::TokenKind::Pound => SyntaxKind::POUND, + pgt_tokenizer::TokenKind::Tilde => SyntaxKind::TILDE, + pgt_tokenizer::TokenKind::Question => SyntaxKind::QUESTION, + pgt_tokenizer::TokenKind::Colon => SyntaxKind::COLON, + pgt_tokenizer::TokenKind::Eq => SyntaxKind::EQ, + pgt_tokenizer::TokenKind::Bang => SyntaxKind::BANG, + pgt_tokenizer::TokenKind::Lt => SyntaxKind::L_ANGLE, + pgt_tokenizer::TokenKind::Gt => SyntaxKind::R_ANGLE, + pgt_tokenizer::TokenKind::Minus => SyntaxKind::MINUS, + pgt_tokenizer::TokenKind::And => SyntaxKind::AMP, + pgt_tokenizer::TokenKind::Or => SyntaxKind::PIPE, + pgt_tokenizer::TokenKind::Plus => SyntaxKind::PLUS, + pgt_tokenizer::TokenKind::Star => SyntaxKind::STAR, + pgt_tokenizer::TokenKind::Slash => SyntaxKind::SLASH, + pgt_tokenizer::TokenKind::Caret => SyntaxKind::CARET, + pgt_tokenizer::TokenKind::Percent => SyntaxKind::PERCENT, + pgt_tokenizer::TokenKind::Unknown => SyntaxKind::ERROR, + pgt_tokenizer::TokenKind::UnknownPrefix => { err = "unknown literal prefix"; SyntaxKind::IDENT } - pgt_lexer_new::TokenKind::Eof => SyntaxKind::EOF, - pgt_lexer_new::TokenKind::Backtick => SyntaxKind::BACKTICK, - pgt_lexer_new::TokenKind::PositionalParam => SyntaxKind::POSITIONAL_PARAM, - pgt_lexer_new::TokenKind::QuotedIdent { terminated } => { + pgt_tokenizer::TokenKind::Eof => SyntaxKind::EOF, + pgt_tokenizer::TokenKind::Backtick => SyntaxKind::BACKTICK, + pgt_tokenizer::TokenKind::PositionalParam => SyntaxKind::POSITIONAL_PARAM, + pgt_tokenizer::TokenKind::QuotedIdent { terminated } => { if !terminated { 
err = "Missing trailing \" to terminate the quoted identifier" } @@ -195,17 +195,17 @@ impl<'a> Converter<'a> { self.push(syntax_kind, token_text.len(), err); } - fn extend_literal(&mut self, len: usize, kind: &pgt_lexer_new::LiteralKind) { + fn extend_literal(&mut self, len: usize, kind: &pgt_tokenizer::LiteralKind) { let mut err = ""; let syntax_kind = match *kind { - pgt_lexer_new::LiteralKind::Int { empty_int, base: _ } => { + pgt_tokenizer::LiteralKind::Int { empty_int, base: _ } => { if empty_int { err = "Missing digits after the integer base prefix"; } SyntaxKind::INT_NUMBER } - pgt_lexer_new::LiteralKind::Float { + pgt_tokenizer::LiteralKind::Float { empty_exponent, base: _, } => { @@ -214,28 +214,28 @@ impl<'a> Converter<'a> { } SyntaxKind::FLOAT_NUMBER } - pgt_lexer_new::LiteralKind::Str { terminated } => { + pgt_tokenizer::LiteralKind::Str { terminated } => { if !terminated { err = "Missing trailing `'` symbol to terminate the string literal"; } // TODO: rust analzyer checks for un-escaped strings, we should too SyntaxKind::STRING } - pgt_lexer_new::LiteralKind::ByteStr { terminated } => { + pgt_tokenizer::LiteralKind::ByteStr { terminated } => { if !terminated { err = "Missing trailing `'` symbol to terminate the hex bit string literal"; } // TODO: rust analzyer checks for un-escaped strings, we should too SyntaxKind::BYTE_STRING } - pgt_lexer_new::LiteralKind::BitStr { terminated } => { + pgt_tokenizer::LiteralKind::BitStr { terminated } => { if !terminated { err = "Missing trailing `\'` symbol to terminate the bit string literal"; } // TODO: rust analzyer checks for un-escaped strings, we should too SyntaxKind::BIT_STRING } - pgt_lexer_new::LiteralKind::DollarQuotedString { terminated } => { + pgt_tokenizer::LiteralKind::DollarQuotedString { terminated } => { if !terminated { // TODO: we could be fancier and say the ending string we're looking for err = "Unterminated dollar quoted string literal"; @@ -243,14 +243,14 @@ impl<'a> Converter<'a> { // TODO: rust analzyer checks for un-escaped strings, we should too SyntaxKind::DOLLAR_QUOTED_STRING } - pgt_lexer_new::LiteralKind::UnicodeEscStr { terminated } => { + pgt_tokenizer::LiteralKind::UnicodeEscStr { terminated } => { if !terminated { err = "Missing trailing `'` symbol to terminate the unicode escape string literal"; } // TODO: rust analzyer checks for un-escaped strings, we should too SyntaxKind::BYTE_STRING } - pgt_lexer_new::LiteralKind::EscStr { terminated } => { + pgt_tokenizer::LiteralKind::EscStr { terminated } => { if !terminated { err = "Missing trailing `\'` symbol to terminate the escape string literal"; } diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs index d590eaab..d1c34c1b 100644 --- a/crates/pgt_lexer_new/src/lib.rs +++ b/crates/pgt_lexer_new/src/lib.rs @@ -1,805 +1,4 @@ -mod cursor; -mod token; -use cursor::{Cursor, EOF_CHAR}; -pub use token::{Base, LiteralKind, Token, TokenKind}; +mod codegen; +mod lexed_str; -// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346 -// ident_start [A-Za-z\200-\377_] -const fn is_ident_start(c: char) -> bool { - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{FF}') -} - -// ident_cont [A-Za-z\200-\377_0-9\$] -const fn is_ident_cont(c: char) -> bool { - matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{FF}') -} - -// whitespace -// - 
https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128 -// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229 - -const fn is_space(c: char) -> bool { - matches!( - c, ' ' // space - ) -} - -const fn is_tab(c: char) -> bool { - matches!( - c, '\t' // tab - ) -} - -const fn is_newline(c: char) -> bool { - matches!( - c, '\n' // newline - ) -} - -const fn is_carriage_return(c: char) -> bool { - matches!( - c, '\r' // carriage return - ) -} - -const fn is_vertical_tab(c: char) -> bool { - matches!( - c, '\u{000B}' // vertical tab - ) -} - -const fn is_form_feed(c: char) -> bool { - matches!( - c, '\u{000C}' // form feed - ) -} - -impl Cursor<'_> { - // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339 - pub(crate) fn advance_token(&mut self) -> Token { - let Some(first_char) = self.bump() else { - return Token::new(TokenKind::Eof, 0); - }; - let token_kind = match first_char { - // Slash, comment or block comment. - '/' => match self.first() { - '*' => self.block_comment(), - _ => TokenKind::Slash, - }, - '-' => match self.first() { - '-' => self.line_comment(), - _ => TokenKind::Minus, - }, - - c if is_space(c) => { - self.eat_while(is_space); - TokenKind::Space - } - - c if is_tab(c) => { - self.eat_while(is_tab); - TokenKind::Tab - } - - c if is_newline(c) => { - self.eat_while(is_newline); - TokenKind::Newline - } - - c if is_carriage_return(c) => { - self.eat_while(is_carriage_return); - TokenKind::CarriageReturn - } - - c if is_vertical_tab(c) => { - self.eat_while(is_vertical_tab); - TokenKind::VerticalTab - } - - c if is_form_feed(c) => { - self.eat_while(is_form_feed); - TokenKind::FormFeed - } - - // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE - 'u' | 'U' => match self.first() { - '&' => { - self.bump(); - self.prefixed_string( - |terminated| LiteralKind::UnicodeEscStr { terminated }, - true, - ) - } - _ => self.ident_or_unknown_prefix(), - }, - - // escaped strings - 'e' | 'E' => { - self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false) - } - - // bit string - 'b' | 'B' => { - self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false) - } - - // hexadecimal byte string - 'x' | 'X' => { - self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false) - } - - // Identifier (this should be checked after other variant that can - // start as identifier). - c if is_ident_start(c) => self.ident(), - - // Numeric literal. - // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC - c @ '0'..='9' => { - let literal_kind = self.number(c); - TokenKind::Literal { kind: literal_kind } - } - '.' => match self.first() { - '0'..='9' => { - let literal_kind = self.number('.'); - TokenKind::Literal { kind: literal_kind } - } - _ => TokenKind::Dot, - }, - // One-symbol tokens. - ';' => TokenKind::Semi, - ',' => TokenKind::Comma, - '(' => TokenKind::OpenParen, - ')' => TokenKind::CloseParen, - '[' => TokenKind::OpenBracket, - ']' => TokenKind::CloseBracket, - '@' => TokenKind::At, - '#' => TokenKind::Pound, - '~' => TokenKind::Tilde, - '?' 
=> TokenKind::Question, - ':' => TokenKind::Colon, - '$' => { - // Dollar quoted strings - if is_ident_start(self.first()) || self.first() == '$' { - self.dollar_quoted_string() - } else { - // Parameters - while self.first().is_ascii_digit() { - self.bump(); - } - TokenKind::PositionalParam - } - } - '`' => TokenKind::Backtick, - '=' => TokenKind::Eq, - '!' => TokenKind::Bang, - '<' => TokenKind::Lt, - '>' => TokenKind::Gt, - '&' => TokenKind::And, - '|' => TokenKind::Or, - '+' => TokenKind::Plus, - '*' => TokenKind::Star, - '^' => TokenKind::Caret, - '%' => TokenKind::Percent, - - // String literal - '\'' => { - let terminated = self.single_quoted_string(); - let kind = LiteralKind::Str { terminated }; - TokenKind::Literal { kind } - } - - // Quoted indentifiers - '"' => { - let terminated = self.double_quoted_string(); - TokenKind::QuotedIdent { terminated } - } - _ => TokenKind::Unknown, - }; - let res = Token::new(token_kind, self.pos_within_token()); - self.reset_pos_within_token(); - res - } - pub(crate) fn ident(&mut self) -> TokenKind { - self.eat_while(is_ident_cont); - TokenKind::Ident - } - - fn ident_or_unknown_prefix(&mut self) -> TokenKind { - // Start is already eaten, eat the rest of identifier. - self.eat_while(is_ident_cont); - // Known prefixes must have been handled earlier. So if - // we see a prefix here, it is definitely an unknown prefix. - match self.first() { - '#' | '"' | '\'' => TokenKind::UnknownPrefix, - _ => TokenKind::Ident, - } - } - - // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227 - // comment ("--"{non_newline}*) - pub(crate) fn line_comment(&mut self) -> TokenKind { - self.bump(); - - self.eat_while(|c| c != '\n'); - TokenKind::LineComment - } - - // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344 - pub(crate) fn block_comment(&mut self) -> TokenKind { - self.bump(); - - let mut depth = 1usize; - while let Some(c) = self.bump() { - match c { - '/' if self.first() == '*' => { - self.bump(); - depth += 1; - } - '*' if self.first() == '/' => { - self.bump(); - depth -= 1; - if depth == 0 { - // This block comment is closed, so for a construction like "/* */ */" - // there will be a successfully parsed block comment "/* */" - // and " */" will be processed separately. - break; - } - } - _ => (), - } - } - - TokenKind::BlockComment { - terminated: depth == 0, - } - } - - fn prefixed_string( - &mut self, - mk_kind: fn(bool) -> LiteralKind, - allows_double: bool, - ) -> TokenKind { - match self.first() { - '\'' => { - self.bump(); - let terminated = self.single_quoted_string(); - let kind = mk_kind(terminated); - TokenKind::Literal { kind } - } - '"' if allows_double => { - self.bump(); - let terminated = self.double_quoted_string(); - TokenKind::QuotedIdent { terminated } - } - _ => self.ident_or_unknown_prefix(), - } - } - - fn number(&mut self, first_digit: char) -> LiteralKind { - let mut base = Base::Decimal; - if first_digit == '0' { - // Attempt to parse encoding base. 
- match self.first() { - // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403 - 'b' | 'B' => { - base = Base::Binary; - self.bump(); - if !self.eat_decimal_digits() { - return LiteralKind::Int { - base, - empty_int: true, - }; - } - } - // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402 - 'o' | 'O' => { - base = Base::Octal; - self.bump(); - if !self.eat_decimal_digits() { - return LiteralKind::Int { - base, - empty_int: true, - }; - } - } - // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401 - 'x' | 'X' => { - base = Base::Hexadecimal; - self.bump(); - if !self.eat_hexadecimal_digits() { - return LiteralKind::Int { - base, - empty_int: true, - }; - } - } - // Not a base prefix; consume additional digits. - '0'..='9' | '_' => { - self.eat_decimal_digits(); - } - - // Also not a base prefix; nothing more to do here. - '.' | 'e' | 'E' => {} - - // Just a 0. - _ => { - return LiteralKind::Int { - base, - empty_int: false, - }; - } - } - } else { - // No base prefix, parse number in the usual way. - self.eat_decimal_digits(); - }; - - match self.first() { - '.' => { - // might have stuff after the ., and if it does, it needs to start - // with a number - self.bump(); - let mut empty_exponent = false; - if self.first().is_ascii_digit() { - self.eat_decimal_digits(); - match self.first() { - 'e' | 'E' => { - self.bump(); - empty_exponent = !self.eat_float_exponent(); - } - _ => (), - } - } else { - match self.first() { - 'e' | 'E' => { - self.bump(); - empty_exponent = !self.eat_float_exponent(); - } - _ => (), - } - } - LiteralKind::Float { - base, - empty_exponent, - } - } - 'e' | 'E' => { - self.bump(); - let empty_exponent = !self.eat_float_exponent(); - LiteralKind::Float { - base, - empty_exponent, - } - } - _ => LiteralKind::Int { - base, - empty_int: false, - }, - } - } - - fn single_quoted_string(&mut self) -> bool { - // Parse until either quotes are terminated or error is detected. - loop { - match self.first() { - // Quotes might be terminated. - '\'' => { - self.bump(); - - match self.first() { - // encountered an escaped quote '' - '\'' => { - self.bump(); - } - // encountered terminating quote - _ => return true, - } - } - // End of file, stop parsing. - EOF_CHAR if self.is_eof() => break, - // Skip the character. - _ => { - self.bump(); - } - } - } - // String was not terminated. - false - } - - /// Eats double-quoted string and returns true - /// if string is terminated. - fn double_quoted_string(&mut self) -> bool { - while let Some(c) = self.bump() { - match c { - '"' if self.first() == '"' => { - // Bump again to skip escaped character. - self.bump(); - } - '"' => { - return true; - } - _ => (), - } - } - // End of file reached. 
- false - } - - // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING - fn dollar_quoted_string(&mut self) -> TokenKind { - // Get the start sequence of the dollar quote, i.e., 'foo' in - // $foo$hello$foo$ - let mut start = vec![]; - while let Some(c) = self.bump() { - match c { - '$' => { - self.bump(); - break; - } - _ => { - start.push(c); - } - } - } - - // we have a dollar quoted string deliminated with `$$` - if start.is_empty() { - loop { - self.eat_while(|c| c != '$'); - if self.is_eof() { - return TokenKind::Literal { - kind: LiteralKind::DollarQuotedString { terminated: false }, - }; - } - // eat $ - self.bump(); - if self.first() == '$' { - self.bump(); - return TokenKind::Literal { - kind: LiteralKind::DollarQuotedString { terminated: true }, - }; - } - } - } else { - loop { - self.eat_while(|c| c != start[0]); - if self.is_eof() { - return TokenKind::Literal { - kind: LiteralKind::DollarQuotedString { terminated: false }, - }; - } - - // might be the start of our start/end sequence - let mut match_count = 0; - for start_char in &start { - if self.first() == *start_char { - self.bump(); - match_count += 1; - } else { - self.bump(); - break; - } - } - - // closing '$' - let terminated = match_count == start.len(); - if self.first() == '$' && terminated { - self.bump(); - return TokenKind::Literal { - kind: LiteralKind::DollarQuotedString { terminated }, - }; - } - } - } - } - - fn eat_decimal_digits(&mut self) -> bool { - let mut has_digits = false; - loop { - match self.first() { - '_' => { - self.bump(); - } - '0'..='9' => { - has_digits = true; - self.bump(); - } - _ => break, - } - } - has_digits - } - - fn eat_hexadecimal_digits(&mut self) -> bool { - let mut has_digits = false; - loop { - match self.first() { - '_' => { - self.bump(); - } - '0'..='9' | 'a'..='f' | 'A'..='F' => { - has_digits = true; - self.bump(); - } - _ => break, - } - } - has_digits - } - - /// Eats the float exponent. Returns true if at least one digit was met, - /// and returns false otherwise. - fn eat_float_exponent(&mut self) -> bool { - if self.first() == '-' || self.first() == '+' { - self.bump(); - } - self.eat_decimal_digits() - } -} - -/// Creates an iterator that produces tokens from the input string. 
-pub fn tokenize(input: &str) -> impl Iterator + '_ { - let mut cursor = Cursor::new(input); - std::iter::from_fn(move || { - let token = cursor.advance_token(); - if token.kind != TokenKind::Eof { - Some(token) - } else { - None - } - }) -} - -#[cfg(test)] -mod tests { - use std::fmt; - - use super::*; - use insta::assert_debug_snapshot; - - struct TokenDebug<'a> { - content: &'a str, - token: Token, - } - impl fmt::Debug for TokenDebug<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{:?} @ {:?}", self.content, self.token.kind) - } - } - - impl<'a> TokenDebug<'a> { - fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> { - TokenDebug { - token, - content: &input[start as usize..(start + token.len) as usize], - } - } - } - - fn lex(input: &str) -> Vec { - let mut tokens = vec![]; - let mut start = 0; - - for token in tokenize(input) { - let length = token.len; - tokens.push(TokenDebug::new(token, input, start)); - start += length; - } - tokens - } - #[test] - fn lex_statement() { - let result = lex("select 1;"); - assert_debug_snapshot!(result); - } - - #[test] - fn block_comment() { - let result = lex(r#" -/* - * foo - * bar -*/"#); - assert_debug_snapshot!(result); - } - - #[test] - fn block_comment_unterminated() { - let result = lex(r#" -/* - * foo - * bar - /* -*/"#); - assert_debug_snapshot!(result); - } - - #[test] - fn line_comment() { - let result = lex(r#" --- foooooooooooo bar buzz -"#); - assert_debug_snapshot!(result); - } - - #[test] - fn line_comment_whitespace() { - assert_debug_snapshot!(lex(r#" -select 'Hello' -- This is a comment -' World';"#)) - } - - #[test] - fn dollar_quoting() { - assert_debug_snapshot!(lex(r#" -$$Dianne's horse$$ -$SomeTag$Dianne's horse$SomeTag$ - --- with dollar inside and matching tags -$foo$hello$world$bar$ -"#)) - } - - #[test] - fn dollar_strings_part2() { - assert_debug_snapshot!(lex(r#" -DO $doblock$ -end -$doblock$;"#)) - } - - #[test] - fn dollar_quote_mismatch_tags_simple() { - assert_debug_snapshot!(lex(r#" --- dollar quoting with mismatched tags -$foo$hello world$bar$ -"#)); - } - - #[test] - fn dollar_quote_mismatch_tags_complex() { - assert_debug_snapshot!(lex(r#" --- with dollar inside but mismatched tags -$foo$hello$world$bar$ -"#)); - } - - #[test] - fn numeric() { - assert_debug_snapshot!(lex(r#" -42 -3.5 -4. 
-.001 -.123e10 -5e2 -1.925e-3 -1e-10 -1e+10 -1e10 -4664.E+5 -"#)) - } - - #[test] - fn numeric_non_decimal() { - assert_debug_snapshot!(lex(r#" -0b100101 -0B10011001 -0o273 -0O755 -0x42f -0XFFFF -"#)) - } - - #[test] - fn numeric_with_seperators() { - assert_debug_snapshot!(lex(r#" -1_500_000_000 -0b10001000_00000000 -0o_1_755 -0xFFFF_FFFF -1.618_034 -"#)) - } - - #[test] - fn select_with_period() { - assert_debug_snapshot!(lex(r#" -select public.users; -"#)) - } - - #[test] - fn bitstring() { - assert_debug_snapshot!(lex(r#" -B'1001' -b'1001' -X'1FF' -x'1FF' -"#)) - } - - #[test] - fn string() { - assert_debug_snapshot!(lex(r#" -'Dianne''s horse' - -select 'foo '' -bar'; - -select 'foooo' - 'bar'; - - -'foo \\ \n \tbar' - -'forgot to close the string -"#)) - } - - #[test] - fn params() { - assert_debug_snapshot!(lex(r#" -select $1 + $2; - -select $1123123123123; - -select $; -"#)) - } - - #[test] - fn string_with_escapes() { - // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE - - assert_debug_snapshot!(lex(r#" -E'foo' - -e'bar' - -e'\b\f\n\r\t' - -e'\0\11\777' - -e'\x0\x11\xFF' - -e'\uAAAA \UFFFFFFFF' - -"#)) - } - - #[test] - fn string_unicode_escape() { - // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE - - assert_debug_snapshot!(lex(r#" -U&"d\0061t\+000061" - -U&"\0441\043B\043E\043D" - -u&'\0441\043B' - -U&"d!0061t!+000061" UESCAPE '!' -"#)) - } - - #[test] - fn quoted_ident() { - assert_debug_snapshot!(lex(r#" -"hello &1 -world"; - - -"hello-world -"#)) - } - - #[test] - fn quoted_ident_with_escape_quote() { - assert_debug_snapshot!(lex(r#" -"foo "" bar" -"#)) - } -} +pub use crate::codegen::syntax_kind::SyntaxKind; diff --git a/crates/pgt_parser_codegen/Cargo.toml b/crates/pgt_lexer_new_codegen/Cargo.toml similarity index 93% rename from crates/pgt_parser_codegen/Cargo.toml rename to crates/pgt_lexer_new_codegen/Cargo.toml index 157f342b..96f1e74c 100644 --- a/crates/pgt_parser_codegen/Cargo.toml +++ b/crates/pgt_lexer_new_codegen/Cargo.toml @@ -6,7 +6,7 @@ edition.workspace = true homepage.workspace = true keywords.workspace = true license.workspace = true -name = "pgt_parser_codegen" +name = "pgt_lexer_new_codegen" repository.workspace = true version = "0.0.0" diff --git a/crates/pgt_parser/README.md b/crates/pgt_lexer_new_codegen/README.md similarity index 100% rename from crates/pgt_parser/README.md rename to crates/pgt_lexer_new_codegen/README.md diff --git a/crates/pgt_parser_codegen/build.rs b/crates/pgt_lexer_new_codegen/build.rs similarity index 100% rename from crates/pgt_parser_codegen/build.rs rename to crates/pgt_lexer_new_codegen/build.rs diff --git a/crates/pgt_parser_codegen/src/keywords.rs b/crates/pgt_lexer_new_codegen/src/keywords.rs similarity index 100% rename from crates/pgt_parser_codegen/src/keywords.rs rename to crates/pgt_lexer_new_codegen/src/keywords.rs diff --git a/crates/pgt_parser_codegen/src/lib.rs b/crates/pgt_lexer_new_codegen/src/lib.rs similarity index 100% rename from crates/pgt_parser_codegen/src/lib.rs rename to crates/pgt_lexer_new_codegen/src/lib.rs diff --git a/crates/pgt_parser_codegen/src/syntax_kind.rs b/crates/pgt_lexer_new_codegen/src/syntax_kind.rs similarity index 100% rename from crates/pgt_parser_codegen/src/syntax_kind.rs rename to crates/pgt_lexer_new_codegen/src/syntax_kind.rs diff --git a/crates/pgt_parser/src/codegen/syntax_kind.rs b/crates/pgt_parser/src/codegen/syntax_kind.rs deleted file mode 100644 index 12c5718a..00000000 --- 
a/crates/pgt_parser/src/codegen/syntax_kind.rs +++ /dev/null @@ -1 +0,0 @@ -pgt_parser_codegen::syntax_kind_codegen!(); diff --git a/crates/pgt_parser/src/lib.rs b/crates/pgt_parser/src/lib.rs deleted file mode 100644 index d1c34c1b..00000000 --- a/crates/pgt_parser/src/lib.rs +++ /dev/null @@ -1,4 +0,0 @@ -mod codegen; -mod lexed_str; - -pub use crate::codegen::syntax_kind::SyntaxKind; diff --git a/crates/pgt_parser_codegen/README.md b/crates/pgt_parser_codegen/README.md deleted file mode 100644 index 57bdaa34..00000000 --- a/crates/pgt_parser_codegen/README.md +++ /dev/null @@ -1 +0,0 @@ -Heavily inspired by and copied from [squawk_parser](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_parser). Thanks for making all the hard work MIT-licensed! diff --git a/crates/pgt_parser/Cargo.toml b/crates/pgt_tokenizer/Cargo.toml similarity index 76% rename from crates/pgt_parser/Cargo.toml rename to crates/pgt_tokenizer/Cargo.toml index 75cd0f89..9cd4bf5e 100644 --- a/crates/pgt_parser/Cargo.toml +++ b/crates/pgt_tokenizer/Cargo.toml @@ -6,14 +6,12 @@ edition.workspace = true homepage.workspace = true keywords.workspace = true license.workspace = true -name = "pgt_parser" +name = "pgt_tokenizer" repository.workspace = true version = "0.0.0" [dependencies] -pgt_lexer_new.workspace = true -pgt_parser_codegen.workspace = true [dev-dependencies] insta.workspace = true diff --git a/crates/pgt_tokenizer/README.md b/crates/pgt_tokenizer/README.md new file mode 100644 index 00000000..8fc21d34 --- /dev/null +++ b/crates/pgt_tokenizer/README.md @@ -0,0 +1 @@ +Heavily inspired by and copied from [squawk_lexer](https://github.com/sbdchd/squawk/tree/9acfecbbb7f3c7eedcbaf060e7b25f9afa136db3/crates/squawk_lexer). Thanks for making all the hard work MIT-licensed! 
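
[Editor's aside: before the moved tokenizer source below, a minimal usage sketch of the pgt_tokenizer API. The crate name and the public tokenize() function come from this patch; that Token exposes readable kind and len fields outside the crate is an assumption inferred from the TokenDebug test helper further down.]

    fn main() {
        let input = "select 1;";
        let mut start = 0usize;
        for token in pgt_tokenizer::tokenize(input) {
            // token.len is the token's length in bytes (a u32 in token.rs, per
            // the tests below); slice the input to recover the token text.
            let end = start + token.len as usize;
            println!("{:?} @ {:?}", &input[start..end], token.kind);
            start = end;
        }
        // The iterator stops just before the trailing Eof token, so every
        // input byte has been consumed at this point.
        assert_eq!(start, input.len());
    }
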
diff --git a/crates/pgt_lexer_new/src/cursor.rs b/crates/pgt_tokenizer/src/cursor.rs similarity index 100% rename from crates/pgt_lexer_new/src/cursor.rs rename to crates/pgt_tokenizer/src/cursor.rs diff --git a/crates/pgt_tokenizer/src/lib.rs b/crates/pgt_tokenizer/src/lib.rs new file mode 100644 index 00000000..d590eaab --- /dev/null +++ b/crates/pgt_tokenizer/src/lib.rs @@ -0,0 +1,805 @@ +mod cursor; +mod token; +use cursor::{Cursor, EOF_CHAR}; +pub use token::{Base, LiteralKind, Token, TokenKind}; + +// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346 +// ident_start [A-Za-z\200-\377_] +const fn is_ident_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{FF}') +} + +// ident_cont [A-Za-z\200-\377_0-9\$] +const fn is_ident_cont(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{FF}') +} + +// whitespace +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128 +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229 + +const fn is_space(c: char) -> bool { + matches!( + c, ' ' // space + ) +} + +const fn is_tab(c: char) -> bool { + matches!( + c, '\t' // tab + ) +} + +const fn is_newline(c: char) -> bool { + matches!( + c, '\n' // newline + ) +} + +const fn is_carriage_return(c: char) -> bool { + matches!( + c, '\r' // carriage return + ) +} + +const fn is_vertical_tab(c: char) -> bool { + matches!( + c, '\u{000B}' // vertical tab + ) +} + +const fn is_form_feed(c: char) -> bool { + matches!( + c, '\u{000C}' // form feed + ) +} + +impl Cursor<'_> { + // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339 + pub(crate) fn advance_token(&mut self) -> Token { + let Some(first_char) = self.bump() else { + return Token::new(TokenKind::Eof, 0); + }; + let token_kind = match first_char { + // Slash, comment or block comment. 
+ '/' => match self.first() { + '*' => self.block_comment(), + _ => TokenKind::Slash, + }, + '-' => match self.first() { + '-' => self.line_comment(), + _ => TokenKind::Minus, + }, + + c if is_space(c) => { + self.eat_while(is_space); + TokenKind::Space + } + + c if is_tab(c) => { + self.eat_while(is_tab); + TokenKind::Tab + } + + c if is_newline(c) => { + self.eat_while(is_newline); + TokenKind::Newline + } + + c if is_carriage_return(c) => { + self.eat_while(is_carriage_return); + TokenKind::CarriageReturn + } + + c if is_vertical_tab(c) => { + self.eat_while(is_vertical_tab); + TokenKind::VerticalTab + } + + c if is_form_feed(c) => { + self.eat_while(is_form_feed); + TokenKind::FormFeed + } + + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + 'u' | 'U' => match self.first() { + '&' => { + self.bump(); + self.prefixed_string( + |terminated| LiteralKind::UnicodeEscStr { terminated }, + true, + ) + } + _ => self.ident_or_unknown_prefix(), + }, + + // escaped strings + 'e' | 'E' => { + self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false) + } + + // bit string + 'b' | 'B' => { + self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false) + } + + // hexadecimal byte string + 'x' | 'X' => { + self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false) + } + + // Identifier (this should be checked after other variant that can + // start as identifier). + c if is_ident_start(c) => self.ident(), + + // Numeric literal. + // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + c @ '0'..='9' => { + let literal_kind = self.number(c); + TokenKind::Literal { kind: literal_kind } + } + '.' => match self.first() { + '0'..='9' => { + let literal_kind = self.number('.'); + TokenKind::Literal { kind: literal_kind } + } + _ => TokenKind::Dot, + }, + // One-symbol tokens. + ';' => TokenKind::Semi, + ',' => TokenKind::Comma, + '(' => TokenKind::OpenParen, + ')' => TokenKind::CloseParen, + '[' => TokenKind::OpenBracket, + ']' => TokenKind::CloseBracket, + '@' => TokenKind::At, + '#' => TokenKind::Pound, + '~' => TokenKind::Tilde, + '?' => TokenKind::Question, + ':' => TokenKind::Colon, + '$' => { + // Dollar quoted strings + if is_ident_start(self.first()) || self.first() == '$' { + self.dollar_quoted_string() + } else { + // Parameters + while self.first().is_ascii_digit() { + self.bump(); + } + TokenKind::PositionalParam + } + } + '`' => TokenKind::Backtick, + '=' => TokenKind::Eq, + '!' => TokenKind::Bang, + '<' => TokenKind::Lt, + '>' => TokenKind::Gt, + '&' => TokenKind::And, + '|' => TokenKind::Or, + '+' => TokenKind::Plus, + '*' => TokenKind::Star, + '^' => TokenKind::Caret, + '%' => TokenKind::Percent, + + // String literal + '\'' => { + let terminated = self.single_quoted_string(); + let kind = LiteralKind::Str { terminated }; + TokenKind::Literal { kind } + } + + // Quoted indentifiers + '"' => { + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => TokenKind::Unknown, + }; + let res = Token::new(token_kind, self.pos_within_token()); + self.reset_pos_within_token(); + res + } + pub(crate) fn ident(&mut self) -> TokenKind { + self.eat_while(is_ident_cont); + TokenKind::Ident + } + + fn ident_or_unknown_prefix(&mut self) -> TokenKind { + // Start is already eaten, eat the rest of identifier. + self.eat_while(is_ident_cont); + // Known prefixes must have been handled earlier. 
So if + // we see a prefix here, it is definitely an unknown prefix. + match self.first() { + '#' | '"' | '\'' => TokenKind::UnknownPrefix, + _ => TokenKind::Ident, + } + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227 + // comment ("--"{non_newline}*) + pub(crate) fn line_comment(&mut self) -> TokenKind { + self.bump(); + + self.eat_while(|c| c != '\n'); + TokenKind::LineComment + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344 + pub(crate) fn block_comment(&mut self) -> TokenKind { + self.bump(); + + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.first() == '*' => { + self.bump(); + depth += 1; + } + '*' if self.first() == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; + } + } + _ => (), + } + } + + TokenKind::BlockComment { + terminated: depth == 0, + } + } + + fn prefixed_string( + &mut self, + mk_kind: fn(bool) -> LiteralKind, + allows_double: bool, + ) -> TokenKind { + match self.first() { + '\'' => { + self.bump(); + let terminated = self.single_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + '"' if allows_double => { + self.bump(); + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => self.ident_or_unknown_prefix(), + } + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base. + match self.first() { + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403 + 'b' | 'B' => { + base = Base::Binary; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402 + 'o' | 'O' => { + base = Base::Octal; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401 + 'x' | 'X' => { + base = Base::Hexadecimal; + self.bump(); + if !self.eat_hexadecimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // Not a base prefix; consume additional digits. + '0'..='9' | '_' => { + self.eat_decimal_digits(); + } + + // Also not a base prefix; nothing more to do here. + '.' | 'e' | 'E' => {} + + // Just a 0. + _ => { + return LiteralKind::Int { + base, + empty_int: false, + }; + } + } + } else { + // No base prefix, parse number in the usual way. + self.eat_decimal_digits(); + }; + + match self.first() { + '.' 
=> { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.first().is_ascii_digit() { + self.eat_decimal_digits(); + match self.first() { + 'e' | 'E' => { + self.bump(); + empty_exponent = !self.eat_float_exponent(); + } + _ => (), + } + } else { + match self.first() { + 'e' | 'E' => { + self.bump(); + empty_exponent = !self.eat_float_exponent(); + } + _ => (), + } + } + LiteralKind::Float { + base, + empty_exponent, + } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = !self.eat_float_exponent(); + LiteralKind::Float { + base, + empty_exponent, + } + } + _ => LiteralKind::Int { + base, + empty_int: false, + }, + } + } + + fn single_quoted_string(&mut self) -> bool { + // Parse until either quotes are terminated or error is detected. + loop { + match self.first() { + // Quotes might be terminated. + '\'' => { + self.bump(); + + match self.first() { + // encountered an escaped quote '' + '\'' => { + self.bump(); + } + // encountered terminating quote + _ => return true, + } + } + // End of file, stop parsing. + EOF_CHAR if self.is_eof() => break, + // Skip the character. + _ => { + self.bump(); + } + } + } + // String was not terminated. + false + } + + /// Eats double-quoted string and returns true + /// if string is terminated. + fn double_quoted_string(&mut self) -> bool { + while let Some(c) = self.bump() { + match c { + '"' if self.first() == '"' => { + // Bump again to skip escaped character. + self.bump(); + } + '"' => { + return true; + } + _ => (), + } + } + // End of file reached. + false + } + + // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + fn dollar_quoted_string(&mut self) -> TokenKind { + // Get the start sequence of the dollar quote, i.e., 'foo' in + // $foo$hello$foo$ + let mut start = vec![]; + while let Some(c) = self.bump() { + match c { + '$' => { + self.bump(); + break; + } + _ => { + start.push(c); + } + } + } + + // we have a dollar quoted string deliminated with `$$` + if start.is_empty() { + loop { + self.eat_while(|c| c != '$'); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + // eat $ + self.bump(); + if self.first() == '$' { + self.bump(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: true }, + }; + } + } + } else { + loop { + self.eat_while(|c| c != start[0]); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + + // might be the start of our start/end sequence + let mut match_count = 0; + for start_char in &start { + if self.first() == *start_char { + self.bump(); + match_count += 1; + } else { + self.bump(); + break; + } + } + + // closing '$' + let terminated = match_count == start.len(); + if self.first() == '$' && terminated { + self.bump(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated }, + }; + } + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits 
+    }
+
+    /// Eats the float exponent. Returns true if at least one digit was met,
+    /// and returns false otherwise.
+    fn eat_float_exponent(&mut self) -> bool {
+        if self.first() == '-' || self.first() == '+' {
+            self.bump();
+        }
+        self.eat_decimal_digits()
+    }
+}
+
+/// Creates an iterator that produces tokens from the input string.
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
+    let mut cursor = Cursor::new(input);
+    std::iter::from_fn(move || {
+        let token = cursor.advance_token();
+        if token.kind != TokenKind::Eof {
+            Some(token)
+        } else {
+            None
+        }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fmt;
+
+    use super::*;
+    use insta::assert_debug_snapshot;
+
+    struct TokenDebug<'a> {
+        content: &'a str,
+        token: Token,
+    }
+    impl fmt::Debug for TokenDebug<'_> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
+        }
+    }
+
+    impl<'a> TokenDebug<'a> {
+        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
+            TokenDebug {
+                token,
+                content: &input[start as usize..(start + token.len) as usize],
+            }
+        }
+    }
+
+    fn lex(input: &str) -> Vec<TokenDebug<'_>> {
+        let mut tokens = vec![];
+        let mut start = 0;
+
+        for token in tokenize(input) {
+            let length = token.len;
+            tokens.push(TokenDebug::new(token, input, start));
+            start += length;
+        }
+        tokens
+    }
+    #[test]
+    fn lex_statement() {
+        let result = lex("select 1;");
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn block_comment() {
+        let result = lex(r#"
+/*
+ * foo
+ * bar
+*/"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn block_comment_unterminated() {
+        let result = lex(r#"
+/*
+ * foo
+ * bar
+ /*
+*/"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn line_comment() {
+        let result = lex(r#"
+-- foooooooooooo bar buzz
+"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn line_comment_whitespace() {
+        assert_debug_snapshot!(lex(r#"
+select 'Hello' -- This is a comment
+' World';"#))
+    }
+
+    #[test]
+    fn dollar_quoting() {
+        assert_debug_snapshot!(lex(r#"
+$$Dianne's horse$$
+$SomeTag$Dianne's horse$SomeTag$
+
+-- with dollar inside and matching tags
+$foo$hello$world$bar$
+"#))
+    }
+
+    #[test]
+    fn dollar_strings_part2() {
+        assert_debug_snapshot!(lex(r#"
+DO $doblock$
+end
+$doblock$;"#))
+    }
+
+    #[test]
+    fn dollar_quote_mismatch_tags_simple() {
+        assert_debug_snapshot!(lex(r#"
+-- dollar quoting with mismatched tags
+$foo$hello world$bar$
+"#));
+    }
+
+    #[test]
+    fn dollar_quote_mismatch_tags_complex() {
+        assert_debug_snapshot!(lex(r#"
+-- with dollar inside but mismatched tags
+$foo$hello$world$bar$
+"#));
+    }
+
+    #[test]
+    fn numeric() {
+        assert_debug_snapshot!(lex(r#"
+42
+3.5
+4.
+.001 +.123e10 +5e2 +1.925e-3 +1e-10 +1e+10 +1e10 +4664.E+5 +"#)) + } + + #[test] + fn numeric_non_decimal() { + assert_debug_snapshot!(lex(r#" +0b100101 +0B10011001 +0o273 +0O755 +0x42f +0XFFFF +"#)) + } + + #[test] + fn numeric_with_seperators() { + assert_debug_snapshot!(lex(r#" +1_500_000_000 +0b10001000_00000000 +0o_1_755 +0xFFFF_FFFF +1.618_034 +"#)) + } + + #[test] + fn select_with_period() { + assert_debug_snapshot!(lex(r#" +select public.users; +"#)) + } + + #[test] + fn bitstring() { + assert_debug_snapshot!(lex(r#" +B'1001' +b'1001' +X'1FF' +x'1FF' +"#)) + } + + #[test] + fn string() { + assert_debug_snapshot!(lex(r#" +'Dianne''s horse' + +select 'foo '' +bar'; + +select 'foooo' + 'bar'; + + +'foo \\ \n \tbar' + +'forgot to close the string +"#)) + } + + #[test] + fn params() { + assert_debug_snapshot!(lex(r#" +select $1 + $2; + +select $1123123123123; + +select $; +"#)) + } + + #[test] + fn string_with_escapes() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE + + assert_debug_snapshot!(lex(r#" +E'foo' + +e'bar' + +e'\b\f\n\r\t' + +e'\0\11\777' + +e'\x0\x11\xFF' + +e'\uAAAA \UFFFFFFFF' + +"#)) + } + + #[test] + fn string_unicode_escape() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + + assert_debug_snapshot!(lex(r#" +U&"d\0061t\+000061" + +U&"\0441\043B\043E\043D" + +u&'\0441\043B' + +U&"d!0061t!+000061" UESCAPE '!' +"#)) + } + + #[test] + fn quoted_ident() { + assert_debug_snapshot!(lex(r#" +"hello &1 -world"; + + +"hello-world +"#)) + } + + #[test] + fn quoted_ident_with_escape_quote() { + assert_debug_snapshot!(lex(r#" +"foo "" bar" +"#)) + } +} diff --git a/crates/pgt_lexer_new/src/token.rs b/crates/pgt_tokenizer/src/token.rs similarity index 100% rename from crates/pgt_lexer_new/src/token.rs rename to crates/pgt_tokenizer/src/token.rs From 2cdc659cdf8dd73e49c05a81758b933770a375d6 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 3 Jul 2025 08:59:25 +0200 Subject: [PATCH 3/8] progress --- .claude/settings.local.json | 11 ++ Cargo.lock | 2 + crates/pgt_diagnostics/src/display/message.rs | 9 + crates/pgt_lexer_new/Cargo.toml | 2 + crates/pgt_lexer_new/examples/basic_usage.rs | 44 +++++ crates/pgt_lexer_new/src/diagnostics.rs | 41 +++++ crates/pgt_lexer_new/src/lexed_str.rs | 8 +- crates/pgt_lexer_new/src/lib.rs | 157 ++++++++++++++++++ 8 files changed, 270 insertions(+), 4 deletions(-) create mode 100644 .claude/settings.local.json create mode 100644 crates/pgt_lexer_new/examples/basic_usage.rs create mode 100644 crates/pgt_lexer_new/src/diagnostics.rs diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..f99e7b98 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,11 @@ +{ + "permissions": { + "allow": [ + "Bash(grep:*)", + "Bash(rg:*)", + "Bash(cargo test:*)", + "Bash(cargo run:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 7d1853fb..7f5d6e3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2803,7 +2803,9 @@ name = "pgt_lexer_new" version = "0.0.0" dependencies = [ "insta", + "pgt_diagnostics", "pgt_lexer_new_codegen", + "pgt_text_size", "pgt_tokenizer", ] diff --git a/crates/pgt_diagnostics/src/display/message.rs b/crates/pgt_diagnostics/src/display/message.rs index 3cf9be3f..20c039a9 100644 --- a/crates/pgt_diagnostics/src/display/message.rs +++ b/crates/pgt_diagnostics/src/display/message.rs @@ -47,6 +47,15 @@ impl From for MessageAndDescription { } } +impl 
From<&str> for MessageAndDescription { + fn from(description: &str) -> Self { + Self { + message: markup! { {description} }.to_owned(), + description: description.into(), + } + } +} + impl From for MessageAndDescription { fn from(message: MarkupBuf) -> Self { let description = markup_to_string(&message); diff --git a/crates/pgt_lexer_new/Cargo.toml b/crates/pgt_lexer_new/Cargo.toml index 0ea86475..37cdeac6 100644 --- a/crates/pgt_lexer_new/Cargo.toml +++ b/crates/pgt_lexer_new/Cargo.toml @@ -12,7 +12,9 @@ version = "0.0.0" [dependencies] +pgt_diagnostics.workspace = true pgt_lexer_new_codegen.workspace = true +pgt_text_size.workspace = true pgt_tokenizer.workspace = true [dev-dependencies] diff --git a/crates/pgt_lexer_new/examples/basic_usage.rs b/crates/pgt_lexer_new/examples/basic_usage.rs new file mode 100644 index 00000000..26d8124c --- /dev/null +++ b/crates/pgt_lexer_new/examples/basic_usage.rs @@ -0,0 +1,44 @@ +use pgt_lexer_new::{SyntaxKind, lex}; + +fn main() { + let sql = "SELECT id, name FROM users WHERE active = true;"; + let lexed = lex(sql); + + println!("Total tokens: {}", lexed.len()); + println!("\nToken details:"); + + // Iterate over tokens + for (idx, kind) in lexed.tokens().enumerate() { + // Skip whitespace for cleaner output + if matches!( + kind, + SyntaxKind::SPACE | SyntaxKind::TAB | SyntaxKind::NEWLINE + ) { + continue; + } + + let range = lexed.range(idx); + let text = lexed.text(idx); + + println!(" [{:3}] {:?} @ {:?} = {:?}", idx, kind, range, text); + } + + // Check for errors + let errors = lexed.errors(); + if !errors.is_empty() { + println!("\nLexing errors:"); + for error in errors { + println!(" Error at {:?}: {}", error.span, error.message); + } + } else { + println!("\nNo lexing errors found."); + } + + // Example: Find all identifiers + println!("\nIdentifiers found:"); + for (idx, kind) in lexed.tokens().enumerate() { + if kind == SyntaxKind::IDENT { + println!(" - {} at {:?}", lexed.text(idx), lexed.range(idx)); + } + } +} diff --git a/crates/pgt_lexer_new/src/diagnostics.rs b/crates/pgt_lexer_new/src/diagnostics.rs new file mode 100644 index 00000000..41f05f48 --- /dev/null +++ b/crates/pgt_lexer_new/src/diagnostics.rs @@ -0,0 +1,41 @@ +use pgt_diagnostics::{Diagnostic, MessageAndDescription}; +use pgt_text_size::TextRange; + +/// A specialized diagnostic for lex errors. 
+#[derive(Clone, Debug, Diagnostic, PartialEq)]
+#[diagnostic(category = "syntax", severity = Error)]
+pub struct LexError {
+    /// The location where the error occurred
+    #[location(span)]
+    pub span: TextRange,
+    #[message]
+    #[description]
+    pub message: MessageAndDescription,
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::lex;
+
+    #[test]
+    fn finds_lex_errors() {
+        // Test with unterminated block comment
+        let input = "/* unterminated comment";
+        let lexed = lex(input);
+        let errors = lexed.errors();
+
+        // Should have error for unterminated block comment
+        assert!(!errors.is_empty());
+        assert!(errors[0].message.to_string().contains("Missing trailing"));
+        assert!(errors[0].span.start() < errors[0].span.end());
+
+        // Test with unterminated string
+        let input2 = "SELECT 'unterminated string";
+        let lexed2 = lex(input2);
+        let errors2 = lexed2.errors();
+
+        // Should have error for unterminated string
+        assert!(!errors2.is_empty());
+        assert!(errors2[0].message.to_string().contains("Missing trailing"));
+    }
+}
diff --git a/crates/pgt_lexer_new/src/lexed_str.rs b/crates/pgt_lexer_new/src/lexed_str.rs
index 6bbeec90..d085cf6b 100644
--- a/crates/pgt_lexer_new/src/lexed_str.rs
+++ b/crates/pgt_lexer_new/src/lexed_str.rs
@@ -31,20 +31,20 @@ impl<'a> LexedStr<'a> {
         conv.finalize_with_eof()
     }
 
-    pub(crate) fn len(&self) -> usize {
+    pub fn len(&self) -> usize {
         self.kind.len() - 1
     }
 
-    pub(crate) fn kind(&self, i: usize) -> SyntaxKind {
+    pub fn kind(&self, i: usize) -> SyntaxKind {
         assert!(i < self.len());
         self.kind[i]
     }
 
-    pub(crate) fn text(&self, i: usize) -> &str {
+    pub fn text(&self, i: usize) -> &str {
         self.range_text(i..i + 1)
     }
 
-    pub(crate) fn range_text(&self, r: ops::Range<usize>) -> &str {
+    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
         assert!(r.start < r.end && r.end <= self.len());
         let lo = self.start[r.start] as usize;
         let hi = self.start[r.end] as usize;
diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs
index d1c34c1b..2326149a 100644
--- a/crates/pgt_lexer_new/src/lib.rs
+++ b/crates/pgt_lexer_new/src/lib.rs
@@ -1,4 +1,161 @@
 mod codegen;
+mod diagnostics;
 mod lexed_str;
+
+use diagnostics::LexError;
+use lexed_str::LexedStr;
+use pgt_text_size::TextRange;
 
 pub use crate::codegen::syntax_kind::SyntaxKind;
+
+/// Result of lexing a string, providing access to tokens and diagnostics
+pub struct Lexed<'a> {
+    inner: LexedStr<'a>,
+}
+
+impl<'a> Lexed<'a> {
+    /// Returns the number of tokens (excluding EOF)
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    /// Returns true if there are no tokens
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns an iterator over token kinds
+    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
+        (0..self.len()).map(move |i| self.inner.kind(i))
+    }
+
+    /// Returns the kind of token at the given index
+    pub fn kind(&self, idx: usize) -> SyntaxKind {
+        self.inner.kind(idx)
+    }
+
+    /// Returns the text range of token at the given index
+    pub fn range(&self, idx: usize) -> TextRange {
+        let range = self.inner.text_range(idx);
+        TextRange::new(
+            range.start.try_into().unwrap(),
+            range.end.try_into().unwrap(),
+        )
+    }
+
+    /// Returns the text of token at the given index
+    pub fn text(&self, idx: usize) -> &str {
+        self.inner.text(idx)
+    }
+
+    /// Returns all lexing errors with their text ranges
+    pub fn errors(&self) -> Vec<LexError> {
+        self.inner
+            .errors()
+            .map(|(i, msg)| {
+                let range = self.inner.text_range(i);
+                LexError {
+                    message: msg.into(),
+                    span: TextRange::new(
+                        range.start.try_into().unwrap(),
+ range.end.try_into().unwrap(), + ), + } + }) + .collect() + } +} + +/// Lex the input string into tokens and diagnostics +pub fn lex(input: &str) -> Lexed { + Lexed { + inner: LexedStr::new(input), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_lexing() { + let input = "SELECT * FROM users WHERE id = 1;"; + let lexed = lex(input); + + // Check we have tokens + assert!(!lexed.is_empty()); + + // Iterate over tokens and collect identifiers + let mut identifiers = Vec::new(); + for (idx, kind) in lexed.tokens().enumerate() { + if kind == SyntaxKind::IDENT { + identifiers.push((lexed.text(idx), lexed.range(idx))); + } + } + + // Should find at least "users" and "id" as identifiers + assert!(identifiers.len() >= 2); + } + + #[test] + fn test_lexing_with_errors() { + let input = "SELECT 'unterminated string"; + let lexed = lex(input); + + // Should have tokens + assert!(!lexed.is_empty()); + + // Should have an error for unterminated string + let errors = lexed.errors(); + assert!(!errors.is_empty()); + // Check the error message exists + assert!(!errors[0].message.to_string().is_empty()); + } + + #[test] + fn test_token_ranges() { + let input = "SELECT id"; + let lexed = lex(input); + + // First token should be a keyword (SELECT gets parsed as a keyword) + let _first_kind = lexed.kind(0); + assert_eq!(u32::from(lexed.range(0).start()), 0); + assert_eq!(u32::from(lexed.range(0).end()), 6); + assert_eq!(lexed.text(0), "SELECT"); + + // Find the id token + for (idx, kind) in lexed.tokens().enumerate() { + if kind == SyntaxKind::IDENT && lexed.text(idx) == "id" { + assert_eq!(u32::from(lexed.range(idx).start()), 7); + assert_eq!(u32::from(lexed.range(idx).end()), 9); + } + } + } + + #[test] + fn test_empty_input() { + let input = ""; + let lexed = lex(input); + assert!(lexed.is_empty()); + assert_eq!(lexed.len(), 0); + } + + #[test] + fn test_whitespace_handling() { + let input = " SELECT \n id "; + let lexed = lex(input); + + // Collect non-whitespace tokens + let mut non_whitespace = Vec::new(); + for (idx, kind) in lexed.tokens().enumerate() { + if !matches!( + kind, + SyntaxKind::SPACE | SyntaxKind::TAB | SyntaxKind::NEWLINE + ) { + non_whitespace.push(lexed.text(idx)); + } + } + + assert_eq!(non_whitespace.len(), 2); // SELECT and id + } +} From 86e83c8f3a781a7d0c33a03a2eb2c985620ed446 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 3 Jul 2025 09:00:00 +0200 Subject: [PATCH 4/8] progress --- crates/pgt_lexer_new/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs index 2326149a..e4dd3a25 100644 --- a/crates/pgt_lexer_new/src/lib.rs +++ b/crates/pgt_lexer_new/src/lib.rs @@ -13,7 +13,7 @@ pub struct Lexed<'a> { inner: LexedStr<'a>, } -impl<'a> Lexed<'a> { +impl Lexed<'_> { /// Returns the number of tokens (excluding EOF) pub fn len(&self) -> usize { self.inner.len() From 8794776b8182815ca0c5edf8e4d177e381f50a7d Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 3 Jul 2025 09:00:32 +0200 Subject: [PATCH 5/8] progress --- crates/pgt_lexer_new/src/lexed_str.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crates/pgt_lexer_new/src/lexed_str.rs b/crates/pgt_lexer_new/src/lexed_str.rs index d085cf6b..99eaec88 100644 --- a/crates/pgt_lexer_new/src/lexed_str.rs +++ b/crates/pgt_lexer_new/src/lexed_str.rs @@ -58,10 +58,6 @@ impl<'a> LexedStr<'a> { let hi = self.start[i + 1] as usize; lo..hi } - pub fn text_start(&self, i: usize) -> usize { - assert!(i <= self.len()); - 
self.start[i] as usize
-    }
 
     pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
         self.error

From 1676f3b7bbd644e0e46a75df7a1f644cfa00a135 Mon Sep 17 00:00:00 2001
From: psteinroe
Date: Thu, 3 Jul 2025 09:27:04 +0200
Subject: [PATCH 6/8] progress

---
 crates/pgt_lexer_new/examples/basic_usage.rs |  44 --
 crates/pgt_lexer_new/src/diagnostics.rs      |  27 -
 crates/pgt_lexer_new/src/lib.rs              |  24 +
 crates/pgt_lexer_new_codegen/build.rs        |  18 +-
 .../postgres/17-6.1.0/kwlist.h               | 518 ++++++++++++++++++
 5 files changed, 550 insertions(+), 81 deletions(-)
 delete mode 100644 crates/pgt_lexer_new/examples/basic_usage.rs
 create mode 100644 crates/pgt_lexer_new_codegen/postgres/17-6.1.0/kwlist.h

diff --git a/crates/pgt_lexer_new/examples/basic_usage.rs b/crates/pgt_lexer_new/examples/basic_usage.rs
deleted file mode 100644
index 26d8124c..00000000
--- a/crates/pgt_lexer_new/examples/basic_usage.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-use pgt_lexer_new::{SyntaxKind, lex};
-
-fn main() {
-    let sql = "SELECT id, name FROM users WHERE active = true;";
-    let lexed = lex(sql);
-
-    println!("Total tokens: {}", lexed.len());
-    println!("\nToken details:");
-
-    // Iterate over tokens
-    for (idx, kind) in lexed.tokens().enumerate() {
-        // Skip whitespace for cleaner output
-        if matches!(
-            kind,
-            SyntaxKind::SPACE | SyntaxKind::TAB | SyntaxKind::NEWLINE
-        ) {
-            continue;
-        }
-
-        let range = lexed.range(idx);
-        let text = lexed.text(idx);
-
-        println!("  [{:3}] {:?} @ {:?} = {:?}", idx, kind, range, text);
-    }
-
-    // Check for errors
-    let errors = lexed.errors();
-    if !errors.is_empty() {
-        println!("\nLexing errors:");
-        for error in errors {
-            println!("  Error at {:?}: {}", error.span, error.message);
-        }
-    } else {
-        println!("\nNo lexing errors found.");
-    }
-
-    // Example: Find all identifiers
-    println!("\nIdentifiers found:");
-    for (idx, kind) in lexed.tokens().enumerate() {
-        if kind == SyntaxKind::IDENT {
-            println!("  - {} at {:?}", lexed.text(idx), lexed.range(idx));
-        }
-    }
-}
diff --git a/crates/pgt_lexer_new/src/diagnostics.rs b/crates/pgt_lexer_new/src/diagnostics.rs
index 41f05f48..e92cb27c 100644
--- a/crates/pgt_lexer_new/src/diagnostics.rs
+++ b/crates/pgt_lexer_new/src/diagnostics.rs
@@ -12,30 +12,3 @@ pub struct LexError {
     #[description]
     pub message: MessageAndDescription,
 }
-
-#[cfg(test)]
-mod tests {
-    use crate::lex;
-
-    #[test]
-    fn finds_lex_errors() {
-        // Test with unterminated block comment
-        let input = "/* unterminated comment";
-        let lexed = lex(input);
-        let errors = lexed.errors();
-
-        // Should have error for unterminated block comment
-        assert!(!errors.is_empty());
-        assert!(errors[0].message.to_string().contains("Missing trailing"));
-        assert!(errors[0].span.start() < errors[0].span.end());
-
-        // Test with unterminated string
-        let input2 = "SELECT 'unterminated string";
-        let lexed2 = lex(input2);
-        let errors2 = lexed2.errors();
-
-        // Should have error for unterminated string
-        assert!(!errors2.is_empty());
-        assert!(errors2[0].message.to_string().contains("Missing trailing"));
-    }
-}
diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs
index e4dd3a25..fdbafeda 100644
--- a/crates/pgt_lexer_new/src/lib.rs
+++ b/crates/pgt_lexer_new/src/lib.rs
@@ -9,6 +9,8 @@ use pgt_text_size::TextRange;
 pub use crate::codegen::syntax_kind::SyntaxKind;
 
 /// Result of lexing a string, providing access to tokens and diagnostics
+///
+/// Thin wrapper around LexedStr for better API ergonomics
 pub struct Lexed<'a> {
     inner: LexedStr<'a>,
 }
@@ -158,4 +160,26 @@ mod tests {
        assert_eq!(non_whitespace.len(), 2); // SELECT and id
     }
+
+    #[test]
+    fn finds_lex_errors() {
+        // Test with unterminated block comment
+        let input = "/* unterminated comment";
+        let lexed = lex(input);
+        let errors = lexed.errors();
+
+        // Should have error for unterminated block comment
+        assert!(!errors.is_empty());
+        assert!(errors[0].message.to_string().contains("Missing trailing"));
+        assert!(errors[0].span.start() < errors[0].span.end());
+
+        // Test with unterminated string
+        let input2 = "SELECT 'unterminated string";
+        let lexed2 = lex(input2);
+        let errors2 = lexed2.errors();
+
+        // Should have error for unterminated string
+        assert!(!errors2.is_empty());
+        assert!(errors2[0].message.to_string().contains("Missing trailing"));
+    }
 }
diff --git a/crates/pgt_lexer_new_codegen/build.rs b/crates/pgt_lexer_new_codegen/build.rs
index 3cd71002..70c9635d 100644
--- a/crates/pgt_lexer_new_codegen/build.rs
+++ b/crates/pgt_lexer_new_codegen/build.rs
@@ -10,19 +10,19 @@ static LIBPG_QUERY_TAG: &str = "17-6.1.0";
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     let version = LIBPG_QUERY_TAG.to_string();
 
-    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
-    let vendor_dir = out_dir.join("vendor");
-    let libpg_query_dir = vendor_dir.join("libpg_query").join(&version);
-    let kwlist_path = libpg_query_dir.join("kwlist.h");
-    let stamp_file = libpg_query_dir.join(".stamp");
+    // Check for the postgres header file in the source tree first
+    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?);
+    let headers_dir = manifest_dir.join("postgres").join(&version);
+    let kwlist_path = headers_dir.join("kwlist.h");
 
-    if !stamp_file.exists() {
+    // Only download if the file doesn't exist
+    if !kwlist_path.exists() {
         println!(
             "cargo:warning=Downloading kwlist.h for libpg_query {}",
             version
         );
 
-        fs::create_dir_all(&libpg_query_dir)?;
+        fs::create_dir_all(&headers_dir)?;
 
         let proto_url = format!(
             "https://raw.githubusercontent.com/pganalyze/libpg_query/{}/src/postgres/include/parser/kwlist.h",
@@ -35,8 +35,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         let mut file = fs::File::create(&kwlist_path)?;
         file.write_all(content.as_bytes())?;
 
-        fs::File::create(&stamp_file)?;
-
         println!("cargo:warning=Successfully downloaded kwlist.h");
     }
 
@@ -45,7 +43,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         kwlist_path.display()
     );
 
-    println!("cargo:rerun-if-changed={}", stamp_file.display());
+    println!("cargo:rerun-if-changed={}", kwlist_path.display());
 
     Ok(())
 }
diff --git a/crates/pgt_lexer_new_codegen/postgres/17-6.1.0/kwlist.h b/crates/pgt_lexer_new_codegen/postgres/17-6.1.0/kwlist.h
new file mode 100644
index 00000000..658d7ff6
--- /dev/null
+++ b/crates/pgt_lexer_new_codegen/postgres/17-6.1.0/kwlist.h
@@ -0,0 +1,518 @@
+/*-------------------------------------------------------------------------
+ *
+ * kwlist.h
+ *
+ * The keyword lists are kept in their own source files for use by
+ * automatic tools. The exact representation of a keyword is determined
+ * by the PG_KEYWORD macro, which is not defined in this file; it can
+ * be defined by the caller for special purposes.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *   src/include/parser/kwlist.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef KWLIST_H here */
+
+/*
+ * List of keyword (name, token-value, category, bare-label-status) entries.
+ * + * Note: gen_keywordlist.pl requires the entries to appear in ASCII order. + */ + +/* name, value, category, is-bare-label */ +PG_KEYWORD("abort", ABORT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("absent", ABSENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("absolute", ABSOLUTE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("access", ACCESS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("action", ACTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("add", ADD_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("admin", ADMIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("after", AFTER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("aggregate", AGGREGATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("all", ALL, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("also", ALSO, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("alter", ALTER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("always", ALWAYS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("analyse", ANALYSE, RESERVED_KEYWORD, BARE_LABEL) /* British spelling */ +PG_KEYWORD("analyze", ANALYZE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("and", AND, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("any", ANY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("array", ARRAY, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("as", AS, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("asc", ASC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("asensitive", ASENSITIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("assertion", ASSERTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("assignment", ASSIGNMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("asymmetric", ASYMMETRIC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("at", AT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("atomic", ATOMIC, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("attach", ATTACH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("attribute", ATTRIBUTE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("authorization", AUTHORIZATION, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("backward", BACKWARD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("before", BEFORE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("begin", BEGIN_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("between", BETWEEN, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("bigint", BIGINT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("binary", BINARY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("bit", BIT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("boolean", BOOLEAN_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("both", BOTH, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("breadth", BREADTH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("by", BY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cache", CACHE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("call", CALL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("called", CALLED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cascade", CASCADE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cascaded", CASCADED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("case", CASE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cast", CAST, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("catalog", CATALOG_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("chain", CHAIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("char", CHAR_P, COL_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("character", CHARACTER, COL_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("characteristics", CHARACTERISTICS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("check", CHECK, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("checkpoint", CHECKPOINT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("class", CLASS, UNRESERVED_KEYWORD, BARE_LABEL) 
+PG_KEYWORD("close", CLOSE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cluster", CLUSTER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("coalesce", COALESCE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("collate", COLLATE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("collation", COLLATION, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("column", COLUMN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("columns", COLUMNS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("comment", COMMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("comments", COMMENTS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("commit", COMMIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("committed", COMMITTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("compression", COMPRESSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("concurrently", CONCURRENTLY, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("conditional", CONDITIONAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("configuration", CONFIGURATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("conflict", CONFLICT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("connection", CONNECTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("constraint", CONSTRAINT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("constraints", CONSTRAINTS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("content", CONTENT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("continue", CONTINUE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("conversion", CONVERSION_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("copy", COPY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cost", COST, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("create", CREATE, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("cross", CROSS, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("csv", CSV, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cube", CUBE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current", CURRENT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_catalog", CURRENT_CATALOG, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_date", CURRENT_DATE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_role", CURRENT_ROLE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_schema", CURRENT_SCHEMA, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_time", CURRENT_TIME, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_timestamp", CURRENT_TIMESTAMP, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("current_user", CURRENT_USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cursor", CURSOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("cycle", CYCLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("data", DATA_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("database", DATABASE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("day", DAY_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("deallocate", DEALLOCATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("dec", DEC, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("decimal", DECIMAL_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("declare", DECLARE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("default", DEFAULT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("defaults", DEFAULTS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("deferrable", DEFERRABLE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("deferred", DEFERRED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("definer", DEFINER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("delete", DELETE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("delimiter", DELIMITER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("delimiters", DELIMITERS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("depends", DEPENDS, 
UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("depth", DEPTH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("desc", DESC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("detach", DETACH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("dictionary", DICTIONARY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("disable", DISABLE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("discard", DISCARD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("distinct", DISTINCT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("do", DO, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("document", DOCUMENT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("domain", DOMAIN_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("double", DOUBLE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("drop", DROP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("each", EACH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("else", ELSE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("empty", EMPTY_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("enable", ENABLE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("encoding", ENCODING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("encrypted", ENCRYPTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("end", END_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("enum", ENUM_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("error", ERROR_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("escape", ESCAPE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("event", EVENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("except", EXCEPT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("exclude", EXCLUDE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("excluding", EXCLUDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("exclusive", EXCLUSIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("execute", EXECUTE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("exists", EXISTS, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("explain", EXPLAIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("expression", EXPRESSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("extension", EXTENSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("external", EXTERNAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("extract", EXTRACT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("false", FALSE_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("family", FAMILY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("fetch", FETCH, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("filter", FILTER, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("finalize", FINALIZE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("first", FIRST_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("float", FLOAT_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("following", FOLLOWING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("for", FOR, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("force", FORCE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("foreign", FOREIGN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("format", FORMAT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("forward", FORWARD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("freeze", FREEZE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("from", FROM, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("full", FULL, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("function", FUNCTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("functions", FUNCTIONS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("generated", GENERATED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("global", GLOBAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("grant", GRANT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("granted", GRANTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("greatest", 
GREATEST, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("group", GROUP_P, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("grouping", GROUPING, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("groups", GROUPS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("handler", HANDLER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("having", HAVING, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("header", HEADER_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("hold", HOLD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("hour", HOUR_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("identity", IDENTITY_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("if", IF_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("ilike", ILIKE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("immediate", IMMEDIATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("immutable", IMMUTABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("implicit", IMPLICIT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("import", IMPORT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("in", IN_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("include", INCLUDE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("including", INCLUDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("increment", INCREMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("indent", INDENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("index", INDEX, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("indexes", INDEXES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inherit", INHERIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inherits", INHERITS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("initially", INITIALLY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inline", INLINE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("inner", INNER_P, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("inout", INOUT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("input", INPUT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("insensitive", INSENSITIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("insert", INSERT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("instead", INSTEAD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("int", INT_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("integer", INTEGER, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("intersect", INTERSECT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("interval", INTERVAL, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("into", INTO, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("invoker", INVOKER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("is", IS, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("isnull", ISNULL, TYPE_FUNC_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("isolation", ISOLATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("join", JOIN, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json", JSON, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_array", JSON_ARRAY, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_arrayagg", JSON_ARRAYAGG, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_exists", JSON_EXISTS, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_object", JSON_OBJECT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_objectagg", JSON_OBJECTAGG, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_query", JSON_QUERY, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_scalar", JSON_SCALAR, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_serialize", JSON_SERIALIZE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_table", JSON_TABLE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("json_value", JSON_VALUE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("keep", KEEP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("key", KEY, UNRESERVED_KEYWORD, 
BARE_LABEL) +PG_KEYWORD("keys", KEYS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("label", LABEL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("language", LANGUAGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("large", LARGE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("last", LAST_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("lateral", LATERAL_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("leading", LEADING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("leakproof", LEAKPROOF, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("least", LEAST, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("left", LEFT, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("level", LEVEL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("like", LIKE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("limit", LIMIT, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("listen", LISTEN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("load", LOAD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("local", LOCAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("localtime", LOCALTIME, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("localtimestamp", LOCALTIMESTAMP, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("location", LOCATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("lock", LOCK_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("locked", LOCKED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("logged", LOGGED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("mapping", MAPPING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("match", MATCH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("matched", MATCHED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("materialized", MATERIALIZED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("maxvalue", MAXVALUE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("merge", MERGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("merge_action", MERGE_ACTION, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("method", METHOD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("minute", MINUTE_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("minvalue", MINVALUE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("mode", MODE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("month", MONTH_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("move", MOVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("name", NAME_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("names", NAMES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("national", NATIONAL, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("natural", NATURAL, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("nchar", NCHAR, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("nested", NESTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("new", NEW, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("next", NEXT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfc", NFC, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfd", NFD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfkc", NFKC, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nfkd", NFKD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("no", NO, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("none", NONE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("normalize", NORMALIZE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("normalized", NORMALIZED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("not", NOT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("nothing", NOTHING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("notify", NOTIFY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("notnull", NOTNULL, TYPE_FUNC_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("nowait", NOWAIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("null", NULL_P, RESERVED_KEYWORD, BARE_LABEL) 
+PG_KEYWORD("nullif", NULLIF, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("nulls", NULLS_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("numeric", NUMERIC, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("object", OBJECT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("of", OF, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("off", OFF, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("offset", OFFSET, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("oids", OIDS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("old", OLD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("omit", OMIT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("on", ON, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("only", ONLY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("operator", OPERATOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("option", OPTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("options", OPTIONS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("or", OR, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("order", ORDER, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("ordinality", ORDINALITY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("others", OTHERS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("out", OUT_P, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("outer", OUTER_P, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("over", OVER, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("overlaps", OVERLAPS, TYPE_FUNC_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("overlay", OVERLAY, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("overriding", OVERRIDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("owned", OWNED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("owner", OWNER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("parallel", PARALLEL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("parameter", PARAMETER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("parser", PARSER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("partial", PARTIAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("partition", PARTITION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("passing", PASSING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("password", PASSWORD, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("path", PATH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("placing", PLACING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("plan", PLAN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("plans", PLANS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("policy", POLICY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("position", POSITION, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("preceding", PRECEDING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("precision", PRECISION, COL_NAME_KEYWORD, AS_LABEL) +PG_KEYWORD("prepare", PREPARE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("prepared", PREPARED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("preserve", PRESERVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("primary", PRIMARY, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("prior", PRIOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("privileges", PRIVILEGES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("procedures", PROCEDURES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("quotes", QUOTES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("range", RANGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("read", READ, UNRESERVED_KEYWORD, BARE_LABEL) 
+PG_KEYWORD("real", REAL, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("reassign", REASSIGN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("recheck", RECHECK, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("recursive", RECURSIVE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("ref", REF_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("references", REFERENCES, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("referencing", REFERENCING, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("refresh", REFRESH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("reindex", REINDEX, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("relative", RELATIVE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("release", RELEASE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rename", RENAME, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("repeatable", REPEATABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("replace", REPLACE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("replica", REPLICA, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("reset", RESET, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("restart", RESTART, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("restrict", RESTRICT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("return", RETURN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("returning", RETURNING, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("returns", RETURNS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("revoke", REVOKE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("right", RIGHT, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("role", ROLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rollback", ROLLBACK, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rollup", ROLLUP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("routine", ROUTINE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("routines", ROUTINES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("row", ROW, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("rows", ROWS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("rule", RULE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("savepoint", SAVEPOINT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("scalar", SCALAR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("schema", SCHEMA, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("schemas", SCHEMAS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("scroll", SCROLL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("search", SEARCH, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("second", SECOND_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("security", SECURITY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("select", SELECT, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sequence", SEQUENCE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sequences", SEQUENCES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("serializable", SERIALIZABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("server", SERVER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("session", SESSION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("session_user", SESSION_USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("set", SET, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("setof", SETOF, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("sets", SETS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("share", SHARE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("show", SHOW, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("similar", SIMILAR, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("simple", SIMPLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("skip", SKIP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("smallint", SMALLINT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("snapshot", SNAPSHOT, UNRESERVED_KEYWORD, BARE_LABEL) 
+PG_KEYWORD("some", SOME, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("source", SOURCE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sql", SQL_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stable", STABLE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("standalone", STANDALONE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("start", START, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("statement", STATEMENT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("statistics", STATISTICS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stdin", STDIN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stdout", STDOUT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("storage", STORAGE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("stored", STORED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("strict", STRICT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("string", STRING_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("strip", STRIP_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("subscription", SUBSCRIPTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("substring", SUBSTRING, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("support", SUPPORT, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("symmetric", SYMMETRIC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("sysid", SYSID, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("system", SYSTEM_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("system_user", SYSTEM_USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("table", TABLE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("tables", TABLES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("tablesample", TABLESAMPLE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("tablespace", TABLESPACE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("target", TARGET, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("temp", TEMP, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("template", TEMPLATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("temporary", TEMPORARY, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("text", TEXT_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("then", THEN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("ties", TIES, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("time", TIME, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("timestamp", TIMESTAMP, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("to", TO, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("trailing", TRAILING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("transaction", TRANSACTION, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("transform", TRANSFORM, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("treat", TREAT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("trigger", TRIGGER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("trim", TRIM, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("true", TRUE_P, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("truncate", TRUNCATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("trusted", TRUSTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("type", TYPE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("types", TYPES_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("uescape", UESCAPE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unbounded", UNBOUNDED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("uncommitted", UNCOMMITTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unconditional", UNCONDITIONAL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unencrypted", UNENCRYPTED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("union", UNION, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("unique", UNIQUE, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unknown", UNKNOWN, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("unlisten", UNLISTEN, UNRESERVED_KEYWORD, 
BARE_LABEL) +PG_KEYWORD("unlogged", UNLOGGED, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("until", UNTIL, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("update", UPDATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("user", USER, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("using", USING, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("vacuum", VACUUM, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("valid", VALID, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("validate", VALIDATE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("validator", VALIDATOR, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("value", VALUE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("values", VALUES, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("varchar", VARCHAR, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("variadic", VARIADIC, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("varying", VARYING, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("verbose", VERBOSE, TYPE_FUNC_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("version", VERSION_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("view", VIEW, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("views", VIEWS, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("volatile", VOLATILE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("when", WHEN, RESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("where", WHERE, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("whitespace", WHITESPACE_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("window", WINDOW, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("with", WITH, RESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("within", WITHIN, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("without", WITHOUT, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("work", WORK, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("wrapper", WRAPPER, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("write", WRITE, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("xml", XML_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlattributes", XMLATTRIBUTES, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlconcat", XMLCONCAT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlelement", XMLELEMENT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlexists", XMLEXISTS, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlforest", XMLFOREST, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlnamespaces", XMLNAMESPACES, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlparse", XMLPARSE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlpi", XMLPI, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlroot", XMLROOT, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmlserialize", XMLSERIALIZE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("xmltable", XMLTABLE, COL_NAME_KEYWORD, BARE_LABEL) +PG_KEYWORD("year", YEAR_P, UNRESERVED_KEYWORD, AS_LABEL) +PG_KEYWORD("yes", YES_P, UNRESERVED_KEYWORD, BARE_LABEL) +PG_KEYWORD("zone", ZONE, UNRESERVED_KEYWORD, BARE_LABEL) From 0ce720312d939aff080e88bc843b4034d7ce4a6a Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 3 Jul 2025 10:12:47 +0200 Subject: [PATCH 7/8] progress --- crates/pgt_lexer_new/src/lexed.rs | 96 ++++++++++++ .../src/{lexed_str.rs => lexer.rs} | 138 ++++-------------- crates/pgt_lexer_new/src/lib.rs | 73 +-------- 3 files changed, 133 insertions(+), 174 deletions(-) create mode 100644 crates/pgt_lexer_new/src/lexed.rs rename crates/pgt_lexer_new/src/{lexed_str.rs => lexer.rs} (62%) diff --git a/crates/pgt_lexer_new/src/lexed.rs b/crates/pgt_lexer_new/src/lexed.rs new file mode 100644 index 00000000..e5ac45d8 --- /dev/null +++ b/crates/pgt_lexer_new/src/lexed.rs @@ -0,0 +1,96 @@ +use pgt_diagnostics::MessageAndDescription; +use pgt_text_size::TextRange; + +use crate::SyntaxKind; 
+
+/// Internal error type used during lexing
+#[derive(Debug, Clone)]
+pub struct LexError {
+    pub msg: String,
+    pub token: u32,
+}
+
+/// A specialized diagnostic for lex errors.
+#[derive(Clone, Debug, PartialEq)]
+pub struct LexDiagnostic {
+    /// The location where the error occurred
+    pub span: TextRange,
+    /// The error message
+    pub message: MessageAndDescription,
+}
+
+/// Result of lexing a string, providing access to tokens and diagnostics
+pub struct Lexed<'a> {
+    pub(crate) text: &'a str,
+    pub(crate) kind: Vec<SyntaxKind>,
+    pub(crate) start: Vec<u32>,
+    pub(crate) error: Vec<LexError>,
+}
+
+impl<'a> Lexed<'a> {
+    /// Returns the number of tokens (excluding EOF)
+    pub fn len(&self) -> usize {
+        self.kind.len() - 1
+    }
+
+    /// Returns true if there are no tokens
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns an iterator over token kinds
+    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
+        (0..self.len()).map(move |i| self.kind(i))
+    }
+
+    /// Returns the kind of the token at the given index
+    pub fn kind(&self, idx: usize) -> SyntaxKind {
+        assert!(idx < self.len());
+        self.kind[idx]
+    }
+
+    /// Returns the text range of the token at the given index
+    pub fn range(&self, idx: usize) -> TextRange {
+        let range = self.text_range(idx);
+        TextRange::new(
+            range.start.try_into().unwrap(),
+            range.end.try_into().unwrap(),
+        )
+    }
+
+    /// Returns the text of the token at the given index
+    pub fn text(&self, idx: usize) -> &str {
+        self.range_text(idx..idx + 1)
+    }
+
+    /// Returns all lexing errors with their text ranges
+    pub fn errors(&self) -> Vec<LexDiagnostic> {
+        self.error
+            .iter()
+            .map(|it| {
+                let range = self.text_range(it.token as usize);
+                LexDiagnostic {
+                    message: it.msg.as_str().into(),
+                    span: TextRange::new(
+                        range.start.try_into().unwrap(),
+                        range.end.try_into().unwrap(),
+                    ),
+                }
+            })
+            .collect()
+    }
+
+    pub(crate) fn text_range(&self, i: usize) -> std::ops::Range<usize> {
+        assert!(i < self.len());
+        let lo = self.start[i] as usize;
+        let hi = self.start[i + 1] as usize;
+        lo..hi
+    }
+
+    fn range_text(&self, r: std::ops::Range<usize>) -> &str {
+        assert!(r.start < r.end && r.end <= self.len());
+        let lo = self.start[r.start] as usize;
+        let hi = self.start[r.end] as usize;
+        &self.text[lo..hi]
+    }
+}
diff --git a/crates/pgt_lexer_new/src/lexed_str.rs b/crates/pgt_lexer_new/src/lexer.rs
similarity index 62%
rename from crates/pgt_lexer_new/src/lexed_str.rs
rename to crates/pgt_lexer_new/src/lexer.rs
index 99eaec88..355eaa77 100644
--- a/crates/pgt_lexer_new/src/lexed_str.rs
+++ b/crates/pgt_lexer_new/src/lexer.rs
@@ -1,115 +1,60 @@
-// based on https://github.com/rust-lang/rust-analyzer/blob/d8887c0758bbd2d5f752d5bd405d4491e90e7ed6/crates/parser/src/lexed_str.rs
-
-use std::ops;
-
 use pgt_tokenizer::tokenize;
 
 use crate::SyntaxKind;
+use crate::lexed::{LexError, Lexed};
 
-pub struct LexedStr<'a> {
+/// Lexer that processes input text into tokens and diagnostics
+pub struct Lexer<'a> {
     text: &'a str,
     kind: Vec<SyntaxKind>,
     start: Vec<u32>,
     error: Vec<LexError>,
-}
-
-struct LexError {
-    msg: String,
-    token: u32,
-}
-
-impl<'a> LexedStr<'a> {
-    pub fn new(text: &'a str) -> LexedStr<'a> {
-        let mut conv = Converter::new(text);
-
-        for token in tokenize(&text[conv.offset..]) {
-            let token_text = &text[conv.offset..][..token.len as usize];
-
-            conv.extend_token(&token.kind, token_text);
-        }
-
-        conv.finalize_with_eof()
-    }
-
-    pub fn len(&self) -> usize {
-        self.kind.len() - 1
-    }
-
-    pub fn kind(&self, i: usize) -> SyntaxKind {
-        assert!(i < self.len());
-        self.kind[i]
-    }
-
-    pub fn text(&self, i: usize) -> &str {
-        self.range_text(i..i + 1)
-    }
-
-    pub fn range_text(&self, r: ops::Range<usize>) -> &str {
-        assert!(r.start < r.end && r.end <= self.len());
-        let lo = self.start[r.start] as usize;
-        let hi = self.start[r.end] as usize;
-        &self.text[lo..hi]
-    }
-
-    // Naming is hard.
-    pub fn text_range(&self, i: usize) -> ops::Range<usize> {
-        assert!(i < self.len());
-        let lo = self.start[i] as usize;
-        let hi = self.start[i + 1] as usize;
-        lo..hi
-    }
-
-    pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
-        self.error
-            .iter()
-            .map(|it| (it.token as usize, it.msg.as_str()))
-    }
-
-    fn push(&mut self, kind: SyntaxKind, offset: usize) {
-        self.kind.push(kind);
-        self.start.push(offset as u32);
-    }
-}
-
-struct Converter<'a> {
-    res: LexedStr<'a>,
     offset: usize,
 }
 
-impl<'a> Converter<'a> {
-    fn new(text: &'a str) -> Self {
+impl<'a> Lexer<'a> {
+    /// Create a new lexer for the given text
+    pub fn new(text: &'a str) -> Self {
         Self {
-            res: LexedStr {
-                text,
-                kind: Vec::new(),
-                start: Vec::new(),
-                error: Vec::new(),
-            },
+            text,
+            kind: Vec::new(),
+            start: Vec::new(),
+            error: Vec::new(),
             offset: 0,
         }
     }
 
-    fn finalize_with_eof(mut self) -> LexedStr<'a> {
-        self.res.push(SyntaxKind::EOF, self.offset);
-        self.res
+    /// Lex the input text and return the result
+    pub fn lex(mut self) -> Lexed<'a> {
+        for token in tokenize(&self.text[self.offset..]) {
+            let token_text = &self.text[self.offset..][..token.len as usize];
+            self.extend_token(&token.kind, token_text);
+        }
+
+        // Add EOF token
+        self.push(SyntaxKind::EOF, 0, None);
+
+        Lexed {
+            text: self.text,
+            kind: self.kind,
+            start: self.start,
+            error: self.error,
+        }
     }
 
     fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
-        self.res.push(kind, self.offset);
+        self.kind.push(kind);
+        self.start.push(self.offset as u32);
         self.offset += len;
 
         if let Some(err) = err {
-            let token = self.res.len() as u32;
+            let token = (self.kind.len() - 1) as u32;
             let msg = err.to_owned();
-            self.res.error.push(LexError { msg, token });
+            self.error.push(LexError { msg, token });
         }
     }
 
     fn extend_token(&mut self, kind: &pgt_tokenizer::TokenKind, token_text: &str) {
-        // A note on an intended tradeoff:
-        // We drop some useful information here (see patterns with double dots `..`)
-        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
-        // being `u16` that come from `rowan::SyntaxKind`.
         let mut err = "";
 
         let syntax_kind = {
@@ -121,8 +66,6 @@ impl<'a> Converter<'a> {
                 }
                 SyntaxKind::COMMENT
             }
-
-            // whitespace
             pgt_tokenizer::TokenKind::Space => SyntaxKind::SPACE,
             pgt_tokenizer::TokenKind::Tab => SyntaxKind::TAB,
             pgt_tokenizer::TokenKind::Newline => SyntaxKind::NEWLINE,
@@ -130,16 +73,6 @@ impl<'a> Converter<'a> {
             pgt_tokenizer::TokenKind::VerticalTab => SyntaxKind::VERTICAL_TAB,
             pgt_tokenizer::TokenKind::FormFeed => SyntaxKind::FORM_FEED,
             pgt_tokenizer::TokenKind::Ident => {
-                // TODO: check for max identifier length
-                //
-                // see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
-                // The system uses no more than NAMEDATALEN-1 bytes of an
-                // identifier; longer names can be written in commands, but
-                // they will be truncated. By default, NAMEDATALEN is 64 so
-                // the maximum identifier length is 63 bytes. If this limit
-                // is problematic, it can be raised by changing the
-                // NAMEDATALEN constant in src/include/pg_config_manual.h.
-                // see: https://github.com/postgres/postgres/blob/e032e4c7ddd0e1f7865b246ec18944365d4f8614/src/include/pg_config_manual.h#L29
                 SyntaxKind::from_keyword(token_text).unwrap_or(SyntaxKind::IDENT)
             }
             pgt_tokenizer::TokenKind::Literal { kind, .. } => {
@@ -214,43 +147,36 @@ impl<'a> Converter<'a> {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::STRING
             }
             pgt_tokenizer::LiteralKind::ByteStr { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the hex bit string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::BYTE_STRING
             }
             pgt_tokenizer::LiteralKind::BitStr { terminated } => {
                 if !terminated {
-                    err = "Missing trailing `\'` symbol to terminate the bit string literal";
+                    err = "Missing trailing `'` symbol to terminate the bit string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::BIT_STRING
             }
             pgt_tokenizer::LiteralKind::DollarQuotedString { terminated } => {
                 if !terminated {
-                    // TODO: we could be fancier and say the ending string we're looking for
                     err = "Unterminated dollar quoted string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::DOLLAR_QUOTED_STRING
             }
             pgt_tokenizer::LiteralKind::UnicodeEscStr { terminated } => {
                 if !terminated {
                     err = "Missing trailing `'` symbol to terminate the unicode escape string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::BYTE_STRING
             }
             pgt_tokenizer::LiteralKind::EscStr { terminated } => {
                 if !terminated {
-                    err = "Missing trailing `\'` symbol to terminate the escape string literal";
+                    err = "Missing trailing `'` symbol to terminate the escape string literal";
                 }
-                // TODO: rust analzyer checks for un-escaped strings, we should too
                 SyntaxKind::ESC_STRING
             }
         };
diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs
index fdbafeda..8ee63d0f 100644
--- a/crates/pgt_lexer_new/src/lib.rs
+++ b/crates/pgt_lexer_new/src/lib.rs
@@ -1,78 +1,15 @@
 mod codegen;
 mod diagnostics;
-mod lexed_str;
-
-use diagnostics::LexError;
-use lexed_str::LexedStr;
-use pgt_text_size::TextRange;
+mod lexed;
+mod lexer;
 
 pub use crate::codegen::syntax_kind::SyntaxKind;
-
-/// Result of lexing a string, providing access to tokens and diagnostics
-///
-/// Thin wrapper around LexedStr for better API ergonomics
-pub struct Lexed<'a> {
-    inner: LexedStr<'a>,
-}
-
-impl Lexed<'_> {
-    /// Returns the number of tokens (excluding EOF)
-    pub fn len(&self) -> usize {
-        self.inner.len()
-    }
-
-    /// Returns true if there are no tokens
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Returns an iterator over token kinds
-    pub fn tokens(&self) -> impl Iterator<Item = SyntaxKind> + '_ {
-        (0..self.len()).map(move |i| self.inner.kind(i))
-    }
-
-    /// Returns the kind of the token at the given index
-    pub fn kind(&self, idx: usize) -> SyntaxKind {
-        self.inner.kind(idx)
-    }
-
-    /// Returns the text range of the token at the given index
-    pub fn range(&self, idx: usize) -> TextRange {
-        let range = self.inner.text_range(idx);
-        TextRange::new(
-            range.start.try_into().unwrap(),
-            range.end.try_into().unwrap(),
-        )
-    }
-
-    /// Returns the text of the token at the given index
-    pub fn text(&self, idx: usize) -> &str {
-        self.inner.text(idx)
-    }
-
-    /// Returns all lexing errors with their text ranges
-    pub fn errors(&self) -> Vec<LexError> {
-        self.inner
-            .errors()
-            .map(|(i, msg)| {
-                let range = self.inner.text_range(i);
-                LexError {
-                    message: msg.into(),
-                    span: TextRange::new(
-                        range.start.try_into().unwrap(),
-                        range.end.try_into().unwrap(),
-                    ),
-                }
-            })
-            .collect()
-    }
-}
+pub use crate::lexed::{LexDiagnostic, Lexed};
+pub use crate::lexer::Lexer;
 
 /// Lex the input string into tokens and diagnostics
 pub fn lex(input: &str) -> Lexed {
-    Lexed {
-        inner: LexedStr::new(input),
-    }
+    Lexer::new(input).lex()
 }
 
 #[cfg(test)]

From 5b3322e2bae52d855cd32b05560b78e5e5f2721c Mon Sep 17 00:00:00 2001
From: psteinroe
Date: Thu, 3 Jul 2025 11:08:53 +0200
Subject: [PATCH 8/8] progress

---
 crates/pgt_lexer_new/src/diagnostics.rs | 14 --------------
 crates/pgt_lexer_new/src/lexed.rs       |  9 ++++++---
 crates/pgt_lexer_new/src/lib.rs         |  1 -
 3 files changed, 6 insertions(+), 18 deletions(-)
 delete mode 100644 crates/pgt_lexer_new/src/diagnostics.rs

diff --git a/crates/pgt_lexer_new/src/diagnostics.rs b/crates/pgt_lexer_new/src/diagnostics.rs
deleted file mode 100644
index e92cb27c..00000000
--- a/crates/pgt_lexer_new/src/diagnostics.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-use pgt_diagnostics::{Diagnostic, MessageAndDescription};
-use pgt_text_size::TextRange;
-
-/// A specialized diagnostic for lex errors.
-#[derive(Clone, Debug, Diagnostic, PartialEq)]
-#[diagnostic(category = "syntax", severity = Error)]
-pub struct LexError {
-    /// The location where the error occurred
-    #[location(span)]
-    pub span: TextRange,
-    #[message]
-    #[description]
-    pub message: MessageAndDescription,
-}
diff --git a/crates/pgt_lexer_new/src/lexed.rs b/crates/pgt_lexer_new/src/lexed.rs
index e5ac45d8..e0c03f1d 100644
--- a/crates/pgt_lexer_new/src/lexed.rs
+++ b/crates/pgt_lexer_new/src/lexed.rs
@@ -1,4 +1,4 @@
-use pgt_diagnostics::MessageAndDescription;
+use pgt_diagnostics::{Diagnostic, MessageAndDescription};
 use pgt_text_size::TextRange;
 
 use crate::SyntaxKind;
@@ -11,11 +11,13 @@ pub struct LexError {
 }
 
 /// A specialized diagnostic for lex errors.
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, Diagnostic, PartialEq)]
+#[diagnostic(category = "syntax", severity = Error)]
 pub struct LexDiagnostic {
     /// The location where the error occurred
+    #[location(span)]
     pub span: TextRange,
-    /// The error message
+    #[message]
+    #[description]
     pub message: MessageAndDescription,
 }
 
diff --git a/crates/pgt_lexer_new/src/lib.rs b/crates/pgt_lexer_new/src/lib.rs
index 8ee63d0f..d4535e98 100644
--- a/crates/pgt_lexer_new/src/lib.rs
+++ b/crates/pgt_lexer_new/src/lib.rs
@@ -1,5 +1,4 @@
 mod codegen;
-mod diagnostics;
 mod lexed;
 mod lexer;
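
To make the end state of the series concrete, here is a minimal usage sketch against the public API as it stands after PATCH 8/8 (lex, Lexed, LexDiagnostic, SyntaxKind). It assumes a binary target with pgt_lexer_new as a dependency; the SQL input and the printed output are arbitrary, and the sketch mirrors the examples/basic_usage.rs file deleted in PATCH 6/8.

    use pgt_lexer_new::{SyntaxKind, lex};

    fn main() {
        // Deliberately unterminated string literal: we get both tokens and a diagnostic.
        let lexed = lex("SELECT id FROM users WHERE name = 'unterminated");

        // Tokens are addressed by index; kind(), text() and range() are
        // parallel accessors over the same token list.
        for (idx, kind) in lexed.tokens().enumerate() {
            if kind == SyntaxKind::IDENT {
                println!("ident {} at {:?}", lexed.text(idx), lexed.range(idx));
            }
        }

        // Lex errors come back as LexDiagnostic values carrying a span into
        // the original input plus a human-readable message.
        for err in lexed.errors() {
            println!("error at {:?}: {}", err.span, err.message);
        }
    }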