Reserve guarded string literals (RFC 3593)

rust-lang · May 2, 2024 · 8f57684 · 8f57684
1 parent f92d49b
commit 8f57684
Show file tree

Hide file tree

Showing 24 changed files with 876 additions and 10 deletions.
diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs
@@ -4,6 +4,7 @@ use std::str::Chars;
 ///
 /// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
+#[derive(Clone)]
 pub struct Cursor<'a> {
     len_remaining: usize,
     /// Iterator over chars. Slightly faster than a &str.

diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
@@ -29,6 +29,8 @@ pub mod unescape;
 #[cfg(test)]
 mod tests;
 
+use std::num::NonZeroU8;
+
 pub use crate::cursor::Cursor;
 
 use self::LiteralKind::*;
@@ -179,24 +181,27 @@ pub enum DocStyle {
 /// `rustc_ast::ast::LitKind`).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LiteralKind {
-    /// "12_u8", "0o100", "0b120i99", "1f32".
+    /// `12_u8`, `0o100`, `0b120i99`, `1f32`.
     Int { base: Base, empty_int: bool },
-    /// "12.34f32", "1e3", but not "1f32".
+    /// `12.34f32`, `1e3`, but not `1f32`.
     Float { base: Base, empty_exponent: bool },
-    /// "'a'", "'\\'", "'''", "';"
+    /// `'a'`, `'\\'`, `'''`, `';`
     Char { terminated: bool },
-    /// "b'a'", "b'\\'", "b'''", "b';"
+    /// `b'a'`, `b'\\'`, `b'''`, `b';`
     Byte { terminated: bool },
-    /// ""abc"", ""abc"
+    /// `"abc"`, `"abc`
     Str { terminated: bool },
-    /// "b"abc"", "b"abc"
+    /// `b"abc"`, `b"abc`
     ByteStr { terminated: bool },
     /// `c"abc"`, `c"abc`
     CStr { terminated: bool },
-    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
+    /// `#"abc"#`, `#"a`, `##"a"#`. `None` indicates no closing quote or more than
+    /// 255 opening hashes. Fewer closing hashes are allowed to support older editions.
+    GuardedStr { n_start_hashes: Option<NonZeroU8>, n_end_hashes: u8 },
+    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
     /// an invalid literal.
     RawStr { n_hashes: Option<u8> },
-    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
+    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
     /// indicates an invalid literal.
     RawByteStr { n_hashes: Option<u8> },
     /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
@@ -365,6 +370,49 @@ impl Cursor<'_> {
                 _ => self.ident_or_unknown_prefix(),
             },
 
+            // Guarded string literal (reserved syntax).
+            '#' if matches!(self.first(), '"' | '#') => {
+                // Create a backup to restore later if this
+                // turns out to not be a guarded literal.
+                let backup = self.clone();
+
+                let mut n_start_hashes: u32 = 1; // Already captured one `#`.
+                while self.first() == '#' {
+                    n_start_hashes += 1;
+                    self.bump();
+                }
+
+                if self.first() == '"' {
+                    self.bump();
+
+                    let res = self.guarded_double_quoted_string(n_start_hashes);
+                    let suffix_start = self.pos_within_token();
+
+                    if let (Ok(n_end_hashes), Ok(n)) = (res, u8::try_from(n_start_hashes)) {
+                        self.eat_literal_suffix();
+
+                        Literal {
+                            kind: GuardedStr {
+                                n_start_hashes: NonZeroU8::new(n),
+                                // Always succeeds because `n_end_hashes <= n`
+                                n_end_hashes: n_end_hashes.try_into().unwrap(),
+                            },
+                            suffix_start,
+                        }
+                    } else {
+                        Literal {
+                            kind: GuardedStr { n_start_hashes: None, n_end_hashes: 0 },
+                            suffix_start,
+                        }
+                    }
+                } else {
+                    // Not a guarded string, so restore old state.
+                    *self = backup;
+                    // Return a pound token.
+                    Pound
+                }
+            }
+
             // Byte literal, byte string literal, raw byte string literal or identifier.
             'b' => self.c_or_byte_string(
                 |terminated| ByteStr { terminated },
@@ -758,6 +806,34 @@ impl Cursor<'_> {
         false
     }
 
+    /// Eats a guarded string after its opening `"`; returns the closing `#` count or an error.
+    fn guarded_double_quoted_string(&mut self, n_start_hashes: u32) -> Result<u32, RawStrError> {
+        debug_assert!(self.prev() == '"');
+
+        // Lex the string itself as a normal string literal
+        // so we can recover that for older editions later.
+        if !self.double_quoted_string() {
+            return Err(RawStrError::NoTerminator {
+                expected: n_start_hashes,
+                found: 0,
+                possible_terminator_offset: None,
+            });
+        }
+
+        // Consume closing '#' symbols.
+        // Note that this will not consume extra trailing `#` characters:
+        // `###"abcde"####` is lexed as a `GuardedStr` with `n_end_hashes: 3`
+        // followed by a `#` token.
+        let mut n_end_hashes = 0;
+        while self.first() == '#' && n_end_hashes < n_start_hashes {
+            n_end_hashes += 1;
+            self.bump();
+        }
+
+        // Handle `n_end_hashes < n_start_hashes` later.
+        Ok(n_end_hashes)
+    }
+
     /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
     fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
         // Wrap the actual function to handle the error with too many hashes.

diff --git a/compiler/rustc_lint/src/context/diagnostics.rs b/compiler/rustc_lint/src/context/diagnostics.rs
@@ -157,6 +157,20 @@ pub(super) fn builtin(sess: &Session, diagnostic: BuiltinLintDiag, diag: &mut Di
                 Applicability::MachineApplicable,
             );
         }
+        BuiltinLintDiag::ReservedGuardedString(space_span) => {
+            if let Some(space_span) = space_span {
+                diag.span_suggestion_verbose(
+                    space_span,
+                    "insert whitespace here to avoid this being parsed as guarded string in Rust 2024",
+                    " ",
+                    Applicability::MachineApplicable,
+                );
+            } else {
+                diag.help(
+                    "insert whitespace between the `#`s and the opening quote to avoid this being parsed as guarded string in Rust 2024",
+                );
+            }
+        }
         BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
             diag.span_note(
                         invoc_span,

diff --git a/compiler/rustc_lint_defs/src/builtin.rs b/compiler/rustc_lint_defs/src/builtin.rs
@@ -89,6 +89,7 @@ declare_lint_pass! {
         RUST_2021_INCOMPATIBLE_OR_PATTERNS,
         RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
         RUST_2021_PRELUDE_COLLISIONS,
+        RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
         SEMICOLON_IN_EXPRESSIONS_FROM_MACROS,
         SINGLE_USE_LIFETIMES,
         SOFT_UNSTABLE,
@@ -4807,3 +4808,43 @@ declare_lint! {
         reference: "issue #124559 <https://github.com/rust-lang/rust/issues/124559>",
     };
 }
+
+declare_lint! {
+    /// The `rust_2024_guarded_string_incompatible_syntax` lint detects `#` tokens
+    /// that will be parsed as part of a guarded string literal in Rust 2024.
+    ///
+    /// ### Example
+    ///
+    /// ```rust,edition2021,compile_fail
+    /// #![deny(rust_2024_guarded_string_incompatible_syntax)]
+    ///
+    /// macro_rules! m {
+    ///     (# $x:expr #) => ();
+    ///     (# $x:expr) => ();
+    /// }
+    ///
+    /// m!(#"hey"#);
+    /// m!(#"hello");
+    /// ```
+    ///
+    /// {{produces}}
+    ///
+    /// ### Explanation
+    ///
+    /// Prior to Rust 2024, `#"hey"#` is three tokens: the first `#`
+    /// followed by the string literal `"hey"` then the final `#`.
+    /// In Rust 2024, the whole sequence is considered a single token.
+    ///
+    /// This lint suggests adding whitespace between the leading `#`
+    /// and the string to keep them separated in Rust 2024.
+    // Allow this lint -- rustdoc doesn't yet support threading edition into this lint's parser.
+    #[allow(rustdoc::invalid_rust_codeblocks)]
+    pub RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
+    Allow,
+    "will be parsed as a guarded string in Rust 2024",
+    @future_incompatible = FutureIncompatibleInfo {
+        reason: FutureIncompatibilityReason::EditionError(Edition::Edition2024),
+        reference: "issue #123735 <https://github.com/rust-lang/rust/issues/123735>",
+    };
+    crate_level_only
+}
diff --git a/compiler/rustc_lint_defs/src/lib.rs b/compiler/rustc_lint_defs/src/lib.rs
@@ -590,6 +590,7 @@ pub enum BuiltinLintDiag {
     ProcMacroBackCompat(String),
     OrPatternsBackCompat(Span, String),
     ReservedPrefix(Span),
+    ReservedGuardedString(Option<Span>),
     TrailingMacro(bool, Ident),
     BreakWithLabelAndLoop(Span),
     NamedAsmLabel(String),

diff --git a/compiler/rustc_parse/messages.ftl b/compiler/rustc_parse/messages.ftl
@@ -672,6 +672,10 @@ parse_require_colon_after_labeled_expression = labeled expression must be follow
     .label = the label
     .suggestion = add `:` after the label
 
+parse_reserved_guarded_string = invalid string literal
+    .note = unprefixed guarded string literals are reserved for future use since Rust 2024
+    .suggestion_whitespace = consider inserting whitespace here
+
 parse_return_types_use_thin_arrow = return types are denoted using `->`
     .suggestion = use `->` instead
 

diff --git a/compiler/rustc_parse/src/errors.rs b/compiler/rustc_parse/src/errors.rs
@@ -2009,6 +2009,24 @@ pub enum UnknownPrefixSugg {
     },
 }
 
+#[derive(Diagnostic)]
+#[diag(parse_reserved_guarded_string)]
+#[note]
+pub struct ReservedGuardedString {
+    #[primary_span]
+    pub span: Span,
+    #[subdiagnostic]
+    pub sugg: Option<GuardedStringSugg>,
+}
+#[derive(Subdiagnostic)]
+#[suggestion(
+    parse_suggestion_whitespace,
+    code = " ",
+    applicability = "maybe-incorrect",
+    style = "verbose"
+)]
+pub struct GuardedStringSugg(#[primary_span] pub Span);
+
 #[derive(Diagnostic)]
 #[diag(parse_too_many_hashes)]
 pub struct TooManyHashes {

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -12,7 +12,8 @@ use rustc_lexer::unescape::{self, EscapeError, Mode};
 use rustc_lexer::{Base, DocStyle, RawStrError};
 use rustc_lexer::{Cursor, LiteralKind};
 use rustc_session::lint::builtin::{
-    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
+    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
+    TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
 };
 use rustc_session::lint::BuiltinLintDiag;
 use rustc_session::parse::ParseSess;
@@ -241,6 +242,40 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     let prefix_span = self.mk_sp(start, lit_start);
                     return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                 }
+                rustc_lexer::TokenKind::Literal {
+                    kind: rustc_lexer::LiteralKind::GuardedStr { n_start_hashes, .. },
+                    suffix_start: _
+                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2024() => {
+                    // Check if previous char was `#`, so we don't
+                    // lint for each `#` before the string.
+                    if !(
+                        start > self.start_pos &&
+                        self.src.as_bytes()[self.src_index(start) - 1] == b'#'
+                    ) {
+                        let span = self.mk_sp(start, self.pos);
+                        let space_span = n_start_hashes.map(|n_hashes| {
+                            let space_pos = start + BytePos(n_hashes.get().into());
+                            self.mk_sp(space_pos, space_pos)
+                        });
+
+                        // Before Rust 2024, only emit a lint for migration.
+                        self.psess.buffer_lint_with_diagnostic(
+                            RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
+                            span,
+                            ast::CRATE_NODE_ID,
+                            "will be parsed as a guarded string in Rust 2024",
+                            BuiltinLintDiag::ReservedGuardedString(space_span),
+                        );
+                    }
+
+                    // reset the state so that only the first `#` was consumed.
+                    let next = start + BytePos(1);
+                    self.pos = next;
+                    self.cursor = Cursor::new(&str_before[1..]);
+
+                    let pound_span = self.mk_sp(start, next);
+                    return (Token::new(TokenKind::Pound, pound_span), preceded_by_whitespace);
+                }
                 rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                     let suffix_start = start + BytePos(suffix_start);
                     let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -490,6 +525,33 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     self.report_raw_str_error(start, 1);
                 }
             }
+            // RFC 3593 reserved this syntax for future use. As of Rust 2024,
+            // using this syntax produces an error. In earlier editions, however, it
+            // only results in an (allowed by default) lint, and is treated as
+            // separate tokens.
+            rustc_lexer::LiteralKind::GuardedStr { n_start_hashes, n_end_hashes } => {
+                let span = self.mk_sp(start, self.pos);
+
+                if let Some(n_start_hashes) = n_start_hashes {
+                    let n = u32::from(n_start_hashes.get());
+                    let e = u32::from(n_end_hashes);
+                    let expn_data = span.ctxt().outer_expn_data();
+
+                    let space_pos = start + BytePos(n);
+                    let space_span = self.mk_sp(space_pos, space_pos);
+
+                    let sugg = if expn_data.is_root() {
+                        Some(errors::GuardedStringSugg(space_span))
+                    } else {
+                        None
+                    };
+
+                    self.dcx().emit_err(errors::ReservedGuardedString { span, sugg });
+                    self.cook_unicode(token::Str, Mode::Str, start, end, 1 + n, 1 + e) // ##" "##
+                } else {
+                    self.dcx().emit_fatal(errors::ReservedGuardedString { span, sugg: None });
+                }
+            }
             rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);

diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
@@ -850,7 +850,8 @@ impl<'src> Classifier<'src> {
                 | LiteralKind::RawStr { .. }
                 | LiteralKind::RawByteStr { .. }
                 | LiteralKind::CStr { .. }
-                | LiteralKind::RawCStr { .. } => Class::String,
+                | LiteralKind::RawCStr { .. }
+                | LiteralKind::GuardedStr { .. } => Class::String,
                 // Number literals.
                 LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
             },

diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@@ -331,6 +331,10 @@ impl<'a> Converter<'a> {
                 }
                 C_STRING
             }
+            rustc_lexer::LiteralKind::GuardedStr { .. } => {
+                err = "Invalid string literal";
+                STRING
+            }
         };
 
         let err = if err.is_empty() { None } else { Some(err) };

diff --git a/src/tools/rust-analyzer/crates/proc-macro-srv/src/server/rust_analyzer_span.rs b/src/tools/rust-analyzer/crates/proc-macro-srv/src/server/rust_analyzer_span.rs
@@ -120,6 +120,7 @@ impl server::FreeFunctions for RaSpanServer {
                 3 + n_hashes.unwrap_or_default() as usize,
                 1 + n_hashes.unwrap_or_default() as usize,
             ),
+            LiteralKind::GuardedStr { .. } => return Err(()),
         };
 
         let (lit, suffix) = s.split_at(suffix_start as usize);

diff --git a/src/tools/rust-analyzer/crates/proc-macro-srv/src/server/token_id.rs b/src/tools/rust-analyzer/crates/proc-macro-srv/src/server/token_id.rs
@@ -113,6 +113,7 @@ impl server::FreeFunctions for TokenIdServer {
                 3 + n_hashes.unwrap_or_default() as usize,
                 1 + n_hashes.unwrap_or_default() as usize,
             ),
+            LiteralKind::GuardedStr { .. } => return Err(()),
         };
 
         let (lit, suffix) = s.split_at(suffix_start as usize);

diff --git a/tests/ui/rust-2024/auxiliary/reserved-guarded-strings-macro-2021.rs b/tests/ui/rust-2024/auxiliary/reserved-guarded-strings-macro-2021.rs
@@ -0,0 +1,20 @@
+//@ force-host
+//@ edition:2021
+//@ no-prefer-dynamic
+
+#![crate_type = "proc-macro"]
+
+extern crate proc_macro;
+
+use proc_macro::TokenStream;
+use std::str::FromStr;
+
+#[proc_macro]
+pub fn number_of_tokens_in_a_guarded_string_literal(_: TokenStream) -> TokenStream {
+    TokenStream::from_str("#\"abc\"#").unwrap().into_iter().count().to_string().parse().unwrap()
+}
+
+#[proc_macro]
+pub fn number_of_tokens_in_a_guarded_unterminated_string_literal(_: TokenStream) -> TokenStream {
+    TokenStream::from_str("#\"abc\"").unwrap().into_iter().count().to_string().parse().unwrap()
+}