Skip to content

Commit

Permalink
Reserve guarded string literals (RFC 3593)
Browse files Browse the repository at this point in the history
  • Loading branch information
pitaj committed May 2, 2024
1 parent f92d49b commit 8f57684
Show file tree
Hide file tree
Showing 24 changed files with 876 additions and 10 deletions.
1 change: 1 addition & 0 deletions compiler/rustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::str::Chars;
///
/// Next characters can be peeked via `first` method,
/// and position can be shifted forward via `bump` method.
#[derive(Clone)]
pub struct Cursor<'a> {
len_remaining: usize,
/// Iterator over chars. Slightly faster than a &str.
Expand Down
92 changes: 84 additions & 8 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ pub mod unescape;
#[cfg(test)]
mod tests;

use std::num::NonZeroU8;

pub use crate::cursor::Cursor;

use self::LiteralKind::*;
Expand Down Expand Up @@ -179,24 +181,27 @@ pub enum DocStyle {
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
/// "12_u8", "0o100", "0b120i99", "1f32".
/// `12_u8`, `0o100`, `0b120i99`, `1f32`.
Int { base: Base, empty_int: bool },
/// "12.34f32", "1e3", but not "1f32".
/// `12.34f32`, `1e3`, but not `1f32`.
Float { base: Base, empty_exponent: bool },
/// "'a'", "'\\'", "'''", "';"
/// `'a'`, `'\\'`, `'''`, `';`
Char { terminated: bool },
/// "b'a'", "b'\\'", "b'''", "b';"
/// `b'a'`, `b'\\'`, `b'''`, `b';`
Byte { terminated: bool },
/// ""abc"", ""abc"
/// `"abc"`, `"abc`
Str { terminated: bool },
/// "b"abc"", "b"abc"
/// `b"abc"`, `b"abc`
ByteStr { terminated: bool },
/// `c"abc"`, `c"abc`
CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// `#"abc"#`, `#"a`, `##"a"#`. `None` indicates no closing quote
/// or more than 255 opening hashes.
/// Allows fewer hashes to close the string to support older editions.
GuardedStr { n_start_hashes: Option<NonZeroU8>, n_end_hashes: u8 },
/// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
/// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
Expand Down Expand Up @@ -365,6 +370,49 @@ impl Cursor<'_> {
_ => self.ident_or_unknown_prefix(),
},

// Guarded string literal (reserved syntax).
'#' if matches!(self.first(), '"' | '#') => {
// Create a backup to restore later if this
// turns out to not be a guarded literal.
let backup = self.clone();

let mut n_start_hashes: u32 = 1; // Already captured one `#`.
while self.first() == '#' {
n_start_hashes += 1;
self.bump();
}

if self.first() == '"' {
self.bump();

let res = self.guarded_double_quoted_string(n_start_hashes);
let suffix_start = self.pos_within_token();

if let (Ok(n_end_hashes), Ok(n)) = (res, u8::try_from(n_start_hashes)) {
self.eat_literal_suffix();

Literal {
kind: GuardedStr {
n_start_hashes: NonZeroU8::new(n),
// Always succeeds because `n_end_hashes <= n`
n_end_hashes: n_end_hashes.try_into().unwrap(),
},
suffix_start,
}
} else {
Literal {
kind: GuardedStr { n_start_hashes: None, n_end_hashes: 0 },
suffix_start,
}
}
} else {
// Not a guarded string, so restore old state.
*self = backup;
// Return a pound token.
Pound
}
}

// Byte literal, byte string literal, raw byte string literal or identifier.
'b' => self.c_or_byte_string(
|terminated| ByteStr { terminated },
Expand Down Expand Up @@ -758,6 +806,34 @@ impl Cursor<'_> {
false
}

/// Eats the body of a guarded string literal together with its closing `#`s.
///
/// Must be called right after the opening `"` has been consumed;
/// `n_start_hashes` is the number of `#` that preceded that quote.
///
/// Returns `Ok` with the number of closing `#` that were consumed, which is
/// never larger than `n_start_hashes` but may be smaller. Returns
/// `Err(RawStrError::NoTerminator { .. })` when `double_quoted_string`
/// reports that the string body was not terminated.
fn guarded_double_quoted_string(&mut self, n_start_hashes: u32) -> Result<u32, RawStrError> {
    debug_assert!(self.prev() == '"');

    // The body is lexed exactly like an ordinary string literal, so that
    // older editions can later recover the plain string from it.
    let terminated = self.double_quoted_string();
    if !terminated {
        return Err(RawStrError::NoTerminator {
            expected: n_start_hashes,
            found: 0,
            possible_terminator_offset: None,
        });
    }

    // Eat closing `#`s, but never more than there were opening ones:
    // `###"abcde"####` is lexed as a guarded string with three opening and
    // three closing hashes, followed by a separate `#` token.
    let mut closing_hashes: u32 = 0;
    while self.first() == '#' && closing_hashes < n_start_hashes {
        self.bump();
        closing_hashes += 1;
    }

    // A count lower than `n_start_hashes` is left for the caller to handle.
    Ok(closing_hashes)
}

/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
// Wrap the actual function to handle the error with too many hashes.
Expand Down
14 changes: 14 additions & 0 deletions compiler/rustc_lint/src/context/diagnostics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,20 @@ pub(super) fn builtin(sess: &Session, diagnostic: BuiltinLintDiag, diag: &mut Di
Applicability::MachineApplicable,
);
}
BuiltinLintDiag::ReservedGuardedString(space_span) => {
if let Some(space_span) = space_span {
diag.span_suggestion_verbose(
space_span,
"insert whitespace here to avoid this being parsed as guarded string in Rust 2024",
" ",
Applicability::MachineApplicable,
);
} else {
diag.help(
"insert whitespace between the `#`s and the opening quote to avoid this being parsed as guarded string in Rust 2024",
);
}
}
BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
diag.span_note(
invoc_span,
Expand Down
41 changes: 41 additions & 0 deletions compiler/rustc_lint_defs/src/builtin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ declare_lint_pass! {
RUST_2021_INCOMPATIBLE_OR_PATTERNS,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
RUST_2021_PRELUDE_COLLISIONS,
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
SEMICOLON_IN_EXPRESSIONS_FROM_MACROS,
SINGLE_USE_LIFETIMES,
SOFT_UNSTABLE,
Expand Down Expand Up @@ -4807,3 +4808,43 @@ declare_lint! {
reference: "issue #124559 <https://github.com/rust-lang/rust/issues/124559>",
};
}

declare_lint! {
/// The `rust_2024_guarded_string_incompatible_syntax` lint detects `#` tokens
/// that will be parsed as part of a guarded string literal in Rust 2024.
///
/// ### Example
///
/// ```rust,edition2021,compile_fail
/// #![deny(rust_2024_guarded_string_incompatible_syntax)]
///
/// macro_rules! m {
/// (# $x:expr #) => ();
/// (# $x:expr) => ();
/// }
///
/// m!(#"hey"#);
/// m!(#"hello");
/// ```
///
/// {{produces}}
///
/// ### Explanation
///
/// Prior to Rust 2024, `#"hey"#` is three tokens: the first `#`
/// followed by the string literal `"hey"` then the final `#`.
/// Likewise, `#"hello"` is two tokens: `#` followed by `"hello"`.
/// In Rust 2024, each of these sequences is considered a single token,
/// which is reserved syntax and is rejected with an error.
///
/// This lint suggests adding whitespace between the leading `#`
/// and the string to keep them separated in Rust 2024.
// Allow this lint -- rustdoc doesn't yet support threading edition into this lint's parser.
#[allow(rustdoc::invalid_rust_codeblocks)]
pub RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
Allow,
"will be parsed as a guarded string in Rust 2024",
@future_incompatible = FutureIncompatibleInfo {
reason: FutureIncompatibilityReason::EditionError(Edition::Edition2024),
reference: "issue #123735 <https://github.com/rust-lang/rust/issues/123735>",
};
crate_level_only
}
1 change: 1 addition & 0 deletions compiler/rustc_lint_defs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@ pub enum BuiltinLintDiag {
ProcMacroBackCompat(String),
OrPatternsBackCompat(Span, String),
ReservedPrefix(Span),
ReservedGuardedString(Option<Span>),
TrailingMacro(bool, Ident),
BreakWithLabelAndLoop(Span),
NamedAsmLabel(String),
Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_parse/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,10 @@ parse_require_colon_after_labeled_expression = labeled expression must be follow
.label = the label
.suggestion = add `:` after the label
parse_reserved_guarded_string = invalid string literal
.note = unprefixed guarded string literals are reserved for future use since Rust 2024
.suggestion_whitespace = consider inserting whitespace here
parse_return_types_use_thin_arrow = return types are denoted using `->`
.suggestion = use `->` instead
Expand Down
18 changes: 18 additions & 0 deletions compiler/rustc_parse/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,24 @@ pub enum UnknownPrefixSugg {
},
}

/// Error reported when a guarded string literal (e.g. `#"abc"#`) is
/// encountered; this syntax is reserved for future use since Rust 2024.
///
/// Rendered through the `parse_reserved_guarded_string` Fluent message
/// together with its `.note`.
#[derive(Diagnostic)]
#[diag(parse_reserved_guarded_string)]
#[note]
pub struct ReservedGuardedString {
/// Location the error is reported at.
#[primary_span]
pub span: Span,
/// Optional "insert whitespace" suggestion; `None` when no suggestion
/// should be attached to the error.
#[subdiagnostic]
pub sugg: Option<GuardedStringSugg>,
}
/// Verbose suggestion attached to [`ReservedGuardedString`]: insert a single
/// space at the wrapped span so that the `#`s and the string literal stay
/// separate tokens.
///
/// Marked `maybe-incorrect` because inserting the space changes how the
/// source is tokenized.
#[derive(Subdiagnostic)]
#[suggestion(
parse_suggestion_whitespace,
code = " ",
applicability = "maybe-incorrect",
style = "verbose"
)]
pub struct GuardedStringSugg(#[primary_span] pub Span);

#[derive(Diagnostic)]
#[diag(parse_too_many_hashes)]
pub struct TooManyHashes {
Expand Down
64 changes: 63 additions & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::{Base, DocStyle, RawStrError};
use rustc_lexer::{Cursor, LiteralKind};
use rustc_session::lint::builtin::{
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
};
use rustc_session::lint::BuiltinLintDiag;
use rustc_session::parse::ParseSess;
Expand Down Expand Up @@ -241,6 +242,40 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::Literal {
kind: rustc_lexer::LiteralKind::GuardedStr { n_start_hashes, .. },
suffix_start: _
} if !self.mk_sp(start, self.pos).edition().at_least_rust_2024() => {
// Check if previous char was `#`, so we don't
// lint for each `#` before the string.
if !(
start > self.start_pos &&
self.src.as_bytes()[self.src_index(start) - 1] == b'#'
) {
let span = self.mk_sp(start, self.pos);
let space_span = n_start_hashes.map(|n_hashes| {
let space_pos = start + BytePos(n_hashes.get().into());
self.mk_sp(space_pos, space_pos)
});

// Before Rust 2024, only emit a lint for migration.
self.psess.buffer_lint_with_diagnostic(
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
span,
ast::CRATE_NODE_ID,
"will be parsed as a guarded string in Rust 2024",
BuiltinLintDiag::ReservedGuardedString(space_span),
);
}

// reset the state so that only the first `#` was consumed.
let next = start + BytePos(1);
self.pos = next;
self.cursor = Cursor::new(&str_before[1..]);

let pound_span = self.mk_sp(start, next);
return (Token::new(TokenKind::Pound, pound_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -490,6 +525,33 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
self.report_raw_str_error(start, 1);
}
}
// RFC 3593 reserved this syntax for future use. As of Rust 2024,
// using this syntax produces an error. In earlier editions, however, it
// only results in an (allowed by default) lint, and is treated as
// separate tokens.
rustc_lexer::LiteralKind::GuardedStr { n_start_hashes, n_end_hashes } => {
let span = self.mk_sp(start, self.pos);

if let Some(n_start_hashes) = n_start_hashes {
let n = u32::from(n_start_hashes.get());
let e = u32::from(n_end_hashes);
let expn_data = span.ctxt().outer_expn_data();

let space_pos = start + BytePos(n);
let space_span = self.mk_sp(space_pos, space_pos);

let sugg = if expn_data.is_root() {
Some(errors::GuardedStringSugg(space_span))
} else {
None
};

self.dcx().emit_err(errors::ReservedGuardedString { span, sugg });
self.cook_unicode(token::Str, Mode::Str, start, end, 1 + n, 1 + e) // ##" "##
} else {
self.dcx().emit_fatal(errors::ReservedGuardedString { span, sugg: None });
}
}
rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
Expand Down
3 changes: 2 additions & 1 deletion src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,8 @@ impl<'src> Classifier<'src> {
| LiteralKind::RawStr { .. }
| LiteralKind::RawByteStr { .. }
| LiteralKind::CStr { .. }
| LiteralKind::RawCStr { .. } => Class::String,
| LiteralKind::RawCStr { .. }
| LiteralKind::GuardedStr { .. } => Class::String,
// Number literals.
LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
},
Expand Down
4 changes: 4 additions & 0 deletions src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,10 @@ impl<'a> Converter<'a> {
}
C_STRING
}
rustc_lexer::LiteralKind::GuardedStr { .. } => {
err = "Invalid string literal";
STRING
}
};

let err = if err.is_empty() { None } else { Some(err) };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ impl server::FreeFunctions for RaSpanServer {
3 + n_hashes.unwrap_or_default() as usize,
1 + n_hashes.unwrap_or_default() as usize,
),
LiteralKind::GuardedStr { .. } => return Err(()),
};

let (lit, suffix) = s.split_at(suffix_start as usize);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ impl server::FreeFunctions for TokenIdServer {
3 + n_hashes.unwrap_or_default() as usize,
1 + n_hashes.unwrap_or_default() as usize,
),
LiteralKind::GuardedStr { .. } => return Err(()),
};

let (lit, suffix) = s.split_at(suffix_start as usize);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//@ force-host
//@ edition:2021
//@ no-prefer-dynamic

#![crate_type = "proc-macro"]

extern crate proc_macro;

use proc_macro::TokenStream;
use std::str::FromStr;

/// Expands to an integer literal holding the number of top-level tokens that
/// the source text `#"abc"#` is split into when parsed as a `TokenStream`.
/// The macro input is ignored.
#[proc_macro]
pub fn number_of_tokens_in_a_guarded_string_literal(_: TokenStream) -> TokenStream {
    let lexed = TokenStream::from_str("#\"abc\"#").unwrap();
    let token_count = lexed.into_iter().count();
    // Re-parse the decimal count so it is emitted as a literal token.
    token_count.to_string().parse().unwrap()
}

/// Expands to an integer literal holding the number of top-level tokens that
/// the source text `#"abc"` (leading `#`, no closing `#`) is split into when
/// parsed as a `TokenStream`. The macro input is ignored.
#[proc_macro]
pub fn number_of_tokens_in_a_guarded_unterminated_string_literal(_: TokenStream) -> TokenStream {
    let lexed = TokenStream::from_str("#\"abc\"").unwrap();
    let token_count = lexed.into_iter().count();
    // Re-parse the decimal count so it is emitted as a literal token.
    token_count.to_string().parse().unwrap()
}
Loading

0 comments on commit 8f57684

Please sign in to comment.