Skip to content

Commit

Permalink
[Clang] Allow additional mathematical symbols in identifiers.
Browse files Browse the repository at this point in the history
Implement the proposed UAX Profile
"Mathematical notation profile for default identifiers".

This implements a not-yet-approved Unicode proposal for a vetted
UAX31 identifier profile:
https://www.unicode.org/L2/L2022/22230-math-profile.pdf

This change mitigates the reported disruption caused
by the implementation of UAX31 in C++ and C2x,
as these mathematical symbols are commonly used in the
scientific community.

Fixes llvm#54732

Reviewed By: tahonermann, #clang-language-wg

Differential Revision: https://reviews.llvm.org/D137051
  • Loading branch information
cor3ntin authored and GuilhermeValarini committed Dec 24, 2022
1 parent e5afae6 commit 67a45e8
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 27 deletions.
5 changes: 5 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,11 @@ Non-comprehensive list of changes in this release
- Unicode support has been updated to support Unicode 15.0.
New unicode codepoints are supported as appropriate in diagnostics,
C and C++ identifiers, and escape sequences.
- In identifiers, Clang allows a restricted set of additional mathematical symbols
as an extension. These symbols correspond to a proposed Unicode
`Mathematical notation profile for default identifiers
<https://www.unicode.org/L2/L2022/22230-math-profile.pdf>`_.
This resolves `Issue 54732 <https://github.com/llvm/llvm-project/issues/54732>`_.
- Clang now supports loading multiple configuration files. The files from
default configuration paths are loaded first, unless ``--no-default-config``
option is used. All files explicitly specified using ``--config=`` option
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/DiagnosticLexKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ def warn_utf8_symbol_homoglyph : Warning<
def warn_utf8_symbol_zero_width : Warning<
"identifier contains Unicode character <U+%0> that is invisible in "
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
// Extension warning reported when an identifier contains a character that is
// accepted only through the "Mathematical notation profile" extension.
// %0 receives the codepoint formatted as hexadecimal text by the lexer.
// Controlled by -Wmathematical-notation-identifier-extension.
def ext_mathematical_notation : ExtWarn<
"mathematical notation character <U+%0> in an identifier is a Clang extension">,
InGroup<DiagGroup<"mathematical-notation-identifier-extension">>;

def ext_delimited_escape_sequence : Extension<
"%select{delimited|named}0 escape sequences are a "
Expand Down
111 changes: 84 additions & 27 deletions clang/lib/Lex/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1459,7 +1459,35 @@ static bool isUnicodeWhitespace(uint32_t Codepoint) {
return UnicodeWhitespaceChars.contains(Codepoint);
}

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
/// Format the codepoint \p C as upper-case hexadecimal text (requesting a
/// width of 4) so it can be substituted into "<U+%0>" style diagnostics.
static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> HexText;
  {
    // The stream writes straight into HexText; keep it scoped so it is gone
    // before the buffer is handed back to the caller.
    llvm::raw_svector_ostream Stream(HexText);
    llvm::write_hex(Stream, C, llvm::HexPrintStyle::Upper, /*Width=*/4);
  }
  return HexText;
}

// Mitigation for https://github.com/llvm/llvm-project/issues/54732:
// accept "Mathematical Notation Characters" in identifiers.
// The proposed profile extends XID_Start/XID_Continue with mathematical
// symbols and with superscript and subscript digits that appear in some
// production software.
// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
//
// Returns true when C belongs to the profile for the requested position:
// the start set is always consulted, the continue set only when IsStart is
// false. IsExtension is set to true on a match and left untouched otherwise.
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
                                      bool IsStart, bool &IsExtension) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);

  const bool InProfile = MathStartChars.contains(C) ||
                         (!IsStart && MathContinueChars.contains(C));
  if (!InProfile)
    return false;

  IsExtension = true;
  return true;
}

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
bool &IsExtension) {
if (LangOpts.AsmPreprocessor) {
return false;
} else if (LangOpts.DollarIdents && '$' == C) {
Expand All @@ -1471,8 +1499,10 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
// '_' doesn't have the XID_Continue property but is allowed in C and C++.
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
return C == '_' || XIDStartChars.contains(C) ||
XIDContinueChars.contains(C);
if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
return true;
return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
IsExtension);
} else if (LangOpts.C11) {
static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
C11AllowedIDCharRanges);
Expand All @@ -1484,16 +1514,21 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
}
}

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
bool &IsExtension) {
assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
IsExtension = false;
if (LangOpts.AsmPreprocessor) {
return false;
}
if (LangOpts.CPlusPlus || LangOpts.C2x) {
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
return XIDStartChars.contains(C);
if (XIDStartChars.contains(C))
return true;
return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
IsExtension);
}
if (!isAllowedIDChar(C, LangOpts))
if (!isAllowedIDChar(C, LangOpts, IsExtension))
return false;
if (LangOpts.C11) {
static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
Expand All @@ -1505,6 +1540,20 @@ static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
return !C99DisallowedInitialIDChars.contains(C);
}

/// Report the ext_mathematical_notation extension diagnostic for codepoint
/// \p C, located at the beginning of \p Range and highlighting \p Range.
///
/// Callers are expected to pass only codepoints that belong to the
/// mathematical notation profile; this is checked by an assertion.
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
                                          CharSourceRange Range) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);

  // Both sets are consulted only by the assertion below. Reference them
  // explicitly so that builds with assertions disabled do not emit
  // unused-variable warnings.
  (void)MathStartChars;
  (void)MathContinueChars;
  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
         "Unexpected mathematical notation codepoint");

  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
      << codepointAsHexString(C) << Range;
}

static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
const char *End) {
return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
Expand Down Expand Up @@ -1604,18 +1653,13 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
std::lower_bound(std::begin(SortedHomoglyphs),
std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
if (Homoglyph->Character == C) {
llvm::SmallString<5> CharBuf;
{
llvm::raw_svector_ostream CharOS(CharBuf);
llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
}
if (Homoglyph->LooksLike) {
const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
<< Range << CharBuf << LooksLikeStr;
<< Range << codepointAsHexString(C) << LooksLikeStr;
} else {
Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
<< Range << CharBuf;
<< Range << codepointAsHexString(C);
}
}
}
Expand All @@ -1626,25 +1670,24 @@ static void diagnoseInvalidUnicodeCodepointInIdentifier(
if (isASCII(CodePoint))
return;

bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
bool IsExtension;
bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
bool IsIDContinue =
IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
return;

bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

llvm::SmallString<5> CharBuf;
llvm::raw_svector_ostream CharOS(CharBuf);
llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);

if (!IsFirst || InvalidOnlyAtStart) {
Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
<< Range << CharBuf << int(InvalidOnlyAtStart)
<< Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
<< FixItHint::CreateRemoval(Range);
} else {
Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
<< Range << CharBuf << FixItHint::CreateRemoval(Range);
<< Range << codepointAsHexString(CodePoint)
<< FixItHint::CreateRemoval(Range);
}
}

Expand All @@ -1655,8 +1698,8 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
if (CodePoint == 0) {
return false;
}

if (!isAllowedIDChar(CodePoint, LangOpts)) {
bool IsExtension = false;
if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
return false;
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
Expand All @@ -1669,10 +1712,15 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
// We got a unicode codepoint that is neither a space nor
// a valid identifier part.
// Carry on as if the codepoint was valid for recovery purposes.
} else if (!isLexingRawMode())
} else if (!isLexingRawMode()) {
if (IsExtension)
diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UCNPtr));

maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UCNPtr),
/*IsFirst=*/false);
}

Result.setFlag(Token::HasUCN);
if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
Expand All @@ -1695,7 +1743,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
if (Result != llvm::conversionOK)
return false;

if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
bool IsExtension = false;
if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
IsExtension)) {
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
return false;

Expand All @@ -1708,6 +1758,9 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
// a valid identifier part. Carry on as if the codepoint was
// valid for recovery purposes.
} else if (!isLexingRawMode()) {
if (IsExtension)
diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr));
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
Expand All @@ -1721,9 +1774,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {

bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
const char *CurPtr) {
if (isAllowedInitiallyIDChar(C, LangOpts)) {
bool IsExtension = false;
if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput()) {
if (IsExtension)
diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
makeCharRange(*this, BufferPtr, CurPtr));
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
makeCharRange(*this, BufferPtr, CurPtr),
/*IsFirst=*/true);
Expand All @@ -1737,7 +1794,7 @@ bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,

if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
!isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
!isUnicodeWhitespace(C)) {
// Non-ASCII characters tend to creep into source code unintentionally.
// Instead of letting the parser complain about the unknown token,
// just drop the character.
Expand Down
30 changes: 30 additions & 0 deletions clang/lib/Lex/UnicodeCharSets.h
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,36 @@ static const llvm::sys::UnicodeCharRange XIDContinueRanges[] = {
{0x1E4EC, 0x1E4F9}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A},
{0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, {0xE0100, 0xE01EF}};

// Clang supports the "Mathematical notation profile" as an extension,
// as described in https://www.unicode.org/L2/L2022/22230-math-profile.pdf
// Math_Start
//
// Codepoints from the profile that may begin an identifier (and may also
// appear after the first character). Each entry is an inclusive
// {first, last} range; every entry here covers a single codepoint. Entries
// are listed in ascending codepoint order, and the trailing comment shows
// the glyph for that codepoint.
static const llvm::sys::UnicodeCharRange
MathematicalNotationProfileIDStartRanges[] = {
{0x02202, 0x02202}, // ∂
{0x02207, 0x02207}, // ∇
{0x0221E, 0x0221E}, // ∞
{0x1D6C1, 0x1D6C1}, // 𝛁
{0x1D6DB, 0x1D6DB}, // 𝛛
{0x1D6FB, 0x1D6FB}, // 𝛻
{0x1D715, 0x1D715}, // 𝜕
{0x1D735, 0x1D735}, // 𝜵
{0x1D74F, 0x1D74F}, // 𝝏
{0x1D76F, 0x1D76F}, // 𝝯
{0x1D789, 0x1D789}, // 𝞉
{0x1D7A9, 0x1D7A9}, // 𝞩
{0x1D7C3, 0x1D7C3}, // 𝟃
};

// Math_Continue
//
// Codepoints from the profile that are accepted only after the first
// character of an identifier (superscript and subscript characters, per the
// glyphs noted on each entry). Each entry is an inclusive {first, last}
// range, listed in ascending codepoint order.
static const llvm::sys::UnicodeCharRange
MathematicalNotationProfileIDContinueRanges[] = {
{0x000B2, 0x000B3}, // ²-³
{0x000B9, 0x000B9}, // ¹
{0x02070, 0x02070}, // ⁰
{0x02074, 0x0207E}, // ⁴-⁾
{0x02080, 0x0208E}, // ₀-₎
};

// C11 D.1, C++11 [charname.allowed]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = {
// 1
Expand Down
1 change: 1 addition & 0 deletions clang/test/Driver/autocomplete.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
// WARNING-NEXT: -Wmain-return-type
// WARNING-NEXT: -Wmalformed-warning-check
// WARNING-NEXT: -Wmany-braces-around-scalar-init
// WARNING-NEXT: -Wmathematical-notation-identifier-extension
// WARNING-NEXT: -Wmax-tokens
// WARNING-NEXT: -Wmax-unsigned-zero
// RUN: %clang --autocomplete=-Wno-invalid-pp- | FileCheck %s -check-prefix=NOWARNING
Expand Down
10 changes: 10 additions & 0 deletions clang/test/Lexer/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,17 @@ extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a va
// expected-error {{expected ';' after top level declarator}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}

extern int 𝛛; // expected-warning {{mathematical notation character <U+1D6DB> in an identifier is a Clang extension}}
extern int ₉; // expected-error {{character <U+2089> not allowed at the start of an identifier}} \\
expected-warning {{declaration does not declare anything}}

int a¹b₍₄₂₎∇; // expected-warning 6{{mathematical notation character}}

int \u{221E} = 1; // expected-warning {{mathematical notation character}}
int \N{MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL} = 1;
// expected-warning@-1 {{mathematical notation character}}

int a\N{SUBSCRIPT EQUALS SIGN} = 1; // expected-warning {{mathematical notation character}}

// This character doesn't have the XID_Start property
extern int \U00016AC0; // TANGSA DIGIT ZERO // cxx-error {{expected unqualified-id}} \
Expand Down

0 comments on commit 67a45e8

Please sign in to comment.