//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the NumericLiteralParser, CharLiteralParser, and // StringLiteralParser interfaces. // //===----------------------------------------------------------------------===// #include "clang/Lex/LiteralSupport.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/TargetInfo.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Lex/Lexer.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/Token.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include #include #include #include #include #include using namespace clang; static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { switch (kind) { default: llvm_unreachable("Unknown token type!"); case tok::char_constant: case tok::string_literal: case tok::utf8_char_constant: case tok::utf8_string_literal: return Target.getCharWidth(); case tok::wide_char_constant: case tok::wide_string_literal: return Target.getWCharWidth(); case tok::utf16_char_constant: case tok::utf16_string_literal: return Target.getChar16Width(); case tok::utf32_char_constant: case tok::utf32_string_literal: return Target.getChar32Width(); } } static CharSourceRange MakeCharSourceRange(const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd) { SourceLocation Begin = Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, TokLoc.getManager(), Features); SourceLocation End = Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin, TokLoc.getManager(), Features); return CharSourceRange::getCharRange(Begin, End); } /// Produce a diagnostic highlighting some portion of a literal. /// /// Emits the diagnostic \p DiagID, highlighting the range of characters from /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be /// a substring of a spelling buffer for the token beginning at \p TokBegin. static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID) { SourceLocation Begin = Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, TokLoc.getManager(), Features); return Diags->Report(Begin, DiagID) << MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd); } /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. static unsigned ProcessCharEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, bool &HadError, FullSourceLoc Loc, unsigned CharWidth, DiagnosticsEngine *Diags, const LangOptions &Features) { const char *EscapeBegin = ThisTokBuf; // Skip the '\' char. ++ThisTokBuf; // We know that this character can't be off the end of the buffer, because // that would have been \", which would not have been the end of string. unsigned ResultChar = *ThisTokBuf++; switch (ResultChar) { // These map to themselves. case '\\': case '\'': case '"': case '?': break; // These have fixed mappings. case 'a': // TODO: K&R: the meaning of '\\a' is different in traditional C ResultChar = 7; break; case 'b': ResultChar = 8; break; case 'e': if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::ext_nonstandard_escape) << "e"; ResultChar = 27; break; case 'E': if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::ext_nonstandard_escape) << "E"; ResultChar = 27; break; case 'f': ResultChar = 12; break; case 'n': ResultChar = 10; break; case 'r': ResultChar = 13; break; case 't': ResultChar = 9; break; case 'v': ResultChar = 11; break; case 'x': { // Hex escape. ResultChar = 0; if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::err_hex_escape_no_digits) << "x"; HadError = true; break; } // Hex escapes are a maximal series of hex digits. bool Overflow = false; for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); if (CharVal == -1) break; // About to shift out a digit? if (ResultChar & 0xF0000000) Overflow = true; ResultChar <<= 4; ResultChar |= CharVal; } // See if any bits will be truncated when evaluated as a character. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { Overflow = true; ResultChar &= ~0U >> (32-CharWidth); } // Check for overflow. if (Overflow && Diags) // Too many digits to fit in Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::err_escape_too_large) << 0; break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal escapes. --ThisTokBuf; ResultChar = 0; // Octal escapes are a series of octal digits with maximum length 3. // "\0123" is a two digit sequence equal to "\012" "3". unsigned NumDigits = 0; do { ResultChar <<= 3; ResultChar |= *ThisTokBuf++ - '0'; ++NumDigits; } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); // Check for overflow. Reject '\777', but not L'\777'. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::err_escape_too_large) << 1; ResultChar &= ~0U >> (32-CharWidth); } break; } // Otherwise, these are not valid escapes. case '(': case '{': case '[': case '%': // GCC accepts these as extensions. We warn about them as such though. if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::ext_nonstandard_escape) << std::string(1, ResultChar); break; default: if (!Diags) break; if (isPrintable(ResultChar)) Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::ext_unknown_escape) << std::string(1, ResultChar); else Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, diag::ext_unknown_escape) << "x" + llvm::utohexstr(ResultChar); break; } return ResultChar; } static void appendCodePoint(unsigned Codepoint, llvm::SmallVectorImpl &Str) { char ResultBuf[4]; char *ResultPtr = ResultBuf; bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); (void)Res; assert(Res && "Unexpected conversion failure"); Str.append(ResultBuf, ResultPtr); } void clang::expandUCNs(SmallVectorImpl &Buf, StringRef Input) { for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { if (*I != '\\') { Buf.push_back(*I); continue; } ++I; assert(*I == 'u' || *I == 'U'); unsigned NumHexDigits; if (*I == 'u') NumHexDigits = 4; else NumHexDigits = 8; assert(I + NumHexDigits <= E); uint32_t CodePoint = 0; for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { unsigned Value = llvm::hexDigitValue(*I); assert(Value != -1U); CodePoint <<= 4; CodePoint += Value; } appendCodePoint(CodePoint, Buf); --I; } } /// ProcessUCNEscape - Read the Universal Character Name, check constraints and /// return the UTF32. static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, uint32_t &UcnVal, unsigned short &UcnLen, FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features, bool in_char_string_literal = false) { const char *UcnBegin = ThisTokBuf; // Skip the '\u' char's. ThisTokBuf += 2; if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1); return false; } UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); unsigned short UcnLenSave = UcnLen; for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); if (CharVal == -1) break; UcnVal <<= 4; UcnVal |= CharVal; } // If we didn't consume the proper number of digits, there is a problem. if (UcnLenSave) { if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::err_ucn_escape_incomplete); return false; } // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints UcnVal > 0x10FFFF) { // maximum legal UTF32 value if (Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::err_ucn_escape_invalid); return false; } // C++11 allows UCNs that refer to control characters and basic source // characters inside character and string literals if (UcnVal < 0xa0 && (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, ` bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal); if (Diags) { char BasicSCSChar = UcnVal; if (UcnVal >= 0x20 && UcnVal < 0x7f) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, IsError ? diag::err_ucn_escape_basic_scs : diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) << StringRef(&BasicSCSChar, 1); else Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, IsError ? diag::err_ucn_control_character : diag::warn_cxx98_compat_literal_ucn_control_character); } if (IsError) return false; } if (!Features.CPlusPlus && !Features.C99 && Diags) Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, diag::warn_ucn_not_valid_in_c89_literal); return true; } /// MeasureUCNEscape - Determine the number of bytes within the resulting string /// which this UCN will occupy. static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, unsigned CharByteWidth, const LangOptions &Features, bool &HadError) { // UTF-32: 4 bytes per escape. if (CharByteWidth == 4) return 4; uint32_t UcnVal = 0; unsigned short UcnLen = 0; FullSourceLoc Loc; if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, nullptr, Features, true)) { HadError = true; return 0; } // UTF-16: 2 bytes for BMP, 4 bytes otherwise. if (CharByteWidth == 2) return UcnVal <= 0xFFFF ? 2 : 4; // UTF-8. if (UcnVal < 0x80) return 1; if (UcnVal < 0x800) return 2; if (UcnVal < 0x10000) return 3; return 4; } /// EncodeUCNEscape - Read the Universal Character Name, check constraints and /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of /// StringLiteralParser. When we decide to implement UCN's for identifiers, /// we will likely rework our support for UCN's. static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, const char *ThisTokEnd, char *&ResultBuf, bool &HadError, FullSourceLoc Loc, unsigned CharByteWidth, DiagnosticsEngine *Diags, const LangOptions &Features) { typedef uint32_t UTF32; UTF32 UcnVal = 0; unsigned short UcnLen = 0; if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags, Features, true)) { HadError = true; return; } assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && "only character widths of 1, 2, or 4 bytes supported"); (void)UcnLen; assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. llvm::UTF32 *ResultPtr = reinterpret_cast(ResultBuf); *ResultPtr = UcnVal; ResultBuf += 4; return; } if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. llvm::UTF16 *ResultPtr = reinterpret_cast(ResultBuf); if (UcnVal <= (UTF32)0xFFFF) { *ResultPtr = UcnVal; ResultBuf += 2; return; } // Convert to UTF16. UcnVal -= 0x10000; *ResultPtr = 0xD800 + (UcnVal >> 10); *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF); ResultBuf += 4; return; } assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. // The conversion below was inspired by: // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c // First, we determine how many bytes the result will require. typedef uint8_t UTF8; unsigned short bytesToWrite = 0; if (UcnVal < (UTF32)0x80) bytesToWrite = 1; else if (UcnVal < (UTF32)0x800) bytesToWrite = 2; else if (UcnVal < (UTF32)0x10000) bytesToWrite = 3; else bytesToWrite = 4; const unsigned byteMask = 0xBF; const unsigned byteMark = 0x80; // Once the bits are split out into bytes of UTF8, this is a mask OR-ed // into the first byte, depending on how many bytes follow. static const UTF8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 }; // Finally, we write the bytes into ResultBuf. ResultBuf += bytesToWrite; switch (bytesToWrite) { // note: everything falls through. case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; LLVM_FALLTHROUGH; case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; LLVM_FALLTHROUGH; case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; LLVM_FALLTHROUGH; case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); } // Update the buffer. ResultBuf += bytesToWrite; } /// integer-constant: [C99 6.4.4.1] /// decimal-constant integer-suffix /// octal-constant integer-suffix /// hexadecimal-constant integer-suffix /// binary-literal integer-suffix [GNU, C++1y] /// user-defined-integer-literal: [C++11 lex.ext] /// decimal-literal ud-suffix /// octal-literal ud-suffix /// hexadecimal-literal ud-suffix /// binary-literal ud-suffix [GNU, C++1y] /// decimal-constant: /// nonzero-digit /// decimal-constant digit /// octal-constant: /// 0 /// octal-constant octal-digit /// hexadecimal-constant: /// hexadecimal-prefix hexadecimal-digit /// hexadecimal-constant hexadecimal-digit /// hexadecimal-prefix: one of /// 0x 0X /// binary-literal: /// 0b binary-digit /// 0B binary-digit /// binary-literal binary-digit /// integer-suffix: /// unsigned-suffix [long-suffix] /// unsigned-suffix [long-long-suffix] /// long-suffix [unsigned-suffix] /// long-long-suffix [unsigned-sufix] /// nonzero-digit: /// 1 2 3 4 5 6 7 8 9 /// octal-digit: /// 0 1 2 3 4 5 6 7 /// hexadecimal-digit: /// 0 1 2 3 4 5 6 7 8 9 /// a b c d e f /// A B C D E F /// binary-digit: /// 0 /// 1 /// unsigned-suffix: one of /// u U /// long-suffix: one of /// l L /// long-long-suffix: one of /// ll LL /// /// floating-constant: [C99 6.4.4.2] /// TODO: add rules... /// NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, SourceLocation TokLoc, const SourceManager &SM, const LangOptions &LangOpts, const TargetInfo &Target, DiagnosticsEngine &Diags) : SM(SM), LangOpts(LangOpts), Diags(Diags), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) { // This routine assumes that the range begin/end matches the regex for integer // and FP constants (specifically, the 'pp-number' regex), and assumes that // the byte at "*end" is both valid and not part of the regex. Because of // this, it doesn't have to check for 'overscan' in various places. assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?"); s = DigitsBegin = ThisTokBegin; saw_exponent = false; saw_period = false; saw_ud_suffix = false; saw_fixed_point_suffix = false; isLong = false; isUnsigned = false; isLongLong = false; isHalf = false; isFloat = false; isImaginary = false; isFloat16 = false; isFloat128 = false; MicrosoftInteger = 0; isFract = false; isAccum = false; hadError = false; if (*s == '0') { // parse radix ParseNumberStartingWithZero(TokLoc); if (hadError) return; } else { // the first digit is non-zero radix = 10; s = SkipDigits(s); if (s == ThisTokEnd) { // Done. } else { ParseDecimalOrOctalCommon(TokLoc); if (hadError) return; } } SuffixBegin = s; checkSeparator(TokLoc, s, CSK_AfterDigits); // Initial scan to lookahead for fixed point suffix. if (LangOpts.FixedPoint) { for (const char *c = s; c != ThisTokEnd; ++c) { if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') { saw_fixed_point_suffix = true; break; } } } // Parse the suffix. At this point we can classify whether we have an FP or // integer constant. bool isFixedPointConstant = isFixedPointLiteral(); bool isFPConstant = isFloatingLiteral(); // Loop over all of the characters of the suffix. If we see something bad, // we break out of the loop. for (; s != ThisTokEnd; ++s) { switch (*s) { case 'R': case 'r': if (!LangOpts.FixedPoint) break; if (isFract || isAccum) break; if (!(saw_period || saw_exponent)) break; isFract = true; continue; case 'K': case 'k': if (!LangOpts.FixedPoint) break; if (isFract || isAccum) break; if (!(saw_period || saw_exponent)) break; isAccum = true; continue; case 'h': // FP Suffix for "half". case 'H': // OpenCL Extension v1.2 s9.5 - h or H suffix for half type. if (!(LangOpts.Half || LangOpts.FixedPoint)) break; if (isIntegerLiteral()) break; // Error for integer constant. if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid. isHalf = true; continue; // Success. case 'f': // FP Suffix for "float" case 'F': if (!isFPConstant) break; // Error for integer constant. if (isHalf || isFloat || isLong || isFloat128) break; // HF, FF, LF, QF invalid. // CUDA host and device may have different _Float16 support, therefore // allows f16 literals to avoid false alarm. // ToDo: more precise check for CUDA. if ((Target.hasFloat16Type() || LangOpts.CUDA) && s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') { s += 2; // success, eat up 2 characters. isFloat16 = true; continue; } isFloat = true; continue; // Success. case 'q': // FP Suffix for "__float128" case 'Q': if (!isFPConstant) break; // Error for integer constant. if (isHalf || isFloat || isLong || isFloat128) break; // HQ, FQ, LQ, QQ invalid. isFloat128 = true; continue; // Success. case 'u': case 'U': if (isFPConstant) break; // Error for floating constant. if (isUnsigned) break; // Cannot be repeated. isUnsigned = true; continue; // Success. case 'l': case 'L': if (isLong || isLongLong) break; // Cannot be repeated. if (isHalf || isFloat || isFloat128) break; // LH, LF, LQ invalid. // Check for long long. The L's need to be adjacent and the same case. if (s[1] == s[0]) { assert(s + 1 < ThisTokEnd && "didn't maximally munch?"); if (isFPConstant) break; // long long invalid for floats. isLongLong = true; ++s; // Eat both of them. } else { isLong = true; } continue; // Success. case 'i': case 'I': if (LangOpts.MicrosoftExt) { if (isLong || isLongLong || MicrosoftInteger) break; if (!isFPConstant) { // Allow i8, i16, i32, and i64. switch (s[1]) { case '8': s += 2; // i8 suffix MicrosoftInteger = 8; break; case '1': if (s[2] == '6') { s += 3; // i16 suffix MicrosoftInteger = 16; } break; case '3': if (s[2] == '2') { s += 3; // i32 suffix MicrosoftInteger = 32; } break; case '6': if (s[2] == '4') { s += 3; // i64 suffix MicrosoftInteger = 64; } break; default: break; } } if (MicrosoftInteger) { assert(s <= ThisTokEnd && "didn't maximally munch?"); break; } } LLVM_FALLTHROUGH; case 'j': case 'J': if (isImaginary) break; // Cannot be repeated. isImaginary = true; continue; // Success. } // If we reached here, there was an error or a ud-suffix. break; } // "i", "if", and "il" are user-defined suffixes in C++1y. if (s != ThisTokEnd || isImaginary) { // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); if (isValidUDSuffix(LangOpts, UDSuffixBuf)) { if (!isImaginary) { // Any suffix pieces we might have parsed are actually part of the // ud-suffix. isLong = false; isUnsigned = false; isLongLong = false; isFloat = false; isFloat16 = false; isHalf = false; isImaginary = false; MicrosoftInteger = 0; saw_fixed_point_suffix = false; isFract = false; isAccum = false; } saw_ud_suffix = true; return; } if (s != ThisTokEnd) { // Report an error if there are any. Diags.Report(Lexer::AdvanceToTokenCharacter( TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts), diag::err_invalid_suffix_constant) << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) << (isFixedPointConstant ? 2 : isFPConstant); hadError = true; } } if (!hadError && saw_fixed_point_suffix) { assert(isFract || isAccum); } } /// ParseDecimalOrOctalCommon - This method is called for decimal or octal /// numbers. It issues an error for illegal digits, and handles floating point /// parsing. If it detects a floating point number, the radix is set to 10. void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){ assert((radix == 8 || radix == 10) && "Unexpected radix"); // If we have a hex digit other than 'e' (which denotes a FP exponent) then // the code is using an incorrect base. if (isHexDigit(*s) && *s != 'e' && *s != 'E' && !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) { Diags.Report( Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts), diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0); hadError = true; return; } if (*s == '.') { checkSeparator(TokLoc, s, CSK_AfterDigits); s++; radix = 10; saw_period = true; checkSeparator(TokLoc, s, CSK_BeforeDigits); s = SkipDigits(s); // Skip suffix. } if (*s == 'e' || *s == 'E') { // exponent checkSeparator(TokLoc, s, CSK_AfterDigits); const char *Exponent = s; s++; radix = 10; saw_exponent = true; if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign const char *first_non_digit = SkipDigits(s); if (containsDigits(s, first_non_digit)) { checkSeparator(TokLoc, s, CSK_BeforeDigits); s = first_non_digit; } else { if (!hadError) { Diags.Report(Lexer::AdvanceToTokenCharacter( TokLoc, Exponent - ThisTokBegin, SM, LangOpts), diag::err_exponent_has_no_digits); hadError = true; } return; } } } /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved /// suffixes as ud-suffixes, because the diagnostic experience is better if we /// treat it as an invalid suffix. bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix) { if (!LangOpts.CPlusPlus11 || Suffix.empty()) return false; // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid. if (Suffix[0] == '_') return true; // In C++11, there are no library suffixes. if (!LangOpts.CPlusPlus14) return false; // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library. // Per tweaked N3660, "il", "i", and "if" are also used in the library. // In C++2a "d" and "y" are used in the library. return llvm::StringSwitch(Suffix) .Cases("h", "min", "s", true) .Cases("ms", "us", "ns", true) .Cases("il", "i", "if", true) .Cases("d", "y", LangOpts.CPlusPlus20) .Default(false); } void NumericLiteralParser::checkSeparator(SourceLocation TokLoc, const char *Pos, CheckSeparatorKind IsAfterDigits) { if (IsAfterDigits == CSK_AfterDigits) { if (Pos == ThisTokBegin) return; --Pos; } else if (Pos == ThisTokEnd) return; if (isDigitSeparator(*Pos)) { Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM, LangOpts), diag::err_digit_separator_not_between_digits) << IsAfterDigits; hadError = true; } } /// ParseNumberStartingWithZero - This method is called when the first character /// of the number is found to be a zero. This means it is either an octal /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or /// a floating point number (01239.123e4). Eat the prefix, determining the /// radix etc. void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { assert(s[0] == '0' && "Invalid method call"); s++; int c1 = s[0]; // Handle a hex number like 0x1234. if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) { s++; assert(s < ThisTokEnd && "didn't maximally munch?"); radix = 16; DigitsBegin = s; s = SkipHexDigits(s); bool HasSignificandDigits = containsDigits(DigitsBegin, s); if (s == ThisTokEnd) { // Done. } else if (*s == '.') { s++; saw_period = true; const char *floatDigitsBegin = s; s = SkipHexDigits(s); if (containsDigits(floatDigitsBegin, s)) HasSignificandDigits = true; if (HasSignificandDigits) checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits); } if (!HasSignificandDigits) { Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts), diag::err_hex_constant_requires) << LangOpts.CPlusPlus << 1; hadError = true; return; } // A binary exponent can appear with or with a '.'. If dotted, the // binary exponent is required. if (*s == 'p' || *s == 'P') { checkSeparator(TokLoc, s, CSK_AfterDigits); const char *Exponent = s; s++; saw_exponent = true; if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign const char *first_non_digit = SkipDigits(s); if (!containsDigits(s, first_non_digit)) { if (!hadError) { Diags.Report(Lexer::AdvanceToTokenCharacter( TokLoc, Exponent - ThisTokBegin, SM, LangOpts), diag::err_exponent_has_no_digits); hadError = true; } return; } checkSeparator(TokLoc, s, CSK_BeforeDigits); s = first_non_digit; if (!LangOpts.HexFloats) Diags.Report(TokLoc, LangOpts.CPlusPlus ? diag::ext_hex_literal_invalid : diag::ext_hex_constant_invalid); else if (LangOpts.CPlusPlus17) Diags.Report(TokLoc, diag::warn_cxx17_hex_literal); } else if (saw_period) { Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts), diag::err_hex_constant_requires) << LangOpts.CPlusPlus << 0; hadError = true; } return; } // Handle simple binary numbers 0b01010 if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) { // 0b101010 is a C++1y / GCC extension. Diags.Report(TokLoc, LangOpts.CPlusPlus14 ? diag::warn_cxx11_compat_binary_literal : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14 : diag::ext_binary_literal); ++s; assert(s < ThisTokEnd && "didn't maximally munch?"); radix = 2; DigitsBegin = s; s = SkipBinaryDigits(s); if (s == ThisTokEnd) { // Done. } else if (isHexDigit(*s) && !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) { Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts), diag::err_invalid_digit) << StringRef(s, 1) << 2; hadError = true; } // Other suffixes will be diagnosed by the caller. return; } // For now, the radix is set to 8. If we discover that we have a // floating point constant, the radix will change to 10. Octal floating // point constants are not permitted (only decimal and hexadecimal). radix = 8; DigitsBegin = s; s = SkipOctalDigits(s); if (s == ThisTokEnd) return; // Done, simple octal number like 01234 // If we have some other non-octal digit that *is* a decimal digit, see if // this is part of a floating point number like 094.123 or 09e1. if (isDigit(*s)) { const char *EndDecimal = SkipDigits(s); if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') { s = EndDecimal; radix = 10; } } ParseDecimalOrOctalCommon(TokLoc); } static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) { switch (Radix) { case 2: return NumDigits <= 64; case 8: return NumDigits <= 64 / 3; // Digits are groups of 3 bits. case 10: return NumDigits <= 19; // floor(log10(2^64)) case 16: return NumDigits <= 64 / 4; // Digits are groups of 4 bits. default: llvm_unreachable("impossible Radix"); } } /// GetIntegerValue - Convert this numeric literal value to an APInt that /// matches Val's input width. If there is an overflow, set Val to the low bits /// of the result and return true. Otherwise, return false. bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { // Fast path: Compute a conservative bound on the maximum number of // bits per digit in this radix. If we can't possibly overflow a // uint64 based on that bound then do the simple conversion to // integer. This avoids the expensive overflow checking below, and // handles the common cases that matter (small decimal integers and // hex/octal values which don't overflow). const unsigned NumDigits = SuffixBegin - DigitsBegin; if (alwaysFitsInto64Bits(radix, NumDigits)) { uint64_t N = 0; for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr) if (!isDigitSeparator(*Ptr)) N = N * radix + llvm::hexDigitValue(*Ptr); // This will truncate the value to Val's input width. Simply check // for overflow by comparing. Val = N; return Val.getZExtValue() != N; } Val = 0; const char *Ptr = DigitsBegin; llvm::APInt RadixVal(Val.getBitWidth(), radix); llvm::APInt CharVal(Val.getBitWidth(), 0); llvm::APInt OldVal = Val; bool OverflowOccurred = false; while (Ptr < SuffixBegin) { if (isDigitSeparator(*Ptr)) { ++Ptr; continue; } unsigned C = llvm::hexDigitValue(*Ptr++); // If this letter is out of bound for this radix, reject it. assert(C < radix && "NumericLiteralParser ctor should have rejected this"); CharVal = C; // Add the digit to the value in the appropriate radix. If adding in digits // made the value smaller, then this overflowed. OldVal = Val; // Multiply by radix, did overflow occur on the multiply? Val *= RadixVal; OverflowOccurred |= Val.udiv(RadixVal) != OldVal; // Add value, did overflow occur on the value? // (a + b) ult b <=> overflow Val += CharVal; OverflowOccurred |= Val.ult(CharVal); } return OverflowOccurred; } llvm::APFloat::opStatus NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { using llvm::APFloat; unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); llvm::SmallString<16> Buffer; StringRef Str(ThisTokBegin, n); if (Str.find('\'') != StringRef::npos) { Buffer.reserve(n); std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer), &isDigitSeparator); Str = Buffer; } auto StatusOrErr = Result.convertFromString(Str, APFloat::rmNearestTiesToEven); assert(StatusOrErr && "Invalid floating point representation"); return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr : APFloat::opInvalidOp; } static inline bool IsExponentPart(char c) { return c == 'p' || c == 'P' || c == 'e' || c == 'E'; } bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { assert(radix == 16 || radix == 10); // Find how many digits are needed to store the whole literal. unsigned NumDigits = SuffixBegin - DigitsBegin; if (saw_period) --NumDigits; // Initial scan of the exponent if it exists bool ExpOverflowOccurred = false; bool NegativeExponent = false; const char *ExponentBegin; uint64_t Exponent = 0; int64_t BaseShift = 0; if (saw_exponent) { const char *Ptr = DigitsBegin; while (!IsExponentPart(*Ptr)) ++Ptr; ExponentBegin = Ptr; ++Ptr; NegativeExponent = *Ptr == '-'; if (NegativeExponent) ++Ptr; unsigned NumExpDigits = SuffixBegin - Ptr; if (alwaysFitsInto64Bits(radix, NumExpDigits)) { llvm::StringRef ExpStr(Ptr, NumExpDigits); llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10); Exponent = ExpInt.getZExtValue(); } else { ExpOverflowOccurred = true; } if (NegativeExponent) BaseShift -= Exponent; else BaseShift += Exponent; } // Number of bits needed for decimal literal is // ceil(NumDigits * log2(10)) Integral part // + Scale Fractional part // + ceil(Exponent * log2(10)) Exponent // -------------------------------------------------- // ceil((NumDigits + Exponent) * log2(10)) + Scale // // But for simplicity in handling integers, we can round up log2(10) to 4, // making: // 4 * (NumDigits + Exponent) + Scale // // Number of digits needed for hexadecimal literal is // 4 * NumDigits Integral part // + Scale Fractional part // + Exponent Exponent // -------------------------------------------------- // (4 * NumDigits) + Scale + Exponent uint64_t NumBitsNeeded; if (radix == 10) NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale; else NumBitsNeeded = 4 * NumDigits + Exponent + Scale; if (NumBitsNeeded > std::numeric_limits::max()) ExpOverflowOccurred = true; llvm::APInt Val(static_cast(NumBitsNeeded), 0, /*isSigned=*/false); bool FoundDecimal = false; int64_t FractBaseShift = 0; const char *End = saw_exponent ? ExponentBegin : SuffixBegin; for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) { if (*Ptr == '.') { FoundDecimal = true; continue; } // Normal reading of an integer unsigned C = llvm::hexDigitValue(*Ptr); assert(C < radix && "NumericLiteralParser ctor should have rejected this"); Val *= radix; Val += C; if (FoundDecimal) // Keep track of how much we will need to adjust this value by from the // number of digits past the radix point. --FractBaseShift; } // For a radix of 16, we will be multiplying by 2 instead of 16. if (radix == 16) FractBaseShift *= 4; BaseShift += FractBaseShift; Val <<= Scale; uint64_t Base = (radix == 16) ? 2 : 10; if (BaseShift > 0) { for (int64_t i = 0; i < BaseShift; ++i) { Val *= Base; } } else if (BaseShift < 0) { for (int64_t i = BaseShift; i < 0 && !Val.isNullValue(); ++i) Val = Val.udiv(Base); } bool IntOverflowOccurred = false; auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth()); if (Val.getBitWidth() > StoreVal.getBitWidth()) { IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth())); StoreVal = Val.trunc(StoreVal.getBitWidth()); } else if (Val.getBitWidth() < StoreVal.getBitWidth()) { IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal); StoreVal = Val.zext(StoreVal.getBitWidth()); } else { StoreVal = Val; } return IntOverflowOccurred || ExpOverflowOccurred; } /// \verbatim /// user-defined-character-literal: [C++11 lex.ext] /// character-literal ud-suffix /// ud-suffix: /// identifier /// character-literal: [C++11 lex.ccon] /// ' c-char-sequence ' /// u' c-char-sequence ' /// U' c-char-sequence ' /// L' c-char-sequence ' /// u8' c-char-sequence ' [C++1z lex.ccon] /// c-char-sequence: /// c-char /// c-char-sequence c-char /// c-char: /// any member of the source character set except the single-quote ', /// backslash \, or new-line character /// escape-sequence /// universal-character-name /// escape-sequence: /// simple-escape-sequence /// octal-escape-sequence /// hexadecimal-escape-sequence /// simple-escape-sequence: /// one of \' \" \? \\ \a \b \f \n \r \t \v /// octal-escape-sequence: /// \ octal-digit /// \ octal-digit octal-digit /// \ octal-digit octal-digit octal-digit /// hexadecimal-escape-sequence: /// \x hexadecimal-digit /// hexadecimal-escape-sequence hexadecimal-digit /// universal-character-name: [C++11 lex.charset] /// \u hex-quad /// \U hex-quad hex-quad /// hex-quad: /// hex-digit hex-digit hex-digit hex-digit /// \endverbatim /// CharLiteralParser::CharLiteralParser(const char *begin, const char *end, SourceLocation Loc, Preprocessor &PP, tok::TokenKind kind) { // At this point we know that the character matches the regex "(L|u|U)?'.*'". HadError = false; Kind = kind; const char *TokBegin = begin; // Skip over wide character determinant. if (Kind != tok::char_constant) ++begin; if (Kind == tok::utf8_char_constant) ++begin; // Skip over the entry quote. assert(begin[0] == '\'' && "Invalid token lexed"); ++begin; // Remove an optional ud-suffix. if (end[-1] != '\'') { const char *UDSuffixEnd = end; do { --end; } while (end[-1] != '\''); // FIXME: Don't bother with this if !tok.hasUCN(). expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end)); UDSuffixOffset = end - TokBegin; } // Trim the ending quote. assert(end != begin && "Invalid token lexed"); --end; // FIXME: The "Value" is an uint64_t so we can handle char literals of // up to 64-bits. // FIXME: This extensively assumes that 'char' is 8-bits. assert(PP.getTargetInfo().getCharWidth() == 8 && "Assumes char is 8 bits"); assert(PP.getTargetInfo().getIntWidth() <= 64 && (PP.getTargetInfo().getIntWidth() & 7) == 0 && "Assumes sizeof(int) on target is <= 64 and a multiple of char"); assert(PP.getTargetInfo().getWCharWidth() <= 64 && "Assumes sizeof(wchar) on target is <= 64"); SmallVector codepoint_buffer; codepoint_buffer.resize(end - begin); uint32_t *buffer_begin = &codepoint_buffer.front(); uint32_t *buffer_end = buffer_begin + codepoint_buffer.size(); // Unicode escapes representing characters that cannot be correctly // represented in a single code unit are disallowed in character literals // by this implementation. uint32_t largest_character_for_kind; if (tok::wide_char_constant == Kind) { largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth()); } else if (tok::utf8_char_constant == Kind) { largest_character_for_kind = 0x7F; } else if (tok::utf16_char_constant == Kind) { largest_character_for_kind = 0xFFFF; } else if (tok::utf32_char_constant == Kind) { largest_character_for_kind = 0x10FFFF; } else { largest_character_for_kind = 0x7Fu; } while (begin != end) { // Is this a span of non-escape characters? if (begin[0] != '\\') { char const *start = begin; do { ++begin; } while (begin != end && *begin != '\\'); char const *tmp_in_start = start; uint32_t *tmp_out_start = buffer_begin; llvm::ConversionResult res = llvm::ConvertUTF8toUTF32(reinterpret_cast(&start), reinterpret_cast(begin), &buffer_begin, buffer_end, llvm::strictConversion); if (res != llvm::conversionOK) { // If we see bad encoding for unprefixed character literals, warn and // simply copy the byte values, for compatibility with gcc and // older versions of clang. bool NoErrorOnBadEncoding = isAscii(); unsigned Msg = diag::err_bad_character_encoding; if (NoErrorOnBadEncoding) Msg = diag::warn_bad_character_encoding; PP.Diag(Loc, Msg); if (NoErrorOnBadEncoding) { start = tmp_in_start; buffer_begin = tmp_out_start; for (; start != begin; ++start, ++buffer_begin) *buffer_begin = static_cast(*start); } else { HadError = true; } } else { for (; tmp_out_start < buffer_begin; ++tmp_out_start) { if (*tmp_out_start > largest_character_for_kind) { HadError = true; PP.Diag(Loc, diag::err_character_too_large); } } } continue; } // Is this a Universal Character Name escape? if (begin[1] == 'u' || begin[1] == 'U') { unsigned short UcnLen = 0; if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, FullSourceLoc(Loc, PP.getSourceManager()), &PP.getDiagnostics(), PP.getLangOpts(), true)) { HadError = true; } else if (*buffer_begin > largest_character_for_kind) { HadError = true; PP.Diag(Loc, diag::err_character_too_large); } ++buffer_begin; continue; } unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); uint64_t result = ProcessCharEscape(TokBegin, begin, end, HadError, FullSourceLoc(Loc,PP.getSourceManager()), CharWidth, &PP.getDiagnostics(), PP.getLangOpts()); *buffer_begin++ = result; } unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front(); if (NumCharsSoFar > 1) { if (isWide()) PP.Diag(Loc, diag::warn_extraneous_char_constant); else if (isAscii() && NumCharsSoFar == 4) PP.Diag(Loc, diag::warn_four_char_character_literal); else if (isAscii()) PP.Diag(Loc, diag::warn_multichar_character_literal); else PP.Diag(Loc, diag::err_multichar_utf_character_literal); IsMultiChar = true; } else { IsMultiChar = false; } llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0); // Narrow character literals act as though their value is concatenated // in this implementation, but warn on overflow. bool multi_char_too_long = false; if (isAscii() && isMultiChar()) { LitVal = 0; for (size_t i = 0; i < NumCharsSoFar; ++i) { // check for enough leading zeros to shift into multi_char_too_long |= (LitVal.countLeadingZeros() < 8); LitVal <<= 8; LitVal = LitVal + (codepoint_buffer[i] & 0xFF); } } else if (NumCharsSoFar > 0) { // otherwise just take the last character LitVal = buffer_begin[-1]; } if (!HadError && multi_char_too_long) { PP.Diag(Loc, diag::warn_char_constant_too_large); } // Transfer the value from APInt to uint64_t Value = LitVal.getZExtValue(); // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple // character constants are not sign extended in the this implementation: // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. if (isAscii() && NumCharsSoFar == 1 && (Value & 128) && PP.getLangOpts().CharIsSigned) Value = (signed char)Value; } /// \verbatim /// string-literal: [C++0x lex.string] /// encoding-prefix " [s-char-sequence] " /// encoding-prefix R raw-string /// encoding-prefix: /// u8 /// u /// U /// L /// s-char-sequence: /// s-char /// s-char-sequence s-char /// s-char: /// any member of the source character set except the double-quote ", /// backslash \, or new-line character /// escape-sequence /// universal-character-name /// raw-string: /// " d-char-sequence ( r-char-sequence ) d-char-sequence " /// r-char-sequence: /// r-char /// r-char-sequence r-char /// r-char: /// any member of the source character set, except a right parenthesis ) /// followed by the initial d-char-sequence (which may be empty) /// followed by a double quote ". /// d-char-sequence: /// d-char /// d-char-sequence d-char /// d-char: /// any member of the basic source character set except: /// space, the left parenthesis (, the right parenthesis ), /// the backslash \, and the control characters representing horizontal /// tab, vertical tab, form feed, and newline. /// escape-sequence: [C++0x lex.ccon] /// simple-escape-sequence /// octal-escape-sequence /// hexadecimal-escape-sequence /// simple-escape-sequence: /// one of \' \" \? \\ \a \b \f \n \r \t \v /// octal-escape-sequence: /// \ octal-digit /// \ octal-digit octal-digit /// \ octal-digit octal-digit octal-digit /// hexadecimal-escape-sequence: /// \x hexadecimal-digit /// hexadecimal-escape-sequence hexadecimal-digit /// universal-character-name: /// \u hex-quad /// \U hex-quad hex-quad /// hex-quad: /// hex-digit hex-digit hex-digit hex-digit /// \endverbatim /// StringLiteralParser:: StringLiteralParser(ArrayRef StringToks, Preprocessor &PP, bool Complain) : SM(PP.getSourceManager()), Features(PP.getLangOpts()), Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { init(StringToks); } void StringLiteralParser::init(ArrayRef StringToks){ // The literal token may have come from an invalid source location (e.g. due // to a PCH error), in which case the token length will be 0. if (StringToks.empty() || StringToks[0].getLength() < 2) return DiagnoseLexingError(SourceLocation()); // Scan all of the string portions, remember the max individual token length, // computing a bound on the concatenated string length, and see whether any // piece is a wide-string. If any of the string portions is a wide-string // literal, the result is a wide-string literal [C99 6.4.5p4]. assert(!StringToks.empty() && "expected at least one token"); MaxTokenLength = StringToks[0].getLength(); assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); SizeBound = StringToks[0].getLength()-2; // -2 for "". Kind = StringToks[0].getKind(); hadError = false; // Implement Translation Phase #6: concatenation of string literals /// (C99 5.1.1.2p1). The common case is only one string fragment. for (unsigned i = 1; i != StringToks.size(); ++i) { if (StringToks[i].getLength() < 2) return DiagnoseLexingError(StringToks[i].getLocation()); // The string could be shorter than this if it needs cleaning, but this is a // reasonable bound, which is all we need. assert(StringToks[i].getLength() >= 2 && "literal token is invalid!"); SizeBound += StringToks[i].getLength()-2; // -2 for "". // Remember maximum string piece length. if (StringToks[i].getLength() > MaxTokenLength) MaxTokenLength = StringToks[i].getLength(); // Remember if we see any wide or utf-8/16/32 strings. // Also check for illegal concatenations. if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) { if (isAscii()) { Kind = StringToks[i].getKind(); } else { if (Diags) Diags->Report(StringToks[i].getLocation(), diag::err_unsupported_string_concat); hadError = true; } } } // Include space for the null terminator. ++SizeBound; // TODO: K&R warning: "traditional C rejects string constant concatenation" // Get the width in bytes of char/wchar_t/char16_t/char32_t CharByteWidth = getCharWidth(Kind, Target); assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple"); CharByteWidth /= 8; // The output buffer size needs to be large enough to hold wide characters. // This is a worst-case assumption which basically corresponds to L"" "long". SizeBound *= CharByteWidth; // Size the temporary buffer to hold the result string data. ResultBuf.resize(SizeBound); // Likewise, but for each string piece. SmallString<512> TokenBuf; TokenBuf.resize(MaxTokenLength); // Loop over all the strings, getting their spelling, and expanding them to // wide strings as appropriate. ResultPtr = &ResultBuf[0]; // Next byte to fill in. Pascal = false; SourceLocation UDSuffixTokLoc; for (unsigned i = 0, e = StringToks.size(); i != e; ++i) { const char *ThisTokBuf = &TokenBuf[0]; // Get the spelling of the token, which eliminates trigraphs, etc. We know // that ThisTokBuf points to a buffer that is big enough for the whole token // and 'spelled' tokens can only shrink. bool StringInvalid = false; unsigned ThisTokLen = Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features, &StringInvalid); if (StringInvalid) return DiagnoseLexingError(StringToks[i].getLocation()); const char *ThisTokBegin = ThisTokBuf; const char *ThisTokEnd = ThisTokBuf+ThisTokLen; // Remove an optional ud-suffix. if (ThisTokEnd[-1] != '"') { const char *UDSuffixEnd = ThisTokEnd; do { --ThisTokEnd; } while (ThisTokEnd[-1] != '"'); StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd); if (UDSuffixBuf.empty()) { if (StringToks[i].hasUCN()) expandUCNs(UDSuffixBuf, UDSuffix); else UDSuffixBuf.assign(UDSuffix); UDSuffixToken = i; UDSuffixOffset = ThisTokEnd - ThisTokBuf; UDSuffixTokLoc = StringToks[i].getLocation(); } else { SmallString<32> ExpandedUDSuffix; if (StringToks[i].hasUCN()) { expandUCNs(ExpandedUDSuffix, UDSuffix); UDSuffix = ExpandedUDSuffix; } // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the // result of a concatenation involving at least one user-defined-string- // literal, all the participating user-defined-string-literals shall // have the same ud-suffix. if (UDSuffixBuf != UDSuffix) { if (Diags) { SourceLocation TokLoc = StringToks[i].getLocation(); Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix) << UDSuffixBuf << UDSuffix << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc) << SourceRange(TokLoc, TokLoc); } hadError = true; } } } // Strip the end quote. --ThisTokEnd; // TODO: Input character set mapping support. // Skip marker for wide or unicode strings. if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') { ++ThisTokBuf; // Skip 8 of u8 marker for utf8 strings. if (ThisTokBuf[0] == '8') ++ThisTokBuf; } // Check for raw string if (ThisTokBuf[0] == 'R') { ThisTokBuf += 2; // skip R" const char *Prefix = ThisTokBuf; while (ThisTokBuf[0] != '(') ++ThisTokBuf; ++ThisTokBuf; // skip '(' // Remove same number of characters from the end ThisTokEnd -= ThisTokBuf - Prefix; assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal"); // C++14 [lex.string]p4: A source-file new-line in a raw string literal // results in a new-line in the resulting execution string-literal. StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf); while (!RemainingTokenSpan.empty()) { // Split the string literal on \r\n boundaries. size_t CRLFPos = RemainingTokenSpan.find("\r\n"); StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos); StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos); // Copy everything before the \r\n sequence into the string literal. if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF)) hadError = true; // Point into the \n inside the \r\n sequence and operate on the // remaining portion of the literal. RemainingTokenSpan = AfterCRLF.substr(1); } } else { if (ThisTokBuf[0] != '"') { // The file may have come from PCH and then changed after loading the // PCH; Fail gracefully. return DiagnoseLexingError(StringToks[i].getLocation()); } ++ThisTokBuf; // skip " // Check if this is a pascal string if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { // If the \p sequence is found in the first token, we have a pascal string // Otherwise, if we already have a pascal string, ignore the first \p if (i == 0) { ++ThisTokBuf; Pascal = true; } else if (Pascal) ThisTokBuf += 2; } while (ThisTokBuf != ThisTokEnd) { // Is this a span of non-escape characters? if (ThisTokBuf[0] != '\\') { const char *InStart = ThisTokBuf; do { ++ThisTokBuf; } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); // Copy the character span over. if (CopyStringFragment(StringToks[i], ThisTokBegin, StringRef(InStart, ThisTokBuf - InStart))) hadError = true; continue; } // Is this a Universal Character Name escape? if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr, hadError, FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth, Diags, Features); continue; } // Otherwise, this is a non-UCN escape character. Process it. unsigned ResultChar = ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth*8, Diags, Features); if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. llvm::UTF32 *ResultWidePtr = reinterpret_cast(ResultPtr); *ResultWidePtr = ResultChar; ResultPtr += 4; } else if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. llvm::UTF16 *ResultWidePtr = reinterpret_cast(ResultPtr); *ResultWidePtr = ResultChar & 0xFFFF; ResultPtr += 2; } else { assert(CharByteWidth == 1 && "Unexpected char width"); *ResultPtr++ = ResultChar & 0xFF; } } } } if (Pascal) { if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. llvm::UTF32 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); ResultWidePtr[0] = GetNumStringChars() - 1; } else if (CharByteWidth == 2) { // FIXME: Make the type of the result buffer correct instead of // using reinterpret_cast. llvm::UTF16 *ResultWidePtr = reinterpret_cast(ResultBuf.data()); ResultWidePtr[0] = GetNumStringChars() - 1; } else { assert(CharByteWidth == 1 && "Unexpected char width"); ResultBuf[0] = GetNumStringChars() - 1; } // Verify that pascal strings aren't too large. if (GetStringLength() > 256) { if (Diags) Diags->Report(StringToks.front().getLocation(), diag::err_pascal_string_too_long) << SourceRange(StringToks.front().getLocation(), StringToks.back().getLocation()); hadError = true; return; } } else if (Diags) { // Complain if this string literal has too many characters. unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509; if (GetNumStringChars() > MaxChars) Diags->Report(StringToks.front().getLocation(), diag::ext_string_too_long) << GetNumStringChars() << MaxChars << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0) << SourceRange(StringToks.front().getLocation(), StringToks.back().getLocation()); } } static const char *resyncUTF8(const char *Err, const char *End) { if (Err == End) return End; End = Err + std::min(llvm::getNumBytesForUTF8(*Err), End-Err); while (++Err != End && (*Err & 0xC0) == 0x80) ; return Err; } /// This function copies from Fragment, which is a sequence of bytes /// within Tok's contents (which begin at TokBegin) into ResultPtr. /// Performs widening for multi-byte characters. bool StringLiteralParser::CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment) { const llvm::UTF8 *ErrorPtrTmp; if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) return false; // If we see bad encoding for unprefixed string literals, warn and // simply copy the byte values, for compatibility with gcc and older // versions of clang. bool NoErrorOnBadEncoding = isAscii(); if (NoErrorOnBadEncoding) { memcpy(ResultPtr, Fragment.data(), Fragment.size()); ResultPtr += Fragment.size(); } if (Diags) { const char *ErrorPtr = reinterpret_cast(ErrorPtrTmp); FullSourceLoc SourceLoc(Tok.getLocation(), SM); const DiagnosticBuilder &Builder = Diag(Diags, Features, SourceLoc, TokBegin, ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()), NoErrorOnBadEncoding ? diag::warn_bad_string_encoding : diag::err_bad_string_encoding); const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end()); StringRef NextFragment(NextStart, Fragment.end()-NextStart); // Decode into a dummy buffer. SmallString<512> Dummy; Dummy.reserve(Fragment.size() * CharByteWidth); char *Ptr = Dummy.data(); while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) { const char *ErrorPtr = reinterpret_cast(ErrorPtrTmp); NextStart = resyncUTF8(ErrorPtr, Fragment.end()); Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin, ErrorPtr, NextStart); NextFragment = StringRef(NextStart, Fragment.end()-NextStart); } } return !NoErrorOnBadEncoding; } void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) { hadError = true; if (Diags) Diags->Report(Loc, diag::err_lexing_string); } /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. This handles /// advancing over escape sequences in the string. unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, unsigned ByteNo) const { // Get the spelling of the token. SmallString<32> SpellingBuffer; SpellingBuffer.resize(Tok.getLength()); bool StringInvalid = false; const char *SpellingPtr = &SpellingBuffer[0]; unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features, &StringInvalid); if (StringInvalid) return 0; const char *SpellingStart = SpellingPtr; const char *SpellingEnd = SpellingPtr+TokLen; // Handle UTF-8 strings just like narrow strings. if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8') SpellingPtr += 2; assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' && SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet"); // For raw string literals, this is easy. if (SpellingPtr[0] == 'R') { assert(SpellingPtr[1] == '"' && "Should be a raw string literal!"); // Skip 'R"'. SpellingPtr += 2; while (*SpellingPtr != '(') { ++SpellingPtr; assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal"); } // Skip '('. ++SpellingPtr; return SpellingPtr - SpellingStart + ByteNo; } // Skip over the leading quote assert(SpellingPtr[0] == '"' && "Should be a string literal!"); ++SpellingPtr; // Skip over bytes until we find the offset we're looking for. while (ByteNo) { assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!"); // Step over non-escapes simply. if (*SpellingPtr != '\\') { ++SpellingPtr; --ByteNo; continue; } // Otherwise, this is an escape character. Advance over it. bool HadError = false; if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') { const char *EscapePtr = SpellingPtr; unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd, 1, Features, HadError); if (Len > ByteNo) { // ByteNo is somewhere within the escape sequence. SpellingPtr = EscapePtr; break; } ByteNo -= Len; } else { ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), CharByteWidth*8, Diags, Features); --ByteNo; } assert(!HadError && "This method isn't valid on erroneous strings"); } return SpellingPtr-SpellingStart; } /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved /// suffixes as ud-suffixes, because the diagnostic experience is better if we /// treat it as an invalid suffix. bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix) { return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) || Suffix == "sv"; }