123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497 |
- #include <algorithm>
- #include <iomanip>
- #include <sstream>
- #include "emitterutils.h"
- #include "exp.h"
- #include "indentation.h"
- #include "regex_yaml.h"
- #include "regeximpl.h"
- #include "stringsource.h"
- #include "yaml-cpp/binary.h" // IWYU pragma: keep
- #include "yaml-cpp/null.h"
- #include "yaml-cpp/ostream_wrapper.h"
- namespace YAML {
- namespace Utils {
- namespace {
// U+FFFD: the code point substituted whenever a byte sequence cannot be
// decoded into a legal code point.
constexpr int REPLACEMENT_CHARACTER = 0xFFFD;
// Reports whether the code point `ch` may appear in an anchor/alias name
// (the YAML ns-anchor-char production): a printable, non-whitespace character
// that is neither a flow indicator, a line break, nor the byte order mark.
//
// Returns false for negative values and for values above U+10FFFF.
bool IsAnchorChar(int ch) {
  switch (ch) {
    case ',':
    case '[':
    case ']':
    case '{':
    case '}':  // c-flow-indicator
    case ' ':
    case '\t':    // s-white
    case 0xFEFF:  // c-byte-order-mark
    case 0xA:
    case 0xD:  // b-char
      return false;
    case 0x85:  // NEL sits inside the C1 range rejected below, but is allowed
      return true;
  }

  // C0 control characters (and anything negative).
  if (ch < 0x20) {
    return false;
  }

  // Printable ASCII is U+0020..U+007E inclusive, so '~' must be accepted.
  if (ch <= 0x7E) {
    return true;
  }

  // DEL and the C1 control characters.
  if (ch < 0xA0) {
    return false;
  }

  // UTF-16 surrogate halves are not characters.
  if (ch >= 0xD800 && ch <= 0xDFFF) {
    return false;
  }

  // Non-characters: the last two code points of every plane, and
  // U+FDD0..U+FDEF.
  if ((ch & 0xFFFE) == 0xFFFE) {
    return false;
  }
  if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) {
    return false;
  }

  // Beyond the Unicode code space.
  if (ch > 0x10FFFF) {
    return false;
  }

  return true;
}
// Returns the length (1-4) of the UTF-8 sequence introduced by the lead byte
// `ch`, or -1 when `ch` cannot start a sequence.
//
// Continuation bytes (10xxxxxx) and the bytes 0xF8..0xFF are rejected. The
// latter are never valid in UTF-8; treating them as 4-byte lead bytes would
// let malformed input decode to an apparently valid code point.
int Utf8BytesIndicated(char ch) {
  // Go through unsigned char so bytes >= 0x80 are not sign-extended.
  const int byteVal = static_cast<unsigned char>(ch);

  if (byteVal < 0x80) {  // 0xxxxxxx
    return 1;
  }
  if ((byteVal & 0xE0) == 0xC0) {  // 110xxxxx
    return 2;
  }
  if ((byteVal & 0xF0) == 0xE0) {  // 1110xxxx
    return 3;
  }
  if ((byteVal & 0xF8) == 0xF0) {  // 11110xxx
    return 4;
  }
  return -1;
}
// True when `ch` has the bit pattern 10xxxxxx, i.e. lies in 0x80..0xBF.
bool IsTrailingByte(char ch) {
  const unsigned char byte = static_cast<unsigned char>(ch);
  return byte >= 0x80 && byte <= 0xBF;
}
- bool GetNextCodePointAndAdvance(int& codePoint,
- std::string::const_iterator& first,
- std::string::const_iterator last) {
- if (first == last)
- return false;
- int nBytes = Utf8BytesIndicated(*first);
- if (nBytes < 1) {
- // Bad lead byte
- ++first;
- codePoint = REPLACEMENT_CHARACTER;
- return true;
- }
- if (nBytes == 1) {
- codePoint = *first++;
- return true;
- }
- // Gather bits from trailing bytes
- codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes));
- ++first;
- --nBytes;
- for (; nBytes > 0; ++first, --nBytes) {
- if ((first == last) || !IsTrailingByte(*first)) {
- codePoint = REPLACEMENT_CHARACTER;
- break;
- }
- codePoint <<= 6;
- codePoint |= *first & 0x3F;
- }
- // Check for illegal code points
- if (codePoint > 0x10FFFF)
- codePoint = REPLACEMENT_CHARACTER;
- else if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
- codePoint = REPLACEMENT_CHARACTER;
- else if ((codePoint & 0xFFFE) == 0xFFFE)
- codePoint = REPLACEMENT_CHARACTER;
- else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF)
- codePoint = REPLACEMENT_CHARACTER;
- return true;
- }
- void WriteCodePoint(ostream_wrapper& out, int codePoint) {
- if (codePoint < 0 || codePoint > 0x10FFFF) {
- codePoint = REPLACEMENT_CHARACTER;
- }
- if (codePoint <= 0x7F) {
- out << static_cast<char>(codePoint);
- } else if (codePoint <= 0x7FF) {
- out << static_cast<char>(0xC0 | (codePoint >> 6))
- << static_cast<char>(0x80 | (codePoint & 0x3F));
- } else if (codePoint <= 0xFFFF) {
- out << static_cast<char>(0xE0 | (codePoint >> 12))
- << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
- << static_cast<char>(0x80 | (codePoint & 0x3F));
- } else {
- out << static_cast<char>(0xF0 | (codePoint >> 18))
- << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
- << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
- << static_cast<char>(0x80 | (codePoint & 0x3F));
- }
- }
- bool IsValidPlainScalar(const std::string& str, FlowType::value flowType,
- bool allowOnlyAscii) {
- // check against null
- if (IsNullString(str)) {
- return false;
- }
- // check the start
- const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow()
- : Exp::PlainScalar());
- if (!start.Matches(str)) {
- return false;
- }
- // and check the end for plain whitespace (which can't be faithfully kept in a
- // plain scalar)
- if (!str.empty() && *str.rbegin() == ' ') {
- return false;
- }
- // then check until something is disallowed
- static const RegEx& disallowed_flow =
- Exp::EndScalarInFlow() | (Exp::BlankOrBreak() + Exp::Comment()) |
- Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
- Exp::Tab() | Exp::Ampersand();
- static const RegEx& disallowed_block =
- Exp::EndScalar() | (Exp::BlankOrBreak() + Exp::Comment()) |
- Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
- Exp::Tab() | Exp::Ampersand();
- const RegEx& disallowed =
- flowType == FlowType::Flow ? disallowed_flow : disallowed_block;
- StringCharSource buffer(str.c_str(), str.size());
- while (buffer) {
- if (disallowed.Matches(buffer)) {
- return false;
- }
- if (allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) {
- return false;
- }
- ++buffer;
- }
- return true;
- }
// Decides whether `str` can be emitted as a single-quoted scalar: it must
// contain no newline and, when `escapeNonAscii` is set, no byte >= 0x80.
bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) {
  // TODO: check for non-printable characters?
  for (const char ch : str) {
    if (ch == '\n') {
      return false;
    }
    if (escapeNonAscii && static_cast<unsigned char>(ch) >= 0x80) {
      return false;
    }
  }
  return true;
}
- bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
- bool escapeNonAscii) {
- if (flowType == FlowType::Flow) {
- return false;
- }
- // TODO: check for non-printable characters?
- return std::none_of(str.begin(), str.end(), [=](char ch) {
- return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch)));
- });
- }
// Splits a supplementary-plane code point (U+10000..U+10FFFF) into its UTF-16
// surrogate pair, returned as {lead, trail}.
//
// The caller is expected to pass a value in that range; other inputs do not
// produce a meaningful pair.
std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
  // 0xD800 - (0x10000 >> 10) folds the "subtract 0x10000" step into the lead
  // base. The offset has low bits set (0xD7C0), so the high ten bits must be
  // ADDED to it; OR-ing them in would lose the carry.
  const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);
  const uint32_t cp = static_cast<uint32_t>(codePoint);

  return {
      static_cast<uint16_t>(leadOffset + (cp >> 10)),
      // Subtracting 0x10000 leaves the low ten bits untouched, and 0xDC00 has
      // those bits clear, so OR is exact here.
      static_cast<uint16_t>(0xDC00 | (cp & 0x3FF)),
  };
}
- void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
- static const char hexDigits[] = "0123456789abcdef";
- out << "\\";
- int digits = 8;
- if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
- out << "x";
- digits = 2;
- } else if (codePoint < 0xFFFF) {
- out << "u";
- digits = 4;
- } else if (stringEscapingStyle != StringEscaping::JSON) {
- out << "U";
- digits = 8;
- } else {
- auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
- WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
- WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
- return;
- }
- // Write digits into the escape sequence
- for (; digits > 0; --digits)
- out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
- }
- bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
- int codePoint;
- for (std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());) {
- if (!IsAnchorChar(codePoint)) {
- return false;
- }
- WriteCodePoint(out, codePoint);
- }
- return true;
- }
- } // namespace
- StringFormat::value ComputeStringFormat(const std::string& str,
- EMITTER_MANIP strFormat,
- FlowType::value flowType,
- bool escapeNonAscii) {
- switch (strFormat) {
- case Auto:
- if (IsValidPlainScalar(str, flowType, escapeNonAscii)) {
- return StringFormat::Plain;
- }
- return StringFormat::DoubleQuoted;
- case SingleQuoted:
- if (IsValidSingleQuotedScalar(str, escapeNonAscii)) {
- return StringFormat::SingleQuoted;
- }
- return StringFormat::DoubleQuoted;
- case DoubleQuoted:
- return StringFormat::DoubleQuoted;
- case Literal:
- if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) {
- return StringFormat::Literal;
- }
- return StringFormat::DoubleQuoted;
- default:
- break;
- }
- return StringFormat::DoubleQuoted;
- }
- bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
- out << "'";
- int codePoint;
- for (std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());) {
- if (codePoint == '\n') {
- return false; // We can't handle a new line and the attendant indentation
- // yet
- }
- if (codePoint == '\'') {
- out << "''";
- } else {
- WriteCodePoint(out, codePoint);
- }
- }
- out << "'";
- return true;
- }
- bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
- StringEscaping::value stringEscaping) {
- out << "\"";
- int codePoint;
- for (std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());) {
- switch (codePoint) {
- case '\"':
- out << "\\\"";
- break;
- case '\\':
- out << "\\\\";
- break;
- case '\n':
- out << "\\n";
- break;
- case '\t':
- out << "\\t";
- break;
- case '\r':
- out << "\\r";
- break;
- case '\b':
- out << "\\b";
- break;
- case '\f':
- out << "\\f";
- break;
- default:
- if (codePoint < 0x20 ||
- (codePoint >= 0x80 &&
- codePoint <= 0xA0)) { // Control characters and non-breaking space
- WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
- } else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
- // escaped (YAML 1.2, sec. 5.2)
- WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
- } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
- WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
- } else {
- WriteCodePoint(out, codePoint);
- }
- }
- }
- out << "\"";
- return true;
- }
- bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
- std::size_t indent) {
- out << "|\n";
- int codePoint;
- for (std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());) {
- if (codePoint == '\n') {
- out << "\n";
- } else {
- out<< IndentTo(indent);
- WriteCodePoint(out, codePoint);
- }
- }
- return true;
- }
- bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
- if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
- out << ch;
- } else if (ch == '\"') {
- out << R"("\"")";
- } else if (ch == '\t') {
- out << R"("\t")";
- } else if (ch == '\n') {
- out << R"("\n")";
- } else if (ch == '\b') {
- out << R"("\b")";
- } else if (ch == '\r') {
- out << R"("\r")";
- } else if (ch == '\f') {
- out << R"("\f")";
- } else if (ch == '\\') {
- out << R"("\\")";
- } else if (0x20 <= ch && ch <= 0x7e) {
- out << "\"" << ch << "\"";
- } else {
- out << "\"";
- WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
- out << "\"";
- }
- return true;
- }
- bool WriteComment(ostream_wrapper& out, const std::string& str,
- std::size_t postCommentIndent) {
- const std::size_t curIndent = out.col();
- out << "#" << Indentation(postCommentIndent);
- out.set_comment();
- int codePoint;
- for (std::string::const_iterator i = str.begin();
- GetNextCodePointAndAdvance(codePoint, i, str.end());) {
- if (codePoint == '\n') {
- out << "\n"
- << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
- out.set_comment();
- } else {
- WriteCodePoint(out, codePoint);
- }
- }
- return true;
- }
- bool WriteAlias(ostream_wrapper& out, const std::string& str) {
- out << "*";
- return WriteAliasName(out, str);
- }
- bool WriteAnchor(ostream_wrapper& out, const std::string& str) {
- out << "&";
- return WriteAliasName(out, str);
- }
- bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) {
- out << (verbatim ? "!<" : "!");
- StringCharSource buffer(str.c_str(), str.size());
- const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
- while (buffer) {
- int n = reValid.Match(buffer);
- if (n <= 0) {
- return false;
- }
- while (--n >= 0) {
- out << buffer[0];
- ++buffer;
- }
- }
- if (verbatim) {
- out << ">";
- }
- return true;
- }
- bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
- const std::string& tag) {
- out << "!";
- StringCharSource prefixBuffer(prefix.c_str(), prefix.size());
- while (prefixBuffer) {
- int n = Exp::URI().Match(prefixBuffer);
- if (n <= 0) {
- return false;
- }
- while (--n >= 0) {
- out << prefixBuffer[0];
- ++prefixBuffer;
- }
- }
- out << "!";
- StringCharSource tagBuffer(tag.c_str(), tag.size());
- while (tagBuffer) {
- int n = Exp::Tag().Match(tagBuffer);
- if (n <= 0) {
- return false;
- }
- while (--n >= 0) {
- out << tagBuffer[0];
- ++tagBuffer;
- }
- }
- return true;
- }
- bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
- WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
- StringEscaping::None);
- return true;
- }
- } // namespace Utils
- } // namespace YAML
|