emitterutils.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. #include <algorithm>
  2. #include <iomanip>
  3. #include <sstream>
  4. #include "emitterutils.h"
  5. #include "exp.h"
  6. #include "indentation.h"
  7. #include "regex_yaml.h"
  8. #include "regeximpl.h"
  9. #include "stringsource.h"
  10. #include "yaml-cpp/binary.h" // IWYU pragma: keep
  11. #include "yaml-cpp/null.h"
  12. #include "yaml-cpp/ostream_wrapper.h"
  13. namespace YAML {
  14. namespace Utils {
  15. namespace {
  // U+FFFD, the code point substituted whenever input cannot be decoded or is
  // not a legal Unicode scalar value.
  constexpr int REPLACEMENT_CHARACTER = 0xFFFD;
  // Tests whether the code point `ch` is an ns-anchor-char, i.e. whether it
  // may appear inside an anchor or alias name.
  //
  // Rejected: flow indicators, white space, line breaks, the byte order mark,
  // control characters (except NEL, 0x85, which is accepted), DEL, the
  // surrogate range, noncharacters and anything above U+10FFFF.
  bool IsAnchorChar(int ch) {
    switch (ch) {
      case ',':
      case '[':
      case ']':
      case '{':
      case '}':     // c-flow-indicator
      case ' ':
      case '\t':    // s-white
      case 0xFEFF:  // c-byte-order-mark
      case 0xA:
      case 0xD:     // b-char
        return false;
      case 0x85:    // NEL is the one accepted code point in 0x7F..0x9F
        return true;
    }

    // C0 control characters (and any negative value).
    if (ch < 0x20) {
      return false;
    }

    // Printable ASCII runs through 0x7E ('~') inclusive; the comparison must
    // be <= so that '~' is accepted and only DEL (0x7F) falls through.
    if (ch <= 0x7E) {
      return true;
    }

    // DEL and the C1 control block.
    if (ch < 0xA0) {
      return false;
    }

    // UTF-16 surrogates are not scalar values.
    if (ch >= 0xD800 && ch <= 0xDFFF) {
      return false;
    }

    // Code points whose low 16 bits are 0xFFFE or 0xFFFF.
    if ((ch & 0xFFFE) == 0xFFFE) {
      return false;
    }

    // The 0xFDD0..0xFDEF block.
    if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) {
      return false;
    }

    // Beyond the Unicode code space.
    if (ch > 0x10FFFF) {
      return false;
    }

    return true;
  }
  // Returns the length (1..4) of the UTF-8 sequence announced by the lead byte
  // `ch`, or -1 when `ch` cannot start a sequence.
  //
  // Bytes of the form 10xxxxxx (continuation bytes) and 11111xxx (0xF8..0xFF)
  // yield -1. The latter must not be treated as four-byte leads: doing so
  // would let e.g. 0xFC 0x80 0x80 0x80 decode to the same code point as the
  // well-formed 0xF4 0x80 0x80 0x80.
  int Utf8BytesIndicated(char ch) {
    const unsigned int lead = static_cast<unsigned char>(ch);

    if (lead < 0x80) {  // 0xxxxxxx
      return 1;
    }
    if ((lead & 0xE0) == 0xC0) {  // 110xxxxx
      return 2;
    }
    if ((lead & 0xF0) == 0xE0) {  // 1110xxxx
      return 3;
    }
    if ((lead & 0xF8) == 0xF0) {  // 11110xxx
      return 4;
    }
    return -1;  // 10xxxxxx or 11111xxx
  }
  // Returns true when `ch` has the UTF-8 continuation-byte shape 10xxxxxx.
  bool IsTrailingByte(char ch) {
    const unsigned char byte = static_cast<unsigned char>(ch);
    return (byte >> 6) == 0x2;
  }
  80. bool GetNextCodePointAndAdvance(int& codePoint,
  81. std::string::const_iterator& first,
  82. std::string::const_iterator last) {
  83. if (first == last)
  84. return false;
  85. int nBytes = Utf8BytesIndicated(*first);
  86. if (nBytes < 1) {
  87. // Bad lead byte
  88. ++first;
  89. codePoint = REPLACEMENT_CHARACTER;
  90. return true;
  91. }
  92. if (nBytes == 1) {
  93. codePoint = *first++;
  94. return true;
  95. }
  96. // Gather bits from trailing bytes
  97. codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes));
  98. ++first;
  99. --nBytes;
  100. for (; nBytes > 0; ++first, --nBytes) {
  101. if ((first == last) || !IsTrailingByte(*first)) {
  102. codePoint = REPLACEMENT_CHARACTER;
  103. break;
  104. }
  105. codePoint <<= 6;
  106. codePoint |= *first & 0x3F;
  107. }
  108. // Check for illegal code points
  109. if (codePoint > 0x10FFFF)
  110. codePoint = REPLACEMENT_CHARACTER;
  111. else if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
  112. codePoint = REPLACEMENT_CHARACTER;
  113. else if ((codePoint & 0xFFFE) == 0xFFFE)
  114. codePoint = REPLACEMENT_CHARACTER;
  115. else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF)
  116. codePoint = REPLACEMENT_CHARACTER;
  117. return true;
  118. }
  119. void WriteCodePoint(ostream_wrapper& out, int codePoint) {
  120. if (codePoint < 0 || codePoint > 0x10FFFF) {
  121. codePoint = REPLACEMENT_CHARACTER;
  122. }
  123. if (codePoint <= 0x7F) {
  124. out << static_cast<char>(codePoint);
  125. } else if (codePoint <= 0x7FF) {
  126. out << static_cast<char>(0xC0 | (codePoint >> 6))
  127. << static_cast<char>(0x80 | (codePoint & 0x3F));
  128. } else if (codePoint <= 0xFFFF) {
  129. out << static_cast<char>(0xE0 | (codePoint >> 12))
  130. << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
  131. << static_cast<char>(0x80 | (codePoint & 0x3F));
  132. } else {
  133. out << static_cast<char>(0xF0 | (codePoint >> 18))
  134. << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
  135. << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
  136. << static_cast<char>(0x80 | (codePoint & 0x3F));
  137. }
  138. }
  139. bool IsValidPlainScalar(const std::string& str, FlowType::value flowType,
  140. bool allowOnlyAscii) {
  141. // check against null
  142. if (IsNullString(str)) {
  143. return false;
  144. }
  145. // check the start
  146. const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow()
  147. : Exp::PlainScalar());
  148. if (!start.Matches(str)) {
  149. return false;
  150. }
  151. // and check the end for plain whitespace (which can't be faithfully kept in a
  152. // plain scalar)
  153. if (!str.empty() && *str.rbegin() == ' ') {
  154. return false;
  155. }
  156. // then check until something is disallowed
  157. static const RegEx& disallowed_flow =
  158. Exp::EndScalarInFlow() | (Exp::BlankOrBreak() + Exp::Comment()) |
  159. Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
  160. Exp::Tab() | Exp::Ampersand();
  161. static const RegEx& disallowed_block =
  162. Exp::EndScalar() | (Exp::BlankOrBreak() + Exp::Comment()) |
  163. Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
  164. Exp::Tab() | Exp::Ampersand();
  165. const RegEx& disallowed =
  166. flowType == FlowType::Flow ? disallowed_flow : disallowed_block;
  167. StringCharSource buffer(str.c_str(), str.size());
  168. while (buffer) {
  169. if (disallowed.Matches(buffer)) {
  170. return false;
  171. }
  172. if (allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) {
  173. return false;
  174. }
  175. ++buffer;
  176. }
  177. return true;
  178. }
  // Decides whether `str` can be emitted as a single-quoted scalar: it must
  // contain no line feed and, when `escapeNonAscii` is set, no byte >= 0x80.
  bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) {
    // TODO: check for non-printable characters?
    for (const char ch : str) {
      if (ch == '\n') {
        return false;
      }
      if (escapeNonAscii && static_cast<unsigned char>(ch) >= 0x80) {
        return false;
      }
    }
    return true;
  }
  186. bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
  187. bool escapeNonAscii) {
  188. if (flowType == FlowType::Flow) {
  189. return false;
  190. }
  191. // TODO: check for non-printable characters?
  192. return std::none_of(str.begin(), str.end(), [=](char ch) {
  193. return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch)));
  194. });
  195. }
  // Splits a supplementary-plane code point into its UTF-16 surrogate pair.
  //
  // `codePoint` is expected to lie in 0x10000..0x10FFFF. Returns
  // {lead, trail} with lead in 0xD800..0xDBFF and trail in 0xDC00..0xDFFF.
  std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
    // The 20-bit value shared between the two surrogates.
    const uint32_t offset = static_cast<uint32_t>(codePoint) - 0x10000u;

    // The halves must be ADDED to the surrogate bases. OR-ing the high half
    // into a pre-biased base (0xD7C0) loses bits that overlap that base.
    const uint16_t lead = static_cast<uint16_t>(0xD800u + (offset >> 10));
    const uint16_t trail = static_cast<uint16_t>(0xDC00u + (offset & 0x3FFu));
    return {lead, trail};
  }
  203. void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
  204. static const char hexDigits[] = "0123456789abcdef";
  205. out << "\\";
  206. int digits = 8;
  207. if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
  208. out << "x";
  209. digits = 2;
  210. } else if (codePoint < 0xFFFF) {
  211. out << "u";
  212. digits = 4;
  213. } else if (stringEscapingStyle != StringEscaping::JSON) {
  214. out << "U";
  215. digits = 8;
  216. } else {
  217. auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
  218. WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
  219. WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
  220. return;
  221. }
  222. // Write digits into the escape sequence
  223. for (; digits > 0; --digits)
  224. out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
  225. }
  226. bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
  227. int codePoint;
  228. for (std::string::const_iterator i = str.begin();
  229. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  230. if (!IsAnchorChar(codePoint)) {
  231. return false;
  232. }
  233. WriteCodePoint(out, codePoint);
  234. }
  235. return true;
  236. }
  237. } // namespace
  238. StringFormat::value ComputeStringFormat(const std::string& str,
  239. EMITTER_MANIP strFormat,
  240. FlowType::value flowType,
  241. bool escapeNonAscii) {
  242. switch (strFormat) {
  243. case Auto:
  244. if (IsValidPlainScalar(str, flowType, escapeNonAscii)) {
  245. return StringFormat::Plain;
  246. }
  247. return StringFormat::DoubleQuoted;
  248. case SingleQuoted:
  249. if (IsValidSingleQuotedScalar(str, escapeNonAscii)) {
  250. return StringFormat::SingleQuoted;
  251. }
  252. return StringFormat::DoubleQuoted;
  253. case DoubleQuoted:
  254. return StringFormat::DoubleQuoted;
  255. case Literal:
  256. if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) {
  257. return StringFormat::Literal;
  258. }
  259. return StringFormat::DoubleQuoted;
  260. default:
  261. break;
  262. }
  263. return StringFormat::DoubleQuoted;
  264. }
  265. bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
  266. out << "'";
  267. int codePoint;
  268. for (std::string::const_iterator i = str.begin();
  269. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  270. if (codePoint == '\n') {
  271. return false; // We can't handle a new line and the attendant indentation
  272. // yet
  273. }
  274. if (codePoint == '\'') {
  275. out << "''";
  276. } else {
  277. WriteCodePoint(out, codePoint);
  278. }
  279. }
  280. out << "'";
  281. return true;
  282. }
  283. bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
  284. StringEscaping::value stringEscaping) {
  285. out << "\"";
  286. int codePoint;
  287. for (std::string::const_iterator i = str.begin();
  288. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  289. switch (codePoint) {
  290. case '\"':
  291. out << "\\\"";
  292. break;
  293. case '\\':
  294. out << "\\\\";
  295. break;
  296. case '\n':
  297. out << "\\n";
  298. break;
  299. case '\t':
  300. out << "\\t";
  301. break;
  302. case '\r':
  303. out << "\\r";
  304. break;
  305. case '\b':
  306. out << "\\b";
  307. break;
  308. case '\f':
  309. out << "\\f";
  310. break;
  311. default:
  312. if (codePoint < 0x20 ||
  313. (codePoint >= 0x80 &&
  314. codePoint <= 0xA0)) { // Control characters and non-breaking space
  315. WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
  316. } else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
  317. // escaped (YAML 1.2, sec. 5.2)
  318. WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
  319. } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
  320. WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
  321. } else {
  322. WriteCodePoint(out, codePoint);
  323. }
  324. }
  325. }
  326. out << "\"";
  327. return true;
  328. }
  329. bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
  330. std::size_t indent) {
  331. out << "|\n";
  332. int codePoint;
  333. for (std::string::const_iterator i = str.begin();
  334. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  335. if (codePoint == '\n') {
  336. out << "\n";
  337. } else {
  338. out<< IndentTo(indent);
  339. WriteCodePoint(out, codePoint);
  340. }
  341. }
  342. return true;
  343. }
  344. bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
  345. if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
  346. out << ch;
  347. } else if (ch == '\"') {
  348. out << R"("\"")";
  349. } else if (ch == '\t') {
  350. out << R"("\t")";
  351. } else if (ch == '\n') {
  352. out << R"("\n")";
  353. } else if (ch == '\b') {
  354. out << R"("\b")";
  355. } else if (ch == '\r') {
  356. out << R"("\r")";
  357. } else if (ch == '\f') {
  358. out << R"("\f")";
  359. } else if (ch == '\\') {
  360. out << R"("\\")";
  361. } else if (0x20 <= ch && ch <= 0x7e) {
  362. out << "\"" << ch << "\"";
  363. } else {
  364. out << "\"";
  365. WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
  366. out << "\"";
  367. }
  368. return true;
  369. }
  370. bool WriteComment(ostream_wrapper& out, const std::string& str,
  371. std::size_t postCommentIndent) {
  372. const std::size_t curIndent = out.col();
  373. out << "#" << Indentation(postCommentIndent);
  374. out.set_comment();
  375. int codePoint;
  376. for (std::string::const_iterator i = str.begin();
  377. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  378. if (codePoint == '\n') {
  379. out << "\n"
  380. << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
  381. out.set_comment();
  382. } else {
  383. WriteCodePoint(out, codePoint);
  384. }
  385. }
  386. return true;
  387. }
  388. bool WriteAlias(ostream_wrapper& out, const std::string& str) {
  389. out << "*";
  390. return WriteAliasName(out, str);
  391. }
  392. bool WriteAnchor(ostream_wrapper& out, const std::string& str) {
  393. out << "&";
  394. return WriteAliasName(out, str);
  395. }
  396. bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) {
  397. out << (verbatim ? "!<" : "!");
  398. StringCharSource buffer(str.c_str(), str.size());
  399. const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
  400. while (buffer) {
  401. int n = reValid.Match(buffer);
  402. if (n <= 0) {
  403. return false;
  404. }
  405. while (--n >= 0) {
  406. out << buffer[0];
  407. ++buffer;
  408. }
  409. }
  410. if (verbatim) {
  411. out << ">";
  412. }
  413. return true;
  414. }
  415. bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
  416. const std::string& tag) {
  417. out << "!";
  418. StringCharSource prefixBuffer(prefix.c_str(), prefix.size());
  419. while (prefixBuffer) {
  420. int n = Exp::URI().Match(prefixBuffer);
  421. if (n <= 0) {
  422. return false;
  423. }
  424. while (--n >= 0) {
  425. out << prefixBuffer[0];
  426. ++prefixBuffer;
  427. }
  428. }
  429. out << "!";
  430. StringCharSource tagBuffer(tag.c_str(), tag.size());
  431. while (tagBuffer) {
  432. int n = Exp::Tag().Match(tagBuffer);
  433. if (n <= 0) {
  434. return false;
  435. }
  436. while (--n >= 0) {
  437. out << tagBuffer[0];
  438. ++tagBuffer;
  439. }
  440. }
  441. return true;
  442. }
  443. bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
  444. WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
  445. StringEscaping::None);
  446. return true;
  447. }
  448. } // namespace Utils
  449. } // namespace YAML