// emitterutils.cpp
  1. #include <iomanip>
  2. #include <sstream>
  3. #include "emitterutils.h"
  4. #include "exp.h"
  5. #include "indentation.h"
  6. #include "regex_yaml.h"
  7. #include "regeximpl.h"
  8. #include "stringsource.h"
  9. #include "yaml-cpp/binary.h" // IWYU pragma: keep
  10. #include "yaml-cpp/ostream_wrapper.h"
  11. #include "yaml-cpp/null.h"
  12. namespace YAML {
  13. namespace Utils {
  14. namespace {
  // Code point (U+FFFD) substituted wherever the UTF-8 helpers in this file
  // encounter a malformed sequence or an illegal code point.
  enum {
    REPLACEMENT_CHARACTER = 0xFFFD
  };
  // Tests whether a code point may appear in an anchor/alias name
  // (ns-anchor-char): a printable, non-whitespace character that is not a
  // flow indicator.
  //
  // ch: the Unicode code point to classify.
  // Returns true when the code point is allowed in an anchor name.
  bool IsAnchorChar(int ch) {
    switch (ch) {
      case ',':
      case '[':
      case ']':
      case '{':
      case '}':  // c-flow-indicator
      case ' ':
      case '\t':  // s-white
      case 0xFEFF:  // c-byte-order-mark
      case 0xA:
      case 0xD:  // b-char
        return false;
      case 0x85:  // the one code point in 0x7F..0x9F that is accepted
        return true;
    }

    // C0 control characters (and any negative value).
    if (ch < 0x20) {
      return false;
    }

    // Printable ASCII runs through 0x7E inclusive, so '~' is a legal anchor
    // character; only DEL (0x7F) and above fall through.
    if (ch <= 0x7E) {
      return true;
    }

    // DEL and the C1 control range (0x85 was accepted above).
    if (ch < 0xA0) {
      return false;
    }

    // UTF-16 surrogate halves are not characters.
    if (ch >= 0xD800 && ch <= 0xDFFF) {
      return false;
    }

    // Noncharacters U+nFFFE / U+nFFFF on every plane.
    if ((ch & 0xFFFE) == 0xFFFE) {
      return false;
    }

    // Noncharacter block U+FDD0..U+FDEF.
    if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) {
      return false;
    }

    // Beyond the Unicode code space.
    if (ch > 0x10FFFF) {
      return false;
    }

    return true;
  }
  // Reports how many bytes a UTF-8 sequence occupies, judging only by its
  // lead byte.
  //
  // ch: the first byte of a candidate UTF-8 sequence.
  // Returns 1-4 for a valid lead byte, or -1 when the byte cannot start a
  // sequence: a trailing byte (10xxxxxx) or 0xF8..0xFF, which UTF-8 never
  // uses. The 0xF8..0xFF range was previously reported as a 4-byte lead.
  int Utf8BytesIndicated(char ch) {
    const int byteVal = static_cast<unsigned char>(ch);
    if (byteVal < 0x80) {
      return 1;  // 0xxxxxxx: ASCII
    }
    if (byteVal < 0xC0) {
      return -1;  // 10xxxxxx: trailing byte, not a lead
    }
    if (byteVal < 0xE0) {
      return 2;  // 110xxxxx
    }
    if (byteVal < 0xF0) {
      return 3;  // 1110xxxx
    }
    if (byteVal < 0xF8) {
      return 4;  // 11110xxx
    }
    return -1;  // 11111xxx: never valid in UTF-8
  }
  // True when `ch` has the bit pattern 10xxxxxx, i.e. it is a UTF-8
  // continuation byte.
  bool IsTrailingByte(char ch) {
    const unsigned char byte = static_cast<unsigned char>(ch);
    return (byte >> 6) == 0x2;
  }
  79. bool GetNextCodePointAndAdvance(int& codePoint,
  80. std::string::const_iterator& first,
  81. std::string::const_iterator last) {
  82. if (first == last)
  83. return false;
  84. int nBytes = Utf8BytesIndicated(*first);
  85. if (nBytes < 1) {
  86. // Bad lead byte
  87. ++first;
  88. codePoint = REPLACEMENT_CHARACTER;
  89. return true;
  90. }
  91. if (nBytes == 1) {
  92. codePoint = *first++;
  93. return true;
  94. }
  95. // Gather bits from trailing bytes
  96. codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes));
  97. ++first;
  98. --nBytes;
  99. for (; nBytes > 0; ++first, --nBytes) {
  100. if ((first == last) || !IsTrailingByte(*first)) {
  101. codePoint = REPLACEMENT_CHARACTER;
  102. break;
  103. }
  104. codePoint <<= 6;
  105. codePoint |= *first & 0x3F;
  106. }
  107. // Check for illegal code points
  108. if (codePoint > 0x10FFFF)
  109. codePoint = REPLACEMENT_CHARACTER;
  110. else if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
  111. codePoint = REPLACEMENT_CHARACTER;
  112. else if ((codePoint & 0xFFFE) == 0xFFFE)
  113. codePoint = REPLACEMENT_CHARACTER;
  114. else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF)
  115. codePoint = REPLACEMENT_CHARACTER;
  116. return true;
  117. }
  118. void WriteCodePoint(ostream_wrapper& out, int codePoint) {
  119. if (codePoint < 0 || codePoint > 0x10FFFF) {
  120. codePoint = REPLACEMENT_CHARACTER;
  121. }
  122. if (codePoint < 0x7F) {
  123. out << static_cast<char>(codePoint);
  124. } else if (codePoint < 0x7FF) {
  125. out << static_cast<char>(0xC0 | (codePoint >> 6))
  126. << static_cast<char>(0x80 | (codePoint & 0x3F));
  127. } else if (codePoint < 0xFFFF) {
  128. out << static_cast<char>(0xE0 | (codePoint >> 12))
  129. << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
  130. << static_cast<char>(0x80 | (codePoint & 0x3F));
  131. } else {
  132. out << static_cast<char>(0xF0 | (codePoint >> 18))
  133. << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
  134. << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
  135. << static_cast<char>(0x80 | (codePoint & 0x3F));
  136. }
  137. }
  138. bool IsValidPlainScalar(const std::string& str, FlowType::value flowType,
  139. bool allowOnlyAscii) {
  140. // check against null
  141. if (IsNullString(str)) {
  142. return false;
  143. }
  144. // check the start
  145. const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow()
  146. : Exp::PlainScalar());
  147. if (!start.Matches(str)) {
  148. return false;
  149. }
  150. // and check the end for plain whitespace (which can't be faithfully kept in a
  151. // plain scalar)
  152. if (!str.empty() && *str.rbegin() == ' ') {
  153. return false;
  154. }
  155. // then check until something is disallowed
  156. static const RegEx& disallowed_flow =
  157. Exp::EndScalarInFlow() || (Exp::BlankOrBreak() + Exp::Comment()) ||
  158. Exp::NotPrintable() || Exp::Utf8_ByteOrderMark() || Exp::Break() ||
  159. Exp::Tab();
  160. static const RegEx& disallowed_block =
  161. Exp::EndScalar() || (Exp::BlankOrBreak() + Exp::Comment()) ||
  162. Exp::NotPrintable() || Exp::Utf8_ByteOrderMark() || Exp::Break() ||
  163. Exp::Tab();
  164. const RegEx& disallowed =
  165. flowType == FlowType::Flow ? disallowed_flow : disallowed_block;
  166. StringCharSource buffer(str.c_str(), str.size());
  167. while (buffer) {
  168. if (disallowed.Matches(buffer)) {
  169. return false;
  170. }
  171. if (allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) {
  172. return false;
  173. }
  174. ++buffer;
  175. }
  176. return true;
  177. }
  // Decides whether `str` can be emitted as a single-quoted scalar.
  //
  // str:            the scalar text.
  // escapeNonAscii: when true, any byte >= 0x80 disqualifies the string
  //                 (single-quoted scalars have no escape sequences).
  // Returns false if the string contains a newline or, when escapeNonAscii is
  // set, a non-ASCII byte; true otherwise.
  bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) {
    // TODO: check for non-printable characters?
    for (const char ch : str) {
      const bool isNonAscii = static_cast<unsigned char>(ch) >= 0x80;
      if ((escapeNonAscii && isNonAscii) || ch == '\n') {
        return false;
      }
    }
    return true;
  }
  190. bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
  191. bool escapeNonAscii) {
  192. if (flowType == FlowType::Flow) {
  193. return false;
  194. }
  195. // TODO: check for non-printable characters?
  196. for (std::size_t i = 0; i < str.size(); i++) {
  197. if (escapeNonAscii && (0x80 <= static_cast<unsigned char>(str[i]))) {
  198. return false;
  199. }
  200. }
  201. return true;
  202. }
  // Splits a supplementary-plane code point into its UTF-16 surrogate pair.
  //
  // codePoint: a code point in U+10000..U+10FFFF (not validated here).
  // Returns {lead surrogate, trail surrogate}.
  //
  // The lead surrogate is 0xD800 plus the top 10 bits of
  // (codePoint - 0x10000). That offset must be combined by addition: OR-ing
  // it into a pre-biased base loses the carry and yields values below 0xD800
  // (e.g. 0xD7FD instead of 0xD83D for U+1F600).
  std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
    const uint32_t offset = static_cast<uint32_t>(codePoint) - 0x10000u;
    const uint16_t lead = static_cast<uint16_t>(0xD800u + (offset >> 10));
    const uint16_t trail = static_cast<uint16_t>(0xDC00u + (offset & 0x3FFu));
    return {lead, trail};
  }
  210. void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, bool escapeAsJson) {
  211. static const char hexDigits[] = "0123456789abcdef";
  212. out << "\\";
  213. int digits = 8;
  214. if (codePoint < 0xFF && !escapeAsJson) {
  215. out << "x";
  216. digits = 2;
  217. } else if (codePoint < 0xFFFF) {
  218. out << "u";
  219. digits = 4;
  220. } else if (!escapeAsJson) {
  221. out << "U";
  222. digits = 8;
  223. } else {
  224. auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
  225. WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, true);
  226. WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, true);
  227. return;
  228. }
  229. // Write digits into the escape sequence
  230. for (; digits > 0; --digits)
  231. out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
  232. }
  233. bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
  234. int codePoint;
  235. for (std::string::const_iterator i = str.begin();
  236. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  237. if (!IsAnchorChar(codePoint)) {
  238. return false;
  239. }
  240. WriteCodePoint(out, codePoint);
  241. }
  242. return true;
  243. }
  244. }
  245. StringFormat::value ComputeStringFormat(const std::string& str,
  246. EMITTER_MANIP strFormat,
  247. FlowType::value flowType,
  248. bool escapeNonAscii) {
  249. switch (strFormat) {
  250. case Auto:
  251. if (IsValidPlainScalar(str, flowType, escapeNonAscii)) {
  252. return StringFormat::Plain;
  253. }
  254. return StringFormat::DoubleQuoted;
  255. case SingleQuoted:
  256. if (IsValidSingleQuotedScalar(str, escapeNonAscii)) {
  257. return StringFormat::SingleQuoted;
  258. }
  259. return StringFormat::DoubleQuoted;
  260. case DoubleQuoted:
  261. return StringFormat::DoubleQuoted;
  262. case Literal:
  263. if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) {
  264. return StringFormat::Literal;
  265. }
  266. return StringFormat::DoubleQuoted;
  267. default:
  268. break;
  269. }
  270. return StringFormat::DoubleQuoted;
  271. }
  272. bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
  273. out << "'";
  274. int codePoint;
  275. for (std::string::const_iterator i = str.begin();
  276. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  277. if (codePoint == '\n') {
  278. return false; // We can't handle a new line and the attendant indentation
  279. // yet
  280. }
  281. if (codePoint == '\'') {
  282. out << "''";
  283. } else {
  284. WriteCodePoint(out, codePoint);
  285. }
  286. }
  287. out << "'";
  288. return true;
  289. }
  290. bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
  291. StringEscaping::value stringEscaping) {
  292. out << "\"";
  293. int codePoint;
  294. for (std::string::const_iterator i = str.begin();
  295. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  296. switch (codePoint) {
  297. case '\"':
  298. out << "\\\"";
  299. break;
  300. case '\\':
  301. out << "\\\\";
  302. break;
  303. case '\n':
  304. out << "\\n";
  305. break;
  306. case '\t':
  307. out << "\\t";
  308. break;
  309. case '\r':
  310. out << "\\r";
  311. break;
  312. case '\b':
  313. out << "\\b";
  314. break;
  315. case '\f':
  316. out << "\\f";
  317. break;
  318. default:
  319. if (codePoint < 0x20 ||
  320. (codePoint >= 0x80 &&
  321. codePoint <= 0xA0)) { // Control characters and non-breaking space
  322. WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping == StringEscaping::JSON);
  323. } else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be
  324. // escaped (YAML 1.2, sec. 5.2)
  325. WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping == StringEscaping::JSON);
  326. } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
  327. WriteDoubleQuoteEscapeSequence(out, codePoint, false);
  328. } else {
  329. WriteCodePoint(out, codePoint);
  330. }
  331. }
  332. }
  333. out << "\"";
  334. return true;
  335. }
  336. bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
  337. std::size_t indent) {
  338. out << "|\n";
  339. out << IndentTo(indent);
  340. int codePoint;
  341. for (std::string::const_iterator i = str.begin();
  342. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  343. if (codePoint == '\n') {
  344. out << "\n" << IndentTo(indent);
  345. } else {
  346. WriteCodePoint(out, codePoint);
  347. }
  348. }
  349. return true;
  350. }
  351. bool WriteChar(ostream_wrapper& out, char ch, bool escapeAsJson) {
  352. if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
  353. out << ch;
  354. } else if (ch == '\"') {
  355. out << "\"\\\"\"";
  356. } else if (ch == '\t') {
  357. out << "\"\\t\"";
  358. } else if (ch == '\n') {
  359. out << "\"\\n\"";
  360. } else if (ch == '\b') {
  361. out << "\"\\b\"";
  362. } else if (ch == '\r') {
  363. out << "\"\\r\"";
  364. } else if (ch == '\f') {
  365. out << "\"\\f\"";
  366. } else if (ch == '\\') {
  367. out << "\"\\\\\"";
  368. } else if ((0x20 <= ch && ch <= 0x7e) || ch == ' ') {
  369. out << "\"" << ch << "\"";
  370. } else {
  371. out << "\"";
  372. WriteDoubleQuoteEscapeSequence(out, ch, escapeAsJson);
  373. out << "\"";
  374. }
  375. return true;
  376. }
  377. bool WriteComment(ostream_wrapper& out, const std::string& str,
  378. std::size_t postCommentIndent) {
  379. const std::size_t curIndent = out.col();
  380. out << "#" << Indentation(postCommentIndent);
  381. out.set_comment();
  382. int codePoint;
  383. for (std::string::const_iterator i = str.begin();
  384. GetNextCodePointAndAdvance(codePoint, i, str.end());) {
  385. if (codePoint == '\n') {
  386. out << "\n" << IndentTo(curIndent) << "#"
  387. << Indentation(postCommentIndent);
  388. out.set_comment();
  389. } else {
  390. WriteCodePoint(out, codePoint);
  391. }
  392. }
  393. return true;
  394. }
  395. bool WriteAlias(ostream_wrapper& out, const std::string& str) {
  396. out << "*";
  397. return WriteAliasName(out, str);
  398. }
  399. bool WriteAnchor(ostream_wrapper& out, const std::string& str) {
  400. out << "&";
  401. return WriteAliasName(out, str);
  402. }
  403. bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) {
  404. out << (verbatim ? "!<" : "!");
  405. StringCharSource buffer(str.c_str(), str.size());
  406. const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
  407. while (buffer) {
  408. int n = reValid.Match(buffer);
  409. if (n <= 0) {
  410. return false;
  411. }
  412. while (--n >= 0) {
  413. out << buffer[0];
  414. ++buffer;
  415. }
  416. }
  417. if (verbatim) {
  418. out << ">";
  419. }
  420. return true;
  421. }
  422. bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
  423. const std::string& tag) {
  424. out << "!";
  425. StringCharSource prefixBuffer(prefix.c_str(), prefix.size());
  426. while (prefixBuffer) {
  427. int n = Exp::URI().Match(prefixBuffer);
  428. if (n <= 0) {
  429. return false;
  430. }
  431. while (--n >= 0) {
  432. out << prefixBuffer[0];
  433. ++prefixBuffer;
  434. }
  435. }
  436. out << "!";
  437. StringCharSource tagBuffer(tag.c_str(), tag.size());
  438. while (tagBuffer) {
  439. int n = Exp::Tag().Match(tagBuffer);
  440. if (n <= 0) {
  441. return false;
  442. }
  443. while (--n >= 0) {
  444. out << tagBuffer[0];
  445. ++tagBuffer;
  446. }
  447. }
  448. return true;
  449. }
  450. bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
  451. WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
  452. StringEscaping::None);
  453. return true;
  454. }
  455. }
  456. }