// stream.cpp
  1. #include <iostream>
  2. #include "stream.h"
  // Capacity, in bytes, of the raw-input prefetch buffer that Stream allocates.
  // A build may supply its own value by defining the macro beforehand.
  #ifndef YAML_PREFETCH_SIZE
  #define YAML_PREFETCH_SIZE 2048
  #endif

  // Element count of a true array A (the argument must not be a pointer).
  #define S_ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0]))

  // Pointer one past the last element of a true array A.
  #define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))

  // U+FFFD REPLACEMENT CHARACTER: queued in place of input that cannot be
  // represented (broken surrogate pairs, the reserved eof code point).
  #define CP_REPLACEMENT_CHARACTER (0xFFFD)
  9. namespace YAML {
  // States of the encoding-detection automaton that the Stream constructor
  // runs over the first bytes of the input.
  //
  // The enumerator order matters: the numeric values are used as row indexes
  // into s_introFinalState, s_introTransitions and s_introUngetCount.
  // "Seen" below lists the bytes consumed on the path into the state.
  enum UtfIntroState {
    uis_start,         // initial state, nothing examined yet
    uis_utfbe_b1,      // seen 00
    uis_utf32be_b2,    // seen 00 00
    uis_utf32be_bom3,  // seen 00 00 FE
    uis_utf32be,       // final: UTF-32, big-endian
    uis_utf16be,       // final: UTF-16, big-endian
    uis_utf16be_bom1,  // seen FE
    uis_utfle_bom1,    // seen FF
    uis_utf16le_bom2,  // seen FF FE
    uis_utf32le_bom3,  // seen FF FE 00
    uis_utf16le,       // final: UTF-16, little-endian
    uis_utf32le,       // final: UTF-32, little-endian
    uis_utf8_imp,      // seen one uictAscii byte (no BOM)
    uis_utf16le_imp,   // seen one uictAscii byte, then 00
    uis_utf32le_imp3,  // seen one uictAscii byte, then 00 00
    uis_utf8_bom1,     // seen EF
    uis_utf8_bom2,     // seen EF BB
    uis_utf8,          // final: UTF-8
    uis_error          // final; the constructor falls back to UTF-8 for it
  };
  // Input classes used by the encoding-detection automaton; each value is a
  // column index into s_introTransitions and s_introUngetCount.
  enum UtfIntroCharType {
    uict00,     // byte 0x00
    uictBB,     // byte 0xBB
    uictBF,     // byte 0xBF
    uictEF,     // byte 0xEF
    uictFE,     // byte 0xFE
    uictFF,     // byte 0xFF
    uictAscii,  // any other value in 0x01..0xFE (wider than 7-bit ASCII)
    uictOther,  // end of file, or any value outside the classes above
    uictMax     // number of classes; used as the table column count
  };
  // Indexed by UtfIntroState: true when the state ends encoding detection.
  static bool s_introFinalState[] = {
      /* uis_start        */ false,
      /* uis_utfbe_b1     */ false,
      /* uis_utf32be_b2   */ false,
      /* uis_utf32be_bom3 */ false,
      /* uis_utf32be      */ true,
      /* uis_utf16be      */ true,
      /* uis_utf16be_bom1 */ false,
      /* uis_utfle_bom1   */ false,
      /* uis_utf16le_bom2 */ false,
      /* uis_utf32le_bom3 */ false,
      /* uis_utf16le      */ true,
      /* uis_utf32le      */ true,
      /* uis_utf8_imp     */ false,
      /* uis_utf16le_imp  */ false,
      /* uis_utf32le_imp3 */ false,
      /* uis_utf8_bom1    */ false,
      /* uis_utf8_bom2    */ false,
      /* uis_utf8         */ true,
      /* uis_error        */ true,
  };
  63. static UtfIntroState s_introTransitions[][uictMax] = {
  64. // uict00, uictBB, uictBF, uictEF,
  65. // uictFE, uictFF, uictAscii, uictOther
  66. {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
  67. uis_utfle_bom1, uis_utf8_imp, uis_utf8},
  68. {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  69. uis_utf16be, uis_utf8},
  70. {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
  71. uis_utf8, uis_utf8},
  72. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
  73. uis_utf8},
  74. {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
  75. uis_utf32be, uis_utf32be, uis_utf32be},
  76. {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
  77. uis_utf16be, uis_utf16be, uis_utf16be},
  78. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
  79. uis_utf8},
  80. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
  81. uis_utf8, uis_utf8},
  82. {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  83. uis_utf16le, uis_utf16le, uis_utf16le},
  84. {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  85. uis_utf16le, uis_utf16le, uis_utf16le},
  86. {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  87. uis_utf16le, uis_utf16le, uis_utf16le},
  88. {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
  89. uis_utf32le, uis_utf32le, uis_utf32le},
  90. {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  91. uis_utf8, uis_utf8},
  92. {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  93. uis_utf16le, uis_utf16le, uis_utf16le},
  94. {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  95. uis_utf16le, uis_utf16le, uis_utf16le},
  96. {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  97. uis_utf8},
  98. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  99. uis_utf8},
  100. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  101. uis_utf8},
  102. };
  103. static char s_introUngetCount[][uictMax] = {
  104. // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
  105. {0, 1, 1, 0, 0, 0, 0, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
  106. {3, 3, 3, 3, 0, 3, 3, 3}, {4, 4, 4, 4, 4, 0, 4, 4},
  107. {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
  108. {2, 2, 2, 2, 2, 0, 2, 2}, {2, 2, 2, 2, 0, 2, 2, 2},
  109. {0, 1, 1, 1, 1, 1, 1, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
  110. {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
  111. {0, 2, 2, 2, 2, 2, 2, 2}, {0, 3, 3, 3, 3, 3, 3, 3},
  112. {4, 4, 4, 4, 4, 4, 4, 4}, {2, 0, 2, 2, 2, 2, 2, 2},
  113. {3, 3, 0, 3, 3, 3, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1},
  114. };
  115. inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
  116. if (std::istream::traits_type::eof() == ch) {
  117. return uictOther;
  118. }
  119. switch (ch) {
  120. case 0:
  121. return uict00;
  122. case 0xBB:
  123. return uictBB;
  124. case 0xBF:
  125. return uictBF;
  126. case 0xEF:
  127. return uictEF;
  128. case 0xFE:
  129. return uictFE;
  130. case 0xFF:
  131. return uictFF;
  132. }
  133. if ((ch > 0) && (ch < 0xFF)) {
  134. return uictAscii;
  135. }
  136. return uictOther;
  137. }
  // Builds one byte of a UTF-8 sequence.
  //
  //   ch        - code point being encoded
  //   lead_bits - number of leading 1 bits in the byte's marker (0 for a
  //               single-byte character, 1 for a continuation byte, 2..4 for
  //               the first byte of a 2..4 byte sequence)
  //   rshift    - how far ch is shifted right to bring this byte's payload
  //               bits into the low positions
  //
  // Returns the marker bits combined with the selected payload bits.
  inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
                         unsigned char rshift) {
    // Payload occupies everything below the marker and its terminating 0 bit.
    const unsigned char payloadMask = (0xFF >> (lead_bits + 1));
    // lead_bits ones, left-aligned within the byte.
    const unsigned char marker =
        static_cast<unsigned char>(((1 << lead_bits) - 1) << (8 - lead_bits));
    const unsigned long payload = (ch >> rshift) & payloadMask;
    const unsigned char encoded = static_cast<unsigned char>(marker | payload);
    return static_cast<char>(encoded);
  }
  146. inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
  147. // We are not allowed to queue the Stream::eof() codepoint, so
  148. // replace it with CP_REPLACEMENT_CHARACTER
  149. if (static_cast<unsigned long>(Stream::eof()) == ch) {
  150. ch = CP_REPLACEMENT_CHARACTER;
  151. }
  152. if (ch < 0x80) {
  153. q.push_back(Utf8Adjust(ch, 0, 0));
  154. } else if (ch < 0x800) {
  155. q.push_back(Utf8Adjust(ch, 2, 6));
  156. q.push_back(Utf8Adjust(ch, 1, 0));
  157. } else if (ch < 0x10000) {
  158. q.push_back(Utf8Adjust(ch, 3, 12));
  159. q.push_back(Utf8Adjust(ch, 1, 6));
  160. q.push_back(Utf8Adjust(ch, 1, 0));
  161. } else {
  162. q.push_back(Utf8Adjust(ch, 4, 18));
  163. q.push_back(Utf8Adjust(ch, 1, 12));
  164. q.push_back(Utf8Adjust(ch, 1, 6));
  165. q.push_back(Utf8Adjust(ch, 1, 0));
  166. }
  167. }
  168. Stream::Stream(std::istream& input)
  169. : m_input(input),
  170. m_mark{},
  171. m_charSet{},
  172. m_readahead{},
  173. m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
  174. m_nPrefetchedAvailable(0),
  175. m_nPrefetchedUsed(0) {
  176. using char_traits = std::istream::traits_type;
  177. if (!input)
  178. return;
  179. // Determine (or guess) the character-set by reading the BOM, if any. See
  180. // the YAML specification for the determination algorithm.
  181. char_traits::int_type intro[4]{};
  182. int nIntroUsed = 0;
  183. UtfIntroState state = uis_start;
  184. for (; !s_introFinalState[state];) {
  185. std::istream::int_type ch = input.get();
  186. intro[nIntroUsed++] = ch;
  187. UtfIntroCharType charType = IntroCharTypeOf(ch);
  188. UtfIntroState newState = s_introTransitions[state][charType];
  189. int nUngets = s_introUngetCount[state][charType];
  190. if (nUngets > 0) {
  191. input.clear();
  192. for (; nUngets > 0; --nUngets) {
  193. if (char_traits::eof() != intro[--nIntroUsed])
  194. input.putback(char_traits::to_char_type(intro[nIntroUsed]));
  195. }
  196. }
  197. state = newState;
  198. }
  199. switch (state) {
  200. case uis_utf8:
  201. m_charSet = utf8;
  202. break;
  203. case uis_utf16le:
  204. m_charSet = utf16le;
  205. break;
  206. case uis_utf16be:
  207. m_charSet = utf16be;
  208. break;
  209. case uis_utf32le:
  210. m_charSet = utf32le;
  211. break;
  212. case uis_utf32be:
  213. m_charSet = utf32be;
  214. break;
  215. default:
  216. m_charSet = utf8;
  217. break;
  218. }
  219. ReadAheadTo(0);
  220. }
  221. Stream::~Stream() { delete[] m_pPrefetched; }
  222. char Stream::peek() const {
  223. if (m_readahead.empty()) {
  224. return Stream::eof();
  225. }
  226. return m_readahead[0];
  227. }
  228. Stream::operator bool() const {
  229. return m_input.good() ||
  230. (!m_readahead.empty() && m_readahead[0] != Stream::eof());
  231. }
  232. // get
  233. // . Extracts a character from the stream and updates our position
  234. char Stream::get() {
  235. char ch = peek();
  236. AdvanceCurrent();
  237. m_mark.column++;
  238. if (ch == '\n') {
  239. m_mark.column = 0;
  240. m_mark.line++;
  241. }
  242. return ch;
  243. }
  244. // get
  245. // . Extracts 'n' characters from the stream and updates our position
  246. std::string Stream::get(int n) {
  247. std::string ret;
  248. if (n > 0) {
  249. ret.reserve(static_cast<std::string::size_type>(n));
  250. for (int i = 0; i < n; i++)
  251. ret += get();
  252. }
  253. return ret;
  254. }
  255. // eat
  256. // . Eats 'n' characters and updates our position.
  257. void Stream::eat(int n) {
  258. for (int i = 0; i < n; i++)
  259. get();
  260. }
  261. void Stream::AdvanceCurrent() {
  262. if (!m_readahead.empty()) {
  263. m_readahead.pop_front();
  264. m_mark.pos++;
  265. }
  266. ReadAheadTo(0);
  267. }
  268. bool Stream::_ReadAheadTo(size_t i) const {
  269. while (m_input.good() && (m_readahead.size() <= i)) {
  270. switch (m_charSet) {
  271. case utf8:
  272. StreamInUtf8();
  273. break;
  274. case utf16le:
  275. StreamInUtf16();
  276. break;
  277. case utf16be:
  278. StreamInUtf16();
  279. break;
  280. case utf32le:
  281. StreamInUtf32();
  282. break;
  283. case utf32be:
  284. StreamInUtf32();
  285. break;
  286. }
  287. }
  288. // signal end of stream
  289. if (!m_input.good())
  290. m_readahead.push_back(Stream::eof());
  291. return m_readahead.size() > i;
  292. }
  293. void Stream::StreamInUtf8() const {
  294. unsigned char b = GetNextByte();
  295. if (m_input.good()) {
  296. m_readahead.push_back(static_cast<char>(b));
  297. }
  298. }
  299. void Stream::StreamInUtf16() const {
  300. unsigned long ch = 0;
  301. unsigned char bytes[2];
  302. int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
  303. bytes[0] = GetNextByte();
  304. bytes[1] = GetNextByte();
  305. if (!m_input.good()) {
  306. return;
  307. }
  308. ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
  309. static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
  310. if (ch >= 0xDC00 && ch < 0xE000) {
  311. // Trailing (low) surrogate...ugh, wrong order
  312. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
  313. return;
  314. }
  315. if (ch >= 0xD800 && ch < 0xDC00) {
  316. // ch is a leading (high) surrogate
  317. // Four byte UTF-8 code point
  318. // Read the trailing (low) surrogate
  319. for (;;) {
  320. bytes[0] = GetNextByte();
  321. bytes[1] = GetNextByte();
  322. if (!m_input.good()) {
  323. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
  324. return;
  325. }
  326. unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
  327. static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
  328. if (chLow < 0xDC00 || chLow >= 0xE000) {
  329. // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
  330. // stream.
  331. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
  332. // Deal with the next UTF-16 unit
  333. if (chLow < 0xD800 || chLow >= 0xE000) {
  334. // Easiest case: queue the codepoint and return
  335. QueueUnicodeCodepoint(m_readahead, ch);
  336. return;
  337. }
  338. // Start the loop over with the new high surrogate
  339. ch = chLow;
  340. continue;
  341. }
  342. // Select the payload bits from the high surrogate
  343. ch &= 0x3FF;
  344. ch <<= 10;
  345. // Include bits from low surrogate
  346. ch |= (chLow & 0x3FF);
  347. // Add the surrogacy offset
  348. ch += 0x10000;
  349. break;
  350. }
  351. }
  352. QueueUnicodeCodepoint(m_readahead, ch);
  353. }
  // Reinterprets an unsigned-byte buffer as the char* expected by
  // std::streambuf read calls.  The returned pointer aliases pBuffer.
  inline char* ReadBuffer(unsigned char* pBuffer) {
    char* const asChars = reinterpret_cast<char*>(pBuffer);
    return asChars;
  }
  357. unsigned char Stream::GetNextByte() const {
  358. if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
  359. std::streambuf* pBuf = m_input.rdbuf();
  360. m_nPrefetchedAvailable = static_cast<std::size_t>(
  361. pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
  362. m_nPrefetchedUsed = 0;
  363. if (!m_nPrefetchedAvailable) {
  364. m_input.setstate(std::ios_base::eofbit);
  365. }
  366. if (0 == m_nPrefetchedAvailable) {
  367. return 0;
  368. }
  369. }
  370. return m_pPrefetched[m_nPrefetchedUsed++];
  371. }
  372. void Stream::StreamInUtf32() const {
  373. static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
  374. unsigned long ch = 0;
  375. unsigned char bytes[4];
  376. int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
  377. bytes[0] = GetNextByte();
  378. bytes[1] = GetNextByte();
  379. bytes[2] = GetNextByte();
  380. bytes[3] = GetNextByte();
  381. if (!m_input.good()) {
  382. return;
  383. }
  384. for (int i = 0; i < 4; ++i) {
  385. ch <<= 8;
  386. ch |= bytes[pIndexes[i]];
  387. }
  388. QueueUnicodeCodepoint(m_readahead, ch);
  389. }
  390. } // namespace YAML