stream.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. #include <iostream>
  2. #include "stream.h"
  3. #ifndef YAML_PREFETCH_SIZE
  4. #define YAML_PREFETCH_SIZE 2048
  5. #endif
  6. #define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
  7. #define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
  8. #define CP_REPLACEMENT_CHARACTER (0xFFFD)
  9. namespace YAML {
  // States of the small state machine that sniffs the character encoding from
  // the first (up to four) bytes of the input. The numeric value of each
  // enumerator is used as a row index into s_introFinalState,
  // s_introTransitions and s_introUngetCount, so the order must not change.
  enum UtfIntroState {
    uis_start = 0,     //  0: nothing read yet
    uis_utfbe_b1,      //  1: read 00
    uis_utf32be_b2,    //  2: read 00 00
    uis_utf32be_bom3,  //  3: read 00 00 FE
    uis_utf32be,       //  4: final - UTF-32 big endian
    uis_utf16be,       //  5: final - UTF-16 big endian
    uis_utf16be_bom1,  //  6: read FE
    uis_utfle_bom1,    //  7: read FF
    uis_utf16le_bom2,  //  8: read FF FE
    uis_utf32le_bom3,  //  9: read FF FE 00
    uis_utf16le,       // 10: final - UTF-16 little endian
    uis_utf32le,       // 11: final - UTF-32 little endian
    uis_utf8_imp,      // 12: read one uictAscii byte, no BOM
    uis_utf16le_imp,   // 13: read xx 00
    uis_utf32le_imp3,  // 14: read xx 00 00
    uis_utf8_bom1,     // 15: read EF
    uis_utf8_bom2,     // 16: read EF BB
    uis_utf8,          // 17: final - UTF-8
    uis_error          // 18: final - has no row in the transition tables
  };
  // Classification of a single intro byte. The numeric value is the column
  // index into s_introTransitions and s_introUngetCount; uictMax is the number
  // of columns.
  enum UtfIntroCharType {
    uict00 = 0,  // byte 0x00
    uictBB,      // byte 0xBB
    uictBF,      // byte 0xBF
    uictEF,      // byte 0xEF
    uictFE,      // byte 0xFE
    uictFF,      // byte 0xFF
    uictAscii,   // any other byte accepted by IntroCharTypeOf as "ascii"
    uictOther,   // everything else, including end of file
    uictMax      // column count, not a real character class
  };
  // Indexed by UtfIntroState: true when the state terminates the encoding
  // detection loop. The table is read-only, hence const.
  static const bool s_introFinalState[] = {
      false,  // uis_start
      false,  // uis_utfbe_b1
      false,  // uis_utf32be_b2
      false,  // uis_utf32be_bom3
      true,   // uis_utf32be
      true,   // uis_utf16be
      false,  // uis_utf16be_bom1
      false,  // uis_utfle_bom1
      false,  // uis_utf16le_bom2
      false,  // uis_utf32le_bom3
      true,   // uis_utf16le
      true,   // uis_utf32le
      false,  // uis_utf8_imp
      false,  // uis_utf16le_imp
      false,  // uis_utf32le_imp3
      false,  // uis_utf8_bom1
      false,  // uis_utf8_bom2
      true,   // uis_utf8
      true,   // uis_error
  };
  63. static UtfIntroState s_introTransitions[][uictMax] = {
  64. // uict00, uictBB, uictBF, uictEF,
  65. // uictFE, uictFF, uictAscii, uictOther
  66. {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
  67. uis_utfle_bom1, uis_utf8_imp, uis_utf8},
  68. {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  69. uis_utf16be, uis_utf8},
  70. {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
  71. uis_utf8, uis_utf8},
  72. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
  73. uis_utf8},
  74. {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
  75. uis_utf32be, uis_utf32be, uis_utf32be},
  76. {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
  77. uis_utf16be, uis_utf16be, uis_utf16be},
  78. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
  79. uis_utf8},
  80. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
  81. uis_utf8, uis_utf8},
  82. {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  83. uis_utf16le, uis_utf16le, uis_utf16le},
  84. {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  85. uis_utf16le, uis_utf16le, uis_utf16le},
  86. {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  87. uis_utf16le, uis_utf16le, uis_utf16le},
  88. {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
  89. uis_utf32le, uis_utf32le, uis_utf32le},
  90. {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  91. uis_utf8, uis_utf8},
  92. {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  93. uis_utf16le, uis_utf16le, uis_utf16le},
  94. {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
  95. uis_utf16le, uis_utf16le, uis_utf16le},
  96. {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  97. uis_utf8},
  98. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  99. uis_utf8},
  100. {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
  101. uis_utf8},
  102. };
  103. static char s_introUngetCount[][uictMax] = {
  104. // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
  105. {0, 1, 1, 0, 0, 0, 0, 1},
  106. {0, 2, 2, 2, 2, 2, 2, 2},
  107. {3, 3, 3, 3, 0, 3, 3, 3},
  108. {4, 4, 4, 4, 4, 0, 4, 4},
  109. {1, 1, 1, 1, 1, 1, 1, 1},
  110. {1, 1, 1, 1, 1, 1, 1, 1},
  111. {2, 2, 2, 2, 2, 0, 2, 2},
  112. {2, 2, 2, 2, 0, 2, 2, 2},
  113. {0, 1, 1, 1, 1, 1, 1, 1},
  114. {0, 2, 2, 2, 2, 2, 2, 2},
  115. {1, 1, 1, 1, 1, 1, 1, 1},
  116. {1, 1, 1, 1, 1, 1, 1, 1},
  117. {0, 2, 2, 2, 2, 2, 2, 2},
  118. {0, 3, 3, 3, 3, 3, 3, 3},
  119. {4, 4, 4, 4, 4, 4, 4, 4},
  120. {2, 0, 2, 2, 2, 2, 2, 2},
  121. {3, 3, 0, 3, 3, 3, 3, 3},
  122. {1, 1, 1, 1, 1, 1, 1, 1},
  123. };
  124. inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
  125. if (std::istream::traits_type::eof() == ch) {
  126. return uictOther;
  127. }
  128. switch (ch) {
  129. case 0:
  130. return uict00;
  131. case 0xBB:
  132. return uictBB;
  133. case 0xBF:
  134. return uictBF;
  135. case 0xEF:
  136. return uictEF;
  137. case 0xFE:
  138. return uictFE;
  139. case 0xFF:
  140. return uictFF;
  141. }
  142. if ((ch > 0) && (ch < 0xFF)) {
  143. return uictAscii;
  144. }
  145. return uictOther;
  146. }
  // Builds one byte of a UTF-8 sequence.
  //   ch        - the code point being encoded
  //   lead_bits - number of leading 1 bits in the byte's marker (0 for a
  //               single-byte character, 1 for a continuation byte, 2..4 for
  //               the first byte of a multi-byte sequence)
  //   rshift    - how far ch is shifted right to select this byte's payload
  // The marker occupies the top lead_bits bits, followed by one 0 bit; the
  // remaining low bits carry the payload.
  inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
                         unsigned char rshift) {
    const int markerOnes = (1 << lead_bits) - 1;
    const unsigned char header = markerOnes << (8 - lead_bits);
    const unsigned char mask = (0xFF >> (lead_bits + 1));
    const unsigned long payload = (ch >> rshift) & mask;
    const unsigned char encoded = static_cast<unsigned char>(header | payload);
    return static_cast<char>(encoded);
  }
  154. inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
  155. // We are not allowed to queue the Stream::eof() codepoint, so
  156. // replace it with CP_REPLACEMENT_CHARACTER
  157. if (static_cast<unsigned long>(Stream::eof()) == ch) {
  158. ch = CP_REPLACEMENT_CHARACTER;
  159. }
  160. if (ch < 0x80) {
  161. q.push_back(Utf8Adjust(ch, 0, 0));
  162. } else if (ch < 0x800) {
  163. q.push_back(Utf8Adjust(ch, 2, 6));
  164. q.push_back(Utf8Adjust(ch, 1, 0));
  165. } else if (ch < 0x10000) {
  166. q.push_back(Utf8Adjust(ch, 3, 12));
  167. q.push_back(Utf8Adjust(ch, 1, 6));
  168. q.push_back(Utf8Adjust(ch, 1, 0));
  169. } else {
  170. q.push_back(Utf8Adjust(ch, 4, 18));
  171. q.push_back(Utf8Adjust(ch, 1, 12));
  172. q.push_back(Utf8Adjust(ch, 1, 6));
  173. q.push_back(Utf8Adjust(ch, 1, 0));
  174. }
  175. }
  176. Stream::Stream(std::istream& input)
  177. : m_input(input),
  178. m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
  179. m_nPrefetchedAvailable(0),
  180. m_nPrefetchedUsed(0) {
  181. typedef std::istream::traits_type char_traits;
  182. if (!input)
  183. return;
  184. // Determine (or guess) the character-set by reading the BOM, if any. See
  185. // the YAML specification for the determination algorithm.
  186. char_traits::int_type intro[4];
  187. int nIntroUsed = 0;
  188. UtfIntroState state = uis_start;
  189. for (; !s_introFinalState[state];) {
  190. std::istream::int_type ch = input.get();
  191. intro[nIntroUsed++] = ch;
  192. UtfIntroCharType charType = IntroCharTypeOf(ch);
  193. UtfIntroState newState = s_introTransitions[state][charType];
  194. int nUngets = s_introUngetCount[state][charType];
  195. if (nUngets > 0) {
  196. input.clear();
  197. for (; nUngets > 0; --nUngets) {
  198. if (char_traits::eof() != intro[--nIntroUsed])
  199. input.putback(char_traits::to_char_type(intro[nIntroUsed]));
  200. }
  201. }
  202. state = newState;
  203. }
  204. switch (state) {
  205. case uis_utf8:
  206. m_charSet = utf8;
  207. break;
  208. case uis_utf16le:
  209. m_charSet = utf16le;
  210. break;
  211. case uis_utf16be:
  212. m_charSet = utf16be;
  213. break;
  214. case uis_utf32le:
  215. m_charSet = utf32le;
  216. break;
  217. case uis_utf32be:
  218. m_charSet = utf32be;
  219. break;
  220. default:
  221. m_charSet = utf8;
  222. break;
  223. }
  224. ReadAheadTo(0);
  225. }
  226. Stream::~Stream() { delete[] m_pPrefetched; }
  227. char Stream::peek() const {
  228. if (m_readahead.empty()) {
  229. return Stream::eof();
  230. }
  231. return m_readahead[0];
  232. }
  233. Stream::operator bool() const {
  234. return m_input.good() ||
  235. (!m_readahead.empty() && m_readahead[0] != Stream::eof());
  236. }
  237. // get
  238. // . Extracts a character from the stream and updates our position
  239. char Stream::get() {
  240. char ch = peek();
  241. AdvanceCurrent();
  242. m_mark.column++;
  243. if (ch == '\n') {
  244. m_mark.column = 0;
  245. m_mark.line++;
  246. }
  247. return ch;
  248. }
  249. // get
  250. // . Extracts 'n' characters from the stream and updates our position
  251. std::string Stream::get(int n) {
  252. std::string ret;
  253. ret.reserve(n);
  254. for (int i = 0; i < n; i++)
  255. ret += get();
  256. return ret;
  257. }
  258. // eat
  259. // . Eats 'n' characters and updates our position.
  260. void Stream::eat(int n) {
  261. for (int i = 0; i < n; i++)
  262. get();
  263. }
  264. void Stream::AdvanceCurrent() {
  265. if (!m_readahead.empty()) {
  266. m_readahead.pop_front();
  267. m_mark.pos++;
  268. }
  269. ReadAheadTo(0);
  270. }
  271. bool Stream::_ReadAheadTo(size_t i) const {
  272. while (m_input.good() && (m_readahead.size() <= i)) {
  273. switch (m_charSet) {
  274. case utf8:
  275. StreamInUtf8();
  276. break;
  277. case utf16le:
  278. StreamInUtf16();
  279. break;
  280. case utf16be:
  281. StreamInUtf16();
  282. break;
  283. case utf32le:
  284. StreamInUtf32();
  285. break;
  286. case utf32be:
  287. StreamInUtf32();
  288. break;
  289. }
  290. }
  291. // signal end of stream
  292. if (!m_input.good())
  293. m_readahead.push_back(Stream::eof());
  294. return m_readahead.size() > i;
  295. }
  296. void Stream::StreamInUtf8() const {
  297. unsigned char b = GetNextByte();
  298. if (m_input.good()) {
  299. m_readahead.push_back(b);
  300. }
  301. }
  302. void Stream::StreamInUtf16() const {
  303. unsigned long ch = 0;
  304. unsigned char bytes[2];
  305. int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
  306. bytes[0] = GetNextByte();
  307. bytes[1] = GetNextByte();
  308. if (!m_input.good()) {
  309. return;
  310. }
  311. ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
  312. static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
  313. if (ch >= 0xDC00 && ch < 0xE000) {
  314. // Trailing (low) surrogate...ugh, wrong order
  315. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
  316. return;
  317. } else if (ch >= 0xD800 && ch < 0xDC00) {
  318. // ch is a leading (high) surrogate
  319. // Four byte UTF-8 code point
  320. // Read the trailing (low) surrogate
  321. for (;;) {
  322. bytes[0] = GetNextByte();
  323. bytes[1] = GetNextByte();
  324. if (!m_input.good()) {
  325. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
  326. return;
  327. }
  328. unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
  329. static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
  330. if (chLow < 0xDC00 || chLow >= 0xE000) {
  331. // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
  332. // stream.
  333. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
  334. // Deal with the next UTF-16 unit
  335. if (chLow < 0xD800 || chLow >= 0xE000) {
  336. // Easiest case: queue the codepoint and return
  337. QueueUnicodeCodepoint(m_readahead, ch);
  338. return;
  339. } else {
  340. // Start the loop over with the new high surrogate
  341. ch = chLow;
  342. continue;
  343. }
  344. }
  345. // Select the payload bits from the high surrogate
  346. ch &= 0x3FF;
  347. ch <<= 10;
  348. // Include bits from low surrogate
  349. ch |= (chLow & 0x3FF);
  350. // Add the surrogacy offset
  351. ch += 0x10000;
  352. break;
  353. }
  354. }
  355. QueueUnicodeCodepoint(m_readahead, ch);
  356. }
  // Reinterprets an unsigned byte buffer as the char* expected by the
  // std::streambuf read functions. The address is unchanged.
  inline char* ReadBuffer(unsigned char* pBuffer) {
    char* const asChars = reinterpret_cast<char*>(pBuffer);
    return asChars;
  }
  360. unsigned char Stream::GetNextByte() const {
  361. if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
  362. std::streambuf* pBuf = m_input.rdbuf();
  363. m_nPrefetchedAvailable = static_cast<std::size_t>(
  364. pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
  365. m_nPrefetchedUsed = 0;
  366. if (!m_nPrefetchedAvailable) {
  367. m_input.setstate(std::ios_base::eofbit);
  368. }
  369. if (0 == m_nPrefetchedAvailable) {
  370. return 0;
  371. }
  372. }
  373. return m_pPrefetched[m_nPrefetchedUsed++];
  374. }
  375. void Stream::StreamInUtf32() const {
  376. static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
  377. unsigned long ch = 0;
  378. unsigned char bytes[4];
  379. int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
  380. bytes[0] = GetNextByte();
  381. bytes[1] = GetNextByte();
  382. bytes[2] = GetNextByte();
  383. bytes[3] = GetNextByte();
  384. if (!m_input.good()) {
  385. return;
  386. }
  387. for (int i = 0; i < 4; ++i) {
  388. ch <<= 8;
  389. ch |= bytes[pIndexes[i]];
  390. }
  391. QueueUnicodeCodepoint(m_readahead, ch);
  392. }
  393. }