123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448 |
- #include <iostream>
- #include "stream.h"
// Size, in bytes, of the prefetch buffer the Stream allocates and of each
// bulk read it requests from the underlying stream buffer. A value supplied
// by the build (already defined at this point) takes precedence.
#if !defined(YAML_PREFETCH_SIZE)
#define YAML_PREFETCH_SIZE 2048
#endif

// Element count of a real array (must not be used on a pointer).
#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))

// Pointer one past the last element of a real array.
#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))

// U+FFFD REPLACEMENT CHARACTER: queued in place of code units/points that
// cannot be represented.
#define CP_REPLACEMENT_CHARACTER (0xFFFD)
- namespace YAML {
// States of the small automaton that inspects the first bytes of the input
// to pick a character set. The enumerators are used as row indexes into
// s_introFinalState, s_introTransitions and s_introUngetCount, so their order
// must not change. The "seen" notes list the intro bytes that lead to each
// intermediate state according to the transition table.
enum UtfIntroState {
  uis_start = 0,     // nothing read yet
  uis_utfbe_b1,      // seen: 00
  uis_utf32be_b2,    // seen: 00 00
  uis_utf32be_bom3,  // seen: 00 00 FE
  uis_utf32be,       // final: UTF-32 big-endian
  uis_utf16be,       // final: UTF-16 big-endian
  uis_utf16be_bom1,  // seen: FE
  uis_utfle_bom1,    // seen: FF
  uis_utf16le_bom2,  // seen: FF FE
  uis_utf32le_bom3,  // seen: FF FE 00
  uis_utf16le,       // final: UTF-16 little-endian
  uis_utf32le,       // final: UTF-32 little-endian
  uis_utf8_imp,      // seen: one non-zero byte classified as uictAscii
  uis_utf16le_imp,   // seen: xx 00
  uis_utf32le_imp3,  // seen: xx 00 00
  uis_utf8_bom1,     // seen: EF
  uis_utf8_bom2,     // seen: EF BB
  uis_utf8,          // final: UTF-8 (also the fallback)
  uis_error          // final; not produced by the transition table
};
// Classification of a single intro byte. The enumerators are used as column
// indexes into s_introTransitions and s_introUngetCount, so their order must
// not change.
enum UtfIntroCharType {
  uict00 = 0,  // byte 0x00
  uictBB,      // byte 0xBB
  uictBF,      // byte 0xBF
  uictEF,      // byte 0xEF
  uictFE,      // byte 0xFE
  uictFF,      // byte 0xFF
  uictAscii,   // any other byte in the range 0x01..0xFE
  uictOther,   // end of file or any value outside the byte range
  uictMax      // number of character types (column count of the tables)
};
// Indexed by UtfIntroState: true when the state ends encoding detection
// (the detection loop stops as soon as it reaches such a state).
static bool s_introFinalState[] = {
    /*  0 uis_start        */ false,
    /*  1 uis_utfbe_b1     */ false,
    /*  2 uis_utf32be_b2   */ false,
    /*  3 uis_utf32be_bom3 */ false,
    /*  4 uis_utf32be      */ true,
    /*  5 uis_utf16be      */ true,
    /*  6 uis_utf16be_bom1 */ false,
    /*  7 uis_utfle_bom1   */ false,
    /*  8 uis_utf16le_bom2 */ false,
    /*  9 uis_utf32le_bom3 */ false,
    /* 10 uis_utf16le      */ true,
    /* 11 uis_utf32le      */ true,
    /* 12 uis_utf8_imp     */ false,
    /* 13 uis_utf16le_imp  */ false,
    /* 14 uis_utf32le_imp3 */ false,
    /* 15 uis_utf8_bom1    */ false,
    /* 16 uis_utf8_bom2    */ false,
    /* 17 uis_utf8         */ true,
    /* 18 uis_error        */ true,
};
- static UtfIntroState s_introTransitions[][uictMax] = {
- // uict00, uictBB, uictBF, uictEF,
- // uictFE, uictFF, uictAscii, uictOther
- {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
- uis_utfle_bom1, uis_utf8_imp, uis_utf8},
- {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf16be, uis_utf8},
- {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
- uis_utf8, uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
- uis_utf8},
- {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
- uis_utf32be, uis_utf32be, uis_utf32be},
- {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
- uis_utf16be, uis_utf16be, uis_utf16be},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
- uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
- uis_utf8, uis_utf8},
- {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
- uis_utf32le, uis_utf32le, uis_utf32le},
- {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8, uis_utf8},
- {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
- uis_utf16le, uis_utf16le, uis_utf16le},
- {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8},
- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
- uis_utf8},
- };
- static char s_introUngetCount[][uictMax] = {
- // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
- {0, 1, 1, 0, 0, 0, 0, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {3, 3, 3, 3, 0, 3, 3, 3},
- {4, 4, 4, 4, 4, 0, 4, 4},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {2, 2, 2, 2, 2, 0, 2, 2},
- {2, 2, 2, 2, 0, 2, 2, 2},
- {0, 1, 1, 1, 1, 1, 1, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {1, 1, 1, 1, 1, 1, 1, 1},
- {0, 2, 2, 2, 2, 2, 2, 2},
- {0, 3, 3, 3, 3, 3, 3, 3},
- {4, 4, 4, 4, 4, 4, 4, 4},
- {2, 0, 2, 2, 2, 2, 2, 2},
- {3, 3, 0, 3, 3, 3, 3, 3},
- {1, 1, 1, 1, 1, 1, 1, 1},
- };
- inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
- if (std::istream::traits_type::eof() == ch) {
- return uictOther;
- }
- switch (ch) {
- case 0:
- return uict00;
- case 0xBB:
- return uictBB;
- case 0xBF:
- return uictBF;
- case 0xEF:
- return uictEF;
- case 0xFE:
- return uictFE;
- case 0xFF:
- return uictFF;
- }
- if ((ch > 0) && (ch < 0xFF)) {
- return uictAscii;
- }
- return uictOther;
- }
// Builds a single UTF-8 byte from a code point.
//
//   ch         code point being encoded
//   lead_bits  number of leading 1 bits in the produced byte (0 for a
//              single-byte character, 1 for a continuation byte, 2..4 for the
//              first byte of a 2..4 byte sequence)
//   rshift     how far to shift ch right to bring this byte's payload bits
//              into the low positions
//
// The byte is the lead_bits-long run of 1s in the high bits, followed by a
// 0 bit, followed by the low (7 - lead_bits) bits of (ch >> rshift).
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
                       unsigned char rshift) {
  const int leadingOnes = (1 << lead_bits) - 1;
  const unsigned char header =
      static_cast<unsigned char>(leadingOnes << (8 - lead_bits));
  const unsigned char mask =
      static_cast<unsigned char>(0xFF >> (lead_bits + 1));
  const unsigned long payload = (ch >> rshift) & mask;
  return static_cast<char>(static_cast<unsigned char>(header | payload));
}
- inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
- // We are not allowed to queue the Stream::eof() codepoint, so
- // replace it with CP_REPLACEMENT_CHARACTER
- if (static_cast<unsigned long>(Stream::eof()) == ch) {
- ch = CP_REPLACEMENT_CHARACTER;
- }
- if (ch < 0x80) {
- q.push_back(Utf8Adjust(ch, 0, 0));
- } else if (ch < 0x800) {
- q.push_back(Utf8Adjust(ch, 2, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- } else if (ch < 0x10000) {
- q.push_back(Utf8Adjust(ch, 3, 12));
- q.push_back(Utf8Adjust(ch, 1, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- } else {
- q.push_back(Utf8Adjust(ch, 4, 18));
- q.push_back(Utf8Adjust(ch, 1, 12));
- q.push_back(Utf8Adjust(ch, 1, 6));
- q.push_back(Utf8Adjust(ch, 1, 0));
- }
- }
- Stream::Stream(std::istream& input)
- : m_input(input),
- m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
- m_nPrefetchedAvailable(0),
- m_nPrefetchedUsed(0) {
- typedef std::istream::traits_type char_traits;
- if (!input)
- return;
- // Determine (or guess) the character-set by reading the BOM, if any. See
- // the YAML specification for the determination algorithm.
- char_traits::int_type intro[4];
- int nIntroUsed = 0;
- UtfIntroState state = uis_start;
- for (; !s_introFinalState[state];) {
- std::istream::int_type ch = input.get();
- intro[nIntroUsed++] = ch;
- UtfIntroCharType charType = IntroCharTypeOf(ch);
- UtfIntroState newState = s_introTransitions[state][charType];
- int nUngets = s_introUngetCount[state][charType];
- if (nUngets > 0) {
- input.clear();
- for (; nUngets > 0; --nUngets) {
- if (char_traits::eof() != intro[--nIntroUsed])
- input.putback(char_traits::to_char_type(intro[nIntroUsed]));
- }
- }
- state = newState;
- }
- switch (state) {
- case uis_utf8:
- m_charSet = utf8;
- break;
- case uis_utf16le:
- m_charSet = utf16le;
- break;
- case uis_utf16be:
- m_charSet = utf16be;
- break;
- case uis_utf32le:
- m_charSet = utf32le;
- break;
- case uis_utf32be:
- m_charSet = utf32be;
- break;
- default:
- m_charSet = utf8;
- break;
- }
- ReadAheadTo(0);
- }
- Stream::~Stream() { delete[] m_pPrefetched; }
- char Stream::peek() const {
- if (m_readahead.empty()) {
- return Stream::eof();
- }
- return m_readahead[0];
- }
- Stream::operator bool() const {
- return m_input.good() ||
- (!m_readahead.empty() && m_readahead[0] != Stream::eof());
- }
- // get
- // . Extracts a character from the stream and updates our position
- char Stream::get() {
- char ch = peek();
- AdvanceCurrent();
- m_mark.column++;
- if (ch == '\n') {
- m_mark.column = 0;
- m_mark.line++;
- }
- return ch;
- }
- // get
- // . Extracts 'n' characters from the stream and updates our position
- std::string Stream::get(int n) {
- std::string ret;
- ret.reserve(n);
- for (int i = 0; i < n; i++)
- ret += get();
- return ret;
- }
- // eat
- // . Eats 'n' characters and updates our position.
- void Stream::eat(int n) {
- for (int i = 0; i < n; i++)
- get();
- }
- void Stream::AdvanceCurrent() {
- if (!m_readahead.empty()) {
- m_readahead.pop_front();
- m_mark.pos++;
- }
- ReadAheadTo(0);
- }
- bool Stream::_ReadAheadTo(size_t i) const {
- while (m_input.good() && (m_readahead.size() <= i)) {
- switch (m_charSet) {
- case utf8:
- StreamInUtf8();
- break;
- case utf16le:
- StreamInUtf16();
- break;
- case utf16be:
- StreamInUtf16();
- break;
- case utf32le:
- StreamInUtf32();
- break;
- case utf32be:
- StreamInUtf32();
- break;
- }
- }
- // signal end of stream
- if (!m_input.good())
- m_readahead.push_back(Stream::eof());
- return m_readahead.size() > i;
- }
- void Stream::StreamInUtf8() const {
- unsigned char b = GetNextByte();
- if (m_input.good()) {
- m_readahead.push_back(b);
- }
- }
- void Stream::StreamInUtf16() const {
- unsigned long ch = 0;
- unsigned char bytes[2];
- int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- if (!m_input.good()) {
- return;
- }
- ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
- static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
- if (ch >= 0xDC00 && ch < 0xE000) {
- // Trailing (low) surrogate...ugh, wrong order
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- return;
- } else if (ch >= 0xD800 && ch < 0xDC00) {
- // ch is a leading (high) surrogate
- // Four byte UTF-8 code point
- // Read the trailing (low) surrogate
- for (;;) {
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- if (!m_input.good()) {
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- return;
- }
- unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
- static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
- if (chLow < 0xDC00 || chLow >= 0xE000) {
- // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the
- // stream.
- QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
- // Deal with the next UTF-16 unit
- if (chLow < 0xD800 || chLow >= 0xE000) {
- // Easiest case: queue the codepoint and return
- QueueUnicodeCodepoint(m_readahead, ch);
- return;
- } else {
- // Start the loop over with the new high surrogate
- ch = chLow;
- continue;
- }
- }
- // Select the payload bits from the high surrogate
- ch &= 0x3FF;
- ch <<= 10;
- // Include bits from low surrogate
- ch |= (chLow & 0x3FF);
- // Add the surrogacy offset
- ch += 0x10000;
- break;
- }
- }
- QueueUnicodeCodepoint(m_readahead, ch);
- }
// Reinterprets an unsigned-char buffer as a char buffer (same address), so it
// can be handed to stream-buffer read functions that take char*.
inline char* ReadBuffer(unsigned char* pBuffer) {
  char* const asChars = reinterpret_cast<char*>(pBuffer);
  return asChars;
}
- unsigned char Stream::GetNextByte() const {
- if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
- std::streambuf* pBuf = m_input.rdbuf();
- m_nPrefetchedAvailable = static_cast<std::size_t>(
- pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
- m_nPrefetchedUsed = 0;
- if (!m_nPrefetchedAvailable) {
- m_input.setstate(std::ios_base::eofbit);
- }
- if (0 == m_nPrefetchedAvailable) {
- return 0;
- }
- }
- return m_pPrefetched[m_nPrefetchedUsed++];
- }
- void Stream::StreamInUtf32() const {
- static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
- unsigned long ch = 0;
- unsigned char bytes[4];
- int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
- bytes[0] = GetNextByte();
- bytes[1] = GetNextByte();
- bytes[2] = GetNextByte();
- bytes[3] = GetNextByte();
- if (!m_input.good()) {
- return;
- }
- for (int i = 0; i < 4; ++i) {
- ch <<= 8;
- ch |= bytes[pIndexes[i]];
- }
- QueueUnicodeCodepoint(m_readahead, ch);
- }
- }
|