| /* |
| * JSON parser: C++ oriented JSON parser. |
| */ |
| #ifndef minijson_h |
| #define minijson_h |
| |
| #include <iostream> |
| #include <map> |
| #include <sstream> |
| #include <string> |
| #include <vector> |
| |
| //#define __MINIJSON_LIBERAL |
| |
| // We recommended to use simdjson from_chars. |
| // Using strtod() is a fallback |
| #if defined(MINIJSON_USE_STRTOD) |
| // Use stdlib's strtod |
| #include <cstring> |
| #else |
| |
| namespace minijson { |
| namespace simdjson { |
| namespace internal { |
| |
| double from_chars(const char *first) noexcept; |
| double from_chars(const char *first, const char *end) noexcept; |
| |
| char *to_chars(char *first, const char *last, double value); |
| |
| } // namespace internal |
| } // namespace simdjson |
| } // namespace minijson |
| |
| #endif |
| |
| namespace minijson { |
| |
| // Simple C++ implementation of Python's OrderedDict like dictonary |
| // (preserves key insertion order) |
| // Modified for JSON: |
| // - No duplicated key allowed |
| |
| template <typename T> |
| class ordered_dict { |
| public: |
| bool at(const size_t idx, T *dst) const { |
| if (idx >= _keys.size()) { |
| return false; |
| } |
| |
| if (!_m.count(_keys[idx])) { |
| // This should not happen though. |
| return false; |
| } |
| |
| (*dst) = _m.at(_keys[idx]); |
| |
| return true; |
| } |
| |
| bool count(const std::string &key) const { return _m.count(key); } |
| |
| void insert(const std::string &key, const T &value) { |
| if (_m.count(key)) { |
| // overwrite existing value |
| } else { |
| _keys.push_back(key); |
| } |
| |
| _m[key] = value; |
| } |
| |
| void insert(const std::string &key, T &&value) { |
| if (_m.count(key)) { |
| // overwrite existing value |
| } else { |
| _keys.push_back(key); |
| } |
| |
| _m[key] = std::move(value); |
| } |
| |
| bool at(const std::string &key, T *dst) const { |
| if (!_m.count(key)) { |
| // This should not happen though. |
| return false; |
| } |
| |
| (*dst) = _m.at(key); |
| |
| return true; |
| } |
| |
| const std::vector<std::string> &keys() const { return _keys; } |
| |
| size_t size() const { return _m.size(); } |
| |
| bool erase(const std::string &key) { |
| // simple linear search |
| for (size_t i = 0; i < _keys.size(); i++) { |
| if (_keys[i] == key) { |
| _keys.erase(_keys.begin() + i); |
| _m.erase(key); |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| private: |
| std::vector<std::string> _keys; |
| std::map<std::string, T> _m; |
| }; |
| |
| namespace detail { |
| |
| double from_chars(const char *p); |
| const char *my_strchr(const char *p, int ch); |
| |
| } // namespace detail |
| |
| namespace detail { |
| |
| // |
| // Usage: |
| // - set_input() |
| // - scan_string() |
| // - success: use `token_buffer` string |
| // - error: use `error_message` |
| // |
| struct string_parser { |
| // input string must be UTF-8 |
| void set_input(const std::string &s) { _input = s; } |
| |
| bool scan_string(); |
| |
| void reset() { |
| if (_input.size()) { |
| current = _input[0]; |
| } else { |
| current = '\0'; |
| } |
| curr_idx = 0; |
| token_buffer.clear(); |
| } |
| |
| // fetch next token. |
| unsigned char get() { |
| if ((curr_idx + 1) < _input.size()) { |
| curr_idx++; |
| current = _input[curr_idx]; |
| return current; |
| } |
| current = '\0'; |
| return current; |
| } |
| |
| bool eof() { |
| if (_input.empty()) { |
| return true; |
| } |
| |
| if (curr_idx >= _input.size()) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| void add(const unsigned char c) { token_buffer += c; } |
| |
| void add(const int i) { |
| // use lower 8bit |
| token_buffer += static_cast<unsigned char>(i & 0xff); |
| } |
| |
| int get_codepoint(); |
| |
| bool next_byte_in_range(const std::initializer_list<int> ranges); |
| |
| std::string error_message; |
| std::string token_buffer; // output |
| |
| unsigned char current{'\0'}; |
| size_t curr_idx{0}; |
| std::string _input; |
| }; |
| |
| } // namespace detail |
| |
| typedef enum { |
| unknown_type, |
| null_type, |
| boolean_type, |
| number_type, |
| string_type, |
| array_type, |
| object_type, |
| } type; |
| |
| typedef enum { |
| no_error, |
| undefined_error, |
| invalid_token_error, |
| unknown_type_error, |
| memory_allocation_error, |
| corrupted_json_error, |
| duplicated_key_error, |
| } error; |
| |
| class value; |
| |
| typedef bool boolean; |
| typedef double number; |
| typedef std::string string; |
| typedef ordered_dict<value> object; |
| typedef std::vector<value> array; |
| typedef struct { |
| } null_t; |
| |
| // null_t null; |
| |
| template <typename T> |
| struct TypeTraits; |
| |
| template <> |
| struct TypeTraits<null_t> { |
| static constexpr uint32_t type_id() { return 0; } |
| }; |
| |
| template <> |
| struct TypeTraits<boolean> { |
| static constexpr uint32_t type_id() { return 1; } |
| }; |
| |
| template <> |
| struct TypeTraits<number> { |
| static constexpr uint32_t type_id() { return 2; } |
| }; |
| |
| template <> |
| struct TypeTraits<string> { |
| static constexpr uint32_t type_id() { return 3; } |
| }; |
| |
| template <> |
| struct TypeTraits<object> { |
| static constexpr uint32_t type_id() { return 4; } |
| }; |
| |
| template <> |
| struct TypeTraits<array> { |
| static constexpr uint32_t type_id() { return 5; } |
| }; |
| |
| class value { |
| private: |
| type t; |
| union { |
| null_t n; |
| boolean b; |
| number d; |
| std::string *s; |
| array *a; |
| object *o; |
| } u; |
| |
| void _free_u() { |
| if (t == string_type) { |
| delete this->u.s; |
| this->u.s = nullptr; |
| } |
| if (t == array_type) { |
| delete this->u.a; |
| this->u.a = nullptr; |
| } |
| if (t == object_type) { |
| delete this->u.o; |
| this->u.o = nullptr; |
| } |
| } |
| |
| public: |
| value() : t(unknown_type), u() {} |
| value(null_t n) : t(null_type), u() { u.n = n; } |
| value(boolean b) : t(boolean_type), u() { u.b = b; } |
| value(number d) : t(boolean_type), u() { u.d = d; } |
| value(const char *s) : t(string_type), u() { u.s = new std::string(s); } |
| value(const std::string &s) : t(string_type), u() { |
| u.s = new std::string(s); |
| } |
| value(const array &a) : t(array_type), u() { u.a = new array(a); } |
| value(const object &o) : t(object_type), u() { u.o = new object(o); } |
| value(const value &v) : t(v.t), u() { |
| if (t == array_type) { |
| u.a = new array(); |
| *u.a = *v.u.a; |
| } else if (t == object_type) { |
| u.o = new object(); |
| *u.o = *v.u.o; |
| } else if (t == string_type) { |
| u.s = new std::string(); |
| *u.s = *v.u.s; |
| } else |
| u.d = v.u.d; |
| } |
| ~value() { _free_u(); } |
| |
| template <typename T> |
| bool is() const { |
| if (TypeTraits<T>::type_id() == TypeTraits<null_t>::type_id() && |
| t == null_type) |
| return true; |
| if (TypeTraits<T>::type_id() == TypeTraits<boolean>::type_id() && |
| t == boolean_type) |
| return true; |
| if (TypeTraits<T>::type_id() == TypeTraits<number>::type_id() && |
| t == number_type) |
| return true; |
| if (TypeTraits<T>::type_id() == TypeTraits<std::string>::type_id() && |
| t == string_type) |
| return true; |
| if (TypeTraits<T>::type_id() == TypeTraits<array>::type_id() && |
| t == array_type) |
| return true; |
| if (TypeTraits<T>::type_id() == TypeTraits<object>::type_id() && |
| t == object_type) |
| return true; |
| return false; |
| } |
| |
| template <typename T> |
| const T *as() const { |
| if ((t == array_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<array>::type_id())) { |
| return reinterpret_cast<const T *>(u.a); |
| } |
| |
| if ((t == object_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<object>::type_id())) { |
| return reinterpret_cast<const T *>(u.o); |
| } |
| |
| if ((t == string_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<std::string>::type_id())) { |
| return reinterpret_cast<const T *>(u.s); |
| } |
| |
| if ((t == null_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<null_t>::type_id())) { |
| return reinterpret_cast<const T *>(&u.n); |
| } |
| |
| if ((t == boolean_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<boolean>::type_id())) { |
| return reinterpret_cast<const T *>(&u.b); |
| } |
| |
| if ((t == number_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<number>::type_id())) { |
| return reinterpret_cast<const T *>(&u.d); |
| } |
| |
| return nullptr; |
| } |
| |
| template <typename T> |
| T *as() { |
| if ((t == array_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<array>::type_id())) { |
| return reinterpret_cast<T *>(u.a); |
| } |
| |
| if ((t == object_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<object>::type_id())) { |
| return reinterpret_cast<T *>(u.o); |
| } |
| |
| if ((t == string_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<string>::type_id())) { |
| return reinterpret_cast<T *>(u.s); |
| } |
| |
| if ((t == null_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<null_t>::type_id())) { |
| return reinterpret_cast<T *>(&u.n); |
| } |
| |
| if ((t == boolean_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<boolean>::type_id())) { |
| return reinterpret_cast<T *>(&u.b); |
| } |
| |
| if ((t == number_type) && |
| (TypeTraits<T>::type_id() == TypeTraits<number>::type_id())) { |
| return reinterpret_cast<T *>(&u.d); |
| } |
| |
| return nullptr; |
| } |
| |
| null_t &operator=(null_t &n) { |
| t = null_type; |
| u.n = n; |
| return u.n; |
| } |
| boolean &operator=(boolean b) { |
| t = boolean_type; |
| u.b = b; |
| return u.b; |
| } |
| number &operator=(number d) { |
| t = number_type; |
| u.d = d; |
| return u.d; |
| } |
| const std::string &operator=(const char *s) { |
| _free_u(); |
| t = string_type; |
| u.s = new std::string(s); |
| return *u.s; |
| } |
| const std::string &operator=(const std::string &s) { |
| _free_u(); |
| t = string_type; |
| u.s = new std::string(s); |
| return *u.s; |
| } |
| const object &operator=(const object &o) { |
| _free_u(); |
| t = object_type; |
| u.o = new object(o); |
| return *u.o; |
| } |
| const array &operator=(const array &a) { |
| _free_u(); |
| t = array_type; |
| u.a = new array(a); |
| return *u.a; |
| } |
| const value &operator=(const value &v) { |
| _free_u(); |
| t = v.t; |
| if (t == array_type) { |
| u.a = new array(*v.u.a); |
| } else if (t == object_type) { |
| u.o = new object(*v.u.o); |
| } else if (t == string_type) { |
| u.s = new std::string(*v.u.s); |
| } else |
| u.d = v.u.d; |
| return *this; |
| } |
| |
| std::string type_name() const { |
| if (t == array_type) { |
| return "array"; |
| } |
| |
| if (t == object_type) { |
| return "object"; |
| } |
| |
| if (t == string_type) { |
| return "string"; |
| } |
| |
| if (t == null_type) { |
| return "null"; |
| } |
| |
| if (t == boolean_type) { |
| return "boolean"; |
| } |
| |
| if (t == number_type) { |
| return "number"; |
| } |
| |
| return "[[invalid]]"; |
| } |
| |
| std::string str(const char *p) const { |
| std::stringstream ss; |
| ss << '"'; |
| while (*p) { |
| if (*p == '\n') { |
| ss << "\\n"; |
| } else if (*p == '\r') { |
| ss << "\\r"; |
| } else if (*p == '\t') { |
| ss << "\\t"; |
| } else if (detail::my_strchr("\"", *p)) { |
| ss << "\\" << *p; |
| } else { |
| ss << *p; |
| } |
| p++; |
| } |
| ss << '"'; |
| return ss.str(); |
| } |
| |
| std::string str() const { |
| std::stringstream ss; |
| if (t == unknown_type) { |
| ss << "undefined"; |
| } else if (t == null_type) { |
| ss << "null"; |
| } else if (t == boolean_type) { |
| ss << (u.b ? "true" : "false"); |
| } else if (t == number_type) { |
| ss << double(u.d); |
| } else if (t == string_type) { |
| ss << str(u.s->c_str()); |
| } else if (const array *pa = as<array>()) { |
| array::const_iterator i; |
| ss << "["; |
| // array a = get<array>(); |
| for (i = pa->begin(); i != pa->end(); i++) { |
| if (i != pa->begin()) ss << ", "; |
| ss << i->str(); |
| } |
| ss << "]"; |
| } else if (auto po = as<object>()) { |
| // object::const_iterator i; |
| ss << "{"; |
| // object o = get<object>(); |
| for (size_t i = 0; i < po->size(); i++) { |
| if (i > 0) ss << ", "; |
| ss << "\"" << po->keys()[i] << "\""; |
| |
| value v; |
| if (po->at(i, &v)) { |
| ss << ": " << v.str(); |
| } else { |
| // TODO: report error |
| ss << ": null"; |
| } |
| } |
| ss << "}"; |
| } |
| return ss.str(); |
| } |
| }; |
| |
| #define MINIJSON_SKIP(i) \ |
| while (*i && detail::my_strchr("\r\n \t", *i)) { \ |
| i++; \ |
| } |
| |
| template <typename Iter> |
| inline error parse_object(Iter &i, value &v) { |
| object o; |
| i++; |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| if (*i != '\x7d') { |
| while (*i) { |
| value vk, vv; |
| error e = parse_string(i, vk); |
| if (e != no_error) return e; |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| if (*i != ':') return invalid_token_error; |
| i++; |
| e = parse_any(i, vv); |
| if (e != no_error) return e; |
| |
| auto ps = vk.as<std::string>(); |
| if (!ps) { |
| return unknown_type_error; |
| } |
| |
| if (o.count(*ps)) { |
| return duplicated_key_error; |
| } |
| o.insert(*ps, vv); |
| |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| if (*i == '\x7d') break; |
| if (*i != ',') return invalid_token_error; |
| i++; |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| #ifdef __MINIJSON_LIBERAL |
| if (*i == '\x7d') break; |
| #endif |
| } |
| } |
| v = value(o); |
| i++; |
| return no_error; |
| } |
| |
| template <typename Iter> |
| inline error parse_array(Iter &i, value &v) { |
| array a; |
| i++; |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| if (*i != ']') { |
| while (*i) { |
| value va; |
| error e = parse_any(i, va); |
| if (e != no_error) return e; |
| a.push_back(va); |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| if (*i == ']') break; |
| if (*i != ',') return invalid_token_error; |
| i++; |
| MINIJSON_SKIP(i) |
| if (!(*i)) { |
| return corrupted_json_error; |
| } |
| #ifdef __MINIJSON_LIBERAL |
| if (*i == '\x7d') break; |
| #endif |
| } |
| } |
| v = value(a); |
| i++; |
| return no_error; |
| } |
| |
| template <typename Iter> |
| inline error parse_null(Iter &i, value &v) { |
| Iter p = i; |
| if (*i == 'n' && *(i + 1) == 'u' && *(i + 2) == 'l' && *(i + 3) == 'l') { |
| i += 4; |
| v = null_t(); |
| } |
| if (*i && nullptr == detail::my_strchr(":,\x7d]\r\n ", *i)) { |
| i = p; |
| return undefined_error; |
| } |
| return no_error; |
| } |
| |
| template <typename Iter> |
| inline error parse_boolean(Iter &i, value &v) { |
| Iter p = i; |
| if (*i == 't' && *(i + 1) == 'r' && *(i + 2) == 'u' && *(i + 3) == 'e') { |
| i += 4; |
| v = static_cast<boolean>(true); |
| } else if (*i == 'f' && *(i + 1) == 'a' && *(i + 2) == 'l' && |
| *(i + 3) == 's' && *(i + 4) == 'e') { |
| i += 5; |
| v = static_cast<boolean>(false); |
| } |
| if (*i && nullptr == detail::my_strchr(":,\x7d]\r\n ", *i)) { |
| i = p; |
| return undefined_error; |
| } |
| return no_error; |
| } |
| |
| template <typename Iter> |
| inline error parse_number(Iter &i, value &v) { |
| Iter p = i; |
| |
| #define MINIJSON_IS_NUM(x) ('0' <= x && x <= '9') |
| #define MINIJSON_IS_ALNUM(x) \ |
| (('0' <= x && x <= '9') || ('a' <= x && x <= 'f') || ('A' <= x && x <= 'F')) |
| if (*i == '0' && *(i + 1) == 'x' && MINIJSON_IS_ALNUM(*(i + 2))) { |
| i += 3; |
| while (MINIJSON_IS_ALNUM(*i)) i++; |
| v = static_cast<number>(detail::from_chars(p)); |
| } else { |
| while (MINIJSON_IS_NUM(*i)) i++; |
| if (*i == '.') { |
| i++; |
| if (!MINIJSON_IS_NUM(*i)) { |
| i = p; |
| return invalid_token_error; |
| } |
| while (MINIJSON_IS_NUM(*i)) i++; |
| } |
| if (*i == 'e') { |
| i++; |
| if (!MINIJSON_IS_NUM(*i)) { |
| i = p; |
| return invalid_token_error; |
| } |
| while (MINIJSON_IS_NUM(*i)) i++; |
| } |
| v = static_cast<number>(detail::from_chars(p)); |
| } |
| if (*i && nullptr == detail::my_strchr(":,\x7d]\r\n ", *i)) { |
| i = p; |
| return invalid_token_error; |
| } |
| return no_error; |
| } |
| |
| template <typename Iter> |
| inline error parse_string(Iter &i, value &v) { |
| if (*i != '"') return invalid_token_error; |
| |
| Iter s = i; |
| char t = *i++; // = '"' |
| Iter p = i; |
| |
| #if 0 |
| std::stringstream ss; |
| while (*i && *i != t) { |
| if (*i == '\\' && *(i + 1)) { |
| i++; |
| if (*i == 'n') |
| ss << "\n"; |
| else if (*i == 'r') |
| ss << "\r"; |
| else if (*i == 't') |
| ss << "\t"; |
| else |
| ss << *i; |
| } else { |
| ss << *i; |
| } |
| i++; |
| } |
| #else |
| // read until '"' |
| while (*i && *i != t) { |
| if (*i == '\\' && *(i + 1)) { |
| i++; |
| } |
| i++; |
| } |
| |
| #endif |
| if (!*i) return invalid_token_error; |
| if (i < p) { |
| return corrupted_json_error; |
| } |
| |
| #if 0 |
| v = std::string(p, size_t(i - p)); |
| |
| i++; |
| if (*i && nullptr == detail::my_strchr(":,\x7d]\r\n ", *i)) { |
| i = p; |
| return invalid_token_error; |
| } |
| |
| #else |
| |
| i++; |
| if (*i && nullptr == detail::my_strchr(":,\x7d]\r\n ", *i)) { |
| i = p; |
| return invalid_token_error; |
| } |
| |
| // include first and last '"' char |
| std::string buf(s, size_t(i - s)); |
| |
| detail::string_parser str_parser; |
| str_parser.set_input(buf); |
| |
| if (!str_parser.scan_string()) { |
| // TODO: error message |
| // str_parser.error_message; |
| return invalid_token_error; |
| } else { |
| v = str_parser.token_buffer; |
| } |
| |
| #endif |
| |
| return no_error; |
| } |
| |
| template <typename Iter> |
| inline error parse_any(Iter &i, value &v) { |
| MINIJSON_SKIP(i) |
| if (*i == '\x7b') return parse_object(i, v); |
| if (*i == '[') return parse_array(i, v); |
| if (*i == 't' || *i == 'f') return parse_boolean(i, v); |
| if (*i == 'n') return parse_null(i, v); |
| if ('0' <= *i && *i <= '9') return parse_number(i, v); |
| if (*i == '"') return parse_string(i, v); |
| return invalid_token_error; |
| } |
| |
| template <typename Iter> |
| inline error parse(Iter &i, value &v) { |
| return parse_any(i, v); |
| } |
| |
| #undef MINIJSON_SKIP |
| |
| inline const char *errstr(error e) { |
| const char *s = "unknown error"; |
| switch (e) { |
| case no_error: { |
| s = "no error"; |
| break; |
| } |
| case undefined_error: { |
| s = "undefined"; |
| break; |
| } |
| case invalid_token_error: { |
| s = "invalid token"; |
| break; |
| } |
| case unknown_type_error: { |
| s = "unknown type"; |
| break; |
| } |
| case memory_allocation_error: { |
| s = "memory allocation error"; |
| break; |
| } |
| case corrupted_json_error: { |
| s = "input is corrupted"; |
| break; |
| } |
| case duplicated_key_error: { |
| s = "duplicated key found"; |
| break; |
| } |
| // default: return "unknown error"; |
| } |
| |
| return s; |
| } |
| |
| } // namespace minijson |
| |
| #if defined(MINIJSON_IMPLEMENTATION) |
| |
| namespace minijson { |
| |
| namespace detail { |
| |
| // clang-format off |
| // |
| // From json.hpp --------------------------------------------------------- |
| // __ _____ _____ _____ |
| // __| | __| | | | JSON for Modern C++ |
| // | | |__ | | | | | | version 3.11.3 |
| // |_____|_____|_____|_|___| https://github.com/nlohmann/json |
| // |
| // SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me> |
| // SPDX-License-Identifier: MIT |
| |
| #if 1 |
| #define JSON_HEDLEY_UNLIKELY(cond) (cond) |
| #define JSON_HEDLEY_LIKELY(cond) (cond) |
| |
| /*! |
| @brief get codepoint from 4 hex characters following `\u` |
| |
| For input "\u c1 c2 c3 c4" the codepoint is: |
| (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 |
| = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) |
| |
| Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' |
| must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The |
| conversion is done by subtracting the offset (0x30, 0x37, and 0x57) |
| between the ASCII value of the character and the desired integer value. |
| |
| @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or |
| non-hex character) |
| */ |
| int string_parser::get_codepoint() |
| { |
| // this function only makes sense after reading `\u` |
| //JSON_ASSERT(current == 'u'); |
| if (current != 'u') { |
| return -1; |
| } |
| int codepoint = 0; |
| |
| const auto factors = { 12u, 8u, 4u, 0u }; |
| for (const auto factor : factors) |
| { |
| get(); |
| |
| if (current >= '0' && current <= '9') |
| { |
| codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor); |
| } |
| else if (current >= 'A' && current <= 'F') |
| { |
| codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor); |
| } |
| else if (current >= 'a' && current <= 'f') |
| { |
| codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor); |
| } |
| else |
| { |
| return -1; |
| } |
| } |
| |
| if (0x0000 <= codepoint && codepoint <= 0xFFFF) { |
| } else { |
| return -1; |
| } |
| return codepoint; |
| } |
| |
| /*! |
| @brief check if the next byte(s) are inside a given range |
| |
| Adds the current byte and, for each passed range, reads a new byte and |
| checks if it is inside the range. If a violation was detected, set up an |
| error message and return false. Otherwise, return true. |
| |
| @param[in] ranges list of integers; interpreted as list of pairs of |
| inclusive lower and upper bound, respectively |
| |
| @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, |
| 1, 2, or 3 pairs. This precondition is enforced by an assertion. |
| |
| @return true if and only if no range violation was detected |
| */ |
| bool string_parser::next_byte_in_range(const std::initializer_list<int> ranges) |
| { |
| if (ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6) { |
| } else { |
| return false; |
| } |
| |
| add(current); |
| |
| for (auto range = ranges.begin(); range != ranges.end(); ++range) |
| { |
| get(); |
| if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions) |
| { |
| add(current); |
| } |
| else |
| { |
| error_message = "invalid string: ill-formed UTF-8 byte"; |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| /*! |
| @brief scan a string literal |
| |
| This function scans a string according to Sect. 7 of RFC 8259. While |
| scanning, bytes are escaped and copied into buffer token_buffer. Then the |
| function returns successfully, token_buffer is *not* null-terminated (as it |
| may contain \0 bytes), and token_buffer.size() is the number of bytes in the |
| string. |
| |
| @return true if string could be successfully scanned, |
| false otherwise |
| |
| @note In case of errors, variable error_message contains a textual |
| description. |
| */ |
| bool string_parser::scan_string() |
| { |
| // reset token_buffer (ignore opening quote) |
| reset(); |
| |
| // we entered the function by reading an open quote |
| //JSON_ASSERT(current == '\"'); |
| if (current != '\"') { |
| error_message = "first character must be '\"'"; |
| return false; |
| } |
| |
| |
| while (!eof()) |
| { |
| |
| // get next character |
| switch (get()) |
| { |
| |
| // closing quote |
| case '\"': |
| { |
| return true; |
| } |
| |
| // escapes |
| case '\\': |
| { |
| switch (get()) |
| { |
| // quotation mark |
| case '\"': |
| add('\"'); |
| break; |
| // reverse solidus |
| case '\\': |
| add('\\'); |
| break; |
| // solidus |
| case '/': |
| add('/'); |
| break; |
| // backspace |
| case 'b': |
| add('\b'); |
| break; |
| // form feed |
| case 'f': |
| add('\f'); |
| break; |
| // line feed |
| case 'n': |
| add('\n'); |
| break; |
| // carriage return |
| case 'r': |
| add('\r'); |
| break; |
| // tab |
| case 't': |
| add('\t'); |
| break; |
| |
| // unicode escapes |
| case 'u': |
| { |
| const int codepoint1 = get_codepoint(); |
| int codepoint = codepoint1; // start with codepoint1 |
| |
| if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) |
| { |
| error_message = "invalid string: '\\u' must be followed by 4 hex digits"; |
| return false; |
| } |
| |
| // check if code point is a high surrogate |
| if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) |
| { |
| // expect next \uxxxx entry |
| if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) |
| { |
| const int codepoint2 = get_codepoint(); |
| |
| if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) |
| { |
| error_message = "invalid string: '\\u' must be followed by 4 hex digits"; |
| return false; |
| } |
| |
| // check if codepoint2 is a low surrogate |
| if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) |
| { |
| // overwrite codepoint |
| codepoint = static_cast<int>( |
| // high surrogate occupies the most significant 22 bits |
| (static_cast<unsigned int>(codepoint1) << 10u) |
| // low surrogate occupies the least significant 15 bits |
| + static_cast<unsigned int>(codepoint2) |
| // there is still the 0xD800, 0xDC00 and 0x10000 noise |
| // in the result, so we have to subtract with: |
| // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 |
| - 0x35FDC00u); |
| } |
| else |
| { |
| error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; |
| return false; |
| } |
| } |
| else |
| { |
| error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; |
| return false; |
| } |
| } |
| else |
| { |
| if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) |
| { |
| error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; |
| return false; |
| } |
| } |
| |
| // result of the above calculation yields a proper codepoint |
| //JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); |
| if (0x00 <= codepoint && codepoint <= 0x10FFFF) { |
| } else { |
| error_message = "invalid string: invalid codepoint"; |
| return false; |
| } |
| |
| // translate codepoint into bytes |
| if (codepoint < 0x80) |
| { |
| // 1-byte characters: 0xxxxxxx (ASCII) |
| add(static_cast<int>(codepoint)); |
| } |
| else if (codepoint <= 0x7FF) |
| { |
| // 2-byte characters: 110xxxxx 10xxxxxx |
| add(static_cast<int>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u))); |
| add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
| } |
| else if (codepoint <= 0xFFFF) |
| { |
| // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx |
| add(static_cast<int>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u))); |
| add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); |
| add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
| } |
| else |
| { |
| // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| add(static_cast<int>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u))); |
| add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu))); |
| add(static_cast<int>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); |
| add(static_cast<int>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
| } |
| |
| break; |
| } |
| |
| // other characters after escape |
| default: |
| error_message = "invalid string: forbidden character after backslash"; |
| return false; |
| } |
| |
| break; |
| } |
| |
| // invalid control characters |
| case 0x00: |
| { |
| error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; |
| return false; |
| } |
| |
| case 0x01: |
| { |
| error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; |
| return false; |
| } |
| |
| case 0x02: |
| { |
| error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; |
| return false; |
| } |
| |
| case 0x03: |
| { |
| error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; |
| return false; |
| } |
| |
| case 0x04: |
| { |
| error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; |
| return false; |
| } |
| |
| case 0x05: |
| { |
| error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; |
| return false; |
| } |
| |
| case 0x06: |
| { |
| error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; |
| return false; |
| } |
| |
| case 0x07: |
| { |
| error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; |
| return false; |
| } |
| |
| case 0x08: |
| { |
| error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; |
| return false; |
| } |
| |
| case 0x09: |
| { |
| error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; |
| return false; |
| } |
| |
| case 0x0A: |
| { |
| error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; |
| return false; |
| } |
| |
| case 0x0B: |
| { |
| error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; |
| return false; |
| } |
| |
| case 0x0C: |
| { |
| error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; |
| return false; |
| } |
| |
| case 0x0D: |
| { |
| error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; |
| return false; |
| } |
| |
| case 0x0E: |
| { |
| error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; |
| return false; |
| } |
| |
| case 0x0F: |
| { |
| error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; |
| return false; |
| } |
| |
| case 0x10: |
| { |
| error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; |
| return false; |
| } |
| |
| case 0x11: |
| { |
| error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; |
| return false; |
| } |
| |
| case 0x12: |
| { |
| error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; |
| return false; |
| } |
| |
| case 0x13: |
| { |
| error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; |
| return false; |
| } |
| |
| case 0x14: |
| { |
| error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; |
| return false; |
| } |
| |
| case 0x15: |
| { |
| error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; |
| return false; |
| } |
| |
| case 0x16: |
| { |
| error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; |
| return false; |
| } |
| |
| case 0x17: |
| { |
| error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; |
| return false; |
| } |
| |
| case 0x18: |
| { |
| error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; |
| return false; |
| } |
| |
| case 0x19: |
| { |
| error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; |
| return false; |
| } |
| |
| case 0x1A: |
| { |
| error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; |
| return false; |
| } |
| |
| case 0x1B: |
| { |
| error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; |
| return false; |
| } |
| |
| case 0x1C: |
| { |
| error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; |
| return false; |
| } |
| |
| case 0x1D: |
| { |
| error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; |
| return false; |
| } |
| |
| case 0x1E: |
| { |
| error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; |
| return false; |
| } |
| |
| case 0x1F: |
| { |
| error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; |
| return false; |
| } |
| |
| // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) |
| case 0x20: |
| case 0x21: |
| case 0x23: |
| case 0x24: |
| case 0x25: |
| case 0x26: |
| case 0x27: |
| case 0x28: |
| case 0x29: |
| case 0x2A: |
| case 0x2B: |
| case 0x2C: |
| case 0x2D: |
| case 0x2E: |
| case 0x2F: |
| case 0x30: |
| case 0x31: |
| case 0x32: |
| case 0x33: |
| case 0x34: |
| case 0x35: |
| case 0x36: |
| case 0x37: |
| case 0x38: |
| case 0x39: |
| case 0x3A: |
| case 0x3B: |
| case 0x3C: |
| case 0x3D: |
| case 0x3E: |
| case 0x3F: |
| case 0x40: |
| case 0x41: |
| case 0x42: |
| case 0x43: |
| case 0x44: |
| case 0x45: |
| case 0x46: |
| case 0x47: |
| case 0x48: |
| case 0x49: |
| case 0x4A: |
| case 0x4B: |
| case 0x4C: |
| case 0x4D: |
| case 0x4E: |
| case 0x4F: |
| case 0x50: |
| case 0x51: |
| case 0x52: |
| case 0x53: |
| case 0x54: |
| case 0x55: |
| case 0x56: |
| case 0x57: |
| case 0x58: |
| case 0x59: |
| case 0x5A: |
| case 0x5B: |
| case 0x5D: |
| case 0x5E: |
| case 0x5F: |
| case 0x60: |
| case 0x61: |
| case 0x62: |
| case 0x63: |
| case 0x64: |
| case 0x65: |
| case 0x66: |
| case 0x67: |
| case 0x68: |
| case 0x69: |
| case 0x6A: |
| case 0x6B: |
| case 0x6C: |
| case 0x6D: |
| case 0x6E: |
| case 0x6F: |
| case 0x70: |
| case 0x71: |
| case 0x72: |
| case 0x73: |
| case 0x74: |
| case 0x75: |
| case 0x76: |
| case 0x77: |
| case 0x78: |
| case 0x79: |
| case 0x7A: |
| case 0x7B: |
| case 0x7C: |
| case 0x7D: |
| case 0x7E: |
| case 0x7F: |
| { |
| add(current); |
| break; |
| } |
| |
| // U+0080..U+07FF: bytes C2..DF 80..BF |
| case 0xC2: |
| case 0xC3: |
| case 0xC4: |
| case 0xC5: |
| case 0xC6: |
| case 0xC7: |
| case 0xC8: |
| case 0xC9: |
| case 0xCA: |
| case 0xCB: |
| case 0xCC: |
| case 0xCD: |
| case 0xCE: |
| case 0xCF: |
| case 0xD0: |
| case 0xD1: |
| case 0xD2: |
| case 0xD3: |
| case 0xD4: |
| case 0xD5: |
| case 0xD6: |
| case 0xD7: |
| case 0xD8: |
| case 0xD9: |
| case 0xDA: |
| case 0xDB: |
| case 0xDC: |
| case 0xDD: |
| case 0xDE: |
| case 0xDF: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // U+0800..U+0FFF: bytes E0 A0..BF 80..BF |
| case 0xE0: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF |
| // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF |
| case 0xE1: |
| case 0xE2: |
| case 0xE3: |
| case 0xE4: |
| case 0xE5: |
| case 0xE6: |
| case 0xE7: |
| case 0xE8: |
| case 0xE9: |
| case 0xEA: |
| case 0xEB: |
| case 0xEC: |
| case 0xEE: |
| case 0xEF: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // U+D000..U+D7FF: bytes ED 80..9F 80..BF |
| case 0xED: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
| case 0xF0: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
| case 0xF1: |
| case 0xF2: |
| case 0xF3: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
| case 0xF4: |
| { |
| if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) |
| { |
| return false; |
| } |
| break; |
| } |
| |
| // remaining bytes (80..C1 and F5..FF) are ill-formed |
| default: |
| { |
| error_message = "invalid string: ill-formed UTF-8 byte"; |
| return false; |
| } |
| } |
| } |
| |
| error_message = "invalid string: missing closing quote"; |
| return false; |
| } |
| #endif |
| // end json.hpp |
| // clang-format on |
| |
| } // namespace detail |
| |
| namespace detail { |
| |
| double from_chars(const char *p) { |
| #if defined(MINIJSON_USE_STRTOD) |
| return strtod(p, nullptr); |
| #else |
| return simdjson::internal::from_chars(p); |
| #endif |
| } |
| |
| const char *my_strchr(const char *p, int ch) { |
| char c; |
| |
| constexpr uint64_t kMaxCount = 1024ull * 1024ull; // up to 1M chars |
| |
| uint64_t cnt{0}; |
| |
| c = ch; |
| for (;; ++p, cnt++) { |
| if (cnt > kMaxCount) { |
| return nullptr; |
| } |
| |
| if (*p == c) { |
| return (p); |
| } |
| if (*p == '\0') { |
| return (nullptr); |
| } |
| } |
| } |
| |
| } // namespace detail |
| } // namespace minijson |
| |
| #if !defined(MINIJSON_USE_STRTOD) |
| |
| #include <cstring> |
| #include <limits> |
| |
| namespace minijson { |
| namespace simdjson { |
| namespace internal { |
| |
| /** |
| * The code in the internal::from_chars function is meant to handle the |
| *floating-point number parsing when we have more than 19 digits in the decimal |
| *mantissa. This should only be seen in adversarial scenarios: we do not expect |
| *production systems to even produce such floating-point numbers. |
| * |
| * The parser is based on work by Nigel Tao (at |
| *https://github.com/google/wuffs/) who credits Ken Thompson for the design (via |
| *a reference to the Go source code). See |
| * https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c |
| * https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c |
| * It is probably not very fast but it is a fallback that should almost never be |
| * called in real life. Google Wuffs is published under APL 2.0. |
| **/ |
| |
| namespace { |
| constexpr uint32_t max_digits = 768; |
| constexpr int32_t decimal_point_range = 2047; |
| } // namespace |
| |
| struct adjusted_mantissa { |
| uint64_t mantissa; |
| int power2; |
| adjusted_mantissa() : mantissa(0), power2(0) {} |
| }; |
| |
| struct decimal { |
| uint32_t num_digits; |
| int32_t decimal_point; |
| bool negative; |
| bool truncated; |
| uint8_t digits[max_digits]; |
| }; |
| |
| template <typename T> |
| struct binary_format { |
| static constexpr int mantissa_explicit_bits(); |
| static constexpr int minimum_exponent(); |
| static constexpr int infinite_power(); |
| static constexpr int sign_index(); |
| }; |
| |
| template <> |
| constexpr int binary_format<double>::mantissa_explicit_bits() { |
| return 52; |
| } |
| |
| template <> |
| constexpr int binary_format<double>::minimum_exponent() { |
| return -1023; |
| } |
| template <> |
| constexpr int binary_format<double>::infinite_power() { |
| return 0x7FF; |
| } |
| |
| template <> |
| constexpr int binary_format<double>::sign_index() { |
| return 63; |
| } |
| |
| inline bool is_integer(char c) noexcept { return (c >= '0' && c <= '9'); } |
| |
| // This should always succeed since it follows a call to parse_number. |
| static decimal parse_decimal(const char *&p) noexcept { |
| decimal answer; |
| answer.num_digits = 0; |
| answer.decimal_point = 0; |
| answer.truncated = false; |
| answer.negative = (*p == '-'); |
| if ((*p == '-') || (*p == '+')) { |
| ++p; |
| } |
| |
| while (*p == '0') { |
| ++p; |
| } |
| while (is_integer(*p)) { |
| if (answer.num_digits < max_digits) { |
| answer.digits[answer.num_digits] = uint8_t(*p - '0'); |
| } |
| answer.num_digits++; |
| ++p; |
| } |
| if (*p == '.') { |
| ++p; |
| const char *first_after_period = p; |
| // if we have not yet encountered a zero, we have to skip it as well |
| if (answer.num_digits == 0) { |
| // skip zeros |
| while (*p == '0') { |
| ++p; |
| } |
| } |
| while (is_integer(*p)) { |
| if (answer.num_digits < max_digits) { |
| answer.digits[answer.num_digits] = uint8_t(*p - '0'); |
| } |
| answer.num_digits++; |
| ++p; |
| } |
| answer.decimal_point = int32_t(first_after_period - p); |
| } |
| if (answer.num_digits > 0) { |
| const char *preverse = p - 1; |
| int32_t trailing_zeros = 0; |
| while ((*preverse == '0') || (*preverse == '.')) { |
| if (*preverse == '0') { |
| trailing_zeros++; |
| } |
| --preverse; |
| } |
| answer.decimal_point += int32_t(answer.num_digits); |
| answer.num_digits -= uint32_t(trailing_zeros); |
| } |
| if (answer.num_digits > max_digits) { |
| answer.num_digits = max_digits; |
| answer.truncated = true; |
| } |
| if (('e' == *p) || ('E' == *p)) { |
| ++p; |
| bool neg_exp = false; |
| if ('-' == *p) { |
| neg_exp = true; |
| ++p; |
| } else if ('+' == *p) { |
| ++p; |
| } |
| int32_t exp_number = 0; // exponential part |
| while (is_integer(*p)) { |
| uint8_t digit = uint8_t(*p - '0'); |
| if (exp_number < 0x10000) { |
| exp_number = 10 * exp_number + digit; |
| } |
| ++p; |
| } |
| answer.decimal_point += (neg_exp ? -exp_number : exp_number); |
| } |
| return answer; |
| } |
| |
| // This should always succeed since it follows a call to parse_number. |
| // Will not read at or beyond the "end" pointer. |
| static decimal parse_decimal(const char *&p, const char *end) noexcept { |
| decimal answer; |
| answer.num_digits = 0; |
| answer.decimal_point = 0; |
| answer.truncated = false; |
| if (p == end) { |
| return answer; |
| } // should never happen |
| answer.negative = (*p == '-'); |
| if ((*p == '-') || (*p == '+')) { |
| ++p; |
| } |
| |
| while ((p != end) && (*p == '0')) { |
| ++p; |
| } |
| while ((p != end) && is_integer(*p)) { |
| if (answer.num_digits < max_digits) { |
| answer.digits[answer.num_digits] = uint8_t(*p - '0'); |
| } |
| answer.num_digits++; |
| ++p; |
| } |
| if ((p != end) && (*p == '.')) { |
| ++p; |
| if (p == end) { |
| return answer; |
| } // should never happen |
| const char *first_after_period = p; |
| // if we have not yet encountered a zero, we have to skip it as well |
| if (answer.num_digits == 0) { |
| // skip zeros |
| while (*p == '0') { |
| ++p; |
| } |
| } |
| while ((p != end) && is_integer(*p)) { |
| if (answer.num_digits < max_digits) { |
| answer.digits[answer.num_digits] = uint8_t(*p - '0'); |
| } |
| answer.num_digits++; |
| ++p; |
| } |
| answer.decimal_point = int32_t(first_after_period - p); |
| } |
| if (answer.num_digits > 0) { |
| const char *preverse = p - 1; |
| int32_t trailing_zeros = 0; |
| while ((*preverse == '0') || (*preverse == '.')) { |
| if (*preverse == '0') { |
| trailing_zeros++; |
| } |
| --preverse; |
| } |
| answer.decimal_point += int32_t(answer.num_digits); |
| answer.num_digits -= uint32_t(trailing_zeros); |
| } |
| if (answer.num_digits > max_digits) { |
| answer.num_digits = max_digits; |
| answer.truncated = true; |
| } |
| if ((p != end) && (('e' == *p) || ('E' == *p))) { |
| ++p; |
| if (p == end) { |
| return answer; |
| } // should never happen |
| bool neg_exp = false; |
| if ('-' == *p) { |
| neg_exp = true; |
| ++p; |
| } else if ('+' == *p) { |
| ++p; |
| } |
| int32_t exp_number = 0; // exponential part |
| while ((p != end) && is_integer(*p)) { |
| uint8_t digit = uint8_t(*p - '0'); |
| if (exp_number < 0x10000) { |
| exp_number = 10 * exp_number + digit; |
| } |
| ++p; |
| } |
| answer.decimal_point += (neg_exp ? -exp_number : exp_number); |
| } |
| return answer; |
| } |
| |
| namespace { |
| |
| // remove all final zeroes |
| inline void trim(decimal &h) { |
| while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) { |
| h.num_digits--; |
| } |
| } |
| |
| uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) { |
| shift &= 63; |
| const static uint16_t number_of_digits_decimal_left_shift_table[65] = { |
| 0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817, |
| 0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067, |
| 0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF, |
| 0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0, |
| 0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA, |
| 0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC, |
| 0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C, |
| 0x051C, 0x051C, |
| }; |
| uint32_t x_a = number_of_digits_decimal_left_shift_table[shift]; |
| uint32_t x_b = number_of_digits_decimal_left_shift_table[shift + 1]; |
| uint32_t num_new_digits = x_a >> 11; |
| uint32_t pow5_a = 0x7FF & x_a; |
| uint32_t pow5_b = 0x7FF & x_b; |
| const static uint8_t |
| number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = { |
| 5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5, |
| 3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8, |
| 2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2, |
| 5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1, |
| 5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5, |
| 3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2, |
| 8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3, |
| 7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5, |
| 6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6, |
| 0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3, |
| 8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7, |
| 6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2, |
| 5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8, |
| 6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3, |
| 2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1, |
| 2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6, |
| 4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3, |
| 2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6, |
| 6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3, |
| 8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5, |
| 5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5, |
| 7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3, |
| 1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6, |
| 6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6, |
| 4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7, |
| 2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7, |
| 3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5, |
| 2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5, |
| 9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0, |
| 2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8, |
| 8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5, |
| 2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4, |
| 9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2, |
| 0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5, |
| 4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7, |
| 5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9, |
| 2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5, |
| 6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9, |
| 4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3, |
| 2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8, |
| 9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2, |
| 3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1, |
| 3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1, |
| 1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3, |
| 1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2, |
| 3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1, |
| 0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3, |
| 5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1, |
| 3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3, |
| 9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3, |
| 9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6, |
| 7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3, |
| 6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7, |
| 6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9, |
| 4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2, |
| 5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9, |
| 6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5, |
| }; |
| const uint8_t *pow5 = |
| &number_of_digits_decimal_left_shift_table_powers_of_5[pow5_a]; |
| uint32_t i = 0; |
| uint32_t n = pow5_b - pow5_a; |
| for (; i < n; i++) { |
| if (i >= h.num_digits) { |
| return num_new_digits - 1; |
| } else if (h.digits[i] == pow5[i]) { |
| continue; |
| } else if (h.digits[i] < pow5[i]) { |
| return num_new_digits - 1; |
| } else { |
| return num_new_digits; |
| } |
| } |
| return num_new_digits; |
| } |
| |
| } // end of anonymous namespace |
| |
| static uint64_t round(decimal &h) { |
| if ((h.num_digits == 0) || (h.decimal_point < 0)) { |
| return 0; |
| } else if (h.decimal_point > 18) { |
| return UINT64_MAX; |
| } |
| // at this point, we know that h.decimal_point >= 0 |
| uint32_t dp = uint32_t(h.decimal_point); |
| uint64_t n = 0; |
| for (uint32_t i = 0; i < dp; i++) { |
| n = (10 * n) + ((i < h.num_digits) ? h.digits[i] : 0); |
| } |
| bool round_up = false; |
| if (dp < h.num_digits) { |
| round_up = h.digits[dp] >= 5; // normally, we round up |
| // but we may need to round to even! |
| if ((h.digits[dp] == 5) && (dp + 1 == h.num_digits)) { |
| round_up = h.truncated || ((dp > 0) && (1 & h.digits[dp - 1])); |
| } |
| } |
| if (round_up) { |
| n++; |
| } |
| return n; |
| } |
| |
| // computes h * 2^-shift |
| static void decimal_left_shift(decimal &h, uint32_t shift) { |
| if (h.num_digits == 0) { |
| return; |
| } |
| uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift); |
| int32_t read_index = int32_t(h.num_digits - 1); |
| uint32_t write_index = h.num_digits - 1 + num_new_digits; |
| uint64_t n = 0; |
| |
| while (read_index >= 0) { |
| n += uint64_t(h.digits[read_index]) << shift; |
| uint64_t quotient = n / 10; |
| uint64_t remainder = n - (10 * quotient); |
| if (write_index < max_digits) { |
| h.digits[write_index] = uint8_t(remainder); |
| } else if (remainder > 0) { |
| h.truncated = true; |
| } |
| n = quotient; |
| write_index--; |
| read_index--; |
| } |
| while (n > 0) { |
| uint64_t quotient = n / 10; |
| uint64_t remainder = n - (10 * quotient); |
| if (write_index < max_digits) { |
| h.digits[write_index] = uint8_t(remainder); |
| } else if (remainder > 0) { |
| h.truncated = true; |
| } |
| n = quotient; |
| write_index--; |
| } |
| h.num_digits += num_new_digits; |
| if (h.num_digits > max_digits) { |
| h.num_digits = max_digits; |
| } |
| h.decimal_point += int32_t(num_new_digits); |
| trim(h); |
| } |
| |
| // computes h * 2^shift |
| static void decimal_right_shift(decimal &h, uint32_t shift) { |
| uint32_t read_index = 0; |
| uint32_t write_index = 0; |
| |
| uint64_t n = 0; |
| |
| while ((n >> shift) == 0) { |
| if (read_index < h.num_digits) { |
| n = (10 * n) + h.digits[read_index++]; |
| } else if (n == 0) { |
| return; |
| } else { |
| while ((n >> shift) == 0) { |
| n = 10 * n; |
| read_index++; |
| } |
| break; |
| } |
| } |
| h.decimal_point -= int32_t(read_index - 1); |
| if (h.decimal_point < -decimal_point_range) { // it is zero |
| h.num_digits = 0; |
| h.decimal_point = 0; |
| h.negative = false; |
| h.truncated = false; |
| return; |
| } |
| uint64_t mask = (uint64_t(1) << shift) - 1; |
| while (read_index < h.num_digits) { |
| uint8_t new_digit = uint8_t(n >> shift); |
| n = (10 * (n & mask)) + h.digits[read_index++]; |
| h.digits[write_index++] = new_digit; |
| } |
| while (n > 0) { |
| uint8_t new_digit = uint8_t(n >> shift); |
| n = 10 * (n & mask); |
| if (write_index < max_digits) { |
| h.digits[write_index++] = new_digit; |
| } else if (new_digit > 0) { |
| h.truncated = true; |
| } |
| } |
| h.num_digits = write_index; |
| trim(h); |
| } |
| |
| template <typename binary> |
| adjusted_mantissa compute_float(decimal &d) { |
| adjusted_mantissa answer; |
| if (d.num_digits == 0) { |
| // should be zero |
| answer.power2 = 0; |
| answer.mantissa = 0; |
| return answer; |
| } |
| // At this point, going further, we can assume that d.num_digits > 0. |
| // We want to guard against excessive decimal point values because |
| // they can result in long running times. Indeed, we do |
| // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22 |
| // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not |
| // fine (runs for a long time). |
| // |
| if (d.decimal_point < -324) { |
| // We have something smaller than 1e-324 which is always zero |
| // in binary64 and binary32. |
| // It should be zero. |
| answer.power2 = 0; |
| answer.mantissa = 0; |
| return answer; |
| } else if (d.decimal_point >= 310) { |
| // We have something at least as large as 0.1e310 which is |
| // always infinite. |
| answer.power2 = binary::infinite_power(); |
| answer.mantissa = 0; |
| return answer; |
| } |
| |
| static const uint32_t max_shift = 60; |
| static const uint32_t num_powers = 19; |
| static const uint8_t powers[19] = { |
| 0, 3, 6, 9, 13, 16, 19, 23, 26, 29, // |
| 33, 36, 39, 43, 46, 49, 53, 56, 59, // |
| }; |
| int32_t exp2 = 0; |
| while (d.decimal_point > 0) { |
| uint32_t n = uint32_t(d.decimal_point); |
| uint32_t shift = (n < num_powers) ? powers[n] : max_shift; |
| decimal_right_shift(d, shift); |
| if (d.decimal_point < -decimal_point_range) { |
| // should be zero |
| answer.power2 = 0; |
| answer.mantissa = 0; |
| return answer; |
| } |
| exp2 += int32_t(shift); |
| } |
| // We shift left toward [1/2 ... 1]. |
| while (d.decimal_point <= 0) { |
| uint32_t shift; |
| if (d.decimal_point == 0) { |
| if (d.digits[0] >= 5) { |
| break; |
| } |
| shift = (d.digits[0] < 2) ? 2 : 1; |
| } else { |
| uint32_t n = uint32_t(-d.decimal_point); |
| shift = (n < num_powers) ? powers[n] : max_shift; |
| } |
| decimal_left_shift(d, shift); |
| if (d.decimal_point > decimal_point_range) { |
| // we want to get infinity: |
| answer.power2 = 0xFF; |
| answer.mantissa = 0; |
| return answer; |
| } |
| exp2 -= int32_t(shift); |
| } |
| // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2]. |
| exp2--; |
| constexpr int32_t minimum_exponent = binary::minimum_exponent(); |
| while ((minimum_exponent + 1) > exp2) { |
| uint32_t n = uint32_t((minimum_exponent + 1) - exp2); |
| if (n > max_shift) { |
| n = max_shift; |
| } |
| decimal_right_shift(d, n); |
| exp2 += int32_t(n); |
| } |
| if ((exp2 - minimum_exponent) >= binary::infinite_power()) { |
| answer.power2 = binary::infinite_power(); |
| answer.mantissa = 0; |
| return answer; |
| } |
| |
| const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1; |
| decimal_left_shift(d, mantissa_size_in_bits); |
| |
| uint64_t mantissa = round(d); |
| // It is possible that we have an overflow, in which case we need |
| // to shift back. |
| if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) { |
| decimal_right_shift(d, 1); |
| exp2 += 1; |
| mantissa = round(d); |
| if ((exp2 - minimum_exponent) >= binary::infinite_power()) { |
| answer.power2 = binary::infinite_power(); |
| answer.mantissa = 0; |
| return answer; |
| } |
| } |
| answer.power2 = exp2 - binary::minimum_exponent(); |
| if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) { |
| answer.power2--; |
| } |
| answer.mantissa = |
| mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1); |
| return answer; |
| } |
| |
| template <typename binary> |
| adjusted_mantissa parse_long_mantissa(const char *first) { |
| decimal d = parse_decimal(first); |
| return compute_float<binary>(d); |
| } |
| |
| template <typename binary> |
| adjusted_mantissa parse_long_mantissa(const char *first, const char *end) { |
| decimal d = parse_decimal(first, end); |
| return compute_float<binary>(d); |
| } |
| |
| double from_chars(const char *first) noexcept { |
| bool negative = first[0] == '-'; |
| if (negative) { |
| first++; |
| } |
| adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first); |
| uint64_t word = am.mantissa; |
| word |= uint64_t(am.power2) |
| << binary_format<double>::mantissa_explicit_bits(); |
| word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index()) |
| : word; |
| double value; |
| std::memcpy(&value, &word, sizeof(double)); |
| return value; |
| } |
| |
| double from_chars(const char *first, const char *end) noexcept { |
| bool negative = first[0] == '-'; |
| if (negative) { |
| first++; |
| } |
| adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first, end); |
| uint64_t word = am.mantissa; |
| word |= uint64_t(am.power2) |
| << binary_format<double>::mantissa_explicit_bits(); |
| word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index()) |
| : word; |
| double value; |
| std::memcpy(&value, &word, sizeof(double)); |
| return value; |
| } |
| |
| } // namespace internal |
| } // namespace simdjson |
| } // namespace minijson |
| |
| namespace minijson { |
| namespace simdjson { |
| namespace internal { |
| /*! |
| implements the Grisu2 algorithm for binary to decimal floating-point |
| conversion. |
| Adapted from JSON for Modern C++ |
| |
| This implementation is a slightly modified version of the reference |
| implementation which may be obtained from |
| http://florian.loitsch.com/publications (bench.tar.gz). |
| The code is distributed under the MIT license, Copyright (c) 2009 Florian |
| Loitsch. For a detailed description of the algorithm see: [1] Loitsch, "Printing |
| Floating-Point Numbers Quickly and Accurately with Integers", Proceedings of the |
| ACM SIGPLAN 2010 Conference on Programming Language Design and Implementation, |
| PLDI 2010 [2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and |
| Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on Programming |
| Language Design and Implementation, PLDI 1996 |
| */ |
| namespace dtoa_impl { |
| |
| template <typename Target, typename Source> |
| Target reinterpret_bits(const Source source) { |
| static_assert(sizeof(Target) == sizeof(Source), "size mismatch"); |
| |
| Target target; |
| std::memcpy(&target, &source, sizeof(Source)); |
| return target; |
| } |
| |
| struct diyfp // f * 2^e |
| { |
| static constexpr int kPrecision = 64; // = q |
| |
| std::uint64_t f = 0; |
| int e = 0; |
| |
| constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {} |
| |
| /*! |
| @brief returns x - y |
| @pre x.e == y.e and x.f >= y.f |
| */ |
| static diyfp sub(const diyfp &x, const diyfp &y) noexcept { |
| return {x.f - y.f, x.e}; |
| } |
| |
| /*! |
| @brief returns x * y |
| @note The result is rounded. (Only the upper q bits are returned.) |
| */ |
| static diyfp mul(const diyfp &x, const diyfp &y) noexcept { |
| static_assert(kPrecision == 64, "internal error"); |
| |
| // Computes: |
| // f = round((x.f * y.f) / 2^q) |
| // e = x.e + y.e + q |
| |
| // Emulate the 64-bit * 64-bit multiplication: |
| // |
| // p = u * v |
| // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi) |
| // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) + |
| // 2^64 (u_hi v_hi ) = (p0 ) + 2^32 ((p1 ) + (p2 )) |
| // + 2^64 (p3 ) = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + |
| // 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 ) = |
| // (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi + |
| // p2_hi + p3) = (p0_lo ) + 2^32 (Q ) + 2^64 (H ) = (p0_lo ) + |
| // 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H ) |
| // |
| // (Since Q might be larger than 2^32 - 1) |
| // |
| // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H) |
| // |
| // (Q_hi + H does not overflow a 64-bit int) |
| // |
| // = p_lo + 2^64 p_hi |
| |
| const std::uint64_t u_lo = x.f & 0xFFFFFFFFu; |
| const std::uint64_t u_hi = x.f >> 32u; |
| const std::uint64_t v_lo = y.f & 0xFFFFFFFFu; |
| const std::uint64_t v_hi = y.f >> 32u; |
| |
| const std::uint64_t p0 = u_lo * v_lo; |
| const std::uint64_t p1 = u_lo * v_hi; |
| const std::uint64_t p2 = u_hi * v_lo; |
| const std::uint64_t p3 = u_hi * v_hi; |
| |
| const std::uint64_t p0_hi = p0 >> 32u; |
| const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu; |
| const std::uint64_t p1_hi = p1 >> 32u; |
| const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu; |
| const std::uint64_t p2_hi = p2 >> 32u; |
| |
| std::uint64_t Q = p0_hi + p1_lo + p2_lo; |
| |
| // The full product might now be computed as |
| // |
| // p_hi = p3 + p2_hi + p1_hi + (Q >> 32) |
| // p_lo = p0_lo + (Q << 32) |
| // |
| // But in this particular case here, the full p_lo is not required. |
| // Effectively we only need to add the highest bit in p_lo to p_hi (and |
| // Q_hi + 1 does not overflow). |
| |
| Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up |
| |
| const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u); |
| |
| return {h, x.e + y.e + 64}; |
| } |
| |
| /*! |
| @brief normalize x such that the significand is >= 2^(q-1) |
| @pre x.f != 0 |
| */ |
| static diyfp normalize(diyfp x) noexcept { |
| while ((x.f >> 63u) == 0) { |
| x.f <<= 1u; |
| x.e--; |
| } |
| |
| return x; |
| } |
| |
| /*! |
| @brief normalize x such that the result has the exponent E |
| @pre e >= x.e and the upper e - x.e bits of x.f must be zero. |
| */ |
| static diyfp normalize_to(const diyfp &x, |
| const int target_exponent) noexcept { |
| const int delta = x.e - target_exponent; |
| |
| return {x.f << delta, target_exponent}; |
| } |
| }; |
| |
| struct boundaries { |
| diyfp w; |
| diyfp minus; |
| diyfp plus; |
| }; |
| |
| /*! |
| Compute the (normalized) diyfp representing the input number 'value' and its |
| boundaries. |
| @pre value must be finite and positive |
| */ |
| template <typename FloatType> |
| boundaries compute_boundaries(FloatType value) { |
| // Convert the IEEE representation into a diyfp. |
| // |
| // If v is denormal: |
| // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1)) |
| // If v is normalized: |
| // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1)) |
| |
| static_assert(std::numeric_limits<FloatType>::is_iec559, |
| "internal error: dtoa_short requires an IEEE-754 " |
| "floating-point implementation"); |
| |
| constexpr int kPrecision = |
| std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit) |
| constexpr int kBias = |
| std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1); |
| constexpr int kMinExp = 1 - kBias; |
| constexpr std::uint64_t kHiddenBit = std::uint64_t{1} |
| << (kPrecision - 1); // = 2^(p-1) |
| |
| using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t, |
| std::uint64_t>::type; |
| |
| const std::uint64_t bits = reinterpret_bits<bits_type>(value); |
| const std::uint64_t E = bits >> (kPrecision - 1); |
| const std::uint64_t F = bits & (kHiddenBit - 1); |
| |
| const bool is_denormal = E == 0; |
| const diyfp v = is_denormal |
| ? diyfp(F, kMinExp) |
| : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias); |
| |
| // Compute the boundaries m- and m+ of the floating-point value |
| // v = f * 2^e. |
| // |
| // Determine v- and v+, the floating-point predecessor and successor if v, |
| // respectively. |
| // |
| // v- = v - 2^e if f != 2^(p-1) or e == e_min (A) |
| // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B) |
| // |
| // v+ = v + 2^e |
| // |
| // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_ |
| // between m- and m+ round to v, regardless of how the input rounding |
| // algorithm breaks ties. |
| // |
| // ---+-------------+-------------+-------------+-------------+--- (A) |
| // v- m- v m+ v+ |
| // |
| // -----------------+------+------+-------------+-------------+--- (B) |
| // v- m- v m+ v+ |
| |
| const bool lower_boundary_is_closer = F == 0 && E > 1; |
| const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1); |
| const diyfp m_minus = lower_boundary_is_closer |
| ? diyfp(4 * v.f - 1, v.e - 2) // (B) |
| : diyfp(2 * v.f - 1, v.e - 1); // (A) |
| |
| // Determine the normalized w+ = m+. |
| const diyfp w_plus = diyfp::normalize(m_plus); |
| |
| // Determine w- = m- such that e_(w-) = e_(w+). |
| const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e); |
| |
| return {diyfp::normalize(v), w_minus, w_plus}; |
| } |
| |
| // Given normalized diyfp w, Grisu needs to find a (normalized) cached |
| // power-of-ten c, such that the exponent of the product c * w = f * 2^e lies |
| // within a certain range [alpha, gamma] (Definition 3.2 from [1]) |
| // |
| // alpha <= e = e_c + e_w + q <= gamma |
| // |
| // or |
| // |
| // f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q |
| // <= f_c * f_w * 2^gamma |
| // |
| // Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies |
| // |
| // 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma |
| // |
| // or |
| // |
| // 2^(q - 2 + alpha) <= c * w < 2^(q + gamma) |
| // |
| // The choice of (alpha,gamma) determines the size of the table and the form of |
| // the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well |
| // in practice: |
| // |
| // The idea is to cut the number c * w = f * 2^e into two parts, which can be |
| // processed independently: An integral part p1, and a fractional part p2: |
| // |
| // f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e |
| // = (f div 2^-e) + (f mod 2^-e) * 2^e |
| // = p1 + p2 * 2^e |
| // |
| // The conversion of p1 into decimal form requires a series of divisions and |
| // modulos by (a power of) 10. These operations are faster for 32-bit than for |
| // 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be |
| // achieved by choosing |
| // |
| // -e >= 32 or e <= -32 := gamma |
| // |
| // In order to convert the fractional part |
| // |
| // p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ... |
| // |
| // into decimal form, the fraction is repeatedly multiplied by 10 and the digits |
| // d[-i] are extracted in order: |
| // |
| // (10 * p2) div 2^-e = d[-1] |
| // (10 * p2) mod 2^-e = d[-2] / 10^1 + ... |
| // |
| // The multiplication by 10 must not overflow. It is sufficient to choose |
| // |
| // 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64. |
| // |
| // Since p2 = f mod 2^-e < 2^-e, |
| // |
| // -e <= 60 or e >= -60 := alpha |
| |
| constexpr int kAlpha = -60; |
| constexpr int kGamma = -32; |
| |
| struct cached_power // c = f * 2^e ~= 10^k |
| { |
| std::uint64_t f; |
| int e; |
| int k; |
| }; |
| |
| /*! |
| For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached |
| power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c |
| satisfies (Definition 3.2 from [1]) |
| alpha <= e_c + e + q <= gamma. |
| */ |
| inline cached_power get_cached_power_for_binary_exponent(int e) { |
| // Now |
| // |
| // alpha <= e_c + e + q <= gamma (1) |
| // ==> f_c * 2^alpha <= c * 2^e * 2^q |
| // |
| // and since the c's are normalized, 2^(q-1) <= f_c, |
| // |
| // ==> 2^(q - 1 + alpha) <= c * 2^(e + q) |
| // ==> 2^(alpha - e - 1) <= c |
| // |
| // If c were an exact power of ten, i.e. c = 10^k, one may determine k as |
| // |
| // k = ceil( log_10( 2^(alpha - e - 1) ) ) |
| // = ceil( (alpha - e - 1) * log_10(2) ) |
| // |
| // From the paper: |
| // "In theory the result of the procedure could be wrong since c is rounded, |
| // and the computation itself is approximated [...]. In practice, however, |
| // this simple function is sufficient." |
| // |
| // For IEEE double precision floating-point numbers converted into |
| // normalized diyfp's w = f * 2^e, with q = 64, |
| // |
| // e >= -1022 (min IEEE exponent) |
| // -52 (p - 1) |
| // -52 (p - 1, possibly normalize denormal IEEE numbers) |
| // -11 (normalize the diyfp) |
| // = -1137 |
| // |
| // and |
| // |
| // e <= +1023 (max IEEE exponent) |
| // -52 (p - 1) |
| // -11 (normalize the diyfp) |
| // = 960 |
| // |
| // This binary exponent range [-1137,960] results in a decimal exponent |
| // range [-307,324]. One does not need to store a cached power for each |
| // k in this range. For each such k it suffices to find a cached power |
| // such that the exponent of the product lies in [alpha,gamma]. |
| // This implies that the difference of the decimal exponents of adjacent |
| // table entries must be less than or equal to |
| // |
| // floor( (gamma - alpha) * log_10(2) ) = 8. |
| // |
| // (A smaller distance gamma-alpha would require a larger table.) |
| |
| // NB: |
| // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34. |
| |
| constexpr int kCachedPowersMinDecExp = -300; |
| constexpr int kCachedPowersDecStep = 8; |
| |
| static constexpr std::array<cached_power, 79> kCachedPowers = {{ |
| {0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292}, |
| {0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276}, |
| {0xD3515C2831559A83, -954, -268}, {0x9D71AC8FADA6C9B5, -927, -260}, |
| {0xEA9C227723EE8BCB, -901, -252}, {0xAECC49914078536D, -874, -244}, |
| {0x823C12795DB6CE57, -847, -236}, {0xC21094364DFB5637, -821, -228}, |
| {0x9096EA6F3848984F, -794, -220}, {0xD77485CB25823AC7, -768, -212}, |
| {0xA086CFCD97BF97F4, -741, -204}, {0xEF340A98172AACE5, -715, -196}, |
| {0xB23867FB2A35B28E, -688, -188}, {0x84C8D4DFD2C63F3B, -661, -180}, |
| {0xC5DD44271AD3CDBA, -635, -172}, {0x936B9FCEBB25C996, -608, -164}, |
| {0xDBAC6C247D62A584, -582, -156}, {0xA3AB66580D5FDAF6, -555, -148}, |
| {0xF3E2F893DEC3F126, -529, -140}, {0xB5B5ADA8AAFF80B8, -502, -132}, |
| {0x87625F056C7C4A8B, -475, -124}, {0xC9BCFF6034C13053, -449, -116}, |
| {0x964E858C91BA2655, -422, -108}, {0xDFF9772470297EBD, -396, -100}, |
| {0xA6DFBD9FB8E5B88F, -369, -92}, {0xF8A95FCF88747D94, -343, -84}, |
| {0xB94470938FA89BCF, -316, -76}, {0x8A08F0F8BF0F156B, -289, -68}, |
| {0xCDB02555653131B6, -263, -60}, {0x993FE2C6D07B7FAC, -236, -52}, |
| {0xE45C10C42A2B3B06, -210, -44}, {0xAA242499697392D3, -183, -36}, |
| {0xFD87B5F28300CA0E, -157, -28}, {0xBCE5086492111AEB, -130, -20}, |
| {0x8CBCCC096F5088CC, -103, -12}, {0xD1B71758E219652C, -77, -4}, |
| {0x9C40000000000000, -50, 4}, {0xE8D4A51000000000, -24, 12}, |
| {0xAD78EBC5AC620000, 3, 20}, {0x813F3978F8940984, 30, 28}, |
| {0xC097CE7BC90715B3, 56, 36}, {0x8F7E32CE7BEA5C70, 83, 44}, |
| {0xD5D238A4ABE98068, 109, 52}, {0x9F4F2726179A2245, 136, 60}, |
| {0xED63A231D4C4FB27, 162, 68}, {0xB0DE65388CC8ADA8, 189, 76}, |
| {0x83C7088E1AAB65DB, 216, 84}, {0xC45D1DF942711D9A, 242, 92}, |
| {0x924D692CA61BE758, 269, 100}, {0xDA01EE641A708DEA, 295, 108}, |
| {0xA26DA3999AEF774A, 322, 116}, {0xF209787BB47D6B85, 348, 124}, |
| {0xB454E4A179DD1877, 375, 132}, {0x865B86925B9BC5C2, 402, 140}, |
| {0xC83553C5C8965D3D, 428, 148}, {0x952AB45CFA97A0B3, 455, 156}, |
| {0xDE469FBD99A05FE3, 481, 164}, {0xA59BC234DB398C25, 508, 172}, |
| {0xF6C69A72A3989F5C, 534, 180}, {0xB7DCBF5354E9BECE, 561, 188}, |
| {0x88FCF317F22241E2, 588, 196}, {0xCC20CE9BD35C78A5, 614, 204}, |
| {0x98165AF37B2153DF, 641, 212}, {0xE2A0B5DC971F303A, 667, 220}, |
| {0xA8D9D1535CE3B396, 694, 228}, {0xFB9B7CD9A4A7443C, 720, 236}, |
| {0xBB764C4CA7A44410, 747, 244}, {0x8BAB8EEFB6409C1A, 774, 252}, |
| {0xD01FEF10A657842C, 800, 260}, {0x9B10A4E5E9913129, 827, 268}, |
| {0xE7109BFBA19C0C9D, 853, 276}, {0xAC2820D9623BF429, 880, 284}, |
| {0x80444B5E7AA7CF85, 907, 292}, {0xBF21E44003ACDD2D, 933, 300}, |
| {0x8E679C2F5E44FF8F, 960, 308}, {0xD433179D9C8CB841, 986, 316}, |
| {0x9E19DB92B4E31BA9, 1013, 324}, |
| }}; |
| |
| // This computation gives exactly the same results for k as |
| // k = ceil((kAlpha - e - 1) * 0.30102999566398114) |
| // for |e| <= 1500, but doesn't require floating-point operations. |
| // NB: log_10(2) ~= 78913 / 2^18 |
| const int f = kAlpha - e - 1; |
| const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0); |
| |
| const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / |
| kCachedPowersDecStep; |
| |
| const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)]; |
| |
| return cached; |
| } |
| |
| /*! |
| For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k. |
| For n == 0, returns 1 and sets pow10 := 1. |
| */ |
| inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) { |
| // LCOV_EXCL_START |
| if (n >= 1000000000) { |
| pow10 = 1000000000; |
| return 10; |
| } |
| // LCOV_EXCL_STOP |
| else if (n >= 100000000) { |
| pow10 = 100000000; |
| return 9; |
| } else if (n >= 10000000) { |
| pow10 = 10000000; |
| return 8; |
| } else if (n >= 1000000) { |
| pow10 = 1000000; |
| return 7; |
| } else if (n >= 100000) { |
| pow10 = 100000; |
| return 6; |
| } else if (n >= 10000) { |
| pow10 = 10000; |
| return 5; |
| } else if (n >= 1000) { |
| pow10 = 1000; |
| return 4; |
| } else if (n >= 100) { |
| pow10 = 100; |
| return 3; |
| } else if (n >= 10) { |
| pow10 = 10; |
| return 2; |
| } else { |
| pow10 = 1; |
| return 1; |
| } |
| } |
| |
| inline void grisu2_round(char *buf, int len, std::uint64_t dist, |
| std::uint64_t delta, std::uint64_t rest, |
| std::uint64_t ten_k) { |
| // <--------------------------- delta ----> |
| // <---- dist ---------> |
| // --------------[------------------+-------------------]-------------- |
| // M- w M+ |
| // |
| // ten_k |
| // <------> |
| // <---- rest ----> |
| // --------------[------------------+----+--------------]-------------- |
| // w V |
| // = buf * 10^k |
| // |
| // ten_k represents a unit-in-the-last-place in the decimal representation |
| // stored in buf. |
| // Decrement buf by ten_k while this takes buf closer to w. |
| |
| // The tests are written in this order to avoid overflow in unsigned |
| // integer arithmetic. |
| |
| while (rest < dist && delta - rest >= ten_k && |
| (rest + ten_k < dist || dist - rest > rest + ten_k - dist)) { |
| buf[len - 1]--; |
| rest += ten_k; |
| } |
| } |
| |
| /*! |
| Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+. |
| M- and M+ must be normalized and share the same exponent -60 <= e <= -32. |
| */ |
| inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent, |
| diyfp M_minus, diyfp w, diyfp M_plus) { |
| static_assert(kAlpha >= -60, "internal error"); |
| static_assert(kGamma <= -32, "internal error"); |
| |
| // Generates the digits (and the exponent) of a decimal floating-point |
| // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's |
| // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= |
| // gamma. |
| // |
| // <--------------------------- delta ----> |
| // <---- dist ---------> |
| // --------------[------------------+-------------------]-------------- |
| // M- w M+ |
| // |
| // Grisu2 generates the digits of M+ from left to right and stops as soon as |
| // V is in [M-,M+]. |
| |
| std::uint64_t delta = |
| diyfp::sub(M_plus, M_minus) |
| .f; // (significand of (M+ - M-), implicit exponent is e) |
| std::uint64_t dist = |
| diyfp::sub(M_plus, w) |
| .f; // (significand of (M+ - w ), implicit exponent is e) |
| |
| // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0): |
| // |
| // M+ = f * 2^e |
| // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e |
| // = ((p1 ) * 2^-e + (p2 )) * 2^e |
| // = p1 + p2 * 2^e |
| |
| const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e); |
| |
| auto p1 = static_cast<std::uint32_t>( |
| M_plus.f >> |
| -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.) |
| std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e |
| |
| // 1) |
| // |
| // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0] |
| |
| std::uint32_t pow10; |
| const int k = find_largest_pow10(p1, pow10); |
| |
| // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1) |
| // |
| // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1)) |
| // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1)) |
| // |
| // M+ = p1 + p2 * 2^e |
| // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e |
| // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e |
| // = d[k-1] * 10^(k-1) + ( rest) * 2^e |
| // |
| // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0) |
| // |
| // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0] |
| // |
| // but stop as soon as |
| // |
| // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e |
| |
| int n = k; |
| while (n > 0) { |
| // Invariants: |
| // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k) |
| // pow10 = 10^(n-1) <= p1 < 10^n |
| // |
| const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1) |
| const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1) |
| // |
| // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e |
| // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e) |
| // |
| buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d |
| // |
| // M+ = buffer * 10^(n-1) + (r + p2 * 2^e) |
| // |
| p1 = r; |
| n--; |
| // |
| // M+ = buffer * 10^n + (p1 + p2 * 2^e) |
| // pow10 = 10^n |
| // |
| |
| // Now check if enough digits have been generated. |
| // Compute |
| // |
| // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e |
| // |
| // Note: |
| // Since rest and delta share the same exponent e, it suffices to |
| // compare the significands. |
| const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2; |
| if (rest <= delta) { |
| // V = buffer * 10^n, with M- <= V <= M+. |
| |
| decimal_exponent += n; |
| |
| // We may now just stop. But instead look if the buffer could be |
| // decremented to bring V closer to w. |
| // |
| // pow10 = 10^n is now 1 ulp in the decimal representation V. |
| // The rounding procedure works with diyfp's with an implicit |
| // exponent of e. |
| // |
| // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e |
| // |
| const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e; |
| grisu2_round(buffer, length, dist, delta, rest, ten_n); |
| |
| return; |
| } |
| |
| pow10 /= 10; |
| // |
| // pow10 = 10^(n-1) <= p1 < 10^n |
| // Invariants restored. |
| } |
| |
| // 2) |
| // |
| // The digits of the integral part have been generated: |
| // |
| // M+ = d[k-1]...d[1]d[0] + p2 * 2^e |
| // = buffer + p2 * 2^e |
| // |
| // Now generate the digits of the fractional part p2 * 2^e. |
| // |
| // Note: |
| // No decimal point is generated: the exponent is adjusted instead. |
| // |
| // p2 actually represents the fraction |
| // |
| // p2 * 2^e |
| // = p2 / 2^-e |
| // = d[-1] / 10^1 + d[-2] / 10^2 + ... |
| // |
| // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...) |
| // |
| // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m |
| // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...) |
| // |
| // using |
| // |
| // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e) |
| // = ( d) * 2^-e + ( r) |
| // |
| // or |
| // 10^m * p2 * 2^e = d + r * 2^e |
| // |
| // i.e. |
| // |
| // M+ = buffer + p2 * 2^e |
| // = buffer + 10^-m * (d + r * 2^e) |
| // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e |
| // |
| // and stop as soon as 10^-m * r * 2^e <= delta * 2^e |
| |
| int m = 0; |
| for (;;) { |
| // Invariant: |
| // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) |
| // * 2^e |
| // = buffer * 10^-m + 10^-m * (p2 ) |
| // * 2^e = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e = |
| // buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + |
| // (10*p2 mod 2^-e)) * 2^e |
| // |
| p2 *= 10; |
| const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e |
| const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e |
| // |
| // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e |
| // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e)) |
| // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e |
| // |
| buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d |
| // |
| // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e |
| // |
| p2 = r; |
| m++; |
| // |
| // M+ = buffer * 10^-m + 10^-m * p2 * 2^e |
| // Invariant restored. |
| |
| // Check if enough digits have been generated. |
| // |
| // 10^-m * p2 * 2^e <= delta * 2^e |
| // p2 * 2^e <= 10^m * delta * 2^e |
| // p2 <= 10^m * delta |
| delta *= 10; |
| dist *= 10; |
| if (p2 <= delta) { |
| break; |
| } |
| } |
| |
| // V = buffer * 10^-m, with M- <= V <= M+. |
| |
| decimal_exponent -= m; |
| |
| // 1 ulp in the decimal representation is now 10^-m. |
| // Since delta and dist are now scaled by 10^m, we need to do the |
| // same with ulp in order to keep the units in sync. |
| // |
| // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e |
| // |
| const std::uint64_t ten_m = one.f; |
| grisu2_round(buffer, length, dist, delta, p2, ten_m); |
| |
| // By construction this algorithm generates the shortest possible decimal |
| // number (Loitsch, Theorem 6.2) which rounds back to w. |
| // For an input number of precision p, at least |
| // |
| // N = 1 + ceil(p * log_10(2)) |
| // |
| // decimal digits are sufficient to identify all binary floating-point |
| // numbers (Matula, "In-and-Out conversions"). |
| // This implies that the algorithm does not produce more than N decimal |
| // digits. |
| // |
| // N = 17 for p = 53 (IEEE double precision) |
| // N = 9 for p = 24 (IEEE single precision) |
| } |
| |
| /*! |
| v = buf * 10^decimal_exponent |
| len is the length of the buffer (number of decimal digits) |
| The buffer must be large enough, i.e. >= max_digits10. |
| */ |
| inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus, |
| diyfp v, diyfp m_plus) { |
| // --------(-----------------------+-----------------------)-------- (A) |
| // m- v m+ |
| // |
| // --------------------(-----------+-----------------------)-------- (B) |
| // m- v m+ |
| // |
| // First scale v (and m- and m+) such that the exponent is in the range |
| // [alpha, gamma]. |
| |
| const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e); |
| |
| const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k |
| |
| // The exponent of the products is = v.e + c_minus_k.e + q and is in the range |
| // [alpha,gamma] |
| const diyfp w = diyfp::mul(v, c_minus_k); |
| const diyfp w_minus = diyfp::mul(m_minus, c_minus_k); |
| const diyfp w_plus = diyfp::mul(m_plus, c_minus_k); |
| |
| // ----(---+---)---------------(---+---)---------------(---+---)---- |
| // w- w w+ |
| // = c*m- = c*v = c*m+ |
| // |
| // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and |
| // w+ are now off by a small amount. |
| // In fact: |
| // |
| // w - v * 10^k < 1 ulp |
| // |
| // To account for this inaccuracy, add resp. subtract 1 ulp. |
| // |
| // --------+---[---------------(---+---)---------------]---+-------- |
| // w- M- w M+ w+ |
| // |
| // Now any number in [M-, M+] (bounds included) will round to w when input, |
| // regardless of how the input rounding algorithm breaks ties. |
| // |
| // And digit_gen generates the shortest possible such number in [M-, M+]. |
| // Note that this does not mean that Grisu2 always generates the shortest |
| // possible number in the interval (m-, m+). |
| const diyfp M_minus(w_minus.f + 1, w_minus.e); |
| const diyfp M_plus(w_plus.f - 1, w_plus.e); |
| |
| decimal_exponent = -cached.k; // = -(-k) = k |
| |
| grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus); |
| } |
| |
| /*! |
| v = buf * 10^decimal_exponent |
| len is the length of the buffer (number of decimal digits) |
| The buffer must be large enough, i.e. >= max_digits10. |
| */ |
| template <typename FloatType> |
| void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) { |
| static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3, |
| "internal error: not enough precision"); |
| |
| // If the neighbors (and boundaries) of 'value' are always computed for |
| // double-precision numbers, all float's can be recovered using strtod (and |
| // strtof). However, the resulting decimal representations are not exactly |
| // "short". |
| // |
| // The documentation for 'std::to_chars' |
| // (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is |
| // converted to a string as if by std::sprintf in the default ("C") locale" |
| // and since sprintf promotes float's to double's, I think this is exactly |
| // what 'std::to_chars' does. On the other hand, the documentation for |
| // 'std::to_chars' requires that "parsing the representation using the |
| // corresponding std::from_chars function recovers value exactly". That |
| // indicates that single precision floating-point numbers should be recovered |
| // using 'std::strtof'. |
| // |
| // NB: If the neighbors are computed for single-precision numbers, there is a |
| // single float |
| // (7.0385307e-26f) which can't be recovered using strtod. The resulting |
| // double precision value is off by 1 ulp. |
| #if 0 |
| const boundaries w = compute_boundaries(static_cast<double>(value)); |
| #else |
| const boundaries w = compute_boundaries(value); |
| #endif |
| |
| grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus); |
| } |
| |
| /*! |
| @brief appends a decimal representation of e to buf |
| @return a pointer to the element following the exponent. |
| @pre -1000 < e < 1000 |
| */ |
| inline char *append_exponent(char *buf, int e) { |
| if (e < 0) { |
| e = -e; |
| *buf++ = '-'; |
| } else { |
| *buf++ = '+'; |
| } |
| |
| auto k = static_cast<std::uint32_t>(e); |
| if (k < 10) { |
| // Always print at least two digits in the exponent. |
| // This is for compatibility with printf("%g"). |
| *buf++ = '0'; |
| *buf++ = static_cast<char>('0' + k); |
| } else if (k < 100) { |
| *buf++ = static_cast<char>('0' + k / 10); |
| k %= 10; |
| *buf++ = static_cast<char>('0' + k); |
| } else { |
| *buf++ = static_cast<char>('0' + k / 100); |
| k %= 100; |
| *buf++ = static_cast<char>('0' + k / 10); |
| k %= 10; |
| *buf++ = static_cast<char>('0' + k); |
| } |
| |
| return buf; |
| } |
| |
| /*! |
| @brief prettify v = buf * 10^decimal_exponent |
| If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point |
| notation. Otherwise it will be printed in exponential notation. |
| @pre min_exp < 0 |
| @pre max_exp > 0 |
| */ |
| inline char *format_buffer(char *buf, int len, int decimal_exponent, |
| int min_exp, int max_exp) { |
| const int k = len; |
| const int n = len + decimal_exponent; |
| |
| // v = buf * 10^(n-k) |
| // k is the length of the buffer (number of decimal digits) |
| // n is the position of the decimal point relative to the start of the buffer. |
| |
| if (k <= n && n <= max_exp) { |
| // digits[000] |
| // len <= max_exp + 2 |
| |
| std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k)); |
| // Make it look like a floating-point number (#362, #378) |
| buf[n + 0] = '.'; |
| buf[n + 1] = '0'; |
| return buf + (static_cast<size_t>(n)) + 2; |
| } |
| |
| if (0 < n && n <= max_exp) { |
| // dig.its |
| // len <= max_digits10 + 1 |
| std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n, |
| static_cast<size_t>(k) - static_cast<size_t>(n)); |
| buf[n] = '.'; |
| return buf + (static_cast<size_t>(k) + 1U); |
| } |
| |
| if (min_exp < n && n <= 0) { |
| // 0.[000]digits |
| // len <= 2 + (-min_exp - 1) + max_digits10 |
| |
| std::memmove(buf + (2 + static_cast<size_t>(-n)), buf, |
| static_cast<size_t>(k)); |
| buf[0] = '0'; |
| buf[1] = '.'; |
| std::memset(buf + 2, '0', static_cast<size_t>(-n)); |
| return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k)); |
| } |
| |
| if (k == 1) { |
| // dE+123 |
| // len <= 1 + 5 |
| |
| buf += 1; |
| } else { |
| // d.igitsE+123 |
| // len <= max_digits10 + 1 + 5 |
| |
| std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1); |
| buf[1] = '.'; |
| buf += 1 + static_cast<size_t>(k); |
| } |
| |
| *buf++ = 'e'; |
| return append_exponent(buf, n - 1); |
| } |
| |
| } // namespace dtoa_impl |
| |
| /*! |
| The format of the resulting decimal representation is similar to printf's %g |
| format. Returns an iterator pointing past-the-end of the decimal representation. |
| @note The input number must be finite, i.e. NaN's and Inf's are not supported. |
| @note The buffer must be large enough. |
| @note The result is NOT null-terminated. |
| */ |
| char *to_chars(char *first, const char *last, double value) { |
| static_cast<void>(last); // maybe unused - fix warning |
| |
| // bool negative = std::signbit(value); |
| bool negative = (*reinterpret_cast<uint64_t *>(&value)) & (1 << 31ull); |
| if (negative) { |
| value = -value; |
| *first++ = '-'; |
| } |
| |
| #ifdef __clang__ |
| #pragma clang diagnostic push |
| #pragma clang diagnostic ignored "-Wfloat-equal" |
| #endif |
| |
| if (value == 0) // +-0 |
| { |
| *first++ = '0'; |
| // Make it look like a floating-point number (#362, #378) |
| *first++ = '.'; |
| *first++ = '0'; |
| return first; |
| } |
| |
| #ifdef __clang__ |
| #pragma clang diagnostic pop |
| #endif |
| |
| // Compute v = buffer * 10^decimal_exponent. |
| // The decimal digits are stored in the buffer, which needs to be interpreted |
| // as an unsigned decimal integer. |
| // len is the length of the buffer, i.e. the number of decimal digits. |
| int len = 0; |
| int decimal_exponent = 0; |
| dtoa_impl::grisu2(first, len, decimal_exponent, value); |
| // Format the buffer like printf("%.*g", prec, value) |
| constexpr int kMinExp = -4; |
| constexpr int kMaxExp = std::numeric_limits<double>::digits10; |
| |
| return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, |
| kMaxExp); |
| } |
| } // namespace internal |
| } // namespace simdjson |
| } // namespace minijson |
| |
| #endif // !MINIJSON_USE_STRTOD |
| |
| #endif // MINIJSON_IMPLEMENTATION |
| |
| #endif /* minijson_h */ |