Program Listing for File unicode-support.hpp

Return to documentation for file (zeep/unicode-support.hpp)

// Copyright Maarten L. Hekkelman, Radboud University 2008-2013.
//        Copyright Maarten L. Hekkelman, 2014-2023
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

#pragma once


#include <zeep/config.hpp>
#include <zeep/exception.hpp>

#include <cctype>
#include <cstdint>
#include <locale>
#include <string>
#include <string_view>
#include <tuple>
#include <vector>

namespace zeep
{

// Type used throughout this header for a single decoded character value
// (what append() encodes to UTF-8 and get_first_char() decodes from it).
using unicode = char32_t;

// The text encodings that can be named by code using this header.
// The enumerators carry no explicit values; they are numbered in
// declaration order starting at zero.
enum class encoding_type
{
    ASCII,    // ASCII
    UTF8,     // UTF-8
    UTF16BE,  // UTF-16, big endian
    UTF16LE,  // UTF-16, little endian
    ISO88591  // ISO-8859-1
};

// Returns true when enc is ASCII, ISO88591 or UTF8 and false for every other
// encoding_type value (i.e. the two UTF-16 variants).
// Note that UTF8 is reported as "single byte" here even though one UTF-8
// character may occupy several bytes.
constexpr bool is_single_byte_encoding(encoding_type enc)
{
    switch (enc)
    {
        case encoding_type::ASCII:
        case encoding_type::ISO88591:
        case encoding_type::UTF8:
            return true;

        default:
            return false;
    }
}

// Converts a wide string into a std::string. Only the declaration is visible
// in this part of the file, so the exact conversion performed cannot be
// stated here -- NOTE(review): presumably UTF-8, confirm at the definition.
std::string wstring_to_string(const std::wstring& s);

// Forward declarations of the UTF-8 helpers that are defined inline below.
//
// append          encodes one character as UTF-8 and appends it to s
// pop_last_char   removes the last UTF-8 encoded character from s and returns it
// get_first_char  decodes the character starting at ptr and returns it together
//                 with the iterator just past the bytes that were consumed
void append(std::string& s, unicode ch);
unicode pop_last_char(std::string& s);
template<typename Iter>
std::tuple<unicode,Iter> get_first_char(Iter ptr, Iter end);

// Case-insensitive comparison of two strings.
//
// Returns true when a and b have the same length and every pair of
// corresponding bytes is equal after std::toupper (which uses the current
// C locale). The comparison is byte based; it does not decode UTF-8.
inline bool iequals(const std::string& a, const std::string& b)
{
    if (a.length() != b.length())
        return false;

    for (std::string::size_type i = 0; i < a.length(); ++i)
    {
        // std::toupper has undefined behaviour for negative arguments other
        // than EOF, so bytes >= 0x80 must be passed as unsigned char.
        const auto ca = static_cast<unsigned char>(a[i]);
        const auto cb = static_cast<unsigned char>(b[i]);

        if (std::toupper(ca) != std::toupper(cb))
            return false;
    }

    return true;
}

// inlines

// Appends the UTF-8 encoding of uc to s.
//
// Values below 0x80 take one byte, below 0x800 two bytes, below 0x10000
// three bytes and everything else four bytes. The value itself is not
// validated.
inline void append(std::string& s, unicode uc)
{
    char bytes[4];
    std::string::size_type count;

    if (uc < 0x080)
    {
        // plain 7-bit value, stored as is
        bytes[0] = static_cast<char>(uc);
        count = 1;
    }
    else if (uc < 0x0800)
    {
        // 110xxxxx 10xxxxxx
        bytes[0] = static_cast<char>(0x0c0 | (uc >> 6));
        bytes[1] = static_cast<char>(0x080 | (uc & 0x3f));
        count = 2;
    }
    else if (uc < 0x00010000)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes[0] = static_cast<char>(0x0e0 | (uc >> 12));
        bytes[1] = static_cast<char>(0x080 | ((uc >> 6) & 0x3f));
        bytes[2] = static_cast<char>(0x080 | (uc & 0x3f));
        count = 3;
    }
    else
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes[0] = static_cast<char>(0x0f0 | (uc >> 18));
        bytes[1] = static_cast<char>(0x080 | ((uc >> 12) & 0x3f));
        bytes[2] = static_cast<char>(0x080 | ((uc >> 6) & 0x3f));
        bytes[3] = static_cast<char>(0x080 | (uc & 0x3f));
        count = 4;
    }

    s.append(bytes, count);
}

// Removes the last UTF-8 encoded character from s and returns its value.
//
// Returns 0 and leaves s untouched when s is empty. For well-formed UTF-8
// the trailing continuation bytes (10xxxxxx) are collected from back to
// front and combined with the lead byte in front of them.
//
// Malformed input is handled without undefined behaviour:
//  - a string consisting of one single non-ASCII byte has no lead byte to
//    combine with; the byte is removed and U+FFFD is returned,
//  - at most three continuation bytes are collected, so a long run of
//    continuation bytes cannot push the shift count beyond the width of int.
// Other malformed sequences still yield an unspecified (but defined) value.
inline unicode pop_last_char(std::string& s)
{
    unicode result = 0;

    if (not s.empty())
    {
        std::string::iterator ch = s.end() - 1;

        if ((*ch & 0x0080) == 0)
        {
            // 7-bit character, occupies exactly one byte
            result = *ch;
            s.erase(ch);
        }
        else if (ch == s.begin())
        {
            // Lone non-ASCII byte: stepping back to look for a lead byte
            // would move the iterator in front of begin().
            result = 0xFFFD;
            s.clear();
        }
        else
        {
            int o = 0;

            // Here ch != s.begin(), so the decrement inside the loop is
            // always valid; the loop stops once ch reaches begin(), a
            // non-continuation byte, or three continuation bytes were read.
            do
            {
                result |= (*ch & 0x03F) << o;
                o += 6;
                --ch;
            }
            while (o < 18 and ch != s.begin() and (*ch & 0x0C0) == 0x080);

            // ch now points at what is taken to be the lead byte; the number
            // of payload bits it contributes depends on the sequence length.
            switch (o)
            {
                case  6: result |= (*ch & 0x01F) <<  6; break;
                case 12: result |= (*ch & 0x00F) << 12; break;
                case 18: result |= (*ch & 0x007) << 18; break;
            }

            s.erase(ch, s.end());
        }
    }

    return result;
}

// I used to have this comment here:
//
//    this code only works if the input is valid utf-8
//
// That was a bad idea....
//
// Decodes the UTF-8 encoded character that starts at ptr.
//
// Returns a tuple of the decoded value and the iterator just past the bytes
// that were consumed.
//
// Requirements: ptr must not be equal to end (the first byte is read
// unconditionally) and Iter must support subtraction (end - ptr).
//
// Throws zeep::exception("Invalid utf-8") when a two, three or four byte
// sequence is cut short by end, or when one of the bytes following the lead
// byte is not a continuation byte (10xxxxxx).
//
// A first byte >= 0x80 that is not a two, three or four byte lead is
// returned unchanged as a single byte value.
template<typename Iter>
std::tuple<unicode,Iter> get_first_char(Iter ptr, Iter end)
{
    unicode result = static_cast<unsigned char>(*ptr);
    ++ptr;

    if (result > 0x07f)
    {
        unsigned char ch[3];

        // The truncation checks below compare the number of remaining bytes
        // (end - ptr) instead of forming ptr + N, since an iterator more than
        // one past the end of the range may not be created.

        if ((result & 0x0E0) == 0x0C0)
        {
            if (end - ptr < 1)
                throw zeep::exception("Invalid utf-8");

            ch[0] = static_cast<unsigned char>(*ptr); ++ptr;

            if ((ch[0] & 0x0c0) != 0x080)
                throw zeep::exception("Invalid utf-8");

            result = ((result & 0x01F) << 6) | (ch[0] & 0x03F);
        }
        else if ((result & 0x0F0) == 0x0E0)
        {
            if (end - ptr < 2)
                throw zeep::exception("Invalid utf-8");

            ch[0] = static_cast<unsigned char>(*ptr); ++ptr;
            ch[1] = static_cast<unsigned char>(*ptr); ++ptr;

            if ((ch[0] & 0x0c0) != 0x080 or (ch[1] & 0x0c0) != 0x080)
                throw zeep::exception("Invalid utf-8");

            result = ((result & 0x00F) << 12) | ((ch[0] & 0x03F) << 6) | (ch[1] & 0x03F);
        }
        else if ((result & 0x0F8) == 0x0F0)
        {
            if (end - ptr < 3)
                throw zeep::exception("Invalid utf-8");

            ch[0] = static_cast<unsigned char>(*ptr); ++ptr;
            ch[1] = static_cast<unsigned char>(*ptr); ++ptr;
            ch[2] = static_cast<unsigned char>(*ptr); ++ptr;

            if ((ch[0] & 0x0c0) != 0x080 or (ch[1] & 0x0c0) != 0x080 or (ch[2] & 0x0c0) != 0x080)
                throw zeep::exception("Invalid utf-8");

            result = ((result & 0x007) << 18) | ((ch[0] & 0x03F) << 12) | ((ch[1] & 0x03F) << 6) | (ch[2] & 0x03F);
        }
    }

    return std::make_tuple(result, ptr);
}

// --------------------------------------------------------------------

inline std::wstring convert_s2w(std::string_view s)
{
    auto b = s.begin();
    auto e = s.end();

    std::wstring result;

    while (b != e)
    {
        const auto &[uc, i] = get_first_char(b, e);
        if (not uc)
            break;

        result += static_cast<wchar_t>(uc);
        b = i;
    }

    return result;
}

inline std::string convert_w2s(std::wstring_view s)
{
    std::string result;

    for (unicode ch : s)
        append(result, ch);

    return result;
}

// --------------------------------------------------------------------

// Formats i as a lowercase hexadecimal number prefixed with "0x", without
// leading zeros. Zero is written as "0x0".
inline std::string to_hex(uint32_t i)
{
    // room for "0x", two digits per byte and the terminating NUL
    char s[sizeof(i) * 2 + 3];
    char* p = s + sizeof(s);
    *--p = 0;

    const char kHexChars[] = "0123456789abcdef";

    // The buffer is filled from the back. A do-while is used so that at
    // least one digit is produced when i is zero.
    do
    {
        *--p = kHexChars[i & 0x0F];
        i >>= 4;
    }
    while (i);

    *--p = 'x';
    *--p = '0';

    return p;
}

// --------------------------------------------------------------------

// Strips leading and trailing white space from s, in place.
//
// Only bytes with a positive char value for which std::isspace is true are
// removed; a byte that is not positive as a char stops the scan. s is left
// untouched when there is nothing to strip.
inline void trim(std::string& s)
{
    std::string::size_type first = 0;
    std::string::size_type last = s.size();

    // skip white space at the front
    while (first < last and s[first] > 0 and std::isspace(s[first]))
        ++first;

    // and at the back, never moving in front of 'first'
    while (last > first and s[last - 1] > 0 and std::isspace(s[last - 1]))
        --last;

    if (first != 0 or last != s.size())
        s = s.substr(first, last - first);
}

// --------------------------------------------------------------------

// Returns true when s begins with p. An empty p matches every s.
inline bool starts_with(std::string_view s, std::string_view p)
{
    // substr clamps the length, so a too short s simply compares unequal
    const std::string_view head = s.substr(0, p.length());
    return head == p;
}

// --------------------------------------------------------------------

// Returns true when s ends with p. An empty p matches every s.
inline bool ends_with(std::string_view s, std::string_view p)
{
    if (p.length() > s.length())
        return false;

    const std::string_view tail = s.substr(s.length() - p.length());
    return tail == p;
}

// --------------------------------------------------------------------

// Returns true when p occurs somewhere in s. An empty p is always found.
inline bool contains(std::string_view s, std::string_view p)
{
    const auto where = s.find(p);
    return where != std::string_view::npos;
}

// --------------------------------------------------------------------
// Splits s into the fields that are separated by p and stores them in v.
//
// v         receives the fields; any previous content is discarded
// s         the string to split
// p         the separator, may be longer than one character
// compress  when true, empty fields are left out of the result
//
// Without compress the result always holds at least one field (an empty
// one when s is empty). An empty separator cannot split anything: s is then
// stored as one single field.
inline void split(std::vector<std::string>& v, std::string_view s, std::string_view p, bool compress = false)
{
    v.clear();

    // find() with an empty pattern matches at the current position without
    // ever advancing it, which would make the loop below run forever.
    if (p.empty())
    {
        if (not s.empty() or compress == false)
            v.emplace_back(s);
        return;
    }

    std::string_view::size_type i = 0;
    const auto e = s.length();

    while (i <= e)
    {
        // no further separator: the last field runs up to the end of s
        auto n = s.find(p, i);
        if (n == std::string_view::npos)
            n = e;

        if (n > i or compress == false)
            v.emplace_back(s.substr(i, n - i));

        // after the last field this moves i past e, which ends the loop
        i = n + p.length();
    }
}

// --------------------------------------------------------------------

// Converts every char of s to lower case, in place, using the character
// classification facet of loc (a default constructed std::locale when loc
// is omitted). The conversion is done per char.
inline void to_lower(std::string& s, const std::locale& loc = std::locale())
{
    // std::tolower(ch, loc) is defined as a call to this facet; look it up once
    const auto& facet = std::use_facet<std::ctype<char>>(loc);

    for (std::string::size_type i = 0; i < s.size(); ++i)
        s[i] = facet.tolower(s[i]);
}

// --------------------------------------------------------------------

// Concatenates the elements of v, with d inserted between each two
// consecutive elements. Returns an empty string when v is empty.
template<typename Container = std::vector<std::string> >
std::string join(const Container& v, std::string_view d)
{
    std::string result;
    bool first = true;

    for (const auto& item : v)
    {
        // the delimiter goes in front of every element but the first
        if (not first)
            result += d;
        first = false;

        result += item;
    }

    return result;
}

// --------------------------------------------------------------------

// Replaces every occurrence of p in s with r, in place.
//
// The search continues behind the inserted replacement, so text that is
// introduced by r is never matched again. An empty p matches nothing and
// leaves s unchanged.
inline void replace_all(std::string& s, std::string_view p, std::string_view r)
{
    // find() with an empty pattern succeeds at every position, which would
    // keep inserting r forever.
    if (p.empty())
        return;

    std::string::size_type i = 0;
    for (;;)
    {
        auto l = s.find(p, i);
        if (l == std::string::npos)
            break;

        s.replace(l, p.length(), r);
        i = l + r.length();
    }
}

} // namespace zeep