/**
 * UTF8 string library.
 *
 * Allows to use native UTF8 sequences as a string class. Has many overloaded
 * operators that provides such features as concatenation, types converting and
 * much more.
 *
 * Distributed under GPL v3
 *
 * Author:
 * Grigory Gorelov (gorelov@grigory.info)
 * See more information on grigory.info
 */

#include "String.h"

// NOTE(review): the names of the angle-bracket includes were missing from the
// copy of this file that was reviewed. The headers below are the ones the code
// in this file needs; compare with version control before merging.
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <fstream>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>

#include "Exception.h"

namespace {

/**
 * Tells whether a one-character string is one of the characters removed by
 * UTF8::String::Trim(): space, line feed, carriage return or tab.
 */
bool IsTrimCharacter(const UTF8::String &Character) {
    return (Character == " ") || (Character == "\n") || (Character == "\r") || (Character == "\t");
}

} // namespace

/**
 * Replaces the contents of this string with a formatted representation of d.
 *
 * The value is printed through std::ostringstream with precision 15 and split
 * on "."; the integer digits are then grouped by three and the fraction digits
 * are appended after FractionSeparator. A leading minus sign is kept in front
 * of the grouped digits instead of being treated as a digit.
 *
 * NOTE(review): values that the stream prints in another shape (exponent
 * notation, "inf", "nan") are not handled specially here.
 *
 * @param d                  Value to format.
 * @param ThousandSeparator  Inserted between groups of three integer digits.
 * @param FractionSeparator  Placed between the integer and the fraction part.
 * @param IntegerPartLength  Number of integer digits to produce (padded with
 *                           "0" on the left, or cut to the lowest digits);
 *                           zero or less means "as many as were printed".
 * @param FractionPartLength Number of fraction digits to produce (padded with
 *                           "0" on the right, or cut); zero or less means "as
 *                           many as were printed".
 */
void UTF8::String::ConvertFromDouble(const long double d, const UTF8::String &ThousandSeparator, const UTF8::String &FractionSeparator, const int IntegerPartLength, const int FractionPartLength) {
    std::ostringstream os;
    os.precision(15);
    os << d;

    const UTF8::String Number(os.str());

    // Extracting integer and fraction
    const std::vector<UTF8::String> Extracted = Number.Explode(".");
    const UTF8::String IntegerText = Extracted.empty() ? UTF8::String() : Extracted[0];

    // The sign is not a digit: keep it out of grouping and padding.
    const bool Negative = IntegerText.Substring(0, 1) == "-";
    const UTF8::String IntegerDigits = Negative ? IntegerText.Substring(1) : IntegerText;
    const unsigned int DigitCount = IntegerDigits.Length();

    unsigned int IntegerLength = DigitCount;
    if (IntegerPartLength > 0) {
        IntegerLength = IntegerPartLength;
    }

    unsigned int FractionLength = 0;
    if (FractionPartLength > 0) {
        FractionLength = FractionPartLength;
    } else if (Extracted.size() > 1) {
        FractionLength = Extracted[1].Length();
    }

    // Parsing integer: walk from the lowest digit, prepending as we go.
    UTF8::String Integer;
    for (unsigned int i = 0; i < IntegerLength; i++) {
        if ((i > 0) && (i % 3 == 0)) {
            Integer = ThousandSeparator + Integer;
        }
        if (i < DigitCount) {
            Integer = IntegerDigits[DigitCount - 1 - i] + Integer;
        } else {
            Integer = "0" + Integer;
        }
    }
    if (Negative) {
        Integer = "-" + Integer;
    }

    // Parsing fraction
    UTF8::String Fraction;
    if (FractionLength) {
        Fraction = FractionSeparator;
        for (unsigned int i = 0; i < FractionLength; i++) {
            if ((Extracted.size() > 1) && (Extracted[1].Length() > i)) {
                Fraction += Extracted[1][i];
            } else {
                Fraction += "0";
            }
        }
    }

    *this = Integer + Fraction;
}

/**
 * Returns true when GetSubstringPosition() finds Str in this string.
 */
bool UTF8::String::HasThisString(const UTF8::String &Str) const {
    return GetSubstringPosition(Str) != -1;
}

/**
 * Checks whether this one-character string equals any character of Characters.
 *
 * @throws Exception (StringIsNotACharacter) when Length() is not exactly 1.
 */
bool UTF8::String::CharacterIsOneOfThese(const UTF8::String &Characters) const {
    if (Length() != 1) {
        throw Exception("[CharacterIsOneOfThese] String is more then one character length: \"" + ToString() + "\"", UTF8::Exception::StringIsNotACharacter);
    }

    for (unsigned int i = 0; i < Characters.Length(); i++) {
        if (Characters[i] == *this) {
            return true;
        }
    }
    return false;
}

/**
 * Reads a file into a new string.
 *
 * The content is handed to AppendString() as a C string, so it is validated as
 * UTF8 and anything after an embedded zero byte is ignored.
 *
 * @param Path Path of the file to read.
 * @throws Exception (FileNotFound) when the file cannot be opened.
 * @throws Exception when the size of the opened file cannot be determined.
 */
UTF8::String UTF8::String::FromFile(const UTF8::String &Path) {
    std::ifstream File;

    // ToConstCharPtr() returns the raw buffer, which is NULL for an empty
    // string, so an empty path is never passed to open().
    if (Path.DataLength()) {
        File.open(Path.ToConstCharPtr());
    }
    if (!File.is_open()) {
        throw Exception("Cannot open file \"" + Path.ToString() + "\"", UTF8::Exception::FileNotFound);
    }

    File.seekg(0, std::ios::end);
    const std::streamoff Size = File.tellg();
    if (Size < 0) {
        throw Exception("[FromFile] Cannot determine size of file \"" + Path.ToString() + "\"");
    }
    File.seekg(0, std::ios::beg);

    // Zero-filled and one byte longer than the file, so the buffer is always a
    // terminated C string even when fewer bytes than Size are read. The vector
    // also releases the memory if AppendString() throws.
    std::vector<char> Buffer(static_cast<size_t>(Size) + 1, 0);
    File.read(&Buffer[0], Size);

    UTF8::String s;
    s.AppendString(&Buffer[0]);
    return s;
}

/**
 * Looks for SubString starting at StartPosition and moving in Direction.
 *
 * A StartPosition beyond the last possible match position yields -1 when
 * searching left to right and is clamped to that position when searching right
 * to left. Candidates are compared through Substring(), which treats a count
 * of 0 as "to the end", so an empty SubString can only match at the very end.
 *
 * @param SubString     Text to look for.
 * @param StartPosition Character index to start from.
 * @param Direction     SearchDirectionFromLeftToRight or
 *                      SearchDirectionFromRightToLeft.
 * @return Character index of the match, or -1 when nothing was found.
 */
long UTF8::String::Search(const UTF8::String &SubString, unsigned int StartPosition, int Direction) const {
    const unsigned int TextLength = Length();
    const unsigned int SubstringLength = SubString.Length();

    // Without this guard the unsigned subtraction below would wrap around.
    if (SubstringLength > TextLength) {
        return -1;
    }
    const unsigned int LastStart = TextLength - SubstringLength;

    unsigned int n = StartPosition;
    if (n > LastStart) {
        if (Direction == SearchDirectionFromLeftToRight) {
            return -1;
        }
        n = LastStart;
    }

    if (Direction == SearchDirectionFromLeftToRight) {
        for (; n <= LastStart; n++) {
            if (Substring(n, SubstringLength) == SubString) {
                return n;
            }
        }
    } else if (Direction == SearchDirectionFromRightToLeft) {
        // Counting "positions left" keeps the unsigned index from wrapping
        // below zero after position 0 has been checked.
        for (unsigned int Remaining = n + 1; Remaining > 0; Remaining--) {
            const unsigned int Position = Remaining - 1;
            if (Substring(Position, SubstringLength) == SubString) {
                return Position;
            }
        }
    }

    return -1;
}

/** Writes the string to a stream via ToString(). */
std::ostream & operator<<(std::ostream &os, const UTF8::String &s) {
    os << s.ToString();
    return os;
}

/** Comparisons with the UTF8::String on the right-hand side; each forwards to the member operator. */
bool operator==(const char *str, const UTF8::String &StringObj) {
    return StringObj == str;
}

bool operator==(const std::string &str, const UTF8::String &StringObj) {
    return StringObj == str;
}

bool operator!=(const char *str, const UTF8::String &StringObj) {
    return StringObj != str;
}

bool operator!=(const std::string &str, const UTF8::String &StringObj) {
    return StringObj != str;
}

/** Returns a copy of the string wrapped in « and ». */
UTF8::String UTF8::String::Quote() const {
    return "«"+(*this)+"»";
}

/**
 * Returns a copy without leading and trailing spaces, line feeds, carriage
 * returns and tabs. A string that contains nothing else yields an empty string.
 */
UTF8::String UTF8::String::Trim() const {
    const unsigned int Total = Length();

    // Both scans are bounded, so blank and empty strings never index past the end.
    unsigned int First = 0;
    while ((First < Total) && IsTrimCharacter((*this)[First])) {
        First++;
    }
    if (First == Total) {
        return UTF8::String();
    }

    unsigned int End = Total;
    while ((End > First) && IsTrimCharacter((*this)[End - 1])) {
        End--;
    }

    return Substring(First, End - First);
}

/**
 * Returns a copy in which every occurrence of Search is replaced by Replace.
 *
 * Scanning continues after the inserted text, so occurrences of Search inside
 * Replace are not replaced again. An empty Search leaves the string unchanged.
 */
UTF8::String UTF8::String::Replace(const UTF8::String &Search, const UTF8::String &Replace) const {
    UTF8::String result = *this;

    // An empty needle has nothing to replace and would never advance the scan.
    if (Search.Length() == 0) {
        return result;
    }

    // Long to cover unsigned int and -1
    long pos = 0;
    while ((pos = result.Search(Search, pos)) != -1) {
        result = result.SubstringReplace(pos, Search.Length(), Replace);
        // Next time we search after replacement
        pos += Replace.Length();
    }

    return result;
}

/**
 * Returns a copy in which Count characters starting at Start are replaced by
 * Replace. When Start is not inside the string an unchanged copy is returned.
 */
UTF8::String UTF8::String::SubstringReplace(unsigned int Start, unsigned int Count, const UTF8::String &Replace) const {
    if (Start >= Length()) {
        return *this;
    }

    // Substring(0, 0) would mean "everything", hence the explicit empty head.
    const UTF8::String Head = Start ? Substring(0, Start) : UTF8::String();
    return Head + Replace + Substring(Start + Count);
}

/**
 * Joins Strings with Separator.
 *
 * The separator is only added once the result is non-empty, so empty strings
 * at the beginning of the list do not produce leading separators.
 */
UTF8::String UTF8::String::Implode(const std::vector<UTF8::String> &Strings, const UTF8::String &Separator) {
    UTF8::String Result;
    for (unsigned int i = 0; i < Strings.size(); i++) {
        if (Result.Length()) {
            Result += Separator;
        }
        Result += Strings[i];
    }
    return Result;
}

/**
 * Splits the string on Separator and returns the non-empty pieces in order.
 *
 * When Separator is empty or longer than the string, the whole string is the
 * only piece (no piece at all for an empty string).
 */
std::vector<UTF8::String> UTF8::String::Explode(const String &Separator) const {
    std::vector<UTF8::String> v;

    const unsigned int TextLength = Length();
    const unsigned int SeparatorLength = Separator.Length();

    // Nothing can match here; returning early also avoids unsigned wrap-around
    // in the scan limit and a scan that never advances on an empty separator.
    if ((SeparatorLength == 0) || (SeparatorLength > TextLength)) {
        if (TextLength) {
            v.push_back(*this);
        }
        return v;
    }

    unsigned int prev = 0;
    unsigned int i = 0;
    while (i + SeparatorLength <= TextLength) {
        if (Substring(i, SeparatorLength) == Separator) {
            if (i > prev) {
                v.push_back(Substring(prev, i - prev));
            }
            i += SeparatorLength;
            prev = i;
        } else {
            i++;
        }
    }

    if (prev < TextLength) {
        v.push_back(Substring(prev, TextLength - prev));
    }

    return v;
}

/** Concatenations with the UTF8::String on the right-hand side. */
UTF8::String operator+(const char *CharPtr, const UTF8::String &StringObj) {
    UTF8::String s(CharPtr);
    s += StringObj;
    return s;
}

UTF8::String operator+(const std::string & str, const UTF8::String &StringObj) {
    UTF8::String s(str);
    s += StringObj;
    return s;
}

UTF8::String operator+(const long l, const UTF8::String &StringObj) {
    UTF8::String s(l);
    s += StringObj;
    return s;
}

/** Returns the concatenation of this string and s. */
UTF8::String UTF8::String::operator+(const UTF8::String &s) const {
    UTF8::String res(*this);
    res.AppendString(s.Data);
    return res;
}

/** Appends s to this string; appending a string to itself is supported. */
UTF8::String & UTF8::String::operator+=(const UTF8::String &s) {
    AppendString(s.Data);
    return *this;
}

/**
 * Appends a zero-terminated UTF8 sequence to the string.
 *
 * NULL and empty input are ignored. The new content is validated with
 * CheckIfStringIsCorrect() before the string is modified. The combined text is
 * stored through SetString(), so the buffer is only ever allocated in one
 * place and str may point into this string's own buffer.
 */
void UTF8::String::AppendString(const char *str) {
    if (!str || !strlen(str)) {
        return;
    }

    if (!DataArrayLength) {
        SetString(str);
        return;
    }

    CheckIfStringIsCorrect(str);

    // Built in a separate buffer first: str stays readable even if it aliases
    // Data, and the buffer created by SetString() is never passed to realloc().
    std::string Combined(Data, DataArrayLength);
    Combined += str;
    SetString(Combined.c_str());
}

/**
 * Replaces the contents with a zero-terminated UTF8 sequence.
 *
 * NULL and empty input clear the string. This is the only function in this
 * file that fills the buffer directly, so all preparations (validation,
 * allocation, length calculation) are done here.
 */
void UTF8::String::SetString(const char *str) {
    if (!str || !strlen(str)) {
        Empty();
        return;
    }

    CheckIfStringIsCorrect(str);

    // Copy before Empty(): str may point into the buffer that is replaced
    // (for example when a string is assigned to itself).
    const unsigned int NewLength = strlen(str);
    char *NewData = new char[NewLength + 1];
    memcpy(NewData, str, NewLength);
    NewData[NewLength] = 0;

    Empty();
    Data = NewData;
    DataArrayLength = NewLength;
    CalculateStringLength();
}

/**
 * Replaces the contents with the decimal representation of n.
 */
void UTF8::String::ConvertFromInt64(int64_t n) {
    const bool minus = n < 0;

    // Negating in unsigned arithmetic is well defined for every value,
    // including the smallest int64_t, which has no positive counterpart.
    uint64_t Magnitude = minus ? (0 - static_cast<uint64_t>(n)) : static_cast<uint64_t>(n);

    // 20 digits plus sign plus terminator fit easily; digits are written from
    // the end of the buffer towards the front.
    char tmp[32];
    memset(tmp, 0, sizeof(tmp));
    unsigned int i = sizeof(tmp) - 1;

    do {
        tmp[--i] = static_cast<char>('0' + Magnitude % 10);
        Magnitude /= 10;
    } while (Magnitude);

    if (minus) {
        tmp[--i] = '-';
    }

    SetString(tmp + i);
}

/** Creates a string from a floating point value; see ConvertFromDouble(). */
UTF8::String::String(const long double d, const UTF8::String &ThousandSeparator, const UTF8::String &DecimalSeparator, const int IntegerPartCount, const int FractionPartCount) {
    InitString();
    ConvertFromDouble(d, ThousandSeparator, DecimalSeparator, IntegerPartCount, FractionPartCount);
}

/** Puts the members into the "empty string" state without releasing anything. */
void UTF8::String::InitString() {
    Data = NULL;
    DataArrayLength = 0;
    StringLength = 0;
}

/** Creates an empty string. */
UTF8::String::String() {
    InitString();
}

/** Creates a string from the zero-terminated content of s. */
UTF8::String::String(const std::string & s) {
    InitString();
    SetString(s.c_str());
}

/**
 * Returns the byte offset in Data at which the character number Position starts.
 *
 * @throws Exception when Position is not inside the string.
 */
int UTF8::String::GetSymbolIndexInDataArray(unsigned int Position) const {
    if (Position >= StringLength) {
        throw Exception((UTF8::String("[GetSymbolIndexInDataArray] trying to get position beyond the end of string. StringLength: ") + StringLength + " Position: " + Position + " String: [" + Data + "]").ToString());
    }

    unsigned int n = 0;
    for (unsigned int i = 0; i < Position; i++) {
        n += GetSequenceLength(Data + n);
    }
    return n;
}

/**
 * Returns the character index of the first occurrence of SubString at or after
 * Start, or -1 when there is none.
 */
long UTF8::String::GetSubstringPosition(const UTF8::String &SubString, unsigned int Start) const {
    if (SubString.StringLength > StringLength) {
        return -1;
    }

    // Iterating up to the last possible start position means a Start beyond
    // it simply finds nothing instead of wrapping an unsigned scan counter.
    const unsigned int LastStart = StringLength - SubString.StringLength;
    for (unsigned int Position = Start; Position <= LastStart; Position++) {
        if (Substring(Position, SubString.StringLength) == SubString) {
            return Position;
        }
    }

    return -1;
}

/**
 * Returns Count characters starting at character index Start.
 *
 * A Count of 0, or one that reaches beyond the end, means "up to the end of
 * the string". A Start outside the string yields an empty string.
 */
UTF8::String UTF8::String::Substring(unsigned int Start, unsigned int Count) const {
    if (Start >= StringLength) {
        return UTF8::String();
    }

    // Compared against the remaining length so that Start + Count cannot overflow.
    if ((Count == 0) || (Count > StringLength - Start)) {
        Count = StringLength - Start;
    }

    const unsigned int StartIndex = GetSymbolIndexInDataArray(Start);

    unsigned int CopyAmount = 0;
    for (unsigned int i = 0; i < Count; i++) {
        CopyAmount += GetSequenceLength(Data + StartIndex + CopyAmount);
    }

    // std::string owns the temporary copy, so nothing leaks if construction
    // of the result throws.
    const std::string Piece(Data + StartIndex, CopyAmount);
    return UTF8::String(Piece);
}

/** Creates a string from a zero-terminated UTF8 sequence. */
UTF8::String::String(const char * str) {
    InitString();
    SetString(str);
}

/** Creates a string from a zero-terminated UTF32 sequence. */
UTF8::String::String(const uint32_t * str) {
    InitString();
    ConvertFromUTF32(str);
}

/**
 * Replaces the contents with the UTF8 encoding of a zero-terminated UTF32
 * sequence. A NULL pointer leaves the string unchanged.
 *
 * NOTE(review): code points are encoded as given; surrogates and values above
 * 0x10FFFF are not rejected here.
 *
 * @throws Exception when no terminator is found within 4294967295 elements.
 */
void UTF8::String::ConvertFromUTF32(const uint32_t *s) {
    if (!s) {
        return;
    }

    std::string Encoded;

    // The terminator is tested before every read, so an empty input is never
    // read past its first element.
    for (size_t i = 0; s[i]; i++) {
        if (i == 4294967295UL) {
            throw Exception("[ConvertFromUTF32] Cannot find termination symbol in incoming string.");
        }

        const uint32_t wc = s[i];
        if (wc < 0x80) {
            Encoded += static_cast<char>(wc);
        } else if (wc < 0x800) {
            Encoded += static_cast<char>((wc >> 6) | 0xC0);
            Encoded += static_cast<char>((wc & 0x3F) | 0x80);
        } else if (wc < 0x10000) {
            Encoded += static_cast<char>((wc >> 12) | 0xE0);
            Encoded += static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
            Encoded += static_cast<char>((wc & 0x3F) | 0x80);
        } else {
            Encoded += static_cast<char>((wc >> 18) | 0xF0);
            Encoded += static_cast<char>(((wc >> 12) & 0x3F) | 0x80);
            Encoded += static_cast<char>(((wc >> 6) & 0x3F) | 0x80);
            Encoded += static_cast<char>((wc & 0x3F) | 0x80);
        }
    }

    SetString(Encoded.c_str());
}

/**
 * Recomputes StringLength (the number of characters) from the bytes in Data.
 */
void UTF8::String::CalculateStringLength() {
    unsigned int count = 0;

    if (Data) {
        // We do not need to check sequence boundaries here, they are checked
        // when the string is changed. The terminator is tested first so an
        // empty buffer counts as zero characters.
        unsigned int n = 0;
        while (Data[n]) {
            n += GetSequenceLength(Data + n);
            count++;
        }
    }

    StringLength = count;
}

/**
 * Verifies that every byte following a sequence's first byte is a UTF8
 * continuation byte (binary 10xxxxxx). NULL input is accepted.
 *
 * @throws Exception on a wrong continuation byte, or when no terminator is
 *         found before the offset gets close to 0xFFFFFFFF.
 */
void UTF8::String::CheckIfStringIsCorrect(const char *str) const {
    if (!str) {
        return;
    }

    // We are not writing anything to memory so limits are not needed
    unsigned int n = 0;
    while (str[n]) {
        const unsigned int SequenceLength = GetSequenceLength(str + n);

        // A terminator inside a sequence fails this test too, so the scan
        // stops there and never reads past the end of the string.
        for (unsigned int i = 1; i < SequenceLength; i++) {
            if ((((unsigned char) str[n + i]) >> 6) != 0x2) {
                std::string s(str);
                throw Exception("[CheckIfStringIsCorrect] Incorrect byte in UTF8 sequence: \"" + s + "\"");
            }
        }

        n += SequenceLength;
        if (n >= 0xFFFFFFFF - 4) {
            std::string s(str);
            throw Exception("[CheckIfStringIsCorrect] termination char was not found in string: \"" + s + "\"");
        }
    }
}

// NOTE(review): the text between "*this" and "= 0; i--)" below is missing from
// the copy of this file that was reviewed, so operator> is fused with the tail
// of what appears to be a string-to-double conversion. The fragment is
// reproduced token for token because the lost code cannot be reconstructed
// from here; restore this region from version control before building.
bool UTF8::String::operator>(const UTF8::String &s) const {
    if (*this == s) {
        return false;
    }
    if (*this= 0; i--) {
        c = Data[i];
        if ((c >= '0') && (c <= '9')) {
            int_part += (c - '0') * mul;
            mul *= 10;
        } else {
            if (c == '.') {
                prec_part = (double) int_part / (double) mul;
                int_part = 0;
                mul = 1;
            } else {
                if ((c == '-') && (i == 0)) {
                    int_part = -int_part;
                    prec_part = -prec_part;
                } else {
                    UTF8::String err = "Cannot convert \"" + * this+"\" to double.";
                    throw UTF8::Exception(err.ToConstCharPtr(), UTF8::Exception::StringToDoubleConversionError);
                }
            }
        }
    }
    return int_part + prec_part;
}

/**
 * Converts the string to an integer.
 *
 * The bytes are read from the end towards the start: digits are accumulated, a
 * "." discards everything collected so far (dropping the fraction), and a "-"
 * is accepted only as the very first byte. An empty string yields 0.
 *
 * @throws UTF8::Exception (StringToIntConversionError) on any other byte.
 */
int64_t UTF8::String::ToLong() const {
    int64_t mul = 1;
    int64_t number = 0;

    for (unsigned int Remaining = DataArrayLength; Remaining > 0; Remaining--) {
        const unsigned int i = Remaining - 1;
        const char c = Data[i];

        if ((c >= '0') && (c <= '9')) {
            number += (c - '0') * mul;
            mul *= 10;
        } else if (c == '.') {
            number = 0;
            mul = 1;
        } else if ((c == '-') && (i == 0)) {
            number = -number;
        } else {
            UTF8::String err = "Cannot convert \"" + * this+"\" to number.";
            throw UTF8::Exception(err.ToConstCharPtr(), UTF8::Exception::StringToIntConversionError);
        }
    }

    return number;
}

/** Returns the concatenation of this string and a zero-terminated UTF8 sequence. */
UTF8::String UTF8::String::operator+(const char *s) const {
    UTF8::String res(*this);
    res.AppendString(s);
    return res;
}

/** Byte-wise equality of two strings. */
bool UTF8::String::operator==(const UTF8::String &s) const {
    if (DataArrayLength != s.DataArrayLength) {
        return false;
    }
    // The zero-length case is answered without touching the (possibly NULL) buffers.
    return (DataArrayLength == 0) || (memcmp(Data, s.Data, DataArrayLength) == 0);
}

bool UTF8::String::operator!=(const UTF8::String &s) const {
    return !(*this == s);
}

/**
 * Byte-wise equality with a zero-terminated sequence. NULL and "" compare
 * equal to an empty string.
 */
bool UTF8::String::operator==(const char *str) const {
    const size_t OtherLength = str ? strlen(str) : 0;
    if (OtherLength == 0) {
        return StringLength == 0;
    }
    if (DataArrayLength != OtherLength) {
        return false;
    }
    return memcmp(Data, str, OtherLength) == 0;
}

bool UTF8::String::operator!=(const char *str) const {
    return !(*this == str);
}

/**
 * Returns the internal buffer. InitString() sets it to NULL and SetString()
 * only allocates for non-empty input, so callers have to expect NULL.
 */
const char * UTF8::String::ToConstCharPtr() const {
    return Data;
}

/** Returns the number of characters. */
unsigned int UTF8::String::Length() const {
    return StringLength;
}

/** Returns the number of bytes, not counting the terminator. */
unsigned int UTF8::String::DataLength() const {
    return DataArrayLength;
}

/** Calls Empty(), which is defined outside this part of the file. */
UTF8::String::~String() {
    Empty();
}

/** Creates a deep copy of orig. */
UTF8::String::String(const String& orig) {
    InitString();
    SetString(orig.Data);
}