Tyro/include/UTF8Strings/String.cpp

900 lines
21 KiB
C++

/**
* UTF8 string library.
*
* Allows native UTF8 sequences to be used through a string class. Provides
* many overloaded operators that offer features such as concatenation, type
* conversion and much more.
*
* Distributed under GPL v3
*
* Author:
* Grigory Gorelov (gorelov@grigory.info)
* See more information on grigory.info
*/
#include "String.h"
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <cstring>
#include <fstream>
#include <iostream>
#include <new>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>
#include "Exception.h"
/**
 * Replaces the content of this string with a textual form of d.
 *
 * The value is printed through a stream with precision 15, split at the
 * "." and re-assembled with the given separators.
 *
 * @param d                  Value to convert.
 * @param ThousandSeparator  Inserted between every group of three integer digits.
 * @param FractionSeparator  Inserted between the integer and the fraction part.
 * @param IntegerPartLength  Number of integer digits to emit (left padded with
 *                           "0", or cut to the lowest digits); a value <= 0
 *                           means "as many as the number has".
 * @param FractionPartLength Number of fraction digits to emit (right padded
 *                           with "0" or cut); a value <= 0 means "as many as
 *                           the number has".
 *
 * NOTE(review): values the stream prints in exponent form (or as inf/nan) are
 * not plain digit strings; they are still run through the grouping loop —
 * confirm whether callers rely on that.
 */
void UTF8::String::ConvertFromDouble(const long double d, const UTF8::String &ThousandSeparator, const UTF8::String &FractionSeparator, const int IntegerPartLength, const int FractionPartLength) {
    std::ostringstream os;
    os.precision(15);
    os << d;
    const UTF8::String Number(os.str());

    // Extracting integer and fraction
    const std::vector <UTF8::String> Extracted = Number.Explode(".");
    UTF8::String IntegerDigits = Extracted.empty() ? UTF8::String() : Extracted[0];
    const UTF8::String FractionDigits = (Extracted.size() > 1) ? Extracted[1] : UTF8::String();

    // The sign is not a digit: peel it off so that it is neither grouped with
    // a thousand separator ("-,123") nor buried behind padding zeros ("0-5").
    bool Negative = false;
    if (IntegerDigits.Length() && (IntegerDigits[0] == "-")) {
        Negative = true;
        IntegerDigits = IntegerDigits.Substring(1, IntegerDigits.Length() - 1);
    }

    // Non-positive requested lengths fall back to the natural length; a
    // negative value must not wrap around to a huge unsigned count.
    const unsigned int IntegerLength = (IntegerPartLength > 0) ? static_cast<unsigned int>(IntegerPartLength) : IntegerDigits.Length();
    const unsigned int FractionLength = (FractionPartLength > 0) ? static_cast<unsigned int>(FractionPartLength) : FractionDigits.Length();

    // Parsing integer: walk the digits from the least significant one
    UTF8::String Integer;
    for (unsigned int i = 0; i < IntegerLength; i++) {
        if ((i > 0) && (i % 3 == 0)) {
            Integer = ThousandSeparator + Integer;
        }
        if (IntegerDigits.Length() < i + 1) {
            Integer = "0" + Integer;
        } else {
            Integer = IntegerDigits[IntegerDigits.Length() - 1 - i] + Integer;
        }
    }
    if (Negative) {
        Integer = "-" + Integer;
    }

    // Parsing fraction
    UTF8::String Fraction;
    if (FractionLength) {
        Fraction = FractionSeparator;
        for (unsigned int i = 0; i < FractionLength; i++) {
            if (FractionDigits.Length() > i) {
                Fraction += FractionDigits[i];
            } else {
                Fraction += "0";
            }
        }
    }
    * this = Integer + Fraction;
}
/**
 * Tells whether Str can be found inside this string.
 *
 * @return true unless GetSubstringPosition() reports -1.
 */
bool UTF8::String::HasThisString(const UTF8::String &Str) const {
    const long Position = GetSubstringPosition(Str);
    const bool NotFound = (Position == -1);
    return !NotFound;
}
/**
 * Checks whether this one-character string equals any character of Characters.
 *
 * @throws Exception (StringIsNotACharacter) when this string is not exactly
 *         one character long.
 */
bool UTF8::String::CharacterIsOneOfThese(const UTF8::String &Characters) const {
    if (Length() != 1) {
        throw Exception("[CharacterIsOneOfThese] String is more then one character length: \"" + ToString() + "\"", UTF8::Exception::StringIsNotACharacter);
    }
    bool Found = false;
    unsigned int Index = 0;
    while (!Found && (Index < Characters.Length())) {
        Found = (Characters[Index] == *this);
        ++Index;
    }
    return Found;
}
/**
 * Reads the file at Path and returns its content as a string.
 *
 * The content is appended through AppendString(), so it ends at the first
 * zero byte and must pass the UTF8 validation performed there.
 *
 * @throws Exception (FileNotFound) when the file cannot be opened.
 */
UTF8::String UTF8::String::FromFile(const UTF8::String &Path) {
    // ToString() yields "" for an empty path, whereas ToConstCharPtr() would
    // return the raw (possibly null) data pointer.
    const std::string FileName = Path.ToString();
    std::ifstream File(FileName.c_str());
    if (!File.is_open()) {
        throw Exception("Cannot open file \"" + Path.ToString() + "\"", UTF8::Exception::FileNotFound);
    }

    File.seekg(0, std::ios::end);
    const std::streamoff Size = File.tellg();
    File.seekg(0, std::ios::beg);

    UTF8::String s;
    // tellg() reports -1 on failure; only read when there is something to read.
    if (Size > 0) {
        // std::string owns the buffer, so nothing leaks if AppendString throws.
        // It is zero filled, so a short read still leaves a terminated string.
        std::string Buffer(static_cast<std::string::size_type>(Size), '\0');
        File.read(&Buffer[0], Size);
        s.AppendString(Buffer.c_str());
    }
    return s;
}
/**
 * Looks for SubString starting at character StartPosition and moving in the
 * given Direction.
 *
 * @param SubString     Text to look for.
 * @param StartPosition Character index of the first candidate position. When
 *                      it lies past the last possible match, a left-to-right
 *                      search fails and a right-to-left search starts from the
 *                      last possible match.
 * @param Direction     SearchDirectionFromLeftToRight or
 *                      SearchDirectionFromRightToLeft.
 * @return Character index of the match, or -1 when there is none.
 */
long UTF8::String::Search(const UTF8::String &SubString, unsigned int StartPosition, int Direction) const {
    // All positions are kept signed: with unsigned arithmetic "Length() -
    // SubstringLength" underflows and a right-to-left scan can never go below
    // zero, so it would never terminate.
    const long SubstringLength = static_cast<long>(SubString.Length());
    const long LastStart = static_cast<long>(Length()) - SubstringLength;
    if (LastStart < 0) {
        // The substring is longer than the string itself.
        return -1;
    }

    const bool LeftToRight = (Direction == SearchDirectionFromLeftToRight);
    if (!LeftToRight && (Direction != SearchDirectionFromRightToLeft)) {
        return -1;
    }

    long n = static_cast<long>(StartPosition);
    if (n > LastStart) {
        if (LeftToRight) {
            return -1;
        }
        n = LastStart;
    }

    const long Step = LeftToRight ? 1 : -1;
    for (; (n >= 0) && (n <= LastStart); n += Step) {
        if (this->Substring(static_cast<unsigned int>(n), static_cast<unsigned int>(SubstringLength)) == SubString) {
            return n;
        }
    }
    return -1;
}
/**
 * Writes the string to an output stream as its std::string form.
 */
std::ostream & operator<<(std::ostream &os, const UTF8::String &s) {
    const std::string Text = s.ToString();
    return os << Text;
}
/**
 * Equality with the C string on the left-hand side; forwards to the
 * comparison with the string object on the left.
 */
bool operator==(const char *str, const UTF8::String &StringObj) {
    const bool Equal = (StringObj == str);
    return Equal;
}
/**
 * Equality with the std::string on the left-hand side; forwards to the
 * comparison with the string object on the left.
 */
bool operator==(const std::string &str, const UTF8::String &StringObj) {
    const bool Equal = (StringObj == str);
    return Equal;
}
/**
 * Inequality with the C string on the left-hand side; forwards to the
 * comparison with the string object on the left.
 */
bool operator!=(const char *str, const UTF8::String &StringObj) {
    const bool Different = (StringObj != str);
    return Different;
}
/**
 * Inequality with the std::string on the left-hand side; forwards to the
 * comparison with the string object on the left.
 */
bool operator!=(const std::string &str, const UTF8::String &StringObj) {
    const bool Different = (StringObj != str);
    return Different;
}
/**
 * Returns a copy of this string wrapped in « and ».
 */
UTF8::String UTF8::String::Quote() const {
    const UTF8::String Opened = "«" + (*this);
    return Opened + "»";
}
UTF8::String UTF8::String::Trim() const {
UTF8::String result = *this;
long i = 0;
while ((result[i] == " ") || (result[i] == "\n") || (result[i] == "\r") || (result[i] == "\t")) {
i++;
}
if (i == result.Length()) {
return UTF8::String();
}
long j = result.Length();
while ((result[j - 1] == " ") || (result[j - 1] == "\n") || (result[j - 1] == "\r") || (result[j - 1] == "\t")) {
j--;
}
result = result.Substring(i, j - i);
return result;
}
/**
 * Returns a copy of this string in which every occurrence of Search is
 * replaced with Replace.
 *
 * Replaced text is not scanned again, so a replacement that contains the
 * search text does not recurse. An empty Search leaves the string unchanged.
 */
UTF8::String UTF8::String::Replace(const UTF8::String &Search, const UTF8::String &Replace) const {
    // An empty needle "matches" at the end of the string without consuming
    // anything; together with an empty replacement the loop below would never
    // advance. There is nothing to replace anyway.
    if (Search.Length() == 0) {
        return *this;
    }
    UTF8::String result = *this;
    // Long to cover unsigned int and -1
    long pos = 0;
    while ((pos = result.Search(Search, pos)) != -1) {
        result = result.SubstringReplace(pos, Search.Length(), Replace);
        // Next time we search after replacement
        pos += Replace.Length();
    }
    return result;
}
/**
 * Returns a copy of this string where Count characters starting at character
 * Start are replaced with Replace. When Start is not inside the string the
 * copy is returned unchanged.
 */
UTF8::String UTF8::String::SubstringReplace(unsigned int Start, unsigned int Count, const UTF8::String &Replace) const {
    if (Start >= Length()) {
        return *this;
    }
    // Substring() treats a zero count as "up to the end", so an empty head
    // has to be built explicitly.
    const UTF8::String Head = Start ? Substring(0, Start) : UTF8::String();
    return Head + Replace + Substring(Start + Count);
}
/**
 * Joins Strings into one string, putting Separator between every two
 * neighbouring elements.
 *
 * @return The joined string; an empty string for an empty vector.
 */
UTF8::String UTF8::String::Implode(const std::vector <UTF8::String> &Strings, const UTF8::String &Separator) {
    UTF8::String Result;
    for (std::vector <UTF8::String>::size_type i = 0; i < Strings.size(); i++) {
        // Decide by position, not by the length accumulated so far: otherwise
        // the separator after leading empty elements is silently dropped
        // while the one around inner empty elements is kept.
        if (i > 0) {
            Result += Separator;
        }
        Result += Strings[i];
    }
    return Result;
}
std::vector <UTF8::String> UTF8::String::Explode(const String &Separator) const {
std::vector <UTF8::String> v;
unsigned int prev = 0;
int i = 0;
while (i < Length() - Separator.Length() + 1) {
if (Substring(i, Separator.Length()) == Separator) {
if (i - prev > 0) {
v.push_back(Substring(prev, i - prev));
}
i += Separator.Length();
prev = i;
} else {
i++;
}
}
if (prev < Length()) {
v.push_back(Substring(prev, Length() - prev));
}
return v;
}
/**
 * Concatenation with a C string on the left-hand side.
 */
UTF8::String operator+(const char *CharPtr, const UTF8::String &StringObj) {
    UTF8::String Joined(CharPtr);
    Joined += StringObj;
    return Joined;
}
/**
 * Concatenation with a std::string on the left-hand side.
 */
UTF8::String operator+(const std::string & str, const UTF8::String &StringObj) {
    UTF8::String Joined(str);
    Joined += StringObj;
    return Joined;
}
/**
 * Concatenation with a number on the left-hand side: the number is converted
 * to a string first, then StringObj is appended to it.
 */
UTF8::String operator+(const long l, const UTF8::String &StringObj) {
    UTF8::String Joined(l);
    Joined += StringObj;
    return Joined;
}
/**
 * Returns a new string made of this string followed by s.
 */
UTF8::String UTF8::String::operator+(const UTF8::String &s) const {
    UTF8::String Joined(*this);
    Joined.AppendString(s.Data);
    return Joined;
}
/**
 * Appends s to this string in place.
 *
 * @return Reference to this string.
 */
UTF8::String & UTF8::String::operator+=(const UTF8::String &s) {
    this->AppendString(s.Data);
    return *this;
}
void UTF8::String::AppendString(const char *str) {
// The functions that can fill buffer directly:
//
// SetString AppendString
//
// Make shure all preparations are done there
if (str && strlen(str)) {
if (DataArrayLength) {
CheckIfStringIsCorrect(str);
unsigned int StrLength = strlen(str);
Data = (char *) realloc(Data, DataArrayLength + StrLength + 1);
if (Data != NULL) {
memcpy(Data + DataArrayLength, str, StrLength);
DataArrayLength += StrLength;
Data[DataArrayLength] = 0;
CalculateStringLength();
} else {
throw Exception("[AppendString] Cannot realloc any more memory");
}
} else {
SetString(str);
}
}
}
void UTF8::String::SetString(const char *str) {
// The functions that can fill buffer directly:
//
// SetString AppendString
//
// Make shure all preparations are done there
if (str && strlen(str)) {
CheckIfStringIsCorrect(str);
Empty();
DataArrayLength = strlen(str);
Data = new char[DataArrayLength + 1];
Data[DataArrayLength] = 0;
memcpy(Data, str, DataArrayLength);
CalculateStringLength();
} else {
Empty();
}
}
/**
 * Replaces the content of this string with the decimal form of n
 * (with a leading "-" for negative values).
 */
void UTF8::String::ConvertFromInt64(int64_t n) {
    const bool minus = (n < 0);
    // Work on the unsigned magnitude: negating the most negative int64_t as a
    // signed value is undefined, the unsigned subtraction is well defined.
    uint64_t magnitude = minus ? (0 - static_cast<uint64_t>(n)) : static_cast<uint64_t>(n);

    // A 64 bit magnitude has at most 20 digits; sign and terminator fit too.
    char tmp[32];
    unsigned int i = sizeof (tmp) - 1;
    tmp[i] = 0;
    // Digits are produced least significant first, so fill from the end.
    do {
        tmp[--i] = static_cast<char>('0' + (magnitude % 10));
        magnitude /= 10;
    } while (magnitude);
    if (minus) {
        tmp[--i] = '-';
    }
    // SetString releases the previous content and recounts the length.
    SetString(tmp + i);
}
/**
 * Creates a string holding the formatted form of d; the arguments are passed
 * on to ConvertFromDouble() unchanged.
 */
UTF8::String::String(const long double d, const UTF8::String &ThousandSeparator, const UTF8::String &DecimalSeparator, const int IntegerPartCount, const int FractionPartCount) {
    // Start from the empty state, then let the converter fill the string.
    this->InitString();
    this->ConvertFromDouble(d, ThousandSeparator, DecimalSeparator, IntegerPartCount, FractionPartCount);
}
/**
 * Puts the object into the empty state: no buffer, zero bytes, zero
 * characters. Does not release a buffer that may currently be held.
 */
void UTF8::String::InitString() {
    StringLength = 0;
    DataArrayLength = 0;
    Data = NULL;
}
/**
 * Creates an empty string.
 */
UTF8::String::String() {
    this->InitString();
}
/**
 * Creates a string from the content of a std::string (up to its first zero
 * byte, as the content is taken through c_str()).
 *
 * The content is validated by SetString(); an exception is thrown from there
 * when it is not a correct UTF8 sequence.
 */
UTF8::String::String(const std::string & s) {
    InitString();
    // SetString validates the input and counts the characters itself, so the
    // separate validation and recount passes are not needed; this also keeps
    // the constructor in line with the one taking const char *.
    SetString(s.c_str());
}
int UTF8::String::GetSymbolIndexInDataArray(unsigned int Position) const {
if (Position >= StringLength) {
throw Exception((UTF8::String("[GetSymbolIndexInDataArray] trying to get position beyond the end of string. StringLength: ") + StringLength + " Position: " + Position + " String: [" + Data + "]").ToString());
}
unsigned int n = 0;
for (unsigned int i = 0; i < Position; i++) {
n += GetSequenceLength(Data + n);
}
return n;
}
/**
 * Finds the first occurrence of SubString at or after character Start.
 *
 * @return Character index of the occurrence, or -1 when there is none
 *         (including when Start lies past the last possible match).
 */
long UTF8::String::GetSubstringPosition(const UTF8::String &SubString, unsigned int Start) const {
    if (SubString.StringLength > StringLength) {
        return -1;
    }
    // Last character index at which SubString could still fit.
    const unsigned int LastStart = StringLength - SubString.StringLength;
    // Without this check the number of scans, computed in unsigned
    // arithmetic, wraps around to a huge value.
    if (Start > LastStart) {
        return -1;
    }
    for (unsigned int Position = Start; Position <= LastStart; Position++) {
        if (this->Substring(Position, SubString.StringLength) == SubString) {
            return Position;
        }
    }
    return -1;
}
/**
 * Returns Count characters of this string starting at character Start.
 *
 * @param Start Zero based character index; an index outside the string
 *              yields an empty string.
 * @param Count Number of characters; 0, or a value reaching past the end,
 *              means "up to the end of the string".
 */
UTF8::String UTF8::String::Substring(unsigned int Start, unsigned int Count) const {
    if (Start >= StringLength) {
        return UTF8::String();
    }
    // Compare against the remaining length instead of computing Start + Count,
    // which can wrap around for large counts and skip the clamping.
    const unsigned int Available = StringLength - Start;
    if ((Count == 0) || (Count > Available)) {
        Count = Available;
    }

    const unsigned int StartIndex = GetSymbolIndexInDataArray(Start);
    // Characters have variable width: add up the byte length of each one.
    unsigned int CopyAmount = 0;
    for (unsigned int i = 0; i < Count; i++) {
        CopyAmount += GetSequenceLength(Data + StartIndex + CopyAmount);
    }

    // std::string owns the temporary copy, so it is released correctly even
    // if constructing the result throws.
    const std::string Slice(Data + StartIndex, CopyAmount);
    return UTF8::String(Slice.c_str());
}
/**
 * Creates a string from a zero terminated UTF8 sequence; the content is
 * installed through SetString().
 */
UTF8::String::String(const char * str) {
    this->InitString();
    this->SetString(str);
}
/**
 * Creates a string from a zero terminated UTF32 sequence; the conversion is
 * done by ConvertFromUTF32().
 */
UTF8::String::String(const uint32_t * str) {
    this->InitString();
    this->ConvertFromUTF32(str);
}
/**
 * Replaces the content of this string with the UTF8 encoding of the zero
 * terminated UTF32 sequence s. A null pointer leaves the string untouched.
 *
 * @throws Exception when no terminator is found or when a value does not fit
 *         into a four byte sequence.
 */
void UTF8::String::ConvertFromUTF32(const uint32_t *s) {
    if (!s) {
        return;
    }

    // Test the element before stepping over it, so that an empty input is not
    // read past its terminator.
    unsigned int WideStringLength = 0;
    while (s[WideStringLength]) {
        WideStringLength++;
        if (WideStringLength == 4294967295UL) {
            throw Exception("[ConvertFromUTF32] Cannot find termination symbol in incoming string.");
        }
    }

    // std::string owns the scratch buffer, so it is released correctly even
    // when SetString rejects the result.
    std::string Encoded;
    Encoded.reserve(static_cast<std::string::size_type>(WideStringLength) * 4);
    for (unsigned int i = 0; i < WideStringLength; i++) {
        const uint32_t wc = s[i];
        if (wc < 0x80) {
            Encoded += static_cast<char>(wc);
        } else if (wc < 0x800) {
            Encoded += static_cast<char>((wc >> 6) | 0b11000000);
            Encoded += static_cast<char>((wc & 0b111111) | 0b10000000);
        } else if (wc < 0x10000) {
            Encoded += static_cast<char>((wc >> 12) | 0b11100000);
            Encoded += static_cast<char>(((wc >> 6) & 0b111111) | 0b10000000);
            Encoded += static_cast<char>((wc & 0b111111) | 0b10000000);
        } else if (wc < 0x200000) {
            Encoded += static_cast<char>((wc >> 18) | 0b11110000);
            Encoded += static_cast<char>(((wc >> 12) & 0b111111) | 0b10000000);
            Encoded += static_cast<char>(((wc >> 6) & 0b111111) | 0b10000000);
            Encoded += static_cast<char>((wc & 0b111111) | 0b10000000);
        } else {
            // Only 21 bits fit into a four byte sequence; anything larger
            // would be silently truncated into a different character.
            throw Exception("[ConvertFromUTF32] Value does not fit into a UTF8 sequence.");
        }
    }
    SetString(Encoded.c_str());
}
/**
 * Recounts the characters stored in Data and stores the result in
 * StringLength (0 when there is no buffer).
 */
void UTF8::String::CalculateStringLength() {
    if (!Data) {
        StringLength = 0;
        return;
    }
    // Read only access, so no limits are needed; the terminator has been
    // checked when the string was changed.
    unsigned int Offset = 0;
    unsigned int Characters = 0;
    for (;;) {
        Offset += GetSequenceLength(Data + Offset);
        ++Characters;
        if (!Data[Offset]) {
            break;
        }
    }
    StringLength = Characters;
}
/**
 * Validates a zero terminated byte sequence: every start byte must be
 * accepted by GetSequenceLength() and must be followed by the right number
 * of continuation bytes (10xxxxxx). A null pointer is accepted.
 *
 * @throws Exception on the first incorrect byte, or when the sequence grows
 *         close to the 32 bit limit without a terminator.
 */
void UTF8::String::CheckIfStringIsCorrect(const char *str) const {
    if (!str) {
        return;
    }
    // Read only access, so no limits are needed
    unsigned int Offset = 0;
    while (str[Offset]) {
        const unsigned int SequenceLength = GetSequenceLength(str + Offset);
        unsigned int Tail = 1;
        while (Tail < SequenceLength) {
            const unsigned char Byte = (unsigned char) str[Offset + Tail];
            if ((Byte >> 6) != 0b10) {
                throw Exception(std::string("[CheckIfStringIsCorrect] Incorrect byte in UTF8 sequence: \"") + str + "\"");
            }
            ++Tail;
        }
        Offset += SequenceLength;
        if (Offset >= 0xFFFFFFFF - 4) {
            throw Exception(std::string("[CheckIfStringIsCorrect] termination char was not found in string: \"") + str + "\"");
        }
    }
}
/**
 * Greater-than comparison, defined as "neither equal to s nor lower than s".
 */
bool UTF8::String::operator>(const UTF8::String &s) const {
    return !(*this == s) && !(*this < s);
}
/**
 * Lower-than comparison, character by character.
 *
 * A character with a shorter byte sequence is lower than one with a longer
 * sequence; characters of equal sequence length are compared byte by byte.
 * When one string is a prefix of the other, the shorter one is lower.
 */
bool UTF8::String::operator<(const UTF8::String &s) const {
    const unsigned int Common = (StringLength < s.StringLength) ? StringLength : s.StringLength;
    unsigned int MyOffset = 0;
    unsigned int OtherOffset = 0;
    unsigned int Compared = 0;
    while (Compared < Common) {
        const unsigned int MyWidth = GetSequenceLength(Data + MyOffset);
        const unsigned int OtherWidth = GetSequenceLength(s.Data + OtherOffset);
        if (MyWidth != OtherWidth) {
            return MyWidth < OtherWidth;
        }
        for (unsigned int Byte = 0; Byte < MyWidth; Byte++) {
            const char Mine = Data[MyOffset + Byte];
            const char Theirs = s.Data[OtherOffset + Byte];
            if (Mine != Theirs) {
                return Mine < Theirs;
            }
        }
        MyOffset += MyWidth;
        OtherOffset += OtherWidth;
        ++Compared;
    }
    // If this string is substring of s (from left side) then it is lower
    return StringLength < s.StringLength;
}
/**
 * Returns the character at position n as a one-character string, or an
 * empty string when n is outside the string.
 */
UTF8::String UTF8::String::operator[](unsigned int const n) const {
    // n is unsigned, so only the upper bound needs checking.
    if (n >= StringLength) {
        return UTF8::String();
    }
    // Characters have variable width: skip the n preceding ones.
    unsigned int pos = 0;
    for (unsigned int i = 0; i < n; i++) {
        pos += GetSequenceLength(Data + pos);
    }
    const unsigned int SequenceLength = GetSequenceLength(Data + pos);
    // A sequence is at most four bytes long; the fifth byte is the terminator.
    char t[5];
    memset(t, 0, sizeof (t));
    memcpy(t, Data + pos, SequenceLength);
    return UTF8::String(t);
}
/**
 * Returns the length in bytes (1 to 4) of the UTF8 sequence that begins with
 * the byte StartByte points to.
 *
 * @throws Exception when StartByte is null, points to the terminator, or
 *         points to a byte that cannot start a sequence.
 */
unsigned int UTF8::String::GetSequenceLength(const char * StartByte) const {
    // Data is null while the string is empty (e.g. during validation of the
    // very first content) and must not be concatenated as a C string.
    const char *OwnText = Data ? Data : "";

    // Only the first byte matters; measuring the whole rest of the string on
    // every call would make each caller loop quadratic.
    if (!StartByte || !StartByte[0]) {
        throw Exception(std::string("[GetSequenceLength] Invalid UTF8 start byte (it is empty). My own string is: [") + OwnText + "] Argument is: [" + (StartByte ? StartByte : "") + "]");
    }

    const unsigned char Byte = StartByte[0];
    if (Byte < 128) {
        return 1;
    }
    // Here we need back order due to mask operation
    if ((Byte >> 5) == 0b110) {
        return 2;
    }
    if ((Byte >> 4) == 0b1110) {
        return 3;
    }
    if ((Byte >> 3) == 0b11110) {
        return 4;
    }
    throw Exception(std::string("[GetSequenceLength] Invalid UTF8 start byte. My own string is: [") + OwnText + "] Argument is: [" + StartByte + "]");
}
/**
 * Copy assignment: replaces the content of this string with a copy of
 * Original. Assigning a string to itself does nothing.
 */
UTF8::String & UTF8::String::operator=(const String &Original) {
    // Compare the addresses directly; squeezing them through an integer type
    // can truncate them where pointers are wider than unsigned long.
    if (this != &Original) {
        // SetString releases the previous content itself (and empties the
        // string when Original holds no data).
        SetString(Original.Data);
    }
    return *this;
}
/**
 * Replaces the content of this string with the zero terminated UTF8
 * sequence str; a null or empty str empties the string.
 */
UTF8::String & UTF8::String::operator=(const char *str) {
    // Take a private copy first: str may point into our own buffer (for
    // example the result of ToConstCharPtr()), which is released while the
    // new content is installed.
    const std::string Copy(str ? str : "");
    SetString(Copy.c_str());
    return *this;
}
/**
 * Empties this string, then fills it from the zero terminated UTF32
 * sequence str through ConvertFromUTF32().
 */
UTF8::String & UTF8::String::operator=(const uint32_t *str) {
    this->Empty();
    this->ConvertFromUTF32(str);
    return *this;
}
/**
 * Empties this string, then fills it with the textual form of d using the
 * default arguments of ConvertFromDouble().
 */
UTF8::String & UTF8::String::operator=(long double d) {
    this->Empty();
    this->ConvertFromDouble(d);
    return *this;
}
/**
 * Releases the buffer (if any) and returns the object to the empty state.
 */
void UTF8::String::Empty() {
    if (DataArrayLength) {
        // The buffer is an array created with new[] (see SetString), so it
        // has to be released with the array form of delete.
        delete[] Data;
        InitString();
    }
}
/**
 * Returns the content as a std::string (empty when the string holds no data).
 */
std::string UTF8::String::ToString() const {
    // Data is only read when there is something stored.
    return DataArrayLength ? std::string(Data) : std::string();
}
/**
 * Parses the string as a decimal number of the form [-]digits[.digits].
 *
 * The bytes are scanned from the last one to the first one. An empty string
 * yields 0.
 *
 * @throws UTF8::Exception (StringToDoubleConversionError) when a byte is
 *         neither a digit, a ".", nor a "-" in the first position.
 */
double UTF8::String::ToDouble() const {
    // Accumulate in floating point: an int accumulator overflows as soon as
    // either part has more than about nine digits.
    double mul = 1;
    double int_part = 0;
    double prec_part = 0;
    for (unsigned int i = DataArrayLength; i-- > 0;) {
        const char c = Data[i];
        if ((c >= '0') && (c <= '9')) {
            int_part += (c - '0') * mul;
            mul *= 10;
        } else if (c == '.') {
            // Everything read so far was the fraction: scale it below one and
            // start over for the integer part.
            prec_part = int_part / mul;
            int_part = 0;
            mul = 1;
        } else if ((c == '-') && (i == 0)) {
            int_part = -int_part;
            prec_part = -prec_part;
        } else {
            UTF8::String err = "Cannot convert \"" + * this+"\" to double.";
            throw UTF8::Exception(err.ToConstCharPtr(), UTF8::Exception::StringToDoubleConversionError);
        }
    }
    return int_part + prec_part;
}
/**
 * Parses the string as a decimal integer of the form [-]digits[.digits];
 * a fraction part is discarded. An empty string yields 0.
 *
 * The bytes are scanned from the last one to the first one.
 *
 * @throws UTF8::Exception (StringToIntConversionError) when a byte is
 *         neither a digit, a ".", nor a "-" in the first position, or when
 *         the value does not fit into int64_t.
 */
int64_t UTF8::String::ToLong() const {
    // The magnitude is accumulated unsigned with explicit range checks;
    // overflowing a signed accumulator would be undefined behaviour.
    const uint64_t MaxUnsigned = ~static_cast<uint64_t>(0);
    const uint64_t SignBit = static_cast<uint64_t>(1) << 63;

    uint64_t magnitude = 0;
    uint64_t mul = 1;
    bool mulOverflowed = false; // the current decimal place is beyond 64 bits
    bool negative = false;
    bool failed = false;

    for (unsigned int i = DataArrayLength; i-- > 0;) {
        const char c = Data[i];
        if ((c >= '0') && (c <= '9')) {
            const uint64_t digit = static_cast<uint64_t>(c - '0');
            // Leading zeros are harmless however many there are.
            if (digit) {
                if (mulOverflowed || (digit > (MaxUnsigned - magnitude) / mul)) {
                    failed = true;
                    break;
                }
                magnitude += digit * mul;
            }
            if (!mulOverflowed) {
                if (mul > MaxUnsigned / 10) {
                    mulOverflowed = true;
                } else {
                    mul *= 10;
                }
            }
        } else if (c == '.') {
            // Everything read so far was the fraction: drop it.
            magnitude = 0;
            mul = 1;
            mulOverflowed = false;
        } else if ((c == '-') && (i == 0)) {
            negative = true;
        } else {
            failed = true;
            break;
        }
    }

    // int64_t holds magnitudes up to 2^63 - 1, and 2^63 for negative values.
    const uint64_t Limit = negative ? SignBit : (SignBit - 1);
    if (failed || (magnitude > Limit)) {
        UTF8::String err = "Cannot convert \"" + * this+"\" to number.";
        throw UTF8::Exception(err.ToConstCharPtr(), UTF8::Exception::StringToIntConversionError);
    }
    if (negative && magnitude) {
        // -(m - 1) - 1 stays in range even for the most negative value.
        return -static_cast<int64_t>(magnitude - 1) - 1;
    }
    return static_cast<int64_t>(magnitude);
}
/**
 * Returns a new string made of this string followed by the zero terminated
 * UTF8 sequence s.
 */
UTF8::String UTF8::String::operator+(const char *s) const {
    UTF8::String Joined(*this);
    Joined.AppendString(s);
    return Joined;
}
/**
 * Two strings are equal when they hold the same number of bytes and every
 * byte matches.
 */
bool UTF8::String::operator==(const UTF8::String &s) const {
    if (DataArrayLength != s.DataArrayLength) {
        return false;
    }
    unsigned int Index = 0;
    while (Index < DataArrayLength) {
        if (Data[Index] != s.Data[Index]) {
            return false;
        }
        ++Index;
    }
    return true;
}
/**
 * Negation of operator==(const UTF8::String &).
 */
bool UTF8::String::operator!=(const UTF8::String &s) const {
    const bool Equal = (*this == s);
    return !Equal;
}
/**
 * Byte-wise equality with a zero terminated C string. A null or empty str
 * is equal to an empty string only.
 */
bool UTF8::String::operator==(const char *str) const {
    const bool HasText = str && strlen(str);
    if (!HasText) {
        return StringLength == 0;
    }
    if (DataArrayLength != strlen(str)) {
        return false;
    }
    unsigned int Index = 0;
    while (Index < DataArrayLength) {
        if (Data[Index] != str[Index]) {
            return false;
        }
        ++Index;
    }
    return true;
}
/**
 * Negation of operator==(const char *).
 */
bool UTF8::String::operator!=(const char *str) const {
    const bool Equal = (*this == str);
    return !Equal;
}
const char * UTF8::String::ToConstCharPtr() const {
return Data;
}
unsigned int UTF8::String::Length() const {
return StringLength;
}
unsigned int UTF8::String::DataLength() const {
return DataArrayLength;
}
/**
 * Releases the buffer held by the string.
 */
UTF8::String::~String() {
    this->Empty();
}
/**
 * Copy constructor: starts empty and takes a private copy of the data held
 * by orig.
 */
UTF8::String::String(const String& orig) {
    this->InitString();
    this->SetString(orig.Data);
}