You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
lnav/src/base/intern_string.hh

795 lines
20 KiB
C++

/**
* Copyright (c) 2014, Timothy Stack
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* * Neither the name of Timothy Stack nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @file intern_string.hh
*/
#ifndef intern_string_hh
#define intern_string_hh
#include <ostream>
#include <string>
#include <vector>
#include <assert.h>
#include <string.h>
#include <sys/types.h>
#include "fmt/format.h"
#include "optional.hpp"
#include "scn/util/string_view.h"
#include "strnatcmp.h"
#include "ww898/cp_utf8.hpp"
struct string_fragment {
using iterator = const char*;
static string_fragment invalid()
{
string_fragment retval;
retval.invalidate();
return retval;
}
static string_fragment from_c_str(const char* str)
{
return string_fragment{str, 0, str != nullptr ? (int) strlen(str) : 0};
}
template<typename T, std::size_t N>
static string_fragment from_const(const T (&str)[N])
{
return string_fragment{str, 0, (int) N - 1};
}
static string_fragment from_str(const std::string& str)
{
return string_fragment{str.c_str(), 0, (int) str.size()};
}
static string_fragment from_substr(const std::string& str,
size_t offset,
size_t length)
{
return string_fragment{
str.c_str(), (int) offset, (int) (offset + length)};
}
static string_fragment from_str_range(const std::string& str,
size_t begin,
size_t end)
{
return string_fragment{str.c_str(), (int) begin, (int) end};
}
static string_fragment from_bytes(const char* bytes, size_t len)
{
return string_fragment{bytes, 0, (int) len};
}
static string_fragment from_bytes(const unsigned char* bytes, size_t len)
{
return string_fragment{(const char*) bytes, 0, (int) len};
}
static string_fragment from_memory_buffer(const fmt::memory_buffer& buf)
{
return string_fragment{buf.data(), 0, (int) buf.size()};
}
static string_fragment from_byte_range(const char* bytes,
size_t begin,
size_t end)
{
return string_fragment{bytes, (int) begin, (int) end};
}
explicit string_fragment(const char* str = "", int begin = 0, int end = -1)
: sf_string(str), sf_begin(begin), sf_end(end == -1 ? strlen(str) : end)
{
}
explicit string_fragment(const unsigned char* str,
int begin = 0,
int end = -1)
: sf_string((const char*) str), sf_begin(begin),
sf_end(end == -1 ? strlen((const char*) str) : end)
{
}
string_fragment(const std::string& str)
: sf_string(str.c_str()), sf_begin(0), sf_end(str.length())
{
}
bool is_valid() const
{
return this->sf_begin != -1 && this->sf_begin <= this->sf_end;
}
int length() const { return this->sf_end - this->sf_begin; }
Result<ssize_t, const char*> utf8_length() const;
const char* data() const { return &this->sf_string[this->sf_begin]; }
const unsigned char* udata() const
{
return (const unsigned char*) &this->sf_string[this->sf_begin];
}
char front() const { return this->sf_string[this->sf_begin]; }
uint32_t front_codepoint() const
{
size_t index = 0;
try {
return ww898::utf::utf8::read(
[this, &index]() { return this->data()[index++]; });
} catch (const std::runtime_error& e) {
return this->data()[0];
}
}
char back() const { return this->sf_string[this->sf_end - 1]; }
iterator begin() const { return &this->sf_string[this->sf_begin]; }
iterator end() const { return &this->sf_string[this->sf_end]; }
bool empty() const { return !this->is_valid() || length() == 0; }
Result<ssize_t, const char*> codepoint_to_byte_index(ssize_t cp_index) const
{
ssize_t retval = 0;
while (cp_index > 0) {
if (retval >= this->length()) {
return Err("index is beyond the end of the string");
}
auto ch_len = TRY(ww898::utf::utf8::char_size([this, retval]() {
return std::make_pair(this->data()[retval],
this->length() - retval - 1);
}));
retval += ch_len;
cp_index -= 1;
}
return Ok(retval);
}
char operator[](int index) const
{
return this->sf_string[sf_begin + index];
}
bool operator==(const std::string& str) const
{
if (this->length() != (int) str.length()) {
return false;
}
return memcmp(
&this->sf_string[this->sf_begin], str.c_str(), str.length())
== 0;
}
bool operator==(const string_fragment& sf) const
{
if (this->length() != sf.length()) {
return false;
}
return memcmp(this->data(), sf.data(), sf.length()) == 0;
}
bool iequal(const string_fragment& sf) const
{
if (this->length() != sf.length()) {
return false;
}
return strnatcasecmp(
this->length(), this->data(), sf.length(), sf.data())
== 0;
}
bool operator==(const char* str) const
{
size_t len = strlen(str);
return len == (size_t) this->length()
&& strncmp(this->data(), str, this->length()) == 0;
}
bool operator!=(const char* str) const { return !(*this == str); }
bool startswith(const char* prefix) const
{
const auto* iter = this->begin();
while (*prefix != '\0' && iter < this->end() && *prefix == *iter) {
prefix += 1;
iter += 1;
}
return *prefix == '\0';
}
bool endswith(const char* suffix) const
{
int suffix_len = strlen(suffix);
if (suffix_len > this->length()) {
return false;
}
const auto* curr = this->end() - suffix_len;
while (*suffix != '\0' && *curr == *suffix) {
suffix += 1;
curr += 1;
}
return *suffix == '\0';
}
string_fragment substr(int begin) const
{
return string_fragment{
this->sf_string, this->sf_begin + begin, this->sf_end};
}
string_fragment sub_range(int begin, int end) const
{
return string_fragment{
this->sf_string, this->sf_begin + begin, this->sf_begin + end};
}
nonstd::optional<size_t> find(char ch) const
{
for (int lpc = this->sf_begin; lpc < this->sf_end; lpc++) {
if (this->sf_string[lpc] == ch) {
return lpc;
}
}
return nonstd::nullopt;
}
template<typename P>
string_fragment find_left_boundary(size_t start, P&& predicate) const
{
assert((int) start <= this->length());
if (start > 0 && start == this->length()) {
start -= 1;
}
while (start > 0) {
if (predicate(this->data()[start])) {
start += 1;
break;
}
start -= 1;
}
return string_fragment{
this->sf_string,
(int) start,
this->sf_end,
};
}
template<typename P>
string_fragment find_right_boundary(size_t start, P&& predicate) const
{
while ((int) start < this->length()) {
if (predicate(this->data()[start])) {
break;
}
start += 1;
}
return string_fragment{
this->sf_string,
this->sf_begin,
this->sf_begin + (int) start,
};
}
template<typename P>
string_fragment find_boundaries_around(size_t start, P&& predicate) const
{
return this->template find_left_boundary(start, predicate)
.find_right_boundary(0, predicate);
}
nonstd::optional<std::pair<uint32_t, string_fragment>> consume_codepoint()
const
{
auto cp = this->front_codepoint();
auto index_res = this->codepoint_to_byte_index(1);
if (index_res.isErr()) {
return nonstd::nullopt;
}
return std::make_pair(cp, this->substr(index_res.unwrap()));
}
template<typename P>
nonstd::optional<string_fragment> consume(P predicate) const
{
int consumed = 0;
while (consumed < this->length()) {
if (!predicate(this->data()[consumed])) {
break;
}
consumed += 1;
}
if (consumed == 0) {
return nonstd::nullopt;
}
return string_fragment{
this->sf_string,
this->sf_begin + consumed,
this->sf_end,
};
}
nonstd::optional<string_fragment> consume_n(int amount) const;
template<typename P>
string_fragment skip(P predicate) const
{
int offset = 0;
while (offset < this->length() && predicate(this->data()[offset])) {
offset += 1;
}
return string_fragment{
this->sf_string,
this->sf_begin + offset,
this->sf_end,
};
}
using split_result
= nonstd::optional<std::pair<string_fragment, string_fragment>>;
template<typename P>
split_result split_while(P&& predicate) const
{
int consumed = 0;
while (consumed < this->length()) {
if (!predicate(this->data()[consumed])) {
break;
}
consumed += 1;
}
if (consumed == 0) {
return nonstd::nullopt;
}
return std::make_pair(
string_fragment{
this->sf_string,
this->sf_begin,
this->sf_begin + consumed,
},
string_fragment{
this->sf_string,
this->sf_begin + consumed,
this->sf_end,
});
}
split_result split_n(int amount) const;
std::vector<string_fragment> split_lines() const;
struct tag1 {
const char t_value;
bool operator()(char ch) const { return this->t_value == ch; }
};
struct quoted_string_body {
bool qs_in_escape{false};
bool operator()(char ch)
{
if (this->qs_in_escape) {
this->qs_in_escape = false;
return true;
} else if (ch == '\\') {
this->qs_in_escape = true;
return true;
} else if (ch == '"') {
return false;
} else {
return true;
}
}
};
const char* to_string(char* buf) const
{
memcpy(buf, this->data(), this->length());
buf[this->length()] = '\0';
return buf;
}
std::string to_string() const
{
return {this->data(), (size_t) this->length()};
}
void clear()
{
this->sf_begin = 0;
this->sf_end = 0;
}
void invalidate()
{
this->sf_begin = -1;
this->sf_end = -1;
}
string_fragment trim(const char* tokens) const;
string_fragment trim() const;
string_fragment prepend(const char* str, int amount) const
{
return string_fragment{
str,
this->sf_begin + amount,
this->sf_end + amount,
};
}
string_fragment erase_before(const char* str, int amount) const
{
return string_fragment{
str,
this->sf_begin - amount,
this->sf_end - amount,
};
}
string_fragment erase(const char* str, int amount) const
{
return string_fragment{
str,
this->sf_begin,
this->sf_end - amount,
};
}
template<typename A>
const char* to_c_str(A allocator) const
{
auto* retval = allocator.allocate(this->length() + 1);
memcpy(retval, this->data(), this->length());
retval[this->length()] = '\0';
return retval;
}
template<typename A>
string_fragment to_owned(A allocator) const
{
return string_fragment{
this->template to_c_str(allocator),
0,
this->length(),
};
}
scn::string_view to_string_view() const
{
return scn::string_view{this->begin(), this->end()};
}
const char* sf_string;
int sf_begin;
int sf_end;
};
inline bool
operator==(const std::string& left, const string_fragment& right)
{
return right == left;
}
inline bool
operator<(const char* left, const string_fragment& right)
{
int rc = strncmp(left, right.data(), right.length());
return rc < 0;
}
inline void
operator+=(std::string& left, const string_fragment& right)
{
left.append(right.data(), right.length());
}
inline bool
operator<(const string_fragment& left, const char* right)
{
return strncmp(left.data(), right, left.length()) < 0;
}
inline std::ostream&
operator<<(std::ostream& os, const string_fragment& sf)
{
os.write(sf.data(), sf.length());
return os;
}
class intern_string {
public:
static const intern_string* lookup(const char* str, ssize_t len) noexcept;
static const intern_string* lookup(const string_fragment& sf) noexcept;
static const intern_string* lookup(const std::string& str) noexcept;
const char* get() const { return this->is_str.c_str(); };
size_t size() const { return this->is_str.size(); }
std::string to_string() const { return this->is_str; }
string_fragment to_string_fragment() const
{
return string_fragment{this->is_str};
}
bool startswith(const char* prefix) const;
struct intern_table;
static std::shared_ptr<intern_table> get_table_lifetime();
private:
friend intern_table;
intern_string(const char* str, ssize_t len)
: is_next(nullptr), is_str(str, (size_t) len)
{
}
intern_string* is_next;
std::string is_str;
};
using intern_table_lifetime = std::shared_ptr<intern_string::intern_table>;
class intern_string_t {
public:
using iterator = const char*;
intern_string_t(const intern_string* is = nullptr) : ist_interned_string(is)
{
}
const intern_string* unwrap() const { return this->ist_interned_string; }
void clear() { this->ist_interned_string = nullptr; };
bool empty() const { return this->ist_interned_string == nullptr; }
const char* get() const
{
if (this->empty()) {
return "";
}
return this->ist_interned_string->get();
}
const char* c_str() const { return this->get(); }
iterator begin() const { return this->get(); }
iterator end() const { return this->get() + this->size(); }
size_t size() const
{
if (this->ist_interned_string == nullptr) {
return 0;
}
return this->ist_interned_string->size();
}
size_t hash() const
{
auto ptr = (uintptr_t) this->ist_interned_string;
return ptr;
}
std::string to_string() const
{
if (this->ist_interned_string == nullptr) {
return "";
}
return this->ist_interned_string->to_string();
}
string_fragment to_string_fragment() const
{
if (this->ist_interned_string == nullptr) {
return string_fragment{"", 0, 0};
}
return this->ist_interned_string->to_string_fragment();
}
bool operator<(const intern_string_t& rhs) const
{
return strcmp(this->get(), rhs.get()) < 0;
}
bool operator==(const intern_string_t& rhs) const
{
return this->ist_interned_string == rhs.ist_interned_string;
}
bool operator!=(const intern_string_t& rhs) const
{
return !(*this == rhs);
}
bool operator==(const char* rhs) const
{
return strcmp(this->get(), rhs) == 0;
}
bool operator!=(const char* rhs) const
{
return strcmp(this->get(), rhs) != 0;
}
static bool case_lt(const intern_string_t& lhs, const intern_string_t& rhs)
{
return strnatcasecmp(lhs.size(), lhs.get(), rhs.size(), rhs.get()) < 0;
}
private:
const intern_string* ist_interned_string;
};
unsigned long hash_str(const char* str, size_t len);
namespace fmt {
template<>
struct formatter<string_fragment> : formatter<string_view> {
template<typename FormatContext>
auto format(const string_fragment& sf, FormatContext& ctx)
{
return formatter<string_view>::format(
string_view{sf.data(), (size_t) sf.length()}, ctx);
}
};
template<>
struct formatter<intern_string_t> : formatter<string_view> {
template<typename FormatContext>
auto format(const intern_string_t& is, FormatContext& ctx)
{
return formatter<string_view>::format(
string_view{is.get(), (size_t) is.size()}, ctx);
}
};
} // namespace fmt
namespace std {
template<>
struct hash<const intern_string_t> {
std::size_t operator()(const intern_string_t& ist) const
{
return ist.hash();
}
};
} // namespace std
inline bool
operator<(const char* left, const intern_string_t& right)
{
int rc = strncmp(left, right.get(), right.size());
return rc < 0;
}
inline bool
operator<(const intern_string_t& left, const char* right)
{
return strncmp(left.get(), right, left.size()) < 0;
}
inline bool
operator==(const intern_string_t& left, const string_fragment& sf)
{
return ((int) left.size() == sf.length())
&& (memcmp(left.get(), sf.data(), left.size()) == 0);
}
inline bool
operator==(const string_fragment& left, const intern_string_t& right)
{
return (left.length() == (int) right.size())
&& (memcmp(left.data(), right.get(), left.length()) == 0);
}
namespace std {
inline string
to_string(const string_fragment& s)
{
return {s.data(), (size_t) s.length()};
}
inline string
to_string(const intern_string_t& s)
{
return s.to_string();
}
} // namespace std
inline string_fragment
to_string_fragment(const string_fragment& s)
{
return s;
}
inline string_fragment
to_string_fragment(const intern_string_t& s)
{
return string_fragment(s.get(), 0, s.size());
}
inline string_fragment
to_string_fragment(const std::string& s)
{
return string_fragment(s.c_str(), 0, s.length());
}
struct frag_hasher {
size_t operator()(const string_fragment& sf) const
{
return hash_str(sf.data(), sf.length());
}
};
#endif