ext-boost/boost/url/detail/impl/normalize.ipp
2023-05-22 18:45:02 +10:00

850 lines
19 KiB
C++

//
// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
// Copyright (c) 2022 Alan de Freitas (alandefreitas@gmail.com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/url
//
#ifndef BOOST_URL_DETAIL_IMPL_NORMALIZE_IPP
#define BOOST_URL_DETAIL_IMPL_NORMALIZE_IPP
#include <boost/url/detail/normalize.hpp>
#include <boost/url/segments_encoded_view.hpp>
#include <boost/assert.hpp>
#include <cstring>
namespace boost {
namespace urls {
namespace detail {
// Remove one logical character from the front of `s`
// and store it in `c`.
//
// A leading '%' is treated as the start of a three
// character escape, which is decoded into `c` with
// decode_unsafe; any other character is copied as-is.
// `n` is incremented once per call.
//
// Nothing is validated here: `s` must not be empty.
void
pop_encoded_front(
    string_view& s,
    char& c,
    std::size_t& n) noexcept
{
    bool const is_escape = s.front() == '%';
    if(is_escape)
    {
        // decode the "%XX" triplet into the single byte `c`
        detail::decode_unsafe(
            &c,
            &c + 1,
            s.substr(0, 3));
    }
    else
    {
        c = s.front();
    }
    s.remove_prefix(is_escape ? 3 : 1);
    ++n;
}
// Three-way comparison of two percent-encoded strings,
// one logical character at a time as produced by
// pop_encoded_front.
//
// Returns -1, 0 or 1. The first differing character
// decides. If one input runs out first, the result of
// decode_bytes_unsafe on each remainder is added to the
// number of characters popped so far and the two totals
// are compared.
int
compare_encoded(
    string_view lhs,
    string_view rhs) noexcept
{
    std::size_t len_l = 0;
    std::size_t len_r = 0;
    char ch_l = 0;
    char ch_r = 0;
    while(!(lhs.empty() || rhs.empty()))
    {
        pop_encoded_front(lhs, ch_l, len_l);
        pop_encoded_front(rhs, ch_r, len_r);
        if(ch_l != ch_r)
            return ch_l < ch_r ? -1 : 1;
    }
    // at most one side still has characters left
    len_l += detail::decode_bytes_unsafe(lhs);
    len_r += detail::decode_bytes_unsafe(rhs);
    if(len_l == len_r)
        return 0;
    return len_l < len_r ? -1 : 1;
}
// Feed `hasher` with the logical characters of the
// percent-encoded string `s`, in order, as produced
// by pop_encoded_front.
void
digest_encoded(
    string_view s,
    fnv_1a& hasher) noexcept
{
    // pop_encoded_front wants a counter; its value
    // is not used here
    std::size_t popped = 0;
    char ch = 0;
    for(; !s.empty(); hasher.put(ch))
        pop_encoded_front(s, ch, popped);
}
// Same as compare_encoded, except that each popped
// character is passed through grammar::to_lower before
// being compared.
//
// Returns -1, 0 or 1.
int
ci_compare_encoded(
    string_view lhs,
    string_view rhs) noexcept
{
    std::size_t len_l = 0;
    std::size_t len_r = 0;
    char ch_l = 0;
    char ch_r = 0;
    while(!(lhs.empty() || rhs.empty()))
    {
        pop_encoded_front(lhs, ch_l, len_l);
        pop_encoded_front(rhs, ch_r, len_r);
        char const lo_l = grammar::to_lower(ch_l);
        char const lo_r = grammar::to_lower(ch_r);
        if(lo_l != lo_r)
            return lo_l < lo_r ? -1 : 1;
    }
    // at most one side still has characters left
    len_l += detail::decode_bytes_unsafe(lhs);
    len_r += detail::decode_bytes_unsafe(rhs);
    if(len_l == len_r)
        return 0;
    return len_l < len_r ? -1 : 1;
}
// Feed `hasher` with the logical characters of the
// percent-encoded string `s`, each one passed through
// grammar::to_lower first.
void
ci_digest_encoded(
    string_view s,
    fnv_1a& hasher) noexcept
{
    // pop_encoded_front wants a counter; its value
    // is not used here
    std::size_t popped = 0;
    char ch = 0;
    while(!s.empty())
    {
        pop_encoded_front(s, ch, popped);
        hasher.put(grammar::to_lower(ch));
    }
}
// Three-way comparison of two strings, character by
// character using the built-in `<` on `char`.
//
// Returns -1, 0 or 1. When one string is a prefix of
// the other, the shorter one orders first.
int
compare(
    string_view lhs,
    string_view rhs) noexcept
{
    std::size_t const len_l = lhs.size();
    std::size_t const len_r = rhs.size();
    std::size_t const common = (std::min)(len_l, len_r);
    for(std::size_t pos = 0; pos != common; ++pos)
    {
        char const a = lhs[pos];
        char const b = rhs[pos];
        if(a != b)
            return a < b ? -1 : 1;
    }
    // equal over the common prefix: length decides
    if(len_l == len_r)
        return 0;
    return len_l < len_r ? -1 : 1;
}
// Same as compare, except that every character is
// passed through grammar::to_lower before being
// compared.
//
// Returns -1, 0 or 1. When one string is a prefix of
// the other, the shorter one orders first.
int
ci_compare(
    string_view lhs,
    string_view rhs) noexcept
{
    std::size_t const len_l = lhs.size();
    std::size_t const len_r = rhs.size();
    std::size_t const common = (std::min)(len_l, len_r);
    for(std::size_t pos = 0; pos != common; ++pos)
    {
        char const a = grammar::to_lower(lhs[pos]);
        char const b = grammar::to_lower(rhs[pos]);
        if(a != b)
            return a < b ? -1 : 1;
    }
    // equal over the common prefix: length decides
    if(len_l == len_r)
        return 0;
    return len_l < len_r ? -1 : 1;
}
// Feed `hasher` with every character of `s`, in order,
// each one passed through grammar::to_lower first.
void
ci_digest(
    string_view s,
    fnv_1a& hasher) noexcept
{
    for(auto it = s.begin(); it != s.end(); ++it)
        hasher.put(grammar::to_lower(*it));
}
// Check whether the encoded path `lhs` begins with
// `rhs`, comparing one logical character at a time.
//
// Returns the number of characters of `lhs` covered by
// the match, or 0 when `rhs` is not a prefix. Note that
// an empty `rhs` therefore also yields 0.
//
// Escapes are decoded before comparing, except for an
// escape that decodes to '/': its three characters are
// compared literally, so it never equals a plain '/'.
std::size_t
path_starts_with(
    string_view lhs,
    string_view rhs) noexcept
{
    // read the logical character at `pos` into `out`
    // and move `pos` past it
    auto read_front = [](
        string_view::iterator& pos,
        char& out)
    {
        if(*pos == '%')
        {
            detail::decode_unsafe(
                &out,
                &out + 1,
                string_view(pos, 3));
            if(out != '/')
            {
                pos += 3;
                return;
            }
            // escaped slash: yield only the raw '%'
        }
        out = *pos;
        ++pos;
    };

    auto l_pos = lhs.begin();
    auto r_pos = rhs.begin();
    auto const l_end = lhs.end();
    auto const r_end = rhs.end();
    char l_ch = 0;
    char r_ch = 0;
    while(
        l_pos < l_end &&
        r_pos < r_end)
    {
        read_front(l_pos, l_ch);
        read_front(r_pos, r_ch);
        if(l_ch != r_ch)
            return 0;
    }
    // a match requires that all of `rhs` was consumed
    if(r_pos != r_end)
        return 0;
    return l_pos - lhs.begin();
}
// Check whether the encoded path `lhs` ends with
// `rhs`, comparing one logical character at a time
// from the back.
//
// Returns the number of characters of `lhs` covered by
// the match, or 0 when `rhs` is not a suffix. Note that
// an empty `rhs` therefore also yields 0.
//
// Escapes are decoded before comparing, except for an
// escape that decodes to '/': its three characters are
// compared literally, so it never equals a plain '/'.
std::size_t
path_ends_with(
    string_view lhs,
    string_view rhs) noexcept
{
    // read the logical character that ends the range
    // [first, last) into `out` and move `last` before it
    auto read_back = [](
        string_view::iterator& first,
        string_view::iterator& last,
        char& out)
    {
        bool const maybe_escape =
            (last - first) >= 3 &&
            *(std::prev(last, 3)) == '%';
        if(maybe_escape)
        {
            detail::decode_unsafe(
                &out,
                &out + 1,
                string_view(std::prev(
                    last, 3), 3));
            if(out != '/')
            {
                last -= 3;
                return;
            }
            // escaped slash: yield only its last raw char
        }
        out = *--last;
    };

    auto l_first = lhs.begin();
    auto r_first = rhs.begin();
    auto l_last = lhs.end();
    auto r_last = rhs.end();
    char l_ch = 0;
    char r_ch = 0;
    while(
        l_first < l_last &&
        r_first < r_last)
    {
        read_back(l_first, l_last, l_ch);
        read_back(r_first, r_last, r_ch);
        if(l_ch != r_ch)
            return 0;
    }
    // a match requires that all of `rhs` was consumed
    if(r_first != r_last)
        return 0;
    return lhs.end() - l_last;
}
// Write the path `s` with its "." and ".." segments
// resolved into the buffer starting at `dest0`, and
// return the number of characters written.
//
// The algorithm follows remove_dot_segments from
// rfc3986 5.2.4, with errata 4547: ".." segments that
// have nothing to remove are kept in the output.
// Dots in `s` may be plain or written as "%2E"/"%2e".
//
// `end` is the end of the destination buffer. It is
// only checked with BOOST_ASSERT; the caller must
// provide enough room. Characters are copied with
// std::memmove.
std::size_t
remove_dot_segments(
    char* dest0,
    char const* end,
    string_view s) noexcept
{
    // 1. The input buffer `s` is initialized with
    // the now-appended path components and the
    // output buffer `dest0` is initialized to
    // the empty string.
    char* dest = dest0;

    // Step 2 is a loop through 5 production rules:
    // https://www.rfc-editor.org/rfc/rfc3986#section-5.2.4
    //
    // Not every rule can follow every other rule,
    // which enables some optimizations.
    //
    // Initial:
    // - Rule A: handle initial dots
    //   If the input buffer begins with a
    //   prefix of "../" or "./", then remove
    //   that prefix from the input buffer.
    //   Rule A can only happen at the beginning.
    //   Errata 4547: Keep "../" in the beginning
    //   https://www.rfc-editor.org/errata/eid4547
    //
    // Then:
    // - Rule D: ignore a final ".." or "."
    //   if the input buffer consists only of "."
    //   or "..", then remove that from the input
    //   buffer.
    //   Rule D can only happen after Rule A because:
    //   - B and C write "/" to the input
    //   - E writes "/" to input or returns
    //
    // Then:
    // - Rule B: ignore ".": write "/" to the input
    // - Rule C: apply "..": remove seg and write "/"
    // - Rule E: copy complete segment
    auto append =
        [](char*& first, char const* last, string_view in)
    {
        // copy `in` to `first` and advance `first`
        BOOST_ASSERT(in.size() <= std::size_t(last - first));
        std::memmove(first, in.data(), in.size());
        first += in.size();
        ignore_unused(last);
    };

    auto dot_starts_with = [](
        string_view str, string_view dots, std::size_t& n)
    {
        // starts_with where each '.' of the pattern
        // `dots` matches either a plain '.' or an
        // encoded "%2E"/"%2e" in `str`. On success
        // `n` is how many chars of `str` matched;
        // on failure `n` is 0.
        n = 0;
        for (char c: dots)
        {
            if (str.empty())
            {
                n = 0;
                return false;
            }
            else if (str.starts_with(c))
            {
                str.remove_prefix(1);
                ++n;
            }
            else if (c == '.' &&
                     str.size() > 2 &&
                     str[0] == '%' &&
                     str[1] == '2' &&
                     (str[2] == 'e' ||
                      str[2] == 'E'))
            {
                // only a '.' in the pattern may match
                // an encoded dot: a '/' in the pattern
                // must match a literal '/', which the
                // callers rely on when they keep the
                // last matched char with `n - 1`
                str.remove_prefix(3);
                n += 3;
            }
            else
            {
                n = 0;
                return false;
            }
        }
        return true;
    };

    auto dot_equal = [&dot_starts_with](
        string_view str, string_view dots)
    {
        // the whole of `str` must match `dots`;
        // a failed match leaves n == 0, so the
        // result of the match is checked too
        std::size_t n = 0;
        return
            dot_starts_with(str, dots, n) &&
            n == str.size();
    };

    // Rule A
    std::size_t n;
    while (!s.empty())
    {
        if (dot_starts_with(s, "../", n))
        {
            // Errata 4547
            append(dest, end, "../");
            s.remove_prefix(n);
            continue;
        }
        else if (!dot_starts_with(s, "./", n))
        {
            break;
        }
        s.remove_prefix(n);
    }

    // Rule D
    if( dot_equal(s, "."))
    {
        s = {};
    }
    else if( dot_equal(s, "..") )
    {
        // Errata 4547
        append(dest, end, "..");
        s = {};
    }

    // 2. While the input buffer is not empty,
    // loop as follows:
    while (!s.empty())
    {
        // Rule B
        if (dot_starts_with(s, "/./", n))
        {
            // keep the trailing '/' in the input
            s.remove_prefix(n - 1);
            continue;
        }
        if (dot_equal(s, "/."))
        {
            // We can't remove "." from a string_view
            // So what we do here is equivalent to
            // replacing s with '/' as required
            // in Rule B and executing the next
            // iteration, which would append this
            // '/' to the output, as required by
            // Rule E
            append(dest, end, s.substr(0, 1));
            s = {};
            break;
        }

        // Rule C
        if (dot_starts_with(s, "/../", n))
        {
            std::size_t p = string_view(
                dest0, dest - dest0).find_last_of('/');
            if (p != string_view::npos)
            {
                // output has multiple segments
                // "erase" [p, end] if not "/.."
                string_view last_seg(dest0 + p, dest - (dest0 + p));
                if (!dot_equal(last_seg, "/.."))
                    dest = dest0 + p;
                else
                    append(dest, end, "/..");
            }
            else if (dest0 != dest)
            {
                // one segment in the output:
                // drop it together with the
                // leading '/' of the input
                dest = dest0;
                s.remove_prefix(1);
            }
            else
            {
                // output is empty
                append(dest, end, "/..");
            }
            // keep the trailing '/' in the input
            s.remove_prefix(n-1);
            continue;
        }
        if (dot_equal(s, "/.."))
        {
            // NOTE(review): unlike the "/../" case
            // above, this branch does not check
            // whether the last output segment is
            // itself "/.." - confirm the asymmetry
            // is intended.
            std::size_t p = string_view(
                dest0, dest - dest0).find_last_of('/');
            if (p != string_view::npos)
            {
                // erase [p, end]
                dest = dest0 + p;
                append(dest, end, "/");
            }
            else if (dest0 != dest)
            {
                dest = dest0;
            }
            else
            {
                append(dest, end, "/..");
            }
            s = {};
            break;
        }

        // Rule E
        std::size_t p = s.find_first_of('/', 1);
        if (p != string_view::npos)
        {
            append(dest, end, s.substr(0, p));
            s.remove_prefix(p);
        }
        else
        {
            append(dest, end, s);
            s = {};
        }
    }

    // 3. Finally, the output buffer is set
    // as the result of remove_dot_segments,
    // and we return its size
    return dest - dest0;
}
// Remove one logical character from the back of the
// encoded path `s` and return it.
//
// If the last three characters start with '%', they are
// decoded as an escape and removed together, unless the
// escape decodes to '/': in that case only the last raw
// character is removed and returned, so an escaped
// slash is never reported as '/'.
//
// `s` must not be empty.
char
path_pop_back( string_view& s )
{
    bool const maybe_escape =
        s.size() >= 3 &&
        *std::prev(s.end(), 3) == '%';
    if (maybe_escape)
    {
        char decoded = 0;
        detail::decode_unsafe(
            &decoded, &decoded + 1, s.substr(s.size() - 3));
        if (decoded != '/')
        {
            s.remove_suffix(3);
            return decoded;
        }
        // escaped slash: fall through and pop one raw char
    }
    char const last = s.back();
    s.remove_suffix(1);
    return last;
}
// Remove the last segment from the path `s`, working
// from the back as if dot segments were being removed,
// and store what should be emitted for it in `c`.
//
// `level` counts the ".." segments seen so far whose
// matching regular segment has not been discarded yet;
// it is carried between calls. `r` selects what happens
// when `s` runs out while `level` is still non-zero:
// if set, `c` becomes "/" and `level` is cleared;
// otherwise `c` becomes "/.." or "/../" and `level` is
// decremented by one.
void
pop_last_segment(
    string_view& s,
    string_view& c,
    std::size_t& level,
    bool r) noexcept
{
    c = {};
    while (!s.empty())
    {
        // Rule B of rfc3986 5.2.4, applied at the
        // end of the input: a trailing "/./" or "/."
        // where "." is a complete segment
        std::size_t matched =
            detail::path_ends_with(s, "/./");
        if (matched != 0)
        {
            c = s.substr(s.size() - matched);
            s.remove_suffix(matched);
            continue;
        }
        matched = detail::path_ends_with(s, "/.");
        if (matched != 0)
        {
            // only the leading '/' is kept in `c`
            c = s.substr(s.size() - matched, 1);
            s.remove_suffix(matched);
            continue;
        }

        // Rule C, applied at the end of the input:
        // a trailing "/../" or "/.." where ".." is
        // a complete segment. One more regular
        // segment will have to be discarded.
        matched = detail::path_ends_with(s, "/../");
        if (matched == 0)
            matched = detail::path_ends_with(s, "/..");
        if (matched != 0)
        {
            c = s.substr(s.size() - matched);
            s.remove_suffix(matched);
            ++level;
            continue;
        }

        // Rule E, applied at the end of the input:
        // take everything after the last '/' that
        // is not the final character of `s`, or
        // the whole input if there is none
        std::size_t slash = string_view::npos;
        if (s.size() > 1)
            slash = s.find_last_of('/', s.size() - 2);
        if (slash == string_view::npos)
        {
            c = s;
            s = {};
        }
        else
        {
            c = s.substr(slash + 1);
            s.remove_suffix(c.size());
        }

        // no pending "..": this is the segment
        if (level == 0)
            return;
        // otherwise the segment is discarded
        if (!s.empty())
            --level;
    }

    // the input is exhausted
    if (level == 0)
    {
        c = {};
        return;
    }
    if (r)
    {
        // unmatched ".." segments are dropped
        c = "/";
        level = 0;
        return;
    }
    // emit one unmatched ".." per call
    if (c.empty())
        c = "/..";
    else
        c = "/../";
    --level;
}
// Feed `hasher` with the characters of the path `s`,
// visiting it from the back: segments are obtained one
// at a time with pop_last_segment and each segment is
// consumed from its end with path_pop_back.
//
// `remove_unmatched` is forwarded to pop_last_segment.
void
normalized_path_digest(
    string_view s,
    bool remove_unmatched,
    fnv_1a& hasher) noexcept
{
    string_view segment;
    std::size_t pending = 0;
    for (;;)
    {
        pop_last_segment(
            s, segment, pending, remove_unmatched);
        while (!segment.empty())
            hasher.put(path_pop_back(segment));
        // always runs at least once, like do/while
        if (s.empty())
            break;
    }
}
// Compare two segment ranges as if both paths had
// their "." and ".." segments resolved, without
// building the resolved paths.
//
// Returns -1, 0 or 1. Both paths are read backwards,
// one decoded character at a time. The longer one is
// first advanced until the same number of characters
// is left in both, then the remaining characters are
// compared pairwise. The difference closest to the
// front decides; if there is none, the shorter path
// orders first.
int
segments_compare(
    segments_encoded_view seg0,
    segments_encoded_view seg1) noexcept
{
    // calculate path size as if it were normalized
    auto normalized_size =
        [](segments_encoded_view seg) -> std::size_t
    {
        if (seg.empty())
            return seg.is_absolute();

        std::size_t n = 0;
        std::size_t skip = 0;
        auto begin = seg.begin();
        auto it = seg.end();
        while (it != begin)
        {
            --it;
            decode_view dseg = **it;
            if (dseg == "..")
                ++skip;
            else if (dseg != ".")
            {
                if (skip)
                    --skip;
                else
                    // segment plus its leading '/'
                    n += dseg.size() + 1;
            }
        }
        // each unmatched ".." counts as "/.."
        n += skip * 3;
        // a relative path has no leading '/', so
        // drop the separator counted for its first
        // segment. When no segment survived there
        // is nothing to drop: decrementing would
        // wrap the unsigned counter around.
        if (n != 0 && !seg.is_absolute())
            --n;
        return n;
    };

    // find the normalized size for the comparison
    std::size_t n0 = normalized_size(seg0);
    std::size_t n1 = normalized_size(seg1);
    std::size_t n00 = n0;
    std::size_t n10 = n1;

    // consume the last char from a segment range
    //
    // n:        chars left in the normalized path
    // dseg:     segment currently being read
    // begin:    first segment of the range
    // it:       position of `dseg` in the range
    // cit:      read position inside `dseg`
    // skip:     ".." segments still to be matched
    // at_slash: whether the separator that precedes
    //           `dseg` has already been produced
    auto consume_last =
        [](
            std::size_t& n,
            decode_view& dseg,
            segments_encoded_view::iterator& begin,
            segments_encoded_view::iterator& it,
            decode_view::iterator& cit,
            std::size_t& skip,
            bool& at_slash) -> char
    {
        if (cit != dseg.begin())
        {
            // return last char from current segment
            at_slash = false;
            --cit;
            --n;
            return *cit;
        }

        if (!at_slash)
        {
            // current segment dseg is over and
            // previous char was not a slash
            // so we output one
            at_slash = true;
            --n;
            return '/';
        }

        // current segment dseg is over and
        // last char was already the slash
        // between segments, so take the
        // next final segment to consume
        at_slash = false;
        while (cit == dseg.begin())
        {
            // take next segment
            if (it != begin)
                --it;
            else
                break;
            if (**it == "..")
            {
                // skip next if this is ".."
                ++skip;
            }
            else if (**it != ".")
            {
                if (skip)
                {
                    // discount skips
                    --skip;
                }
                else
                {
                    // or update current seg
                    dseg = **it;
                    cit = dseg.end();
                    break;
                }
            }
        }
        // consume from the new current
        // segment
        --n;
        if (cit != dseg.begin())
        {
            // in the general case, we consume
            // one more character from the end
            --cit;
            return *cit;
        }

        // nothing left to consume in the
        // current and new segment
        if (it == begin)
        {
            // if this is the first
            // segment, the segments are
            // over and there can only
            // be repetitions of "../" to
            // output
            return "/.."[n % 3];
        }
        // at other segments, we need
        // a slash to transition to the
        // next segment
        at_slash = true;
        return '/';
    };

    // consume final segments from seg0 that
    // should not influence the comparison
    //
    // NOTE(review): the last segment is taken as-is
    // here, without the "." / ".." filtering that
    // consume_last applies to the segments it
    // fetches - confirm that paths ending in a dot
    // segment are handled as intended.
    auto begin0 = seg0.begin();
    auto it0 = seg0.end();
    decode_view dseg0;
    if (it0 != seg0.begin())
    {
        --it0;
        dseg0 = **it0;
    }
    decode_view::iterator cit0 = dseg0.end();
    std::size_t skip0 = 0;
    bool at_slash0 = true;
    while (n0 > n1)
    {
        consume_last(n0, dseg0, begin0, it0, cit0, skip0, at_slash0);
    }

    // consume final segments from seg1 that
    // should not influence the comparison
    auto begin1 = seg1.begin();
    auto it1 = seg1.end();
    decode_view dseg1;
    if (it1 != seg1.begin())
    {
        --it1;
        dseg1 = **it1;
    }
    decode_view::iterator cit1 = dseg1.end();
    std::size_t skip1 = 0;
    bool at_slash1 = true;
    while (n1 > n0)
    {
        consume_last(n1, dseg1, begin1, it1, cit1, skip1, at_slash1);
    }

    // both sides now have the same number of chars
    // left. We read backwards, so a later mismatch
    // is closer to the front and overrides an
    // earlier one.
    int cmp = 0;
    while (n0)
    {
        char c0 = consume_last(
            n0, dseg0, begin0, it0, cit0, skip0, at_slash0);
        char c1 = consume_last(
            n1, dseg1, begin1, it1, cit1, skip1, at_slash1);
        if (c0 < c1)
            cmp = -1;
        else if (c1 < c0)
            cmp = +1;
    }

    if (cmp != 0)
        return cmp;
    // equal over the common length: size decides
    if ( n00 == n10 )
        return 0;
    if ( n00 < n10 )
        return -1;
    return 1;
}
} // detail
} // urls
} // boost
#endif