// // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com) // Copyright (c) 2022 Alan de Freitas (alandefreitas@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // // Official repository: https://github.com/boostorg/url // #ifndef BOOST_URL_DETAIL_IMPL_NORMALIZE_IPP #define BOOST_URL_DETAIL_IMPL_NORMALIZE_IPP #include #include #include #include namespace boost { namespace urls { namespace detail { void pop_encoded_front( string_view& s, char& c, std::size_t& n) noexcept { if(s.front() != '%') { c = s.front(); s.remove_prefix(1); } else { detail::decode_unsafe( &c, &c + 1, s.substr(0, 3)); s.remove_prefix(3); } ++n; } int compare_encoded( string_view lhs, string_view rhs) noexcept { std::size_t n0 = 0; std::size_t n1 = 0; char c0 = 0; char c1 = 0; while( !lhs.empty() && !rhs.empty()) { pop_encoded_front(lhs, c0, n0); pop_encoded_front(rhs, c1, n1); if (c0 < c1) return -1; if (c1 < c0) return 1; } n0 += detail::decode_bytes_unsafe(lhs); n1 += detail::decode_bytes_unsafe(rhs); if (n0 == n1) return 0; if (n0 < n1) return -1; return 1; } void digest_encoded( string_view s, fnv_1a& hasher) noexcept { char c = 0; std::size_t n = 0; while(!s.empty()) { pop_encoded_front(s, c, n); hasher.put(c); } } int ci_compare_encoded( string_view lhs, string_view rhs) noexcept { std::size_t n0 = 0; std::size_t n1 = 0; char c0 = 0; char c1 = 0; while ( !lhs.empty() && !rhs.empty()) { pop_encoded_front(lhs, c0, n0); pop_encoded_front(rhs, c1, n1); c0 = grammar::to_lower(c0); c1 = grammar::to_lower(c1); if (c0 < c1) return -1; if (c1 < c0) return 1; } n0 += detail::decode_bytes_unsafe(lhs); n1 += detail::decode_bytes_unsafe(rhs); if (n0 == n1) return 0; if (n0 < n1) return -1; return 1; } void ci_digest_encoded( string_view s, fnv_1a& hasher) noexcept { char c = 0; std::size_t n = 0; while(!s.empty()) { pop_encoded_front(s, c, n); c = grammar::to_lower(c); hasher.put(c); } } int compare( string_view lhs, string_view rhs) noexcept { auto rlen = (std::min)(lhs.size(), rhs.size()); for (std::size_t i = 0; i < rlen; ++i) { char c0 = lhs[i]; char c1 = rhs[i]; if (c0 < c1) return -1; if (c1 < c0) return 1; } if ( lhs.size() == rhs.size() ) return 0; if ( lhs.size() < rhs.size() ) return -1; return 1; } int ci_compare( string_view lhs, string_view rhs) noexcept { auto rlen = (std::min)(lhs.size(), rhs.size()); for (std::size_t i = 0; i < rlen; ++i) { char c0 = grammar::to_lower(lhs[i]); char c1 = grammar::to_lower(rhs[i]); if (c0 < c1) return -1; if (c1 < c0) return 1; } if ( lhs.size() == rhs.size() ) return 0; if ( lhs.size() < rhs.size() ) return -1; return 1; } void ci_digest( string_view s, fnv_1a& hasher) noexcept { for (char c: s) { c = grammar::to_lower(c); hasher.put(c); } } std::size_t path_starts_with( string_view lhs, string_view rhs) noexcept { auto consume_one = []( string_view::iterator& it, char &c) { if(*it != '%') { c = *it; ++it; return; } detail::decode_unsafe( &c, &c + 1, string_view(it, 3)); if (c != '/') { it += 3; return; } c = *it; ++it; }; auto it0 = lhs.begin(); auto it1 = rhs.begin(); auto end0 = lhs.end(); auto end1 = rhs.end(); char c0 = 0; char c1 = 0; while ( it0 < end0 && it1 < end1) { consume_one(it0, c0); consume_one(it1, c1); if (c0 != c1) return 0; } if (it1 == end1) return it0 - lhs.begin(); return 0; } std::size_t path_ends_with( string_view lhs, string_view rhs) noexcept { auto consume_last = []( string_view::iterator& it, string_view::iterator& end, char& c) { if ((end - it) < 3 || *(std::prev(end, 3)) != '%') { c = *--end; return; } detail::decode_unsafe( &c, &c + 1, string_view(std::prev( end, 3), 3)); if (c != '/') { end -= 3; return; } c = *--end; }; auto it0 = lhs.begin(); auto it1 = rhs.begin(); auto end0 = lhs.end(); auto end1 = rhs.end(); char c0 = 0; char c1 = 0; while( it0 < end0 && it1 < end1) { consume_last(it0, end0, c0); consume_last(it1, end1, c1); if (c0 != c1) return 0; } if (it1 == end1) return lhs.end() - end0; return 0; } std::size_t remove_dot_segments( char* dest0, char const* end, string_view s) noexcept { // 1. The input buffer `s` is initialized with // the now-appended path components and the // output buffer `dest0` is initialized to // the empty string. char* dest = dest0; // Step 2 is a loop through 5 production rules: // https://www.rfc-editor.org/rfc/rfc3986#section-5.2.4 // // There are no transitions between all rules, // which enables some optimizations. // // Initial: // - Rule A: handle initial dots // If the input buffer begins with a // prefix of "../" or "./", then remove // that prefix from the input buffer. // Rule A can only happen at the beginning. // Errata 4547: Keep "../" in the beginning // https://www.rfc-editor.org/errata/eid4547 // // Then: // - Rule D: ignore a final ".." or "." // if the input buffer consists only of "." // or "..", then remove that from the input // buffer. // Rule D can only happen after Rule A because: // - B and C write "/" to the input // - E writes "/" to input or returns // // Then: // - Rule B: ignore ".": write "/" to the input // - Rule C: apply "..": remove seg and write "/" // - Rule E: copy complete segment auto append = [](char*& first, char const* last, string_view in) { // append `in` to `dest` BOOST_ASSERT(in.size() <= std::size_t(last - first)); std::memmove(first, in.data(), in.size()); first += in.size(); ignore_unused(last); }; auto dot_starts_with = []( string_view str, string_view dots, std::size_t& n) { // starts_with for encoded/decoded dots // or decoded otherwise. return how many // chars in str match the dots n = 0; for (char c: dots) { if (str.empty()) { n = 0; return false; } else if (str.starts_with(c)) { str.remove_prefix(1); ++n; } else if (str.size() > 2 && str[0] == '%' && str[1] == '2' && (str[2] == 'e' || str[2] == 'E')) { str.remove_prefix(3); n += 3; } else { n = 0; return false; } } return true; }; auto dot_equal = [&dot_starts_with]( string_view str, string_view dots) { std::size_t n = 0; dot_starts_with(str, dots, n); return n == str.size(); }; // Rule A std::size_t n; while (!s.empty()) { if (dot_starts_with(s, "../", n)) { // Errata 4547 append(dest, end, "../"); s.remove_prefix(n); continue; } else if (!dot_starts_with(s, "./", n)) { break; } s.remove_prefix(n); } // Rule D if( dot_equal(s, ".")) { s = {}; } else if( dot_equal(s, "..") ) { // Errata 4547 append(dest, end, ".."); s = {}; } // 2. While the input buffer is not empty, // loop as follows: while (!s.empty()) { // Rule B if (dot_starts_with(s, "/./", n)) { s.remove_prefix(n - 1); continue; } if (dot_equal(s, "/.")) { // We can't remove "." from a string_view // So what we do here is equivalent to // replacing s with '/' as required // in Rule B and executing the next // iteration, which would append this // '/' to the output, as required by // Rule E append(dest, end, s.substr(0, 1)); s = {}; break; } // Rule C if (dot_starts_with(s, "/../", n)) { std::size_t p = string_view( dest0, dest - dest0).find_last_of('/'); if (p != string_view::npos) { // output has multiple segments // "erase" [p, end] if not "/.." string_view last_seg(dest0 + p, dest - (dest0 + p)); if (!dot_equal(last_seg, "/..")) dest = dest0 + p; else append(dest, end, "/.."); } else if (dest0 != dest) { // one segment in the output dest = dest0; s.remove_prefix(1); } else { // output is empty append(dest, end, "/.."); } s.remove_prefix(n-1); continue; } if (dot_equal(s, "/..")) { std::size_t p = string_view( dest0, dest - dest0).find_last_of('/'); if (p != string_view::npos) { // erase [p, end] dest = dest0 + p; append(dest, end, "/"); } else if (dest0 != dest) { dest = dest0; } else { append(dest, end, "/.."); } s = {}; break; } // Rule E std::size_t p = s.find_first_of('/', 1); if (p != string_view::npos) { append(dest, end, s.substr(0, p)); s.remove_prefix(p); } else { append(dest, end, s); s = {}; } } // 3. Finally, the output buffer is set // as the result of remove_dot_segments, // and we return its size return dest - dest0; } char path_pop_back( string_view& s ) { if (s.size() < 3 || *std::prev(s.end(), 3) != '%') { char c = s.back(); s.remove_suffix(1); return c; } char c = 0; detail::decode_unsafe( &c, &c + 1, s.substr(s.size() - 3)); if (c != '/') { s.remove_suffix(3); return c; } c = s.back(); s.remove_suffix(1); return c; }; void pop_last_segment( string_view& s, string_view& c, std::size_t& level, bool r) noexcept { c = {}; std::size_t n = 0; while (!s.empty()) { // B. if the input buffer begins with a // prefix of "/./" or "/.", where "." is // a complete path segment, then replace // that prefix with "/" in the input // buffer; otherwise, n = detail::path_ends_with(s, "/./"); if (n) { c = s.substr(s.size() - n); s.remove_suffix(n); continue; } n = detail::path_ends_with(s, "/."); if (n) { c = s.substr(s.size() - n, 1); s.remove_suffix(n); continue; } // C. if the input buffer begins with a // prefix of "/../" or "/..", where ".." // is a complete path segment, then // replace that prefix with "/" in the // input buffer and remove the last // segment and its preceding "/" // (if any) from the output buffer // otherwise, n = detail::path_ends_with(s, "/../"); if (n) { c = s.substr(s.size() - n); s.remove_suffix(n); ++level; continue; } n = detail::path_ends_with(s, "/.."); if (n) { c = s.substr(s.size() - n); s.remove_suffix(n); ++level; continue; } // E. move the first path segment in the // input buffer to the end of the output // buffer, including the initial "/" // character (if any) and any subsequent // characters up to, but not including, // the next "/" character or the end of // the input buffer. std::size_t p = s.size() > 1 ? s.find_last_of('/', s.size() - 2) : string_view::npos; if (p != string_view::npos) { c = s.substr(p + 1); s.remove_suffix(c.size()); } else { c = s; s = {}; } if (level == 0) return; if (!s.empty()) --level; } // we still need to skip n_skip + 1 // but the string is empty if (r && level) { c = "/"; level = 0; return; } else if (level) { if (c.empty()) c = "/.."; else c = "/../"; --level; return; } c = {}; } void normalized_path_digest( string_view s, bool remove_unmatched, fnv_1a& hasher) noexcept { string_view child; std::size_t level = 0; do { pop_last_segment( s, child, level, remove_unmatched); while (!child.empty()) { char c = path_pop_back(child); hasher.put(c); } } while (!s.empty()); } // compare segments as if there were a normalized int segments_compare( segments_encoded_view seg0, segments_encoded_view seg1) noexcept { // calculate path size as if it were normalized auto normalized_size = [](segments_encoded_view seg) -> std::size_t { if (seg.empty()) return seg.is_absolute(); std::size_t n = 0; std::size_t skip = 0; auto begin = seg.begin(); auto it = seg.end(); while (it != begin) { --it; decode_view dseg = **it; if (dseg == "..") ++skip; else if (dseg != ".") { if (skip) --skip; else n += dseg.size() + 1; } } n += skip * 3; n -= !seg.is_absolute(); return n; }; // find the normalized size for the comparison std::size_t n0 = normalized_size(seg0); std::size_t n1 = normalized_size(seg1); std::size_t n00 = n0; std::size_t n10 = n1; // consume the last char from a segment range auto consume_last = []( std::size_t& n, decode_view& dseg, segments_encoded_view::iterator& begin, segments_encoded_view::iterator& it, decode_view::iterator& cit, std::size_t& skip, bool& at_slash) -> char { if (cit != dseg.begin()) { // return last char from current segment at_slash = false; --cit; --n; return *cit; } if (!at_slash) { // current segment dseg is over and // previous char was not a slash // so we output one at_slash = true; --n; return '/'; } // current segment dseg is over and // last char was already the slash // between segments, so take the // next final segment to consume at_slash = false; while (cit == dseg.begin()) { // take next segment if (it != begin) --it; else break; if (**it == "..") { // skip next if this is ".." ++skip; } else if (**it != ".") { if (skip) { // discount skips --skip; } else { // or update current seg dseg = **it; cit = dseg.end(); break; } } } // consume from the new current // segment --n; if (cit != dseg.begin()) { // in the general case, we consume // one more character from the end --cit; return *cit; } // nothing left to consume in the // current and new segment if (it == begin) { // if this is the first // segment, the segments are // over and there can only // be repetitions of "../" to // output return "/.."[n % 3]; } // at other segments, we need // a slash to transition to the // next segment at_slash = true; return '/'; }; // consume final segments from seg0 that // should not influence the comparison auto begin0 = seg0.begin(); auto it0 = seg0.end(); decode_view dseg0; if (it0 != seg0.begin()) { --it0; dseg0 = **it0; } decode_view::iterator cit0 = dseg0.end(); std::size_t skip0 = 0; bool at_slash0 = true; while (n0 > n1) { consume_last(n0, dseg0, begin0, it0, cit0, skip0, at_slash0); } // consume final segments from seg1 that // should not influence the comparison auto begin1 = seg1.begin(); auto it1 = seg1.end(); decode_view dseg1; if (it1 != seg1.begin()) { --it1; dseg1 = **it1; } decode_view::iterator cit1 = dseg1.end(); std::size_t skip1 = 0; bool at_slash1 = true; while (n1 > n0) { consume_last(n1, dseg1, begin1, it1, cit1, skip1, at_slash1); } int cmp = 0; while (n0) { char c0 = consume_last( n0, dseg0, begin0, it0, cit0, skip0, at_slash0); char c1 = consume_last( n1, dseg1, begin1, it1, cit1, skip1, at_slash1); if (c0 < c1) cmp = -1; else if (c1 < c0) cmp = +1; } if (cmp != 0) return cmp; if ( n00 == n10 ) return 0; if ( n00 < n10 ) return -1; return 1; } } // detail } // urls } // boost #endif