/*
* Copyright (c) 2010-2023 Belledonne Communications SARL.
*
* This file is part of linphone-desktop
* (see https://www.linphone.org).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see .
*/
// =============================================================================
// Library to deal with IRI and URI.
// See:
// IRI : https://tools.ietf.org/html/rfc3987
// URI : https://tools.ietf.org/html/rfc3986
// =============================================================================
#include "UriTools.hpp"
static UriTools gUriTools;
UriTools::UriTools() {
initRegularExpressions();
}
QVector> UriTools::parseIri(const QString &text) {
return parse(text, gUriTools.mIriRegularExpression);
}
QVector> UriTools::parseUri(const QString &text) {
return parse(text, gUriTools.mUriRegularExpression);
}
QVector> UriTools::parseMention(const QString &text) {
return parse(text, gUriTools.mMentionRegularExpression);
}
// Parse a text and return all lines where regex is matched or not
QVector> UriTools::parse(const QString &text, const QRegularExpression regex) {
QVector> results;
int currentIndex = 0;
auto match = regex.match(text);
for (int i = 0; i <= match.lastCapturedIndex(); ++i) {
int startIndex = match.capturedStart(i);
if (currentIndex != startIndex) {
results.push_back({false, text.mid(currentIndex, startIndex - currentIndex)});
}
results.push_back({true, match.captured(i)});
currentIndex = startIndex;
}
if (results.size() == 0) results.push_back({false, text});
else {
currentIndex += results.back().second.length();
if (currentIndex < text.size()) results.append(parse(text.mid(currentIndex), regex));
}
return results;
}
void UriTools::initRegularExpressions() {
// Level 0. --------------------------------------------------------------------
QString URI_DEC_OCTET =
QString("(?:") + "25[0-5]" + "|" + "2[0-4]\\d" + "|" + "1\\d{2}" + "|" + "[1-9]\\d" + "|" + "\\d" + ")";
QString URI_H16 = "[0-9A-Fa-f]{1,4}";
QString URI_PCT_ENCODED = "%[A-Fa-f\\d]{2}";
QString URI_PORT = "\\d*";
QString URI_SCHEME = "[a-zA-Z][\\w+\\.\\-]*";
QString URI_SUB_DELIMS = "[!$&\"()*+,;=]";
QString URI_UNRESERVED = "[\\w\\._~\\-]";
QString IRI_UCS_CHAR =
QString("(?:") + "[\\x{00A0}-\\x{D7FF}]" + "|" + "[\\x{F900}-\\x{FDCF}]" + "|" + "[\\x{FDF0}-\\x{FFEF}]" + "|" +
"[\\x{10000}-\\x{1FFFD}]" + "|" + "[\\x{20000}-\\x{2FFFD}]" + "|" + "[\\x{30000}-\\x{3FFFD}]" +
//"|" + "[\\x{D800\\x{DC00}-\\x{D83F\\x{DFFD}]" + "|" + "[\\x{D840\\x{DC00}-\\x{D87F\\x{DFFD}]" + "|" +
//"[\\x{D880\\x{DC00}-\\x{D8BF\\x{DFFD}]" +
"|" + "[\\x{40000}-\\x{4FFFD}]" + "|" + "[\\x{50000}-\\x{5FFFD}]" + "|" + "[\\x{60000}-\\x{6FFFD}]" +
//"|" + "[\\x{D8C0\\x{DC00}-\\x{D8FF\\x{DFFD}]" + "|" + "[\\x{D900\\x{DC00}-\\x{D93F\\x{DFFD}]" + "|" +
//"[\\x{D940\\x{DC00}-\\x{D97F\\x{DFFD}]" +
"|" + "[\\x{70000}-\\x{7FFFD}]" + "|" + "[\\x{80000}-\\x{8FFFD}]" + "|" + "[\\x{90000}-\\x{9FFFD}]" +
//"|" + "[\\x{D980\\x{DC00}-\\x{D9BF\\x{DFFD}]" + "|" + "[\\x{D9C0\\x{DC00}-\\x{D9FF\\x{DFFD}]" + "|" +
//"[\\x{DA00\\x{DC00}-\\x{DA3F\\x{DFFD}]" +
"|" + "[\\x{A0000}-\\x{AFFFD}]" + "|" + "[\\x{B0000}-\\x{BFFFD}]" + "|" + "[\\x{C0000}-\\x{CFFFD}]" +
//"|" + "[\\x{DA40\\x{DC00}-\\x{DA7F\\x{DFFD}]" + "|" + "[\\x{DA80\\x{DC00}-\\x{DABF\\x{DFFD}]" + "|" +
//"[\\x{DAC0\\x{DC00}-\\x{DAFF\\x{DFFD}]" +
"|" + "[\\x{D0000}-\\x{DFFFD}]" + "|" + "[\\x{E1000}-\\x{EFFFD}]" +
//"|" + "[\\x{DB00\\x{DC00}-\\x{DB3F\\x{DFFD}]" + "|" + "[\\x{DB44\\x{DC00}-\\x{DB7F\\x{DFFD}]" +
")";
QString IRI_PRIVATE =
QString("(?:") + "[\\x{E000}-\\x{F8FF}]" + "|" + "[\\x{F0000}-\\x{FFFFD}]" + "|" + "[\\x{100000}-\\x{10FFFD}]" +
//"|" + "[\\x{DBC0\\x{DC00}-\\x{DBFF\\x{DFFD}]" + "|" + "[\\x{DBC0\\x{DC00}-\\x{DBFF\\x{DFFD}]" +
")";
// Level 1. --------------------------------------------------------------------
QString URI_IPV_FUTURE = QString("v[0-9A-Fa-f]+\\.") + "(?:" + URI_UNRESERVED + URI_SUB_DELIMS + ":" + ")";
QString IRI_UNRESERVED = QString("(?:") + "[\\w\\._~\\-]" + "|" + IRI_UCS_CHAR + ")";
QString URI_IPV4_ADDRESS = URI_DEC_OCTET + "\\." + URI_DEC_OCTET + "\\." + URI_DEC_OCTET + "\\." + URI_DEC_OCTET;
QString URI_PCHAR = "(?:" + URI_UNRESERVED + "|" + URI_PCT_ENCODED + "|" + URI_SUB_DELIMS + "|" + "[:@]" + ")";
QString URI_REG_NAME = "(?:" + URI_UNRESERVED + "|" + URI_PCT_ENCODED + "|" + URI_SUB_DELIMS + ")*";
QString URI_USERINFO = "(?:" + URI_UNRESERVED + "|" + URI_PCT_ENCODED + "|" + URI_SUB_DELIMS + "|" + ":" + ")*";
// Level 2. --------------------------------------------------------------------
QString URI_FRAGMENT = "(?:" + URI_PCHAR + "|" + "[/?]" + ")*";
QString URI_LS32 = "(?:" + URI_H16 + ":" + URI_H16 + "|" + URI_IPV4_ADDRESS + ")";
QString URI_QUERY = "(?:" + URI_PCHAR + "|" + "[/?]" + ")*";
QString URI_SEGMENT = URI_PCHAR + "*";
QString URI_SEGMENT_NZ = URI_PCHAR + "+";
QString IRI_PCHAR = "(?:" + IRI_UNRESERVED + "|" + URI_PCT_ENCODED + "|" + URI_SUB_DELIMS + "|" + "[:@]" + ")";
QString IRI_REG_NAME = "(?:" + IRI_UNRESERVED + "|" + URI_PCT_ENCODED + "|" + URI_SUB_DELIMS + ")*";
QString IRI_USERINFO = "(?:" + IRI_UNRESERVED + "|" + URI_PCT_ENCODED + "|" + URI_SUB_DELIMS + "|" + ":" + ")*";
// Level 3. --------------------------------------------------------------------
QString URI_IPV6_ADDRESS = QString("(?:") + "(?:" + URI_H16 + ":){6}" + URI_LS32 + "|" + "::(?:" + URI_H16 +
":){5}" + URI_LS32 + "|" + "\\[" + URI_H16 + "\\]::(?:" + URI_H16 + ":){4}" + URI_LS32 +
"|" + "\\[" + "(?:" + URI_H16 + ":)?" + URI_H16 + "\\]::(?:" + URI_H16 + ":){3}" +
URI_LS32 + "|" + "\\[" + "(?:" + URI_H16 + ":){0,2}" + URI_H16 + "\\]::(?:" + URI_H16 +
":){2}" + URI_LS32 + "|" + "\\[" + "(?:" + URI_H16 + ":){0,3}" + URI_H16 +
"\\]::" + URI_H16 + ":" + URI_LS32 + "|" + "\\[" + "(?:" + URI_H16 + ":){0,4}" +
URI_H16 + "\\]::" + URI_LS32 + "|" + "\\[" + "(?:" + URI_H16 + ":){0,5}" + URI_H16 +
"\\]::" + URI_H16 + "|" + "\\[" + "(?:" + URI_H16 + ":){0,6}" + URI_H16 + "\\]::" + ")";
QString URI_PATH_ABEMPTY = QString("(?:") + "/" + URI_SEGMENT + ")*";
QString URI_PATH_ABSOLUTE = QString("/") + "(?:" + URI_SEGMENT_NZ + "(?:" + "/" + URI_SEGMENT + ")*" + ")?";
QString URI_PATH_ROOTLESS = URI_SEGMENT_NZ + "(?:" + "/" + URI_SEGMENT + ")*";
QString IRI_FRAGMENT = "(?:" + IRI_PCHAR + "|" + "[/?]" + ")*";
QString IRI_QUERY = "(?:" + IRI_PCHAR + "|" + IRI_PRIVATE + "|" + "[/?]" + ")*";
QString IRI_SEGMENT = IRI_PCHAR + "*";
QString IRI_SEGMENT_NZ = IRI_PCHAR + "+";
// Level 4. --------------------------------------------------------------------
QString URI_IP_LITERAL = QString("\\[") + "(?:" + URI_IPV6_ADDRESS + "|" + URI_IPV_FUTURE + ")" + "\\]";
QString IRI_PATH_ABEMPTY = QString("(?:") + "/" + IRI_SEGMENT + ")*";
QString IRI_PATH_ABSOLUTE = QString("/") + "(?:" + IRI_SEGMENT_NZ + "(?:" + "/" + IRI_SEGMENT + ")*" + ")?";
QString IRI_PATH_ROOTLESS = IRI_SEGMENT_NZ + "(?:" + "/" + IRI_SEGMENT + ")*";
// Level 5. --------------------------------------------------------------------
QString URI_HOST = "(?:" + URI_REG_NAME + "|" + URI_IPV4_ADDRESS + "|" + URI_IP_LITERAL + ")";
QString IRI_HOST = "(?:" + IRI_REG_NAME + "|" + URI_IPV4_ADDRESS + "|" + URI_IP_LITERAL + ")";
// Level 6. --------------------------------------------------------------------
QString URI_AUTHORITY = "(?:" + URI_USERINFO + "@" + ")?" + URI_HOST + "(?:" + ":" + URI_PORT + ")?";
QString IRI_AUTHORITY = "(?:" + IRI_USERINFO + "@" + ")?" + IRI_HOST + "(?:" + ":" + URI_PORT + ")?";
// Level 7. --------------------------------------------------------------------
// `path-empty` not used.
QString URI_HIER_PART = QString("(?:") + "//" + URI_AUTHORITY + URI_PATH_ABEMPTY + "|" + URI_PATH_ABSOLUTE + "|" +
URI_PATH_ROOTLESS + ")";
QString IRI_HIER_PART = QString("(?:") + "//" + IRI_AUTHORITY + IRI_PATH_ABEMPTY + "|" + IRI_PATH_ABSOLUTE + "|" +
IRI_PATH_ROOTLESS + ")";
// Level 8. --------------------------------------------------------------------
// Regex to match URI. It respects the RFC 3986.
QString URI = "(?:" + URI_SCHEME + ":" + "|" + "www\\." + ")" + URI_HIER_PART + "(?:" + "\\?" + URI_QUERY + ")?" +
"(?:" + "#" + URI_FRAGMENT + ")?";
// Regex to match URI. It respects the RFC 3987.
QString IRI = "(?:" + URI_SCHEME + ":" + "|" + "www\\." + ")" + IRI_HIER_PART + "(?:" + "\\?" + IRI_QUERY + ")?" +
"(?:" + "#" + IRI_FRAGMENT + ")?";
mIriRegularExpression = QRegularExpression(IRI, QRegularExpression::CaseInsensitiveOption |
QRegularExpression::UseUnicodePropertiesOption);
mUriRegularExpression = QRegularExpression(URI, QRegularExpression::CaseInsensitiveOption |
QRegularExpression::UseUnicodePropertiesOption);
mMentionRegularExpression = QRegularExpression(
"@[A-Za-z0-9.-_]+", QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
}