syncthingtray/syncthingconnector/syncthingignorepattern.cpp

489 lines
19 KiB
C++

#include "./syncthingignorepattern.h"
namespace Data {
/// \cond
namespace SyncthingIgnorePatternState {

// States the glob matcher can be in while walking through the pattern.
enum State {
    // the escaping character "\" has just been passed
    Escaping,
    // a "[" has been passed; subsequent characters are lower bounds of a character range
    AppendRangeLower,
    // a "-" has been passed within a range; the next character is the upper bound of a range
    AppendRangeUpper,
    // a "{" has been passed; subsequent characters belong to a set of alternatives
    AppendAlternative,
    // initial/default state; characters are compared as-is
    MatchVerbatimly,
    // a "]" has been passed concluding a character range that is now to be matched
    MatchRange,
    // a "}" has been passed concluding a set of alternatives that is now to be matched
    MatchAlternatives,
    // a "?" has been passed; it matches any character but the path separator
    MatchAny,
    // a "*" has been passed; it matches many arbitrary characters except the path separator
    MatchManyAny,
    // a sequence of two "*" has been passed; it matches many arbitrary characters including the path separator
    MatchManyAnyIncludingDirSep,
};

// Record of an asterisk (or double asterisk) encountered in the glob.
struct Asterisk {
    // position in the glob assigned when the asterisk is recorded
    QString::const_iterator pos;
    // kind of asterisk (assigned together with pos)
    State state;
    // flag used when rewinding to this asterisk after a mismatch
    bool visited = false;
};

// A single entry of a character range ("[…]").
struct CharacterRange {
    QChar lowerBound;
    QChar upperBound;
};

// A single alternative of an alternative set ("{…,…}") as half-open range [beg, end) within the glob.
struct AlternativeRange {
    // Starts an empty alternative at the specified position.
    explicit AlternativeRange(QString::const_iterator beg)
        : beg(beg)
        , end(beg)
    {
    }

    QString::const_iterator beg;
    QString::const_iterator end;
};

} // namespace SyncthingIgnorePatternState
/// \endcond
/*!
* \struct SyncthingIgnorePattern
* \brief The SyncthingIgnorePattern struct allows matching a Syncthing ignore pattern against a path.
* \remarks
* - The `#include`-syntax is not supported.
* - A "/" is always treated as path separator within the pattern. Additionally, the character specified
* when calling matches() is treated as path separator as well. This means that under Windows where
* patterns can contain a "\" one *must* specify paths with a "\" as separator when invoking the matches()
* function to allow patterns using "/" and "\" to match correctly; otherwise ignore patterns containing
* "\" do not work.
* \sa
* - https://docs.syncthing.net/users/ignoring.html
* - https://docs.syncthing.net/rest/db-ignores-get.html
*/
/*!
 * \brief Constructs an ignore pattern from the raw line \a pattern and populates the struct's fields.
 * \remarks
 * - A line starting with "//" is flagged as comment; its ignore flag is cleared and no glob is assigned.
 * - Otherwise the prefixes "!", "(?i)" and "(?d)" are stripped from the front for as long as one of them
 *   is present. The remainder becomes the glob. Every "!" toggles the ignore flag.
 */
SyncthingIgnorePattern::SyncthingIgnorePattern(QString &&pattern)
    : pattern(std::move(pattern))
{
    // flag comment lines and leave it at that
    if (this->pattern.startsWith(QLatin1String("//"))) {
        comment = true;
        ignore = false;
        return;
    }

    // strip one known prefix per iteration until none is left at the front of the glob
    glob = this->pattern;
    while (true) {
        if (glob.startsWith(QLatin1String("!"))) {
            glob.remove(0, 1);
            ignore = !ignore;
            continue;
        }
        if (glob.startsWith(QLatin1String("(?i)"))) {
            glob.remove(0, 4);
            caseInsensitive = true;
            continue;
        }
        if (glob.startsWith(QLatin1String("(?d)"))) {
            glob.remove(0, 4);
            allowRemovalOnParentDirRemoval = true;
            continue;
        }
        break;
    }
}
/*!
 * \brief Move-constructs the ignore pattern using the compiler-generated member-wise move.
 * \remarks The constructor is defaulted here in the source file rather than at its declaration.
 */
SyncthingIgnorePattern::SyncthingIgnorePattern(SyncthingIgnorePattern &&) = default;
/*!
 * \brief Destroys the ignore pattern; no work beyond destroying the members is done.
 */
SyncthingIgnorePattern::~SyncthingIgnorePattern() = default;
/*!
 * \brief Matches the assigned glob against the specified \a path.
 * \param path Specifies the path to match; see remarks for the expected form.
 * \param pathSeparator Specifies the character treated as path separator in addition to "/".
 * \returns Returns whether the glob matches \a path.
 * \remarks
 * - Returns always false if the pattern is flagged as comment or the glob is empty.
 * - This function tries to follow rules outlined on https://docs.syncthing.net/users/ignoring.html.
 * - The specified \a path is *not* supposed to start with the \a pathSeparator (or a "/"). It must always be a path
 *   relative to the root of the Syncthing folder it is contained by. A pattern that is only supposed to match from the
 *   root of the Syncthing folder is supposed to start with \a pathSeparator (or a "/"), though.
 * - This function probably doesn't work if the pattern or \a path contain a surrogate pair.
 * - By default, the path separator is "/". If \a path uses a different separator you must specify it as second argument.
 *   Note that "/" is still treated as path separator in this case and the specified \a pathSeparator is only used in
 *   addition. This is intended because this way one can specify "\" as \a pathSeparator under Windows but patterns using
 *   "/" still work (as they should because "/" and "\" can both be used as path separator under Windows in ignore
 *   patterns).
 * - A "?" requires an actual character to be present in the path; it never matches at the end of the path.
 * - Nested alternatives are not supported. A "{" within an alternative set is skipped like any other character of
 *   the alternative.
 * - The function clears and refills the containers asterisks, characterRange and alternatives while matching.
 */
bool SyncthingIgnorePattern::matches(const QString &path, QChar pathSeparator) const
{
    if (comment || glob.isEmpty()) {
        return false;
    }

    // get iterators
    auto globIter = glob.begin(), globEnd = glob.end();
    auto pathIter = path.begin(), pathEnd = path.end();

    // handle patterns starting with "/" indicating the pattern must match from the root (see remarks in docstring)
    static
#if (QT_VERSION >= QT_VERSION_CHECK(6, 0, 0))
        constexpr
#else
        const
#endif
        auto genericPathSeparator
        = QChar('/');
    const auto matchFromRoot = *globIter == pathSeparator || *globIter == genericPathSeparator;
    if (matchFromRoot) {
        ++globIter;
    }

    // define variables to track the state when processing the glob pattern
    using namespace SyncthingIgnorePatternState;
    auto state = MatchVerbatimly;
    auto escapedState = MatchVerbatimly;
    auto inAsterisk = false;
    asterisks.clear();

    // define behavior to handle the current character in the glob pattern not matching the current character in the path
    // note: The lambda returns true if matching can continue (after rewinding/forwarding) and false if the match failed.
    const auto handleMismatch = [&, this] {
        // fail the match immediately if the glob pattern started with a "/" indicating it is supposed to match only from the root
        if (matchFromRoot) {
            return false;
        }
        // deal with the mismatch by trying to match previous asterisks more greedily
        while (!asterisks.empty() && asterisks.back().visited) {
            // do not consider asterisks we have already visited, though (as it would lead to an endless loop)
            asterisks.pop_back();
        }
        if (!asterisks.empty()) {
            // rewind back to when we have passed the last non-visited asterisk
            auto &asterisk = asterisks.back();
            globIter = asterisk.pos;
            inAsterisk = asterisk.visited = true;
            state = asterisk.state;
            return true;
        }
        // deal with the mismatch by checking the path as of the next path element
        for (; pathIter != pathEnd; ++pathIter) {
            // forward to the next path separator
            if (*pathIter != pathSeparator && *pathIter != genericPathSeparator) {
                continue;
            }
            // skip the path separator itself and give up when the end of the path is reached
            if (++pathIter == pathEnd) {
                return false;
            }
            break;
        }
        // give up when the end of the path is reached
        if (pathIter == pathEnd) {
            return false;
        }
        // start matching the glob pattern from the beginning
        globIter = glob.begin();
        asterisks.clear();
        inAsterisk = false;
        return true;
    };

    // define function to handle single match
    const auto handleSingleMatch = [&, this] {
        // proceed with the next characters on a match
        ++globIter;
        inAsterisk = false;
        if (!asterisks.empty()) {
            asterisks.back().visited = false;
        }
    };

    // define function to match single character against the current character in the path
    static constexpr auto compareSingleCharBasic = [](QChar expectedChar, QChar presentChar, QChar pathSep, bool caseInsensitive) {
        Q_UNUSED(pathSep)
        return caseInsensitive ? presentChar.toCaseFolded() == expectedChar.toCaseFolded() : presentChar == expectedChar;
    };
    // note: If a custom path separator is used, a "/" in the glob additionally matches that separator in the path.
    const auto compareSingleChar = pathSeparator == genericPathSeparator
        ? compareSingleCharBasic
        : [](QChar expectedChar, QChar presentChar, QChar pathSep, bool caseInsensitive) {
              return compareSingleCharBasic(expectedChar, presentChar, pathSep, caseInsensitive)
                  || (expectedChar == genericPathSeparator && presentChar == pathSep);
          };
    const auto matchSingleChar = [&, this](QChar expectedChar) { return compareSingleChar(expectedChar, *pathIter, pathSeparator, caseInsensitive); };

    // define function to transition to verbatim matching (which makes only sense when not in any of the "Append…"-states)
    const auto transitionToVerbatimMatching = [&] {
        switch (state) {
        case AppendRangeLower:
        case AppendRangeUpper:
        case AppendAlternative:
            break;
        default:
            state = MatchVerbatimly;
        }
    };

    // try to match each character of the glob against a character in the path
match:
    while (globIter != globEnd) {
        // decide what to do next depending on the current glob pattern character and state transitioning the state accordingly
#if (QT_VERSION >= QT_VERSION_CHECK(6, 0, 0))
        static constexpr auto escapeCharacter = QChar('\\');
        static constexpr auto escapeCharacterUnicode = escapeCharacter.unicode();
#else
#define escapeCharacterUnicode '\\'
#define escapeCharacter QChar(escapeCharacterUnicode)
#endif
        switch (state) {
        case Escaping:
            // treat every character as-is in "escaping" state
            // note: The state from before the escape character is restored. It is then treated like the state before any
            //       other non-special character. Otherwise an escaped character following e.g. "?" or "*" would be
            //       processed as another wildcard instead of being matched literally.
            state = escapedState;
            transitionToVerbatimMatching();
            break;
        default:
            // transition state according to special meaning of the current glob pattern character
            switch (globIter->unicode()) {
            case escapeCharacterUnicode:
                if (pathSeparator != escapeCharacter) {
                    // transition into "escaping" state
                    escapedState = state;
                    state = Escaping;
                } else {
                    // treat the escape character as normal character if it is the path separator
                    // quote from Syncthing documentation: Escaped characters are not supported on Windows, where "\" is the path
                    // separator.
                    transitionToVerbatimMatching();
                }
                break;
            case '[':
                state = AppendRangeLower;
                characterRange.clear();
                ++globIter;
                continue;
            case ']':
                switch (state) {
                case AppendRangeLower:
                case AppendRangeUpper:
                    state = MatchRange;
                    break;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case '-':
                switch (state) {
                case AppendRangeLower:
                    state = AppendRangeUpper;
                    ++globIter;
                    continue;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case '{':
                if (state == AppendAlternative) {
                    // skip over a nested "{" like over any other character of the current alternative
                    // note: The iterator must be advanced here (done by the "AppendAlternative" handling below); simply
                    //       continuing the loop would process the same "{" over and over again.
                    break;
                }
                state = AppendAlternative;
                alternatives.clear();
                alternatives.emplace_back(++globIter);
                continue;
            case '}':
                switch (state) {
                case AppendAlternative:
                    alternatives.back().end = globIter;
                    state = MatchAlternatives;
                    break;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case ',':
                switch (state) {
                case AppendAlternative:
                    alternatives.back().end = globIter;
                    alternatives.emplace_back(++globIter);
                    continue;
                default:
                    transitionToVerbatimMatching();
                }
                break;
            case '?':
                // transition into "match any" state
                state = MatchAny;
                break;
            case '*':
                // transition into one of the "match many any" state (depending on current state)
                switch (state) {
                case MatchManyAny:
                    state = MatchManyAnyIncludingDirSep;
                    break;
                default:
                    state = MatchManyAny;
                }
                break;
            default:
                // try to match/append all other non-special characters as-is
                transitionToVerbatimMatching();
            }
        }

        // proceed according to state
        switch (state) {
        case Escaping:
            // proceed with the next character in the glob pattern which will be matched as-is (even if it is special)
            [[fallthrough]];
        case AppendAlternative:
            // just move on to the next character (alternatives are populated completely in the previous switch-case)
            ++globIter;
            break;
        case AppendRangeLower:
            // add the current character in the glob pattern as start of a new range
            characterRange.emplace_back().lowerBound = *globIter++;
            break;
        case AppendRangeUpper:
            // add the current character in the glob pattern as end of a new or the current range
            (characterRange.empty() ? characterRange.emplace_back() : characterRange.back()).upperBound = *globIter++;
            state = AppendRangeLower;
            break;
        case MatchVerbatimly:
            // match the current character in the glob pattern verbatimly against the current character in the path
            if (pathIter != pathEnd && matchSingleChar(*globIter)) {
                ++pathIter;
                handleSingleMatch();
            } else if (inAsterisk
                && (asterisks.back().state == MatchManyAnyIncludingDirSep
                    || (pathIter == pathEnd || (*pathIter != pathSeparator && *pathIter != genericPathSeparator)))) {
                // consider the path character dealt with despite no match if we have just passed an asterisk in the glob pattern
                if (pathIter != pathEnd) {
                    ++pathIter;
                } else {
                    inAsterisk = false;
                }
            } else if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchRange:
            // match the concluded character range in the glob pattern against the current character in the path
            if (pathIter != pathEnd) {
                auto inRange = false;
                for (const auto &bounds : characterRange) {
                    if ((!bounds.upperBound.isNull() && *pathIter >= bounds.lowerBound && *pathIter <= bounds.upperBound)
                        || (bounds.upperBound.isNull() && matchSingleChar(bounds.lowerBound))) {
                        inRange = true;
                        break;
                    }
                }
                if (inRange) {
                    characterRange.clear();
                    state = MatchVerbatimly;
                    ++pathIter;
                    handleSingleMatch();
                    break;
                }
            }
            if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchAlternatives:
            // match the current alternatives as of the current character in the path
            if (pathIter != pathEnd) {
                const auto pathStart = pathIter;
                for (auto &alternative : alternatives) {
                    // match characters in the alternative against the path
                    // note: Special characters like "*" are matched verbatimly. Is that the correct behavior?
                    pathIter = pathStart;
                    for (; alternative.beg != alternative.end && pathIter != pathEnd; ++alternative.beg) {
                        if (*alternative.beg == escapeCharacter) {
                            continue;
                        }
                        if (!matchSingleChar(*alternative.beg)) {
                            break;
                        }
                        ++pathIter;
                    }
                    // go with the first alternative that fully matched
                    // note: What is the correct behavior? Should this be most/least greedy (matching the longest/shortest possible alternative) instead?
                    if (alternative.beg == alternative.end) {
                        alternatives.clear();
                        break;
                    }
                }
                // an empty container signals that one of the alternatives matched (see loop above)
                if (alternatives.empty()) {
                    state = MatchVerbatimly;
                    handleSingleMatch();
                    break;
                }
            }
            if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchAny:
            // allow the current character in the path to be anything but a path separator; otherwise consider it as mismatch as in the case for an exact match
            // note: The end of the path is a mismatch as well because "?" stands for exactly one character. Advancing the
            //       path iterator in that case would also move it past the end of the path.
            if (pathIter != pathEnd && *pathIter != pathSeparator && *pathIter != genericPathSeparator) {
                ++globIter;
                ++pathIter;
            } else if (!handleMismatch()) {
                return false;
            }
            break;
        case MatchManyAny: {
            // take record of the asterisks
            auto &asterisk = asterisks.emplace_back();
            asterisk.pos = ++globIter;
            asterisk.state = MatchManyAny;
            inAsterisk = true;
            break;
        }
        case MatchManyAnyIncludingDirSep: {
            // take record of the second asterisks
            auto &asterisk = asterisks.back();
            asterisk.pos = ++globIter;
            asterisk.state = MatchManyAnyIncludingDirSep;
            break;
        }
        }
    }

    // check whether all characters of the glob have been matched against all characters of the path
    if (globIter == globEnd) {
        // consider the match a success if all characters of the path were matched or the glob ended with a "**"
        if (pathIter == pathEnd || state == MatchManyAnyIncludingDirSep) {
            return true;
        }
        // consider the match a success if the glob ended with a "*" and the remaining path contains no path separator
        if (const auto remainingPath = QStringView(pathIter, pathEnd);
            state == MatchManyAny && !(remainingPath.contains(pathSeparator) || remainingPath.contains(genericPathSeparator))) {
            return true;
        }
        // try again as of the next path segment if the glob fully matched but there are still characters in the path to be matched
        // note: This allows "foo" to match against "foo/foo" even though the glob characters have already been consumed after matching the first path segment.
        if (!matchFromRoot && (*pathIter == pathSeparator || *pathIter == genericPathSeparator)) {
            state = MatchVerbatimly;
            ++pathIter;
            globIter = glob.begin();
            asterisks.clear();
            inAsterisk = false;
            goto match;
        }
    }
    return false;
}
/*!
 * \brief Makes an ignore pattern for \a path with the specified settings.
 * \param path Specifies the path appended as-is after the prefixes.
 * \param ignore Specifies whether the pattern is an ignore pattern; if false, the prefix "!" is added.
 * \param caseInsensitive Specifies whether the prefix "(?i)" is added.
 * \param allowRemovalOnParentDirRemoval Specifies whether the prefix "(?d)" is added.
 * \returns Returns the prefixes (in the order "!", "(?i)", "(?d)") followed by \a path.
 */
QString SyncthingIgnorePattern::forPath(const QString &path, bool ignore, bool caseInsensitive, bool allowRemovalOnParentDirRemoval)
{
    auto res = QString();
    // reserve enough space for all prefixes ("!(?i)(?d)" has 9 characters) and the path
    res.reserve(10 + path.size());
    if (!ignore) {
        res += QChar('!');
    }
    if (caseInsensitive) {
        res += QStringLiteral("(?i)");
    }
    if (allowRemovalOnParentDirRemoval) {
        res += QStringLiteral("(?d)");
    }
    // append and return in separate statements; returning the reference yielded by "+=" would force a copy
    res += path;
    return res;
}
} // namespace Data