hostile-takeover/server/badwords.cpp

#include "server/badwords.h"
#include "mpshared/misc.h"

namespace boost {
template <class T>
T next(T x) { return ++x; }
template <class T>
T prev(T x) { return --x; }
} // namespace boost

namespace wi {

// Number of wildcard chars in an entry, for example s**t has 2.
const int kcWildcardsEntryMax = 2;

BadWords::BadWords(const std::string& filename) : watcher_(filename),
        on_(true) {
    watcher_.SignalOnFileUpdated.connect(this, &BadWords::OnFileUpdated);
}

std::map<std::string, std::vector<std::string> > BadWords::ParsePairs(
        const std::string& s) {
    // Example: @pairs/f:ph/o:a,e,i,u/o:y
    std::vector<std::string> parts = Split(s, '/');
    std::map<std::string, std::vector<std::string> > pairs;
    if (parts.size() == 0) {
        return pairs;
    }
    std::vector<std::string>::const_iterator it = parts.begin();
    for (; it != parts.end(); it++) {
        std::vector<std::string> key_values = Split(*it, ':');
        if (key_values.size() != 2) {
            continue;
        }
        // Get the values for this key. Include the key itself, since
        // the value list will be enumerated for word permutation.
        std::vector<std::string> values = Split(key_values[1], ',');
        values.push_back(key_values[0]);

        if (values.size() == 0) {
            continue;
        }
        std::map<std::string, std::vector<std::string> >::iterator it =
                pairs.find(key_values[0]);
        if (it == pairs.end()) {
            pairs.insert(std::map<std::string, std::vector<std::string> >
                    ::value_type(key_values[0], values));
        } else {
            it->second.insert(it->second.end(), values.begin(), values.end());
        }
    }
    return pairs;
}

void BadWords::OnFileUpdated(ThreadedFileWatcher *watcher) {
    RLOG() << watcher_.filename() << " has changed!";
    FILE *f = fopen(watcher_.filename().c_str(), "r");
    if (f == NULL) {
        return;
    }

    std::map<std::string, std::vector<std::string> > pairs;
    nodes_.clear();
    nodes_.push_back(MatchNode());
    bool standalone = false;
    bool wildcards = true;

    char sz[256];
    while (fgets(sz, sizeof(sz), f) != NULL) {
        int cch;
        const char *start = StripWhitespace(sz, &cch);
        if (cch == 0) {
            continue;
        }
        std::string s(start, cch);
        if (s[0] == '#') {
            continue;
        }
        if (s.find("@pairs") == 0) {
            pairs = ParsePairs(s);
            continue;
        }
        if (s.find("@standalone") == 0) {
            std::vector<std::string> parts = Split(s.c_str(), '/');
            if (parts.size() == 2) {
                if (parts[1].compare("true") == 0) {
                    standalone = true;
                }
                if (parts[1].compare("false") == 0) {
                    standalone = false;
                }
            }
            continue;
        }
        if (s.find("@wildcards") == 0) {
            std::vector<std::string> parts = Split(s.c_str(), '/');
            if (parts.size() == 2) {
                if (parts[1].compare("true") == 0) {
                    wildcards = true;
                }
                if (parts[1].compare("false") == 0) {
                    wildcards = false;
                }
            }
            continue;
        }
        AddPermutations(s, pairs, standalone, wildcards);
    }
    fclose(f);
}

void BadWords::AddPermutations(const std::string& word,
            const std::map<std::string, std::vector<std::string> >& pairs,
            bool standalone, bool wildcards) {
    // Split the word into pieces. Remember the order of the split values.
    std::vector<std::string> parts;
    std::vector<const std::vector<std::string> *> values;
    const std::vector<std::string> *splitter_values;
    const char *next = word.c_str();
    while (true) {
        const char *last = next;
        next = NULL;
        const char *splitter = NULL;
        std::map<std::string, std::vector<std::string> >::const_iterator it =
                pairs.begin();
        for (; it != pairs.end(); it++) {
            const char *pos = strstr(last, it->first.c_str());
            if (pos != NULL && (next == NULL || pos < next)) {
                next = pos;
                splitter = it->first.c_str();
                splitter_values = &it->second;
            }
        }
        if (next == NULL) {
            parts.push_back(last);
            break;
        }
        parts.push_back(std::string(last, next - last));
        next += strlen(splitter);
        values.push_back(splitter_values);
    }

    // Perform a recusion based permutation of each value at each split point.
    // Cycle through the split points right to left, so the left side
    // is the inner loop. That way, right sides can re-use portions of
    // the node graph that are the same.
    if (values.size() != 0) {
        Permute(parts[parts.size() - 1], parts, values, values.size() - 1,
                standalone, wildcards);
    } else {
        AddWord(parts[0], standalone, wildcards);
    }
}

void BadWords::Permute(const std::string& suffix,
        const std::vector<std::string>& parts,
        const std::vector<const std::vector<std::string> *>& values,
        int index, bool standalone, bool wildcards) {
    for (int i = 0; i < values[index]->size(); i++) {
        std::string word = parts[index] + (*values[index])[i] + suffix;
        if (index > 0) {
            Permute(word, parts, values, index - 1, standalone, wildcards);
        } else {
            AddWord(word, standalone, wildcards);
        }
    }
}

void BadWords::AddWord(const std::string& word, bool standalone,
        bool wildcards) {
    // Add this word to the graph, and expand the graph as necessary.
    int current_index = 0;
    for (int i = 0; i < word.size(); i++) {
        char ch = word[i];
        if (ch >= 'A' && ch <= 'Z') {
            ch += 'a' - 'A';
        }
        if (ch < 'a' || ch > 'z') {
            continue;
        }
        int char_index = ch - 'a';
        dword *next_index = &nodes_[current_index].indexes[char_index];
        if (i == word.size() - 1)  {
            dword ff = kfMatMatch;
            if (wildcards) {
                ff |= kfMatWildcards;
            }
            if (standalone) {
                ff |= kfMatStandalone;
            }
            *next_index |= ff;
            continue;
        }
        if ((*next_index & kfMatIndex) != 0) {
            current_index = (*next_index & kfMatIndex);
            continue;
        }
        *next_index |= (nodes_.size() & kfMatIndex);
        current_index = nodes_.size();
        nodes_.push_back(MatchNode());
    }
    //LOG() << "added: " << word << " total nodes:" << nodes_.size();
}

std::vector<std::string> BadWords::Split(const std::string& s, char ch) {
    std::vector<std::string> parts;
    const char *next = s.c_str();
    while (true) {
        const char *last = next;
        next = strchr(next, ch);
        if (next == NULL) {
            parts.push_back(std::string(last));
            return parts;
        }
        parts.push_back(std::string(last, next - last));
        next += 1;
    }
}

bool BadWords::IsStandalone(const char *input, int start, int end) {
    if (start > 0) {
        char ch = input[start - 1];
        if (ch >= 'A' && ch <= 'Z') {
            ch += 'a' - 'A';
        }
        if (ch >= 'a' && ch <= 'z') {
            return false;
        }
    }
    if (input[end] != 0 && input[end + 1] != 0) {
        char ch = input[end + 1];
        if (ch >= 'A' && ch <= 'Z') {
            ch += 'a' - 'A';
        }
        if (ch >= 'a' && ch <= 'z') {
            return false;
        }
    }
    return true;
}

bool BadWords::IsMatch(const char *input, const MatchEntry& entry) {
    // Can't have more wildcard replacements than not
    if (entry.wildcard_replacements >= entry.char_count) {
        return false;
    }

    // Check for standalone if asked
    if (entry.standalone && !IsStandalone(input, entry.start, entry.end)) {
        return false;
    }

    // Don't allow spaces in matches that have wildcards of either type
    // (filler or replacement)
    if (entry.wildcard_count != 0 && entry.last_space >= entry.start &&
            entry.last_space <= entry.end) {
        return false;
    }
    return true;
}

const char *BadWords::Filter(const char *input, int *cch_back) {
    // Filter only if turned on (on is the default).
    if (cch_back != NULL) {
        *cch_back = -1;
    }
    if (!on_ || nodes_.size() == 0) {
        strncpyz(buffer_, input, sizeof(buffer_));
        return buffer_;
    }

    // Traverse the match graph, looking for matches. Handle non-alpha
    // wildcards and insertions, like f*k vs. fu*k.

    std::list<Match> matches;
    std::list<MatchEntry> progress;

    const char *pch = input;
    for (; *pch != 0; pch++) {
        // Ignore whitespace, but track it since it will cancel some matches
        char ch = *pch;

        // Treat newlines specially. This marks the boundary between
        // entered chat. This is tracked.
        if (ch == '\n') {
            std::list<MatchEntry>::iterator it = progress.begin();
            for (; it != progress.end(); it++) {
                it->has_newline = true;
                it->char_index_last = -1;
            }
            continue;
        }

        if (isspace(ch)) {
            std::list<MatchEntry>::iterator it = progress.begin();
            while (it != progress.end()) {
                it->last_space = pch - input;
                if (it->match) {
                    it->char_index_last = -1;
                }
                it++;
            }
            continue;
        }

        if (ch >= 'A' && ch <= 'Z') {
            ch += 'a' - 'A';
        }

        // If it is a letter, match against the match graph.
        if (ch >= 'a' && ch <= 'z') {
            int char_index = ch - 'a';

            // Examine in-progress matches.
            std::list<MatchEntry>::iterator it = progress.begin();
            while (it != progress.end()) {
                // Match a trailing repeated char.
                MatchEntry& entry = *it;
                if (entry.match) {
                    if (entry.char_index_last == char_index) {
                        entry.end = pch - input;
                        if (*(pch + 1) != 0) {
                            it++;
                            continue;
                        }
                    }
                    if (IsMatch(input, entry)) {
                        matches.push_back(Match(entry.start, entry.end));
                    }
                    it = progress.erase(it);
                    continue;
                }
                entry.char_count++;

                // Not matched yet. Is it a match now?
                dword *next_index =
                        &nodes_[entry.current_index].indexes[char_index];
                if (*next_index & kfMatMatch) {
                    if ((!entry.has_newline && entry.wildcard_replacements == 0)
                            || (*next_index & kfMatWildcards) != 0) {
                        // Add a new entry that is the match, ahead of the
                        // iterator, so it gets processed even if this is the
                        // last char.
                        MatchEntry new_entry = entry;
                        new_entry.end = pch - input;
                        new_entry.char_index_last = char_index;
                        new_entry.standalone =
                                (*next_index & kfMatStandalone) != 0;
                        new_entry.standalone |= (new_entry.last_space != -1);
                        new_entry.standalone |=
                                ((new_entry.wildcard_count) != 0);
                        new_entry.match = true;
                        progress.insert(boost::next(it), new_entry);
                    }
                }

                // Follow this char_index to the next node, if there one
                if (*next_index & kfMatIndex) {
                    entry.end = pch - input;
                    entry.current_index = (*next_index & kfMatIndex);
                    entry.char_index_last = char_index;
                    it++;
                    continue;
                }
                // No next node; allow the entry to persist if it is letter
                // doubling.
                if (entry.char_index_last == char_index) {
                    entry.end = pch - input;
                    entry.char_count--;
                    it++;
                    continue;
                }

                // Remove the entry
                it = progress.erase(it);
            }

            // Start a new match, if there is one
            dword node_index = nodes_[0].indexes[char_index] & kfMatIndex;
            if (node_index != 0) {
                MatchEntry entry(pch - input, pch - input, node_index,
                        char_index);
                entry.char_count = 1;
                progress.push_back(entry);
            }
            continue;
        }

        // Not a-z, handle skip and insertion. Insertion is easy, just
        // preserve existing MatchEntrys. Skips require wildcarding:
        // clone existing MatchEntrys, and step them for each next node.
        // Add new match entries for all first char matches.

        std::list<MatchEntry>::iterator it = progress.begin();
        int next_indexes[26];
        int index_count;

        // Start new wildcard match entries only if this is a whitespace
        // boundary. Otherwise it is easy to cause an explosion of match
        // entries by simply typing "***************".

        if (pch == input || isspace(*(pch - 1))) {
            index_count = GetNextCharIndexes(0, next_indexes);
            for (int i = 0; i < index_count; i++) {
                // Push to the front so these entries aren't enumerated again
                dword *next_index = &nodes_[0].indexes[next_indexes[i]];
                MatchEntry entry(pch - input, pch - input,
                        (*next_index & kfMatIndex), -1);
                entry.wildcard_count = 1;
                entry.wildcard_replacements = 1;
                progress.push_front(entry);
            }
        }

        // Handle wildcarding in existing entries
        while (it != progress.end()) {
            // Don't span existing matched entries over wildcard chars.
            MatchEntry& entry = *it;
            if (entry.match) {
                if (IsMatch(input, entry)) {
                    matches.push_back(Match(entry.start, entry.end));
                }
                it = progress.erase(it);
                continue;
            }
            entry.wildcard_count++;

            // Enumerate next nodes, and make new stepped MatchEntrys for each.
            index_count = GetNextCharIndexes(entry.current_index, next_indexes);
            for (int i = 0; i < index_count; i++) {
                dword *next_index =
                        &nodes_[entry.current_index].indexes[next_indexes[i]];
                // Only match if there is a trailing whitespace boundary
                if (*(pch + 1) == 0 || isspace(*(pch + 1))) {
                    if ((*next_index & (kfMatMatch | kfMatWildcards)) ==
                            (kfMatMatch | kfMatWildcards)) {
                        // And there is more string to be evaled
                        if (*(pch + 1) != 0) {
                            // Add a floating MatchEntry, in order to suck up
                            // repeated chars of the match.
                            MatchEntry new_entry = entry;
                            new_entry.end = pch - input;

                            // Standalone, because since there is a wildcard,
                            // the entry is by definition standalone only.
                            new_entry.standalone = true;
                            new_entry.match = true;
                            new_entry.wildcard_replacements++;
                            progress.insert(it, new_entry);
                        } else {
                            // There is a wildcard, so by definition, it is
                            // standalone only.
                            MatchEntry new_entry = entry;
                            new_entry.standalone = true;
                            new_entry.end = pch - input;
                            new_entry.wildcard_replacements++;
                            if (IsMatch(input, new_entry)) {
                                matches.push_back(Match(new_entry.start,
                                        new_entry.end));
                            }
                        }
                    }
                }

                // If there is a child at this char (already know there is)
                if (*next_index & kfMatIndex) {
                    // And there has been kcWildcardsEntryMax wildcards only
                    if (entry.wildcard_count <= kcWildcardsEntryMax) {
                        // Then add a new match entry to track this path.
                        MatchEntry new_entry = entry;
                        new_entry.current_index = (*next_index & kfMatIndex);
                        new_entry.end = pch - input;
                        new_entry.wildcard_replacements++;
                        progress.insert(it, new_entry);
                    }
                }
            }
            it++;
        }
    }

    if (matches.size() > 1) {
        // See if the matches overlap; if so clean them up.
        bool overlap = false;
        std::list<Match>::iterator it = matches.begin();
        for (; it != matches.end(); it++) {
            std::list<Match>::iterator it_next = boost::next(it);
            if (it_next != matches.end()) {
                if ((*it).end >= (*it_next).start) {
                    overlap = true;
                    break;
                }
            }
        }
        if (overlap) {
            CleanupMatches(matches);
        }
    }

    // Find the earliest starting entry that spans the end.
    // If this overlaps a match, start from the match.
    if (cch_back != NULL) {
        int earliest = -1;
        std::list<MatchEntry>::iterator it = progress.begin();
        for (; it != progress.end(); it++) {
            if (earliest == -1 || (*it).start < earliest) {
                earliest = (*it).start;
            }
        }
        if (earliest != -1) {
            std::list<Match>::iterator itm = matches.begin();
            for (; itm != matches.end(); itm++) {
                if (itm->end >= earliest) {
                    earliest = itm->start;
                }
            }
            *cch_back = (pch - input) - earliest;
        }
    }

    // Any matches? If not, return the original line
    if (matches.size() == 0) {
        strncpyz(buffer_, input, sizeof(buffer_));
        return buffer_;
    }

    // Finally, build the new string.
    return BuildResult(input, matches);
}

int BadWords::GetNextCharIndexes(int current_index, int *next_indexes) {
    int count = 0;
    for (int i = 0; i < 26; i++) {
        if (nodes_[current_index].indexes[i] != 0) {
            *next_indexes++ = i;
            count++;
        }
    }
    return count;
}

const char *BadWords::BuildResult(const char *input,
        std::list<Match>& matches) {
    std::ostringstream s;
    const char *in = input;
    int cch = strlen(input);
    std::list<Match>::iterator it = matches.begin();
    for (; it != matches.end(); it++) {
        const Match& match = *it;
        s << std::string(in, match.start - (in - input));
        s << std::string(match.end - match.start + 1, '*');
        in = &input[match.end + 1];
    }
    s << in;
    strncpyz(buffer_, s.str().c_str(), sizeof(buffer_));
    return buffer_;
}

bool CompareMatches(const Match& match_a, const Match& match_b) {
    // Sort so that earlier starting, and longer matches are first
    if (match_a.start < match_b.start) {
        return true;
    }
    if (match_a.start > match_b.start) {
        return false;
    }
    // But between matches that start at the same index, sort matches
    // that are longer first.
    if (match_a.end > match_b.end) {
        return true;
    }
    return false;
}

void BadWords::CleanupMatches(std::list<Match>& matches) {
    // Clean up the match list. First, remove smaller matches that are
    // inside larger matches.

    // Sort so that earlier and longer matches are first
    matches.sort(CompareMatches);

    // Remove matches that overlap with other matches. This is easy since
    // the matches have been sorted.
    std::list<Match>::iterator it_i = matches.begin();
    while (it_i != matches.end()) {
        std::list<Match>::iterator it_j = boost::next(it_i);
        while (it_j != matches.end()) {
            if ((*it_i).end >= (*it_j).start) {
                it_j = matches.erase(it_j);
                continue;
            }
            it_j++;
        }
        it_i++;
    }
}

} // namespace wi