- 01
- 02
- 03
- 04
- 05
- 06
- 07
- 08
- 09
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <unicode/brkiter.h>
#include <unicode/coll.h>
#include <unicode/sortkey.h>
#include <unicode/unistr.h>
/// @brief Reports whether any character of @p word occurs more than once.
///
/// The word is split at the boundaries produced by an ICU character
/// BreakIterator for the default locale. Each segment is turned into a
/// collation key by a collator of SECONDARY strength, and two segments count
/// as "the same character" when their keys compare equal.
///
/// @param word Text to inspect; it is not modified.
/// @return true as soon as a segment's key matches one seen earlier,
///         false if every segment's key is distinct (including empty input).
/// @throws int The value 42 when any ICU call reports a failure status.
bool hasRepeatingCharacters(const icu::UnicodeString &word)
{
    const icu::Locale locale = icu::Locale::getDefault();
    UErrorCode status = U_ZERO_ERROR;

    std::unique_ptr<icu::BreakIterator> it{icu::BreakIterator::createCharacterInstance(locale, status)};
    if (U_FAILURE(status)) throw 42;
    it->setText(word);

    // Use the same locale as the break iterator instead of the implicit default.
    std::unique_ptr<icu::Collator> collator{icu::Collator::createInstance(locale, status)};
    if (U_FAILURE(status)) throw 42;
    collator->setStrength(icu::Collator::SECONDARY);

    // Strict weak ordering over collation keys, needed by std::set.
    auto less = [](const icu::CollationKey &k1, const icu::CollationKey &k2) {
        UErrorCode cmpStatus = U_ZERO_ERROR;
        const bool isLess = k1.compareTo(k2, cmpStatus) == UCOL_LESS;
        if (U_FAILURE(cmpStatus)) throw 42;
        return isLess;
    };
    std::set<icu::CollationKey, decltype(less)> seen(less);

    // Walk consecutive boundary pairs [start, end); the loop ends when the
    // iterator runs out of boundaries.
    int32_t start = it->first();
    for (int32_t end = it->next(); end != icu::BreakIterator::DONE; start = end, end = it->next()) {
        icu::CollationKey key;
        collator->getCollationKey(word.tempSubStringBetween(start, end), key, status);
        if (U_FAILURE(status)) throw 42;

        // A single insert doubles as the membership test: .second is false
        // when an equivalent key is already present.
        if (!seen.insert(key).second)
            return true;
    }
    return false;
}
int main()
{
icu::UnicodeString words(u8"Example english Боб мир כוכב 民主主義語こんにちは", "utf-8");
icu::Locale locale = icu::Locale::getDefault();
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<icu::BreakIterator> it{icu::BreakIterator::createWordInstance(locale, status)};
if (U_FAILURE(status)) throw 42;
it->setText(words);
int32_t p = it->first();
while (p != icu::BreakIterator::DONE) {
int32_t q = it->next();
if (q == icu::BreakIterator::DONE)
break;
if (it->getRuleStatus() != UBRK_WORD_NONE)
{
icu::UnicodeString word{words.tempSubStringBetween(p, q)};
bool hasRepeats = hasRepeatingCharacters(word);
std::string wordUtf8;
word.toUTF8String(wordUtf8);
std::cout << (hasRepeats ? "Has repeats: " : "No repeats: ") << wordUtf8 << std::endl;
}
p = q;
}
return 0;
}