@@ -19,6 +19,7 @@ const double NORMAL_PEAK = 580;
1919const double NORMAL_BOOST = 0.03 ;
2020const double SMALL_THRESHOLD = 350 ;
2121const double SMALL_BOOST = 0.0228 ;
22+ const int MIN_LONGEST_RUN = 200 ; // two fingerprints sharing a contiguous run of at least this many equal elements are treated as similar even when classify() rejects the pair
2223
2324struct SigmoidClassifier {
2425 const double lo;
@@ -650,6 +651,20 @@ struct DSU {
650651 }
651652};
652653
/// Length of the longest contiguous run of equal elements shared by `a` and `b`
/// (i.e. the longest common *substring*, not subsequence).
///
/// @param a first sequence
/// @param b second sequence
/// @return the largest k such that some k consecutive elements of `a` equal
///         some k consecutive elements of `b`; 0 when either input is empty
///         or the inputs have no element in common.
///
/// Runs in O(|a| * |b|) time and O(|b|) extra space. The previous version
/// restarted a scan from every (i, j) pair, which degrades to cubic time on
/// inputs that share long runs.
int longest_run(const std::vector<int>& a, const std::vector<int>& b) {
    if (a.empty() || b.empty()) {
        return 0;
    }

    // prev[j + 1] / curr[j + 1] hold the length of the common run that ENDS at
    // the previous / current element of `a` and at b[j]. Slot 0 is a permanent
    // zero so that the j == 0 column needs no special case.
    std::vector<int> prev(b.size() + 1, 0);
    std::vector<int> curr(b.size() + 1, 0);

    int result = 0;
    for (const int value : a) {
        for (std::size_t j = 0; j < b.size(); ++j) {
            // A match extends the run ending one step back in both
            // sequences; a mismatch breaks the run.
            curr[j + 1] = (value == b[j]) ? prev[j] + 1 : 0;
            result = std::max(result, curr[j + 1]);
        }
        // The row just filled becomes the "previous" row for the next element.
        prev.swap(curr);
    }
    return result;
}
667+
653668int main () {
654669 Tokens special_tokens = {
655670 VALUE_TOKEN,
@@ -717,7 +732,8 @@ int main() {
717732 auto dist = levenshtein_distance (i->fingerprint , j->fingerprint );
718733 auto size = (double ) (i->fingerprint .size () + j->fingerprint .size ()) / 2 ;
719734 auto ratio = dist / size;
720- bool similar = classify (size, ratio);
735+ bool similar = classify (size, ratio) ||
736+ longest_run (i->fingerprint , j->fingerprint ) >= MIN_LONGEST_RUN;
721737 if (similar) {
722738 ans.unite (i->id , j->id );
723739 }
0 commit comments