Patch summary: getUniqueIPsPerUser() now records each user vector at most once per IP vector (strict in_array check), and a regression test covers data sets consisting only of repeated unique entries (GitHub issue #860).
diff --git a/lib/Service/NegativeSampleGenerator.php b/lib/Service/NegativeSampleGenerator.php index 3705fc37..150e4af8 100644 --- a/lib/Service/NegativeSampleGenerator.php +++ b/lib/Service/NegativeSampleGenerator.php @@ -38,6 +38,10 @@ use function str_split; class NegativeSampleGenerator { + /** + * Get IP vectors exclusively used by one user. + * Includes the user vector in the second dimension of the returned array. + */ private function getUniqueIPsPerUser(Dataset $positives): array { $map = []; @@ -51,7 +55,7 @@ private function getUniqueIPsPerUser(Dataset $positives): array { $map[$ipVecStr] = [ $uidVecStr, ]; - } else { + } elseif (!in_array($uidVecStr, $map[$ipVecStr], true)) { $map[$ipVecStr][] = $uidVecStr; } } diff --git a/tests/Unit/Service/NegativeSampleGeneratorTest.php b/tests/Unit/Service/NegativeSampleGeneratorTest.php index 76fa1d3e..d8701f80 100644 --- a/tests/Unit/Service/NegativeSampleGeneratorTest.php +++ b/tests/Unit/Service/NegativeSampleGeneratorTest.php @@ -144,6 +144,37 @@ public function testGenerateMultipleShuffledFromLimitedUnique(): void { self::assertCount(5, $result); } + /** + * DataSet can consist of multiple unique entries only. If not handled correctly, + * this will result in an array without any IP. This tests the + * correct handling. See GitHub issue #860 for more. 
+ * @return void + */ + public function testGenerateMultipleShuffledFromUniquesOnly(): void { + $positives = new Unlabeled([ + array_merge(self::decToBitArray(1, 16), self::decToBitArray(1, 32)), + array_merge(self::decToBitArray(1, 16), self::decToBitArray(1, 32)), + array_merge(self::decToBitArray(1, 16), self::decToBitArray(1, 32)), + + array_merge(self::decToBitArray(2, 16), self::decToBitArray(2, 32)), + array_merge(self::decToBitArray(2, 16), self::decToBitArray(2, 32)), + array_merge(self::decToBitArray(2, 16), self::decToBitArray(2, 32)), + ]); + + $result = $this->generator->generateShuffledFromPositiveSamples($positives, 2); + + self::assertCount(2, $result); + foreach ($result as $sample) { + $ipVec = array_slice($sample, 16, 32); + + self::assertTrue( + $ipVec === self::decToBitArray(1, 32) || + $ipVec === self::decToBitArray(2, 32), + 'Sample has an unique IP' + ); + } + } + /** * @return int[] */