diff --git a/simhash/__init__.py b/simhash/__init__.py
index 4fd83be..a217e29 100644
--- a/simhash/__init__.py
+++ b/simhash/__init__.py
@@ -168,6 +168,42 @@ def distance(self, another):
         return ans
 
 
+class MultiSimhash(Simhash):
+    """Simhash whose fingerprint concatenates several simhashes.
+
+    ``f`` is the sum of the inputs' ``f`` and the value is the inputs'
+    fingerprints joined in the order given. The hash function is taken
+    from the first input.
+    """
+
+    def __init__(self, simhashes):
+        """Build the combined simhash.
+
+        :param simhashes: non-empty iterable of objects exposing ``value``,
+            ``f``, ``f_bytes`` and ``hashfunc`` (i.e. simhashes).
+        :raises Exception: if ``simhashes`` is not iterable, is empty, or
+            the combined length is not a multiple of 8.
+        """
+        if not isinstance(simhashes, Iterable):
+            raise Exception('Value passed is not a list of simhashes')
+        # Materialize once: the input is walked twice and indexed, which
+        # a generator cannot support.
+        simhashes = list(simhashes)
+        if not simhashes:
+            raise Exception('At least one simhash is required')
+        multi_f = sum(obj.f for obj in simhashes)
+        if multi_f % 8:
+            raise Exception('Combined length (f) is not a multiple of 8')
+        multi_value = self._concatenate_simhashes(simhashes)
+        super(MultiSimhash, self).__init__(
+            value=multi_value, f=multi_f, hashfunc=simhashes[0].hashfunc)
+
+    def _concatenate_simhashes(self, objs):
+        """Return the fingerprints joined, in order, as one int."""
+        digests = [int_to_bytes(obj.value, obj.f_bytes) for obj in objs]
+        return bytes_to_int(b''.join(digests))
+
+
 class SimhashIndex(object):
 
     def __init__(self, objs, f=64, k=2, log=None):
diff --git a/tests/test_simhash.py b/tests/test_simhash.py
index 47fccd8..17812a3 100644
--- a/tests/test_simhash.py
+++ b/tests/test_simhash.py
@@ -4,7 +4,7 @@
 
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from simhash import Simhash, SimhashIndex
+from simhash import MultiSimhash, Simhash, SimhashIndex
 
 
 class TestSimhash(TestCase):
@@ -146,6 +146,33 @@ def test_get_near_dup(self):
         dups = self.index.get_near_dups(s1)
         self.assertEqual(3, len(dups))
 
+
+class TestMultiSimhash(TestCase):
+    def test_creation(self):
+        # A member that is not a simhash must make construction fail.
+        a = Simhash('My name is John')
+        b = 1
+        self.assertRaises(Exception, MultiSimhash, [a, b])
+
+        # Two 32-bit fingerprints concatenate into one 64-bit fingerprint.
+        a = Simhash(0xaaaaaaaa, f=32)
+        b = Simhash(0xbbbbbbbb, f=32)
+        ms = MultiSimhash([a, b])
+        d = Simhash(0xaaaaaaaabbbbbbbb, f=64)
+        self.assertEqual(ms.value, d.value)
+        self.assertEqual(ms.f, d.f)
+
+    def test_accepts_any_iterable(self):
+        a = Simhash(0xaaaaaaaa, f=32)
+        b = Simhash(0xbbbbbbbb, f=32)
+        ms = MultiSimhash(iter([a, b]))
+        d = Simhash(0xaaaaaaaabbbbbbbb, f=64)
+        self.assertEqual(ms.value, d.value)
+        self.assertEqual(ms.f, d.f)
+
+    def test_empty_input_rejected(self):
+        self.assertRaises(Exception, MultiSimhash, [])
+
 
 if __name__ == '__main__':
     main()