diff --git a/examples/bloom-embeddings.ipynb b/examples/bloom-embeddings.ipynb new file mode 100644 index 000000000..dadc0366a --- /dev/null +++ b/examples/bloom-embeddings.ipynb @@ -0,0 +1,727 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 3527, + "status": "ok", + "timestamp": 1648197799884, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "ssfd1qsSxtRS", + "outputId": "7c6e6585-2362-4be2-da05-db00a0307fe6", + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install mmh3 numpy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Bloom embeddings algorithm\n", + "\n", + "In a normal embedding table, each word-string is mapped to a distinct ID.\n", + "Usually these IDs will be sequential, so if you have a vocabulary of 100 words,\n", + "your words will be mapped to numbers `range(100)`. The sequential IDs can then\n", + "be used as indices into an embedding table: if you have 100 words in your\n", + "vocabulary, you have 100 rows in the table, and each word receives its own\n", + "vector.\n", + "\n", + "However, there's no limit to the number of unique words that might occur in a\n", + "sample of text, while we definitely want a limited number of rows in our\n", + "embedding table. Some of the rows in our table will therefore need to be shared\n", + "between multiple words in our vocabulary. One obvious solution is to set aside a\n", + "single vector in the table. Words 0-98 will each receive their own vector, while\n", + "all other words are assigned to vector 99.\n", + "\n", + "However, this asks vector 99 to do a lot of work. What if we gave more vectors\n", + "to the unknown words?" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "executionInfo": { + "elapsed": 10, + "status": "ok", + "timestamp": 1648197799885, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "Eb895XpR-VUB" + }, + "outputs": [], + "source": [ + "def get_row(word_id, number_vector=100, number_oov=10):\n", + " if word_id < (number_vector - number_oov):\n", + " return word_id\n", + " else:\n", + " return number_vector + (word_id % number_oov)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This gives the model a little more resolution for the unknown words. If all\n", + "out-of-vocabulary words are assigned the same vector, then they'll all look\n", + "identical to the model. Even if the training data actually includes information\n", + "that shows two different out-of-vocabulary words have important, different\n", + "implications -- for instance, if one word is a strong indicator of positive\n", + "sentiment, while the other is a strong indicator of negative sentiment -- the\n", + "model won't be able to tell them apart. 
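A quick, contrived check with the `get_row` function above makes this concrete\n", + "-- the word IDs below are made up for illustration, all that matters is that\n", + "they fall outside the known-word range:\n", + "\n", + "```python\n", + "# Hypothetical IDs for three out-of-vocabulary words.\n", + "oov_a, oov_b, oov_c = 154, 163, 164\n", + "\n", + "# With 10 buckets, the row is decided by word_id % 10: `oov_a` and `oov_b`\n", + "# happen to land in different rows, while `oov_c` collides with `oov_a`.\n", + "assert get_row(oov_a) != get_row(oov_b)\n", + "assert get_row(oov_a) == get_row(oov_c)\n", + "```\n", + "\n", + "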
However, if we have 10 buckets for the\n", + "unknown words, we might get lucky, and assign these words to different buckets.\n", + "If so, the model would be able to learn that one of the unknown-word vectors\n", + "makes positive sentiment more likely, while the other vector makes negative\n", + "sentiment more likely.\n", + "\n", + "If this is good, then why not do more of it? Bloom embeddings are like an\n", + "extreme version, where _every_ word is handled like the unknown words above:\n", + "there are 100 vectors for the \"unknown\" portion, and 0 for the \"known\" portion.\n", + "\n", + "So far, this approach seems weird, but not necessarily good. The part that makes\n", + "it unfairly effective is the next step: by simply doing the same thing multiple\n", + "times, we can greatly improve the resolution, and have unique representations\n", + "for far more words than we have vectors. The code in full:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "executionInfo": { + "elapsed": 8, + "status": "ok", + "timestamp": 1648197799885, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "tTkqM3EixhWM" + }, + "outputs": [], + "source": [ + "import numpy\n", + "import mmh3\n", + "\n", + "def allocate(n_vectors, n_dimensions):\n", + " table = numpy.zeros((n_vectors, n_dimensions), dtype='f')\n", + " table += numpy.random.uniform(-0.1, 0.1, table.size).reshape(table.shape)\n", + " return table\n", + "\n", + "def get_vector(table, word):\n", + " hash1 = mmh3.hash(word, seed=0)\n", + " hash2 = mmh3.hash(word, seed=1)\n", + " row1 = hash1 % table.shape[0]\n", + " row2 = hash2 % table.shape[0]\n", + " return table[row1] + table[row2]\n", + "\n", + "def update_vector(table, word, d_vector):\n", + " hash1 = mmh3.hash(word, seed=0)\n", + " hash2 = mmh3.hash(word, seed=1)\n", + " row1 = hash1 % table.shape[0]\n", + " row2 = hash2 % table.shape[0]\n", + " table[row1] -= 0.001 * d_vector\n", + " table[row2] -= 0.001 * d_vector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we've used two keys, assigned from two random hash functions.\n", + "It's unlikely that two words will collide on both keys, so by simply summing the\n", + "vectors together, we'll assign most words a unique representation.\n", + "\n", + "For the sake of illustration, let's step through a very small example,\n", + "explicitly.\n", + "\n", + "Let's say we have this vocabulary of 20 words:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "executionInfo": { + "elapsed": 8, + "status": "ok", + "timestamp": 1648197799885, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "QMaz-mr9xjPG" + }, + "outputs": [], + "source": [ + "vocab = ['apple', 'strawberry', 'orange', 'juice', 'drink', 'smoothie',\n", + " 'eat', 'fruit', 'health', 'wellness', 'steak', 'fries', 'ketchup',\n", + " 'burger', 'chips', 'lobster', 'caviar', 'service', 'waiter', 'chef']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll embed these into two dimensions. Normally this would give us a table of\n", + "`(20, 2)` floats, which we would randomly initialise. 
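Using the `allocate` helper defined a moment ago, that normal table is just\n", + "`allocate(len(vocab), 2)` -- a quick shape check, purely for illustration (the\n", + "next cell builds the same kind of table inline with `numpy.random.uniform`):\n", + "\n", + "```python\n", + "normal_table = allocate(len(vocab), 2)  # one row per vocabulary item\n", + "assert normal_table.shape == (20, 2)\n", + "```\n", + "\n", + "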
With the hashing trick, we\n", + "can make the table smaller. Let's give it 15 vectors:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "executionInfo": { + "elapsed": 8, + "status": "ok", + "timestamp": 1648197799886, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "LNg60lvqxkmP" + }, + "outputs": [], + "source": [ + "normal_embed = numpy.random.uniform(-0.1, 0.1, (20, 2))\n", + "hashed_embed = numpy.random.uniform(-0.1, 0.1, (15, 2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the normal table, we want to map each word in our vocabulary to its own\n", + "vector:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "executionInfo": { + "elapsed": 5, + "status": "ok", + "timestamp": 1648197801914, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "9wFC_iR_xlBH" + }, + "outputs": [], + "source": [ + "word2id = {}\n", + "def get_normal_vector(word, table):\n", + " if word not in word2id.keys():\n", + " word2id[word] = len(word2id)\n", + " return table[word2id[word]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The hashed table only has 15 rows, so some words will have to share. We'll\n", + "handle this by mapping the word into an arbitrary integer – called a \"hash\n", + "value\". The hash function will return an arbitrary integer, which we'll mod into\n", + "the range `(0, 15)`. Importantly, we need to be able to compute _multiple,\n", + "distinct_ hash values for each key – so Python's built-in hash function is\n", + "inconvenient. We'll therefore use MurmurHash.\n", + "\n", + "Let's see what keys we get for our 20 vocabulary items, using MurmurHash:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "executionInfo": { + "elapsed": 4, + "status": "ok", + "timestamp": 1648197804508, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "Gs69d2KRxmg9" + }, + "outputs": [], + "source": [ + "hashes1 = [mmh3.hash(w, 1) % 15 for w in vocab]\n", + "assert hashes1 == [3, 6, 4, 13, 8, 3, 13, 1, 9, 12, 11, 4, 2, 13, 5, 10, 0, 2, 10, 13]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, some keys are shared between multiple words, while 2/15 keys are\n", + "unoccupied. This is obviously unideal! If multiple words have the same key,\n", + "they'll map to the same vector – as far as the model is concerned, \"strawberry\"\n", + "and \"heart\" will be indistinguishable. It won't be clear which word was used –\n", + "they have the same representation.\n", + "\n", + "To address this, we simply hash the words again, this time using a different\n", + "seed – so that we get a different set of arbitrary keys:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "executionInfo": { + "elapsed": 3, + "status": "ok", + "timestamp": 1648197804508, + "user": { + "displayName": "Vincent D. 
Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "acpOxkljynPo" + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "hashes2 = [mmh3.hash(w, 2) % 15 for w in vocab]\n", + "assert len(Counter(hashes2).most_common()) == 12" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This one's even worse – 3 keys unoccupied! But our strategy is not to keep drawing until we get a favorable seed. Instead, consider this:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "executionInfo": { + "elapsed": 3, + "status": "ok", + "timestamp": 1648197805024, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "W7tfxLQBytWP" + }, + "outputs": [], + "source": [ + "assert len(Counter(zip(hashes1, hashes2))) == 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By combining the results from the two hashes, our 20 words distribute perfectly,\n", + "into 20 unique combinations. This makes sense: we expect to have some words\n", + "overlapping on one of the keys, but we'd have to be very unlucky for a pair of\n", + "words to overlap on _both_ keys.\n", + "\n", + "This means that if we simply add the two vectors together, each word once more\n", + "has a unique representation:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 2, + "status": "ok", + "timestamp": 1648197805764, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "wI5yayZWyxVP", + "outputId": "4f62b77d-709f-483b-a0bb-4e5fbe68d5fe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "apple -0.033 -0.012\n", + "strawberry -0.023 -0.037\n", + "orange 0.158 -0.031\n", + "juice -0.045 0.139\n", + "drink 0.024 0.030\n", + "smoothie 0.121 0.076\n", + "eat -0.093 0.153\n", + "fruit 0.083 0.052\n", + "health 0.064 -0.046\n", + "wellness 0.143 0.112\n", + "steak 0.011 -0.097\n", + "fries 0.036 0.041\n", + "ketchup 0.081 0.029\n", + "burger -0.045 0.139\n", + "chips -0.118 -0.090\n", + "lobster 0.016 -0.107\n", + "caviar -0.033 -0.012\n", + "service 0.081 0.029\n", + "waiter 0.179 -0.038\n", + "chef -0.047 0.062\n" + ] + } + ], + "source": [ + "for word in vocab:\n", + " key1 = mmh3.hash(word, 0) % 15\n", + " key2 = mmh3.hash(word, 1) % 15\n", + " vector = hashed_embed[key1] + hashed_embed[key2]\n", + " print(word, '%.3f %.3f' % tuple(vector))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now have a function that maps our 20 words to 20 unique vectors – but we're\n", + "storing weights for only 15 vectors in memory. Now the question is: will we be\n", + "able to find values for these weights that let us actually map words to useful\n", + "vectors?\n", + "\n", + "Let's do a quick experiment to see how this works. We'll assign \"true\" values\n", + "for our little vocabulary, and see how well we can approximate them with our\n", + "compressed table. 
To get the \"true\" values, we _could_ put the \"science\" in data\n", + "science, and drag the words around into reasonable-looking clusters. But for our\n", + "purposes, the actual \"true\" values don't matter. We'll therefore just do a\n", + "simulation: we'll assign random vectors as the \"true\" state, and see if we can\n", + "learn values for the hash embeddings that match them.\n", + "\n", + "The learning procedure will be a simple stochastic gradient descent:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "background_save": true + }, + "executionInfo": { + "elapsed": 3, + "status": "aborted", + "timestamp": 1648199186370, + "user": { + "displayName": "Vincent D. Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "ET4n9AA5y0fX" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "499 43.47128495286\n" + ] + } + ], + "source": [ + "import numpy\n", + "import numpy.random as random\n", + "import mmh3\n", + "\n", + "random.seed(0)\n", + "nb_epoch = 500\n", + "learn_rate = 0.001\n", + "nr_hash_vector = 1000\n", + "\n", + "words = [str(i) for i in range(2000)]\n", + "true_vectors = numpy.random.uniform(-0.1, 0.1, (len(words), 10))\n", + "hash_vectors = numpy.random.uniform(-0.1, 0.1, (nr_hash_vector, 10))\n", + "examples = list(zip(words, true_vectors))\n", + "\n", + "for epoch in range(nb_epoch):\n", + " random.shuffle(examples)\n", + " loss=0.\n", + " for word, truth in examples:\n", + " key1 = mmh3.hash(word, 0) % nr_hash_vector\n", + " key2 = mmh3.hash(word, 1) % nr_hash_vector\n", + " hash_vector = hash_vectors[key1] + hash_vectors[key2]\n", + " diff = hash_vector - truth\n", + " hash_vectors[key1] -= learn_rate * diff\n", + " hash_vectors[key2] -= learn_rate * diff\n", + " loss += (diff**2).sum()\n", + "print(epoch, loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's worth taking some time to play with this simulation. You can start by doing\n", + "some sanity checks:\n", + "\n", + "- How does the loss change with `nr_hash_vector`?\n", + "- If you remove `key2`, does the loss go up?\n", + "- What happens if you add more hash keys?\n", + "- What happens as the vocabulary size increases?\n", + "- What happens when more dimensions are added?\n", + "- How sensitive are the hash embeddings to the initial conditions? If we change the random seed, do we ever get unlucky?\n", + "\n", + "If you play with the simulation for a while, you'll start to get a good feel for\n", + "the dynamics, and hopefully you'll have a clear idea of why the technique works." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TuRoY34yQb0v", + "tags": [] + }, + "source": [ + "## Bonus Section \n", + "\n", + "To make it easier for folks to try out a whole bunch of settings we'd added a little bit of code below that makes it easier to get relevant visuals." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 2919, + "status": "ok", + "timestamp": 1648200042349, + "user": { + "displayName": "Vincent D. 
Warmerdam", + "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gh4KYzhhhK0YDTnAQsUIaQPw-0dKIP-kLBID7nFdQ=s64", + "userId": "05641618555626735638" + }, + "user_tz": -60 + }, + "id": "NPVKX_pbXJYs", + "outputId": "fc046666-d690-426d-b8a7-dc557f12832d", + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install altair pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "background_save": true + }, + "id": "nHd1wo6m1q-J" + }, + "outputs": [], + "source": [ + "from functools import reduce \n", + "\n", + "\n", + "def calc_losses(epochs=500, seed=0, learn_rate=0.001, nr_hash_vector=1000, n_hash=3, n_words=1000, size_vector=10):\n", + " random.seed(seed)\n", + " nb_epoch = epochs\n", + " learn_rate = learn_rate\n", + " nr_hash_vector = nr_hash_vector\n", + "\n", + " words = [str(i) for i in range(n_words)]\n", + " true_vectors = numpy.random.uniform(-0.1, 0.1, (len(words), size_vector))\n", + " hash_vectors = numpy.random.uniform(-0.1, 0.1, (nr_hash_vector, size_vector))\n", + " examples = list(zip(words, true_vectors))\n", + "\n", + " losses = []\n", + " for epoch in range(nb_epoch):\n", + " random.shuffle(examples)\n", + " loss=0.\n", + " for word, truth in examples:\n", + " keys = [mmh3.hash(word, k) % nr_hash_vector for k in range(n_hash)]\n", + " hash_vector = reduce(lambda a, b: a + b, [hash_vectors[k] for k in keys])\n", + " diff = hash_vector - truth\n", + " for key in keys:\n", + " hash_vectors[key] -= learn_rate * diff\n", + " loss += (diff**2).sum()\n", + " losses.append(loss)\n", + " return losses\n", + "\n", + "data = []\n", + "for n_hash in [1, 2, 3, 4, 5]:\n", + " losses = calc_losses(nr_hash_vector=2_000, n_words=10_000, n_hash=n_hash, epochs=150)\n", + " data = data + [{\"loss\": l, \"nr_hash_vector\": nr_hash_vector, \"n_hash\": str(n_hash), \"epoch\": e} for e, l in enumerate(losses)]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "background_save": true + }, + "id": "P0Q0k9bjXMm3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import altair as alt\n", + "\n", + "source = pd.DataFrame(data)\n", + "\n", + "(alt.Chart(source)\n", + " .mark_line()\n", + " .encode(x='epoch', y='loss', color='n_hash')\n", + " .properties(width=600, height=250)\n", + " .interactive())" + ] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyPAXtr/TeMWYmJkxrXcAPIT", + "collapsed_sections": [], + "name": "the-hashing-trick.ipynb", + "version": "" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}