Skip to content
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
9371cea
work with cupy arrays and 2d arrays
kadarakos Jul 13, 2022
0909a32
force mypy pass
kadarakos Jul 13, 2022
7627643
addressing comments
kadarakos Jul 15, 2022
dc2f416
return correct shape empty array
kadarakos Jul 15, 2022
6b6de6a
test remap_ids with Ints2d
kadarakos Jul 16, 2022
ac5a893
Update thinc/layers/remap_ids.py
kadarakos Jul 27, 2022
bcc35fe
use numpy array
kadarakos Jul 27, 2022
a56929a
remove cupy import
kadarakos Jul 27, 2022
76fbda4
mini fix
kadarakos Jul 28, 2022
e3ecf84
more strict typing
kadarakos Aug 2, 2022
2983853
Merge branch 'master' of https://github.com/explosion/thinc into rema…
kadarakos Aug 2, 2022
ddc8ae5
adjust test
kadarakos Aug 2, 2022
71a86b6
Update thinc/layers/remap_ids.py
kadarakos Aug 3, 2022
fad1247
remove check
kadarakos Aug 3, 2022
e9abe92
Update thinc/layers/remap_ids.py
kadarakos Aug 3, 2022
d0a2f8b
address reviews
kadarakos Aug 3, 2022
ba96fd6
Update thinc/layers/remap_ids.py
kadarakos Aug 3, 2022
ce6dbe5
simplify casting
kadarakos Aug 3, 2022
3d87100
Update thinc/layers/remap_ids.py
kadarakos Aug 4, 2022
c0fc214
Update thinc/layers/remap_ids.py
kadarakos Aug 4, 2022
b809f4d
remap_ids legacy
kadarakos Aug 5, 2022
40aace6
Merge branch 'remap-ids-fix' of https://github.com/kadarakos/thinc in…
kadarakos Aug 5, 2022
ba730c3
legacy
kadarakos Aug 5, 2022
d0bc261
Merge branch 'master' of https://github.com/explosion/thinc into rema…
kadarakos Aug 5, 2022
fb32a29
test version 1 and 2
kadarakos Aug 5, 2022
cecba3e
rename legacy to v1
kadarakos Aug 8, 2022
38ce16e
adding old test back
kadarakos Aug 8, 2022
ecbd06e
remap_ids docs update
kadarakos Aug 11, 2022
7f90739
Update website/docs/api-layers.md
kadarakos Aug 16, 2022
6d38a15
Update website/docs/api-layers.md
kadarakos Aug 16, 2022
eb9572f
make init/forward attribute setting more clear
kadarakos Aug 17, 2022
0c82bae
merge
kadarakos Aug 17, 2022
e2425b1
Update website/docs/api-layers.md
kadarakos Aug 19, 2022
7cceacc
Update website/docs/api-layers.md
kadarakos Aug 19, 2022
b9fd349
Update website/docs/api-layers.md
kadarakos Aug 19, 2022
e8241dc
prettier
kadarakos Aug 31, 2022
c2f4edd
merge
kadarakos Aug 31, 2022
33e7f20
update model type
kadarakos Sep 1, 2022
ea930c1
prettier
kadarakos Sep 2, 2022
67f90a4
Use new _v2 instead of renamed _v1
adrianeboyd Sep 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 67 additions & 9 deletions thinc/layers/remap_ids.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,37 @@
from typing import Tuple, Callable, Sequence, Dict, Any
from typing import Tuple, Callable, Sequence, cast
from typing import Dict, Union, Optional, Hashable, Any

from ..model import Model
from ..config import registry
from ..types import Ints2d, DTypes
from ..types import Ints1d, Ints2d, DTypes
from ..util import is_xp_array, to_numpy


InT = Sequence[Any]
InT = Union[Sequence[Hashable], Ints1d, Ints2d]
OutT = Ints2d

InT_v1 = Sequence[Any]
OutT_v1 = Ints2d


@registry.layers("remap_ids.v1")
def remap_ids_v1(
    mapping_table: Optional[Dict[Any, int]] = None,
    default: int = 0,
    dtype: DTypes = "i",
) -> Model[InT_v1, OutT_v1]:
    """Remap string or integer inputs using a mapping table, usually as a
    preprocess before embeddings. The mapping table can be passed in on input,
    or updated after the layer has been created. The mapping table is stored in
    the "mapping_table" attribute.

    mapping_table: The table to store on the layer. When omitted, a new empty
        dict is created for this layer.
    default: Stored in the "default" attribute.
    dtype: Stored in the "dtype" attribute.
    RETURNS: A "remap_ids" model whose forward function is forward_v1.
    """
    # Build the empty table per call. A `{}` default in the signature is
    # created once and would be shared by every layer constructed without an
    # explicit table, so updating one layer's "mapping_table" attribute would
    # silently change all the others.
    if mapping_table is None:
        mapping_table = {}
    return Model(
        "remap_ids",
        forward_v1,
        attrs={"mapping_table": mapping_table, "dtype": dtype, "default": default},
    )


def forward(
model: Model[InT, OutT], inputs: InT, is_train: bool
def forward_v1(
model: Model[InT_v1, OutT_v1], inputs: InT_v1, is_train: bool
) -> Tuple[OutT, Callable]:
table = model.attrs["mapping_table"]
default = model.attrs["default"]
Expand All @@ -35,7 +40,60 @@ def forward(
arr = model.ops.asarray2i(values, dtype=dtype)
output = model.ops.reshape2i(arr, -1, 1)

def backprop(dY: OutT) -> InT:
def backprop(dY: OutT_v1) -> InT:
return []

return output, backprop


@registry.layers("remap_ids.v2")
def remap_ids(
    mapping_table: Optional[Union[Dict[int, int], Dict[str, int]]] = None,
    default: int = 0,
    *,
    column: Optional[int] = None
) -> Model[InT, OutT]:
    """Build a layer that looks up string or integer inputs in a mapping
    table, typically as a preprocessing step ahead of an embedding layer.

    The table may be supplied here or assigned later through the layer's
    "mapping_table" attribute. When the input is a two dimensional array,
    'column' selects the column whose values are remapped, which makes the
    layer usable together with spaCy's FeatureExtractor.

    mapping_table: Stored in the "mapping_table" attribute.
    default: Stored in the "default" attribute.
    column: Stored in the "column" attribute.
    RETURNS: A "remap_ids" model whose forward function is forward.
    """
    layer_attrs = {
        "mapping_table": mapping_table,
        "default": default,
        "column": column,
    }
    return Model("remap_ids", forward, attrs=layer_attrs)


def forward(
    model: Model[InT, OutT], inputs: InT, is_train: bool
) -> Tuple[OutT, Callable]:
    """Look up every input in the layer's mapping table and return the
    results as a single-column integer array, plus a backprop callback.

    model: The layer; reads its "mapping_table", "default" and "column" attrs.
    inputs: A sequence of keys, or an array. For an array with "column" set,
        only that column is remapped.
    is_train: Not used by this layer.
    RETURNS: The remapped IDs reshaped to (-1, 1), and a callback returning an
        empty array of dY's shape for array inputs or an empty list otherwise.
    RAISES: ValueError if the "mapping_table" attribute is None.
    """
    table = model.attrs["mapping_table"]
    if table is None:
        raise ValueError("'mapping table' not set")
    default = model.attrs["default"]
    column = model.attrs["column"]
    # Remember the input kind: backprop mirrors it in what it returns.
    xp_input = is_xp_array(inputs)
    if xp_input:
        selected = inputs if column is None else cast(Ints2d, inputs)[:, column]
        # Move to host memory so the elements can be used as dict keys.
        idx = to_numpy(selected)
    else:
        idx = inputs
    remapped = [table.get(key, default) for key in idx]
    flat = model.ops.asarray2i(remapped, dtype="i")
    output = model.ops.reshape2i(flat, -1, 1)

    def backprop(dY: OutT) -> InT:
        # Only a placeholder matching the input kind is returned.
        return model.ops.xp.empty(dY.shape) if xp_input else []  # type: ignore

    return output, backprop
3 changes: 2 additions & 1 deletion thinc/tests/layers/test_layers_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def assert_data_match(Y, out_data):
# ("CauchySimilarity.v1", {}, (array2d, array2d), array1d),
("ParametricAttention.v1", {}, ragged, ragged),
("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d),
("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint)
("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint),
("remap_ids.v2", {"mapping_table": {}, "column": 1}, numpy.array([[1, 2, 3], [4, 5, 6]]).T, array2dint)
# fmt: on
]

Expand Down
16 changes: 10 additions & 6 deletions website/docs/api-layers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1267,21 +1267,25 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/padded2list.py

<inline-list>

- **Input:** <tt>Sequence[Any]</tt>
- **Input:** <tt>Union[Sequence[Hashable], Ints1d, Ints2d]</tt>
- **Output:** <ndarray>Ints2d</ndarray>

</inline-list>

Remap string or integer inputs using a mapping table, usually as a preprocess
before embeddings. The mapping table can be passed in on input, or updated after
the layer has been created. The mapping table is stored in the `"mapping_table"`
attribute.
Remap a sequence of strings, integers or other Hashable inputs using a
mapping table, usually as a preprocessing step before embeddings. The
input can also be a two dimensional integer array in which case the
`column: int` attribute tells the `remap_ids` layer which column of the
array to map. For 2D inputs the `column` has to be set on initialization.
The mapping table can be set at initialization, but can also be passed in on input.
When provided on initialization, the mapping table is stored in the `"mapping_table"`
attribute.

| Argument | Type | Description |
| --------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------ |
| `mapping_table` | <tt>Dict[Any, int]</tt> | The mapping table to use. Can also be set after initialization by writing to `model.attrs["mapping_table"]`. |
| `default` | <tt>int</tt> | The default value if the input does not have an entry in the mapping table. |
| `dtype` | <tt>DTypes</tt> | The data type of the array. |
| `column` | <tt>int</tt> | The column to apply the mapper to in case of 2D input. |
| **RETURNS** | <tt>Model[Sequence[Any], Ints2d]</tt> | The layer to compute the transformation. |

```python
Expand Down