Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions stwfsapy/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse import csr_array
from stwfsapy import thesaurus as t
from stwfsapy.automata import nfa, construction, conversion, dfa
from stwfsapy.thesaurus_features import ThesaurusFeatureTransformation
Expand Down Expand Up @@ -291,7 +291,7 @@ def _fit_after_init(self, X, y=None):
self.pipeline_.fit(matches, y=train_y)
return self

def predict_proba(self, X) -> csr_matrix:
def predict_proba(self, X) -> csr_array:
"""
Predicts probability scores for each concept per document.

Expand All @@ -305,7 +305,7 @@ def predict_proba(self, X) -> csr_matrix:
predictions = self.pipeline_.predict_proba(match_X)[:, 1]
else:
predictions = []
return self._create_sparse_matrix(
return self._create_sparse_array(
predictions,
[tpl[0] for tpl in match_X],
doc_counts
Expand Down Expand Up @@ -344,7 +344,7 @@ def suggest_proba(
in combined
]

def predict(self, X) -> csr_matrix:
def predict(self, X) -> csr_array:
"""
Predicts binary concept match labels for each input text.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that, in the docstring of the predict() method, we can replace "A sparse matrix of shape ..." with "A sparse array of shape ..." for the sake of consistency.

Expand All @@ -358,19 +358,19 @@ def predict(self, X) -> csr_matrix:
predictions = self.pipeline_.predict(match_X)
else:
predictions = []
return self._create_sparse_matrix(
return self._create_sparse_array(
predictions,
[tpl[0] for tpl in match_X],
doc_counts
)

def _create_sparse_matrix(
def _create_sparse_array(
self,
values: Nl,
concept_names: List[str],
doc_counts: List[int]
) -> csr_matrix:
return csr_matrix(
) -> csr_array:
return csr_array(
(
values,
(
Expand Down Expand Up @@ -427,7 +427,7 @@ def match_and_extend(
for inp, truth_refs in zip(inputs, map(str, truth_refss)):
text = input_handler(inp)
if self.use_txt_vec:
txt_vec = self.text_vectorizer_.transform([inp])[0]
txt_vec = self.text_vectorizer_.transform([inp])
else:
txt_vec = 0
txt_feat = self.text_features_.transform([text])[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why, here (line 433) and in line 458, we access index 0 of the result when the transform method is applied? I see that this was changed for the text_vectorizer attribute, i.e. index 0 is no longer accessed. Should it also be changed for the text_features attribute, or does that one return a different data structure?

Copy link
Contributor

@gmmajal gmmajal Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked at what the transform methods of text_vectorizer and text_features are doing. The one for text_vectorizer returns a sparse matrix, whereas the one for text_features returns a NumPy array. So the data structures are indeed different.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The text vectorizer comes from scikit-learn and produces a csr_matrix, so we can’t switch its output to a sparse array (sparray) at this point.

Expand All @@ -452,7 +452,7 @@ def match_and_extend(
for inp in inputs:
text = input_handler(inp)
if self.use_txt_vec:
txt_vec = self.text_vectorizer_.transform([inp])[0]
txt_vec = self.text_vectorizer_.transform([inp])
else:
txt_vec = 0
txt_feat = self.text_features_.transform([text])[0]
Expand Down
26 changes: 13 additions & 13 deletions stwfsapy/tests/predictor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
# limitations under the License.

from stwfsapy.text_features import mk_text_features
from scipy.sparse import lil_matrix
from scipy.sparse import lil_array
from stwfsapy import predictor as p
import stwfsapy.thesaurus as t
from stwfsapy.automata.dfa import Dfa
import stwfsapy.tests.common as c
from stwfsapy.automata.construction import ConstructionState
import pytest
from scipy.sparse import csr_matrix
from scipy.sparse import csr_array
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
Expand Down Expand Up @@ -62,8 +62,8 @@
("9", [0], 0), ("11", [0, 1], 1)]


def make_test_result_matrix(values):
return csr_matrix((
def make_test_result_array(values):
return csr_array((
values,
(
[
Expand Down Expand Up @@ -113,7 +113,7 @@ def no_match_predictor(mocker):


def mock_vec_transform(X):
ret = lil_matrix((len(X), 5000))
ret = lil_array((len(X), 5000))
for idx, x in enumerate(X):
ret[idx] = len(x)
return ret
Expand Down Expand Up @@ -145,20 +145,20 @@ def test_result_collection():
assert [(r[0], list(r[1])) for r in res] == _collection_result


def test_sparse_matrix_creation():
def test_sparse_array_creation():
predictor = p.StwfsapyPredictor(None, None, None, None)
predictor.concept_map_ = _concept_map
res = predictor._create_sparse_matrix(
res = predictor._create_sparse_array(
_predictions[:, 1],
[c[0] for c in _concepts_with_text],
_doc_counts
)
assert res.shape[0] == len(_doc_counts)
assert res.shape[1] == 23
for i, count in enumerate(_doc_counts):
row = res.getrow(i)
row = res[[i], :]
slice_start = sum(_doc_counts[:i])
assert row.getnnz() == count
assert row.nnz == count
# reverse slices because of mapping.
assert list(row.nonzero()[1]) == list(reversed([
22-i for i in _concepts[slice_start: slice_start+count]]))
Expand Down Expand Up @@ -418,7 +418,7 @@ def check_fit_arg(vec_fun, text_feature_fun, txt, actual, expected):
def test_predict(mocked_predictor):
res = mocked_predictor.predict([])
assert (
res.toarray() == make_test_result_matrix(_classifications).toarray()
res.toarray() == make_test_result_array(_classifications).toarray()
).all()
mocked_predictor.match_and_extend.assert_called_once_with(
[]
Expand All @@ -431,7 +431,7 @@ def test_predict(mocked_predictor):
def test_predict_proba(mocked_predictor):
res = mocked_predictor.predict_proba([])
assert (
res.toarray() == make_test_result_matrix(
res.toarray() == make_test_result_array(
_predictions[:, 1]).toarray()).all()
mocked_predictor.match_and_extend.assert_called_once_with(
[]
Expand All @@ -458,13 +458,13 @@ def test_suggest(mocked_predictor):

def test_predict_no_match(no_match_predictor):
res = no_match_predictor.predict([])
assert res.getnnz() == 0
assert res.nnz == 0
assert res.shape == (3, len(_concept_map))


def test_predict_proba_no_match(no_match_predictor):
res = no_match_predictor.predict_proba([])
assert res.getnnz() == 0
assert res.nnz == 0
assert res.shape == (3, len(_concept_map))


Expand Down
30 changes: 15 additions & 15 deletions stwfsapy/tests/thesaurus_features_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from stwfsapy import thesaurus_features as tf
from stwfsapy.tests.thesaurus import common as tc
from stwfsapy.tests import common as c
from scipy.sparse import coo_matrix, csr_matrix
from scipy.sparse import coo_array, csr_array
from sklearn.exceptions import NotFittedError
import pytest

Expand All @@ -46,7 +46,7 @@ def test_unfitted_raises():
def test_transform():
trans = tf.ThesaurusFeatureTransformation(None, None, None, None)
trans.mapping_ = {
'a': coo_matrix([[1]]), 'b': coo_matrix([[2]]), 'c': coo_matrix([[3]])}
'a': coo_array([[1]]), 'b': coo_array([[2]]), 'c': coo_array([[3]])}
res = trans.transform(['c', 'c', 'a'])
assert (res.toarray() == array([[3], [3], [1]])).all()

Expand All @@ -70,15 +70,15 @@ def test_fit(full_graph):
assert x.shape[1] == 6
# Can not test positions because retrieval from graph is not deterministic.
# Therefore, test non zero entries only.
assert mapping[c.test_concept_uri_0_0].getnnz() == 1
assert mapping[c.test_concept_uri_01_0].getnnz() == 2
assert mapping[c.test_concept_uri_01_00].getnnz() == 2
assert mapping[c.test_concept_uri_10_0].getnnz() == 2
assert mapping[c.test_concept_uri_10_1].getnnz() == 2
assert mapping[c.test_concept_uri_100_0].getnnz() == 3
assert mapping[c.test_concept_uri_100_00].getnnz() == 3
assert mapping[c.test_concept_uri_100_01].getnnz() == 3
assert mapping[c.test_concept_uri_100_02].getnnz() == 3
assert mapping[c.test_concept_uri_0_0].nnz == 1
assert mapping[c.test_concept_uri_01_0].nnz == 2
assert mapping[c.test_concept_uri_01_00].nnz == 2
assert mapping[c.test_concept_uri_10_0].nnz == 2
assert mapping[c.test_concept_uri_10_1].nnz == 2
assert mapping[c.test_concept_uri_100_0].nnz == 3
assert mapping[c.test_concept_uri_100_00].nnz == 3
assert mapping[c.test_concept_uri_100_01].nnz == 3
assert mapping[c.test_concept_uri_100_02].nnz == 3


def test_transform_unknown():
Expand All @@ -90,14 +90,14 @@ def test_transform_unknown():

feature_dim = 12
trans.feature_dim_ = feature_dim
known = csr_matrix(([1], ([0], [4])), shape=(1, feature_dim))
known = csr_array(([1], ([0], [4])), shape=(1, feature_dim))
trans.mapping_ = {'key': known}
random_results = trans.transform([
'some random stuff edsfysdfhjsedf',
'key'])
assert random_results.shape == (2, feature_dim)
assert random_results.getrow(0).getnnz() == 0
assert random_results.getrow(1).getnnz() == 1
assert random_results[[0], :].nnz == 0
assert random_results[[1], :].nnz == 1


def test_empty_relation(full_graph):
Expand All @@ -110,4 +110,4 @@ def test_empty_relation(full_graph):
trans.fit([], [])
features = trans.transform(['empty'])
assert features.shape == (1, 1)
assert features.getnnz() == 0
assert features.nnz == 0
6 changes: 3 additions & 3 deletions stwfsapy/tests/util/passthrough_transformer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def test_array_input():

def test_sparse_input():
in_feat = [
sp.lil_matrix(np.array([[1, 0, 0]])),
sp.lil_matrix(np.array([[0, 7, 0]])),
sp.lil_matrix(np.array([[0, 0, -3]]))
sp.lil_array(np.array([[1, 0, 0]])),
sp.lil_array(np.array([[0, 7, 0]])),
sp.lil_array(np.array([[0, 0, -3]]))
]
pt = PassthroughTransformer()
out_feat = pt.transform(in_feat)
Expand Down
8 changes: 4 additions & 4 deletions stwfsapy/thesaurus_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from typing import Set, Iterable, Tuple, DefaultDict
import rdflib
from scipy.sparse import csr_matrix, vstack
from scipy.sparse import csr_array, vstack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from stwfsapy import thesaurus as t
Expand Down Expand Up @@ -68,7 +68,7 @@ def fit(self, X=None, y=None, **kwargs):
}
self.feature_dim_ = max(len(thesaurus_indices), 1)
self.mapping_ = {
str(concept): csr_matrix(
str(concept): csr_array(
(
[1 for _ in thesaurii],
(
Expand All @@ -91,13 +91,13 @@ def _transform_single(self, x):
try:
res = self.mapping_[x]
except KeyError:
res = csr_matrix(
res = csr_array(
([], ([], [])),
shape=(1, self.feature_dim_)
)
return res

def transform(self, X) -> csr_matrix:
def transform(self, X) -> csr_array:
if self.mapping_ is None:
raise NotFittedError
return vstack([self._transform_single(x) for x in X])
Expand Down