diff --git a/stwfsapy/predictor.py b/stwfsapy/predictor.py index 2f7b44a..7241f0f 100644 --- a/stwfsapy/predictor.py +++ b/stwfsapy/predictor.py @@ -29,7 +29,7 @@ from sklearn.compose import ColumnTransformer from sklearn.tree import DecisionTreeClassifier from sklearn.feature_extraction.text import TfidfVectorizer -from scipy.sparse import csr_matrix +from scipy.sparse import csr_array from stwfsapy import thesaurus as t from stwfsapy.automata import nfa, construction, conversion, dfa from stwfsapy.thesaurus_features import ThesaurusFeatureTransformation @@ -291,7 +291,7 @@ def _fit_after_init(self, X, y=None): self.pipeline_.fit(matches, y=train_y) return self - def predict_proba(self, X) -> csr_matrix: + def predict_proba(self, X) -> csr_array: """ Predicts probability scores for each concept per document. @@ -305,7 +305,7 @@ def predict_proba(self, X) -> csr_matrix: predictions = self.pipeline_.predict_proba(match_X)[:, 1] else: predictions = [] - return self._create_sparse_matrix( + return self._create_sparse_array( predictions, [tpl[0] for tpl in match_X], doc_counts @@ -344,7 +344,7 @@ def suggest_proba( in combined ] - def predict(self, X) -> csr_matrix: + def predict(self, X) -> csr_array: """ Predicts binary concept match labels for each input text. @@ -358,19 +358,19 @@ def predict(self, X) -> csr_matrix: predictions = self.pipeline_.predict(match_X) else: predictions = [] - return self._create_sparse_matrix( + return self._create_sparse_array( predictions, [tpl[0] for tpl in match_X], doc_counts ) - def _create_sparse_matrix( + def _create_sparse_array( self, values: Nl, concept_names: List[str], doc_counts: List[int] - ) -> csr_matrix: - return csr_matrix( + ) -> csr_array: + return csr_array( ( values, ( @@ -427,7 +427,7 @@ def match_and_extend( for inp, truth_refs in zip(inputs, map(str, truth_refss)): text = input_handler(inp) if self.use_txt_vec: - txt_vec = self.text_vectorizer_.transform([inp])[0] + txt_vec = self.text_vectorizer_.transform([inp]) else: txt_vec = 0 txt_feat = self.text_features_.transform([text])[0] @@ -452,7 +452,7 @@ def match_and_extend( for inp in inputs: text = input_handler(inp) if self.use_txt_vec: - txt_vec = self.text_vectorizer_.transform([inp])[0] + txt_vec = self.text_vectorizer_.transform([inp]) else: txt_vec = 0 txt_feat = self.text_features_.transform([text])[0] diff --git a/stwfsapy/tests/predictor_test.py b/stwfsapy/tests/predictor_test.py index d10bca1..1837e35 100644 --- a/stwfsapy/tests/predictor_test.py +++ b/stwfsapy/tests/predictor_test.py @@ -13,14 +13,14 @@ # limitations under the License. from stwfsapy.text_features import mk_text_features -from scipy.sparse import lil_matrix +from scipy.sparse import lil_array from stwfsapy import predictor as p import stwfsapy.thesaurus as t from stwfsapy.automata.dfa import Dfa import stwfsapy.tests.common as c from stwfsapy.automata.construction import ConstructionState import pytest -from scipy.sparse import csr_matrix +from scipy.sparse import csr_array import numpy as np from sklearn.tree import DecisionTreeClassifier from sklearn.compose import ColumnTransformer @@ -62,8 +62,8 @@ ("9", [0], 0), ("11", [0, 1], 1)] -def make_test_result_matrix(values): - return csr_matrix(( +def make_test_result_array(values): + return csr_array(( values, ( [ @@ -113,7 +113,7 @@ def no_match_predictor(mocker): def mock_vec_transform(X): - ret = lil_matrix((len(X), 5000)) + ret = lil_array((len(X), 5000)) for idx, x in enumerate(X): ret[idx] = len(x) return ret @@ -145,10 +145,10 @@ def test_result_collection(): assert [(r[0], list(r[1])) for r in res] == _collection_result -def test_sparse_matrix_creation(): +def test_sparse_array_creation(): predictor = p.StwfsapyPredictor(None, None, None, None) predictor.concept_map_ = _concept_map - res = predictor._create_sparse_matrix( + res = predictor._create_sparse_array( _predictions[:, 1], [c[0] for c in _concepts_with_text], _doc_counts @@ -156,9 +156,9 @@ def test_sparse_matrix_creation(): assert res.shape[0] == len(_doc_counts) assert res.shape[1] == 23 for i, count in enumerate(_doc_counts): - row = res.getrow(i) + row = res[[i], :] slice_start = sum(_doc_counts[:i]) - assert row.getnnz() == count + assert row.nnz == count # reverse slices because of mapping. assert list(row.nonzero()[1]) == list(reversed([ 22-i for i in _concepts[slice_start: slice_start+count]])) @@ -418,7 +418,7 @@ def check_fit_arg(vec_fun, text_feature_fun, txt, actual, expected): def test_predict(mocked_predictor): res = mocked_predictor.predict([]) assert ( - res.toarray() == make_test_result_matrix(_classifications).toarray() + res.toarray() == make_test_result_array(_classifications).toarray() ).all() mocked_predictor.match_and_extend.assert_called_once_with( [] @@ -431,7 +431,7 @@ def test_predict(mocked_predictor): def test_predict_proba(mocked_predictor): res = mocked_predictor.predict_proba([]) assert ( - res.toarray() == make_test_result_matrix( + res.toarray() == make_test_result_array( _predictions[:, 1]).toarray()).all() mocked_predictor.match_and_extend.assert_called_once_with( [] @@ -458,13 +458,13 @@ def test_suggest(mocked_predictor): def test_predict_no_match(no_match_predictor): res = no_match_predictor.predict([]) - assert res.getnnz() == 0 + assert res.nnz == 0 assert res.shape == (3, len(_concept_map)) def test_predict_proba_no_match(no_match_predictor): res = no_match_predictor.predict_proba([]) - assert res.getnnz() == 0 + assert res.nnz == 0 assert res.shape == (3, len(_concept_map)) diff --git a/stwfsapy/tests/thesaurus_features_test.py b/stwfsapy/tests/thesaurus_features_test.py index cf64f0d..79eef24 100644 --- a/stwfsapy/tests/thesaurus_features_test.py +++ b/stwfsapy/tests/thesaurus_features_test.py @@ -19,7 +19,7 @@ from stwfsapy import thesaurus_features as tf from stwfsapy.tests.thesaurus import common as tc from stwfsapy.tests import common as c -from scipy.sparse import coo_matrix, csr_matrix +from scipy.sparse import coo_array, csr_array from sklearn.exceptions import NotFittedError import pytest @@ -46,7 +46,7 @@ def test_unfitted_raises(): def test_transform(): trans = tf.ThesaurusFeatureTransformation(None, None, None, None) trans.mapping_ = { - 'a': coo_matrix([[1]]), 'b': coo_matrix([[2]]), 'c': coo_matrix([[3]])} + 'a': coo_array([[1]]), 'b': coo_array([[2]]), 'c': coo_array([[3]])} res = trans.transform(['c', 'c', 'a']) assert (res.toarray() == array([[3], [3], [1]])).all() @@ -70,15 +70,15 @@ def test_fit(full_graph): assert x.shape[1] == 6 # Can not test positions because retrieval from graph is not deterministic. # Therefore, test non zero entries only. - assert mapping[c.test_concept_uri_0_0].getnnz() == 1 - assert mapping[c.test_concept_uri_01_0].getnnz() == 2 - assert mapping[c.test_concept_uri_01_00].getnnz() == 2 - assert mapping[c.test_concept_uri_10_0].getnnz() == 2 - assert mapping[c.test_concept_uri_10_1].getnnz() == 2 - assert mapping[c.test_concept_uri_100_0].getnnz() == 3 - assert mapping[c.test_concept_uri_100_00].getnnz() == 3 - assert mapping[c.test_concept_uri_100_01].getnnz() == 3 - assert mapping[c.test_concept_uri_100_02].getnnz() == 3 + assert mapping[c.test_concept_uri_0_0].nnz == 1 + assert mapping[c.test_concept_uri_01_0].nnz == 2 + assert mapping[c.test_concept_uri_01_00].nnz == 2 + assert mapping[c.test_concept_uri_10_0].nnz == 2 + assert mapping[c.test_concept_uri_10_1].nnz == 2 + assert mapping[c.test_concept_uri_100_0].nnz == 3 + assert mapping[c.test_concept_uri_100_00].nnz == 3 + assert mapping[c.test_concept_uri_100_01].nnz == 3 + assert mapping[c.test_concept_uri_100_02].nnz == 3 def test_transform_unknown(): @@ -90,14 +90,14 @@ def test_transform_unknown(): feature_dim = 12 trans.feature_dim_ = feature_dim - known = csr_matrix(([1], ([0], [4])), shape=(1, feature_dim)) + known = csr_array(([1], ([0], [4])), shape=(1, feature_dim)) trans.mapping_ = {'key': known} random_results = trans.transform([ 'some random stuff edsfysdfhjsedf', 'key']) assert random_results.shape == (2, feature_dim) - assert random_results.getrow(0).getnnz() == 0 - assert random_results.getrow(1).getnnz() == 1 + assert random_results[[0], :].nnz == 0 + assert random_results[[1], :].nnz == 1 def test_empty_relation(full_graph): @@ -110,4 +110,4 @@ def test_empty_relation(full_graph): trans.fit([], []) features = trans.transform(['empty']) assert features.shape == (1, 1) - assert features.getnnz() == 0 + assert features.nnz == 0 diff --git a/stwfsapy/tests/util/passthrough_transformer_test.py b/stwfsapy/tests/util/passthrough_transformer_test.py index b9ebcb4..d42e6d1 100644 --- a/stwfsapy/tests/util/passthrough_transformer_test.py +++ b/stwfsapy/tests/util/passthrough_transformer_test.py @@ -31,9 +31,9 @@ def test_array_input(): def test_sparse_input(): in_feat = [ - sp.lil_matrix(np.array([[1, 0, 0]])), - sp.lil_matrix(np.array([[0, 7, 0]])), - sp.lil_matrix(np.array([[0, 0, -3]])) + sp.lil_array(np.array([[1, 0, 0]])), + sp.lil_array(np.array([[0, 7, 0]])), + sp.lil_array(np.array([[0, 0, -3]])) ] pt = PassthroughTransformer() out_feat = pt.transform(in_feat) diff --git a/stwfsapy/thesaurus_features.py b/stwfsapy/thesaurus_features.py index 9bdfd08..124f4d8 100644 --- a/stwfsapy/thesaurus_features.py +++ b/stwfsapy/thesaurus_features.py @@ -15,7 +15,7 @@ from typing import Set, Iterable, Tuple, DefaultDict import rdflib -from scipy.sparse import csr_matrix, vstack +from scipy.sparse import csr_array, vstack from sklearn.base import BaseEstimator, TransformerMixin from sklearn.exceptions import NotFittedError from stwfsapy import thesaurus as t @@ -68,7 +68,7 @@ def fit(self, X=None, y=None, **kwargs): } self.feature_dim_ = max(len(thesaurus_indices), 1) self.mapping_ = { - str(concept): csr_matrix( + str(concept): csr_array( ( [1 for _ in thesaurii], ( @@ -91,13 +91,13 @@ def _transform_single(self, x): try: res = self.mapping_[x] except KeyError: - res = csr_matrix( + res = csr_array( ([], ([], [])), shape=(1, self.feature_dim_) ) return res - def transform(self, X) -> csr_matrix: + def transform(self, X) -> csr_array: if self.mapping_ is None: raise NotFittedError return vstack([self._transform_single(x) for x in X])