
Commit c0ad6c8

Merge pull request #486 from stephenhky/develop

Release 3.0.1

2 parents ba6da5c + 6b5f904

File tree

8 files changed (+216 lines, -19 lines)

README.md

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ You can talk to me in advance through e-mails or the [Issues](https://github.com
 
 ## News
 
+* 10/27/2025: `shorttext` 3.0.1 released.
 * 08/10/2025: `shorttext` 3.0.0 released.
 * 06/02/2025: `shorttext` 2.2.1 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/))
 * 05/29/2025: `shorttext` 2.2.0 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/))

docs/conf.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
 # The short X.Y version.
 version = u'3.0'
 # The full version, including alpha/beta/rc tags.
-release = u'3.0.0'
+release = u'3.0.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

docs/news.rst

Lines changed: 6 additions & 0 deletions
@@ -1,6 +1,7 @@
 News
 ====
 
+* 10/27/2025: `shorttext` 3.0.1 released.
 * 08/10/2025: `shorttext` 3.0.0 released.
 * 06/02/2025: `shorttext` 2.2.1 released.
 * 05/29/2025: `shorttext` 2.2.0 released.
@@ -86,6 +87,11 @@ News
 What's New
 ----------
 
+Release 3.0.1 (October 25, 2025)
+--------------------------------
+
+* Small bugs fixed.
+
 Release 3.0.0 (August 10, 2025)
 -------------------------------
 

docs/requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -3,10 +3,12 @@ scipy==1.16.2
 joblib==1.5.2
 scikit-learn==1.7.2
 tensorflow==2.20.0
-keras==3.11.3
+keras==3.12.0
 gensim==4.3.3
 pandas==2.3.3
 snowballstemmer==3.0.1
 transformers==4.57.1
 torch==2.9.0
 numba==0.62.1
+npdict==0.0.5
+nptyping==2.5.0

pyproject.toml

Lines changed: 4 additions & 2 deletions
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "shorttext"
-version = "3.0.0"
+version = "3.0.1"
 authors = [
     {name = "Kwan Yuet Stephen Ho", email = "[email protected]"}
 ]
@@ -40,7 +40,9 @@ dependencies = [
     "pandas>=1.2.0",
     "snowballstemmer>=3.0.0",
     "numba>=0.57.0",
-    "deprecation>=2.0.0"
+    "deprecation>=2.0.0",
+    "npdict>=0.0.5",
+    "nptyping>=2.0.0"
 ]
 
 [project.urls]

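The two new runtime dependencies, `npdict` and `nptyping`, back the npdict-based document-term matrix added to `src/shorttext/utils/dtm.py` below. The following is a minimal sketch of what they provide, based only on the calls that appear in this commit; the sample keys and the helper function are illustrative, not part of the package.

    import numpy as np
    import npdict
    from nptyping import NDArray, Shape, Int

    # npdict wraps a (sparse) NumPy array so it can be indexed by string keys,
    # here document IDs along one axis and tokens along the other.
    counts = npdict.SparseArrayWrappedDict(
        [["doc0", "doc1"], ["apple", "banana"]],
        default_initial_value=0.0
    )
    counts["doc0", "apple"] += 1

    # nptyping supplies shaped array annotations, e.g. a one-dimensional integer array.
    def count_documents_with_token(counts_per_doc: NDArray[Shape["*"], Int]) -> int:
        return int(np.sum(counts_per_doc > 0))
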
src/shorttext/utils/compactmodel_io.py

Lines changed: 4 additions & 2 deletions
@@ -169,7 +169,8 @@ def get_info(self):
 
 
 # decorator that adds compact model methods to classifier dynamically (deprecated)
-@deprecated
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `CompactIOMachine` instead")
 def CompactIOClassifier(Classifier, infodict, prefix, suffices):
     """ Returns a decorated class object with additional methods for compact model I/O.
 
@@ -208,7 +209,8 @@ def get_info(self):
 
 
 # decorator for use (deprecated)
-@deprecated
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `CompactIOMachine` instead")
 def compactio(infodict, prefix, suffices):
     """ Returns a decorator that performs the decoration by :func:`CompactIOClassifier`.
 

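The change above replaces the bare `@deprecated` marker with the parameterized decorator from the `deprecation` package, which records the version in which a callable was deprecated and the version in which it will be removed. Below is a minimal sketch of how such a decorated callable behaves, assuming only that the `deprecation` package is installed; `legacy_loader` is a hypothetical function, not part of `shorttext`.

    import warnings
    from deprecation import deprecated

    @deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
                details="Use `CompactIOMachine` instead")
    def legacy_loader(prefix):
        """Hypothetical old-style loader kept only for backward compatibility."""
        return prefix

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy_loader("mymodel")

    # The call above emits a deprecation warning whose message carries the
    # deprecated_in / removed_in metadata supplied to the decorator.
    print(caught[0].message)
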
src/shorttext/utils/dtm.py

Lines changed: 142 additions & 7 deletions
@@ -1,17 +1,150 @@
 
+import pickle
+from typing import Optional, Any
+from types import FunctionType
+
 import numpy as np
+import npdict
 from gensim.corpora import Dictionary
 from gensim.models import TfidfModel
+from npdict import SparseArrayWrappedDict
 from scipy.sparse import dok_matrix
-
-import pickle
+from deprecation import deprecated
+from nptyping import NDArray, Shape, Int
 
 from .compactmodel_io import CompactIOMachine
 from .classification_exceptions import NotImplementedException
+from .textpreprocessing import advanced_text_tokenizer_1
 
 
 dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl']
+npdtm_suffices = []
+
+
+def generate_npdict_document_term_matrix(
+        corpus: list[str],
+        doc_ids: list[Any],
+        tokenize_func: FunctionType
+) -> npdict.NumpyNDArrayWrappedDict:
+    # grabbing tokens from each document in the corpus
+    doc_tokens = [tokenize_func(document) for document in corpus]
+    tokens_set = set([
+        token
+        for document in doc_tokens
+        for token in document
+    ])
+    npdtm = npdict.SparseArrayWrappedDict(
+        [doc_ids, sorted(list(tokens_set))],
+        default_initial_value=0.0
+    )
+    for doc_id, document in zip(doc_ids, doc_tokens):
+        for token in document:
+            npdtm[doc_id, token] += 1
+    return npdtm
+
+
+def compute_document_frequency(
+        npdtm: npdict.NumpyNDArrayWrappedDict
+) -> NDArray[Shape["*"], Int]:
+    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+        return np.sum(npdtm.to_coo() > 0, axis=0).todense()
+    else:
+        return np.sum(npdtm.to_numpy() > 0, axis=0)
+
+
+def compute_tfidf_document_term_matrix(
+        npdtm: npdict.NumpyNDArrayWrappedDict,
+        sparse: bool=True
+) -> npdict.NumpyNDArrayWrappedDict:
+    doc_frequencies = compute_document_frequency(npdtm)
+    nbdocs = npdtm.dimension_sizes[0]
+    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+        new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies)
+        return npdict.SparseArrayWrappedDict.generate_dict(new_dtm_sparray, dense=not sparse)
+    else:
+        new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies)
+        new_npdtm = npdict.NumpyNDArrayWrappedDict.generate_dict(new_dtm_nparray)
+        if sparse:
+            new_sparse_dtm = npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict(
+                new_npdtm, default_initial_value=0.0
+            )
+            return new_sparse_dtm
+        else:
+            return new_npdtm
+
+
+class NumpyDocumentTermMatrix(CompactIOMachine):
+    def __init__(
+            self,
+            corpus: Optional[list[str]]=None,
+            docids: Optional[list[Any]]=None,
+            tfidf: bool=False,
+            tokenize_func: Optional[FunctionType]=None
+    ):
+        CompactIOMachine.__init__(self, {'classifier': 'npdtm'}, 'dtm', dtm_suffices)
+        self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1
+
+        # generate DTM
+        if corpus is not None:
+            self.generate_dtm(corpus, docids=docids, tfidf=tfidf)
+
+    def generate_dtm(
+            self,
+            corpus: list[str],
+            docids: Optional[list[Any]]=None,
+            tfidf: bool=False
+    ):
+        # wrangling document IDs
+        if docids is None:
+            doc_ids = [f"doc{i}" for i in range(len(corpus))]
+        else:
+            if len(docids) == len(corpus):
+                doc_ids = docids
+            elif len(docids) > len(corpus):
+                doc_ids = docids[:len(corpus)]
+            else:
+                doc_ids = docids + [f"doc{i}" for i in range(len(docids), len(corpus))]
+
+        self.npdtm = generate_npdict_document_term_matrix(corpus, doc_ids, self.tokenize_func)
+
+        if tfidf:
+            self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True)
+
+    def get_termfreq(self, docid: str, token: str) -> float:
+        return self.npdtm[docid, token]
+
+    def get_total_termfreq(self, token: str) -> float:
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, SparseArrayWrappedDict):
+            matrix = self.npdtm.to_coo()
+        else:
+            matrix = self.npdtm.to_numpy()
+        return np.sum(matrix[:, token_index])
+
+    def get_doc_frequency(self, token) -> int:
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
+            freq_array = self.npdtm.to_coo()[:, token_index]
+            return np.sum(freq_array > 0, axis=0).todense()
+        else:
+            freq_array = self.npdtm.to_numpy()[:, token_index]
+            return np.sum(freq_array > 0, axis=0)
+
+    def get_token_occurences(self, token: str) -> dict[str, float]:
+        return {
+            docid: self.npdtm[docid, token]
+            for docid in self.npdtm._lists_keystrings[0]
+        }
+
+    def get_doc_tokens(self, docid: str) -> dict[str, float]:
+        return {
+            token: self.npdtm[docid, token]
+            for token in self.npdtm._lists_keystrings[1]
+        }
+
 
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `NumpyDocumentTermMatrix` instead")
 class DocumentTermMatrix(CompactIOMachine):
     """ Document-term matrix for corpus.
 
@@ -38,9 +171,9 @@ def __init__(self, corpus, docids=None, tfidf=False):
         :type tfidf: bool
         """
         CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices)
-        if docids == None:
+        if docids is None:
             self.docid_dict = {i: i for i in range(len(corpus))}
-            self.docids = range(len(corpus))
+            self.docids = [i for i in range(len(corpus))]
         else:
             if len(docids) == len(corpus):
                 self.docid_dict = {docid: i for i, docid in enumerate(docids)}
@@ -50,8 +183,8 @@ def __init__(self, corpus, docids=None, tfidf=False):
                 self.docids = docids[:len(corpus)]
             else:
                 self.docid_dict = {docid: i for i, docid in enumerate(docids)}
-                self.docid_dict = {i: i for i in range(len(docids), range(corpus))}
-                self.docids = docids + range(len(docids), range(corpus))
+                self.docid_dict = {i: i for i in range(len(docids), len(corpus))}
+                self.docids = docids + [i for i in range(len(docids), len(corpus))]
         # generate DTM
         self.generate_dtm(corpus, tfidf=tfidf)
 
@@ -66,7 +199,7 @@ def generate_dtm(self, corpus, tfidf=False):
         :type tfidf: bool
         """
         self.dictionary = Dictionary(corpus)
-        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float_)
+        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float64)
        bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
        if tfidf:
            weighted_model = TfidfModel(bow_corpus)
@@ -183,6 +316,8 @@ def loadmodel(self, prefix):
         self.dtm = pickle.load(open(prefix+'_dtm.pkl', 'rb'))
 
 
+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `npdict` instead")
 def load_DocumentTermMatrix(filename, compact=True):
     """ Load presaved Document-Term Matrix (DTM).
 

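Below is a brief usage sketch of the npdict-backed document-term matrix introduced in this file. It assumes `shorttext` 3.0.1 is installed; the corpus, document IDs, and the whitespace tokenizer are illustrative, and the tokenizer is passed in explicitly rather than relying on the default.

    from shorttext.utils.dtm import (
        NumpyDocumentTermMatrix,
        generate_npdict_document_term_matrix,
        compute_tfidf_document_term_matrix
    )

    def whitespace_tokenize(text):
        return text.split()

    corpus = ["the quick brown fox", "the lazy dog", "the fox chases the dog"]
    docids = ["doc1", "doc2", "doc3"]

    # Class-based interface: builds the sparse document-term matrix internally.
    dtm = NumpyDocumentTermMatrix(corpus, docids=docids, tokenize_func=whitespace_tokenize)
    print(dtm.get_termfreq("doc3", "the"))    # raw count of "the" in doc3
    print(dtm.get_total_termfreq("fox"))      # total count of "fox" over the corpus
    print(dtm.get_doc_frequency("dog"))       # number of documents containing "dog"

    # Function-based interface: the raw count matrix, optionally reweighted with tf-idf.
    npdtm = generate_npdict_document_term_matrix(corpus, docids, whitespace_tokenize)
    tfidf_dtm = compute_tfidf_document_term_matrix(npdtm, sparse=True)
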
src/shorttext/utils/textpreprocessing.py

Lines changed: 55 additions & 6 deletions
@@ -2,9 +2,13 @@
 import re
 import os
 import codecs
+from io import TextIOWrapper
+from types import FunctionType
+from functools import partial
 
 import snowballstemmer
 
+
 # tokenizer
 def tokenize(s: str) -> list[str]:
     return s.split(' ')
@@ -25,7 +29,7 @@ def stemword(s: str) -> str:
     return StemmerSingleton()(s)
 
 
-def preprocess_text(text, pipeline):
+def preprocess_text(text: str, pipeline: list[FunctionType]) -> str:
     """ Preprocess the text according to the given pipeline.
 
     Given the pipeline, which is a list of functions that process an
@@ -42,7 +46,32 @@ def preprocess_text(text, pipeline):
     return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:])
 
 
-def text_preprocessor(pipeline):
+def tokenize_text(
+        text: str,
+        presplit_pipeline: list[FunctionType],
+        primitize_tokenizer: FunctionType,
+        prosplit_pipeline: list[FunctionType],
+        stopwordsfile: TextIOWrapper
+) -> list[str]:
+    # load stop words file
+    stopwordset = set([stopword.strip() for stopword in stopwordsfile])
+
+    # done
+    presplit_text = text
+    for func in presplit_pipeline:
+        presplit_text = func(presplit_text)
+    postsplit_tokens = primitize_tokenizer(presplit_text)
+    for func in prosplit_pipeline:
+        for i, token in enumerate(postsplit_tokens):
+            postsplit_tokens[i] = func(token)
+    postsplit_tokens = [
+        token for token in postsplit_tokens
+        if token not in stopwordset
+    ]
+    return postsplit_tokens
+
+
+def text_preprocessor(pipeline: list[FunctionType]) -> FunctionType:
     """ Return the function that preprocesses text according to the pipeline.
 
     Given the pipeline, which is a list of functions that process an
@@ -55,10 +84,10 @@ def text_preprocessor(pipeline):
     :type pipeline: list
     :rtype: function
     """
-    return lambda text: preprocess_text(text, pipeline)
+    return partial(preprocess_text, pipeline=pipeline)
 
 
-def oldschool_standard_text_preprocessor(stopwordsfile):
+def oldschool_standard_text_preprocessor(stopwordsfile: TextIOWrapper) -> FunctionType:
     """ Return a commonly used text preprocessor.
 
     Return a text preprocessor that is commonly used, with the following steps:
@@ -90,7 +119,7 @@ def oldschool_standard_text_preprocessor(stopwordsfile):
     return text_preprocessor(pipeline)
 
 
-def standard_text_preprocessor_1():
+def standard_text_preprocessor_1() -> FunctionType:
     """ Return a commonly used text preprocessor.
 
     Return a text preprocessor that is commonly used, with the following steps:
@@ -113,7 +142,7 @@ def standard_text_preprocessor_1():
     return oldschool_standard_text_preprocessor(stopwordsfile)
 
 
-def standard_text_preprocessor_2():
+def standard_text_preprocessor_2() -> FunctionType:
     """ Return a commonly used text preprocessor.
 
     Return a text preprocessor that is commonly used, with the following steps:
@@ -134,3 +163,23 @@ def standard_text_preprocessor_2():
     stopwordsfile = codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
 
     return oldschool_standard_text_preprocessor(stopwordsfile)
+
+
+def advanced_text_tokenizer_1() -> FunctionType:
+    presplit_pipeline = [
+        lambda s: re.sub('[^\w\s]', '', s),
+        lambda s: re.sub('[\d]', '', s),
+        lambda s: s.lower()
+    ]
+    tokenizer = tokenize
+    postsplit_pipeline = [
+        lambda s: ' '.join([stemword(stemmed_token) for stemmed_token in tokenize(s)])
+    ]
+    this_dir, _ = os.path.split(__file__)
+    return partial(
+        tokenize_text,
+        presplit_pipeline=presplit_pipeline,
+        tokenizer=tokenizer,
+        postsplit_pipeline=postsplit_pipeline,
+        stopwordsfile=codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
+    )

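The typed `text_preprocessor` above still composes a pipeline of string-to-string functions, now returned via `functools.partial` instead of a lambda. A minimal usage sketch follows, assuming `shorttext` 3.0.1 is installed; the two-step pipeline below is illustrative, not the package's default.

    import re
    from shorttext.utils.textpreprocessing import text_preprocessor, standard_text_preprocessor_1

    # Each pipeline step maps str -> str and is applied left to right by preprocess_text.
    pipeline = [
        lambda s: s.lower(),
        lambda s: re.sub(r'[^\w\s]', '', s)
    ]
    clean = text_preprocessor(pipeline)
    print(clean("Hello, World!"))   # -> "hello world"

    # The stock preprocessor shipped with the package is built the same way.
    preprocess = standard_text_preprocessor_1()
    print(preprocess("Vector spaces are spanned by basis vectors."))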