Merged (32 commits, all by stephenhky)

a81045b  initiate development of 3.0.1 (Aug 11, 2025)
5c560cd  Merge pull request #449 from stephenhky/pyup-update-keras-3.11.1-to-3… (Aug 13, 2025)
dc8665e  Merge pull request #451 from stephenhky/pyup-update-tensorflow-2.19.0… (Aug 14, 2025)
9e07319  Merge pull request #453 from stephenhky/pyup-update-transformers-4.55… (Aug 19, 2025)
055a4a2  Merge pull request #455 from stephenhky/pyup-update-pandas-2.3.1-to-2… (Aug 21, 2025)
8219c93  introducing npdict (Aug 22, 2025)
104663c  Merge pull request #457 from stephenhky/pyup-update-keras-3.11.2-to-3… (Aug 22, 2025)
db2a9d1  Merge pull request #459 from stephenhky/pyup-update-transformers-4.55… (Aug 22, 2025)
bea76e7  updated deprecated decorators (Aug 25, 2025)
7941acb  more preprocessor (Aug 26, 2025)
3b0857a  start implementing npdict (Aug 26, 2025)
ac14d3e  new npdtm implementation (Aug 26, 2025)
0190f05  updated the version requirement for npdict (Aug 27, 2025)
9e5f961  implement tfidf matrix (Aug 27, 2025)
6774beb  Merge pull request #461 from stephenhky/pyup-update-joblib-1.5.1-to-1… (Aug 27, 2025)
5bc87ac  more methods implemented for NumpyDocumentTermMatrix (Aug 28, 2025)
aa6de6c  Merge pull request #463 from stephenhky/pyup-update-transformers-4.55… (Sep 4, 2025)
b5dd2dd  Merge pull request #465 from stephenhky/pyup-update-scikit-learn-1.7.… (Sep 9, 2025)
3c3dcb7  Merge pull request #467 from stephenhky/pyup-update-numpy-2.3.2-to-2.3.3 (Sep 10, 2025)
fcf0c08  Merge pull request #469 from stephenhky/pyup-update-scipy-1.16.1-to-1… (Sep 12, 2025)
befca20  Merge branch 'develop' into pyup-update-numba-0.61.2-to-0.62.0 (Sep 18, 2025)
c5f4233  Merge pull request #471 from stephenhky/pyup-update-numba-0.61.2-to-0… (Sep 18, 2025)
9fc69c5  Merge pull request #473 from stephenhky/pyup-update-transformers-4.56… (Sep 19, 2025)
2c29bca  Merge branch 'develop' into pyup-update-numba-0.62.0-to-0.62.1 (Sep 29, 2025)
eed5b6d  Merge pull request #475 from stephenhky/pyup-update-numba-0.62.0-to-0… (Sep 29, 2025)
18dfce3  Merge pull request #477 from stephenhky/pyup-update-pandas-2.3.2-to-2… (Sep 30, 2025)
2de8e10  Merge pull request #479 from stephenhky/pyup-update-transformers-4.56… (Oct 14, 2025)
7dc0310  Merge pull request #482 from stephenhky/pyup-update-torch-2.8.0-to-2.9.0 (Oct 15, 2025)
820afdd  Merge pull request #483 from stephenhky/pyup-update-numpy-2.3.3-to-2.3.4 (Oct 15, 2025)
a8cb306  use np.float64 instead of np.float_ (Oct 27, 2025)
e5e6c7e  updated requirements.txt (Oct 27, 2025)
6b5f904  Release 3.0.1 (Oct 28, 2025)
1 change: 1 addition & 0 deletions README.md
@@ -80,6 +80,7 @@ You can talk to me in advance through e-mails or the [Issues](https://github.com

## News

* 10/27/2025: `shorttext` 3.0.1 released.
* 08/10/2025: `shorttext` 3.0.0 released.
* 06/02/2025: `shorttext` 2.2.1 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/))
* 05/29/2025: `shorttext` 2.2.0 released. (Acknowledgement: [Minseo Kim](https://kmingseo.github.io/))
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
# The short X.Y version.
version = u'3.0'
# The full version, including alpha/beta/rc tags.
release = u'3.0.0'
release = u'3.0.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
6 changes: 6 additions & 0 deletions docs/news.rst
@@ -1,6 +1,7 @@
News
====

* 10/27/2025: `shorttext` 3.0.1 released.
* 08/10/2025: `shorttext` 3.0.0 released.
* 06/02/2025: `shorttext` 2.2.1 released.
* 05/29/2025: `shorttext` 2.2.0 released.
@@ -86,6 +87,11 @@ News
What's New
----------

Release 3.0.1 (October 27, 2025)
--------------------------------

* Introduced `NumpyDocumentTermMatrix`, a document-term matrix backed by `npdict`.
* Deprecated `DocumentTermMatrix` and the `CompactIOClassifier`/`compactio` decorators.
* Replaced `np.float_` with `np.float64` for NumPy 2.x compatibility.
* Small bugs fixed.

Release 3.0.0 (August 10, 2025)
-------------------------------

6 changes: 4 additions & 2 deletions docs/requirements.txt
@@ -3,10 +3,12 @@ scipy==1.16.2
joblib==1.5.2
scikit-learn==1.7.2
tensorflow==2.20.0
keras==3.11.3
keras==3.12.0
gensim==4.3.3
pandas==2.3.3
snowballstemmer==3.0.1
transformers==4.57.1
torch==2.8.0
torch==2.9.0
numba==0.62.1
npdict==0.0.5
nptyping==2.5.0
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "shorttext"
version = "3.0.0"
version = "3.0.1"
authors = [
{name = "Kwan Yuet Stephen Ho", email = "[email protected]"}
]
@@ -40,7 +40,9 @@ dependencies = [
"pandas>=1.2.0",
"snowballstemmer>=3.0.0",
"numba>=0.57.0",
"deprecation>=2.0.0"
"deprecation>=2.0.0",
"npdict>=0.0.5",
"nptyping>=2.0.0"
]

[project.urls]
6 changes: 4 additions & 2 deletions src/shorttext/utils/compactmodel_io.py
@@ -169,7 +169,8 @@ def get_info(self):


# decorator that adds compact model methods to classifier dynamically (deprecated)
@deprecated
@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
details="Use `CompactIOMachine` instead")
def CompactIOClassifier(Classifier, infodict, prefix, suffices):
""" Returns a decorated class object with additional methods for compact model I/O.

@@ -208,7 +209,8 @@ def get_info(self):


# decorator for use (deprecated)
@deprecated
@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
details="Use `CompactIOMachine` instead")
def compactio(infodict, prefix, suffices):
""" Returns a decorator that performs the decoration by :func:`CompactIOClassifier`.

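The parameterized form used above comes from the `deprecation` package: it attaches version metadata and a migration hint to the warning raised when the wrapped callable is invoked. A minimal sketch of the pattern (the function name here is hypothetical):

from deprecation import deprecated

@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
            details="Use `CompactIOMachine` instead")
def legacy_compact_io():
    ...

legacy_compact_io()  # emits a DeprecationWarning subclass carrying the version info and details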
149 changes: 142 additions & 7 deletions src/shorttext/utils/dtm.py
@@ -1,17 +1,150 @@

import pickle
from typing import Optional, Any
from types import FunctionType

import numpy as np
import npdict
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from npdict import SparseArrayWrappedDict
from scipy.sparse import dok_matrix

import pickle
from deprecation import deprecated
from nptyping import NDArray, Shape, Int

from .compactmodel_io import CompactIOMachine
from .classification_exceptions import NotImplementedException
from .textpreprocessing import advanced_text_tokenizer_1


dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl']
npdtm_suffices = []


def generate_npdict_document_term_matrix(
        corpus: list[str],
        doc_ids: list[Any],
        tokenize_func: FunctionType
) -> npdict.NumpyNDArrayWrappedDict:
    # grabbing tokens from each document in the corpus
    doc_tokens = [tokenize_func(document) for document in corpus]
    tokens_set = set(
        token
        for document in doc_tokens
        for token in document
    )
    npdtm = npdict.SparseArrayWrappedDict(
        [doc_ids, sorted(tokens_set)],
        default_initial_value=0.0
    )
    for doc_id, document in zip(doc_ids, doc_tokens):
        for token in document:
            npdtm[doc_id, token] += 1
    return npdtm
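
# Illustrative note (not part of the module): for corpus ['a b a', 'b c'] with
# doc_ids ['d1', 'd2'] and whitespace tokenization, the wrapped matrix gives
# npdtm['d1', 'a'] == 2, npdtm['d1', 'b'] == 1, and npdtm['d2', 'c'] == 1.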


def compute_document_frequency(
        npdtm: npdict.NumpyNDArrayWrappedDict
) -> NDArray[Shape["*"], Int]:
    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
        return np.sum(npdtm.to_coo() > 0, axis=0).todense()
    else:
        return np.sum(npdtm.to_numpy() > 0, axis=0)


def compute_tfidf_document_term_matrix(
        npdtm: npdict.NumpyNDArrayWrappedDict,
        sparse: bool=True
) -> npdict.NumpyNDArrayWrappedDict:
    doc_frequencies = compute_document_frequency(npdtm)
    nbdocs = npdtm.dimension_sizes[0]
    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
        new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies)
        return npdict.SparseArrayWrappedDict.generate_dict(new_dtm_sparray, dense=not sparse)
    else:
        new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies)
        new_npdtm = npdict.NumpyNDArrayWrappedDict.generate_dict(new_dtm_nparray)
        if sparse:
            new_sparse_dtm = npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict(
                new_npdtm, default_initial_value=0.0
            )
            return new_sparse_dtm
        else:
            return new_npdtm
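
# Illustrative note (not part of the module): the weighting above is
# w[d, t] = tf[d, t] * ln(N / df[t]), where N is the number of documents and
# df[t] is the number of documents containing token t. For example, with
# N = 4 documents and a token occurring in 2 of them, a raw count of 3 is
# weighted to 3 * ln(4 / 2) ≈ 2.08.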


class NumpyDocumentTermMatrix(CompactIOMachine):
    def __init__(
            self,
            corpus: Optional[list[str]]=None,
            docids: Optional[list[Any]]=None,
            tfidf: bool=False,
            tokenize_func: Optional[FunctionType]=None
    ):
        CompactIOMachine.__init__(self, {'classifier': 'npdtm'}, 'dtm', dtm_suffices)
        # advanced_text_tokenizer_1 is a factory, so it must be called to obtain the tokenizer
        self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1()

        # generate DTM
        if corpus is not None:
            self.generate_dtm(corpus, docids=docids, tfidf=tfidf)

    def generate_dtm(
            self,
            corpus: list[str],
            docids: Optional[list[Any]]=None,
            tfidf: bool=False
    ):
        # wrangling document IDs
        if docids is None:
            doc_ids = [f"doc{i}" for i in range(len(corpus))]
        else:
            if len(docids) == len(corpus):
                doc_ids = docids
            elif len(docids) > len(corpus):
                doc_ids = docids[:len(corpus)]
            else:
                doc_ids = docids + [f"doc{i}" for i in range(len(docids), len(corpus))]

        self.npdtm = generate_npdict_document_term_matrix(corpus, doc_ids, self.tokenize_func)

        if tfidf:
            self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True)

    def get_termfreq(self, docid: str, token: str) -> float:
        return self.npdtm[docid, token]

    def get_total_termfreq(self, token: str) -> float:
        token_index = self.npdtm._keystrings_to_indices[1][token]
        if isinstance(self.npdtm, SparseArrayWrappedDict):
            matrix = self.npdtm.to_coo()
        else:
            matrix = self.npdtm.to_numpy()
        return np.sum(matrix[:, token_index])

    def get_doc_frequency(self, token: str) -> int:
        token_index = self.npdtm._keystrings_to_indices[1][token]
        if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
            freq_array = self.npdtm.to_coo()[:, token_index]
            return np.sum(freq_array > 0, axis=0).todense()
        else:
            freq_array = self.npdtm.to_numpy()[:, token_index]
            return np.sum(freq_array > 0, axis=0)

    def get_token_occurences(self, token: str) -> dict[str, float]:
        return {
            docid: self.npdtm[docid, token]
            for docid in self.npdtm._lists_keystrings[0]
        }

    def get_doc_tokens(self, docid: str) -> dict[str, float]:
        return {
            token: self.npdtm[docid, token]
            for token in self.npdtm._lists_keystrings[1]
        }


@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
details="Use `NumpyDocumentTermMatrix` instead")
class DocumentTermMatrix(CompactIOMachine):
""" Document-term matrix for corpus.

@@ -38,9 +171,9 @@ def __init__(self, corpus, docids=None, tfidf=False):
:type tfidf: bool
"""
CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices)
if docids == None:
if docids is None:
self.docid_dict = {i: i for i in range(len(corpus))}
self.docids = range(len(corpus))
self.docids = [i for i in range(len(corpus))]
else:
if len(docids) == len(corpus):
self.docid_dict = {docid: i for i, docid in enumerate(docids)}
@@ -50,8 +183,8 @@ def __init__(self, corpus, docids=None, tfidf=False):
self.docids = docids[:len(corpus)]
else:
self.docid_dict = {docid: i for i, docid in enumerate(docids)}
self.docid_dict = {i: i for i in range(len(docids), range(corpus))}
self.docids = docids + range(len(docids), range(corpus))
self.docid_dict = {i: i for i in range(len(docids), len(corpus))}
self.docids = docids + [i for i in range(len(docids), len(corpus))]
# generate DTM
self.generate_dtm(corpus, tfidf=tfidf)

@@ -66,7 +199,7 @@ def generate_dtm(self, corpus, tfidf=False):
:type tfidf: bool
"""
self.dictionary = Dictionary(corpus)
self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float_)
self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float64)
bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
if tfidf:
weighted_model = TfidfModel(bow_corpus)
@@ -183,6 +316,8 @@ def loadmodel(self, prefix):
self.dtm = pickle.load(open(prefix+'_dtm.pkl', 'rb'))


@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
details="Use `npdict` instead")
def load_DocumentTermMatrix(filename, compact=True):
""" Load presaved Document-Term Matrix (DTM).

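A minimal usage sketch of the new class, assuming the constructor and accessors defined in this diff (the corpus and document IDs are made up):

from shorttext.utils.dtm import NumpyDocumentTermMatrix

corpus = ['the cat sat', 'the dog sat', 'cats and dogs']
npdtm = NumpyDocumentTermMatrix(corpus=corpus, docids=['d1', 'd2', 'd3'])

npdtm.get_termfreq('d1', 'cat')   # raw count of 'cat' in document 'd1'
npdtm.get_doc_frequency('sat')    # number of documents containing 'sat'
npdtm.get_doc_tokens('d2')        # {token: count} mapping for document 'd2'

Token keys reflect the default preprocessing (punctuation and digit removal, lower-casing, stemming, stop-word removal), so the stored tokens may differ from the surface words.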
61 changes: 55 additions & 6 deletions src/shorttext/utils/textpreprocessing.py
@@ -2,9 +2,13 @@
import re
import os
import codecs
from io import TextIOWrapper
from types import FunctionType
from functools import partial

import snowballstemmer


# tokenizer
def tokenize(s: str) -> list[str]:
return s.split(' ')
@@ -25,7 +29,7 @@ def stemword(s: str) -> str:
return StemmerSingleton()(s)


def preprocess_text(text, pipeline):
def preprocess_text(text: str, pipeline: list[FunctionType]) -> str:
""" Preprocess the text according to the given pipeline.

Given the pipeline, which is a list of functions that process an
@@ -42,7 +46,32 @@ def preprocess_text(text, pipeline):
return text if len(pipeline)==0 else preprocess_text(pipeline[0](text), pipeline[1:])
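
# Illustrative note (not part of the module): the recursion applies the
# pipeline left to right; preprocess_text('  Hello  ', [str.strip, str.lower])
# strips and then lower-cases, returning 'hello'.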


def text_preprocessor(pipeline):
def tokenize_text(
        text: str,
        presplit_pipeline: list[FunctionType],
        tokenizer: FunctionType,
        postsplit_pipeline: list[FunctionType],
        stopwordsfile: TextIOWrapper
) -> list[str]:
    # load stop words file (rewind first so repeated calls see the full list)
    stopwordsfile.seek(0)
    stopwordset = set(stopword.strip() for stopword in stopwordsfile)

    # apply the pre-split pipeline, split into tokens, apply the post-split
    # pipeline to each token, then drop stop words
    presplit_text = text
    for func in presplit_pipeline:
        presplit_text = func(presplit_text)
    postsplit_tokens = tokenizer(presplit_text)
    for func in postsplit_pipeline:
        for i, token in enumerate(postsplit_tokens):
            postsplit_tokens[i] = func(token)
    postsplit_tokens = [
        token for token in postsplit_tokens
        if token not in stopwordset
    ]
    return postsplit_tokens


def text_preprocessor(pipeline: list[FunctionType]) -> FunctionType:
""" Return the function that preprocesses text according to the pipeline.

Given the pipeline, which is a list of functions that process an
@@ -55,10 +84,10 @@ def oldschool_standard_text_preprocessor(stopwordsfile):
:type pipeline: list
:rtype: function
"""
return lambda text: preprocess_text(text, pipeline)
return partial(preprocess_text, pipeline=pipeline)


def oldschool_standard_text_preprocessor(stopwordsfile):
def oldschool_standard_text_preprocessor(stopwordsfile: TextIOWrapper) -> FunctionType:
""" Return a commonly used text preprocessor.

Return a text preprocessor that is commonly used, with the following steps:
@@ -90,7 +119,7 @@ def oldschool_standard_text_preprocessor(stopwordsfile):
return text_preprocessor(pipeline)


def standard_text_preprocessor_1():
def standard_text_preprocessor_1() -> FunctionType:
""" Return a commonly used text preprocessor.

Return a text preprocessor that is commonly used, with the following steps:
@@ -113,7 +142,7 @@ def standard_text_preprocessor_2():
return oldschool_standard_text_preprocessor(stopwordsfile)


def standard_text_preprocessor_2():
def standard_text_preprocessor_2() -> FunctionType:
""" Return a commonly used text preprocessor.

Return a text preprocessor that is commonly used, with the following steps:
@@ -134,3 +163,23 @@
stopwordsfile = codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')

return oldschool_standard_text_preprocessor(stopwordsfile)


def advanced_text_tokenizer_1() -> FunctionType:
    presplit_pipeline = [
        lambda s: re.sub(r'[^\w\s]', '', s),
        lambda s: re.sub(r'[\d]', '', s),
        lambda s: s.lower()
    ]
    tokenizer = tokenize
    postsplit_pipeline = [
        lambda s: ' '.join([stemword(stemmed_token) for stemmed_token in tokenize(s)])
    ]
    this_dir, _ = os.path.split(__file__)
    return partial(
        tokenize_text,
        presplit_pipeline=presplit_pipeline,
        tokenizer=tokenizer,
        postsplit_pipeline=postsplit_pipeline,
        stopwordsfile=codecs.open(os.path.join(this_dir, 'nonneg_stopwords.txt'), 'r', 'utf-8')
    )
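
For comparison, a short sketch of the two preprocessing styles defined in this module; the example outputs are illustrative and depend on the stemmer and the bundled stop-word list:

from shorttext.utils.textpreprocessing import (
    standard_text_preprocessor_2,
    advanced_text_tokenizer_1
)

preprocess = standard_text_preprocessor_2()  # returns a str -> str function
preprocess('The Cats are running!')          # e.g. 'cat run'

tokenizer = advanced_text_tokenizer_1()      # returns a str -> list[str] function
tokenizer('The Cats are running!')           # e.g. ['cat', 'run']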