+import pickle
+from typing import Optional, Any
+from types import FunctionType
+
 import numpy as np
+import npdict
 from gensim.corpora import Dictionary
 from gensim.models import TfidfModel
+from npdict import SparseArrayWrappedDict
 from scipy.sparse import dok_matrix
-
-import pickle
+from deprecation import deprecated
+from nptyping import NDArray, Shape, Int

 from .compactmodel_io import CompactIOMachine
 from .classification_exceptions import NotImplementedException
+from .textpreprocessing import advanced_text_tokenizer_1


 dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl']
+npdtm_suffices = []
+
+
+def generate_npdict_document_term_matrix(
+        corpus: list[str],
+        doc_ids: list[Any],
+        tokenize_func: FunctionType
+) -> npdict.NumpyNDArrayWrappedDict:
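+    # build a (documents x vocabulary) term-count matrix backed by an npdict sparse wrapper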
+    # grabbing tokens from each document in the corpus
+    doc_tokens = [tokenize_func(document) for document in corpus]
+    tokens_set = set([
+        token
+        for document in doc_tokens
+        for token in document
+    ])
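+    # initialize the sparse matrix with document IDs as rows and the sorted vocabulary as columns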
+    npdtm = npdict.SparseArrayWrappedDict(
+        [doc_ids, sorted(list(tokens_set))],
+        default_initial_value=0.0
+    )
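+    # accumulate raw term frequencies for each (document, token) pair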
+    for doc_id, document in zip(doc_ids, doc_tokens):
+        for token in document:
+            npdtm[doc_id, token] += 1
+    return npdtm
+
+
+def compute_document_frequency(
+        npdtm: npdict.NumpyNDArrayWrappedDict
+) -> NDArray[Shape["*"], Int]:
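+    # document frequency: for each token, the number of documents in which it appears at least once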
+    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+        return np.sum(npdtm.to_coo() > 0, axis=0).todense()
+    else:
+        return np.sum(npdtm.to_numpy() > 0, axis=0)
+
+
+def compute_tfidf_document_term_matrix(
+        npdtm: npdict.NumpyNDArrayWrappedDict,
+        sparse: bool = True
+) -> npdict.NumpyNDArrayWrappedDict:
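+    # re-weight each term count by the inverse document frequency log(N / df)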
+    doc_frequencies = compute_document_frequency(npdtm)
+    nbdocs = npdtm.dimension_sizes[0]
+    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+        new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies)
+        return npdict.SparseArrayWrappedDict.generate_dict(new_dtm_sparray, dense=not sparse)
+    else:
+        new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies)
+        new_npdtm = npdict.NumpyNDArrayWrappedDict.generate_dict(new_dtm_nparray)
+        if sparse:
+            new_sparse_dtm = npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict(
+                new_npdtm, default_initial_value=0.0
+            )
+            return new_sparse_dtm
+        else:
+            return new_npdtm
+
+
+class NumpyDocumentTermMatrix(CompactIOMachine):
+    def __init__(
+            self,
+            corpus: Optional[list[str]] = None,
+            docids: Optional[list[Any]] = None,
+            tfidf: bool = False,
+            tokenize_func: Optional[FunctionType] = None
+    ):
+        CompactIOMachine.__init__(self, {'classifier': 'npdtm'}, 'dtm', dtm_suffices)
+        self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1
+
+        # generate DTM
+        if corpus is not None:
+            self.generate_dtm(corpus, docids=docids, tfidf=tfidf)
+
+    def generate_dtm(
+            self,
+            corpus: list[str],
+            docids: Optional[list[Any]] = None,
+            tfidf: bool = False
+    ):
+        # wrangling document IDs
+        if docids is None:
+            doc_ids = [f"doc{i}" for i in range(len(corpus))]
+        else:
+            if len(docids) == len(corpus):
+                doc_ids = docids
+            elif len(docids) > len(corpus):
+                doc_ids = docids[:len(corpus)]
+            else:
+                doc_ids = docids + [f"doc{i}" for i in range(len(docids), len(corpus))]
+
+        self.npdtm = generate_npdict_document_term_matrix(corpus, doc_ids, self.tokenize_func)
+
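+        # optionally re-weight the raw counts with TF-IDF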
+        if tfidf:
+            self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True)
+
+    def get_termfreq(self, docid: str, token: str) -> float:
+        return self.npdtm[docid, token]
+
+    def get_total_termfreq(self, token: str) -> float:
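+        # look up the token's column index, then sum its counts over all documents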
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, SparseArrayWrappedDict):
+            matrix = self.npdtm.to_coo()
+        else:
+            matrix = self.npdtm.to_numpy()
+        return np.sum(matrix[:, token_index])
+
+    def get_doc_frequency(self, token: str) -> int:
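+        # count the documents in which the token occurs at least once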
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
+            freq_array = self.npdtm.to_coo()[:, token_index]
+            return np.sum(freq_array > 0, axis=0).todense()
+        else:
+            freq_array = self.npdtm.to_numpy()[:, token_index]
+            return np.sum(freq_array > 0, axis=0)
+
+    def get_token_occurences(self, token: str) -> dict[str, float]:
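+        # per-document counts of the given token, keyed by document ID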
+        return {
+            docid: self.npdtm[docid, token]
+            for docid in self.npdtm._lists_keystrings[0]
+        }
+
+    def get_doc_tokens(self, docid: str) -> dict[str, float]:
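+        # all token counts for the given document, keyed by token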
+        return {
+            token: self.npdtm[docid, token]
+            for token in self.npdtm._lists_keystrings[1]
+        }
+

+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `NumpyDocumentTermMatrix` instead")
 class DocumentTermMatrix(CompactIOMachine):
     """ Document-term matrix for corpus.

@@ -38,9 +171,9 @@ def __init__(self, corpus, docids=None, tfidf=False):
         :type tfidf: bool
         """
         CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices)
-        if docids == None:
+        if docids is None:
             self.docid_dict = {i: i for i in range(len(corpus))}
-            self.docids = range(len(corpus))
+            self.docids = [i for i in range(len(corpus))]
         else:
             if len(docids) == len(corpus):
                 self.docid_dict = {docid: i for i, docid in enumerate(docids)}
@@ -50,8 +183,8 @@ def __init__(self, corpus, docids=None, tfidf=False):
                 self.docids = docids[:len(corpus)]
             else:
                 self.docid_dict = {docid: i for i, docid in enumerate(docids)}
-                self.docid_dict = {i: i for i in range(len(docids), range(corpus))}
-                self.docids = docids + range(len(docids), range(corpus))
+                self.docid_dict = {i: i for i in range(len(docids), len(corpus))}
+                self.docids = docids + [i for i in range(len(docids), len(corpus))]
         # generate DTM
         self.generate_dtm(corpus, tfidf=tfidf)

@@ -66,7 +199,7 @@ def generate_dtm(self, corpus, tfidf=False):
         :type tfidf: bool
         """
         self.dictionary = Dictionary(corpus)
-        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float_)
+        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float64)
         bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
         if tfidf:
             weighted_model = TfidfModel(bow_corpus)
@@ -183,6 +316,8 @@ def loadmodel(self, prefix):
         self.dtm = pickle.load(open(prefix + '_dtm.pkl', 'rb'))


+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `npdict` instead")
 def load_DocumentTermMatrix(filename, compact=True):
     """ Load presaved Document-Term Matrix (DTM).

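A minimal usage sketch of the new class, assuming this module is importable as shorttext.utils.dtm (the import path, corpus, and document IDs below are illustrative, not part of this diff):

    from shorttext.utils.dtm import NumpyDocumentTermMatrix

    corpus = ["the quick brown fox", "the lazy dog"]
    # build a sparse document-term matrix with TF-IDF weighting
    dtm = NumpyDocumentTermMatrix(corpus, docids=["d0", "d1"], tfidf=True)
    print(dtm.get_doc_tokens("d0"))       # token -> weight for document "d0"
    print(dtm.get_total_termfreq("the"))  # summed weight of "the" across documents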