+import pickle
+from typing import Optional, Any
+from types import FunctionType
+
 import numpy as np
+import npdict
 from gensim.corpora import Dictionary
 from gensim.models import TfidfModel
+from npdict import SparseArrayWrappedDict
 from scipy.sparse import dok_matrix
-
-import pickle
+from deprecation import deprecated
+from nptyping import NDArray, Shape, Int

 from .compactmodel_io import CompactIOMachine
 from .classification_exceptions import NotImplementedException
+from .textpreprocessing import advanced_text_tokenizer_1


 dtm_suffices = ['_docids.pkl', '_dictionary.dict', '_dtm.pkl']
+npdtm_suffices = []
+
+
+def generate_npdict_document_term_matrix(
+        corpus: list[str],
+        doc_ids: list[Any],
+        tokenize_func: FunctionType
+) -> npdict.NumpyNDArrayWrappedDict:
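+    # build a (documents x vocabulary) term-count matrix backed by an npdict sparse wrapper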
+    # grabbing tokens from each document in the corpus
+    doc_tokens = [tokenize_func(document) for document in corpus]
+    tokens_set = set([
+        token
+        for document in doc_tokens
+        for token in document
+    ])
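+    # initialize the sparse matrix with document IDs as rows and the sorted vocabulary as columns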
+    npdtm = npdict.SparseArrayWrappedDict(
+        [doc_ids, sorted(list(tokens_set))],
+        default_initial_value=0.0
+    )
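+    # accumulate raw term frequencies for each (document, token) pair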
+    for doc_id, document in zip(doc_ids, doc_tokens):
+        for token in document:
+            npdtm[doc_id, token] += 1
+    return npdtm
+
+
+def compute_document_frequency(
+        npdtm: npdict.NumpyNDArrayWrappedDict
+) -> NDArray[Shape["*"], Int]:
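+    # document frequency: for each token, the number of documents in which it appears at least once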
+    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+        return np.sum(npdtm.to_coo() > 0, axis=0).todense()
+    else:
+        return np.sum(npdtm.to_numpy() > 0, axis=0)
+
+
+def compute_tfidf_document_term_matrix(
+        npdtm: npdict.NumpyNDArrayWrappedDict,
+        sparse: bool = True
+) -> npdict.NumpyNDArrayWrappedDict:
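+    # re-weight each term count by the inverse document frequency log(N / df)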
+    doc_frequencies = compute_document_frequency(npdtm)
+    nbdocs = npdtm.dimension_sizes[0]
+    if isinstance(npdtm, npdict.SparseArrayWrappedDict):
+        new_dtm_sparray = npdtm.to_coo() * np.log(nbdocs / doc_frequencies)
+        return npdict.SparseArrayWrappedDict.generate_dict(new_dtm_sparray, dense=not sparse)
+    else:
+        new_dtm_nparray = npdtm.to_numpy() * np.log(nbdocs / doc_frequencies)
+        new_npdtm = npdict.NumpyNDArrayWrappedDict.generate_dict(new_dtm_nparray)
+        if sparse:
+            new_sparse_dtm = npdict.SparseArrayWrappedDict.from_NumpyNDArrayWrappedDict(
+                new_npdtm, default_initial_value=0.0
+            )
+            return new_sparse_dtm
+        else:
+            return new_npdtm
+
+
+class NumpyDocumentTermMatrix(CompactIOMachine):
+    def __init__(
+            self,
+            corpus: Optional[list[str]] = None,
+            docids: Optional[list[Any]] = None,
+            tfidf: bool = False,
+            tokenize_func: Optional[FunctionType] = None
+    ):
+        CompactIOMachine.__init__(self, {'classifier': 'npdtm'}, 'dtm', dtm_suffices)
+        self.tokenize_func = tokenize_func if tokenize_func is not None else advanced_text_tokenizer_1
+
+        # generate DTM
+        if corpus is not None:
+            self.generate_dtm(corpus, docids=docids, tfidf=tfidf)
+
+    def generate_dtm(
+            self,
+            corpus: list[str],
+            docids: Optional[list[Any]] = None,
+            tfidf: bool = False
+    ):
+        # wrangling document IDs
+        if docids is None:
+            doc_ids = [f"doc{i}" for i in range(len(corpus))]
+        else:
+            if len(docids) == len(corpus):
+                doc_ids = docids
+            elif len(docids) > len(corpus):
+                doc_ids = docids[:len(corpus)]
+            else:
+                doc_ids = docids + [f"doc{i}" for i in range(len(docids), len(corpus))]
+
+        self.npdtm = generate_npdict_document_term_matrix(corpus, doc_ids, self.tokenize_func)
+
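+        # optionally re-weight the raw counts with TF-IDF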
+        if tfidf:
+            self.npdtm = compute_tfidf_document_term_matrix(self.npdtm, sparse=True)
+
+    def get_termfreq(self, docid: str, token: str) -> float:
+        return self.npdtm[docid, token]
+
+    def get_total_termfreq(self, token: str) -> float:
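+        # look up the token's column index, then sum its counts over all documents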
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, SparseArrayWrappedDict):
+            matrix = self.npdtm.to_coo()
+        else:
+            matrix = self.npdtm.to_numpy()
+        return np.sum(matrix[:, token_index])
+
+    def get_doc_frequency(self, token: str) -> int:
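+        # count the documents in which the token occurs at least once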
+        token_index = self.npdtm._keystrings_to_indices[1][token]
+        if isinstance(self.npdtm, npdict.SparseArrayWrappedDict):
+            freq_array = self.npdtm.to_coo()[:, token_index]
+            return np.sum(freq_array > 0, axis=0).todense()
+        else:
+            freq_array = self.npdtm.to_numpy()[:, token_index]
+            return np.sum(freq_array > 0, axis=0)
+
+    def get_token_occurences(self, token: str) -> dict[str, float]:
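+        # per-document counts of the given token, keyed by document ID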
+        return {
+            docid: self.npdtm[docid, token]
+            for docid in self.npdtm._lists_keystrings[0]
+        }
+
+    def get_doc_tokens(self, docid: str) -> dict[str, float]:
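+        # all token counts for the given document, keyed by token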
+        return {
+            token: self.npdtm[docid, token]
+            for token in self.npdtm._lists_keystrings[1]
+        }
+

+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `NumpyDocumentTermMatrix` instead")
 class DocumentTermMatrix(CompactIOMachine):
     """ Document-term matrix for corpus.

@@ -38,9 +171,9 @@ def __init__(self, corpus, docids=None, tfidf=False):
         :type tfidf: bool
         """
         CompactIOMachine.__init__(self, {'classifier': 'dtm'}, 'dtm', dtm_suffices)
-        if docids == None:
+        if docids is None:
             self.docid_dict = {i: i for i in range(len(corpus))}
-            self.docids = range(len(corpus))
+            self.docids = [i for i in range(len(corpus))]
         else:
             if len(docids) == len(corpus):
                 self.docid_dict = {docid: i for i, docid in enumerate(docids)}
@@ -50,8 +183,8 @@ def __init__(self, corpus, docids=None, tfidf=False):
                 self.docids = docids[:len(corpus)]
             else:
                 self.docid_dict = {docid: i for i, docid in enumerate(docids)}
-                self.docid_dict = {i: i for i in range(len(docids), range(corpus))}
-                self.docids = docids + range(len(docids), range(corpus))
+                self.docid_dict = {i: i for i in range(len(docids), len(corpus))}
+                self.docids = docids + [i for i in range(len(docids), len(corpus))]
         # generate DTM
         self.generate_dtm(corpus, tfidf=tfidf)

@@ -66,7 +199,7 @@ def generate_dtm(self, corpus, tfidf=False):
         :type tfidf: bool
         """
         self.dictionary = Dictionary(corpus)
-        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float_)
+        self.dtm = dok_matrix((len(corpus), len(self.dictionary)), dtype=np.float64)
         bow_corpus = [self.dictionary.doc2bow(doctokens) for doctokens in corpus]
         if tfidf:
             weighted_model = TfidfModel(bow_corpus)
@@ -183,6 +316,8 @@ def loadmodel(self, prefix):
         self.dtm = pickle.load(open(prefix + '_dtm.pkl', 'rb'))


+@deprecated(deprecated_in="3.0.1", removed_in="4.0.0",
+            details="Use `npdict` instead")
 def load_DocumentTermMatrix(filename, compact=True):
     """ Load presaved Document-Term Matrix (DTM).

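A minimal usage sketch of the new class, assuming this module is importable as shorttext.utils.dtm (the import path, corpus, and document IDs below are illustrative, not part of this diff):

    from shorttext.utils.dtm import NumpyDocumentTermMatrix

    corpus = ["the quick brown fox", "the lazy dog"]
    # build a sparse document-term matrix with TF-IDF weighting
    dtm = NumpyDocumentTermMatrix(corpus, docids=["d0", "d1"], tfidf=True)
    print(dtm.get_doc_tokens("d0"))       # token -> weight for document "d0"
    print(dtm.get_total_termfreq("the"))  # summed weight of "the" across documents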