-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconcordance_tfidf.js
More file actions
139 lines (116 loc) · 3.27 KB
/
concordance_tfidf.js
File metadata and controls
139 lines (116 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// task
// Reetesh Kumar
// http://reet.herokuapp.com
// https://github.com/krreet/node_concordance
// Some utility functions
module.exports = {
TFIDF: class {
constructor(stopWords) {
this.dict = {};
this.keys = [];
this.totalwords = 0;
this.stopWords = stopWords;
}
tokenize(text) {
// Split into array of tokens
return text.split(/\W+/);
}
// A function to validate a toke
validate(token) {
return /\w{2,}/.test(token) && (!(/\d+/.test(token)) && !(this.stopWords.indexOf(token) > -1));
}
// Count the words
termFreq(data) {
var tokens = this.tokenize(data);
// For every token
for (var i = 0; i < tokens.length; i++) {
// Lowercase everything to ignore case
var token = tokens[i].toLowerCase();
if (this.validate(token)) {
this.increment(token);
this.totalwords++;
}
}
}
// Get the document frequencies across all documents
docFreq(data) {
//console.log(data);
var tokens = this.tokenize(data);
// A temporary dictionary of words in this document
var tempDict = {};
// For every token
for (var i = 0; i < tokens.length; i++) {
// Lowercase everything to ignore case
var token = tokens[i].toLowerCase();
// Simpler we just need to see if it exists or not
if (this.validate(token) && tempDict[token] === undefined) {
tempDict[token] = true;
}
}
for (var i = 0; i < this.keys.length; i++) {
var key = this.keys[i];
// Does this word exist in this document?
if (tempDict[key]) {
this.dict[key].docCount++;
}
}
}
// Get all the keys
getKeys() {
return this.keys;
}
// Get the count for one word
getCount(word) {
return this.dict[word].count;
}
// Get the score for one word
getScore(word) {
return this.dict[word].tfidf;
}
// Increment the count for one word
increment(word) {
// Is this a new word?
if (this.dict[word] == undefined) {
this.dict[word] = {};
this.dict[word].count = 1;
this.dict[word].docCount = 0;
this.dict[word].word = word;
this.keys.push(word);
// Otherwise just increment its count
} else {
this.dict[word].count++;
}
}
// Finish and calculate everything
finish(totaldocs) {
// console.log(totaldocs);
// calculate tf-idf score
for (var i = 0; i < this.keys.length; i++) {
var key = this.keys[i];
var word = this.dict[key];
var tf = word.count / this.totalwords;
// See:
var idf = Math.log(totaldocs / word.docCount);
word.tfidf = tf * idf;
}
}
// Sort by word counts
sortByCount() {
// A fancy way to sort each element
// Compare the counts
var tfidf = this;
this.keys.sort(function (a, b) {
return (tfidf.getCount(b) - tfidf.getCount(a));
});
}
// Sort by TFIDF score
sortByScore() {
// A fancy way to sort each element
// Compare the counts
var tfidf = this;
this.keys.sort(function (a, b) {
return (tfidf.getScore(b) - tfidf.getScore(a));
});
}
}
};