# tfidf.py
import numpy as np
from collections import Counter

class TFIDF:
    """
    Computes TF-IDF weights for a set of documents. Includes a matching function
    for scoring sentence pairs against a document.
    """
    def __init__(self, doc_names):
        # Store the document names and their count
        print "Initializing TFIDF for %d documents" % len(doc_names)
        self.doc_names = doc_names
        self.ndocs = len(doc_names)

    def get_filtered_vocabulary(self, all_words, word_thresh=1):
        """
        Build the list of words used in the documents and filter it by word frequency.
        Inputs:
            all_words: a list of word lists, one per document, e.g. [['d1w1', 'd1w2'], ['d2w1', 'd2w2']]
            word_thresh: minimum number of occurrences (pooled over all documents) for a word to enter the vocabulary
        Outputs:
            vocab: a sorted list of words, stored as self.vocab
        """
        assert(len(all_words) == self.ndocs)
        # Count every word occurrence across all documents
        allwordslist = [w for doc in all_words for w in doc]
        vocab = Counter(allwordslist)
        # Keep only words that occur at least word_thresh times
        filt_vocab = {k: vocab[k] for k in vocab.keys() if vocab[k] >= word_thresh}
        # filt_vocab = vocab
        vocab = sorted(filt_vocab.keys())
        print "\nVocabulary length: " + str(len(vocab))
        self.vocab = vocab
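
    # For example (hypothetical input, not from the original file): with
    #   all_words = [['cat', 'cat', 'dog'], ['cat', 'fish']] and word_thresh=2,
    # only 'cat' survives (3 occurrences pooled over both documents); 'dog' and
    # 'fish' each occur once and are dropped from the vocabulary.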

    def compute_tfidf(self, all_words):
        """
        Compute the TF-IDF weighting for the documents.
        Uses: tf(t, d) = 1 + log10(frequency(t, d))
              idf(t) = log10(#documents / #documents containing t)
        Requires self.vocab to be set (see get_filtered_vocabulary).
        Inputs:
            all_words: a list of word lists, one per document (same format as get_filtered_vocabulary)
        Outputs:
            tf: dictionary of raw term-frequency vectors, one entry per vocabulary word
            idf: inverse document frequency ratio, before taking the log
            tfidf: matrix of (1 + log10(tf)) * log10(idf), one row per vocabulary word, one column per document
        """
        assert(len(all_words) == self.ndocs)
        # Get term frequencies
        print "Computing TF vectors"
        tf = {w: np.zeros(self.ndocs) for w in self.vocab}
        for d, doc in enumerate(all_words):
            # accumulate all word counts
            for w in doc:
                if w in tf:
                    tf[w][d] += 1.0
        self.tf = tf
        # Get inverse document frequencies
        print "Computing IDF"
        idf = {w: 0 for w in self.vocab}
        for w in self.vocab:
            # no. of documents / no. of documents which contain this word
            idf[w] = self.ndocs * 1.0 / sum(tf[w] != 0)
        self.idf = idf
        # Compute TF-IDF
        print "Computing TF-IDF"
        tfidf = np.zeros((len(self.vocab), self.ndocs))
        for w, key in enumerate(self.vocab):
            logtf = np.ma.log10(tf[key])  # masked log10 so zero counts do not produce -inf
            tfidf[w] = (1 + logtf).filled(0.)  # words absent from a document get weight 0
            tfidf[w] *= np.log10(idf[key])
        self.tfidf = tfidf
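
    # Worked example (hypothetical toy corpus, not part of the original file),
    # tracing the formulas above:
    #   all_words = [['cat', 'cat', 'dog'], ['cat', 'fish']], word_thresh=1
    #   vocab = ['cat', 'dog', 'fish']
    #   tf: cat -> [2, 1], dog -> [1, 0], fish -> [0, 1]
    #   idf (before log): cat -> 2/2 = 1.0, dog -> 2/1 = 2.0, fish -> 2/1 = 2.0
    #   tfidf: cat -> [0, 0] (a word in every document gets log10(1) = 0),
    #          dog -> [(1 + log10(1)) * log10(2), 0] ~= [0.30, 0],
    #          fish -> [0, 0.30]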

    def match_tfidf_score(self, docid, sentence1, sentence2):
        """
        Compute the matching score between 2 sentences for document docid.
        IMPORTANT: sentence1 and sentence2 are lists of words!
        """
        matching_words = set(sentence1).intersection(sentence2)
        score = 0.
        for w in matching_words:
            if w in self.vocab:
                score += self.tfidf[self.vocab.index(w)][docid]
        return score
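

# A minimal usage sketch (not part of the original file); the document names and
# tokenised word lists are hypothetical, included only to illustrate the intended
# call order: get_filtered_vocabulary -> compute_tfidf -> match_tfidf_score.
if __name__ == "__main__":
    doc_names = ['doc0', 'doc1']
    all_words = [['cat', 'cat', 'dog'], ['cat', 'fish']]
    model = TFIDF(doc_names)
    model.get_filtered_vocabulary(all_words, word_thresh=1)
    model.compute_tfidf(all_words)
    # Score two tokenised sentences against document 0; only words shared by both
    # sentences and present in the vocabulary contribute to the score.
    print model.match_tfidf_score(0, ['cat', 'dog'], ['dog', 'fish'])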