-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexer.py
174 lines (153 loc) · 6.52 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import utils
# DO NOT MODIFY CLASS NAME
class Indexer:
    """Build and query an in-memory inverted index over documents.

    State kept on the instance:
      * ``inverted_idx`` - term -> accumulated frequency over all documents.
      * ``postingDict``  - term -> list of ``(tweet_id, frequency)`` tuples.
      * ``docs_dict``    - tweet_id -> ``(max_tf, unique_terms, tweet_date[0],
        term_dict)``.
      * ``pop_dict``     - values accumulated from each document's ``retweet``
        mapping (presumably popularity counts; verify against the parser).

    Terms are case-folded on insertion: once a lower-case form of a term has
    been seen, its upper-case form is merged into the lower-case entry.
    """

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def __init__(self, config):
        """Create an empty index; ``config`` is stored but not interpreted here."""
        self.doc_counter = 0
        self.inverted_idx = {}
        self.postingDict = {}
        self.docs_dict = {}
        self.config = config
        self.pop_dict = {}
        # Pruning starts once the vocabulary grows past ``thresh_hold`` and
        # continues until it is no larger than ``stop_cleaning_value``.
        self.thresh_hold = 125000
        self.stop_cleaning_value = 100000
        # Terms whose accumulated frequency is below this cut-off are pruned;
        # the cut-off doubles whenever one pruning pass is not enough.
        self.exponent_grown_word_size_to_remove = 4

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def add_new_doc(self, document):
        """
        This function performs the indexing process for a document object.
        Saved information is captured via two dictionaries ('inverted index' and 'posting').
        :param document: a document that needs to be indexed. Its
            ``term_doc_dictionary`` is updated in place with ``entity_dict``.
        :return: -
        """
        document_dictionary = document.term_doc_dictionary
        # Entities are indexed like ordinary terms: fold them into the term map.
        self.simple_dics_combine(document_dictionary, document.entity_dict)
        max_tf = self.update_inverted(document_dictionary, document.tweet_id)
        self.save_information_on_doc(
            len(document_dictionary),
            max_tf,
            document.tweet_id,
            document.tweet_date,
            document.retweet,
            document_dictionary,
        )
        self.doc_counter += 1
        self.thresh_hold_handler()

    def thresh_hold_handler(self):
        """Prune low-frequency terms once the vocabulary exceeds ``thresh_hold``.

        Pruning passes repeat until the vocabulary is no larger than
        ``stop_cleaning_value``. The cut-off is doubled only when another pass
        is actually required, so it stays an ``int`` and ends at the value used
        by the last pass.
        """
        if len(self.inverted_idx) <= self.thresh_hold:
            return
        while self.stop_cleaning_value < len(self.inverted_idx):
            self.clean_inverted_to_thresh_hold(self.get_list_of_terms_to_remove())
            if self.stop_cleaning_value < len(self.inverted_idx):
                # Still too large: widen the cut-off for the next pass.
                self.exponent_grown_word_size_to_remove *= 2

    def clean_inverted_to_thresh_hold(self, remove_list):
        """Drop the terms in ``remove_list`` from both dictionaries.

        Stops early as soon as the vocabulary size is down to
        ``stop_cleaning_value``. Terms that are already absent are ignored.
        """
        for term in remove_list:
            self.inverted_idx.pop(term, None)
            self.postingDict.pop(term, None)
            if len(self.inverted_idx) <= self.stop_cleaning_value:
                break

    def get_list_of_terms_to_remove(self):
        """Return the terms whose accumulated frequency is below the current cut-off."""
        cutoff = self.exponent_grown_word_size_to_remove
        return [term for term, freq in self.inverted_idx.items() if freq < cutoff]

    def save_information_on_doc(self, unique, max_tf, tweet_id, tweet_date, rt_list, term_dict):
        """Record per-document data and merge ``rt_list`` into ``pop_dict``.

        :param unique: number of distinct terms in the document.
        :param max_tf: highest single-term frequency in the document.
        :param tweet_id: key under which the document is stored.
        :param tweet_date: indexable value; only ``tweet_date[0]`` is kept.
        :param rt_list: mapping whose values are accumulated into ``pop_dict``.
        :param term_dict: the document's term -> frequency mapping (stored by reference).
        """
        self.simple_dics_combine(self.pop_dict, rt_list)
        self.docs_dict[tweet_id] = (max_tf, unique, tweet_date[0], term_dict)

    def simple_dics_combine(self, source, to_add):
        """Merge ``to_add`` into ``source`` in place, adding values of shared keys."""
        for key, amount in to_add.items():
            if key in source:
                source[key] += amount
            else:
                source[key] = amount

    def calculate_posting_terms(self):
        """Return the total number of postings across all terms."""
        return sum(len(postings) for postings in self.postingDict.values())

    def posting_small_letter_handler(self, key, value, tweet_id):
        """Add a posting for a lower-case term not yet in the index.

        If the upper-case form already has postings they are moved under the
        lower-case key, placed after the new posting.
        """
        upper_form = key.upper()
        if upper_form in self.postingDict:
            self.postingDict[key] = [(tweet_id, value)] + self.postingDict.pop(upper_form)
        else:
            self.adding_to_posting(key, value, tweet_id)

    def posting_big_letters_handler(self, key, value, tweet_id):
        """Add a posting for an upper-case term, preferring an existing lower-case entry."""
        lower_form = key.lower()
        target = lower_form if lower_form in self.postingDict else key
        self.adding_to_posting(target, value, tweet_id)

    def adding_to_posting(self, key, value, tweet_id):
        """Append ``(tweet_id, value)`` to the posting list of ``key``, creating it if needed."""
        self.postingDict.setdefault(key, []).append((tweet_id, value))

    def update_inverted(self, dict_to_combine, tweet_id):
        """Fold one document's term frequencies into the index.

        :param dict_to_combine: term -> frequency mapping of the document.
        :param tweet_id: identifier stored in each posting.
        :return: the largest frequency found in ``dict_to_combine`` (0 if empty).
        """
        max_val = 0
        for key, value in dict_to_combine.items():
            if value > max_val:
                max_val = value
            # Slice instead of indexing so an empty term does not raise
            # IndexError; it simply falls through to the case-neutral branch.
            first_char = key[:1]
            if key in self.inverted_idx:
                self.inverted_idx[key] += value
                self.adding_to_posting(key, value, tweet_id)
            elif first_char.isupper():
                self.inverted_big_letters_handler(key, value)
                self.posting_big_letters_handler(key, value, tweet_id)
            elif first_char.islower():
                self.inverted_small_letter_handler(key, value)
                self.posting_small_letter_handler(key, value, tweet_id)
            else:
                self.inverted_idx[key] = value
                self.adding_to_posting(key, value, tweet_id)
        return max_val

    def inverted_small_letter_handler(self, key, value):
        """Insert a new lower-case term, absorbing the count of its upper-case form if present."""
        upper_form = key.upper()
        if upper_form in self.inverted_idx:
            self.inverted_idx[key] = value + self.inverted_idx.pop(upper_form)
        else:
            self.inverted_idx[key] = value

    def inverted_big_letters_handler(self, key, value):
        """Count an upper-case term under its lower-case form when that form already exists."""
        lower_form = key.lower()
        if lower_form in self.inverted_idx:
            self.inverted_idx[lower_form] += value
        else:
            self.inverted_idx[key] = value

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        Returns the loaded sequence (inverted index, postings, docs, popularity).
        """
        # NOTE(review): the file is described as pickled; if utils.load_obj
        # unpickles it, only load files from a trusted source - confirm.
        saved_files = utils.load_obj(fn)
        self.inverted_idx = saved_files[0]
        self.postingDict = saved_files[1]
        self.docs_dict = saved_files[2]
        self.pop_dict = saved_files[3]
        return saved_files

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def save_index(self, fn):
        """
        Saves a pre-computed index (or indices) so we can save our work.
        Input:
              fn - file name of pickled index.
        """
        # Element order must match what load_index reads back.
        utils.save_obj((self.inverted_idx, self.postingDict, self.docs_dict, self.pop_dict), fn)

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _is_term_exist(self, term):
        """
        Checks if a term exists in the inverted index.
        """
        return term in self.inverted_idx

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def get_term_posting_list(self, term):
        """
        Return the posting list from the index for a term, or None if absent.
        """
        return self.postingDict.get(term)