-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAnalysis.py
More file actions
171 lines (137 loc) · 7.28 KB
/
Analysis.py
File metadata and controls
171 lines (137 loc) · 7.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import copy
import math
import matplotlib.pyplot as plt
from Preprocess import stem, normalize, df_before_preprocess
from PositionalPosting import create_positional_postings_lists
POSITIONAL_POSTINGS_LIST_FILE_WITH_STOP_WORDS = "./positional_postings_lists_with_stop_words.json"
POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS = "./positional_postings_lists_without_stop_words.json"
def check_doc_content_with_query(query_vector, doc_id_result):
    """
    check document content (that search engine result ) with query
    and return related sentences
    :param query_vector:
    :param doc_id_result:
    :return: related_sentences_in_doc
    """
    raw_content = df_before_preprocess["content"][int(doc_id_result)]
    sentences = normalize(raw_content).split(".")
    matches = []
    for candidate in sentences:
        raw_words = candidate.split(" ")
        stemmed_words = stem(raw_words, "non positional")
        # A sentence qualifies as soon as any query term appears either in
        # its stemmed form or as a raw word of the sentence.
        if any(term in stemmed_words or term in raw_words for term in query_vector):
            matches.append(candidate)
    return matches
def get_word_freq_dict_from_postings_lists(postings_lists):
    """
    Map each term in the postings lists to its total corpus frequency.

    :param postings_lists: dict mapping word -> info dict containing a
        "frequency_in_all_documents" entry
    :return: dict mapping word -> frequency_in_all_documents, preserving
        the iteration order of ``postings_lists``
    """
    # Dict comprehension replaces the manual build-up loop (same result).
    return {word: info["frequency_in_all_documents"]
            for word, info in postings_lists.items()}
def remove_high_freq(sorted_word_freq_dict, limit):
    """
    Return a new dict with the first ``limit`` entries removed — i.e. drop
    the ``limit`` most frequent words, assuming the input is already sorted
    by descending frequency.

    The previous implementation deep-copied the whole dict and rebuilt the
    full key list on every deletion (O(limit * n)), and raised IndexError
    when ``limit`` exceeded the dict size; a single items() slice is O(n),
    leaves the input untouched, and returns {} for an oversized limit.

    :param sorted_word_freq_dict: dict ordered by descending frequency
    :param limit: number of leading (highest-frequency) entries to drop
    :return: new dict without the first ``limit`` entries
    """
    return dict(list(sorted_word_freq_dict.items())[limit:])
def _plot_zipf_curves(sorted_word_freq_dict, max_freq, title):
    """Plot the Zipf-law prediction and the observed log-log curve.

    :param sorted_word_freq_dict: word -> frequency, sorted descending
    :param max_freq: frequency of the rank-1 word (cf used by the prediction)
    :param title: plot title
    """
    log_ranks, log_predicted, log_actual = [], [], []
    # enumerate yields the rank directly; the old code re-searched the key
    # list with .index(word) on every iteration (O(n^2) over the vocabulary).
    for rank, freq in enumerate(sorted_word_freq_dict.values(), start=1):
        log_ranks.append(math.log(rank, 10))
        log_predicted.append(math.log(max_freq / rank, 10))
        log_actual.append(math.log(freq, 10))
    plt.plot(log_ranks, log_predicted)
    plt.plot(log_ranks, log_actual)
    plt.xlabel("Log 10 Rank")
    plt.ylabel("Log 10 cf")
    plt.title(title)
    plt.show()


def plot_zipf():
    """
    Draw Zipf's-law log-log plots for three variants of the index:
    with stop words, without stop words, and with the 30 most frequent
    words removed.  Prints the top frequency of each variant.
    """
    # NOTE(review): load_positional_postings_list is neither defined nor
    # imported in this file, so this function raises NameError when called
    # as-is — confirm which module it should be imported from (likely the
    # same place the JSON files are written).
    positional_postings_list_with_stop_words = load_positional_postings_list(
        POSITIONAL_POSTINGS_LIST_FILE_WITH_STOP_WORDS)
    positional_postings_list_without_stop_words = load_positional_postings_list(
        POSITIONAL_POSTINGS_LIST_FILE_WITHOUT_STOP_WORDS)
    word_freq_dict_with_stop_words = get_word_freq_dict_from_postings_lists(positional_postings_list_with_stop_words)
    word_freq_dict_without_stop_words = get_word_freq_dict_from_postings_lists(
        positional_postings_list_without_stop_words)
    # sorted descending
    sorted_descending_word_freq_dict_with_stop_words = dict(
        sorted(word_freq_dict_with_stop_words.items(), key=lambda item: item[1], reverse=True))
    sorted_descending_word_freq_dict_without_stop_words = dict(
        sorted(word_freq_dict_without_stop_words.items(), key=lambda item: item[1], reverse=True))
    sorted_descending_word_freq_dict_without_high_freq = remove_high_freq(
        sorted_descending_word_freq_dict_with_stop_words, 30)
    # First value of each descending-sorted dict is the maximum frequency.
    max_number_with_stop_words = list(sorted_descending_word_freq_dict_with_stop_words.values())[0]
    max_number_without_stop_words = list(sorted_descending_word_freq_dict_without_stop_words.values())[0]
    max_number_without_high_freq = list(sorted_descending_word_freq_dict_without_high_freq.values())[0]
    print("max_number_with_stop_words: ", max_number_with_stop_words)
    print("max_number_without_stop_words: ", max_number_without_stop_words)
    print("max_number_without_high_freq: ", max_number_without_high_freq)
    # The three plots differed only in data and title; one helper replaces
    # the three duplicated 12-line stanzas.
    _plot_zipf_curves(sorted_descending_word_freq_dict_with_stop_words,
                      max_number_with_stop_words, "With stop words")
    _plot_zipf_curves(sorted_descending_word_freq_dict_without_stop_words,
                      max_number_without_stop_words, "Without stop words")
    _plot_zipf_curves(sorted_descending_word_freq_dict_without_high_freq,
                      max_number_without_high_freq, "Without high freq words")
def calculate_tokens_and_words_number(data_frame):
    """
    Build the positional postings lists for ``data_frame`` and report
    vocabulary statistics.

    :param data_frame: preprocessed documents to index
    :return: (number_of_tokens, number_of_words) where number_of_tokens is
        the count of distinct terms and number_of_words is the sum of every
        term's frequency_in_all_documents
    """
    postings_lists = create_positional_postings_lists(data_frame)
    number_of_tokens = len(postings_lists)
    # Postings entries are dicts accessed by subscript elsewhere in this file
    # (see get_word_freq_dict_from_postings_lists); the original attribute
    # access `inf_token.frequency_in_all_documents` raises AttributeError.
    number_of_words = sum(inf_token["frequency_in_all_documents"]
                          for inf_token in postings_lists.values())
    return number_of_tokens, number_of_words
def plot_heaps():
    """
    Collect Heaps'-law statistics: distinct-term and total-word counts for
    growing document prefixes (500..2000) and for the full collection, with
    and without stemming.  Results are printed, not plotted.
    """
    # NOTE(review): get_data_frame_after_preprocess is neither defined nor
    # imported in this file, so this function raises NameError when called
    # as-is — confirm which module it should be imported from (presumably
    # Preprocess).
    df_after_preprocess_with_stemming = get_data_frame_after_preprocess(True, True)
    df_after_preprocess_without_stemming = get_data_frame_after_preprocess(True, False)
    result_with_stemming = {}
    result_without_stemming = {}
    for number_of_documents in [500, 1000, 1500, 2000]:
        number_of_tokens_with_stemming, number_of_words_with_stemming = calculate_tokens_and_words_number(
            df_after_preprocess_with_stemming.head(number_of_documents))
        result_with_stemming[number_of_documents] = [number_of_tokens_with_stemming, number_of_words_with_stemming]
        number_of_tokens_without_stemming, number_of_words_without_stemming = calculate_tokens_and_words_number(
            df_after_preprocess_without_stemming.head(number_of_documents))
        result_without_stemming[number_of_documents] = [number_of_tokens_without_stemming,
                                                        number_of_words_without_stemming]
    print("with stemming")
    print(result_with_stemming)
    print("without stemming")
    print(result_without_stemming)
    number_of_tokens_with_stemming_in_all_documents, number_of_words_with_stemming_in_all_documents = calculate_tokens_and_words_number(
        df_after_preprocess_with_stemming)
    number_of_tokens_without_stemming_in_all_documents, number_of_words_without_stemming_in_all_documents = calculate_tokens_and_words_number(
        df_after_preprocess_without_stemming)
    # Bug fix: the original f-strings swapped the labels, printing the
    # token count under "number of words" and vice versa.
    print("with stemming")
    print(
        f'all documents: number of tokens: {number_of_tokens_with_stemming_in_all_documents}, number of words: {number_of_words_with_stemming_in_all_documents}')
    print("without stemming")
    print(
        f'all documents: number of tokens: {number_of_tokens_without_stemming_in_all_documents}, number of words: {number_of_words_without_stemming_in_all_documents}')
# Script entry point: run the Heaps'-law statistics by default.
# Uncomment plot_zipf() to draw the Zipf's-law plots instead/as well.
if __name__ == "__main__":
    plot_heaps()
    # plot_zipf()