diff --git a/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb b/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb
index 56d6fcb..b692c8d 100644
--- a/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb
+++ b/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb
@@ -5,7 +5,7 @@
 "colab": {
 "private_outputs": true,
 "provenance": [],
- "authorship_tag": "ABX9TyO/HriD8rlWF7QYjff37k8Q",
+ "authorship_tag": "ABX9TyP2KqHzT9giuOQl/CAysGTo",
 "include_colab_link": true
 },
 "kernelspec": {
@@ -42,7 +42,7 @@
 " (\"I'm at a crossroads\", '0'), ('Our office is near the lab, but not next to it.', '1'), ('The new year is near.', '1'), ('The train is right on time.', '0'), ('The restaurant is on strike.', '0'), ('The bridge is just beyond that tollbooth.', '1'), ('I left the house at 6:00.', '0'), ('From Paris he flew to Berlin.', '1'), ('She abstained from casting a vote of no confidence.', '0'), (\" got beaten up by thugs on my way home.\", '0'),\n",
 " ('The restaurant is at the end of the street.', '0'), ('They were left out of the group.', '0'), ('They went out of the room.', '1'), (\"Jesus walked on water.\", '1'), ('She had peeped into the book.', '0'), ('Our favorite restaurant is just off the road.', '1'),('The troops marched toward the village.', '1'), ('They prevented her from leaving the meeting.', '0'), ('The girl broke into tears', '0'), (\"I'll go ahead and you can catch me up later.\", '0'),\n",
 " ('She fearlessly walks into the room.', '1'), ('I am going to the lab.', '1'), ('We gave the results to our advisor', '0'), ('We had to get through the literature review before we began our methodology.', '0'), ('This painting will look great over the fireplace.', '1'), ('The puppy crept under the chair.', '1'), ('He dissuaded her from reporting him to the director.', '0'), ('The boy is in tears', '0'), (\"I'm calling off the event\", '0'),\n",
- " ('We had to start from these findings in order to get our best results', '0'), ('He walked toward the door', '1'), ('We are working toward a better research method but have not achieved it yet.', '0'), ('My keys were under the dresser.', '1'), (' I met her in the United States', '1'), (' I met her in Chicago at the Lyric Opera.', '1'), ('I can see you before 3:00.', '0'), ('He cried in happiness', '0'),\n",
+ " ('We had to start from these findings in order to get our best results', '0'), ('He walked toward the door', '1'), ('We are working toward a better research method but have not achieved it yet.', '0'), ('My keys were under the dresser.', '1'), (' I met her in the United States', '1'), (' I met her in Chicago at the Lyric Opera.', '1'), ('I can see you before 3:00.', '0'), ('He cried in happiness', '0'), (\"She's far from home.\", '1'), (\"This is far from good.\", '0'),\n",
 " ('The dog sat on the pillow', '1'), ('We live between two neighbors.', '1'), (\"Let's keep it between you and me\", '0'), ('He sat on the chair', '1'), ('There is some milk in the fridge.', '1'), ('She was hiding under the table.', '1'), ('The stampeding cattle ran right off the cliff.', '1'), ('The ball fell off the table, onto the floor, and rolled under the bed.', '1'), (\"The roof gave in under the weight of the snow.\", '0'), (\"They gave up the search when it got dark.\", '0'),\n",
 " ('The cat jumped off the counter.', '1'), ('Barry drove over the bridge.', '1'), ('Matilde lost her ring at the beach.', '1'), ('The book belongs to Anthony.', '0'), ('They were sitting by the tree.', '0'), ('I walked out of the house.', '1'), ('The portrait of their mother hangs over the fireplace.', '1'), ('I heard it from her.', '0'), ('We learned a lot from Professor Kearns.', '0'), ('He checked into the hotel.', '0'),\n",
 " ('Once upon a time, there was a beautiful princess.', '0'), ('The baby climbed onto the table.', '1'), ('It is up to us to find the answer.', '0'), ('The loud noise came from within the stadium.', '1'), ('She never leaves without her phone.', '0'), ('The house lies just over that hill', '1'), ('When he awoke they were driving through the forest.', '1'), ('Toward morning, he fell asleep.', '0'), (\"Don't bring up politics if you want to have a quiet conversation with that guy.\", '0'),\n",
@@ -55,18 +55,11 @@
 "execution_count": null,
 "outputs": []
 },
- {
- "cell_type": "markdown",
- "source": [
- "# Naive Bayes Model from scratch"
- ],
- "metadata": {
- "id": "aXtnBBLAaK9F"
- }
- },
 {
 "cell_type": "code",
 "source": [
+ "#### Importing the libraries:\n",
+ "\n",
 "import itertools\n",
 "from collections import Counter\n",
 "import nltk\n",
@@ -77,31 +70,46 @@
 "nltk.download('punkt')\n",
 "from nltk.tokenize import word_tokenize\n",
 "from nltk.stem.snowball import SnowballStemmer\n",
- "stemmer = SnowballStemmer('english')\n",
- "\n",
- "####\n",
+ "stemmer = SnowballStemmer('english')\n"
+ ],
+ "metadata": {
+ "id": "xFMwCWDjx_Fz"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Naive Bayes Model from scratch"
+ ],
+ "metadata": {
+ "id": "aXtnBBLAaK9F"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#### List of stops\n",
 "\n",
 "stops = [\"no\", \"not\", \"and\", \"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\", \"do\", \"does\", \"did\", \"doing\", \"but\", \"if\", \"or\", \"because\", \"as\", \"until\", \"while\", \"with\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\", \"page\"]\n",
 "\n",
 "####\n",
 "\n",
- "def tokenizar(str_texto):\n",
+ "def tokenize(str_texto):\n",
 " return word_tokenize(str_texto)\n",
 "\n",
- "def limpar(lista):\n",
+ "def clean(lista):\n",
 " return [i.lower() for i in lista if i.isalpha()]\n",
 "\n",
- "def sem_stops(lista):\n",
+ "def remove_stops(lista):\n",
 " return [i for i in lista if i not in stops]\n",
 "\n",
- "def stemizar(lista):\n",
- " return [stemmer.stem(i) for i in lista]\n",
- "\n",
 "def achatar(lista):\n",
 " return list(itertools.chain(*lista))\n",
 "\n",
- "def pre_processar(str_texto):\n",
- " return sem_stops(limpar(tokenizar(str_texto)))\n"
+ "def preprocess(str_texto):\n",
+ " return remove_stops(clean(tokenize(str_texto)))\n"
 ],
 "metadata": {
 "id": "clcPkQpDMQOQ"
 },
@@ -123,7 +131,7 @@
 {
 "cell_type": "code",
 "source": [
- "corpus = [(pre_processar(i[0]), i[1]) for i in corpus]\n",
+ "corpus = [(preprocess(i[0]), i[1]) for i in corpus]\n",
 "corpus[:10]"
 ],
 "metadata": {
@@ -135,7 +143,7 @@
 {
 "cell_type": "code",
 "source": [
- "pre_processar(\"He’s standing against the wall\")"
+ "preprocess(\"He’s standing against the wall\")"
 ],
 "metadata": {
 "id": "fdS9YpFrfKm1"
 },
@@ -194,12 +202,12 @@
 "\n",
 "#### Condition: ####\n",
 "if sent_input2.lower() == \"y\":\n",
- " tokens_test = pre_processar(sent_input)\n",
+ " tokens_test = preprocess(sent_input)\n",
 " tokens_test = [i for i in tokens_test if i in vocab]\n",
 "\n",
 "if sent_input2.lower() == \"n\": \n",
 " sent_input = input('Reenter your sentence: ')\n",
- " tokens_test = pre_processar(sent_input)\n",
+ " tokens_test = preprocess(sent_input)\n",
 " tokens_test = [i for i in tokens_test if i in vocab]\n",
 "\n",
 "# Calculating probabilities with Laplace smoothing:\n",
@@ -274,8 +282,8 @@
 {
 "cell_type": "code",
 "source": [
- "vetor = CountVectorizer() \n",
- "x = vetor.fit_transform(sents).toarray() \n",
+ "vector = CountVectorizer() \n",
+ "x = vector.fit_transform(sents).toarray() \n",
 "x "
 ],
 "metadata": {
@@ -343,7 +351,7 @@
 "\n",
 "#### Condition with output: ####\n",
 "if sent_input4.lower() == \"y\":\n",
- " prediction = str(model.predict(vetor.transform([sent_input3])))\n",
+ " prediction = str(model.predict(vector.transform([sent_input3])))\n",
 " print(' ')\n",
 " print('-------')\n",
 " print(sent_input3)\n",
@@ -351,7 +359,7 @@
 "\n",
 "if sent_input4.lower() == \"n\": \n",
 " sent_input = input('Reenter your sentence: ')\n",
- " prediction = str(model.predict(vetor.transform([sent_input3])))\n",
+ " prediction = str(model.predict(vector.transform([sent_input3])))\n",
 " print(' ')\n",
 " print('-------')\n",
 " print(sent_input3)\n",
@@ -511,7 +519,7 @@
 {
 "cell_type": "code",
 "source": [
- "# Probabilitues (prior) of classes\n",
+ "# Probabilities (prior) of classes\n",
 "# Shows bias\n",
 "\n",
 "model.class_log_prior_"
@@ -565,7 +573,7 @@
 "source": [
 "# Alphabetically indexed attribute names\n",
 "\n",
- "vetor.get_feature_names_out()"
+ "vector.get_feature_names_out()"
 ],
 "metadata": {
 "id": "yj_PFsEYRtce"
 },
@@ -577,7 +585,7 @@
 "cell_type": "code",
 "source": [
 "dic_atributes = dict()\n",
- "for e, i in enumerate(vetor.get_feature_names_out()):\n",
+ "for e, i in enumerate(vector.get_feature_names_out()):\n",
 " dic_atributes[i] = (model.feature_log_prob_[0][e], model.feature_log_prob_[1][e])"
 ],
 "metadata": {
@@ -611,6 +619,65 @@
 },
 "execution_count": null,
 "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['SPATIAL'].nlargest(n=10)\n"
+ ],
+ "metadata": {
+ "id": "G00Au2wPpBwg"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['NON-SPATIAL'].nlargest(n=10)\n"
+ ],
+ "metadata": {
+ "id": "KWxKTVpupIrN"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Ratio SPATIAL/NON-SPATIAL'].nlargest(n=10)"
+ ],
+ "metadata": {
+ "id": "eMna5L5J-_jr"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "df.plot()\n",
+ "\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "id": "sWT48Ax-75GC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Ratio SPATIAL/NON-SPATIAL'].nlargest(n=10).plot(kind = 'hist')"
+ ],
+ "metadata": {
+ "id": "fIxC3aY-83eT"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
 ]
}
\ No newline at end of file