diff --git a/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb b/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb new file mode 100644 index 0000000..56d6fcb --- /dev/null +++ b/A_Naive_Bayes_Classifier_of_Spatial_Prepositions.ipynb @@ -0,0 +1,616 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "private_outputs": true, + "provenance": [], + "authorship_tag": "ABX9TyO/HriD8rlWF7QYjff37k8Q", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "corpus = [('We finished the research at 5 pm.', '0'), ('We finished the research on June 5.', '0'), ('This big gift is from Tisha’s parents.', '0'), ('Bring your own bags into the store.', '1'), ('My classroom is across the hall.', '1'), ('He’s standing against the wall.', '1'), ('The boy was tossing a ball against the wall.', '1'), ('He works during the day.', '0'), ('They worked for three hours.', '0'), (\"The weather forecast says we are in for three feet of snow over the next week.\", '0'),\n", + " ('We planned to research from fall to spring.', '0'), ('We finished the research in the morning', '0'), ('Did you buy this ice cream for me?', '0'), ('The dog ran at him', '1'), ('He stood among the trees.', '1'), ('He spent two hours walking around the park.', '1'), ('I shot at the target.', '1'), ('I dropped my wallet behind the dresser.', '1'), ('He came here in 2003.', '0'), (\"I don't have to account for anything to you.\", '0'), ('The 2006 St. Louis Cardinals backed into the playoffs.', '0'),\n", + " ('We have been doing this research since the start of the semester', '0'), ('We have been doing this research for three months.', '0'), ('The house is just off the main road.', '1'), ('She’s been at the library for hours.', '1'), ('She’s away from her desk right now.', '1'), ('The ship quickly disappeared beneath the surface.', '1'), ('I’ll be there by 4:00.', '0'), ('Through the years, she became more proficient.', '0'), ('I was about to sue them, but I had to back down.', '0'),\n", + " ('We will complete this research during the spring semester.', '0'), ('We did our research at the library', '0'), ('We did our research on campus.', '1'), ('She walked away from me', '1'), ('I can remember standing before that judge.', '1'), ('We waited behind the building.', '1'), ('We traveled just beyond the border.', '1'), ('She’s coming between 2:00 and 3:00.', '0'), ('They worked through the night.', '0'), ('He stepped aside.', '1'), (\"We're running out of milk\", '0'),\n", + " ('We did our research in the lab.', '1'), ('We plan to finish this research by June 5.', '0'), ('The boy will not sleep in his room.', '1'), ('The boy arrives in a minute.', '0'), ('I can feel something just below the surface.', '1'), ('There’s a door just beneath the stairs.', '1'), ('Can’t you put your dirty laundry in the hamper?', '1'), ('Over the years, he became wiser.', '0'), (\"She's a street performer, so she acts out on Ninth Street.\", '0'), \n", + " ('The students left in a hurry', '0'), ('She left for school.', '1'), ('She is looking for a job.', '0'), ('He is out of danger', '0'), ('He climbed out of the window.', '1'), ('I love reading by the pool.', '1'), ('', ''), ('She sat beside the pool.', '1'), ('I found a quarter between the cushions.', 
'1'), ('He said he was coming at four o’clock.', '0'), ('He worked from 3:00 to 4:00.', '0'), (\"Back up your documents folder before applying the update.\", '0'),\n", + " ('He jumped from the roof.', '1'), ('This is groundbreaking research in the engineering field. ', '0'), (\"They ran through the forest.\", '1'), ('He is away from the office right now.', '1'), ('The letter is still in the envelope.', '1'), ('It’s inside the box.', '1'), ('I ran inside the house and answered the phone.', '1'), ('She was standing by the window.', '1'), ('The criminal broke into the house.', '0'), ('She backed out of organizing the fund-raising.', '0'),\n", + " ('The clock is on the wall', '1'), ('I got into the program.', '0'), ('The cat is under the table', '1'), ('This research happens under the supervision of an advisor.', '0'), ('The dictionary is on the shelf.', '1'), ('The school is opposite a large park.', '1'), ('The entire team ran onto the court.', '1'), ('He walked into the room and slammed the door.', '1'), ('The child is in despair.', '0'), ('You need to back off, or the situation could turn ugly.', '0'),\n", + " ('There is a post office between the library and the church.', '1'), ('We found the answers between two data points.', '0'), ('The data is within the range that we expected.', '0'), ('The horse galloped away from its owner.', '1'), ('I got a call from my son today.', '0'),('Please don’t throw things on my desk.', '1'), ('A compass is useless at the North Pole.', '1'), (\"We're falling in love.\",'0'), ('It is difficult to carry on a conversation with so many distractions.', '0'),\n", + " (\"I'm at a crossroads\", '0'), ('Our office is near the lab, but not next to it.', '1'), ('The new year is near.', '1'), ('The train is right on time.', '0'), ('The restaurant is on strike.', '0'), ('The bridge is just beyond that tollbooth.', '1'), ('I left the house at 6:00.', '0'), ('From Paris he flew to Berlin.', '1'), ('She abstained from casting a vote of no confidence.', '0'), (\" got beaten up by thugs on my way home.\", '0'),\n", + " ('The restaurant is at the end of the street.', '0'), ('They were left out of the group.', '0'), ('They went out of the room.', '1'), (\"Jesus walked on water.\", '1'), ('She had peeped into the book.', '0'), ('Our favorite restaurant is just off the road.', '1'),('The troops marched toward the village.', '1'), ('They prevented her from leaving the meeting.', '0'), ('The girl broke into tears', '0'), (\"I'll go ahead and you can catch me up later.\", '0'),\n", + " ('She fearlessly walks into the room.', '1'), ('I am going to the lab.', '1'), ('We gave the results to our advisor', '0'), ('We had to get through the literature review before we began our methodology.', '0'), ('This painting will look great over the fireplace.', '1'), ('The puppy crept under the chair.', '1'), ('He dissuaded her from reporting him to the director.', '0'), ('The boy is in tears', '0'), (\"I'm calling off the event\", '0'),\n", + " ('We had to start from these findings in order to get our best results', '0'), ('He walked toward the door', '1'), ('We are working toward a better research method but have not achieved it yet.', '0'), ('My keys were under the dresser.', '1'), (' I met her in the United States', '1'), (' I met her in Chicago at the Lyric Opera.', '1'), ('I can see you before 3:00.', '0'), ('He cried in happiness', '0'),\n", + " ('The dog sat on the pillow', '1'), ('We live between two neighbors.', '1'), (\"Let's keep it between you and me\", '0'), ('He sat on the chair', 
'1'), ('There is some milk in the fridge.', '1'), ('She was hiding under the table.', '1'), ('The stampeding cattle ran right off the cliff.', '1'), ('The ball fell off the table, onto the floor, and rolled under the bed.', '1'), (\"The roof gave in under the weight of the snow.\", '0'), (\"They gave up the search when it got dark.\", '0'),\n", + " ('The cat jumped off the counter.', '1'), ('Barry drove over the bridge.', '1'), ('Matilde lost her ring at the beach.', '1'), ('The book belongs to Anthony.', '0'), ('They were sitting by the tree.', '0'), ('I walked out of the house.', '1'), ('The portrait of their mother hangs over the fireplace.', '1'), ('I heard it from her.', '0'), ('We learned a lot from Professor Kearns.', '0'), ('He checked into the hotel.', '0'),\n", + " ('Once upon a time, there was a beautiful princess.', '0'), ('The baby climbed onto the table.', '1'), ('It is up to us to find the answer.', '0'), ('The loud noise came from within the stadium.', '1'), ('She never leaves without her phone.', '0'), ('The house lies just over that hill', '1'), ('When he awoke they were driving through the forest.', '1'), ('Toward morning, he fell asleep.', '0'), (\"Don't bring up politics if you want to have a quiet conversation with that guy.\", '0'),\n", + " ('Joni sat across from Marie.', '1'), ('The caterpillar turned into a butterfly.', '0'), ('I was unable to get out of the appointment.', '1'), ('I attended the meeting on behalf of my company.', '0'), ('My car is parked in front of the mailbox.', '1'), ('We went from Chicago to Zurich.', '1'), ('We drove from Frankfurt through Kassel to Hannover.', '1'), ('She completed the assignment in under an hour.', '0'),\n", + " ('Joni sat behind Marie.', '1'), ('The children loved the gifts from their grandparents.', '0'), (\"The burglar stole the money under cover of darkness.\", \"0\"), ('The troops marched by the stand where the general and guests were seated.', '1'), ('Time is passing by.', '0'), ('It was about 3:00 when we stopped.', '0'), ('We stopped around 3:30.', '0'), ('We are meeting at 8:00 pm.', '0'), ('The smell of the magnolia brought back sweet memories of my childhood.', '0')]" + ], + "metadata": { + "id": "egiegQwDMUwg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Naive Bayes Model from scratch" + ], + "metadata": { + "id": "aXtnBBLAaK9F" + } + }, + { + "cell_type": "code", + "source": [ + "import itertools\n", + "from collections import Counter\n", + "import nltk\n", + "from numpy import prod\n", + "from math import log\n", + "import time\n", + "\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem.snowball import SnowballStemmer\n", + "stemmer = SnowballStemmer('english')\n", + "\n", + "####\n", + "\n", + "stops = [\"no\", \"not\", \"and\", \"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\", \"do\", \"does\", \"did\", \"doing\", \"but\", \"if\", \"or\", \"because\", \"as\", \"until\", \"while\", \"with\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", 
\"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\", \"page\"]\n", + "\n", + "####\n", + "\n", + "def tokenizar(str_texto):\n", + " return word_tokenize(str_texto)\n", + "\n", + "def limpar(lista):\n", + " return [i.lower() for i in lista if i.isalpha()]\n", + "\n", + "def sem_stops(lista):\n", + " return [i for i in lista if i not in stops]\n", + "\n", + "def stemizar(lista):\n", + " return [stemmer.stem(i) for i in lista]\n", + "\n", + "def achatar(lista):\n", + " return list(itertools.chain(*lista))\n", + "\n", + "def pre_processar(str_texto):\n", + " return sem_stops(limpar(tokenizar(str_texto)))\n" + ], + "metadata": { + "id": "clcPkQpDMQOQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "len(corpus)" + ], + "metadata": { + "id": "eUwDd6nizLIF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "corpus = [(pre_processar(i[0]), i[1]) for i in corpus]\n", + "corpus[:10]" + ], + "metadata": { + "id": "GsKsb6Z89Q7o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pre_processar(\"He’s standing against the wall\")" + ], + "metadata": { + "id": "fdS9YpFrfKm1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "is_spatial = [i[0] for i in corpus if i[1] == '1'] \n", + "non_spatial = [i[0] for i in corpus if i[1] == '0'] \n", + "\n", + "vocab = set(achatar(is_spatial)) | set(achatar(non_spatial))" + ], + "metadata": { + "id": "Eq7BH890X11o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Counting items\n", + "n_items_is_spatial = len(is_spatial)\n", + "n_items_non_spatial = len(non_spatial)\n", + "n_docs_total = n_items_is_spatial + n_items_non_spatial\n", + "\n", + "# N. items in vocabulary\n", + "n_vocab = len(vocab)\n", + "\n", + "# Counting words (n. of occurrences)\n", + "tokens_is_spatial = achatar(is_spatial)\n", + "cont_is_spatial = Counter(tokens_is_spatial)\n", + "tokens_non_spatial = achatar(non_spatial)\n", + "cont_non_spatial = Counter(tokens_non_spatial)\n", + "\n", + "# N. 
of words by class\n", + "n_tokens_is_spatial = sum(cont_is_spatial.values())\n", + "n_tokens_non_spatial = sum(cont_non_spatial.values())" + ], + "metadata": { + "id": "IaFrpLRgBuVK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#### Input: ####\n", + "sent_input = input('Enter a sentence containing an English preposition: ')\n", + "print(f'You entered \"{sent_input}\", correct?')\n", + "sent_input2 = input('y/n: ')\n", + "\n", + "#### Condition: ####\n", + "if sent_input2.lower() == \"n\":\n", + " sent_input = input('Reenter your sentence: ')\n", + "\n", + "# Keep only tokens that appear in the training vocabulary\n", + "tokens_test = pre_processar(sent_input)\n", + "tokens_test = [i for i in tokens_test if i in vocab]\n", + "\n", + "# Log prior (document counts) plus log likelihoods with Laplace (add-one) smoothing:\n", + "prob_is_spatial = log(n_items_is_spatial / n_docs_total) + sum([log((cont_is_spatial[i] + 1) / (n_tokens_is_spatial + n_vocab)) for i in tokens_test])\n", + "prob_non_spatial = log(n_items_non_spatial / n_docs_total) + sum([log((cont_non_spatial[i] + 1) / (n_tokens_non_spatial + n_vocab)) for i in tokens_test])\n", + "\n", + "#### Output: ####\n", + "print(' ')\n", + "print('-------')\n", + "print(sent_input)\n", + "print(prob_is_spatial, prob_non_spatial)\n", + "\n", + "if prob_is_spatial > prob_non_spatial:\n", + " print('The preposition you entered is: SPATIAL.')\n", + "else:\n", + " print('The preposition you entered is: NON-SPATIAL.')\n", + "\n", + "time.sleep(5)" + ], + "metadata": { + "id": "pGgn2i6NLYeH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Model with scikit-learn" + ], + "metadata": { + "id": "KLQd9AKfs3eJ" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "import numpy as np" + ], + "metadata": { + "id": "Widp81ZxgLYc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Skip the empty placeholder entry so its blank label does not become a third class\n", + "sents = [i[0] for i in corpus if i[1] in ('0', '1')]\n", + "tags = [i[1] for i in corpus if i[1] in ('0', '1')]" + ], + "metadata": { + "id": "30xgUZslvpjH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "sents = np.array([' '.join(sent) for sent in sents])" + ], + "metadata": { + "id": "R5XEIGBX_YTN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "vetor = CountVectorizer()\n", + "x = vetor.fit_transform(sents).toarray()\n", + "x" + ], + "metadata": { + "id": "M2_4N8mQwnd8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "x.shape" + ], + "metadata": { + "id": "q28WuKZ68wB5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model = MultinomialNB() # Create a Multinomial Naive Bayes model\n", + "model.fit(x, tags) # Fit the model to the data and labels" + ], + "metadata": { + "id": "CRODSl3Awy0d" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Number of examples by class\n", + "\n", + "model.class_count_" + ], + "metadata": { + "id": "Bli_aZgXyBLn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Class log prior probabilities, which indicate model bias\n", + "\n", + "model.class_log_prior_" + ], + "metadata": { + "id": 
"N6xx6vZZ4frX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#### Input: ####\n", + "sent_input3 = input('Enter a sentence containing an English preposition: ')\n", + "print(f'You entered \"{sent_input3}\", right?')\n", + "sent_input4 = input('y/n: ')\n", + "\n", + "#### Condition with output: ####\n", + "if sent_input4.lower() == \"y\":\n", + " prediction = str(model.predict(vetor.transform([sent_input3])))\n", + " print(' ')\n", + " print('-------')\n", + " print(sent_input3)\n", + " print('The preposition you entered is:', prediction.replace(\"['1']\", 'SPATIAL.').replace(\"['0']\", 'NON-SPATIAL.'))\n", + "\n", + "if sent_input4.lower() == \"n\": \n", + " sent_input = input('Reenter your sentence: ')\n", + " prediction = str(model.predict(vetor.transform([sent_input3])))\n", + " print(' ')\n", + " print('-------')\n", + " print(sent_input3)\n", + " print('The preposition you entered is:', str(prediction.replace(\"['1']\", 'SPATIAL.').replace(\"['0']\", 'NON-SPATIAL.')))\n", + "\n", + "time.sleep(5)" + ], + "metadata": { + "id": "yYQhJrJ_yDNg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "# Train vs Test Split\n", + "\n", + "\n" + ], + "metadata": { + "id": "-9JyHAREP9ai" + } + }, + { + "cell_type": "code", + "source": [ + "# Splits corpus into Train (80%) and Test (20%)\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(x, tags, test_size=.2)" + ], + "metadata": { + "id": "HghcoY1HP-PR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_train.shape" + ], + "metadata": { + "id": "z6Fg-vWOQLAq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_test.shape" + ], + "metadata": { + "id": "0p4AYgGCQOBl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model = MultinomialNB() # Creates MNB model\n", + "model.fit(X_train, y_train) # Adjests data according to the tags" + ], + "metadata": { + "id": "tSQ06050QRWB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "predictions = model.predict(X_test)" + ], + "metadata": { + "id": "_-6zX53S6Fpc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "predictions[:3] " + ], + "metadata": { + "id": "uHz4gI5LQW1P" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Using Scikit-learn\n", + "from sklearn.metrics import classification_report\n", + "\n", + "print(classification_report(y_test, predictions))" + ], + "metadata": { + "id": "0c10WyuRQbde" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import plot_confusion_matrix" + ], + "metadata": { + "id": "UzeLXroZQf6B" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "plot_confusion_matrix(model, X_test, y_test)" + ], + "metadata": { + "id": "OtAo7lJKQiEB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Probabilities of Classes" + ], + "metadata": { + "id": "eZ3qD1HwRWh9" + } + }, + { + "cell_type": "code", + "source": [ + "# N. 
of examples by class\n", + "\n", + "model.class_count_" + ], + "metadata": { + "id": "8eWlyORLRaek" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Prior probabilities of classes (logarithmic)\n", + "# Shows the model's class bias\n", + "\n", + "model.class_log_prior_" + ], + "metadata": { + "id": "Co8zlyUpRg0u" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model.feature_count_" + ], + "metadata": { + "id": "ZH4NbRoGRkWw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# N. of attributes\n", + "\n", + "model.n_features_in_" + ], + "metadata": { + "id": "XLPNAU18Rmoh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Log probability of each attribute given the class\n", + "# The first list corresponds to class '0' (NON-SPATIAL) and the second to class '1' (SPATIAL)\n", + "\n", + "model.feature_log_prob_" + ], + "metadata": { + "id": "Y8Db34jORqJy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Alphabetically indexed attribute names\n", + "\n", + "vetor.get_feature_names_out()" + ], + "metadata": { + "id": "yj_PFsEYRtce" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dic_atributes = dict()\n", + "for e, i in enumerate(vetor.get_feature_names_out()):\n", + " dic_atributes[i] = (model.feature_log_prob_[0][e], model.feature_log_prob_[1][e])" + ], + "metadata": { + "id": "k4rhXuDsRwkM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# Column order matches feature_log_prob_: class '0' first, class '1' second\n", + "df = pd.DataFrame.from_dict(dic_atributes, orient='index', columns=['NON-SPATIAL', 'SPATIAL'])\n", + "df" + ], + "metadata": { + "id": "kdelGXkbRy0q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df['Ratio SPATIAL/NON-SPATIAL'] = df['SPATIAL'] / df['NON-SPATIAL'] # The smaller the ratio, the more \"spatial\" the term is.\n", + "df" + ], + "metadata": { + "id": "rvBzcNNxR_bH" + }, + "execution_count": null, + "outputs": [] + },
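 + { + "cell_type": "markdown", + "source": [ + "# Cross-Validation Check (suggested)\n", + "\n", + "A possible extension, not part of the original analysis: with a corpus this small, a single 80/20 split gives a noisy accuracy estimate, so k-fold cross-validation averages over several splits. The sketch below assumes the `x` matrix and `tags` list built in the scikit-learn section are still in memory; the choice of `cv=5` folds is arbitrary." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# Sketch: 5-fold cross-validated accuracy for Multinomial Naive Bayes.\n", + "# Assumes `x` and `tags` from the cells above; cv=5 is an arbitrary choice.\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "\n", + "scores = cross_val_score(MultinomialNB(), x, tags, cv=5, scoring='accuracy')\n", + "print('Fold accuracies:', scores)\n", + "print(f'Mean accuracy: {scores.mean():.2f} (+/- {scores.std():.2f})')" + ], + "metadata": {}, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file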