diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..6ce73fb9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +zippedData/ +tableau/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/README.md b/README.md index b5e02341..dc5e2d21 100644 --- a/README.md +++ b/README.md @@ -1,281 +1,209 @@ -# Phase 2 Project Description +

Hi 👋, We're Group 6

-You've made it through the second phase of this course, and now you will put your new skills to use with a large end-of-Phase project! +

Connect with me:

+

+anthony-odhiambo-47167a15b/ +

-In this project description, we will cover: +Github Repository: [Click to Open the Project Github Repository](https://github.com/odhinto/dsc-phase-1-project-v3) -* [***Project Overview:***](#project-overview) the project goal, audience, and dataset -* [***Deliverables:***](#deliverables) the specific items you are required to produce for this project -* [***Grading:***](#grading) how your project will be scored -* [***Getting Started:***](#getting-started) guidance for how to begin your first project +Tableau Dashboard: [Click to Open the Tableau Dashboard](https://public.tableau.com/views/Phase1_EDA/Dashboard2?:language=en-US&publish=yes&:sid=&:redirect=auth&:display_count=n&:origin=viz_share_link) -## Project Overview -For this project, you will use exploratory data analysis to generate insights for a business stakeholder. +# **Problem Definition** -### Business Problem +A recent profitable trend across most big companies is creation of original video content. Our company's expansion and diversification plans include getting in on this fun by creating a new movie studio. +An analysis of box office performance is critical to identify a profitable formula for operating a profitable studio. -Your company now sees all the big companies creating original video content and they want to get in on the fun. They have decided to create a new movie studio, but they don’t know anything about creating movies. You are charged with exploring what types of films are currently doing the best at the box office. You must then translate those findings into actionable insights that the head of your company's new movie studio can use to help decide what type of films to create. 
+## Business Understanding -### The Data +The primary objective of this exercise is to generate an accurate model for predicting box office success as a blueprint for running our proposed new movie studio -In the folder `zippedData` are movie datasets from: +* The highest rated movies with respect to attributes such as Genre, Region, Writers, Directors, Actors etc. +* The most watched/voted movies with respect to attributes such as Genre, Region, Writers, Directors, Actors etc. +* The highest grossing movies with respect to attributes such as Genre, Region, Writers, Directors, Actors etc. -* [Box Office Mojo](https://www.boxofficemojo.com/) -* [IMDB](https://www.imdb.com/) -* [Rotten Tomatoes](https://www.rottentomatoes.com/) -* [TheMovieDB](https://www.themoviedb.org/) -* [The Numbers](https://www.the-numbers.com/) +The insights from this analysis will determine the kind of movies our studio focuses on and the people we approach to partner with us to ensure our studio is successful and profitable. -Because it was collected from various locations, the different files have different formats. Some are compressed CSV (comma-separated values) or TSV (tab-separated values) files that can be opened using spreadsheet software or `pd.read_csv`, while the data from IMDB is located in a SQLite database. +# **Data Preprocessing** -![movie data erd](https://raw.githubusercontent.com/learn-co-curriculum/dsc-phase-2-project-v3/main/movie_data_erd.jpeg) +This section prepare the provided movie data for analysis. We intend to do the following: -Note that the above diagram shows ONLY the IMDB data. You will need to look carefully at the features to figure out how the IMDB data relates to the other provided data files. +* Dataset Overview - Load and understand the data +* Handling Missing Values using derived domain knowledge and imputation +* Data Cleaning e.g. 
standardizing categorical values, deriving useful date data, removing duplicates etc -It is up to you to decide what data from this to use and how to use it. If you want to make this more challenging, you can scrape websites or make API calls to get additional data. If you are feeling overwhelmed or behind, we recommend you use only the following data files: +## **Dataset Overview** -* `im.db.zip` - * Zipped SQLite database (you will need to unzip then query using SQLite) - * `movie_basics` and `movie_ratings` tables are most relevant -* `bom.movie_gross.csv.gz` - * Compressed CSV file (you can open without expanding the file using `pd.read_csv`) +It is imperative for us to understand the movie database first i.e.: -### Key Points +* The data structure e.g. available tables, their columns, data types and presence of missing values +* Establish the relevance of the data to our study +* Identify useful columns to focus on -* **Your analysis should yield three concrete business recommendations.** The ultimate purpose of exploratory analysis is not just to learn about the data, but to help an organization perform better. Explicitly relate your findings to business needs by recommending actions that you think the business should take. +Data Understanding will prescribe subsequent cleaning steps to be done in the **Data Cleaning** subsection -* **Communicating about your work well is extremely important.** Your ability to provide value to an organization - or to land a job there - is directly reliant on your ability to communicate with them about what you have done and why it is valuable. Create a storyline your audience (the head of the new movie studio) can follow by walking them through the steps of your process, highlighting the most important points and skipping over the rest. 
+### Python Libraries Initialization +First, we initialize the common libraries we expect to use in this exercise: -* **Use plenty of visualizations.** Visualizations are invaluable for exploring your data and making your findings accessible to a non-technical audience. Spotlight visuals in your presentation, but only ones that relate directly to your recommendations. Simple visuals are usually best (e.g. bar charts and line graphs), and don't forget to format them well (e.g. labels, titles). +* pandas to create and manipulate dataframes +* seaborn and matplotlib to facilitate any requisite visualizations within the notebook +* numpy for mathematical calculations +* sqlite3 to navigate the movie database +* etc -## Deliverables +### Data Loading +We then load the dataset into Python as a dataframe and embark on a data understanding exercise. -There are three deliverables for this project: +### Data Understanding -* A **non-technical presentation** -* A **Jupyter Notebook** -* A **GitHub repository** +The movie database contains the following tables with the columns shown -### Non-Technical Presentation +* **principals**: insert description and data summary +* **persons**: insert description and data summary +* **known_for**: insert description and data summary +* **directors**: insert description and data summary +* **writers**: insert description and data summary +* **movie_basics**: insert description and data summary +* **movie_ratings**: insert description and data summary +* **movie_akas**: insert description and data summary -The non-technical presentation is a slide deck presenting your analysis to business stakeholders. +
+ Database Schema +

Dirty Makers - Needs Cleaning with Fuzzy Logic

+
-* ***Non-technical*** does not mean that you should avoid mentioning the technologies or techniques that you used, it means that you should explain any mentions of these technologies and avoid assuming that your audience is already familiar with them. -* ***Business stakeholders*** means that the audience for your presentation is the company, not the class or teacher. Do not assume that they are already familiar with the specific business problem. -The presentation describes the project ***goals, data, methods, and results***. It must include at least ***three visualizations*** which correspond to ***three business recommendations***. +## Data Cleaning -We recommend that you follow this structure, although the slide titles should be specific to your project: +From the data cleaning requirements identified during the data understanding step, the following cleaning exercises were implemented: +* Removing rows with empty **Event.ID** values since for such cases, most of the other values were blank. This yielded clean **Investigation.Type** columns as well. +* Removing rows with empty **Make** values since the main objective of this study is to identify the lowest-risk aircraft to invest in. +* Removing the bracketed number of fatalities in the **Injury.Severity** column since the same can be evaluated from the **Total.Fatal.Injuries** column. +* Normalizing capitalization formatting in all the categorical fields to ensure clean categories. +* Replacing **UNK** with **Unknown** in all relevant fields to enhance understanding of the data. +* Converting Date Columns **Event.Date** and **Publication.Date** to Python Datetime format +* Handling missing values: We will use different strategies to handle different categories of missing data: -1. Beginning - * Overview - * Business Understanding -2. Middle - * Data Understanding - * Data Analysis -3. 
End - * Recommendations - * Next Steps - * Thank You - * This slide should include a prompt for questions as well as your contact information (name and LinkedIn profile) + * Most of the longitude and latitude data is missing, and it does not seem to be relevant to this study. Additionally, we also have location data which is more readily available. Thus, we can drop the longitude and latitude columns. + * FAR.Description data seems irrelevant to the study, and so it was dropped. + * For the few missing values of relevant categorical columns, we can fill missing values with **Unknown**. We may be able to extract insight even with "Unknown" parameters if other related parameters are known e.g. you may have an unknown model but know the manufacturer. This remains relevant to our study. + * For the data on total number of injuries, it is best to assume that the data meant to be there is '0' e.g. if Total.Serious.Injuries is empty, it means there were no serious injuries. +* Checking and removing any duplicates +* Encoding categorical data into numbers to facilitate correlation calculations. +# Exploratory Data Analysis -You will give a live presentation of your slides and submit them in PDF format on Canvas. The slides should also be present in the GitHub repository you submit with a file name of `presentation.pdf`. +We can check for correlation in the measures in our data: -The graded elements of the presentation are: +
+ correlation heatmap +

Testing for Correlation in the Data Measures

+
-* Presentation Content -* Slide Style -* Presentation Delivery and Answers to Questions +There is no immediate insight evident from the correlation heat map. This could be due to the many records in the "Unknown" category. It is prudent to carry out further exploratory analysis using Tableau. -See the [Grading](#grading) section for further explanation of these elements. +## Tableau EDA -For further reading on creating professional presentations, check out: +Most of the accidents occur in the USA, distributed across almost all the states. -* [Presentation Content](https://github.com/learn-co-curriculum/dsc-project-presentation-content) -* [Slide Style](https://github.com/learn-co-curriculum/dsc-project-slide-design) +
+ accident location map +

Recorded Accident Locations

+
-### Jupyter Notebook +Cessna, Piper, Beech, Boeing and Bell registered the most accidents. -The Jupyter Notebook is a notebook that uses Python and Markdown to present your analysis to a data science audience. +
+ accident frequency per maker bubble +

Aviation Accidents Per Maker

+
-* ***Python and Markdown*** means that you need to construct an integrated `.ipynb` file with Markdown (headings, paragraphs, links, lists, etc.) and Python code to create a well-organized, skim-able document. - * The notebook kernel should be restarted and all cells run before submission, to ensure that all code is runnable in order. - * Markdown should be used to frame the project with a clear introduction and conclusion, as well as introducing each of the required elements. -* ***Data science audience*** means that you can assume basic data science proficiency in the person reading your notebook. This differs from the non-technical presentation. +In the event of an accident, it is almost guaranteed that the aircraft damage will be substantial to totally damaged for most of the makers. -Along with the presentation, the notebook also describes the project ***goals, data, methods, and results***. It must include at least ***three visualizations*** which correspond to ***three business recommendations***. +
+ aircraft damage per maker barchart +

Aircraft Damage Per Maker

+
-You will submit the notebook in PDF format on Canvas as well as in `.ipynb` format in your GitHub repository. +If you remove events where the aircraft damage is "Unknown", the following makers emerge as better candidates where there is a high likelihood of minor the accident only resulting in minor damage: -The graded elements for the Jupyter Notebook are: +* Airbus +* Boeing +* Mcdonnell Douglas +* Embraer +* Douglas -* Business Understanding -* Data Understanding -* Data Preparation -* Data Analysis -* Visualization -* Code Quality +
+ aircraft damage per maker normalized barchart +

Aircraft Damage Per Maker (Normalized)

+
-See the [Grading](#grading) section for further explanation of these elements. +To analyze fatalities in the event of an accident, it is important to understand the total number of passengers involved in this data. This can be done by summing up all the fatalities, serious injuries, minor injuries and uninjured passengers. -### GitHub Repository +
+ Total Occupancy per maker +

Total Occupancy Per Maker

+
-The GitHub repository is the cloud-hosted directory containing all of your project files as well as their version history. +We go a step further to assess the average occupancy per flight from the data. This will tell us the relative sizes of the aircrafts. -This repository link will be the project link that you include on your resume, LinkedIn, etc. for prospective employers to view your work. Note that we typically recommend that 3 links are highlighted (out of 5 projects) so don't stress too much about getting this one to be perfect! There will also be time after graduation for cosmetic touch-ups. +
+ Average Occupancy per maker +

Average Occupancy Per Maker

+
-A professional GitHub repository has: +Airbus, Mcdonnel Douglas, Boeing, Douglas and Embraer are generally bigger planes carrying more people, hence their high number of average occupancy. Cessna has a very low average occupancy, i.e. they make small aircrafts carrying very few people (around 2). -1. `README.md` - * A file called `README.md` at the root of the repository directory, written in Markdown; this is what is rendered when someone visits the link to your repository in the browser - * This file contains these sections: - * Overview - * Business Understanding - * Include stakeholder and key business questions - * Data Understanding and Analysis - * Source of data - * Description of data - * Three visualizations (the same visualizations presented in the slides and notebook) - * Conclusion - * Summary of conclusions including three relevant findings -2. Commit history - * Progression of updates throughout the project time period, not just immediately before the deadline - * Clear commit messages - * Commits from all team members (if a group project) -3. Organization - * Clear folder structure - * Clear names of files and folders - * Easily-located notebook and presentation linked in the README -4. Notebook(s) - * Clearly-indicated final notebook that runs without errors - * Exploratory/working notebooks (can contain errors, redundant code, etc.) from all team members (if a group project) -5. 
`.gitignore` - * A file called `.gitignore` at the root of the repository directory instructs Git to ignore large, unnecessary, or private files - * Because it starts with a `.`, you will need to type `ls -a` in the terminal in order to see that it is there - * GitHub maintains a [Python .gitignore](https://github.com/github/gitignore/blob/master/Python.gitignore) that may be a useful starting point for your version of this file - * To tell Git to ignore more files, just add a new line to `.gitignore` for each new file name - * Consider adding `.DS_Store` if you are using a Mac computer, as well as project-specific file names - * If you are running into an error message because you forgot to add something to `.gitignore` and it is too large to be pushed to GitHub [this blog post](https://medium.com/analytics-vidhya/tutorial-removing-large-files-from-git-78dbf4cf83a?sk=c3763d466c7f2528008c3777192dfb95)(friend link) should help you address this +It is important to consider the purpose of the flight from the accident data. -You wil submit a link to the GitHub repository on Canvas. +
+ Accident Distribution per purpose of flight +

Accident Distribution Based on Purpose of Flight

+
-See the [Grading](#grading) section for further explanation of how the GitHub repository will be graded. +Most of the accidents occurred during personal flights. -For further reading on creating professional notebooks and `README`s, check out [this reading](https://github.com/learn-co-curriculum/dsc-repo-readability-v2-2). +**NB:- Thus, this branch of aviation seems the riskiest, and our company should steer clear of it or only engage in it with extreme caution**. -## Grading +Next, it is important to understand the distribution of accidents across different aircraft categories over the years. -***To pass this project, you must pass each project rubric objective.*** The project rubric objectives for Phase 2 are: +
+ Trends Per Category +

Accident Trends per Aircraft Category

+
-1. Data Communication -2. Authoring Jupyter Notebooks -3. Data Manipulation and Analysis with `pandas` +Majority of the aircraft category data is "unknown". Filtering out unknown data will give a better indication. -### Data Communication +
+ Trends Per Category with Unknowns Filtered Out +

Accident Trends per Aircraft Category with "Unknown" Filtered Out

+
-Communication is a key "soft skill". In [this survey](https://www.payscale.com/data-packages/job-skills), 46% of hiring managers said that recent college grads were missing this skill. +Airplanes register the most accidents, and it is possible that this is simply because they are the most used category. -Because "communication" can encompass such a wide range of contexts and skills, we will specifically focus our Phase 2 objective on Data Communication. We define Data Communication as: +**NB:- A recommendation for further study is to compare this against the total number of aircrafts of each category in operation. This will give a better sense of the probability of accident per aircraft category.** -> Communicating basic data analysis results to diverse audiences via writing and live presentation +Next, we analyze the aircraft damage per engine type. -To further define some of these terms: +
+ Accident Damage per Engine Type +

Accident Damage per Engine Type

+
-* By "basic data analysis" we mean that you are filtering, sorting, grouping, and/or aggregating the data in order to answer business questions. This project does not involve inferential statistics or machine learning, although descriptive statistics such as measures of central tendency are encouraged. -* By "results" we mean your ***three visualizations and recommendations***. -* By "diverse audiences" we mean that your presentation and notebook are appropriately addressing a business and data science audience, respectively. +Reciprocating engines register an overwhelming majority of accidents with guaranteed substantial or total damage. -Below are the definitions of each rubric level for this objective. This information is also summarized in the rubric, which is attached to the project submission assignment. +**NB:- It is likely that this is the case due to their overwhelming majority among all aircraft in operation. Analyzing this against the total population of reciprocating engine-type aircrafts in operation will give a better sense.** -#### Exceeds Objective -Creates and describes appropriate visualizations for given business questions, where each visualization fulfills all elements of the checklist +It is important to understand the trend in accidents for different engine technologies over time. -> This "checklist" refers to the Data Visualization checklist within the larger Phase 2 Project Checklist +
+ Accident Trends per Engine Type +

Accident Trends per Engine Type

+
-#### Meets Objective (Passing Bar) -Creates and describes appropriate visualizations for given business questions +The total number of accidents is dropping significantly, in parallel with a steep reduction in the number of accidents for reciprocating engine-type aircrafts. This indicates either of the following: -> This objective can be met even if all checklist elements are not fulfilled. For example, if there is some illegible text in one of your visualizations, you can still meet this objective +* Reciprocating Engine-type Aircrafts are becoming significantly safer with time, hence the lower number of accidents, or, +* Reciprocating Engine-type Aircrafts are becoming less popular and hence not used as much -#### Approaching Objective -Creates visualizations that are not related to the business questions, or uses an inappropriate type of visualization - -> Even if you create very compelling visualizations, you cannot pass this objective if the visualizations are not related to the business questions - -> An example of an inappropriate type of visualization would be using a line graph to show the correlation between two independent variables, when a scatter plot would be more appropriate - -#### Does Not Meet Objective -Does not submit the required number of visualizations - -### Authoring Jupyter Notebooks - -According to [Kaggle's 2020 State of Data Science and Machine Learning Survey](https://www.kaggle.com/kaggle-survey-2020), 74.1% of data scientists use a Jupyter development environment, which is more than twice the percentage of the next-most-popular IDE, Visual Studio Code. Jupyter Notebooks allow for reproducible, skim-able code documents for a data science audience. Comfort and skill with authoring Jupyter Notebooks will prepare you for job interviews, take-home challenges, and on-the-job tasks as a data scientist. 
- -The key feature that distinguishes *authoring Jupyter Notebooks* from simply *writing Python code* is the fact that Markdown cells are integrated into the notebook along with the Python cells in a notebook. You have seen examples of this throughout the curriculum, but now it's time for you to practice this yourself! - -Below are the definitions of each rubric level for this objective. This information is also summarized in the rubric, which is attached to the project submission assignment. - -#### Exceeds Objective -Uses Markdown and code comments to create a well-organized, skim-able document that follows all best practices - -> Refer to the [repository readability reading](https://github.com/learn-co-curriculum/dsc-repo-readability-v2-2) for more tips on best practices - -#### Meets Objective (Passing Bar) -Uses some Markdown to create an organized notebook, with an introduction at the top and a conclusion at the bottom - -#### Approaching Objective -Uses Markdown cells to organize, but either uses only headers and does not provide any explanations or justifications, or uses only plaintext without any headers to segment out sections of the notebook - -> Headers in Markdown are delineated with one or more `#`s at the start of the line. You should have a mixture of headers and plaintext (text where the line does not start with `#`) - -#### Does Not Meet Objective -Does not submit a notebook, or does not use Markdown cells at all to organize the notebook - -### Data Manipulation and Analysis with `pandas` - -`pandas` is a very popular data manipulation library, with over 2 million downloads on Anaconda (`conda install pandas`) and over 19 million downloads on PyPI (`pip install pandas`) at the time of this writing. In our own internal data, we see that the overwhelming majority of Flatiron School DS grads use `pandas` on the job in some capacity. 
- -Unlike in base Python, where the Zen of Python says "There should be one-- and preferably only one --obvious way to do it", there is often more than one valid way to do something in `pandas`. However there are still more efficient and less efficient ways to use it. Specifically, the best `pandas` code is *performant* and *idiomatic*. - -Performant `pandas` code utilizes methods and broadcasting rather than user-defined functions or `for` loops. For example, if you need to strip whitespace from a column containing string data, the best approach would be to use the [`pandas.Series.str.strip` method](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.strip.html) rather than writing your own function or writing a loop. Or if you want to multiply everything in a column by 100, the best approach would be to use broadcasting (e.g. `df["column_name"] * 100`) instead of a function or loop. You can still write your own functions if needed, but only after checking that there isn't a built-in way to do it. - -Idiomatic `pandas` code has variable names that are meaningful words or abbreviations in English, that are related to the purpose of the variables. You can still use `df` as the name of your DataFrame if there is only one main DataFrame you are working with, but as soon as you are merging multiple DataFrames or taking a subset of a DataFrame, you should use meaningful names. For example, `df2` would not be an idiomatic name, but `movies_and_reviews` could be. - -We also recommend that you rename all DataFrame columns so that their meanings are more understandable, although it is fine to have acronyms. For example, `"col1"` would not be an idiomatic name, but `"USD"` could be. - -Below are the definitions of each rubric level for this objective. This information is also summarized in the rubric, which is attached to the project submission assignment. 
- -#### Exceeds Objective -Uses `pandas` to prepare data and answer business questions in an idiomatic, performant way - -#### Meets Objective (Passing Bar) -Successfully uses `pandas` to prepare data in order to answer business questions - -> This includes projects that _occasionally_ use base Python when `pandas` methods would be more appropriate (such as using `enumerate()` on a DataFrame), or occasionally performs operations that do not appear to have any relevance to the business questions - -#### Approaching Objective -Uses `pandas` to prepare data, but makes significant errors - -> Examples of significant errors include: the result presented does not actually answer the stated question, the code produces errors, the code _consistently_ uses base Python when `pandas` methods would be more appropriate, or the submitted notebook contains significant quantities of code that is unrelated to the presented analysis (such as copy/pasted code from the curriculum or StackOverflow) - -#### Does Not Meet Objective -Unable to prepare data using `pandas` - -> This includes projects that successfully answer the business questions, but do not use `pandas` (e.g. use only base Python, or use some other tool like R, Tableau, or Excel) - -## Getting Started - -Please start by reviewing the contents of this project description. If you have any questions, please ask your instructor ASAP. - -Next, you will need to complete the [***Project Proposal***](#project_proposal) which must be reviewed by your instructor before you can continue with the project. - -Then, you will need to create a GitHub repository. There are three options: - -1. Look at the [Phase 2 Project Templates and Examples repo](https://github.com/learn-co-curriculum/dsc-project-template) and follow the directions in the MVP branch. -2. Fork the [Phase 2 Project Repository](https://github.com/learn-co-curriculum/dsc-phase-2-project-v3), clone it locally, and work in the `student.ipynb` file. 
Make sure to also add and commit a PDF of your presentation to your repository with a file name of `presentation.pdf`. -3. Create a new repository from scratch by going to [github.com/new](https://github.com/new) and copying the data files from one of the above resources into your new repository. This approach will result in the most professional-looking portfolio repository, but can be more complicated to use. So if you are getting stuck with this option, try one of the above options instead. - -## Summary - -This project will give you a valuable opportunity to develop your data science skills using real-world data. The end-of-phase projects are a critical part of the program because they give you a chance to bring together all the skills you've learned, apply them to realistic projects for a business stakeholder, practice communication skills, and get feedback to help you improve. You've got this! +**NB:- Comparing this with data showing the engine types for all aircrafts in operation for the given years will give a better sense of this.** \ No newline at end of file diff --git a/index.ipynb b/index.ipynb index 3623bc14..5f13b431 100644 --- a/index.ipynb +++ b/index.ipynb @@ -2,621 +2,3866 @@ "cells": [ { "cell_type": "markdown", - "id": "5d35b2b4", + "id": "d1041ab1", "metadata": {}, "source": [ - "# Phase 2 Project Description" - ] - }, - { - "cell_type": "markdown", - "id": "b5e9e179", - "metadata": {}, - "source": [ - "You've made it through the second phase of this course, and now you will put your new skills to use with a large end-of-Phase project!\n", - "\n", - "In this project description, we will cover:\n", - "\n", - "* [***Project Overview:***](#project-overview) the project goal, audience, and dataset\n", - "* [***Deliverables:***](#deliverables) the specific items you are required to produce for this project\n", - "* [***Grading:***](#grading) how your project will be scored\n", - "* [***Getting Started:***](#getting-started) guidance for how to 
begin your first project" - ] - }, - { - "cell_type": "markdown", - "id": "58851385", - "metadata": {}, - "source": [ - "## Project Overview" - ] - }, - { - "cell_type": "markdown", - "id": "6f37995f", - "metadata": {}, - "source": [ - "For this project, you will use exploratory data analysis to generate insights for a business stakeholder." - ] - }, - { - "cell_type": "markdown", - "id": "8b0f1668", - "metadata": {}, - "source": [ - "### Business Problem" - ] - }, - { - "cell_type": "markdown", - "id": "dce55d1d", - "metadata": {}, - "source": [ - "Your company now sees all the big companies creating original video content and they want to get in on the fun. They have decided to create a new movie studio, but they don’t know anything about creating movies. You are charged with exploring what types of films are currently doing the best at the box office. You must then translate those findings into actionable insights that the head of your company's new movie studio can use to help decide what type of films to create." - ] - }, - { - "cell_type": "markdown", - "id": "d3d557bf", - "metadata": {}, - "source": [ - "### The Data" - ] - }, - { - "cell_type": "markdown", - "id": "ca34efb7", - "metadata": {}, - "source": [ - "In the folder `zippedData` are movie datasets from:\n", - "\n", - "* [Box Office Mojo](https://www.boxofficemojo.com/)\n", - "* [IMDB](https://www.imdb.com/)\n", - "* [Rotten Tomatoes](https://www.rottentomatoes.com/)\n", - "* [TheMovieDB](https://www.themoviedb.org/)\n", - "* [The Numbers](https://www.the-numbers.com/)\n", - "\n", - "Because it was collected from various locations, the different files have different formats. 
Some are compressed CSV (comma-separated values) or TSV (tab-separated values) files that can be opened using spreadsheet software or `pd.read_csv`, while the data from IMDB is located in a SQLite database.\n", - "\n", - "![movie data erd](https://raw.githubusercontent.com/learn-co-curriculum/dsc-phase-2-project-v3/main/movie_data_erd.jpeg)\n", + "# **Problem Definition**\n", "\n", - "Note that the above diagram shows ONLY the IMDB data. You will need to look carefully at the features to figure out how the IMDB data relates to the other provided data files.\n", + "A recent profitable trend across most big companies is creation of original video content. Our company's expansion and diversification plans include getting in on this fun by creating a new movie studio.\n", + "An analysis of box office performance is critical to identify a profitable formula for operating a profitable studio.\n", "\n", - "It is up to you to decide what data from this to use and how to use it. If you want to make this more challenging, you can scrape websites or make API calls to get additional data. 
If you are feeling overwhelmed or behind, we recommend you use only the following data files:\n", + "The primary objective of this exercise is to generate an accurate model for predicting box office success as a blueprint for running our proposed new movie studio\n", "\n", - "* `im.db.zip`\n", - " * Zipped SQLite database (you will need to unzip then query using SQLite)\n", - " * `movie_basics` and `movie_ratings` tables are most relevant\n", - "* `bom.movie_gross.csv.gz`\n", - " * Compressed CSV file (you can open without expanding the file using `pd.read_csv`)" - ] - }, - { - "cell_type": "markdown", - "id": "5ace6e4f", - "metadata": {}, - "source": [ - "### Key Points" + "* The highest rated movies with respect to attributes such as Genre, Region, Writers, Directors, Actors etc.\n", + "* The most watched/voted movies with respect to attributes such as Genre, Region, Writers, Directors, Actors etc.\n", + "* The highest grossing movies with respect to attributes such as Genre, Region, Writers, Directors, Actors etc." ] }, { "cell_type": "markdown", - "id": "c9d2edeb", + "id": "94558de7", "metadata": {}, "source": [ - "* **Your analysis should yield three concrete business recommendations.** The ultimate purpose of exploratory analysis is not just to learn about the data, but to help an organization perform better. Explicitly relate your findings to business needs by recommending actions that you think the business should take.\n", + "## **Dataset Overview**\n", + "It is imperative for us to understand the movie database first\n", "\n", - "* **Communicating about your work well is extremely important.** Your ability to provide value to an organization - or to land a job there - is directly reliant on your ability to communicate with them about what you have done and why it is valuable. 
Create a storyline your audience (the head of the new movie studio) can follow by walking them through the steps of your process, highlighting the most important points and skipping over the rest.\n", "\n", - "* **Use plenty of visualizations.** Visualizations are invaluable for exploring your data and making your findings accessible to a non-technical audience. Spotlight visuals in your presentation, but only ones that relate directly to your recommendations. Simple visuals are usually best (e.g. bar charts and line graphs), and don't forget to format them well (e.g. labels, titles)." - ] - }, - { - "cell_type": "markdown", - "id": "474e2ec3", - "metadata": {}, - "source": [ - "## Deliverables" - ] - }, - { - "cell_type": "markdown", - "id": "eaeda85f", - "metadata": {}, - "source": [ - "There are three deliverables for this project:\n", + "* The data structure e.g. available columns, data types and presence of missing values\n", + "* Identify useful columns to focus on\n", "\n", - "* A **non-technical presentation**\n", - "* A **Jupyter Notebook**\n", - "* A **GitHub repository**" + "Data Understanding will prescribe subsequent steps e.g. data cleaning\n" ] }, { "cell_type": "markdown", - "id": "a7f8e274", + "id": "28ba3473", "metadata": {}, "source": [ - "### Non-Technical Presentation" + "### Initializing Relevant Libraries" ] }, { "cell_type": "markdown", - "id": "540d5c27", + "id": "abe92043", "metadata": {}, "source": [ - "The non-technical presentation is a slide deck presenting your analysis to business stakeholders.\n", - "\n", - "* ***Non-technical*** does not mean that you should avoid mentioning the technologies or techniques that you used, it means that you should explain any mentions of these technologies and avoid assuming that your audience is already familiar with them.\n", - "* ***Business stakeholders*** means that the audience for your presentation is the company, not the class or teacher. 
Do not assume that they are already familiar with the specific business problem.\n", - "\n", - "The presentation describes the project ***goals, data, methods, and results***. It must include at least ***three visualizations*** which correspond to ***three business recommendations***.\n", - "\n", - "We recommend that you follow this structure, although the slide titles should be specific to your project:\n", - "\n", - "1. Beginning\n", - " * Overview\n", - " * Business Understanding\n", - "2. Middle\n", - " * Data Understanding\n", - " * Data Analysis\n", - "3. End\n", - " * Recommendations\n", - " * Next Steps\n", - " * Thank You\n", - " * This slide should include a prompt for questions as well as your contact information (name and LinkedIn profile)\n", - "\n", - "You will give a live presentation of your slides and submit them in PDF format on Canvas. The slides should also be present in the GitHub repository you submit with a file name of `presentation.pdf`.\n", - "\n", - "The graded elements of the presentation are:\n", - "\n", - "* Presentation Content\n", - "* Slide Style\n", - "* Presentation Delivery and Answers to Questions\n", - "\n", - "See the [Grading](#grading) section for further explanation of these elements.\n", - "\n", - "For further reading on creating professional presentations, check out:\n", - "\n", - "* [Presentation Content](https://github.com/learn-co-curriculum/dsc-project-presentation-content)\n", - "* [Slide Style](https://github.com/learn-co-curriculum/dsc-project-slide-design)" + "First, we initialize common libraries we project to utilize in this exercise" ] }, { - "cell_type": "markdown", - "id": "d27915ba", + "cell_type": "code", + "execution_count": null, + "id": "be4633b1", "metadata": {}, + "outputs": [], "source": [ - "### Jupyter Notebook" + "#Import libraries we're likely to use upfront\n", + "import pandas as pd #To create and manipulate pandas dataframes\n", + "import seaborn as sns #To Facilitate visualizations\n", + 
"import matplotlib.pyplot as plt #To facilitate visualizations\n", + "import numpy as np #To facilitate mathematical calculations\n", + "from sklearn.preprocessing import LabelEncoder #Use this to encode categorical data\n", + "import sqlite3 #Use this to navigate the SQL database\n", + "import warnings\n", + "from scipy import stats\n", + "from numbers import Number\n", + "%matplotlib inline" ] }, { "cell_type": "markdown", - "id": "2d5d45ea", + "id": "53095d17", "metadata": {}, "source": [ - "The Jupyter Notebook is a notebook that uses Python and Markdown to present your analysis to a data science audience.\n", - "\n", - "* ***Python and Markdown*** means that you need to construct an integrated `.ipynb` file with Markdown (headings, paragraphs, links, lists, etc.) and Python code to create a well-organized, skim-able document.\n", - " * The notebook kernel should be restarted and all cells run before submission, to ensure that all code is runnable in order.\n", - " * Markdown should be used to frame the project with a clear introduction and conclusion, as well as introducing each of the required elements.\n", - "* ***Data science audience*** means that you can assume basic data science proficiency in the person reading your notebook. This differs from the non-technical presentation.\n", - "\n", - "Along with the presentation, the notebook also describes the project ***goals, data, methods, and results***. It must include at least ***three visualizations*** which correspond to ***three business recommendations***.\n", - "\n", - "You will submit the notebook in PDF format on Canvas as well as in `.ipynb` format in your GitHub repository.\n", - "\n", - "The graded elements for the Jupyter Notebook are:\n", - "\n", - "* Business Understanding\n", - "* Data Understanding\n", - "* Data Preparation\n", - "* Data Analysis\n", - "* Visualization\n", - "* Code Quality\n", - "\n", - "See the [Grading](#grading) section for further explanation of these elements." 
+ "### Loading the dataset" ] }, { "cell_type": "markdown", - "id": "2027aa4c", + "id": "ee0f2d8b", "metadata": {}, "source": [ - "### GitHub Repository" + "Then we load the provided dataset" ] }, { - "cell_type": "markdown", - "id": "b8057390", + "cell_type": "code", + "execution_count": null, + "id": "1f99dc31", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df[df['type'] == 'table']\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"table\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8,\n \"samples\": [\n \"directors\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tbl_name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8,\n \"samples\": [\n \"directors\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rootpage\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 2,\n \"max\": 9,\n \"num_unique_values\": 8,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sql\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 8,\n \"samples\": [\n \"CREATE TABLE \\\"directors\\\" (\\n\\\"movie_id\\\" TEXT,\\n \\\"person_id\\\" TEXT\\n)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
typenametbl_namerootpagesql
0tablemovie_basicsmovie_basics2CREATE TABLE \"movie_basics\" (\\n\"movie_id\" TEXT...
1tabledirectorsdirectors3CREATE TABLE \"directors\" (\\n\"movie_id\" TEXT,\\n...
2tableknown_forknown_for4CREATE TABLE \"known_for\" (\\n\"person_id\" TEXT,\\...
3tablemovie_akasmovie_akas5CREATE TABLE \"movie_akas\" (\\n\"movie_id\" TEXT,\\...
4tablemovie_ratingsmovie_ratings6CREATE TABLE \"movie_ratings\" (\\n\"movie_id\" TEX...
5tablepersonspersons7CREATE TABLE \"persons\" (\\n\"person_id\" TEXT,\\n ...
6tableprincipalsprincipals8CREATE TABLE \"principals\" (\\n\"movie_id\" TEXT,\\...
7tablewriterswriters9CREATE TABLE \"writers\" (\\n\"movie_id\" TEXT,\\n ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " type name tbl_name rootpage \\\n", + "0 table movie_basics movie_basics 2 \n", + "1 table directors directors 3 \n", + "2 table known_for known_for 4 \n", + "3 table movie_akas movie_akas 5 \n", + "4 table movie_ratings movie_ratings 6 \n", + "5 table persons persons 7 \n", + "6 table principals principals 8 \n", + "7 table writers writers 9 \n", + "\n", + " sql \n", + "0 CREATE TABLE \"movie_basics\" (\\n\"movie_id\" TEXT... \n", + "1 CREATE TABLE \"directors\" (\\n\"movie_id\" TEXT,\\n... \n", + "2 CREATE TABLE \"known_for\" (\\n\"person_id\" TEXT,\\... \n", + "3 CREATE TABLE \"movie_akas\" (\\n\"movie_id\" TEXT,\\... \n", + "4 CREATE TABLE \"movie_ratings\" (\\n\"movie_id\" TEX... \n", + "5 CREATE TABLE \"persons\" (\\n\"person_id\" TEXT,\\n ... \n", + "6 CREATE TABLE \"principals\" (\\n\"movie_id\" TEXT,\\... \n", + "7 CREATE TABLE \"writers\" (\\n\"movie_id\" TEXT,\\n ... " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "The GitHub repository is the cloud-hosted directory containing all of your project files as well as their version history.\n", - "\n", - "This repository link will be the project link that you include on your resume, LinkedIn, etc. for prospective employers to view your work. Note that we typically recommend that 3 links are highlighted (out of 5 projects) so don't stress too much about getting this one to be perfect! There will also be time after graduation for cosmetic touch-ups.\n", - "\n", - "A professional GitHub repository has:\n", + "#connect the im.db database\n", + "#Understand the data and get a snapshot of the data\n", + "conn = sqlite3.connect('im.db')\n", + "cursor = conn.cursor()\n", "\n", - "1. 
`README.md`\n", - " * A file called `README.md` at the root of the repository directory, written in Markdown; this is what is rendered when someone visits the link to your repository in the browser\n", - " * This file contains these sections:\n", - " * Overview\n", - " * Business Understanding\n", - " * Include stakeholder and key business questions\n", - " * Data Understanding and Analysis\n", - " * Source of data\n", - " * Description of data\n", - " * Three visualizations (the same visualizations presented in the slides and notebook)\n", - " * Conclusion\n", - " * Summary of conclusions including three relevant findings\n", - "2. Commit history\n", - " * Progression of updates throughout the project time period, not just immediately before the deadline\n", - " * Clear commit messages\n", - " * Commits from all team members (if a group project)\n", - "3. Organization\n", - " * Clear folder structure\n", - " * Clear names of files and folders\n", - " * Easily-located notebook and presentation linked in the README\n", - "4. Notebook(s)\n", - " * Clearly-indicated final notebook that runs without errors\n", - " * Exploratory/working notebooks (can contain errors, redundant code, etc.) from all team members (if a group project)\n", - "5. 
`.gitignore`\n", - " * A file called `.gitignore` at the root of the repository directory instructs Git to ignore large, unnecessary, or private files\n", - " * Because it starts with a `.`, you will need to type `ls -a` in the terminal in order to see that it is there\n", - " * GitHub maintains a [Python .gitignore](https://github.com/github/gitignore/blob/master/Python.gitignore) that may be a useful starting point for your version of this file\n", - " * To tell Git to ignore more files, just add a new line to `.gitignore` for each new file name\n", - " * Consider adding `.DS_Store` if you are using a Mac computer, as well as project-specific file names\n", - " * If you are running into an error message because you forgot to add something to `.gitignore` and it is too large to be pushed to GitHub [this blog post](https://medium.com/analytics-vidhya/tutorial-removing-large-files-from-git-78dbf4cf83a?sk=c3763d466c7f2528008c3777192dfb95)(friend link) should help you address this\n", + "#tables in the database\n", + "df = pd.read_sql(\n", + " \"\"\"\n", + " SELECT *\n", + " FROM sqlite_master\n", + " \"\"\"\n", + ", conn\n", + ")\n", "\n", - "You wil submit a link to the GitHub repository on Canvas.\n", - "\n", - "See the [Grading](#grading) section for further explanation of how the GitHub repository will be graded.\n", - "\n", - "For further reading on creating professional notebooks and `README`s, check out [this reading](https://github.com/learn-co-curriculum/dsc-repo-readability-v2-2)." + "df[df['type'] == 'table']" ] }, { "cell_type": "markdown", - "id": "f19694e7", + "id": "01910e47", "metadata": {}, "source": [ - "## Grading" - ] - }, - { - "cell_type": "markdown", - "id": "06e9cfb7", - "metadata": {}, - "source": [ - "***To pass this project, you must pass each project rubric objective.*** The project rubric objectives for Phase 2 are:\n", - "\n", - "1. Data Communication\n", - "2. Authoring Jupyter Notebooks\n", - "3. 
Data Manipulation and Analysis with `pandas`" + "The movie database contains the 8 tables as shown below" ] }, { "cell_type": "markdown", - "id": "a4c04769", + "id": "156abdf5", "metadata": {}, "source": [ - "### Data Communication" + "![movie_data_erd.jpeg]()" ] }, { "cell_type": "markdown", - "id": "0834a4ee", + "id": "597a7776", "metadata": {}, "source": [ - "Communication is a key \"soft skill\". In [this survey](https://www.payscale.com/data-packages/job-skills), 46% of hiring managers said that recent college grads were missing this skill.\n", + "### Movie Database Data Understanding\n", "\n", - "Because \"communication\" can encompass such a wide range of contexts and skills, we will specifically focus our Phase 2 objective on Data Communication. We define Data Communication as:\n", - "\n", - "> Communicating basic data analysis results to diverse audiences via writing and live presentation\n", - "\n", - "To further define some of these terms:\n", - "\n", - "* By \"basic data analysis\" we mean that you are filtering, sorting, grouping, and/or aggregating the data in order to answer business questions. This project does not involve inferential statistics or machine learning, although descriptive statistics such as measures of central tendency are encouraged.\n", - "* By \"results\" we mean your ***three visualizations and recommendations***.\n", - "* By \"diverse audiences\" we mean that your presentation and notebook are appropriately addressing a business and data science audience, respectively.\n", - "\n", - "Below are the definitions of each rubric level for this objective. This information is also summarized in the rubric, which is attached to the project submission assignment." 
+ "Next we explore the various tables in the movie database:" ] }, { "cell_type": "markdown", - "id": "276dff7c", + "id": "16874c1f", "metadata": {}, "source": [ - "#### Exceeds Objective" + "#### Principals Table" ] }, { - "cell_type": "markdown", - "id": "e87c2713", + "cell_type": "code", + "execution_count": null, + "id": "a3bc30cf", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "principals_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_idorderingperson_idcategoryjobcharacters
0tt01114141nm0246005actorNone[\"The Man\"]
1tt01114142nm0398271directorNoneNone
2tt01114143nm3739909producerproducerNone
3tt032380810nm0059247editorNoneNone
4tt03238081nm3579312actressNone[\"Beth Boothby\"]
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " movie_id ordering person_id category job characters\n", + "0 tt0111414 1 nm0246005 actor None [\"The Man\"]\n", + "1 tt0111414 2 nm0398271 director None None\n", + "2 tt0111414 3 nm3739909 producer producer None\n", + "3 tt0323808 10 nm0059247 editor None None\n", + "4 tt0323808 1 nm3579312 actress None [\"Beth Boothby\"]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "Creates and describes appropriate visualizations for given business questions, where each visualization fulfills all elements of the checklist\n", - "\n", - "> This \"checklist\" refers to the Data Visualization checklist within the larger Phase 2 Project Checklist" + "#Explore principals table\n", + "principals_query = \"\"\"\n", + " SELECT *\n", + " FROM principals\"\"\"\n", + "principals_df = pd.read_sql(principals_query, conn)\n", + "principals_df.to_csv('principals.csv') #create a principals csv file to facilitate EDA on Tableau\n", + "principals_df.head()" ] }, { "cell_type": "markdown", - "id": "b4e8a4c7", + "id": "d36d8128", "metadata": {}, "source": [ - "#### Meets Objective (Passing Bar)" + "The principals table lists the main people (by person_id) who were involved with each movie (by movie_id) and the capacity in which they were involved, e.g. director, actor, producer. There could be a relationship between these people and the success of the movie at the box office." ] }, { "cell_type": "markdown", - "id": "bc4e21d0", + "id": "d386044a", "metadata": {}, "source": [ - "Creates and describes appropriate visualizations for given business questions\n", - "\n", - "> This objective can be met even if all checklist elements are not fulfilled. 
For example, if there is some illegible text in one of your visualizations, you can still meet this objective" + "#### Persons Table" ] }, { - "cell_type": "markdown", - "id": "d0403eb9", + "cell_type": "code", + "execution_count": null, + "id": "80cccf0d", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "persons_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
person_idprimary_namebirth_yeardeath_yearprimary_profession
0nm0061671Mary Ellen BauderNaNNaNmiscellaneous,production_manager,producer
1nm0061865Joseph BauerNaNNaNcomposer,music_department,sound_department
2nm0062070Bruce BaumNaNNaNmiscellaneous,actor,writer
3nm0062195Axel BaumannNaNNaNcamera_department,cinematographer,art_department
4nm0062798Pete BaxterNaNNaNproduction_designer,art_department,set_decorator
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " person_id primary_name birth_year death_year \\\n", + "0 nm0061671 Mary Ellen Bauder NaN NaN \n", + "1 nm0061865 Joseph Bauer NaN NaN \n", + "2 nm0062070 Bruce Baum NaN NaN \n", + "3 nm0062195 Axel Baumann NaN NaN \n", + "4 nm0062798 Pete Baxter NaN NaN \n", + "\n", + " primary_profession \n", + "0 miscellaneous,production_manager,producer \n", + "1 composer,music_department,sound_department \n", + "2 miscellaneous,actor,writer \n", + "3 camera_department,cinematographer,art_department \n", + "4 production_designer,art_department,set_decorator " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "#### Approaching Objective" + "#Explore persons table\n", + "persons_query = \"\"\"\n", + " SELECT *\n", + " FROM persons\"\"\"\n", + "persons_df = pd.read_sql(persons_query, conn)\n", + "persons_df.to_csv('persons.csv') #create a persons csv file to facilitate EDA on Tableau\n", + "persons_df.head()" ] }, { "cell_type": "markdown", - "id": "22dd4ad6", + "id": "3bac623a", "metadata": {}, "source": [ - "Creates visualizations that are not related to the business questions, or uses an inappropriate type of visualization\n", - "\n", - "> Even if you create very compelling visualizations, you cannot pass this objective if the visualizations are not related to the business questions\n", - "\n", - "> An example of an inappropriate type of visualization would be using a line graph to show the correlation between two independent variables, when a scatter plot would be more appropriate" + "Persons table details the name, birth year, death year and primary professions of the various people using their person_id. 
There could be a relationship between the people involved in a movie and the success of the movie in the box office" ] }, { "cell_type": "markdown", - "id": "aa1b808d", + "id": "a36df9dc", "metadata": {}, "source": [ - "#### Does Not Meet Objective" + "#### Known For Table" ] }, { - "cell_type": "markdown", - "id": "a8a64869", + "cell_type": "code", + "execution_count": null, + "id": "2d0fabc3", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "known_for_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
person_idmovie_id
0nm0061671tt0837562
1nm0061671tt2398241
2nm0061671tt0844471
3nm0061671tt0118553
4nm0061865tt0896534
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " person_id movie_id\n", + "0 nm0061671 tt0837562\n", + "1 nm0061671 tt2398241\n", + "2 nm0061671 tt0844471\n", + "3 nm0061671 tt0118553\n", + "4 nm0061865 tt0896534" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "Does not submit the required number of visualizations" + "#Explore known_for table\n", + "known_for_query = \"\"\"\n", + " SELECT *\n", + " FROM known_for\"\"\"\n", + "known_for_df = pd.read_sql(known_for_query, conn)\n", + "known_for_df.to_csv('known_for.csv') #create a known_for csv file to facilitate EDA on Tableau\n", + "known_for_df.head()" ] }, { "cell_type": "markdown", - "id": "db2e0ce8", + "id": "611553a9", "metadata": {}, "source": [ - "### Authoring Jupyter Notebooks" + "Known_for table details the various movies different people are known for by person_id and movie_id." ] }, { "cell_type": "markdown", - "id": "91cc89b5", + "id": "da96d99b", "metadata": {}, "source": [ - "According to [Kaggle's 2020 State of Data Science and Machine Learning Survey](https://www.kaggle.com/kaggle-survey-2020), 74.1% of data scientists use a Jupyter development environment, which is more than twice the percentage of the next-most-popular IDE, Visual Studio Code. Jupyter Notebooks allow for reproducible, skim-able code documents for a data science audience. Comfort and skill with authoring Jupyter Notebooks will prepare you for job interviews, take-home challenges, and on-the-job tasks as a data scientist.\n", - "\n", - "The key feature that distinguishes *authoring Jupyter Notebooks* from simply *writing Python code* is the fact that Markdown cells are integrated into the notebook along with the Python cells in a notebook. You have seen examples of this throughout the curriculum, but now it's time for you to practice this yourself!\n", - "\n", - "Below are the definitions of each rubric level for this objective. 
This information is also summarized in the rubric, which is attached to the project submission assignment." + "#### Directors Table" ] }, { - "cell_type": "markdown", - "id": "b9272672", + "cell_type": "code", + "execution_count": null, + "id": "9939af2e", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "directors_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_idperson_id
0tt0285252nm0899854
1tt0462036nm1940585
2tt0835418nm0151540
3tt0835418nm0151540
4tt0878654nm0089502
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " movie_id person_id\n", + "0 tt0285252 nm0899854\n", + "1 tt0462036 nm1940585\n", + "2 tt0835418 nm0151540\n", + "3 tt0835418 nm0151540\n", + "4 tt0878654 nm0089502" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "#### Exceeds Objective" + "#Explore directors table\n", + "directors_query = \"\"\"\n", + " SELECT *\n", + " FROM directors\"\"\"\n", + "directors_df = pd.read_sql(directors_query, conn)\n", + "directors_df.to_csv('directors.csv') #create a directors csv file to facilitate EDA on Tableau\n", + "directors_df.head()" ] }, { "cell_type": "markdown", - "id": "efc937e5", + "id": "eb4e7376", "metadata": {}, "source": [ - "Uses Markdown and code comments to create a well-organized, skim-able document that follows all best practices\n", - "\n", - "> Refer to the [repository readability reading](https://github.com/learn-co-curriculum/dsc-repo-readability-v2-2) for more tips on best practices" + "The directors table links each movie to its director(s) by movie_id and person_id. There could be a relationship between the directors of a movie and the success of the movie at the box office." ] }, { "cell_type": "markdown", - "id": "d01725ea", + "id": "20b5fefc", "metadata": {}, "source": [ - "#### Meets Objective (Passing Bar)" + "#### Writers Table" ] }, { - "cell_type": "markdown", - "id": "2c854f50", + "cell_type": "code", + "execution_count": null, + "id": "ba3a5732", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "writers_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_idperson_id
0tt0285252nm0899854
1tt0438973nm0175726
2tt0438973nm1802864
3tt0462036nm1940585
4tt0835418nm0310087
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " movie_id person_id\n", + "0 tt0285252 nm0899854\n", + "1 tt0438973 nm0175726\n", + "2 tt0438973 nm1802864\n", + "3 tt0462036 nm1940585\n", + "4 tt0835418 nm0310087" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "Uses some Markdown to create an organized notebook, with an introduction at the top and a conclusion at the bottom" + "#Explore writers table\n", + "writers_query = \"\"\"\n", + " SELECT *\n", + " FROM writers\"\"\"\n", + "writers_df = pd.read_sql(writers_query, conn)\n", + "writers_df.to_csv('writers.csv') #create a writers csv file to facilitate EDA on Tableau\n", + "writers_df.head()" ] }, { "cell_type": "markdown", - "id": "3e0b3385", + "id": "bcf3b1b4", "metadata": {}, "source": [ - "#### Approaching Objective" + "Writers table details the various movies and their writers by movie_id and person_id. There could be a relationship between the writers of a movie and the success of the movie in the box office" ] }, { "cell_type": "markdown", - "id": "67767f89", + "id": "ede4423c", "metadata": {}, "source": [ - "Uses Markdown cells to organize, but either uses only headers and does not provide any explanations or justifications, or uses only plaintext without any headers to segment out sections of the notebook\n", - "\n", - "> Headers in Markdown are delineated with one or more `#`s at the start of the line. You should have a mixture of headers and plaintext (text where the line does not start with `#`)" + "#### Movie_Basics Table" ] }, { - "cell_type": "markdown", - "id": "195ef62a", + "cell_type": "code", + "execution_count": null, + "id": "7b495071", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "movie_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_idprimary_titleoriginal_titlestart_yearruntime_minutesgenres
0tt0063540SunghurshSunghursh2013175.0Action,Crime,Drama
1tt0066787One Day Before the Rainy SeasonAshad Ka Ek Din2019114.0Biography,Drama
2tt0069049The Other Side of the WindThe Other Side of the Wind2018122.0Drama
3tt0069204Sabse Bada SukhSabse Bada Sukh2018NaNComedy,Drama
4tt0100275The Wandering Soap OperaLa Telenovela Errante201780.0Comedy,Drama,Fantasy
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " movie_id primary_title original_title \\\n", + "0 tt0063540 Sunghursh Sunghursh \n", + "1 tt0066787 One Day Before the Rainy Season Ashad Ka Ek Din \n", + "2 tt0069049 The Other Side of the Wind The Other Side of the Wind \n", + "3 tt0069204 Sabse Bada Sukh Sabse Bada Sukh \n", + "4 tt0100275 The Wandering Soap Opera La Telenovela Errante \n", + "\n", + " start_year runtime_minutes genres \n", + "0 2013 175.0 Action,Crime,Drama \n", + "1 2019 114.0 Biography,Drama \n", + "2 2018 122.0 Drama \n", + "3 2018 NaN Comedy,Drama \n", + "4 2017 80.0 Comedy,Drama,Fantasy " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "#### Does Not Meet Objective" + "#Explore movie_basics table\n", + "movie_query = \"\"\"\n", + " SELECT *\n", + " FROM movie_basics\"\"\"\n", + "movie_df = pd.read_sql(movie_query, conn)\n", + "movie_df.to_csv('movie.csv') #create a movies csv file to facilitate EDA on Tableau\n", + "movie_df.head()" ] }, { "cell_type": "markdown", - "id": "709181b9", + "id": "af3f3c04", "metadata": {}, "source": [ - "Does not submit a notebook, or does not use Markdown cells at all to organize the notebook" + "Movie_basics table details the various movie titles, the year they were released, the run-time minutes and the various genres (there may be need for feature engineering around this aspect). 
There could be a relationship between these parameters and the success of a movie in the box office" ] }, { "cell_type": "markdown", - "id": "290335d1", + "id": "a60bf85e", "metadata": {}, "source": [ - "### Data Manipulation and Analysis with `pandas`" + "#### Movie_Ratings Table" ] }, { - "cell_type": "markdown", - "id": "2c0aae32", + "cell_type": "code", + "execution_count": null, + "id": "f2f5485d", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"movie_ratings_df\",\n \"rows\": 73856,\n \"fields\": [\n {\n \"column\": \"movie_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 73856,\n \"samples\": [\n \"tt1777573\",\n \"tt1867008\",\n \"tt5652498\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"averagerating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.4749783548957855,\n \"min\": 1.0,\n \"max\": 10.0,\n \"num_unique_values\": 91,\n \"samples\": [\n 3.8,\n 7.7,\n 3.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"numvotes\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30294,\n \"min\": 5,\n \"max\": 1841066,\n \"num_unique_values\": 7349,\n \"samples\": [\n 7312,\n 2141,\n 1251\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "movie_ratings_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_idaverageratingnumvotes
0tt103565268.331
1tt103846068.9559
2tt10429746.420
3tt10437264.250352
4tt10602406.521
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " movie_id averagerating numvotes\n", + "0 tt10356526 8.3 31\n", + "1 tt10384606 8.9 559\n", + "2 tt1042974 6.4 20\n", + "3 tt1043726 4.2 50352\n", + "4 tt1060240 6.5 21" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "`pandas` is a very popular data manipulation library, with over 2 million downloads on Anaconda (`conda install pandas`) and over 19 million downloads on PyPI (`pip install pandas`) at the time of this writing. In our own internal data, we see that the overwhelming majority of Flatiron School DS grads use `pandas` on the job in some capacity.\n", - "\n", - "Unlike in base Python, where the Zen of Python says \"There should be one-- and preferably only one --obvious way to do it\", there is often more than one valid way to do something in `pandas`. However there are still more efficient and less efficient ways to use it. Specifically, the best `pandas` code is *performant* and *idiomatic*.\n", - "\n", - "Performant `pandas` code utilizes methods and broadcasting rather than user-defined functions or `for` loops. For example, if you need to strip whitespace from a column containing string data, the best approach would be to use the [`pandas.Series.str.strip` method](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.strip.html) rather than writing your own function or writing a loop. Or if you want to multiply everything in a column by 100, the best approach would be to use broadcasting (e.g. `df[\"column_name\"] * 100`) instead of a function or loop. You can still write your own functions if needed, but only after checking that there isn't a built-in way to do it.\n", - "\n", - "Idiomatic `pandas` code has variable names that are meaningful words or abbreviations in English, that are related to the purpose of the variables. 
You can still use `df` as the name of your DataFrame if there is only one main DataFrame you are working with, but as soon as you are merging multiple DataFrames or taking a subset of a DataFrame, you should use meaningful names. For example, `df2` would not be an idiomatic name, but `movies_and_reviews` could be.\n", - "\n", - "We also recommend that you rename all DataFrame columns so that their meanings are more understandable, although it is fine to have acronyms. For example, `\"col1\"` would not be an idiomatic name, but `\"USD\"` could be.\n", - "\n", - "Below are the definitions of each rubric level for this objective. This information is also summarized in the rubric, which is attached to the project submission assignment." + "#Explore movie_ratings table\n", + "movie_ratings_query = \"\"\"\n", + " SELECT *\n", + " FROM movie_ratings\"\"\"\n", + "movie_ratings_df = pd.read_sql(movie_ratings_query, conn)\n", + "movie_ratings_df.to_csv('movie_ratings.csv') #create a movies ratings csv file to facilitate EDA on Tableau\n", + "movie_ratings_df.head()" ] }, { "cell_type": "markdown", - "id": "e070c91b", + "id": "e3571e8c", "metadata": {}, "source": [ - "#### Exceeds Objective" + "This table shows the average rating for each movie by movie_id and also the number of votes it received (which could give insight into how many people watched it??). 
There could be a relationship between these parameters and the success of a movie in the box office" ] }, { "cell_type": "markdown", - "id": "20092dcd", + "id": "fc5a7642", "metadata": {}, "source": [ - "Uses `pandas` to prepare data and answer business questions in an idiomatic, performant way" + "#### Movie_Akas Table" ] }, { - "cell_type": "markdown", - "id": "882b158d", + "cell_type": "code", + "execution_count": null, + "id": "c4f7e31e", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "movie_akas_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_idorderingtitleregionlanguagetypesattributesis_original_title
0tt036961010Джурасик святBGbgNoneNone0.0
1tt036961011Jurashikku warudoJPNoneimdbDisplayNone0.0
2tt036961012Jurassic World: O Mundo dos DinossaurosBRNoneimdbDisplayNone0.0
3tt036961013O Mundo dos DinossaurosBRNoneNoneshort title0.0
4tt036961014Jurassic WorldFRNoneimdbDisplayNone0.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " movie_id ordering title region \\\n", + "0 tt0369610 10 Джурасик свят BG \n", + "1 tt0369610 11 Jurashikku warudo JP \n", + "2 tt0369610 12 Jurassic World: O Mundo dos Dinossauros BR \n", + "3 tt0369610 13 O Mundo dos Dinossauros BR \n", + "4 tt0369610 14 Jurassic World FR \n", + "\n", + " language types attributes is_original_title \n", + "0 bg None None 0.0 \n", + "1 None imdbDisplay None 0.0 \n", + "2 None imdbDisplay None 0.0 \n", + "3 None None short title 0.0 \n", + "4 None imdbDisplay None 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "#### Meets Objective (Passing Bar)" + "#Explore movie_akas table\n", + "movie_akas_query = \"\"\"\n", + " SELECT *\n", + " FROM movie_akas\"\"\"\n", + "movie_akas_df = pd.read_sql(movie_akas_query, conn)\n", + "movie_akas_df.to_csv('movie_akas.csv') #create a movies_akas csv file to facilitate EDA on Tableau\n", + "movie_akas_df.head()" ] }, { "cell_type": "markdown", - "id": "c2c426e6", + "id": "4cd4386b", "metadata": {}, "source": [ - "Successfully uses `pandas` to prepare data in order to answer business questions\n", - "\n", - "> This includes projects that _occasionally_ use base Python when `pandas` methods would be more appropriate (such as using `enumerate()` on a DataFrame), or occasionally performs operations that do not appear to have any relevance to the business questions" - ] - }, - { - "cell_type": "markdown", - "id": "88d1667b", - "metadata": {}, - "source": [ - "#### Approaching Objective" + "This table shows other movie features e.g. the region, language, type and attributes. 
There could be a relationship between these features and the success of a movie in the box office" ] }, { "cell_type": "markdown", - "id": "ec132034", + "id": "10b9f01d", "metadata": {}, "source": [ - "Uses `pandas` to prepare data, but makes significant errors\n", + "#### Grossing Data\n", "\n", - "> Examples of significant errors include: the result presented does not actually answer the stated question, the code produces errors, the code _consistently_ uses base Python when `pandas` methods would be more appropriate, or the submitted notebook contains significant quantities of code that is unrelated to the presented analysis (such as copy/pasted code from the curriculum or StackOverflow)" + "Next, we load the grossing data as a reference to how much money the movies made in the box office. This can be used to gauge their success" ] }, { - "cell_type": "markdown", - "id": "c5e3c86b", + "cell_type": "code", + "execution_count": null, + "id": "db169098", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"grossing_df\",\n \"rows\": 3387,\n \"fields\": [\n {\n \"column\": \"primary_title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3386,\n \"samples\": [\n \"Season of the Witch\",\n \"Picture Me\",\n \"Jane Eyre\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"studio\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 257,\n \"samples\": [\n \"SEA\",\n \"PBS\",\n \"Cohen\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"domestic_gross\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 66982498.23736458,\n \"min\": 100.0,\n \"max\": 936700000.0,\n \"num_unique_values\": 1797,\n \"samples\": [\n 153600000.0,\n 336000.0,\n 295000.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"foreign_gross\",\n \"properties\": {\n 
\"dtype\": \"category\",\n \"num_unique_values\": 1204,\n \"samples\": [\n \"40000000\",\n \"5400000\",\n \"32000000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"year\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 2010,\n \"max\": 2018,\n \"num_unique_values\": 9,\n \"samples\": [\n 2017,\n 2011,\n 2015\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "grossing_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
primary_titlestudiodomestic_grossforeign_grossyear
0Toy Story 3BV415000000.06520000002010
1Alice in Wonderland (2010)BV334200000.06913000002010
2Harry Potter and the Deathly Hallows Part 1WB296000000.06643000002010
3InceptionWB292600000.05357000002010
4Shrek Forever AfterP/DW238700000.05139000002010
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " primary_title studio domestic_gross \\\n", + "0 Toy Story 3 BV 415000000.0 \n", + "1 Alice in Wonderland (2010) BV 334200000.0 \n", + "2 Harry Potter and the Deathly Hallows Part 1 WB 296000000.0 \n", + "3 Inception WB 292600000.0 \n", + "4 Shrek Forever After P/DW 238700000.0 \n", + "\n", + " foreign_gross year \n", + "0 652000000 2010 \n", + "1 691300000 2010 \n", + "2 664300000 2010 \n", + "3 535700000 2010 \n", + "4 513900000 2010 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "#### Does Not Meet Objective" + "grossing_df = pd.read_csv('/content/bom.movie_gross.csv')\n", + "grossing_df.rename(columns= {\"title\": \"primary_title\"}, inplace=True)\n", + "grossing_df.to_csv('grossing.csv')\n", + "grossing_df.head()" ] }, { "cell_type": "markdown", - "id": "d9566206", + "id": "57ded2dc", "metadata": {}, "source": [ - "Unable to prepare data using `pandas`\n", - "\n", - "> This includes projects that successfully answer the business questions, but do not use `pandas` (e.g. 
use only base Python, or use some other tool like R, Tableau, or Excel)" + "Next, we load other provided data sources to check if there are relevant attributes that correlate with movie success at the box office" ] }, { "cell_type": "markdown", - "id": "b0923637", + "id": "febbc836", "metadata": {}, "source": [ - "## Getting Started" + "#### rt.movie_info Data" ] }, { - "cell_type": "markdown", - "id": "8e37e815", + "cell_type": "code", + "execution_count": null, + "id": "34f454e7", "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"rtmoviesinfo_df\",\n \"rows\": 1560,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 579,\n \"min\": 1,\n \"max\": 2000,\n \"num_unique_values\": 1560,\n \"samples\": [\n 1961,\n 1329,\n 461\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"synopsis\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1497,\n \"samples\": [\n \"From Ron Shelton, writer/director of Tin Cup and Bull Durham, comes the new comedy, Just Getting Started. Morgan Freeman stars as DUKE DIVER, the freewheeling manager of the luxury Palm Springs resort, the Villa Capri. DIVER may have a mysterious past, but he's a pro at making sure that life for the high-spirited residents is one big, non-stop party. But the status quo is challenged when ex-military charmer LEO (Tommy Lee Jones) checks in, triggering a competition between Duke and Leo for the top spot of Alpha male, as well as for the affections of the newly-arrived SUZIE (Rene Russo). 
When Duke's past suddenly catches up with him, the rivals put aside their differences and the two men reluctantly team up to stop whoever is trying to kill Duke, and also save the Villa Capri.\",\n \"Basketball superstar Michael Jordan and cartoon favorite Bugs Bunny team up with other basketball greats and Looney Tunes characters in this combination animated/live-action feature. Jordan must help the Looney Tunes gang with a basketball game against a group of outer space creatures whose plan, if they win, is to kidnap the Looney Tunes and take them to a failing intergalactic amusement park as the latest attraction. Also starring Academy Award-nominee Bill Murray (\\\"Lost in Translation,\\\" \\\"Charlie's Angels\\\") and Wayne Knight (TV's \\\"Seinfeld,\\\" \\\"Rat Race\\\"). Gene Shalit says this movie is \\\"for adults, kids, teenagers, women, men, boys, girls, old folks, young folks, Jordan fans, Bill Murray fans, Wayne Knight fans and every Looney Tunes lover who wants to revel in Bugs Bunny and his pals at their looney tuniest.\\\"\",\n \"Filmed in panoramic Cinerama, this star-studded, epic Western adventure is a true cinematic classic. Three legendary directors (Henry Hathaway, John Ford and George Marshall) combine their skills to tell the story of three families and their travels from the Erie Canal to California between 1839 and 1889. 
Spencer Tracy narrates the film, which cost an estimated\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"R\",\n \"NR\",\n \"NC17\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genre\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 299,\n \"samples\": [\n \"Drama|Horror|Science Fiction and Fantasy\",\n \"Drama|Musical and Performing Arts|Faith and Spirituality\",\n \"Special Interest\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"director\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1125,\n \"samples\": [\n \"Peter Greenaway\",\n \"Ava DuVernay\",\n \"Henri Xhonneux\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"writer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1069,\n \"samples\": [\n \"Tudor Gates\",\n \"William Aldridge\",\n \"Frank Gruber\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"theater_date\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 1025,\n \"samples\": [\n \"May 1, 1941\",\n \"Apr 26, 2002\",\n \"Jan 1, 1990\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"dvd_date\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 717,\n \"samples\": [\n \"Mar 23, 1999\",\n \"Oct 23, 2001\",\n \"Jul 9, 2013\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"currency\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"$\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"box_office\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 336,\n \"samples\": [\n 
\"55,400,000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"runtime\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 142,\n \"samples\": [\n \"139 minutes\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"studio\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 200,\n \"samples\": [\n \"Lionsgate Films/After Dark Films\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "rtmoviesinfo_df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsynopsisratinggenredirectorwritertheater_datedvd_datecurrencybox_officeruntimestudio
01This gritty, fast-paced, and innovative police...RAction and Adventure|Classics|DramaWilliam FriedkinErnest TidymanOct 9, 1971Sep 25, 2001NaNNaN104 minutesNaN
13New York City, not-too-distant-future: Eric Pa...RDrama|Science Fiction and FantasyDavid CronenbergDavid Cronenberg|Don DeLilloAug 17, 2012Jan 1, 2013$600,000108 minutesEntertainment One
25Illeana Douglas delivers a superb performance ...RDrama|Musical and Performing ArtsAllison AndersAllison AndersSep 13, 1996Apr 18, 2000NaNNaN116 minutesNaN
36Michael Douglas runs afoul of a treacherous su...RDrama|Mystery and SuspenseBarry LevinsonPaul Attanasio|Michael CrichtonDec 9, 1994Aug 27, 1997NaNNaN128 minutesNaN
47NaNNRDrama|RomanceRodney BennettGiles CooperNaNNaNNaNNaN200 minutesNaN
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " id synopsis rating \\\n", + "0 1 This gritty, fast-paced, and innovative police... R \n", + "1 3 New York City, not-too-distant-future: Eric Pa... R \n", + "2 5 Illeana Douglas delivers a superb performance ... R \n", + "3 6 Michael Douglas runs afoul of a treacherous su... R \n", + "4 7 NaN NR \n", + "\n", + " genre director \\\n", + "0 Action and Adventure|Classics|Drama William Friedkin \n", + "1 Drama|Science Fiction and Fantasy David Cronenberg \n", + "2 Drama|Musical and Performing Arts Allison Anders \n", + "3 Drama|Mystery and Suspense Barry Levinson \n", + "4 Drama|Romance Rodney Bennett \n", + "\n", + " writer theater_date dvd_date currency \\\n", + "0 Ernest Tidyman Oct 9, 1971 Sep 25, 2001 NaN \n", + "1 David Cronenberg|Don DeLillo Aug 17, 2012 Jan 1, 2013 $ \n", + "2 Allison Anders Sep 13, 1996 Apr 18, 2000 NaN \n", + "3 Paul Attanasio|Michael Crichton Dec 9, 1994 Aug 27, 1997 NaN \n", + "4 Giles Cooper NaN NaN NaN \n", + "\n", + " box_office runtime studio \n", + "0 NaN 104 minutes NaN \n", + "1 600,000 108 minutes Entertainment One \n", + "2 NaN 116 minutes NaN \n", + "3 NaN 128 minutes NaN \n", + "4 NaN 200 minutes NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "Please start by reviewing the contents of this project description. If you have any questions, please ask your instructor ASAP.\n", - "\n", - "Next, you will need to complete the [***Project Proposal***](#project_proposal) which must be reviewed by your instructor before you can continue with the project.\n", - "\n", - "Then, you will need to create a GitHub repository. There are three options:\n", - "\n", - "1. Look at the [Phase 2 Project Templates and Examples repo](https://github.com/learn-co-curriculum/dsc-project-template) and follow the directions in the MVP branch.\n", - "2. 
Fork the [Phase 2 Project Repository](https://github.com/learn-co-curriculum/dsc-phase-2-project-v3), clone it locally, and work in the `student.ipynb` file. Make sure to also add and commit a PDF of your presentation to your repository with a file name of `presentation.pdf`.\n", - "3. Create a new repository from scratch by going to [github.com/new](https://github.com/new) and copying the data files from one of the above resources into your new repository. This approach will result in the most professional-looking portfolio repository, but can be more complicated to use. So if you are getting stuck with this option, try one of the above options instead." + "rtmoviesinfo_df = pd.read_csv('/content/rt.movie_info.tsv', sep = '\\t')\n", + "rtmoviesinfo_df.to_csv('rtmoviesinfo.csv')\n", + "rtmoviesinfo_df.head()" ] }, { "cell_type": "markdown", - "id": "290d61a5", + "id": "09bcd33b", "metadata": {}, "source": [ - "## Summary" + "#### rt.reviews Data" ] }, { - "cell_type": "markdown", - "id": "ac002279", + "cell_type": "code", + "execution_count": null, + "id": "fbe6daa4", "metadata": {}, - "source": [ - "This project will give you a valuable opportunity to develop your data science skills using real-world data. The end-of-phase projects are a critical part of the program because they give you a chance to bring together all the skills you've learned, apply them to realistic projects for a business stakeholder, practice communication skills, and get feedback to help you improve. You've got this!" - ] + "outputs": [], + "source": [] } ], "metadata": { diff --git a/pictures/movie_data_erd.jpeg b/pictures/movie_data_erd.jpeg new file mode 100644 index 00000000..0fa01bda Binary files /dev/null and b/pictures/movie_data_erd.jpeg differ