diff --git a/.github/ISSUE_TEMPLATE/lesson-card.md b/.github/ISSUE_TEMPLATE/lesson-card.md
deleted file mode 100644
index 71ebeba606..0000000000
--- a/.github/ISSUE_TEMPLATE/lesson-card.md
+++ /dev/null
@@ -1,15 +0,0 @@
----
-name: Lesson Card
-about: Add a Lesson Card
-title: "[LESSON]"
-labels: ''
-assignees: ''
-
----
-
-- [ ] quiz 1
-- [ ] written content
-- [ ] quiz 2
-- [ ] challenge
-- [ ] extra reading
-- [ ] assignment
diff --git a/.github/ISSUE_TEMPLATE/lesson_elements.md b/.github/ISSUE_TEMPLATE/lesson_elements.md
deleted file mode 100644
index eef1892228..0000000000
--- a/.github/ISSUE_TEMPLATE/lesson_elements.md
+++ /dev/null
@@ -1,6 +0,0 @@
-- [ ] quiz 1
-- [ ] written content
-- [ ] quiz 2
-- [ ] challenge
-- [ ] extra reading
-- [ ] assignment
diff --git a/.github/ISSUE_TEMPLATE/translations-checklist.md b/.github/ISSUE_TEMPLATE/translations-checklist.md
new file mode 100644
index 0000000000..4cbf577418
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/translations-checklist.md
@@ -0,0 +1,70 @@
+---
+name: Translations Checklist
+about: These are all the files that need to be translated
+title: "[TRANSLATIONS]"
+labels: translations
+assignees: ''
+
+---
+
+- [ ] Base README.md
+- [ ] Quizzes
+- [ ] Introduction base README
+ - [ ] Intro to ML README
+ - [ ] Intro to ML assignment
+ - [ ] History of ML README
+ - [ ] History of ML assignment
+ - [ ] Fairness README
+ - [ ] Fairness assignment
+ - [ ] Techniques of ML README
+ - [ ] Techniques of ML assignment
+- [ ] Regression base README
+ - [ ] Tools README
+ - [ ] Tools assignment
+ - [ ] Data README
+ - [ ] Data assignment
+ - [ ] Linear README
+ - [ ] Linear assignment
+ - [ ] Logistic README
+ - [ ] Logistic assignment
+- [ ] Web app base README
+ - [ ] Web app README
+ - [ ] Web app assignment
+- [ ] Classification base README
+ - [ ] Intro README
+ - [ ] Intro assignment
+ - [ ] Classifiers 1 README
+ - [ ] Classifiers 1 assignment
+ - [ ] Classifiers 2 README
+ - [ ] Classifiers 2 assignment
+ - [ ] Applied README
+ - [ ] Applied assignment
+- [ ] Clustering base README
+ - [ ] Visualize README
+ - [ ] Visualize assignment
+ - [ ] K-means README
+ - [ ] K-means assignment
+- [ ] NLP base README
+ - [ ] Intro README
+ - [ ] Intro assignment
+ - [ ] Tasks README
+ - [ ] Tasks assignment
+ - [ ] Translation README
+ - [ ] Translation assignment
+ - [ ] Reviews 1 README
+ - [ ] Reviews 1 assignment
+ - [ ] Reviews 2 README
+ - [ ] Reviews 2 assignment
+- [ ] Time Series base README
+ - [ ] Intro README
+ - [ ] Intro assignment
+ - [ ] ARIMA README
+ - [ ] ARIMA assignment
+- [ ] Reinforcement base README
+ - [ ] QLearning README
+ - [ ] QLearning assignment
+ - [ ] gym README
+ - [ ] gym assignment
+- [ ] Real World base README
+ - [ ] Real World README
+ - [ ] Real World assignment
diff --git a/.github/workflows/azure-static-web-apps-jolly-sea-0a877260f.yml b/.github/workflows/azure-static-web-apps-white-water-09ec41f0f.yml
similarity index 78%
rename from .github/workflows/azure-static-web-apps-jolly-sea-0a877260f.yml
rename to .github/workflows/azure-static-web-apps-white-water-09ec41f0f.yml
index 6e77ae1ecc..62bfbc96de 100644
--- a/.github/workflows/azure-static-web-apps-jolly-sea-0a877260f.yml
+++ b/.github/workflows/azure-static-web-apps-white-water-09ec41f0f.yml
@@ -16,15 +16,15 @@ jobs:
submodules: true
- name: Build And Deploy
id: builddeploy
- uses: Azure/static-web-apps-deploy@v0.0.1-preview
+ uses: Azure/static-web-apps-deploy@v1
with:
- azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_JOLLY_SEA_0A877260F }}
+ azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_WHITE_WATER_09EC41F0F }}
repo_token: ${{ secrets.GITHUB_TOKEN }} # Used for Github integrations (i.e. PR comments)
action: "upload"
- ###### Repository/Build Configurations - These values can be configured to match you app requirements. ######
+ ###### Repository/Build Configurations - These values can be configured to match your app requirements. ######
# For more information regarding Static Web App workflow configurations, please visit: https://aka.ms/swaworkflowconfig
app_location: "/quiz-app" # App source code path
- api_location: "api" # Api source code path - optional
+ api_location: "" # Api source code path - optional
output_location: "dist" # Built app content directory - optional
###### End of Repository/Build Configurations ######
@@ -35,7 +35,7 @@ jobs:
steps:
- name: Close Pull Request
id: closepullrequest
- uses: Azure/static-web-apps-deploy@v0.0.1-preview
+ uses: Azure/static-web-apps-deploy@v1
with:
- azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_JOLLY_SEA_0A877260F }}
+ azure_static_web_apps_api_token: ${{ secrets.AZURE_STATIC_WEB_APPS_API_TOKEN_WHITE_WATER_09EC41F0F }}
action: "close"
diff --git a/.gitignore b/.gitignore
index a80a15e326..51f47a5aa3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,8 @@ bld/
# Visual Studio 2015/2017 cache/options directory
.vs/
+# Visual Studio Code cache/options directory
+.vscode/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
diff --git a/1-Introduction/1-intro-to-ML/README.md b/1-Introduction/1-intro-to-ML/README.md
index 8c1e30d080..4e73a48249 100644
--- a/1-Introduction/1-intro-to-ML/README.md
+++ b/1-Introduction/1-intro-to-ML/README.md
@@ -4,7 +4,7 @@
> 🎥 Click the image above for a video discussing the difference between machine learning, AI, and deep learning.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/1/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1/)
### Introduction
@@ -21,7 +21,7 @@ Before starting with this curriculum, you need to have your computer set up and
- **Learn Python**. It's also recommended to have a basic understanding of [Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa), a programming language useful for data scientists that we use in this course.
- **Learn Node.js and JavaScript**. We also use JavaScript a few times in this course when building web apps, so you will need to have [node](https://nodejs.org) and [npm](https://www.npmjs.com/) installed, as well as [Visual Studio Code](https://code.visualstudio.com/) available for both Python and JavaScript development.
- **Create a GitHub account**. Since you found us here on [GitHub](https://github.com), you might already have an account, but if not, create one and then fork this curriculum to use on your own. (Feel free to give us a star, too 😊)
-- **Explore Scikit-learn**. Familiarize yourself with [Scikit-learn]([https://scikit-learn.org/stable/user_guide.html), a set of ML libraries that we reference in these lessons.
+- **Explore Scikit-learn**. Familiarize yourself with [Scikit-learn](https://scikit-learn.org/stable/user_guide.html), a set of ML libraries that we reference in these lessons.
### What is machine learning?
@@ -96,12 +96,14 @@ In the near future, understanding the basics of machine learning is going to be
Sketch, on paper or using an online app like [Excalidraw](https://excalidraw.com/), your understanding of the differences between AI, ML, deep learning, and data science. Add some ideas of problems that each of these techniques are good at solving.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/2/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2/)
## Review & Self Study
To learn more about how you can work with ML algorithms in the cloud, follow this [Learning Path](https://docs.microsoft.com/learn/paths/create-no-code-predictive-models-azure-machine-learning/?WT.mc_id=academic-15963-cxa).
+Take this [module](https://docs.microsoft.com/learn/modules/introduction-to-machine-learning/?WT.mc_id=academic-15963-cxa) about the basics of ML.
+
## Assignment
[Get up and running](assignment.md)
diff --git a/1-Introduction/1-intro-to-ML/translations/README.fr.md b/1-Introduction/1-intro-to-ML/translations/README.fr.md
new file mode 100644
index 0000000000..9079816069
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/README.fr.md
@@ -0,0 +1,109 @@
+# Introduction au machine learning
+
+[![ML, AI, deep learning - Quelle est la différence ?](https://img.youtube.com/vi/lTd9RSxS9ZE/0.jpg)](https://youtu.be/lTd9RSxS9ZE "ML, AI, deep learning - What's the difference?")
+
+> 🎥 Cliquer sur l'image ci-dessus afin de regarder une vidéo expliquant la différence entre machine learning, AI et deep learning.
+
+## [Quiz préalable](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1?loc=fr)
+
+### Introduction
+
+Bienvenue à ce cours sur le machine learning classique pour débutant ! Que vous soyez complètement nouveau sur ce sujet ou que vous soyez un professionnel du ML expérimenté cherchant à peaufiner vos connaissances, nous sommes heureux de vous avoir avec nous ! Nous voulons créer un tremplin chaleureux pour vos études en ML et serions ravis d'évaluer, de répondre et d'apprendre de vos retours d'[expériences](https://github.com/microsoft/ML-For-Beginners/discussions).
+
+[![Introduction au ML](https://img.youtube.com/vi/h0e2HAPTGF4/0.jpg)](https://youtu.be/h0e2HAPTGF4 "Introduction to ML")
+
+> 🎥 Cliquer sur l'image ci-dessus afin de regarder une vidéo: John Guttag du MIT introduit le machine learning
+### Débuter avec le machine learning
+
+Avant de commencer avec ce cours, vous aurez besoin d'un ordinateur configuré et prêt à faire tourner des notebooks (jupyter) localement.
+
+- **Configurer votre ordinateur avec ces vidéos**. Apprendre comment configurer votre ordinateur avec cette [série de vidéos](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6).
+- **Apprendre Python**. Il est aussi recommandé d'avoir une connaissance basique de [Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa), un langage de programmation utile pour les data scientists que nous utilisons tout au long de ce cours.
+- **Apprendre Node.js et JavaScript**. Nous utilisons aussi JavaScript par moments dans ce cours afin de construire des applications web, vous aurez donc besoin de [node](https://nodejs.org) et [npm](https://www.npmjs.com/) installés, ainsi que de [Visual Studio Code](https://code.visualstudio.com/) pour développer en Python et JavaScript.
+- **Créer un compte GitHub**. Comme vous nous avez trouvés sur [GitHub](https://github.com), vous y avez sûrement un compte, mais sinon, créez-en un et répliquez (fork) ce cours afin de l'utiliser à votre gré. (N'oubliez pas de nous donner une étoile aussi 😊)
+- **Explorer Scikit-learn**. Familiarisez-vous avec [Scikit-learn](https://scikit-learn.org/stable/user_guide.html), un ensemble de librairies ML que nous mentionnons dans nos leçons.
+
+### Qu'est-ce que le machine learning ?
+
+Le terme `machine learning` est un des termes les plus populaires et les plus utilisés ces derniers temps. Il y a de fortes chances que vous l'ayez entendu au moins une fois si vous avez une appétence pour la technologie, indépendamment du domaine dans lequel vous travaillez. Le fonctionnement du machine learning, cependant, reste un mystère pour la plupart des personnes. Pour un débutant en machine learning, le sujet peut parfois sembler accablant. Ainsi, il est important de comprendre ce qu'est réellement le machine learning et de l'apprendre petit à petit, au travers d'exemples pratiques.
+
+![ml hype curve](../images/hype.png)
+
+> Google Trends montre la récente 'courbe de popularité' pour le mot 'machine learning'
+
+Nous vivons dans un univers rempli de mystères fascinants. De grands scientifiques comme Stephen Hawking, Albert Einstein et bien d'autres ont dévoué leur vie à la recherche d'informations utiles afin de dévoiler les mystères qui nous entourent. C'est la condition humaine de l'apprentissage : un enfant apprend de nouvelles choses et découvre la structure de son monde année après année, jusqu'à ce qu'il devienne adulte.
+
+Le cerveau d'un enfant et ses sens perçoivent l'environnement qui l'entoure et apprennent graduellement des schémas non observés de la vie, qui l'aident à fabriquer des règles logiques afin d'identifier les schémas appris. Le processus d'apprentissage du cerveau humain fait de l'homme la créature la plus sophistiquée du monde vivant. Apprendre continuellement par la découverte de schémas non observés, puis innover sur ces schémas, nous permet de nous améliorer tout au long de notre vie. Cette capacité d'apprendre et d'évoluer est liée au concept de [plasticité neuronale](https://www.simplypsychology.org/brain-plasticity.html). De manière superficielle, nous pouvons tirer quelques similarités de motivation entre le processus d'apprentissage du cerveau humain et le concept de machine learning.
+
+Le [cerveau humain](https://www.livescience.com/29365-human-brain.html) perçoit des choses du monde réel, assimile les informations perçues, prend des décisions rationnelles et entreprend certaines actions selon le contexte. C'est ce que l'on appelle se comporter intelligemment. Lorsque nous programmons une reproduction de ce processus de comportement intelligent sur une machine, c'est ce que l'on appelle l'intelligence artificielle (IA).
+
+Bien que ces termes puissent être confondus, le machine learning (ML) est un important sous-ensemble de l'intelligence artificielle. **Le ML se réfère à l'utilisation d'algorithmes spécialisés afin de découvrir des informations utiles et de trouver des schémas non observés dans des données perçues pour corroborer un processus de décision rationnel**.
+
+![AI, ML, deep learning, data science](../images/ai-ml-ds.png)
+
+> Un diagramme montrant les relations entre AI, ML, deep learning et data science. Infographie par [Jen Looper](https://twitter.com/jenlooper) et inspiré par [ce graphique](https://softwareengineering.stackexchange.com/questions/366996/distinction-between-ai-ml-neural-networks-deep-learning-and-data-mining)
+
+## Ce que vous allez apprendre dans ce cours
+
+Dans ce cours, nous allons nous concentrer sur les concepts clés du machine learning qu'un débutant se doit de connaître. Nous parlerons de ce que l'on appelle le 'machine learning classique' en utilisant principalement Scikit-learn, une excellente librairie que beaucoup d'étudiants utilisent afin d'apprendre les bases. Afin de comprendre les concepts plus larges de l'intelligence artificielle ou du deep learning, une profonde connaissance en machine learning est indispensable, et c'est ce que nous aimerions fournir ici.
+
+Dans ce cours, vous allez apprendre :
+
+- Les concepts clés du machine learning
+- L'histoire du ML
+- ML et équité (fairness)
+- Les techniques de régression ML
+- Les techniques de classification ML
+- Les techniques de regroupement (clustering) ML
+- Les techniques du traitement automatique des langues (NLP) ML
+- Les techniques de prédictions à partir de séries chronologiques ML
+- L'apprentissage par renforcement
+- Des applications réelles du ML
+
+## Ce que nous ne couvrirons pas
+
+- Deep learning
+- Neural networks
+- IA
+
+Afin d'offrir la meilleure expérience d'apprentissage, nous éviterons les complexités des réseaux neuronaux, du 'deep learning' (construire un modèle utilisant plusieurs couches de réseaux neuronaux) et de l'IA, dont nous parlerons dans un cours différent. Nous proposerons aussi un futur cours sur la data science pour nous concentrer sur cet aspect de ce vaste domaine.
+
+## Pourquoi étudier le machine learning ?
+
+Le machine learning, depuis une perspective systémique, est défini comme la création de systèmes automatiques pouvant apprendre des schémas non observés depuis des données afin d'aider à prendre des décisions intelligentes.
+
+Ce but est faiblement inspiré de la manière dont le cerveau humain apprend certaines choses depuis les données qu'il perçoit du monde extérieur.
+
+✅ Penser une minute aux raisons pour lesquelles une entreprise voudrait essayer d'utiliser des stratégies de machine learning au lieu de créer des règles codées en dur.
+
+### Les applications du machine learning
+
+Les applications du machine learning sont maintenant pratiquement partout, et sont aussi omniprésentes que les données qui circulent dans notre société (générées par nos smartphones, appareils connectés et autres systèmes). En prenant en considération l'immense potentiel des algorithmes dernier cri de machine learning, les chercheurs ont pu exploiter leurs capacités afin de résoudre des problèmes multidimensionnels et interdisciplinaires de la vie réelle, avec d'importants retours positifs.
+
+**Vous pouvez utiliser le machine learning de plusieurs manières** :
+
+- Afin de prédire la possibilité d'avoir une maladie à partir des données médicales d'un patient.
+- Pour tirer parti des données météorologiques afin de prédire les événements météorologiques.
+- Afin de comprendre le sentiment d'un texte.
+- Afin de détecter les fake news pour stopper la propagation de la propagande.
+
+La finance, l'économie, les sciences de la terre, l'exploration spatiale, le génie biomédical, les sciences cognitives et même les domaines des sciences humaines ont adapté le machine learning pour résoudre les problèmes ardus et lourds de traitement des données dans leur domaine respectif.
+
+Le machine learning automatise le processus de découverte de modèles en trouvant des informations significatives à partir de données réelles ou générées. Il s'est avéré très utile dans les applications commerciales, de santé et financières, entre autres.
+
+Dans un avenir proche, comprendre les bases du machine learning sera indispensable pour les personnes de tous les domaines en raison de son adoption généralisée.
+
+---
+## 🚀 Challenge
+
+Esquisser, sur papier ou à l'aide d'une application en ligne comme [Excalidraw](https://excalidraw.com/), votre compréhension des différences entre l'IA, le ML, le deep learning et la data science. Ajouter quelques idées de problèmes que chacune de ces techniques est bonne à résoudre.
+
+## [Quiz de validation des connaissances](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2?loc=fr)
+
+## Révision et auto-apprentissage
+
+Pour en savoir plus sur la façon dont vous pouvez utiliser les algorithmes de ML dans le cloud, suivez ce [Parcours d'apprentissage](https://docs.microsoft.com/learn/paths/create-no-code-predictive-models-azure-machine-learning/?WT.mc_id=academic-15963-cxa).
+
+## Devoir
+
+[Être opérationnel](assignment.fr.md)
diff --git a/1-Introduction/1-intro-to-ML/translations/README.id.md b/1-Introduction/1-intro-to-ML/translations/README.id.md
new file mode 100644
index 0000000000..69a9157b59
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/README.id.md
@@ -0,0 +1,107 @@
+# Pengantar Machine Learning
+
+[![ML, AI, deep learning - Apa perbedaannya?](https://img.youtube.com/vi/lTd9RSxS9ZE/0.jpg)](https://youtu.be/lTd9RSxS9ZE "ML, AI, deep learning - Apa perbedaannya?")
+
+> 🎥 Klik gambar diatas untuk menonton video yang mendiskusikan perbedaan antara Machine Learning, AI, dan Deep Learning.
+
+## [Quiz Pra-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1/)
+
+### Pengantar
+
+Selamat datang di pelajaran Machine Learning klasik untuk pemula! Baik kamu yang masih benar-benar baru, atau seorang praktisi ML berpengalaman yang ingin meningkatkan kemampuan kamu, kami senang kamu ikut bersama kami! Kami ingin membuat sebuah titik mulai yang ramah untuk pembelajaran ML kamu dan akan sangat senang untuk mengevaluasi, merespon, dan memasukkan [umpan balik](https://github.com/microsoft/ML-For-Beginners/discussions) kamu.
+
+[![Pengantar Machine Learning](https://img.youtube.com/vi/h0e2HAPTGF4/0.jpg)](https://youtu.be/h0e2HAPTGF4 "Pengantar Machine Learning")
+
+> 🎥 Klik gambar diatas untuk menonton video: John Guttag dari MIT yang memberikan pengantar Machine Learning.
+### Memulai Machine Learning
+
+Sebelum memulai kurikulum ini, kamu perlu memastikan komputer kamu sudah dipersiapkan untuk menjalankan *notebook* secara lokal.
+
+- **Konfigurasi komputer kamu dengan video ini**. Pelajari bagaimana menyiapkan komputer kamu dalam [video-video](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6) ini.
+- **Belajar Python**. Disarankan juga untuk memiliki pemahaman dasar dari [Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa), sebuah bahasa pemrograman yang digunakan oleh data scientist yang juga akan kita gunakan dalam pelajaran ini.
+- **Belajar Node.js dan JavaScript**. Kita juga menggunakan JavaScript beberapa kali dalam pelajaran ini ketika membangun aplikasi web, jadi kamu perlu menginstal [node](https://nodejs.org) dan [npm](https://www.npmjs.com/), serta [Visual Studio Code](https://code.visualstudio.com/) yang tersedia untuk pengembangan Python dan JavaScript.
+- **Buat akun GitHub**. Karena kamu menemukan kami di [GitHub](https://github.com), kamu mungkin sudah punya akun, tapi jika belum, silakan buat akun baru kemudian *fork* kurikulum ini untuk kamu pergunakan sendiri. (Jangan ragu untuk memberikan kami bintang juga 😊)
+- **Jelajahi Scikit-learn**. Buat diri kamu familiar dengan [Scikit-learn](https://scikit-learn.org/stable/user_guide.html), seperangkat *library* ML yang kita acu dalam pelajaran-pelajaran ini.
+
+### Apa itu Machine Learning?
+
+Istilah 'Machine Learning' merupakan salah satu istilah yang paling populer dan paling sering digunakan saat ini. Ada kemungkinan kamu pernah mendengar istilah ini paling tidak sekali jika kamu familiar dengan teknologi. Tetapi untuk mekanisme Machine Learning sendiri, merupakan sebuah misteri bagi sebagian besar orang. Karena itu, penting untuk memahami sebenarnya apa itu Machine Learning, dan mempelajarinya langkah demi langkah melalui contoh praktis.
+
+![kurva tren ml](../images/hype.png)
+
+> Google Trends memperlihatkan 'kurva tren' dari istilah 'Machine Learning' belakangan ini.
+
+Kita hidup di sebuah alam semesta yang penuh dengan misteri yang menarik. Ilmuwan-ilmuwan besar seperti Stephen Hawking, Albert Einstein, dan banyak lagi telah mengabdikan hidup mereka untuk mencari informasi yang berarti yang mengungkap misteri dari dunia disekitar kita. Ini adalah kondisi belajar manusia: seorang anak manusia belajar hal-hal baru dan mengungkap struktur dari dunianya tahun demi tahun saat mereka tumbuh dewasa.
+
+Otak dan indera seorang anak memahami fakta-fakta di sekitarnya dan secara bertahap mempelajari pola-pola kehidupan yang tersembunyi yang membantu anak untuk menyusun aturan-aturan logis untuk mengidentifikasi pola-pola yang dipelajari. Proses pembelajaran otak manusia ini menjadikan manusia sebagai makhluk hidup paling canggih di dunia ini. Belajar terus menerus dengan menemukan pola-pola tersembunyi dan kemudian berinovasi pada pola-pola itu memungkinkan kita untuk terus menjadikan diri kita lebih baik sepanjang hidup. Kapasitas belajar dan kemampuan berkembang ini terkait dengan konsep yang disebut dengan *[brain plasticity](https://www.simplypsychology.org/brain-plasticity.html)*. Secara sempit, kita dapat menarik beberapa kesamaan motivasi antara proses pembelajaran otak manusia dan konsep Machine Learning.
+
+[Otak manusia](https://www.livescience.com/29365-human-brain.html) menerima banyak hal dari dunia nyata, memproses informasi yang diterima, membuat keputusan rasional, dan melakukan aksi-aksi tertentu berdasarkan keadaan. Inilah yang kita sebut dengan berperilaku cerdas. Ketika kita memprogram sebuah salinan dari proses perilaku cerdas ke sebuah mesin, ini dinamakan kecerdasan buatan atau Artificial Intelligence (AI).
+
+Meskipun istilah-stilahnya bisa membingungkan, Machine Learning (ML) adalah bagian penting dari Artificial Intelligence. **ML berkaitan dengan menggunakan algoritma-algoritma terspesialisasi untuk mengungkap informasi yang berarti dan mencari pola-pola tersembunyi dari data yang diterima untuk mendukung proses pembuatan keputusan rasional**.
+
+![AI, ML, deep learning, data science](../images/ai-ml-ds.png)
+
+> Sebuah diagram yang memperlihatkan hubungan antara AI, ML, Deep Learning, dan Data Science. Infografis oleh [Jen Looper](https://twitter.com/jenlooper) terinspirasi dari [infografis ini](https://softwareengineering.stackexchange.com/questions/366996/distinction-between-ai-ml-neural-networks-deep-learning-and-data-mining)
+
+## Apa yang akan kamu pelajari
+
+Dalam kurikulum ini, kita hanya akan membahas konsep inti dari Machine Learning yang harus diketahui oleh seorang pemula. Kita membahas apa yang kami sebut sebagai 'Machine Learning klasik' utamanya menggunakan Scikit-learn, sebuah *library* luar biasa yang banyak digunakan para siswa untuk belajar dasarnya. Untuk memahami konsep Artificial Intelligence atau Deep Learning yang lebih luas, pengetahuan dasar yang kuat tentang Machine Learning sangat diperlukan, itulah yang ingin kami tawarkan di sini.
+
+Kamu akan belajar:
+
+- Konsep inti ML
+- Sejarah dari ML
+- Keadilan dan ML
+- Teknik regresi ML
+- Teknik klasifikasi ML
+- Teknik *clustering* ML
+- Teknik *natural language processing* ML
+- Teknik *time series forecasting* ML
+- *Reinforcement learning*
+- Penerapan nyata dari ML
+## Yang tidak akan kita bahas
+
+- *deep learning*
+- *neural networks*
+- AI
+
+Untuk membuat pengalaman belajar yang lebih baik, kita akan menghindari kerumitan dari *neural network*, *deep learning* - membangun *many-layered model* menggunakan *neural network* - dan AI, yang mana akan kita bahas dalam kurikulum yang berbeda. Kami juga akan menawarkan kurikulum *data science* yang berfokus pada aspek bidang tersebut.
+## Kenapa belajar Machine Learning?
+
+Machine Learning, dari perspektif sistem, didefinisikan sebagai pembuatan sistem otomatis yang dapat mempelajari pola-pola tersembunyi dari data untuk membantu membuat keputusan cerdas.
+
+Motivasi ini secara bebas terinspirasi dari bagaimana otak manusia mempelajari hal-hal tertentu berdasarkan data yang diterimanya dari dunia luar.
+
+✅ Pikirkan sejenak mengapa sebuah bisnis ingin mencoba menggunakan strategi Machine Learning dibandingkan membuat sebuah mesin berbasis aturan yang tertanam (*hard-coded*).
+
+### Penerapan Machine Learning
+
+Penerapan Machine Learning saat ini hampir ada di mana-mana, seperti data yang mengalir di sekitar kita, yang dihasilkan oleh ponsel pintar, perangkat yang terhubung, dan sistem lainnya. Mempertimbangkan potensi besar dari algoritma Machine Learning terkini, para peneliti telah mengeksplorasi kemampuan Machine Learning untuk memecahkan masalah kehidupan nyata multi-dimensi dan multi-disiplin dengan hasil positif yang luar biasa.
+
+**Kamu bisa menggunakan Machine Learning dalam banyak hal**:
+
+- Untuk memprediksi kemungkinan penyakit berdasarkan riwayat atau laporan medis pasien.
+- Untuk memanfaatkan data cuaca untuk memprediksi peristiwa cuaca.
+- Untuk memahami sentimen sebuah teks.
+- Untuk mendeteksi berita palsu untuk menghentikan penyebaran propaganda.
+
+Keuangan, ekonomi, geosains, eksplorasi ruang angkasa, teknik biomedis, ilmu kognitif, dan bahkan bidang humaniora telah mengadaptasi Machine Learning untuk memecahkan masalah sulit pemrosesan data di bidang mereka.
+
+Machine Learning mengotomatiskan proses penemuan pola dengan menemukan wawasan yang berarti dari dunia nyata atau dari data yang dihasilkan. Machine Learning terbukti sangat berharga dalam penerapannya di berbagai bidang, diantaranya adalah bidang bisnis, kesehatan, dan keuangan.
+
+Dalam waktu dekat, memahami dasar-dasar Machine Learning akan menjadi suatu keharusan bagi orang-orang dari bidang apa pun karena adopsinya yang luas.
+
+---
+## 🚀 Tantangan
+
+Buat sketsa di atas kertas atau menggunakan aplikasi seperti [Excalidraw](https://excalidraw.com/), mengenai pemahaman kamu tentang perbedaan antara AI, ML, Deep Learning, dan Data Science. Tambahkan beberapa ide masalah yang cocok diselesaikan masing-masing teknik.
+
+## [Quiz Pasca-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2/)
+
+## Ulasan & Belajar Mandiri
+
+Untuk mempelajari lebih lanjut tentang bagaimana kamu dapat menggunakan algoritma ML di cloud, ikuti [Jalur Belajar](https://docs.microsoft.com/learn/paths/create-no-code-predictive-models-azure-machine-learning/?WT.mc_id=academic-15963-cxa) ini.
+
+## Tugas
+
+[Persiapan](assignment.id.md)
diff --git a/1-Introduction/1-intro-to-ML/translations/README.it.md b/1-Introduction/1-intro-to-ML/translations/README.it.md
new file mode 100644
index 0000000000..715a9c90f5
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/README.it.md
@@ -0,0 +1,108 @@
+# Introduzione a machine learning
+
+[![ML, AI, deep learning: qual è la differenza?](https://img.youtube.com/vi/lTd9RSxS9ZE/0.jpg)](https://youtu.be/lTd9RSxS9ZE "ML, AI, deep learning: qual è la differenza?")
+
+> 🎥 Fare clic sull'immagine sopra per un video che illustra la differenza tra machine learning, intelligenza artificiale (AI) e deep learning.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1/)
+
+### Introduzione
+
+Benvenuti in questo corso su machine learning classico per principianti! Che si sia completamente nuovo su questo argomento, o un professionista esperto di ML che cerca di rispolverare un'area, è un piacere avervi con noi! Si vuole creare un punto di partenza amichevole per lo studio di ML e saremo lieti di valutare, rispondere e incorporare il vostro [feedback](https://github.com/microsoft/ML-For-Beginners/discussions).
+
+[![Introduzione a ML](https://img.youtube.com/vi/h0e2HAPTGF4/0.jpg)](https://youtu.be/h0e2HAPTGF4 " Introduzione a ML")
+
+> 🎥 Fare clic sull'immagine sopra per un video: John Guttag del MIT introduce machine learning
+
+### Iniziare con machine learning
+
+Prima di iniziare con questo programma di studi, è necessario che il computer sia configurato e pronto per eseguire i notebook in locale.
+
+- **Si configuri la propria macchina con l'aiuto di questi video**. Si scopra di più su come configurare la propria macchina in questa [serie di video](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6).
+- **Imparare Python**. Si consiglia inoltre di avere una conoscenza di base di [Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa), un linguaggio di programmazione utile per i data scientist che si utilizzerà in questo corso.
+- **Imparare Node.js e JavaScript**. Talvolta in questo corso si usa anche JavaScript durante la creazione di app web, quindi sarà necessario disporre di [node](https://nodejs.org) e [npm](https://www.npmjs.com/) installati, oltre a [Visual Studio Code](https://code.visualstudio.com/) disponibile sia per lo sviluppo Python che JavaScript.
+- **Creare un account GitHub**. È probabile che si disponga già di un account [GitHub](https://github.com), ma in caso contrario occorre crearne uno e poi eseguire il fork di questo programma di studi per utilizzarlo autonomamente. (Sentitevi liberi di darci anche una stella 😊)
+- **Esplorare Scikit-learn**. Familiarizzare con [Scikit-learn](https://scikit-learn.org/stable/user_guide.html), un insieme di librerie ML a cui si farà riferimento in queste lezioni.
+
+### Che cos'è machine learning?
+
+Il termine "machine learning" è uno dei termini più popolari e usati di oggi. C'è una buona possibilità che si abbia sentito questo termine almeno una volta se si ha una sorta di familiarità con la tecnologia, indipendentemente dal campo in cui si lavora. I meccanismi di machine learning, tuttavia, sono un mistero per la maggior parte delle persone. Per un principiante di machine learning l'argomento a volte può sembrare soffocante. Pertanto, è importante capire cos'è effettivamente machine learning e impararlo passo dopo passo, attraverso esempi pratici.
+
+![ml curva di hype](../images/hype.png)
+
+> Google Trends mostra la recente "curva di hype" del termine "machine learning"
+
+Si vive in un universo pieno di misteri affascinanti. Grandi scienziati come Stephen Hawking, Albert Einstein e molti altri hanno dedicato la loro vita alla ricerca di informazioni significative che svelino i misteri del mondo circostante. Questa è la condizione umana dell'apprendimento: un bambino impara cose nuove e scopre la struttura del suo mondo anno dopo anno mentre cresce fino all'età adulta.
+
+Il cervello e i sensi di un bambino percepiscono i fatti dell'ambiente circostante e apprendono gradualmente i modelli di vita nascosti che aiutano il bambino a creare regole logiche per identificare i modelli appresi. Il processo di apprendimento del cervello umano rende l'essere umano la creatura vivente più sofisticata di questo mondo. Imparare continuamente scoprendo schemi nascosti e poi innovare su questi schemi ci consente di migliorarsi sempre di più per tutta la vita. Questa capacità di apprendimento e capacità di evoluzione è correlata a un concetto chiamato [plasticità cerebrale](https://www.simplypsychology.org/brain-plasticity.html). Superficialmente, si possono tracciare alcune somiglianze motivazionali tra il processo di apprendimento del cervello umano e i concetti di machine learning.
+
+Il [cervello umano](https://www.livescience.com/29365-human-brain.html) percepisce le cose dal mondo reale, elabora le informazioni percepite, prende decisioni razionali ed esegue determinate azioni in base alle circostanze. Questo è ciò che viene chiamato comportarsi in modo intelligente. Quando si programma un facsimile del processo comportamentale intelligente su una macchina, si parla di intelligenza artificiale (AI).
+
+Sebbene i termini possano essere confusi, machine learning (ML) è un importante sottoinsieme dell'intelligenza artificiale. **Machine learning si occupa di utilizzare algoritmi specializzati per scoprire informazioni significative e trovare modelli nascosti dai dati percepiti per corroborare il processo decisionale razionale**.
+
+![AI, machine learning, deep learning, data science](../images/ai-ml-ds.png)
+
+> Un diagramma che mostra le relazioni tra intelligenza artificiale (AI), machine learning, deep learning e data science. Infografica di [Jen Looper](https://twitter.com/jenlooper) ispirata a [questa grafica](https://softwareengineering.stackexchange.com/questions/366996/distinction-between-ai-ml-neural-networks-deep-learning-and-data-mining)
+
+## Ecco cosa si imparerà in questo corso
+
+In questo programma di studi, saranno trattati solo i concetti fondamentali di machine learning che un principiante deve conoscere. Si tratterà di ciò che viene chiamato "machine learning classico" principalmente utilizzando Scikit-learn, un'eccellente libreria che molti studenti usano per apprendere le basi. Per comprendere concetti più ampi di intelligenza artificiale o deep learning, è indispensabile una forte conoscenza fondamentale di machine learning, e quindi la si vorrebbe offrire qui.
+
+In questo corso si imparerà:
+
+- concetti fondamentali di machine learning
+- la storia di ML
+- ML e correttezza
+- tecniche di regressione ML
+- tecniche di classificazione ML
+- tecniche di clustering ML
+- tecniche di elaborazione del linguaggio naturale ML
+- tecniche ML di previsione delle serie temporali
+- reinforcement learning
+- applicazioni del mondo reale per ML
+## Cosa non verrà trattato
+
+- deep learning
+- reti neurali
+- AI (intelligenza artificiale)
+
+Per rendere l'esperienza di apprendimento migliore, si eviteranno le complessità delle reti neurali, del "deep learning" (costruzione di modelli a più livelli utilizzando le reti neurali) e dell'AI, di cui si tratterà in un altro programma di studi. Si offrirà anche un prossimo programma di studi di data science per concentrarsi su quell'aspetto di questo campo più ampio.
+## Perché studiare machine learning?
+
+Machine learning, dal punto di vista dei sistemi, è definito come la creazione di sistemi automatizzati in grado di apprendere modelli nascosti dai dati per aiutare a prendere decisioni intelligenti.
+
+Questa motivazione è vagamente ispirata dal modo in cui il cervello umano apprende determinate cose in base ai dati che percepisce dal mondo esterno.
+
+✅ Si pensi per un minuto al motivo per cui un'azienda dovrebbe provare a utilizzare strategie di machine learning rispetto alla creazione di un motore cablato a codice basato su regole codificate.
+
+### Applicazioni di machine learning
+
+Le applicazioni di machine learning sono ormai quasi ovunque e sono onnipresenti come i dati che circolano nelle società, generati dagli smartphone, dispositivi connessi e altri sistemi. Considerando l'immenso potenziale degli algoritmi di machine learning all'avanguardia, i ricercatori hanno esplorato la loro capacità di risolvere problemi multidimensionali e multidisciplinari della vita reale con grandi risultati positivi.
+
+**Si può utilizzare machine learning in molti modi**:
+
+- Per prevedere la probabilità di malattia dall'anamnesi o dai rapporti di un paziente.
+- Per sfruttare i dati meteorologici per prevedere gli eventi meteorologici.
+- Per comprendere il sentimento di un testo.
+- Per rilevare notizie false per fermare la diffusione della propaganda.
+
+La finanza, l'economia, le scienze della terra, l'esplorazione spaziale, l'ingegneria biomedica, le scienze cognitive e persino i campi delle scienze umanistiche hanno adattato machine learning per risolvere gli ardui problemi di elaborazione dati del proprio campo.
+
+Machine learning automatizza il processo di individuazione dei modelli trovando approfondimenti significativi dal mondo reale o dai dati generati. Si è dimostrato di grande valore in applicazioni aziendali, sanitarie e finanziarie, tra le altre.
+
+Nel prossimo futuro, comprendere le basi di machine learning sarà un must per le persone in qualsiasi campo a causa della sua adozione diffusa.
+
+---
+## 🚀 Sfida
+
+Disegnare, su carta o utilizzando un'app online come [Excalidraw](https://excalidraw.com/), la propria comprensione delle differenze tra AI, ML, deep learning e data science. Aggiungere alcune idee sui problemi che ciascuna di queste tecniche è in grado di risolvere.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2/)
+
+## Revisione e Auto Apprendimento
+
+Per saperne di più su come si può lavorare con gli algoritmi ML nel cloud, si segua questo [percorso di apprendimento](https://docs.microsoft.com/learn/paths/create-no-code-predictive-models-azure-machine-learning/?WT.mc_id=academic-15963-cxa).
+
+## Compito
+
+[Tempi di apprendimento brevi](assignment.it.md)
diff --git a/1-Introduction/1-intro-to-ML/translations/README.ja.md b/1-Introduction/1-intro-to-ML/translations/README.ja.md
new file mode 100644
index 0000000000..b88738d03e
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/README.ja.md
@@ -0,0 +1,105 @@
+# 機械学習への導入
+
+[![ML, AI, deep learning - 違いは何か?](https://img.youtube.com/vi/lTd9RSxS9ZE/0.jpg)](https://youtu.be/lTd9RSxS9ZE "ML, AI, deep learning - 違いは何か?")
+
+> 🎥 上の画像をクリックすると、機械学習、AI、深層学習の違いについて説明した動画が表示されます。
+
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1?loc=ja)
+
+### イントロダクション
+
+初心者のための古典的な機械学習のコースへようこそ! このテーマに全く触れたことのない方も、この分野をブラッシュアップしたい経験豊富な方も、ぜひご参加ください。私たちは、あなたのMLの学習についての親しみやすいスタート地点を作りたいと考えています。あなたの[フィードバック](https://github.com/microsoft/ML-For-Beginners/discussions)を評価し、対応し、取り入れることができれば幸いです。
+[![機械学習への導入](https://img.youtube.com/vi/h0e2HAPTGF4/0.jpg)](https://youtu.be/h0e2HAPTGF4 "機械学習への導入")
+
+> 🎥 上の画像をクリックすると、MITのJohn Guttagが機械学習を紹介する動画が表示されます。
+### 機械学習を始めるにあたって
+
+このカリキュラムを始める前に、コンピュータを設定し、ノートブックをローカルで実行できるようにする必要があります。
+
+- **こちらのビデオでマシンの設定を行ってください。** マシンの設定方法については、[これらのビデオ](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6)をご覧ください。
+- **Pythonを学習する。** 本講座で使用する、データサイエンティストに有用なプログラミング言語である[Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa)の基本的な理解があることが望ましいです。
+- **Node.jsとJavaScriptを学習する。** このコースではウェブアプリを構築する際にJavaScriptも何度か使用しますので、[node](https://nodejs.org)と[npm](https://www.npmjs.com/)がインストールされていること、PythonとJavaScriptの両方の開発に必要な[Visual Studio Code](https://code.visualstudio.com/)が利用可能であることが必要です。
+- **GitHubのアカウントを作成する。** [GitHub](https://github.com)で私たちを見つけたのですから、すでにアカウントをお持ちかもしれませんが、もしお持ちでなければ、アカウントを作成して、このカリキュラムをフォークしてご自分でお使いください。(スターをつけることもお忘れなく😊)
+- **Scikit-learnを探索する。** このレッスンで参照するMLライブラリのセットである[Scikit-learn](https://scikit-learn.org/stable/user_guide.html)に慣れ親しんでください。
+
+### 機械学習とは何か?
+
+"機械学習(Machine Learning)"という言葉は、現在最も人気があり、頻繁に使用されている言葉の一つです。どんな分野の技術者であっても、多少なりとも技術に精通していれば、一度はこの言葉を耳にしたことがある可能性は少なくありません。しかし、機械学習の仕組みは、ほとんどの人にとって謎に包まれており、機械学習の初心者にとって、このテーマは時に圧倒されるように感じられます。そのため、機械学習とは何かを実際に理解し、実践的な例を通して段階的に学んでいくことが重要です。
+
+![機械学習の人気を示すグラフ](../images/hype.png)
+
+> Google Trendsによる、「機械学習」という言葉の最近の盛り上がりを示すグラフ。
+
+私たちは、魅力的な謎に満ちた宇宙に住んでいます。ホーキング博士やアインシュタイン博士をはじめとする偉大な科学者たちは、私たちを取り巻く世界の謎を解き明かす意味のある情報を探すことに人生を捧げてきました。人間の子供は、大人になるまでの間に、年々新しいことを学び、自分の世界の構造を明らかにしていきます。
+
+子供の脳と感覚は、周囲の事実を認識し、徐々に人生の隠れたパターンを学び、学習したパターンを識別するための論理的なルールを作るのに役立ちます。こういった学習プロセスは、人間をこの世で最も洗練された生物にしています。隠れたパターンを発見することで継続的に学習し、そのパターンに基づいて革新を行うことで、私たちは生涯を通じて自分自身をより良くしていくことができます。この学習能力と進化能力は、[「脳の可塑性」](https://www.simplypsychology.org/brain-plasticity.html)と呼ばれる概念に関連しています。表面的には、人間の脳の学習プロセスと機械学習のコンセプトには、モチベーションの面でいくつかの共通点があります。
+
+[人間の脳](https://www.livescience.com/29365-human-brain.html)は、現実世界の物事を知覚し、知覚した情報を処理し、合理的な判断を下し、状況に応じてある行動をします。これは知的行動と呼ばれます。この知的行動のプロセスを機械にプログラムすることを人工知能(AI)といいます。
+
+この言葉は混同されることがありますが、機械学習(ML)は人工知能の重要なサブセットです。**MLは、特殊なアルゴリズムを使用して、意味のある情報を発見し、知覚されたデータから隠れたパターンを見つけて、合理的な意思決定プロセスを裏付けることに関係しています。**
+
+![AI, ML, ディープラーニング、データサイエンス](../images/ai-ml-ds.png)
+
+
+>[このグラフ](https://softwareengineering.stackexchange.com/questions/366996/distinction-between-ai-ml-neural-networks-deep-learning-and-data-mining)に触発された[Jen Looper](https://twitter.com/jenlooper)氏によるインフォグラフィック
+
+## このコースで学ぶこと
+
+このカリキュラムでは、初心者が知っておかなければならない機械学習のコアな概念のみを取り上げます。私たちが「古典的な機械学習」と呼ぶものを、多くの学生が基礎を学ぶために使用する優れたライブラリであるScikit-learnを主に使ってカバーします。人工知能や深層学習などのより広い概念を理解するためには、機械学習の強力な基礎知識が不可欠ですので、ここで提供します。
+
+- 機械学習の核となるコンセプト
+- MLの歴史
+- MLと公平性
+- MLによる回帰の手法
+- MLによる分類技術
+- MLによるクラスタリング
+- MLによる自然言語処理の技術
+- MLによる時系列予測の技術
+- 強化学習
+- MLの現実世界への応用
+## このコースで扱わないこと
+
+- ディープラーニング
+- ニューラルネットワーク
+- AI
+
+ニューラルネットワークやディープラーニング(ニューラルネットワークを用いた多層的なモデル構築)、AIなどの複雑な分野は、より良い学習環境を提供するために避けていますが、これらは別のカリキュラムで取り上げます。また、それらの大きな分野の中でも特にデータサイエンスに焦点を当てたカリキュラムを提供する予定です。
+## なぜ機械学習を学ぶのか
+
+機械学習とは、システムの観点から、データから隠れたパターンを学習し、知的な意思決定を支援する自動化されたシステムを構築することと定義されます。
+
+この動機は、人間の脳が外界から認識したデータに基づいて特定の事柄を学習する仕組みに、ゆるやかにインスパイアされています。
+
+✅ なぜビジネスでは、ハードコードされたルールベースのエンジンを作るのではなく、機械学習戦略を使ってみようと思うのか、ちょっと考えてみてください。
+
+
+### 機械学習の応用
+
+機械学習のアプリケーションは、今やほとんどどこにでもあり、スマートフォンやコネクテッドデバイス、その他のシステムから生成され、私たちの社会に流れているデータと同様にありふれたものとなっています。最先端の機械学習アルゴリズムの計り知れない可能性を考慮して、研究者たちは、多次元的・多分野的な現実の問題を解決するためにその能力を探求し、非常に良い結果を得ています。
+
+**機械学習は様々な形で利用できます**:
+
+- 患者の病歴や報告書から病気の可能性を予測する。
+- 気象データを活用して気象現象を予測する。
+- 文章の感情を理解する。
+- プロパガンダの拡散を防ぐためにフェイクニュースを検出する。
+
+金融、経済、地球科学、宇宙開発、生物医学工学、認知科学、さらには文科系の分野でも、それぞれの分野のデータ処理に伴う困難な問題を解決するために、機械学習が採用されています。
+
+機械学習は、実世界のデータや生成されたデータから意味のある洞察を見出し、パターンを発見するプロセスを自動化します。機械学習は、ビジネス、健康、金融などの分野で非常に有用であることが証明されています。
+
+近い将来、機械学習の基礎を理解することは、機械学習の普及に伴い、あらゆる分野の人々にとって必須のものとなるでしょう。
+
+---
+## 🚀 Challenge
+AI、ML、深層学習、データサイエンスの違いについて理解していることを、紙や[Excalidraw](https://excalidraw.com/)などのオンラインアプリを使ってスケッチしてください。また、それぞれの技術が得意とする問題のアイデアを加えてみてください。
+
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2?loc=ja)
+
+## 振り返りと自習
+
+クラウド上でMLアルゴリズムをどのように扱うことができるかについては、この[ラーニングパス](https://docs.microsoft.com/learn/paths/create-no-code-predictive-models-azure-machine-learning/?WT.mc_id=academic-15963-cxa)に従ってください。
+
+## 課題
+
+[稼働させる](assignment.ja.md)
diff --git a/1-Introduction/1-intro-to-ML/translations/README.tr.md b/1-Introduction/1-intro-to-ML/translations/README.tr.md
new file mode 100644
index 0000000000..669e649de9
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/README.tr.md
@@ -0,0 +1,114 @@
+# Makine Öğrenimine Giriş
+
+[![ML, AI, Derin öğrenme - Farkları nelerdir?](https://img.youtube.com/vi/lTd9RSxS9ZE/0.jpg)](https://youtu.be/lTd9RSxS9ZE "ML, AI, Derin öğrenme - Farkları nelerdir?")
+
+> 🎥 Makine öğrenimi, yapay zeka ve derin öğrenme arasındaki farkı tartışan bir video için yukarıdaki resme tıklayın.
+
+## [Ders öncesi sınav](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1?loc=tr)
+
+### Giriş
+
+Yeni başlayanlar için klasik makine öğrenimi üzerine olan bu kursa hoş geldiniz! İster bu konuda tamamen yeni olun, ister belli bir alandaki bilgilerini tazelemek isteyen deneyimli bir makine öğrenimi uygulayıcısı olun, aramıza katılmanızdan mutluluk duyarız! Makine öğrenimi çalışmanız için samimi bir başlangıç noktası oluşturmak istiyoruz ve [geri bildiriminizi](https://github.com/microsoft/ML-For-Beginners/discussions) değerlendirmekten, yanıtlamaktan ve hayata geçirmekten memnuniyet duyarız.
+
+[![Makine Öğrenimine Giriş](https://img.youtube.com/vi/h0e2HAPTGF4/0.jpg)](https://youtu.be/h0e2HAPTGF4 "Makine Öğrenimine Giriş")
+
+> 🎥 Video için yukarıdaki resme tıklayın: MIT'den John Guttag, makine öğrenimini tanıtıyor
+### Makine Öğrenimine Başlamak
+
+Bu müfredata başlamadan önce, bilgisayarınızın yerel olarak (Jupyter) not defterlerini çalıştırmak için hazır olması gerekir.
+
+- **Makinenizi bu videolar rehberliğinde yapılandırın**. Bu [video setinde](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6) makinenizi nasıl kuracağınız hakkında daha fazla bilgi edinin.
+- **Python öğrenin**. Ayrıca, veri bilimciler için faydalı bir programlama dili olan ve bu derslerde kullandığımız [Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa) programlama dili hakkında temel bilgilere sahip olmanız da önerilir.
+- **Node.js ve JavaScript'i öğrenin**. Web uygulamaları oluştururken de bu kursta JavaScript'i birkaç kez kullanıyoruz, bu nedenle [node](https://nodejs.org), [npm](https://www.npmjs.com/) ve ayrıca hem Python hem de JavaScript geliştirme için kullanılabilen [Visual Studio Code](https://code.visualstudio.com/) yüklü olmalıdır.
+- **GitHub hesabı oluşturun**. Bizi burada [GitHub](https://github.com) üzerinde bulduğunuza göre, zaten bir hesabınız olabilir, ancak mevcut değilse, bir tane hesap oluşturun ve ardından bu müfredatı kendi başınıza kullanmak için çatallayın (fork). (Bize de yıldız vermekten çekinmeyin 😊)
+- **Scikit-learn'ü keşfedin**. Bu derslerde referans verdiğimiz, bir dizi ML kütüphanesinden oluşan [Scikit-learn](https://scikit-learn.org/stable/user_guide.html) hakkında bilgi edinin.
+
+### Makine öğrenimi nedir?
+
+'Makine öğrenimi' terimi, günümüzün en popüler ve sık kullanılan terimlerinden biridir. Hangi alanda çalışırsanız çalışın, teknolojiyle ilgili bir tür aşinalığınız varsa, bu terimi en az bir kez duymuş olma ihtimaliniz yüksektir. Bununla birlikte, makine öğreniminin mekanikleri, yani çalışma prensipleri, çoğu insan için bir gizemdir. Makine öğrenimine yeni başlayan biri için konu bazen bunaltıcı gelebilir. Bu nedenle, makine öğreniminin gerçekte ne olduğunu anlamak ve pratik örnekler üzerinden adım adım öğrenmek önemlidir.
+
+![ML heyecan eğrisi](../images/hype.png)
+
+> Google Trendler, 'makine öğrenimi' teriminin son 'heyecan eğrisini' gösteriyor
+
+Büyüleyici gizemlerle dolu bir evrende yaşıyoruz. Stephen Hawking, Albert Einstein ve daha pek çoğu gibi büyük bilim adamları, hayatlarını çevremizdeki dünyanın gizemlerini ortaya çıkaran anlamlı bilgiler aramaya adadılar. Öğrenmenin insani yönü de budur: insan evladı yeni şeyler öğrenir ve yetişkinliğe doğru büyüdükçe her yıl kendi dünyasının yapısını ortaya çıkarır.
+
+Bir çocuğun beyni ve duyuları, çevrelerindeki gerçekleri algılar ve çocuğun, öğrenilen kalıpları tanımlamak için mantıksal kurallar oluşturmasına yardımcı olan gizli yaşam kalıplarını yavaş yavaş öğrenir. İnsan beyninin öğrenme süreci, insanı bu dünyanın en gelişmiş canlısı yapar. Gizli kalıpları keşfederek sürekli öğrenmek ve sonra bu kalıplar üzerinde yenilik yapmak, yaşamımız boyunca kendimizi giderek daha iyi hale getirmemizi sağlar. Bu öğrenme kapasitesi ve gelişen kabiliyet, [beyin plastisitesi](https://www.simplypsychology.org/brain-plasticity.html) adı verilen bir kavramla ilgilidir. Yüzeysel olarak, insan beyninin öğrenme süreci ile makine öğrenimi kavramları arasında bazı motivasyonel benzerlikler çizebiliriz.
+
+[İnsan beyni](https://www.livescience.com/29365-human-brain.html) gerçek dünyadaki şeyleri algılar, algılanan bilgileri işler, mantıksal kararlar verir ve koşullara göre belirli eylemler gerçekleştirir. Akıllıca davranmak dediğimiz şey buydu işte. Bir makineye akıllı davranış sürecinin bir kopyasını programladığımızda buna yapay zeka (İngilizce haliyle artificial intelligence, kısaca **AI**) denir.
+
+Terimler karıştırılabilse de, makine öğrenimi (İngilizce haliyle machine learning, kısaca **ML**), yapay zekanın önemli bir alt kümesidir. **ML, mantıklı karar verme sürecini desteklemek için anlamlı bilgileri ortaya çıkarmak ve algılanan verilerden gizli kalıpları bulmak için özel algoritmalar kullanmakla ilgilenir**.
+
+![AI, ML, derin öğrenme, veri bilimi](../images/ai-ml-ds.png)
+
+> Yapay zeka, makine öğrenimi, derin öğrenme ve veri bilimi arasındaki ilişkileri gösteren bir diyagram. Bu infografik, [şu grafikten](https://softwareengineering.stackexchange.com/questions/366996/distinction-between-ai-ml-neural-networks-deep-learning-and-data-mining) ilham alan [Jen Looper](https://twitter.com/jenlooper) tarafından hazırlanmıştır.
+
+> AI (Artificial Intelligence): Yapay zekâ
+> ML(Machine Learning): Makine öğrenimi
+> Deep Learning: Derin Öğrenme
+> Data Science: Veri bilimi
+
+## Bu kursta neler öğreneceksiniz
+
+Bu müfredatta, yalnızca yeni başlayanların bilmesi gereken makine öğreniminin temel kavramlarını ele alacağız. 'Klasik makine öğrenimi' dediğimiz şeyi, öncelikle birçok öğrencinin temel bilgileri öğrenmek için kullandığı mükemmel bir kütüphane olan Scikit-learn'ü kullanarak ele alıyoruz. Daha geniş yapay zeka veya derin öğrenme kavramlarını anlamak için, güçlü bir temel makine öğrenimi bilgisi vazgeçilmezdir ve bu yüzden onu burada sunmak istiyoruz.
+
+Bu kursta şunları öğreneceksiniz:
+
+- makine öğreniminin temel kavramları
+- ML'nin tarihi
+- ML ve adillik
+- regresyon ML teknikleri
+- sınıflandırma ML teknikleri
+- kümeleme ML teknikleri
+- doğal dil işleme ML teknikleri
+- zaman serisi tahmini ML teknikleri
+- pekiştirmeli öğrenme
+- ML için gerçek-dünya uygulamaları
+
+## Neyi kapsamayacağız
+
+- derin öğrenme
+- sinir ağları
+- yapay zeka
+
+Daha iyi bir öğrenme deneyimi sağlamak için, farklı bir müfredatta tartışacağımız sinir ağları, 'derin öğrenme' (sinir ağlarını kullanarak çok katmanlı modeller oluşturma) ve yapay zekânın karmaşıklıklarından kaçınacağız. Ayrıca, bu daha geniş alanın bu yönüne odaklanmak için yakında çıkacak bir veri bilimi müfredatı sunacağız.
+
+## Neden makine öğrenimi üzerinde çalışmalısınız?
+
+Sistemler perspektifinden makine öğrenimi, akıllı kararlar almaya yardımcı olmak için verilerden gizli kalıpları öğrenebilen otomatik sistemlerin oluşturulması olarak tanımlanır.
+
+Bu motivasyon, insan beyninin dış dünyadan algıladığı verilere dayanarak belirli şeyleri nasıl öğrendiğinden bir miktar esinlenmiştir.
+
+✅ Bir işletmenin, sabit kurallara dayalı bir karar aracı oluşturmak yerine neden makine öğrenimi stratejilerini kullanmayı denemek isteyebileceklerini bir an için düşünün.
+
+### Makine öğrenimi uygulamaları
+
+Makine öğrenimi uygulamaları artık neredeyse her yerde ve akıllı telefonlarımız, internete bağlı cihazlarımız ve diğer sistemlerimiz tarafından üretilen, toplumlarımızda akan veriler kadar yaygın hale gelmiş durumda. Son teknoloji makine öğrenimi algoritmalarının muazzam potansiyelini göz önünde bulunduran araştırmacılar, bu algoritmaların çok boyutlu ve çok disiplinli gerçek hayat problemlerini çözme yeteneklerini araştırıyorlar ve oldukça olumlu sonuçlar alıyorlar.
+
+**Makine öğrenimini birçok şekilde kullanabilirsiniz**:
+
+- Bir hastanın tıbbi geçmişinden veya raporlarından hastalık olasılığını tahmin etmek
+- Hava olaylarını tahmin etmek için hava durumu verilerini kullanmak
+- Bir metnin duygu durumunu anlamak
+- Propagandanın yayılmasını durdurmak için sahte haberleri tespit etmek
+
+Finans, ekonomi, yer bilimi, uzay araştırmaları, biyomedikal mühendislik, bilişsel bilim ve hatta beşeri bilimlerdeki alanlar, kendi alanlarının zorlu ve ağır veri işleme sorunlarını çözmek için makine öğrenimini tekniklerini kullanmaya başladılar.
+
+Makine öğrenimi, gerçek dünyadan veya oluşturulan verilerden anlamlı içgörüler bularak örüntü bulma sürecini otomatikleştirir. Diğerlerinin yanı sıra iş, sağlık ve finansal uygulamalarda son derece değerli olduğunu kanıtlamıştır.
+
+Yakın gelecekte, yaygın olarak benimsenmesi nedeniyle makine öğreniminin temellerini anlamak, tüm alanlardan insanlar için bir zorunluluk olacak.
+
+---
+## 🚀 Meydan Okuma
+
+Kağıt üzerinde veya [Excalidraw](https://excalidraw.com/) gibi çevrimiçi bir uygulama kullanarak AI, makine öğrenimi, derin öğrenme ve veri bilimi arasındaki farkları anladığınızdan emin olun. Bu tekniklerin her birinin çözmede iyi olduğu bazı problem fikirleri ekleyin.
+
+## [Ders sonrası test](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2?loc=tr)
+
+## İnceleme ve Bireysel Çalışma
+
+Bulutta makine öğrenimi algoritmalarıyla nasıl çalışabileceğiniz hakkında daha fazla bilgi edinmek için bu [Eğitim Patikasını](https://docs.microsoft.com/learn/paths/create-no-code-predictive-models-azure-machine-learning/?WT.mc_id=academic-15963-cxa) izleyin.
+
+## Ödev
+
+[Haydi başlayalım!](assignment.tr.md)
\ No newline at end of file
diff --git a/1-Introduction/1-intro-to-ML/translations/README.zh-cn.md b/1-Introduction/1-intro-to-ML/translations/README.zh-cn.md
index f14d54aef7..3e8ae621e9 100644
--- a/1-Introduction/1-intro-to-ML/translations/README.zh-cn.md
+++ b/1-Introduction/1-intro-to-ML/translations/README.zh-cn.md
@@ -4,30 +4,30 @@
> 🎥 点击上面的图片观看讨论机器学习、人工智能和深度学习之间区别的视频。
-## [课前测验](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/1/)
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/1/)
### 介绍
-欢迎来到这个经典机器学习的初学者课程!无论您是这个主题的新手,还是一个有经验的ML从业者,我们都很高兴您能加入我们!我们希望为您的ML研究创建一个好的开始,并很乐意评估、回应和接受您的[反馈](https://github.com/microsoft/ML-For-Beginners/discussions)。
+欢迎来到这个经典机器学习的初学者课程!无论你是这个主题的新手,还是一个有经验的ML从业者,我们都很高兴你能加入我们!我们希望为你的ML研究创建一个好的开始,并很乐意评估、回应和接受你的[反馈](https://github.com/microsoft/ML-For-Beginners/discussions)。
[![机器学习简介](https://img.youtube.com/vi/h0e2HAPTGF4/0.jpg)](https://youtu.be/h0e2HAPTGF4 "Introduction to ML")
> 🎥 单击上图观看视频:麻省理工学院的 John Guttag 介绍机器学习
### 机器学习入门
-在开始本课程之前,您需要设置计算机能在本地运行Jupyter Notebooks。
+在开始本课程之前,你需要设置计算机能在本地运行Jupyter Notebooks。
-- **按照这些视频里的讲解配置您的计算机**。了解有关如何在此[视频集](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6)中设置计算机的更多信息。
-- **学习Python**. 还建议您对[Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa),我们在本课程中使用的一种对数据科学家有用的编程语言,有一个基本的了解。
-- **学习Node.js和JavaScript**。在本课程中,我们在构建web应用程序时也使用过几次JavaScript,因此您需要有[node](https://nodejs.org)和[npm](https://www.npmjs.com/) 以及[Visual Studio Code](https://code.visualstudio.com/)用于Python和JavaScript开发。
-- **创建GitHub帐户**。既然你在[GitHub](https://github.com)上找到我们,您可能已经有了一个帐户,但如果没有,请创建一个帐户,然后fork此课程自己使用(也给我们一颗星星吧😊)
+- **按照这些视频里的讲解配置你的计算机**。了解有关如何在此[视频集](https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6)中设置计算机的更多信息。
+- **学习Python**. 还建议你对[Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa),我们在本课程中使用的一种对数据科学家有用的编程语言,有一个基本的了解。
+- **学习Node.js和JavaScript**。在本课程中,我们在构建web应用程序时也使用过几次JavaScript,因此你需要有[node](https://nodejs.org)和[npm](https://www.npmjs.com/) 以及[Visual Studio Code](https://code.visualstudio.com/)用于Python和JavaScript开发。
+- **创建GitHub帐户**。既然你在[GitHub](https://github.com)上找到我们,你可能已经有了一个帐户,但如果没有,请创建一个帐户,然后fork此课程自己使用(也给我们一颗星星吧😊)
- **探索Scikit-learn**. 熟悉[Scikit-learn]([https://scikit-learn.org/stable/user_guide.html),我们在这些课程中引用的一组ML库。
### 什么是机器学习?
-术语“机器学习”是当今最流行和最常用的术语之一。 如果您对科技有某种程度的熟悉,那么很可能您至少听说过这个术语一次,无论您在哪个领域工作。然而,机器学习的机制对大多数人来说是一个谜。 对于机器学习初学者来说,这个主题有时会让人感到不知所措。 因此,了解机器学习的实质是什么,并通过实例一步一步地了解机器学习是很重要的。
+术语“机器学习”是当今最流行和最常用的术语之一。 如果你对科技有某种程度的熟悉,那么很可能你至少听说过这个术语一次,无论你在哪个领域工作。然而,机器学习的机制对大多数人来说是一个谜。 对于机器学习初学者来说,这个主题有时会让人感到不知所措。 因此,了解机器学习的实质是什么,并通过实例一步一步地了解机器学习是很重要的。
-![机器学习趋势曲线](images/hype.png)
+![机器学习趋势曲线](../images/hype.png)
> 谷歌趋势显示了“机器学习”一词最近的“趋势曲线”
@@ -39,15 +39,15 @@
尽管这些术语可能会混淆,但机器学习 (ML) 是人工智能的一个重要子集。 **机器学习关注使用专门的算法来发现有意义的信息,并从感知数据中找到隐藏的模式,以证实理性的决策过程**。
-![人工智能、机器学习、深度学习、数据科学](images/ai-ml-ds.png)
+![人工智能、机器学习、深度学习、数据科学](../images/ai-ml-ds.png)
> 显示AI、ML、深度学习和数据科学之间关系的图表。图片作者[Jen Looper](https://twitter.com/jenlooper),灵感来自[这张图](https://softwareengineering.stackexchange.com/questions/366996/distinction-between-ai-ml-neural-networks-deep-learning-and-data-mining)
-## 您将在本课程中学到什么
+## 你将在本课程中学到什么
在本课程中,我们将仅涵盖初学者必须了解的机器学习的核心概念。 我们主要使用Scikit-learn来介绍我们所谓的“经典机器学习”,这是一个许多学生用来学习基础知识的优秀库。要理解更广泛的人工智能或深度学习的概念,机器学习的基础知识是必不可少的,所以我们想在这里提供它。
-在本课程中,您将学习:
+在本课程中,你将学习:
- 机器学习的核心概念
- 机器学习的历史
@@ -94,9 +94,9 @@
---
## 🚀 挑战
-在纸上或使用[Excalidraw](https://excalidraw.com/)等在线应用程序绘制草图,了解您对AI、ML、深度学习和数据科学之间差异的理解。添加一些关于这些技术擅长解决的问题的想法。
+在纸上或使用[Excalidraw](https://excalidraw.com/)等在线应用程序绘制草图,了解你对AI、ML、深度学习和数据科学之间差异的理解。添加一些关于这些技术擅长解决的问题的想法。
-## [阅读后测验](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/2/)
+## [阅读后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/2/)
## 复习与自学
@@ -104,4 +104,4 @@
## 任务
-[启动并运行](assignment.md)
+[启动并运行](assignment.zh-cn.md)
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.es.md b/1-Introduction/1-intro-to-ML/translations/assignment.es.md
new file mode 100644
index 0000000000..5241ca9624
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.es.md
@@ -0,0 +1,9 @@
+# Levántate y corre
+
+## Instrucciones
+
+En esta tarea no calificada, debe repasar Python y hacer que su entorno esté en funcionamiento y sea capaz de ejecutar cuadernos.
+
+Tome esta [Ruta de aprendizaje de Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa), y luego configure sus sistemas con estos videos introductorios:
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.fr.md b/1-Introduction/1-intro-to-ML/translations/assignment.fr.md
new file mode 100644
index 0000000000..0d703d26c3
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.fr.md
@@ -0,0 +1,10 @@
+# Être opérationnel
+
+
+## Instructions
+
+Dans ce devoir non noté, vous devez vous familiariser avec Python et rendre votre environnement opérationnel et capable d'exécuter des notebooks.
+
+Suivez ce [parcours d'apprentissage Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa), puis configurez votre système en parcourant ces vidéos introductives :
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.id.md b/1-Introduction/1-intro-to-ML/translations/assignment.id.md
new file mode 100644
index 0000000000..c6ba6e4a8d
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.id.md
@@ -0,0 +1,9 @@
+# Persiapan
+
+## Instruksi
+
+Dalam tugas yang tidak dinilai ini, kamu akan mempelajari Python dan mempersiapkan *environment* kamu sehingga dapat digunakan untuk menjalankan *notebook*.
+
+Ambil [Jalur Belajar Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa) ini, kemudian persiapkan sistem kamu dengan menonton video-video pengantar ini:
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.it.md b/1-Introduction/1-intro-to-ML/translations/assignment.it.md
new file mode 100644
index 0000000000..b4e3ceceda
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.it.md
@@ -0,0 +1,9 @@
+# Tempi di apprendimento brevi
+
+## Istruzioni
+
+In questo compito senza valutazione, si dovrebbe rispolverare Python e rendere il proprio ambiente attivo e funzionante, in grado di eseguire notebook.
+
+Si segua questo [percorso di apprendimento di Python](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa) e quindi si configurino i propri sistemi seguendo questi video introduttivi:
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.ja.md b/1-Introduction/1-intro-to-ML/translations/assignment.ja.md
new file mode 100644
index 0000000000..9c86969cd9
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.ja.md
@@ -0,0 +1,9 @@
+# 稼働させる
+
+## 指示
+
+この評価のない課題では、Pythonについて復習し、環境を稼働させてノートブックを実行できるようにする必要があります。
+
+この[Pythonラーニングパス](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa)を受講し、次の入門用ビデオに従ってシステムをセットアップしてください。
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.tr.md b/1-Introduction/1-intro-to-ML/translations/assignment.tr.md
new file mode 100644
index 0000000000..55abaf23d9
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.tr.md
@@ -0,0 +1,9 @@
+# Haydi Başlayalım
+
+## Talimatlar
+
+Bu notlandırılmayan ödevde, Python bilgilerinizi tazelemeli, geliştirme ortamınızı çalışır duruma getirmeli ve not defterlerini çalıştırabilmelisiniz.
+
+Bu [Python Eğitim Patikasını](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa) bitirin ve ardından bu tanıtım videolarını izleyerek sistem kurulumunuzu yapın:
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
\ No newline at end of file
diff --git a/1-Introduction/1-intro-to-ML/translations/assignment.zh-cn.md b/1-Introduction/1-intro-to-ML/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..fd59f6919a
--- /dev/null
+++ b/1-Introduction/1-intro-to-ML/translations/assignment.zh-cn.md
@@ -0,0 +1,9 @@
+# 启动和运行
+
+## 说明
+
+在这个不评分的作业中,你应该温习一下 Python,让你的 Python 环境能够运行起来,并且可以运行 notebooks。
+
+学习这个 [Python 学习路径](https://docs.microsoft.com/learn/paths/python-language/?WT.mc_id=academic-15963-cxa),然后通过这些介绍性的视频将你的系统环境设置好:
+
+https://www.youtube.com/playlist?list=PLlrxD0HtieHhS8VzuMCfQD4uJ9yne1mE6
diff --git a/1-Introduction/2-history-of-ML/README.md b/1-Introduction/2-history-of-ML/README.md
index 67c93dbf5f..a44703d680 100644
--- a/1-Introduction/2-history-of-ML/README.md
+++ b/1-Introduction/2-history-of-ML/README.md
@@ -3,7 +3,7 @@
![Summary of History of machine learning in a sketchnote](../../sketchnotes/ml-history.png)
> Sketchnote by [Tomomi Imura](https://www.twitter.com/girlie_mac)
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/3/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3/)
In this lesson, we will walk through the major milestones in the history of machine learning and artificial intelligence.
@@ -101,7 +101,7 @@ It remains to be seen what the future holds, but it is important to understand t
Dig into one of these historical moments and learn more about the people behind them. There are fascinating characters, and no scientific discovery was ever created in a cultural vacuum. What do you discover?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/4/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4/)
## Review & Self Study
diff --git a/1-Introduction/2-history-of-ML/translations/README.es.md b/1-Introduction/2-history-of-ML/translations/README.es.md
old mode 100644
new mode 100755
index e69de29bb2..28402267ae
--- a/1-Introduction/2-history-of-ML/translations/README.es.md
+++ b/1-Introduction/2-history-of-ML/translations/README.es.md
@@ -0,0 +1,117 @@
+# Historia del machine learning
+
+![Resumen de la historia del machine learning en un boceto](../../../sketchnotes/ml-history.png)
+> Boceto por [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Cuestionario previo a la conferencia](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3/)
+
+En esta lección, analizaremos los principales hitos en la historia del machine learning y la inteligencia artificial.
+
+La historia de la inteligencia artificial, AI, como campo está entrelazada con la historia del machine learning, ya que los algoritmos y avances computacionales que sustentan el ML se incorporaron al desarrollo de la inteligencia artificial. Es útil recordar que, si bien estos campos como áreas distintas de investigación comenzaron a cristalizar en la década de 1950, importantes [descubrimientos algorítmicos, estadísticos, matemáticos, computacionales y técnicos](https://wikipedia.org/wiki/Timeline_of_machine_learning) precedieron y se superpusieron a esta era. De hecho, las personas han estado pensando en estas preguntas durante [cientos de años](https://wikipedia.org/wiki/History_of_artificial_intelligence): este artículo analiza los fundamentos intelectuales históricos de la idea de una 'máquina pensante.'
+
+## Descubrimientos notables
+
+- 1763, 1812 [Teorema de Bayes](https://wikipedia.org/wiki/Bayes%27_theorem) y sus predecesores. Este teorema y sus aplicaciones son la base de la inferencia, describiendo la probabilidad de que ocurra un evento con base en el conocimiento previo (véase la fórmula tras esta lista).
+- 1805 [Teoría de mínimos cuadrados](https://wikipedia.org/wiki/Least_squares) por el matemático francés Adrien-Marie Legendre. Esta teoría, que aprenderá en nuestra unidad de Regresión, ayuda en el data fitting.
+- 1913 [Cadenas de Markov](https://wikipedia.org/wiki/Markov_chain), llamadas así por el matemático ruso Andrey Markov, se utilizan para describir una secuencia de posibles eventos con base en un estado anterior.
+- 1957 [Perceptron](https://wikipedia.org/wiki/Perceptron) es un tipo de clasificador lineal inventado por el psicólogo Frank Rosenblatt que subyace a los avances en el deep learning.
+- 1967 [Nearest Neighbor (Vecino más cercano)](https://wikipedia.org/wiki/Nearest_neighbor) es un algoritmo diseñado originalmente para trazar rutas. En un contexto de ML, se utiliza para detectar patrones.
+- 1970 [Backpropagation](https://wikipedia.org/wiki/Backpropagation) es usado para entrenar [feedforward neural networks](https://wikipedia.org/wiki/Feedforward_neural_network).
+- 1982 [Recurrent Neural Networks](https://wikipedia.org/wiki/Recurrent_neural_network) son redes neuronales artificiales derivadas de redes neuronales feedforward que crean grafos temporales.
+
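+A modo de referencia, la forma habitual del teorema de Bayes citado en la lista anterior puede escribirse así:
+
+$$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$$
+
+donde $P(A)$ es el conocimiento previo sobre el evento $A$ y $P(B \mid A)$ la probabilidad de observar la evidencia $B$ cuando $A$ ocurre.
+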
+✅ Investigue un poco. ¿Qué otras fechas se destacan como fundamentales en la historia del machine learning (ML) y la inteligencia artificial (AI)?
+## 1950: Máquinas que piensan
+
+A Alan Turing, una persona verdaderamente notable que fue votada [por el público en 2019](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century) como el científico más grande del siglo XX, se le atribuye haber ayudado a sentar las bases del concepto de una 'máquina que puede pensar.' Lidió con los detractores y con su propia necesidad de evidencia empírica de este concepto, en parte, mediante la creación de la [prueba de Turing](https://www.bbc.com/news/technology-18475646), que explorará en nuestras lecciones de NLP.
+
+## 1956: Dartmouth Summer Research Project
+
+"The Dartmouth Summer Research Project sobre inteligencia artificial fuer un evento fundamental para la inteligencia artificial como campo," y fue aquí donde el se acuñó el término 'inteligencia artificial' ([fuente](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth)).
+
+
+> Todos los aspectos del aprendizaje y cualquier otra característica de la inteligencia pueden, en principio, describirse con tanta precisión que se puede hacer una máquina para simularlos.
+
+El investigador principal, el profesor de matemáticas John McCarthy, esperaba "proceder sobre la base de la conjetura de que cada aspecto del aprendizaje o cualquier otra característica de la inteligencia puede, en principio, describirse con tanta precisión que se puede hacer una máquina para simularlo." Los participantes incluyeron a otra luminaria en el campo, Marvin Minsky.
+
+El taller tiene el mérito de haber iniciado y alentado varias discusiones que incluyen "el surgimiento de métodos simbólicos, sistemas en dominios limitados (primeros sistemas expertos), y sistemas deductivos versus sistemas inductivos." ([fuente](https://wikipedia.org/wiki/Dartmouth_workshop)).
+
+## 1956 - 1974: "Los años dorados"
+
+Desde la década de 1950 hasta mediados de la de 1970, el optimismo se elevó con la esperanza de que la AI pudiera resolver muchos problemas. En 1967, Marvin Minsky declaró con seguridad que "dentro de una generación ... el problema de crear 'inteligencia artificial' se resolverá sustancialmente." (Minsky, Marvin (1967), Computation: Finite and Infinite Machines, Englewood Cliffs, N.J.: Prentice-Hall)
+
+La investigación del procesamiento del lenguaje natural floreció, la búsqueda se refinó y se hizo más poderosa, y el concepto de 'micro-worlds' fue creado, donde se completaban tareas simples utilizando instrucciones en lenguaje sencillo.
+
+La investigación estuvo bien financiada por agencias gubernamentales, se realizaron avances en computación y algoritmos, y se construyeron prototipos de máquinas inteligentes. Algunas de estas máquinas incluyen:
+
+* [Shakey el robot](https://wikipedia.org/wiki/Shakey_the_robot), que podía maniobrar y decidir cómo realizar las tareas de forma 'inteligente'.
+
 ![Shakey, un robot inteligente](../images/shakey.jpg)
+ > Shakey en 1972
+
+* Eliza, una de las primeras 'chatterbot', podía conversar con las personas y actuar como un 'terapeuta' primitivo. Aprenderá más sobre Eliza en las lecciones de NLP.
+
 ![Eliza, un bot](../images/eliza.png)
+ > Una versión de Eliza, un chatbot
+
+* "Blocks world" era un ejemplo de micro-world donde los bloques se podían apilar y ordenar, y se podían probar experimentos en máquinas de enseñanza para tomar decisiones. Los avances creados con librerías como [SHRDLU](https://wikipedia.org/wiki/SHRDLU) ayudaron a inpulsar el procesamiento del lenguaje natural.
+
+ [![blocks world con SHRDLU](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "blocks world con SHRDLU")
+
 > 🎥 Haga clic en la imagen de arriba para ver un video: Blocks world con SHRDLU
+
+## 1974 - 1980: "Invierno de la AI"
+
+A mediados de la década de 1970, se hizo evidente que la complejidad de la fabricación de 'máquinas inteligentes' se había subestimado y que su promesa, dada la potencia computacional disponible, había sido exagerada. La financiación se agotó y la confianza en el campo se ralentizó. Algunos problemas que impactaron la confianza incluyeron:
+
+- **Limitaciones**. La potencia computacional era demasiado limitada.
+- **Explosión combinatoria**. La cantidad de parámetros necesitados para entrenar creció exponencialmente a medida que se pedía más a las computadoras sin una evolución paralela de la potencia y la capacidad de cómputo.
+- **Escasez de datos**. Hubo una escasez de datos que obstaculizó el proceso de pruebas, desarrollo y refinamiento de algoritmos.
+- **¿Estamos haciendo las preguntas correctas?**. Las mismas preguntas que se estaban formulando comenzaron a cuestionarse. Los investigadores comenzaron a criticar sus enfoques:
  - Las pruebas de Turing se cuestionaron por medio, entre otras ideas, de la 'teoría de la habitación china', que postulaba que "programar una computadora digital puede hacer que parezca que entiende el lenguaje, pero no puede producir una comprensión real" ([fuente](https://plato.stanford.edu/entries/chinese-room/))
+ - Se cuestionó la ética de introducir inteligencias artificiales como la "terapeuta" Eliza en la sociedad.
+
+Al mismo tiempo, comenzaron a formarse varias escuelas de pensamiento de AI. Se estableció una dicotomía entre las prácticas ["scruffy" vs. "neat AI"](https://wikipedia.org/wiki/Neats_and_scruffies). Los laboratorios _scruffy_ modificaban los programas durante horas hasta obtener los resultados deseados. Los laboratorios _neat_ "se centraban en la lógica y la resolución formal de problemas". ELIZA y SHRDLU eran sistemas _scruffy_ bien conocidos. En la década de 1980, cuando surgió la demanda de hacer que los sistemas de aprendizaje fueran reproducibles, el enfoque _neat_ gradualmente tomó la vanguardia a medida que sus resultados eran más explicables.
+
+## Sistemas expertos de la década de 1980
+
+A medida que el campo creció, su beneficio para las empresas se hizo más claro, y en la década de 1980 también lo hizo la proliferación de 'sistemas expertos'. "Los sistemas expertos estuvieron entre las primeras formas verdaderamente exitosas de software de inteligencia artificial (IA)." ([fuente](https://wikipedia.org/wiki/Expert_system)).
+
+Este tipo de sistema es en realidad _híbrido_: consta parcialmente de un motor de reglas que define los requisitos comerciales y de un motor de inferencia que aprovecha el sistema de reglas para deducir nuevos hechos.
+
+En esta era también se prestó mayor atención a las redes neuronales.
+
+## 1987 - 1993: AI 'Chill'
+
+La proliferación de hardware de sistemas expertos especializados tuvo el desafortunado efecto de volverse demasiado especializado. El auge de las computadoras personales también compitió con estos grandes sistemas centralizados especializados. La democratización de la informática había comenzado, y finalmente, allanó el camino para la explosión moderna del big data.
+
+## 1993 - 2011
+
+Esta época vio una nueva era para el ML y la IA, al poder resolver problemas que anteriormente habían sido causados por la falta de datos y de poder de cómputo. La cantidad de datos comenzó a aumentar rápidamente y a estar más disponible, para bien o para mal, especialmente con la llegada del smartphone alrededor del 2007. El poder computacional se expandió exponencialmente y los algoritmos evolucionaron al mismo tiempo. El campo comenzó a ganar madurez a medida que los días desenfadados del pasado comenzaron a cristalizar en una verdadera disciplina.
+
+## Ahora
+
+Hoy en día, el machine learning y la inteligencia artificial tocan casi todos los aspectos de nuestras vidas. Esta era requiere una comprensión cuidadosa de los riesgos y los efectos potenciales de estos algoritmos en las vidas humanas. Como ha dicho Brad Smith de Microsoft, "La tecnología de la información plantea problemas que van al corazón de las protecciones fundamentales de los derechos humanos, como la privacidad y la libertad de expresión. Esos problemas aumentan las responsabilidades de las empresas de tecnología que crean estos productos. En nuestra opinión, también exigen una regulación gubernamental reflexiva y el desarrollo de normas sobre usos aceptables" ([fuente](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/)).
+
+Queda por ver qué depara el futuro, pero es importante entender estos sistemas informáticos y el software y los algoritmos que ejecutan. Esperamos que este plan de estudios le ayude a comprenderlos mejor para que pueda decidir por sí mismo.
+
+[![La historia del deep learning](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "The history of deep learning")
+> 🎥 Haga clic en la imagen de arriba para ver un video: Yann LeCun analiza la historia del deep learning en esta conferencia
+
+---
+## 🚀Desafío
+
+Sumérjase en uno de estos momentos históricos y aprenda más sobre las personas detrás de ellos. Hay personajes fascinantes y nunca se creó ningún descubrimiento científico en un vacío cultural. ¿Qué descubre?
+
+## [Cuestionario posterior a la conferencia](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4/)
+
+## Revisión y autoestudio
+
+Aquí hay elementos para ver y escuchar:
+
+[Este podcast donde Amy Boyd habla sobre la evolución de la IA](http://runasradio.com/Shows/Show/739)
+
+[![La historia de la IA por Amy Boyd](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "La historia de la IA por Amy Boyd")
+
+## Asignación
+
+[Crea un timeline](assignment.md)
diff --git a/1-Introduction/2-history-of-ML/translations/README.fr.md b/1-Introduction/2-history-of-ML/translations/README.fr.md
new file mode 100644
index 0000000000..efe268777f
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/README.fr.md
@@ -0,0 +1,117 @@
+# Histoire du Machine Learning (apprentissage automatique)
+
+![Résumé de l'histoire du machine learning dans un sketchnote](../../../sketchnotes/ml-history.png)
+> Sketchnote de [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Quizz préalable](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3?loc=fr)
+
+Dans cette leçon, nous allons parcourir les principales étapes de l'histoire du machine learning et de l'intelligence artificielle.
+
+L'histoire de l'intelligence artificielle, l'IA, en tant que domaine est étroitement liée à l'histoire du machine learning, car les algorithmes et les avancées informatiques qui sous-tendent le ML alimentent le développement de l'IA. Bien que ces domaines, en tant que domaines de recherche distincts, aient commencé à se cristalliser dans les années 1950, il est important de rappeler que les [découvertes algorithmiques, statistiques, mathématiques, informatiques et techniques](https://wikipedia.org/wiki/Timeline_of_machine_learning) ont précédé et chevauché cette époque. En fait, le monde réfléchit à ces questions depuis [des centaines d'années](https://fr.wikipedia.org/wiki/Histoire_de_l%27intelligence_artificielle) : cet article traite des fondements intellectuels historiques de l'idée d'une « machine qui pense ».
+
+## Découvertes notables
+
+- 1763, 1812 [théorème de Bayes](https://wikipedia.org/wiki/Bayes%27_theorem) et ses prédécesseurs. Ce théorème et ses applications sous-tendent l'inférence, décrivant la probabilité qu'un événement se produise sur la base de connaissances antérieures.
+- 1805 [Théorie des moindres carrés](https://wikipedia.org/wiki/Least_squares) par le mathématicien français Adrien-Marie Legendre. Cette théorie, que vous découvrirez dans notre unité Régression, aide à l'ajustement des données.
+- 1913 [Chaînes de Markov](https://wikipedia.org/wiki/Markov_chain) du nom du mathématicien russe Andrey Markov sont utilisées pour décrire une séquence d'événements possibles basée sur un état antérieur.
+- 1957 [Perceptron](https://wikipedia.org/wiki/Perceptron) est un type de classificateur linéaire inventé par le psychologue américain Frank Rosenblatt qui sous-tend les progrès de l'apprentissage en profondeur.
+- 1967 [Nearest Neighbor](https://wikipedia.org/wiki/Nearest_neighbor) est un algorithme conçu à l'origine pour cartographier les itinéraires. Dans un contexte ML, il est utilisé pour détecter des modèles.
+- 1970 [Backpropagation](https://wikipedia.org/wiki/Backpropagation) est utilisé pour former des [réseaux de neurones feedforward (propagation avant)](https://fr.wikipedia.org/wiki/R%C3%A9seau_de_neurones_%C3%A0_propagation_avant).
+- 1982 [Réseaux de neurones récurrents](https://wikipedia.org/wiki/Recurrent_neural_network) sont des réseaux de neurones artificiels dérivés des réseaux de neurones à propagation avant qui créent des graphes temporels.
+
+✅ Faites une petite recherche. Quelles autres dates sont marquantes dans l'histoire du ML et de l'IA ?
+
+## 1950 : Des machines qui pensent
+
+Alan Turing, une personne vraiment remarquable qui a été élue [par le public en 2019](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century) comme le plus grand scientifique du 20e siècle, est reconnu pour avoir aidé à jeter les bases du concept d'une "machine qui peut penser". Il a lutté avec ses opposants et son propre besoin de preuves empiriques de sa théorie en créant le [Test de Turing](https://www.bbc.com/news/technology-18475646), que vous explorerez dans nos leçons de NLP (TALN en français).
+
+## 1956 : Projet de recherche d'été à Dartmouth
+
+« Le projet de recherche d'été de Dartmouth sur l'intelligence artificielle a été un événement fondateur pour l'intelligence artificielle en tant que domaine », et c'est ici que le terme « intelligence artificielle » a été inventé ([source](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth)).
+
+> Chaque aspect de l'apprentissage ou toute autre caractéristique de l'intelligence peut en principe être décrit si précisément qu'une machine peut être conçue pour les simuler.
+
+Le chercheur en tête, le professeur de mathématiques John McCarthy, espérait « procéder sur la base de la conjecture selon laquelle chaque aspect de l'apprentissage ou toute autre caractéristique de l'intelligence peut en principe être décrit avec une telle précision qu'une machine peut être conçue pour les simuler ». Les participants comprenaient une autre sommité dans le domaine, Marvin Minsky.
+
+L'atelier est crédité d'avoir initié et encouragé plusieurs discussions, notamment « l'essor des méthodes symboliques, des systèmes spécialisés sur des domaines limités (premiers systèmes experts) et des systèmes déductifs par rapport aux systèmes inductifs ». ([source](https://fr.wikipedia.org/wiki/Conf%C3%A9rence_de_Dartmouth)).
+
+## 1956 - 1974 : "Les années d'or"
+
+Des années 50 au milieu des années 70, l'optimisme était au rendez-vous en espérant que l'IA puisse résoudre de nombreux problèmes. En 1967, Marvin Minsky a déclaré avec assurance que « Dans une génération... le problème de la création d'"intelligence artificielle" sera substantiellement résolu. » (Minsky, Marvin (1967), Computation: Finite and Infinite Machines, Englewood Cliffs, N.J.: Prentice-Hall)
+
+La recherche sur le Natural Language Processing (traitement du langage naturel en français) a prospéré, la recherche a été affinée et rendue plus puissante, et le concept de « micro-mondes » a été créé, où des tâches simples ont été effectuées en utilisant des instructions en langue naturelle.
+
+La recherche a été bien financée par les agences gouvernementales, des progrès ont été réalisés dans le calcul et les algorithmes, et des prototypes de machines intelligentes ont été construits. Certaines de ces machines incluent :
+
+* [Shakey le robot](https://fr.wikipedia.org/wiki/Shakey_le_robot), qui pouvait manœuvrer et décider comment effectuer des tâches « intelligemment ».
+
+ ![Shakey, un robot intelligent](../images/shakey.jpg)
 > Shakey en 1972
+
+* Eliza, une des premières « chatbot », pouvait converser avec les gens et agir comme une « thérapeute » primitive. Vous en apprendrez plus sur Eliza dans les leçons de NLP.
+
+ ![Eliza, un bot](../images/eliza.png)
+ > Une version d'Eliza, un chatbot
+
+* Le « monde des blocs » était un exemple de micro-monde où les blocs pouvaient être empilés et triés, et où des expériences d'apprentissages sur des machines, dans le but qu'elles prennent des décisions, pouvaient être testées. Les avancées réalisées avec des bibliothèques telles que [SHRDLU](https://fr.wikipedia.org/wiki/SHRDLU) ont contribué à faire avancer le natural language processing.
+
+ [![Monde de blocs avec SHRDLU](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "Monde de blocs avec SHRDLU" )
+
+ > 🎥 Cliquez sur l'image ci-dessus pour une vidéo : Blocks world with SHRDLU
+
+## 1974 - 1980 : « l'hiver de l'IA »
+
+Au milieu des années 1970, il était devenu évident que la complexité de la fabrication de « machines intelligentes » avait été sous-estimée et que sa promesse, compte tenu de la puissance de calcul disponible, avait été exagérée. Les financements se sont taris et la confiance dans le domaine s'est ralentie. Parmi les problèmes qui ont eu un impact sur la confiance, citons :
+
+- **Restrictions**. La puissance de calcul était trop limitée.
+- **Explosion combinatoire**. Le nombre de paramètres à former augmentait de façon exponentielle à mesure que l'on en demandait davantage aux ordinateurs, sans évolution parallèle de la puissance et de la capacité de calcul.
+- **Pénurie de données**. Il y avait un manque de données qui a entravé le processus de test, de développement et de raffinement des algorithmes.
+- **Posions-nous les bonnes questions ?**. Les questions mêmes, qui étaient posées, ont commencé à être remises en question. Les chercheurs ont commencé à émettre des critiques sur leurs approches :
+ - Les tests de Turing ont été remis en question au moyen, entre autres, de la « théorie de la chambre chinoise » qui postulait que « la programmation d'un ordinateur numérique peut faire croire qu'il comprend le langage mais ne peut pas produire une compréhension réelle ». ([source](https://plato.stanford.edu/entries/chinese-room/))
+ - L'éthique de l'introduction d'intelligences artificielles telles que la "thérapeute" ELIZA dans la société a été remise en cause.
+
+Dans le même temps, diverses écoles de pensée sur l'IA ont commencé à se former. Une dichotomie a été établie entre les pratiques IA ["scruffy" et "neat"](https://wikipedia.org/wiki/Neats_and_scruffies). Les laboratoires _Scruffy_ peaufinaient leurs programmes pendant des heures jusqu'à ce qu'ils obtiennent les résultats souhaités. Les laboratoires _Neat_ "se concentraient sur la logique et la résolution formelle de problèmes". ELIZA et SHRDLU étaient des systèmes _scruffy_ bien connus. Dans les années 1980, alors qu'émergeait la demande de rendre les systèmes ML reproductibles, l'approche _neat_ a progressivement pris le devant de la scène car ses résultats sont plus explicables.
+
+## 1980 : Systèmes experts
+
+Au fur et à mesure que le domaine s'est développé, ses avantages pour les entreprises sont devenus plus clairs, particulièrement via les « systèmes experts » dans les années 1980. "Les systèmes experts ont été parmi les premières formes vraiment réussies de logiciels d'intelligence artificielle (IA)." ([source](https://fr.wikipedia.org/wiki/Syst%C3%A8me_expert)).
+
+Ce type de système est en fait _hybride_, composé en partie d'un moteur de règles définissant les exigences métier et d'un moteur d'inférence qui exploite le système de règles pour déduire de nouveaux faits.
+
+Cette époque a également vu une attention croissante accordée aux réseaux de neurones.
+
+## 1987 - 1993 : IA « Chill »
+
+La prolifération du matériel spécialisé des systèmes experts a eu pour effet malheureux de devenir trop spécialisée. L'essor des ordinateurs personnels a également concurrencé ces grands systèmes spécialisés et centralisés. La démocratisation de l'informatique a commencé et a finalement ouvert la voie à l'explosion des mégadonnées.
+
+## 1993 - 2011
+
+Cette époque a vu naître une nouvelle ère pour le ML et l'IA afin de résoudre certains des problèmes qui n'avaient pu l'être plus tôt par le manque de données et de puissance de calcul. La quantité de données a commencé à augmenter rapidement et à devenir plus largement disponibles, pour le meilleur et pour le pire, en particulier avec l'avènement du smartphone vers 2007. La puissance de calcul a augmenté de façon exponentielle et les algorithmes ont évolué parallèlement. Le domaine a commencé à gagner en maturité alors que l'ingéniosité a commencé à se cristalliser en une véritable discipline.
+
+## À présent
+
+Aujourd'hui, le machine learning et l'IA touchent presque tous les aspects de notre vie. Cette ère nécessite une compréhension approfondie des risques et des effets potentiels de ces algorithmes sur les vies humaines. Comme l'a déclaré Brad Smith de Microsoft, « les technologies de l'information soulèvent des problèmes qui vont au cœur des protections fondamentales des droits de l'homme comme la vie privée et la liberté d'expression. Ces problèmes accroissent la responsabilité des entreprises technologiques qui créent ces produits. À notre avis, ils appellent également à une réglementation gouvernementale réfléchie et au développement de normes autour des utilisations acceptables" ([source](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/)).
+
+Reste à savoir ce que l'avenir nous réserve, mais il est important de comprendre ces systèmes informatiques ainsi que les logiciels et algorithmes qu'ils exécutent. Nous espérons que ce programme vous aidera à mieux les comprendre afin que vous puissiez décider par vous-même.
+
+[![L'histoire du Deep Learning](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "L'histoire du Deep Learning")
+> 🎥 Cliquez sur l'image ci-dessus pour une vidéo : Yann LeCun discute de l'histoire du deep learning dans cette conférence
+
+---
+## 🚀Challenge
+
+Plongez dans l'un de ces moments historiques et apprenez-en plus sur les personnes derrière ceux-ci. Il y a des personnalités fascinantes, et aucune découverte scientifique n'a jamais été créée dans un vide culturel. Que découvrez-vous ?
+
+## [Quiz de validation des connaissances](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4?loc=fr)
+
+## Révision et auto-apprentissage
+
+Voici quelques articles à regarder et à écouter :
+
+[Ce podcast où Amy Boyd discute de l'évolution de l'IA](http://runasradio.com/Shows/Show/739)
+
+[![L'histoire de l'IA par Amy Boyd](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "L'histoire de l'IA par Amy Boyd")
+
+## Devoir
+
+[Créer une frise chronologique](assignment.fr.md)
diff --git a/1-Introduction/2-history-of-ML/translations/README.id.md b/1-Introduction/2-history-of-ML/translations/README.id.md
new file mode 100644
index 0000000000..9e695a8a9a
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/README.id.md
@@ -0,0 +1,116 @@
+# Sejarah Machine Learning
+
+![Ringkasan dari Sejarah Machine Learning dalam sebuah catatan sketsa](../../../sketchnotes/ml-history.png)
+> Catatan sketsa oleh [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Quiz Pra-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3/)
+
+Dalam pelajaran ini, kita akan membahas tonggak utama dalam sejarah Machine Learning dan Artificial Intelligence.
+
+Sejarah Artifical Intelligence, AI, sebagai bidang terkait dengan sejarah Machine Learning, karena algoritma dan kemajuan komputasi yang mendukung ML dimasukkan ke dalam pengembangan AI. Penting untuk diingat bahwa, meski bidang-bidang ini sebagai bidang-bidang penelitian yang berbeda mulai terbentuk pada 1950-an, [algoritmik, statistik, matematik, komputasi dan penemuan teknis](https://wikipedia.org/wiki/Timeline_of_machine_learning) penting sudah ada sebelumnya, dan saling tumpang tindih di era ini. Faktanya, orang-orang telah memikirkan pertanyaan-pertanyaan ini selama [ratusan tahun](https://wikipedia.org/wiki/History_of_artificial_intelligence): artikel ini membahas dasar-dasar intelektual historis dari gagasan 'mesin yang berpikir'.
+
+## Penemuan penting
+
+- 1763, 1812 [Bayes Theorem](https://wikipedia.org/wiki/Bayes%27_theorem) dan para pendahulu. Teorema ini dan penerapannya mendasari inferensi, mendeskripsikan kemungkinan suatu peristiwa terjadi berdasarkan pengetahuan sebelumnya.
+- 1805 [Least Square Theory](https://wikipedia.org/wiki/Least_squares) oleh matematikawan Perancis Adrien-Marie Legendre. Teori ini yang akan kamu pelajari di unit Regresi, ini membantu dalam *data fitting*.
+- 1913 [Markov Chains](https://wikipedia.org/wiki/Markov_chain) dinamai dengan nama matematikawan Rusia, Andrey Markov, digunakan untuk mendeskripsikan sebuah urutan dari kejadian-kejadian yang mungkin terjadi berdasarkan kondisi sebelumnya.
+- 1957 [Perceptron](https://wikipedia.org/wiki/Perceptron) adalah sebuah tipe dari *linear classifier* yang ditemukan oleh psikolog Amerika, Frank Rosenblatt, yang mendasari kemajuan dalam *Deep Learning*.
+- 1967 [Nearest Neighbor](https://wikipedia.org/wiki/Nearest_neighbor) adalah sebuah algoritma yang pada awalnya didesain untuk memetakan rute. Dalam konteks ML, ini digunakan untuk mendeteksi berbagai pola.
+- 1970 [Backpropagation](https://wikipedia.org/wiki/Backpropagation) digunakan untuk melatih [feedforward neural networks](https://wikipedia.org/wiki/Feedforward_neural_network).
+- 1982 [Recurrent Neural Networks](https://wikipedia.org/wiki/Recurrent_neural_network) adalah *artificial neural networks* yang berasal dari *feedforward neural networks* yang membuat grafik sementara.
+
+✅ Lakukan sebuah riset kecil. Tanggal berapa lagi yang merupakan tanggal penting dalam sejarah ML dan AI?
+## 1950: Mesin yang berpikir
+
+Alan Turing, orang luar biasa yang terpilih oleh [publik di tahun 2019](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century) sebagai ilmuwan terhebat di abad 20, diberikan penghargaan karena membantu membuat fondasi dari sebuah konsep 'mesin yang bisa berpikir'. Dia berjuang menghadapi orang-orang yang menentangnya dan keperluannya sendiri akan bukti empiris dari konsep ini dengan membuat [Turing Test](https://www.bbc.com/news/technology-18475646), yang akan kamu jelajahi di pelajaran NLP kami.
+
+## 1956: Proyek Riset Musim Panas Dartmouth
+
+"Proyek Riset Musim Panas Dartmouth pada *artificial intelligence* merupakan sebuah acara penemuan untuk *artificial intelligence* sebagai sebuah bidang," dan dari sinilah istilah '*artificial intelligence*' diciptakan ([sumber](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth)).
+
+> Setiap aspek pembelajaran atau fitur kecerdasan lainnya pada prinsipnya dapat dideskripsikan dengan sangat tepat sehingga sebuah mesin dapat dibuat untuk mensimulasikannya.
+
+Ketua peneliti, profesor matematika John McCarthy, berharap "untuk meneruskan dasar dari dugaan bahwa setiap aspek pembelajaran atau fitur kecerdasan lainnya pada prinsipnya dapat dideskripsikan dengan sangat tepat sehingga mesin dapat dibuat untuk mensimulasikannya." Marvin Minsky, seorang tokoh terkenal di bidang ini juga termasuk sebagai peserta penelitian.
+
+Workshop ini dipuji karena telah memprakarsai dan mendorong beberapa diskusi termasuk "munculnya metode simbolik, sistem yang berfokus pada domain terbatas (sistem pakar awal), dan sistem deduktif versus sistem induktif." ([sumber](https://wikipedia.org/wiki/Dartmouth_workshop)).
+
+## 1956 - 1974: "Tahun-tahun Emas"
+
+Dari tahun 1950-an hingga pertengahan 70-an, optimisme memuncak dengan harapan bahwa AI dapat memecahkan banyak masalah. Pada tahun 1967, Marvin Minsky dengan yakin menyatakan bahwa "Dalam satu generasi ... masalah menciptakan '*artificial intelligence*' akan terpecahkan secara substansial." (Minsky, Marvin (1967), Computation: Finite and Infinite Machines, Englewood Cliffs, N.J.: Prentice-Hall)
+
+Penelitian *natural language processing* berkembang, pencarian disempurnakan dan dibuat lebih *powerful*, dan konsep '*micro-worlds*' diciptakan, di mana tugas-tugas sederhana diselesaikan menggunakan instruksi bahasa sederhana.
+
+Penelitian didanai dengan baik oleh lembaga pemerintah, banyak kemajuan dibuat dalam komputasi dan algoritma, dan prototipe mesin cerdas dibangun. Beberapa mesin tersebut antara lain:
+
+* [Shakey the robot](https://wikipedia.org/wiki/Shakey_the_robot), yang bisa bermanuver dan memutuskan bagaimana melakukan tugas-tugas secara 'cerdas'.
+
+ ![Shakey, an intelligent robot](../images/shakey.jpg)
+ > Shakey pada 1972
+
+* Eliza, sebuah 'chatterbot' awal, dapat mengobrol dengan orang-orang dan bertindak sebagai 'terapis' primitif. Kamu akan belajar lebih banyak tentang Eliza dalam pelajaran NLP.
+
+ ![Eliza, a bot](../images/eliza.png)
+ > Sebuah versi dari Eliza, sebuah *chatbot*
+
+* "Blocks world" adalah contoh sebuah *micro-world* dimana balok dapat ditumpuk dan diurutkan, dan pengujian eksperimen mesin pengajaran untuk membuat keputusan dapat dilakukan. Kemajuan yang dibuat dengan *library-library* seperti [SHRDLU](https://wikipedia.org/wiki/SHRDLU) membantu mendorong kemajuan pemrosesan bahasa.
+
+ [![blocks world dengan SHRDLU](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "blocks world dengan SHRDLU")
+
+ > 🎥 Klik gambar diatas untuk menonton video: Blocks world with SHRDLU
+
+## 1974 - 1980: "Musim Dingin AI"
+
+Pada pertengahan 1970-an, semakin jelas bahwa kompleksitas pembuatan 'mesin cerdas' telah diremehkan dan janjinya, mengingat kekuatan komputasi yang tersedia, telah dilebih-lebihkan. Pendanaan telah habis dan kepercayaan dalam bidang ini menurun. Beberapa masalah yang memengaruhi kepercayaan diri termasuk:
+
+- **Keterbatasan**. Kekuatan komputasi terlalu terbatas.
+- **Ledakan kombinatorial**. Jumlah parameter yang perlu dilatih bertambah secara eksponensial karena lebih banyak hal yang diminta dari komputer, tanpa evolusi paralel dari kekuatan dan kemampuan komputasi.
+- **Kekurangan data**. Adanya kekurangan data yang menghalangi proses pengujian, pengembangan, dan penyempurnaan algoritma.
+- **Apakah kita menanyakan pertanyaan yang tepat?**. Pertanyaan-pertanyaan yang diajukan pun mulai dipertanyakan kembali. Para peneliti mulai melontarkan kritik tentang pendekatan mereka
+ - Tes Turing mulai dipertanyakan, di antara ide-ide lain, dari 'teori ruang Cina' yang mengemukakan bahwa, "memprogram komputer digital mungkin membuatnya tampak memahami bahasa tetapi tidak dapat menghasilkan pemahaman yang sebenarnya." ([sumber](https://plato.stanford.edu/entries/chinese-room/))
+ - Tantangan etika ketika memperkenalkan kecerdasan buatan seperti si "terapis" ELIZA ke dalam masyarakat.
+
+Pada saat yang sama, berbagai aliran pemikiran AI mulai terbentuk. Sebuah dikotomi didirikan antara praktik ["scruffy" vs. "neat AI"](https://wikipedia.org/wiki/Neats_and_scruffies). Lab _Scruffy_ mengubah program selama berjam-jam sampai mendapat hasil yang diinginkan. Lab _Neat_ "berfokus pada logika dan penyelesaian masalah formal". ELIZA dan SHRDLU adalah sistem _scruffy_ yang terkenal. Pada tahun 1980-an, karena perkembangan permintaan untuk membuat sistem ML yang dapat direproduksi, pendekatan _neat_ secara bertahap menjadi yang terdepan karena hasilnya lebih dapat dijelaskan.
+
+## 1980s Sistem Pakar
+
+Seiring berkembangnya bidang ini, manfaatnya bagi bisnis menjadi lebih jelas, dan begitu pula dengan menjamurnya 'sistem pakar' pada tahun 1980-an. "Sistem pakar adalah salah satu bentuk perangkat lunak artificial intelligence (AI) pertama yang benar-benar sukses." ([sumber](https://wikipedia.org/wiki/Expert_system)).
+
+Tipe sistem ini sebenarnya adalah _hybrid_, sebagian terdiri dari mesin aturan yang mendefinisikan kebutuhan bisnis, dan mesin inferensi yang memanfaatkan sistem aturan untuk menyimpulkan fakta baru.
+
+Pada era ini juga terlihat adanya peningkatan perhatian pada jaringan saraf.
+
+## 1987 - 1993: AI 'Chill'
+
+Perkembangan perangkat keras sistem pakar terspesialisasi memiliki efek yang tidak menguntungkan karena menjadi terlalu terspesialisasi. Munculnya komputer pribadi juga bersaing dengan sistem yang besar, terspesialisasi, dan terpusat ini. Demokratisasi komputasi telah dimulai, dan pada akhirnya membuka jalan untuk ledakan modern dari *big data*.
+
+## 1993 - 2011
+
+Pada zaman ini memperlihatkan era baru bagi ML dan AI untuk dapat menyelesaikan beberapa masalah yang sebelumnya disebabkan oleh kurangnya data dan daya komputasi. Jumlah data mulai meningkat dengan cepat dan tersedia secara luas, terlepas dari baik dan buruknya, terutama dengan munculnya *smartphone* sekitar tahun 2007. Daya komputasi berkembang secara eksponensial, dan algoritma juga berkembang saat itu. Bidang ini mulai mengalami kedewasaan karena hari-hari yang tidak beraturan di masa lalu mulai terbentuk menjadi disiplin yang sebenarnya.
+
+## Sekarang
+
+Saat ini, *machine learning* dan AI hampir ada di setiap bagian dari kehidupan kita. Era ini menuntut pemahaman yang cermat tentang risiko dan efek potensi dari berbagai algoritma yang ada pada kehidupan manusia. Seperti yang telah dinyatakan oleh Brad Smith dari Microsoft, "Teknologi informasi mengangkat isu-isu yang menjadi inti dari perlindungan hak asasi manusia yang mendasar seperti privasi dan kebebasan berekspresi. Masalah-masalah ini meningkatkan tanggung jawab bagi perusahaan teknologi yang menciptakan produk-produk ini. Dalam pandangan kami, mereka juga menyerukan peraturan pemerintah yang bijaksana dan untuk pengembangan norma-norma seputar penggunaan yang wajar" ([sumber](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/)).
+
+Kita masih belum tahu apa yang akan terjadi di masa depan, tetapi penting untuk memahami sistem komputer dan perangkat lunak serta algoritma yang dijalankannya. Kami berharap kurikulum ini akan membantu kamu untuk mendapatkan pemahaman yang lebih baik sehingga kamu dapat memutuskan sendiri.
+
+[![Sejarah Deep Learning](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "Sejarah Deep Learning")
+> 🎥 Klik gambar diatas untuk menonton video: Yann LeCun mendiskusikan sejarah dari Deep Learning dalam pelajaran ini
+
+---
+## 🚀Tantangan
+
+Gali salah satu momen bersejarah ini dan pelajari lebih lanjut tentang orang-orang di baliknya. Ada karakter yang menarik, dan tidak ada penemuan ilmiah yang pernah dibuat dalam kekosongan budaya. Apa yang kamu temukan?
+
+## [Quiz Pasca-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4/)
+
+## Ulasan & Belajar Mandiri
+
+Berikut adalah item untuk ditonton dan didengarkan:
+
+[Podcast dimana Amy Boyd mendiskusikan evolusi dari AI](http://runasradio.com/Shows/Show/739)
+
+[![Sejarah AI oleh Amy Boyd](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "Sejarah AI oleh Amy Boyd")
+
+## Tugas
+
+[Membuat sebuah *timeline*](assignment.id.md)
diff --git a/1-Introduction/2-history-of-ML/translations/README.it.md b/1-Introduction/2-history-of-ML/translations/README.it.md
new file mode 100644
index 0000000000..c7d5ce2345
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/README.it.md
@@ -0,0 +1,118 @@
+# Storia di machine learning
+
+![Riepilogo della storia di machine learning in uno sketchnote](../../../sketchnotes/ml-history.png)
+> Sketchnote di [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3/)
+
+In questa lezione, si camminerà attraverso le principali pietre miliari nella storia di machine learning e dell'intelligenza artificiale.
+
+La storia dell'intelligenza artificiale, AI, come campo è intrecciata con la storia di machine learning, poiché gli algoritmi e i progressi computazionali alla base di machine learning hanno contribuito allo sviluppo dell'intelligenza artificiale. È utile ricordare che, mentre questi campi come distinte aree di indagine hanno cominciato a cristallizzarsi negli anni '50, importanti [scoperte algoritmiche, statistiche, matematiche, computazionali e tecniche](https://wikipedia.org/wiki/Timeline_of_machine_learning) hanno preceduto e si sono sovrapposte a questa era. In effetti, le persone hanno riflettuto su queste domande per [centinaia di anni](https://wikipedia.org/wiki/History_of_artificial_intelligence); questo articolo discute le basi intellettuali storiche dell'idea di una "macchina pensante".
+
+## Scoperte rilevanti
+
+- 1763, 1812 [Teorema di Bayes](https://it.wikipedia.org/wiki/Teorema_di_Bayes) e suoi predecessori. Questo teorema e le sue applicazioni sono alla base dell'inferenza, descrivendo la probabilità che un evento si verifichi in base alla conoscenza precedente.
+- 1805 [Metodo dei Minimi Quadrati](https://it.wikipedia.org/wiki/Metodo_dei_minimi_quadrati) del matematico francese Adrien-Marie Legendre. Questa teoria, che verrà trattata nell'unità Regressione, aiuta nell'adattamento dei dati.
+- 1913 [Processo Markoviano](https://it.wikipedia.org/wiki/Processo_markoviano) dal nome del matematico russo Andrey Markov è usato per descrivere una sequenza di possibili eventi basati su uno stato precedente.
+- 1957 [Percettrone](https://it.wikipedia.org/wiki/Percettrone) è un tipo di classificatore lineare inventato dallo psicologo americano Frank Rosenblatt che sta alla base dei progressi nel deep learning.
+- 1967 [Nearest Neighbor](https://wikipedia.org/wiki/Nearest_neighbor) è un algoritmo originariamente progettato per mappare i percorsi. In un contesto ML viene utilizzato per rilevare i modelli.
+- 1970 [La Retropropagazione dell'Errore](https://it.wikipedia.org/wiki/Retropropagazione_dell'errore) viene utilizzata per addestrare [le reti neurali feed-forward](https://it.wikipedia.org/wiki/Rete_neurale_feed-forward).
+- Le [Reti Neurali Ricorrenti](https://it.wikipedia.org/wiki/Rete_neurale_ricorrente) del 1982 sono reti neurali artificiali derivate da reti neurali feedforward che creano grafici temporali.
+
+✅ Fare una piccola ricerca. Quali altre date si distinguono come fondamentali nella storia del machine learning e dell'intelligenza artificiale?
+## 1950: Macchine che pensano
+
+Alan Turing, una persona davvero notevole che è stata votata [dal pubblico nel 2019](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century) come il più grande scienziato del XX secolo, è accreditato per aver contribuito a gettare le basi per il concetto di "macchina in grado di pensare". Ha affrontato gli oppositori e il suo stesso bisogno di prove empiriche di questo concetto in parte creando il [Test di Turing](https://www.bbc.com/news/technology-18475646), che verrà esplorato nelle lezioni di NLP (elaborazione del linguaggio naturale).
+
+## 1956: Progetto di Ricerca Estivo Dartmouth
+
+"Il Dartmouth Summer Research Project sull'intelligenza artificiale è stato un evento seminale per l'intelligenza artificiale come campo", qui è stato coniato il termine "intelligenza artificiale" ([fonte](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth)).
+
+> In linea di principio, ogni aspetto dell'apprendimento o qualsiasi altra caratteristica dell'intelligenza può essere descritto in modo così preciso che si può costruire una macchina per simularlo.
+
+Il ricercatore capo, il professore di matematica John McCarthy, sperava "di procedere sulla base della congettura che ogni aspetto dell'apprendimento o qualsiasi altra caratteristica dell'intelligenza possa in linea di principio essere descritta in modo così preciso che si possa costruire una macchina per simularlo". I partecipanti includevano un altro luminare nel campo, Marvin Minsky.
+
+Il workshop è accreditato di aver avviato e incoraggiato diverse discussioni tra cui "l'ascesa di metodi simbolici, sistemi focalizzati su domini limitati (primi sistemi esperti) e sistemi deduttivi contro sistemi induttivi". ([fonte](https://wikipedia.org/wiki/Dartmouth_workshop)).
+
+## 1956 - 1974: "Gli anni d'oro"
+
+Dagli anni '50 fino alla metà degli anni '70, l'ottimismo era alto nella speranza che l'AI potesse risolvere molti problemi. Nel 1967, Marvin Minsky dichiarò con sicurezza che "Entro una generazione... il problema della creazione di 'intelligenza artificiale' sarà sostanzialmente risolto". (Minsky, Marvin (1967), Computation: Finite and Infinite Machines, Englewood Cliffs, N.J.: Prentice-Hall)
+
+La ricerca sull'elaborazione del linguaggio naturale è fiorita, la ricerca è stata perfezionata e resa più potente ed è stato creato il concetto di "micro-mondi", in cui compiti semplici sono stati completati utilizzando istruzioni in linguaggio semplice.
+
+La ricerca è stata ben finanziata dalle agenzie governative, sono stati fatti progressi nel calcolo e negli algoritmi e sono stati costruiti prototipi di macchine intelligenti. Alcune di queste macchine includono:
+
+* [Shakey il robot](https://wikipedia.org/wiki/Shakey_the_robot), che poteva manovrare e decidere come eseguire i compiti "intelligentemente".
+
+ ![Shakey, un robot intelligente](../images/shakey.jpg)
+ > Shakey nel 1972
+
+* Eliza, una delle prime "chatterbot", poteva conversare con le persone e agire come una "terapeuta" primitiva. Si Imparerà di più su Eliza nelle lezioni di NLP.
+
+ ![Eliza, un bot](../images/eliza.png)
+ > Una versione di Eliza, un chatbot
+
+* Il "mondo dei blocchi" era un esempio di un micromondo in cui i blocchi potevano essere impilati e ordinati e si potevano testare esperimenti su macchine per insegnare a prendere decisioni. I progressi realizzati con librerie come [SHRDLU](https://it.wikipedia.org/wiki/SHRDLU) hanno contribuito a far progredire l'elaborazione del linguaggio.
+
+ [![Il mondo dei blocchi con SHRDLU](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "Il mondo dei blocchi con SHRDLU")
+
+ > 🎥 Fare clic sull'immagine sopra per un video: Blocks world con SHRDLU
+
+## 1974 - 1980: "L'inverno dell'AI"
+
+Verso la metà degli anni '70, era diventato evidente che la complessità della creazione di "macchine intelligenti" era stata sottovalutata e che la sua promessa, data la potenza di calcolo disponibile, era stata esagerata. I finanziamenti si sono prosciugati e la fiducia nel settore è rallentata. Alcuni problemi che hanno influito sulla fiducia includono:
+
+- **Limitazioni**. La potenza di calcolo era troppo limitata.
+- **Esplosione combinatoria**. La quantità di parametri necessari per essere addestrati è cresciuta in modo esponenziale man mano che veniva chiesto di più ai computer, senza un'evoluzione parallela della potenza e delle capacità di calcolo.
+- **Scarsità di dati**. C'era una scarsità di dati che ostacolava il processo di test, sviluppo e perfezionamento degli algoritmi.
+- **Stiamo facendo le domande giuste?**. Le stesse domande che venivano poste cominciarono ad essere messe in discussione. I ricercatori hanno iniziato a criticare i loro approcci:
+ - I test di Turing furono messi in discussione attraverso, tra le altre idee, la "teoria della stanza cinese" che postulava che "la programmazione di un computer digitale può far sembrare che capisca il linguaggio ma non potrebbe produrre una vera comprensione". ([fonte](https://plato.stanford.edu/entries/chinese-room/))
+ - L'etica dell'introduzione di intelligenze artificiali come la "terapeuta" ELIZA nella società è stata messa in discussione.
+
+Allo stesso tempo, iniziarono a formarsi varie scuole di pensiero sull'AI. È stata stabilita una dicotomia tra pratiche ["scruffy" contro "neat AI"](https://wikipedia.org/wiki/Neats_and_scruffies). I laboratori _scruffy_ ottimizzavano i programmi per ore fino a quando non ottenevano i risultati desiderati. I laboratori _Neat_ "si focalizzavano sulla logica e sulla risoluzione formale dei problemi". ELIZA e SHRDLU erano ben noti _sistemi scruffy_. Negli anni '80, quando è emersa la richiesta di rendere riproducibili i sistemi ML, l'_approccio neat_ ha gradualmente preso il sopravvento in quanto i suoi risultati sono più spiegabili.
+
+## Sistemi esperti degli anni '80
+
+Man mano che il settore cresceva, i suoi vantaggi per le imprese diventavano più chiari e negli anni '80 lo stesso accadeva con la proliferazione di "sistemi esperti". "I sistemi esperti sono stati tra le prime forme di software di intelligenza artificiale (AI) di vero successo". ([fonte](https://wikipedia.org/wiki/Expert_system)).
+
+Questo tipo di sistema è in realtà _ibrido_, costituito in parte da un motore di regole che definisce i requisiti aziendali e un motore di inferenza che sfrutta il sistema di regole per dedurre nuovi fatti.
+
+Questa era ha visto anche una crescente attenzione rivolta alle reti neurali.
+
+## 1987 - 1993: AI 'Chill'
+
+La proliferazione di hardware specializzato per sistemi esperti ha avuto lo sfortunato effetto di diventare troppo specializzato. L'ascesa dei personal computer ha anche gareggiato con questi grandi sistemi centralizzati specializzati. La democratizzazione dell'informatica era iniziata e alla fine ha spianato la strada alla moderna esplosione dei big data.
+
+## 1993 - 2011
+
+Questa epoca ha visto una nuova era per ML e AI per essere in grado di risolvere alcuni dei problemi che erano stati causati in precedenza dalla mancanza di dati e potenza di calcolo. La quantità di dati ha iniziato ad aumentare rapidamente e a diventare più ampiamente disponibile, nel bene e nel male, soprattutto con l'avvento degli smartphone intorno al 2007. La potenza di calcolo si è ampliata in modo esponenziale e gli algoritmi si sono evoluti di pari passo. Il campo ha iniziato a maturare quando i giorni a ruota libera del passato hanno iniziato a cristallizzarsi in una vera disciplina.
+
+## Adesso
+
+Oggi, machine learning e intelligenza artificiale toccano quasi ogni parte della nostra vita. Questa era richiede un'attenta comprensione dei rischi e dei potenziali effetti di questi algoritmi sulle vite umane. Come ha affermato Brad Smith di Microsoft, "La tecnologia dell'informazione solleva questioni che vanno al cuore delle protezioni fondamentali dei diritti umani come la privacy e la libertà di espressione. Questi problemi aumentano la responsabilità delle aziende tecnologiche che creano questi prodotti. A nostro avviso, richiedono anche un'attenta regolamentazione del governo e lo sviluppo di norme sugli usi accettabili" ([fonte](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/)).
+
+Resta da vedere cosa riserva il futuro, ma è importante capire questi sistemi informatici e il software e gli algoritmi che eseguono. Ci si augura che questo programma di studi aiuti ad acquisire una migliore comprensione in modo che si possa decidere in autonomia.
+
+[![La storia del deep learning](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "La storia del deep learning")
+> 🎥 Fare clic sull'immagine sopra per un video: Yann LeCun discute la storia del deep learning in questa lezione
+
+---
+
+## 🚀 Sfida
+
+Approfondire uno di questi momenti storici e scoprire di più sulle persone che stanno dietro ad essi. Ci sono personaggi affascinanti e nessuna scoperta scientifica è mai stata creata in un vuoto culturale. Cosa si è scoperto?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4/)
+
+## Revisione e Auto Apprendimento
+
+Ecco gli elementi da guardare e ascoltare:
+
+[Questo podcast in cui Amy Boyd discute l'evoluzione dell'AI](http://runasradio.com/Shows/Show/739)
+
+[![La storia dell'AI di Amy Boyd](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "La storia dell'AI di Amy Boyd")
+
+## Compito
+
+[Creare una sequenza temporale](assignment.it.md)
diff --git a/1-Introduction/2-history-of-ML/translations/README.ja.md b/1-Introduction/2-history-of-ML/translations/README.ja.md
new file mode 100644
index 0000000000..6ba32096aa
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/README.ja.md
@@ -0,0 +1,114 @@
+# 機械学習の歴史
+
+![機械学習の歴史をまとめたスケッチ](../../../sketchnotes/ml-history.png)
+> [Tomomi Imura](https://www.twitter.com/girlie_mac)によるスケッチ
+
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3?loc=ja)
+
+この授業では、機械学習と人工知能の歴史における主要な出来事を紹介します。
+
+人工知能(AI)の歴史は、機械学習の歴史と密接に関係しています。なぜならば、機械学習を支えるアルゴリズムと計算の進歩が、AIの発展につながったからです。これらの分野は、1950年代に明確になり始めましたが、重要な[アルゴリズム、統計、数学、計算、技術的な発見](https://wikipedia.org/wiki/Timeline_of_machine_learning)は、この時代よりも前に、そして同時に行われていたことを覚えておくとよいでしょう。実際、人々は[何百年も](https://wikipedia.org/wiki/History_of_artificial_intelligence)この問題について考えてきました。(この記事では、「考える機械」というアイデアの歴史的な知的基盤について説明されています。)
+
+
+## 注目すべき発見
+- 1763年、1812年 [ベイズの定理](https://wikipedia.org/wiki/Bayes%27_theorem)とその前身の発見。ある事象が起こる確率を、事前の知識に基づいて記述する推論の基礎となる定理とその応用。
+- 1805年 フランスの数学者アドリアン=マリー・レジェンドルによる[最小二乗理論](https://wikipedia.org/wiki/Least_squares)。この理論は、データのフィッティングに役立つ。
+- 1913年 ロシアの数学者アンドレイ・マルコフにちなんで名付けられた[マルコフ連鎖](https://wikipedia.org/wiki/Markov_chain)は、以前の状態に基づいて起こりうる一連の事象を記述するために使用される。
+- 1957年 [パーセプトロン](https://wikipedia.org/wiki/Perceptron)は、アメリカの心理学者フランク・ローゼンブラットが発明した線形分類器の一種であり、深層学習の基盤となっている。
+- 1967年 [最近傍法](https://wikipedia.org/wiki/Nearest_neighbor)は、元々は経路探索のために考案されたアルゴリズム。MLではパターンの検出に用いられる。
+- 1970年 [バックプロパゲーション](https://wikipedia.org/wiki/Backpropagation)を用いて[フィードフォワード・ニューラルネットワーク(順伝播型ニューラルネットワーク)](https://wikipedia.org/wiki/Feedforward_neural_network)を学習する。
+- 1982年 [回帰型ニューラルネットワーク](https://wikipedia.org/wiki/Recurrent_neural_network) は、フィードフォワード・ニューラルネットワークから派生した人工的なニューラルネットワークで、時間的なグラフを作成します。
+
+✅ 少し調べてみてください。MLとAIの歴史の中で重要な日付は他にありますか?
+
+## 1950: 思考する機械
+アラン・チューリングは、[2019年に世間から](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century)20世紀最大の科学者として投票された、真に優れた人物で、「考えることができる機械」という概念の基礎を築くのに貢献したとされています。彼は、この先自然言語処理の授業で触れることとなる[チューリング・テスト](https://www.bbc.com/news/technology-18475646)を作成することで、否定的な意見に応え、この概念の実証的な証拠を求める自分自身の思いにも向き合いました。
+
+## 1956: ダートマス・サマー・リサーチ・プロジェクト
+ダートマス・サマー・リサーチ・プロジェクトは、分野としての人工知能にとって重要な出来事であり、ここで「人工知能」という言葉が作られました([出典](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth))
+
+> 学習やその他の知能のあらゆる側面は、原理的に非常に正確に記述することができるので、それをシミュレートする機械を作ることができる。
+
+主任研究者である数学のジョン・マッカーシー教授は、「学習のあらゆる側面や知能のその他の特徴は、原理的に非常に正確に記述することができるので、それをシミュレートする機械を作ることができるという推測に基づいて進めていきたい」と考えていました。参加者の中には、この分野の著名人であるマービン・ミンスキーもいました。
+
+このワークショップでは、「記号的手法の台頭、限定された領域に焦点を当てたシステム(初期のエキスパートシステム)、演繹的システムと帰納的システムの比較」などの議論が開始され、促進されたと評価されています。([出典](https://wikipedia.org/wiki/Dartmouth_workshop))
+
+## 1956 - 1974: 黄金期
+
+1950年代から70年代半ばまでは、AIがさまざまな問題を解決してくれるのではないかという楽観的な見方が広がっていました。1967年、マービン・ミンスキーは「一世代のうちに...『人工知能』を作るという問題は実質的に解決されるだろう」と自信を持って述べている。(Minsky, Marvin (1967), Computation: Finite and Infinite Machines, Englewood Cliffs, N.J.: Prentice-Hall)
+
+自然言語処理の研究が盛んになり、検索が洗練されてより強力になり、平易な言語による指示で簡単な作業をこなす「マイクロワールド」という概念が生まれた。
+
+研究は政府機関から潤沢な資金が提供され、計算とアルゴリズムが進歩し、知的機械のプロトタイプが作られた。その中には次のようなものがある。
+
+* 移動したり、タスクを実行する方法を「知的に」決定することができるロボット[「Shakey」](https://wikipedia.org/wiki/Shakey_the_robot)
+
+ ![知的なロボットであるShakey](../images/shakey.jpg)
+ > 1972年のShakey
+
+* 初期の「おしゃべりロボット」であるElizaは、人と会話することができ、原始的な「セラピスト」の役割を果たした。エリザについては、NLPのレッスンで詳しく説明します。
+
+ ![BotであるEliza](../images/eliza.png)
+ > チャットボットEliza
+
+* 「Blocks world」は、ブロックを積み上げたり並べ替えたりするマイクロワールドの一例で、機械に判断力を身につけさせる実験を行った。[SHRDLU](https://wikipedia.org/wiki/SHRDLU)をはじめとするライブラリの進歩は、言語処理の発展に大きく貢献した。
+
+ [![SHRDLUを用いたblocks world](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "SHRDLUを用いたblocks world")
+
+ > 🎥 上の画像をクリックすると動画が見られます:"SHRDLUを用いたblocks world"
+
+## 1974 - 1980: AIの冬
+
+1970年代半ばになると、「知的な機械」を作ることの複雑さが過小評価されていたことや、利用可能な計算能力を考慮すると、その将来性が過大評価されていたことが明らかになりました。資金が枯渇し、この分野への信頼は低下しました。信頼に影響を与えた問題には、以下のようなものがあります:
+
+- **限界**. 計算能力の限界
+- **組み合わせの爆発**. 学習に必要なパラメータの量は、コンピュータに要求されることが多くなるにつれて指数関数的に増加しましたが、コンピュータの性能や能力は並行して進化しませんでした。
+- **データの少なさ**. データが不足していたため、アルゴリズムのテスト、開発、改良のプロセスが妨げられた。
+- **正しい質問をしているのかどうか**. 問いかけていた質問そのものが疑問視され始めた。研究者たちは、自分たちのアプローチに批判的な意見を持つようになった。
+ - チューリングテストは、「コンピュータをプログラミングすることで、言語を理解しているように見せかけることはできるが、本当の意味での理解はできない」とする「チャイニーズルーム理論」などによって、疑問視されるようになった。([出典](https://plato.stanford.edu/entries/chinese-room/))
+ - セラピストとしてELIZAのような人工知能を社会に導入することの倫理性が問われた。
+
+それと同時に、さまざまなAIの流派が形成され始めました。["Scruffy"と"Neat AI"](https://wikipedia.org/wiki/Neats_and_scruffies)という二分法が確立されました。Scruffyな研究室では、目的の結果が得られるまで何時間もプログラムをいじっていました。一方、neatな研究室では「論理と形式的な問題解決」を重視しました。ELIZAやSHRDLUは、よく知られたScruffyなシステムでした。1980年代に入ってMLシステムの再現性が求められるようになると、結果がより説明可能であることから、次第にneatなアプローチが主流になっていきました。
+
+## 1980s エキスパートシステム
+
+分野が発展するにつれ、ビジネスへの貢献が明確になり、1980年代には「エキスパートシステム」が普及しました。「エキスパートシステムは、人工知能(AI)ソフトウェアの中で最初に真に成功した形態の一つである。」と言われています。([出典](https://wikipedia.org/wiki/Expert_system))
+
+このタイプのシステムは、ビジネス要件を定義するルールエンジンと、ルールシステムを活用して新たな事実を推論する推論エンジンで構成されるハイブリッド型です。
+
+また、この時代はニューラルネットワークにも注目が集まった。
+
+## 1987 - 1993: AIの冷え込み
+
+専門分野に特化したエキスパートシステムのハードウェアが普及したことで、専門性が高くなりすぎてしまうという残念な結果になりました。また、パーソナルコンピュータの台頭は、これらの大規模で専門的な中央集権的システムと競合した。コンピューティングの民主化が始まり、最終的には現代の爆発的なビッグデータへの道が開かれました。
+
+## 1993 - 2011
+
+この期間では、それ以前にデータと計算能力の不足によって引き起こされていた問題を、MLやAIが解決できるようになっていた。特に2007年頃にスマートフォンが登場したことで、良くも悪くもデータ量が急速に増加し、広く利用されるようになりました。計算機の性能も飛躍的に向上し、アルゴリズムもそれに合わせて進化していきました。過去の自由奔放な時代から、真の学問としての結晶化が始まり、この分野は成熟していきました。
+
+## 現在
+
+現在、機械学習やAIは、私たちの生活のほぼすべての部分に関わっています。このような時代には、これらのアルゴリズムが人間の生活に及ぼすリスクや潜在的な影響を注意深く理解することが求められます。マイクロソフトのブラッド・スミスは、「情報技術は、プライバシーや表現の自由といった基本的な人権保護の核心に迫る問題を提起します。こうした問題は、これらの製品を生み出すテクノロジー企業の責任を一層重くします。我々の見解では、これらの問題は、政府による思慮深い規制と、許容される使用方法に関する規範の策定を必要としています。」と述べています。([出典](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/))
+
+未来がどうなるかはまだわかりませんが、これらのコンピュータシステムと、それを動かすソフトウェアやアルゴリズムを理解することは重要です。このカリキュラムが自身で判断するにあたり、より良い理解を助けるものになると幸いです。
+
+[![ディープラーニングの歴史](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "ディープラーニングの歴史")
+> 🎥 上の画像をクリックすると動画が見られます:このレクチャーではYann LeCunがディープラーニングの歴史について議論しています。
+
+---
+## 🚀Challenge
+
+これらの歴史的瞬間の1つを掘り下げて、その背後にいる人々について学びましょう。魅力的な人々がいますし、文化的に空白の状態で科学的発見がなされたことはありません。どういったことが見つかるでしょうか?
+
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4?loc=ja)
+
+## 振り返りと自習
+
+視聴するべき教材は以下になります:
+
+[Amy BoydがAIの進化について述べているポッドキャスト](http://runasradio.com/Shows/Show/739)
+
+[![Amy BoydによるAIの歴史](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "Amy BoydによるAIの歴史")
+
+## 課題
+
+[年表を作成する](./assignment.ja.md)
diff --git a/1-Introduction/2-history-of-ML/translations/README.tr.md b/1-Introduction/2-history-of-ML/translations/README.tr.md
new file mode 100644
index 0000000000..af2346fb7e
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/README.tr.md
@@ -0,0 +1,117 @@
+# Makine öğreniminin tarihi
+
+![Bir taslak-notta makine öğrenimi geçmişinin özeti](../../../sketchnotes/ml-history.png)
+> [Tomomi Imura](https://www.twitter.com/girlie_mac) tarafından hazırlanan taslak-not
+
+## [Ders öncesi test](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3?loc=tr)
+
+Bu derste, makine öğrenimi ve yapay zeka tarihindeki önemli kilometre taşlarını inceleyeceğiz.
+
+Bir alan olarak yapay zekanın (AI) tarihi, makine öğreniminin tarihi ile iç içedir, çünkü makine öğrenimini destekleyen algoritmalar ve bilgi-işlem kapasitesindeki ilerlemeler, yapay zekanın gelişimini beslemektedir. Ayrı bilim alanları olarak bu alanlar 1950'lerde belirginleşmeye başlarken, önemli [algoritmik, istatistiksel, matematiksel, hesaplamalı ve teknik keşiflerin](https://wikipedia.org/wiki/Timeline_of_machine_learning) bir kısmı bu dönemden önce gelmiş ve bir kısmı da bu dönem ile örtüşmüştür. Aslında, insanlar [yüzlerce yıldır](https://wikipedia.org/wiki/History_of_artificial_intelligence) bu soruları düşünüyorlar: bu makale bir 'düşünen makine' fikrinin tarihsel entelektüel temellerini tartışıyor.
+
+## Önemli keşifler
+
+- 1763, 1812 - [Bayes Teoremi](https://tr.wikipedia.org/wiki/Bayes_teoremi) ve öncülleri. Bu teorem ve uygulamaları, önceki bilgilere dayalı olarak meydana gelen bir olayın olasılığını tanımlayan çıkarımın temelini oluşturur.
+- 1805 - [En Küçük Kareler Teorisi](https://tr.wikipedia.org/wiki/En_k%C3%BC%C3%A7%C3%BCk_kareler_y%C3%B6ntemi), Fransız matematikçi Adrien-Marie Legendre tarafından bulunmuştur. Regresyon ünitemizde öğreneceğiniz bu teori, makine öğrenimi modelini veriye uydurmada yardımcı olur.
+- 1913 - Rus matematikçi Andrey Markov'un adını taşıyan [Markov Zincirleri](https://tr.wikipedia.org/wiki/Markov_zinciri), önceki bir duruma dayalı olası olaylar dizisini tanımlamak için kullanılır.
+- 1957 - [Algılayıcı (Perceptron)](https://tr.wikipedia.org/wiki/Perceptron), derin öğrenmedeki ilerlemelerin temelini oluşturan Amerikalı psikolog Frank Rosenblatt tarafından icat edilen bir tür doğrusal sınıflandırıcıdır.
+- 1967 - [En Yakın Komşu](https://wikipedia.org/wiki/Nearest_neighbor), orijinal olarak rotaları haritalamak için tasarlanmış bir algoritmadır. Bir ML bağlamında kalıpları tespit etmek için kullanılır.
+- 1970 - [Geri Yayılım](https://wikipedia.org/wiki/Backpropagation), [ileri beslemeli sinir ağlarını](https://wikipedia.org/wiki/Feedforward_neural_network) eğitmek için kullanılır.
+- 1982 - [Tekrarlayan Sinir Ağları](https://wikipedia.org/wiki/Recurrent_neural_network), zamansal grafikler oluşturan ileri beslemeli sinir ağlarından türetilen yapay sinir ağlarıdır.
+
+✅ Biraz araştırma yapın. Makine öğrenimi ve yapay zeka tarihinde önemli olan başka hangi tarihler öne çıkıyor?
+
+## 1950: Düşünen makineler
+
+[2019'da halk tarafından](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century) 20. yüzyılın en büyük bilim adamı seçilen gerçekten dikkate değer bir kişi olan Alan Turing'in, 'düşünebilen makine' kavramının temellerini attığı kabul edilir. Kendisine karşı çıkanlara yanıt olması için ve bu kavramın deneysel kanıtlarını bulma ihtiyacı sebebiyle, NLP derslerimizde keşfedeceğiniz [Turing Testi'ni](https://www.bbc.com/news/technology-18475646) oluşturdu.
+
+## 1956: Dartmouth Yaz Araştırma Projesi
+
+"Yapay zeka üzerine Dartmouth Yaz Araştırma Projesi", bir alan olarak yapay zeka için çığır açan bir olaydı ve burada 'yapay zeka' terimi ortaya çıktı ([kaynak](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth)).
+
+> Öğrenmenin her yönü veya zekanın diğer herhangi bir özelliği, prensipte o kadar kesin bir şekilde tanımlanabilir ki, onu simüle etmek için bir makine yapılabilir.
+
+Baş araştırmacı, matematik profesörü John McCarthy, "öğrenmenin her yönünün veya zekanın diğer herhangi bir özelliğinin prensipte oldukça kesin bir şekilde tanımlanabileceği varsayımına dayanarak, onu simüle etmek için bir makine yapılabileceği" varsayımının doğru olmasını umarak ilerliyordu. Katılımcılar arasında bu alanın bir diğer önderi olan Marvin Minsky de vardı.
+
+Çalıştay, "sembolik yöntemlerin yükselişi, sınırlı alanlara odaklanan sistemler (ilk uzman sistemler) ve tümdengelimli sistemlere karşı tümevarımlı sistemler" dahil olmak üzere çeşitli tartışmaları başlatmış ve teşvik etmiştir. ([kaynak](https://tr.wikipedia.org/wiki/Dartmouth_Konferans%C4%B1)).
+
+## 1956 - 1974: "Altın yıllar"
+
+1950'lerden 70'lerin ortalarına kadar, yapay zekanın birçok sorunu çözebileceği umuduyla iyimserlik arttı. 1967'de Marvin Minsky kendinden emin bir şekilde "Bir nesil içinde... 'yapay zeka' yaratma sorunu büyük ölçüde çözülecek" dedi. (Minsky, Marvin (1967), Computation: Finite and Infinite Machines, Englewood Cliffs, N.J.: Prentice-Hall)
+
+Doğal dil işleme araştırmaları gelişti, aramalar iyileştirildi ve daha güçlü hale getirildi, ve basit görevlerin sade dil talimatları kullanılarak tamamlandığı 'mikro dünyalar' kavramı yaratıldı.
+
+Araştırmalar, devlet kurumları tarafından iyi finanse edildi, hesaplamalar ve algoritmalarda ilerlemeler kaydedildi ve akıllı makinelerin prototipleri yapıldı. Bu makinelerden bazıları şunlardır:
+
+* [Robot Shakey](https://wikipedia.org/wiki/Shakey_the_robot), manevra yapabilir ve görevleri 'akıllıca' nasıl yerine getireceğine karar verebilir.
+
+ ![Shakey, akıllı bir robot](../images/shakey.jpg)
+ > 1972'de Shakey
+
+* Erken bir 'sohbet botu' olan Eliza, insanlarla sohbet edebilir ve ilkel bir 'terapist' gibi davranabilirdi. NLP derslerinde Eliza hakkında daha fazla bilgi edineceksiniz.
+
+ ![Eliza, bir bot](../images/eliza.png)
+ > Bir sohbet robotu olan Eliza'nın bir versiyonu
+
+* "Dünya Blokları", blokların üst üste koyulabilecekleri, sıralanabilecekleri ve karar vermeyi öğreten makinelerdeki deneylerin test edilebileceği bir mikro dünyaya örnekti. [SHRDLU](https://wikipedia.org/wiki/SHRDLU) gibi kütüphanelerle oluşturulan gelişmeler, dil işlemeyi ilerletmeye yardımcı oldu.
+
+ [![SHRDLU ile Dünya Blokları](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "SHRDLU ile Dünya Blokları" )
+
+ > 🎥 Video için yukarıdaki resme tıklayın: SHRDLU ile Dünya Blokları
+
+## 1974 - 1980: "Yapay Zekâ Kışı"
+
+1970'lerin ortalarına gelindiğinde, 'akıllı makineler' yapmanın karmaşıklığının hafife alındığı ve mevcut hesaplama gücü göz önüne alındığında, verilen vaatlerin abartıldığı ortaya çıktı. Finansman kurudu ve alana olan güven azaldı. Güveni etkileyen bazı sorunlar şunlardı:
+
+- **Kısıtlamalar**. Hesaplama gücü çok sınırlıydı.
+- **Kombinasyonel patlama**. Hesaplama gücü ve yeteneğinde paralel bir evrim olmaksızın, bilgisayarlardan daha fazla soru istendikçe, eğitilmesi gereken parametre miktarı katlanarak arttı.
+- **Veri eksikliği**. Algoritmaları test etme, geliştirme ve iyileştirme sürecini engelleyen bir veri kıtlığı vardı.
+- **Doğru soruları mı soruyoruz?**. Sorulan sorular sorgulanmaya başlandı. Araştırmacılar mevcut yaklaşımları eleştirmeye başladı:
+ - Turing testleri, diğer fikirlerin yanı sıra, "Çin odası teorisi" aracılığıyla sorgulanmaya başlandı. Bu teori, "dijital bir bilgisayar, programlanarak dili anlıyormuş gibi gösterilebilir fakat gerçek bir dil anlayışı elde edilemez" savını öne sürmektedir. ([kaynak](https://plato.stanford.edu/entries/chinese-room/))
+ - "Terapist" ELIZA gibi yapay zekaların topluma tanıtılmasının etiğine meydan okundu.
+
+Aynı zamanda, çeşitli yapay zekâ düşünce okulları oluşmaya başladı. ["dağınık" ile "düzenli AI"](https://wikipedia.org/wiki/Neats_and_scruffies) uygulamaları arasında bir ikilem kuruldu. _Dağınık_ laboratuvarlar, istenen sonuçları elde edene kadar programlar üzerinde saatlerce ince ayar yaptı. _Düzenli_ laboratuvarlar "mantık ve biçimsel problem çözmeye odaklandı". ELIZA ve SHRDLU, iyi bilinen _dağınık_ sistemlerdi. 1980'lerde, ML sistemlerinin sonuçlarını tekrarlanabilir hale getirmek için talep ortaya çıktıkça, sonuçları daha açıklanabilir olduğu için _düzenli_ yaklaşım yavaş yavaş ön plana çıktı.
+
+## 1980'ler: Uzman sistemler
+
+Alan büyüdükçe, şirketlere olan faydası daha net hale geldi ve 1980'lerde 'uzman sistemlerin' yaygınlaşması da bu şekilde meydana geldi. "Uzman sistemler, yapay zeka (AI) yazılımlarının gerçek anlamda başarılı olan ilk formları arasındaydı." ([kaynak](https://tr.wikipedia.org/wiki/Uzman_sistemler)).
+
+Bu sistem türü aslında kısmen iş gereksinimlerini tanımlayan bir kural aracından ve yeni gerçekleri çıkarmak için kurallar sisteminden yararlanan bir çıkarım aracından oluşan bir _melezdir_.
+
+Bu çağda aynı zamanda sinir ağlarına artan ilgi de görülmüştür.
+
+## 1987 - 1993: Yapay Zeka 'Soğuması'
+
+Özelleşmiş uzman sistem donanımının yaygınlaşması, talihsiz bir şekilde bunları aşırı özelleşmiş hale getirdi. Kişisel bilgisayarların yükselişi de bu büyük, özelleşmiş, merkezi sistemlerle rekabet etti. Bilgisayarın demokratikleşmesi başlamıştı ve sonunda modern büyük veri patlamasının yolunu açtı.
+
+## 1993 - 2011
+
+Bu çağ, daha önce veri ve hesaplama gücü eksikliğinden kaynaklanan bazı sorunları çözebilmek için ML ve AI için yeni bir dönemi getirdi. Veri miktarı hızla artmaya başladı ve özellikle 2007'de akıllı telefonun ortaya çıkmasıyla birlikte iyisiyle kötüsüyle daha yaygın bir şekilde ulaşılabilir hale geldi. Hesaplama gücü katlanarak arttı ve algoritmalar da onunla birlikte gelişti. Geçmişin başıboş günleri gitmiş, yerine giderek olgunlaşan gerçek bir disipline dönüşüm başlamıştı.
+
+## Şimdi
+
+Günümüzde makine öğrenimi ve yapay zeka hayatımızın neredeyse her alanına dokunuyor. Bu çağ, bu algoritmaların insan yaşamı üzerindeki risklerinin ve potansiyel etkilerinin dikkatli bir şekilde anlaşılmasını gerektirmektedir. Microsoft'tan Brad Smith'in belirttiği gibi, "Bilgi teknolojisi, gizlilik ve ifade özgürlüğü gibi temel insan hakları korumalarının kalbine giden sorunları gündeme getiriyor. Bu sorunlar, bu ürünleri yaratan teknoloji şirketlerinin sorumluluğunu artırıyor. Bizim açımızdan bakıldığında, düşünceli hükümet düzenlemeleri ve kabul edilebilir kullanımlar etrafında normların geliştirilmesi için de bir çağrı niteliği taşıyor." ([kaynak](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/)).
+
+Geleceğin neler getireceğini birlikte göreceğiz, ancak bu bilgisayar sistemlerini ve çalıştırdıkları yazılım ve algoritmaları anlamak önemlidir. Bu müfredatın, kendi kararlarınızı verebilmeniz için daha iyi bir anlayış kazanmanıza yardımcı olacağını umuyoruz.
+
+[![Derin öğrenmenin tarihi](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "Derin öğrenmenin tarihi")
+> 🎥 Video için yukarıdaki resme tıklayın: Yann LeCun bu derste derin öğrenmenin tarihini tartışıyor
+
+---
+## 🚀Meydan okuma
+
+Bu tarihi anlardan birine girin ve arkasındaki insanlar hakkında daha fazla bilgi edinin. Büyüleyici karakterler var ve kültürel bir boşlukta hiçbir bilimsel keşif yaratılmadı. Ne keşfedersiniz?
+
+## [Ders sonrası test](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4?loc=tr)
+
+## İnceleme ve Bireysel Çalışma
+
+İşte izlenmesi ve dinlenmesi gerekenler:
+
+[Amy Boyd'un yapay zekanın evrimini tartıştığı bu podcast](http://runasradio.com/Shows/Show/739)
+
+[![Amy Boyd ile Yapay Zekâ'nın tarihi](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "Amy Boyd ile Yapay Zekâ'nın tarihi")
+
+## Ödev
+
+[Bir zaman çizelgesi oluşturun](assignment.tr.md)
\ No newline at end of file
diff --git a/1-Introduction/2-history-of-ML/translations/README.zh-cn.md b/1-Introduction/2-history-of-ML/translations/README.zh-cn.md
new file mode 100644
index 0000000000..70e85f8201
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/README.zh-cn.md
@@ -0,0 +1,116 @@
+# 机器学习的历史
+
+![机器学习历史概述](../../../sketchnotes/ml-history.png)
+> 作者[Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/3/)
+
+在本课中,我们将走过机器学习和人工智能历史上的主要里程碑。
+
+人工智能(AI)作为一个领域的历史与机器学习的历史交织在一起,因为支持机器学习的算法和计算能力的进步推动了AI的发展。需要记住的是,虽然这些领域作为不同的研究领域在20世纪50年代才开始成形,但重要的[算法、统计、数学、计算和技术发现](https://wikipedia.org/wiki/Timeline_of_machine_learning)有的早于这个时代,有的则与之重叠。事实上,[数百年来](https://wikipedia.org/wiki/History_of_artificial_intelligence)人们一直在思考这些问题:本文讨论了“思维机器”这一概念的历史知识基础。
+
+## 主要发现
+
+- 1763, 1812 [贝叶斯定理](https://wikipedia.org/wiki/Bayes%27_theorem) 及其前身。该定理及其应用是推理的基础,描述了基于先验知识的事件发生的概率。
+- 1805 [最小二乘理论](https://wikipedia.org/wiki/Least_squares)由法国数学家Adrien-Marie Legendre提出。 你将在我们的回归单元中了解这一理论,它有助于数据拟合。
+- 1913 [马尔可夫链](https://wikipedia.org/wiki/Markov_chain)以俄罗斯数学家Andrey Markov的名字命名,用于描述基于先前状态的一系列可能事件。
+- 1957 [感知器](https://wikipedia.org/wiki/Perceptron)是美国心理学家Frank Rosenblatt发明的一种线性分类器,是深度学习发展的基础。
+- 1967 [最近邻](https://wikipedia.org/wiki/Nearest_neighbor)是一种最初设计用于映射路线的算法。 在ML中,它用于检测模式。
+- 1970 [反向传播](https://wikipedia.org/wiki/Backpropagation)用于训练[前馈神经网络](https://wikipedia.org/wiki/Feedforward_neural_network)。
+- 1982 [循环神经网络](https://wikipedia.org/wiki/Recurrent_neural_network) 是源自产生时间图的前馈神经网络的人工神经网络。
+
+✅ 做点调查。在ML和AI的历史上,还有哪些日期是重要的?
+
+## 1950: 会思考的机器
+
+Alan Turing 是一位真正杰出的人物,他[在2019年被公众投票选为](https://wikipedia.org/wiki/Icons:_The_Greatest_Person_of_the_20th_Century)20世纪最伟大的科学家,人们认为他为“会思考的机器”这一概念奠定了基础。为了回应质疑者,也为了给这一概念寻找经验证据,他创建了[图灵测试](https://www.bbc.com/news/technology-18475646),你将在我们的 NLP 课程中对其进行探索。
+
+## 1956: 达特茅斯夏季研究项目
+
+“达特茅斯夏季人工智能研究项目是人工智能领域的一个开创性事件,”正是在这里,人们创造了“人工智能”一词([来源](https://250.dartmouth.edu/highlights/artificial-intelligence-ai-coined-dartmouth))。
+
+> 原则上,学习的每个方面或智能的任何其他特征都可以被精确地描述,以至于可以用机器来模拟它。
+
+首席研究员、数学教授John McCarthy希望“基于这样一种猜想,即学习的每个方面或智能的任何其他特征原则上都可以如此精确地描述,以至于可以制造出一台机器来模拟它。” 参与者包括该领域的另一位杰出人物Marvin Minsky。
+
+研讨会被认为发起并鼓励了一些讨论,包括“符号方法的兴起、专注于有限领域的系统(早期专家系统),以及演绎系统与归纳系统的对比。”([来源](https://wikipedia.org/wiki/Dartmouth_workshop))。
+
+## 1956 - 1974: “黄金岁月”
+
+从20世纪50年代到70年代中期,乐观情绪高涨,希望人工智能能够解决许多问题。1967年,Marvin Minsky自信地说,“一代人之内。。。创造‘人工智能’的问题将得到实质性的解决。”(Minsky,Marvin(1967),《计算:有限和无限机器》,新泽西州恩格伍德克利夫斯:Prentice Hall)
+
+自然语言处理研究蓬勃发展,搜索被提炼并变得更加强大,创造了“微观世界”的概念,在这个概念中,简单的任务是用简单的语言指令完成的。
+
+这项研究得到了政府机构的充分资助,在计算和算法方面取得了进展,并建造了智能机器的原型。其中一些机器包括:
+
+* [机器人Shakey](https://wikipedia.org/wiki/Shakey_the_robot),它可以“智能地”移动,并决定如何执行任务。
+
+ ![Shakey, 智能机器人](../images/shakey.jpg)
+ > 1972 年的Shakey
+
+* Eliza,一个早期的“聊天机器人”,可以与人交谈并充当原始的“治疗师”。 你将在NLP课程中了解有关Eliza的更多信息。
+
+ ![Eliza, 机器人](../images/eliza.png)
+ > Eliza的一个版本,一个聊天机器人
+
+* “积木世界”是一个微观世界的例子,在那里积木可以堆叠和分类,并且可以测试教机器做出决策的实验。 使用[SHRDLU](https://wikipedia.org/wiki/SHRDLU)等库构建的高级功能有助于推动语言处理向前发展。
+
+ [![积木世界与SHRDLU](https://img.youtube.com/vi/QAJz4YKUwqw/0.jpg)](https://www.youtube.com/watch?v=QAJz4YKUwqw "积木世界与SHRDLU")
+
+ > 🎥 点击上图观看视频: 积木世界与SHRDLU
+
+## 1974 - 1980: AI的寒冬
+
+到了20世纪70年代中期,很明显制造“智能机器”的复杂性被低估了,而且考虑到可用的计算能力,它的前景被夸大了。资金枯竭,市场信心放缓。影响信心的一些问题包括:
+
+- **限制**。计算能力太有限了
+- **组合爆炸**。随着对计算机的要求越来越高,需要训练的参数数量呈指数级增长,而计算能力却没有平行发展。
+- **缺乏数据**。 缺乏数据阻碍了测试、开发和改进算法的过程。
+- **我们是否在问正确的问题?**。 被问到的问题也开始受到质疑。 研究人员开始对他们的方法提出批评:
+ - 图灵测试受到质疑的方法之一是“中国房间理论”,该理论认为,“对数字计算机进行编程可能使其看起来能理解语言,但不能产生真正的理解。” ([来源](https://plato.stanford.edu/entries/chinese-room/))
+ - 将“治疗师”ELIZA这样的人工智能引入社会的伦理受到了挑战。
+
+与此同时,各种人工智能学派开始形成。 在[“scruffy”与“neat AI”](https://wikipedia.org/wiki/Neats_and_scruffies)之间建立了二分法。 _Scruffy_ 实验室对程序进行了数小时的调整,直到获得所需的结果。 _Neat_ 实验室“专注于逻辑和形式问题的解决”。 ELIZA 和 SHRDLU 是众所周知的 _scruffy_ 系统。 在 1980 年代,随着使 ML 系统可重现的需求出现,_neat_ 方法逐渐走上前沿,因为其结果更易于解释。
+
+## 1980s 专家系统
+
+随着这个领域的发展,它对商业的好处变得越来越明显;到了20世纪80年代,“专家系统”也随之大量涌现。“专家系统是首批真正成功的人工智能 (AI) 软件形式之一。” ([来源](https://wikipedia.org/wiki/Expert_system))。
+
+这种类型的系统实际上是混合系统,部分由定义业务需求的规则引擎和利用规则系统推断新事实的推理引擎组成。
+
+在这个时代,神经网络也越来越受到重视。
+
+## 1987 - 1993: AI的冷静期
+
+专业的专家系统硬件的激增造成了过于专业化的不幸后果。个人电脑的兴起也与这些大型、专业化、集中化系统展开了竞争。计算机的平民化已经开始,它最终为大数据的现代爆炸铺平了道路。
+
+## 1993 - 2011
+
+在这个时期,ML和AI开始能够解决早期由于缺乏数据和计算能力而造成的一些问题。数据量开始迅速增加并变得更容易获得,无论是好是坏,尤其是在2007年左右智能手机出现之后。计算能力呈指数级增长,算法也随之发展。随着过去那些随心所欲的日子逐渐沉淀为一门真正的学科,这个领域开始走向成熟。
+
+## 现在
+
+今天,机器学习和人工智能几乎触及我们生活的每一个部分。这个时代要求仔细了解这些算法对人类生活的风险和潜在影响。正如微软的Brad Smith所言,“信息技术引发的问题触及隐私和言论自由等基本人权保护的核心。这些问题加重了制造这些产品的科技公司的责任。在我们看来,它们还呼吁政府进行深思熟虑的监管,并围绕可接受的用途制定规范”([来源](https://www.technologyreview.com/2019/12/18/102365/the-future-of-ais-impact-on-society/))。
+
+未来的情况还有待观察,但了解这些计算机系统以及它们运行的软件和算法是很重要的。我们希望这门课程能帮助你更好的理解,以便你自己决定。
+
+[![深度学习的历史](https://img.youtube.com/vi/mTtDfKgLm54/0.jpg)](https://www.youtube.com/watch?v=mTtDfKgLm54 "深度学习的历史")
+> 🎥 点击上图观看视频:Yann LeCun在本次讲座中讨论深度学习的历史
+
+---
+## 🚀挑战
+
+深入了解这些历史时刻之一,并更多地了解它们背后的人。这里有许多引人入胜的人物,没有一项科学发现是在文化真空中创造出来的。你发现了什么?
+
+## [课后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/4/)
+
+## 复习与自学
+
+以下是要观看和收听的节目:
+
+[这是Amy Boyd讨论人工智能进化的播客](http://runasradio.com/Shows/Show/739)
+
+[![Amy Boyd的《人工智能史》](https://img.youtube.com/vi/EJt3_bFYKss/0.jpg)](https://www.youtube.com/watch?v=EJt3_bFYKss "Amy Boyd的《人工智能史》")
+
+## 任务
+
+[创建时间线](assignment.zh-cn.md)
diff --git a/1-Introduction/2-history-of-ML/translations/assignment.fr.md b/1-Introduction/2-history-of-ML/translations/assignment.fr.md
new file mode 100644
index 0000000000..c562516e47
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/assignment.fr.md
@@ -0,0 +1,11 @@
+# Créer une frise chronologique
+
+## Instructions
+
+En utilisant [ce repo](https://github.com/Digital-Humanities-Toolkit/timeline-builder), créez une frise chronologique sur certains aspects de l'histoire des algorithmes, des mathématiques, des statistiques, de l'IA ou du machine learning, ou sur une combinaison de ceux-ci. Vous pouvez vous concentrer sur une personne, une idée ou une longue période d'innovations. Assurez-vous d'ajouter des éléments multimédias.
+
+## Rubrique
+
+| Critères | Exemplaire | Adéquate | A améliorer |
+| -------- | ---------------------------------------------------------------- | ------------------------------------ | ------------------------------------------------------------------ |
+| | Une chronologie déployée est présentée sous forme de page GitHub | Le code est incomplet et non déployé | La chronologie est incomplète, pas bien recherchée et pas déployée |
diff --git a/1-Introduction/2-history-of-ML/translations/assignment.id.md b/1-Introduction/2-history-of-ML/translations/assignment.id.md
new file mode 100644
index 0000000000..0ee7c0096c
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/assignment.id.md
@@ -0,0 +1,11 @@
+# Membuat sebuah *timeline*
+
+## Instruksi
+
+Menggunakan [repo ini](https://github.com/Digital-Humanities-Toolkit/timeline-builder), buatlah sebuah *timeline* dari beberapa aspek sejarah algoritma, matematika, statistik, AI, atau ML, atau kombinasi dari semuanya. Kamu dapat fokus pada satu orang, satu ide, atau rentang waktu pemikiran yang panjang. Pastikan untuk menambahkan elemen multimedia.
+
+## Rubrik
+
+| Kriteria | Sangat Bagus | Cukup | Perlu Peningkatan |
+| -------- | ------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------- |
+| | *Timeline* yang dideploy disajikan sebagai halaman GitHub | Kode belum lengkap dan belum dideploy | *Timeline* belum lengkap, belum diriset dengan baik dan belum dideploy |
\ No newline at end of file
diff --git a/1-Introduction/2-history-of-ML/translations/assignment.it.md b/1-Introduction/2-history-of-ML/translations/assignment.it.md
new file mode 100644
index 0000000000..4de7ed1484
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Creare una sequenza temporale
+
+## Istruzioni
+
+Usando [questo repository](https://github.com/Digital-Humanities-Toolkit/timeline-builder), si crei una sequenza temporale di alcuni aspetti della storia di algoritmi, matematica, statistica, AI o ML, o una combinazione di questi. Ci si può concentrare su una persona, un'idea o un lungo lasso di tempo di pensiero. Ci si assicuri di aggiungere elementi multimediali.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------- |
+| | Una sequenza temporale distribuita viene presentata come una pagina GitHub | Il codice è incompleto e non è stato distribuito | La sequenza temporale è incompleta, non ben studiata e non implementata |
diff --git a/1-Introduction/2-history-of-ML/translations/assignment.ja.md b/1-Introduction/2-history-of-ML/translations/assignment.ja.md
new file mode 100644
index 0000000000..f5f7879928
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/assignment.ja.md
@@ -0,0 +1,11 @@
+# 年表を作成する
+
+## 指示
+
+[このリポジトリ](https://github.com/Digital-Humanities-Toolkit/timeline-builder) を使って、アルゴリズム・数学・統計学・人工知能・機械学習、またはこれらの組み合わせに対して、歴史のひとつの側面に関する年表を作成してください。焦点を当てるのは、ひとりの人物・ひとつのアイディア・長期間にわたる思想のいずれのものでも構いません。マルチメディアの要素を必ず加えるようにしてください。
+
+## 評価基準
+
+| 基準 | 模範的 | 十分 | 要改善 |
+| ---- | -------------------------------------- | ------------------------------------ | ------------------------------------------------------------ |
+| | GitHub page に年表がデプロイされている | コードが未完成でデプロイされていない | 年表が未完成で、十分に調査されておらず、デプロイされていない |
diff --git a/1-Introduction/2-history-of-ML/translations/assignment.tr.md b/1-Introduction/2-history-of-ML/translations/assignment.tr.md
new file mode 100644
index 0000000000..f0e877636f
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/assignment.tr.md
@@ -0,0 +1,11 @@
+# Bir zaman çizelgesi oluşturun
+
+## Talimatlar
+
+[Bu repoyu](https://github.com/Digital-Humanities-Toolkit/timeline-builder) kullanarak; algoritmaların, matematiğin, istatistiğin, AI veya ML'in veya bunların bir kombinasyonunun tarihinin bazı yönlerinin bir zaman çizelgesini oluşturun. Bir kişiye, bir fikre veya bir düşüncenin uzun bir zamanına odaklanabilirsiniz. Multimedya öğeleri eklediğinizden emin olun.
+
+## Değerlendirme Listesi
+
+| Kriterler | Takdir edilesi | Yeterli | İyileştirilmesi Lazım |
+| -------- | ------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------- |
+| | Zaman çizelgesi bir GitHub sayfası olarak yayınlanmış | Kod eksik ve henüz yayınlanmamış | Zaman çizelgesi eksik, iyi araştırılmamış ve yayınlanmamış |
\ No newline at end of file
diff --git a/1-Introduction/2-history-of-ML/translations/assignment.zh-cn.md b/1-Introduction/2-history-of-ML/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..adf3ee15ae
--- /dev/null
+++ b/1-Introduction/2-history-of-ML/translations/assignment.zh-cn.md
@@ -0,0 +1,11 @@
+# 建立一个时间轴
+
+## 说明
+
+使用这个 [仓库](https://github.com/Digital-Humanities-Toolkit/timeline-builder),创建一个关于算法、数学、统计学、人工智能或机器学习历史上某个方面的时间轴,也可以综合以上多个学科。你可以着重介绍某个人、某个想法,或者一段较长时期的思想。请确保在你的时间轴中添加多媒体元素。
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| ------------ | ---------------------------------- | ---------------------- | ------------------------------------------ |
+| | 有一个用 GitHub page 展示的 timeline | 代码还不完整并且没有部署 | 时间线不完整,没有经过充分的研究,并且没有部署 |
diff --git a/1-Introduction/3-fairness/README.md b/1-Introduction/3-fairness/README.md
index 063c189813..7e9c8f6d8f 100644
--- a/1-Introduction/3-fairness/README.md
+++ b/1-Introduction/3-fairness/README.md
@@ -3,7 +3,7 @@
![Summary of Fairness in Machine Learning in a sketchnote](../../sketchnotes/ml-fairness.png)
> Sketchnote by [Tomomi Imura](https://www.twitter.com/girlie_mac)
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/5/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/5/)
## Introduction
@@ -29,7 +29,7 @@ Learn more about Responsible AI by following this [Learning Path](https://docs.m
## Unfairness in data and algorithms
-> "If you torture the data long enough, it will confess to anything - Ronald Coase
+> "If you torture the data long enough, it will confess to anything" - Ronald Coase
This statement sounds extreme, but it is true that data can be manipulated to support any conclusion. Such manipulation can sometimes happen unintentionally. As humans, we all have bias, and it's often difficult to consciously know when you are introducing bias in data.
@@ -184,7 +184,7 @@ To prevent biases from being introduced in the first place, we should:
Think about real-life scenarios where unfairness is evident in model-building and usage. What else should we consider?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/6/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/6/)
## Review & Self Study
In this lesson, you have learned some basics of the concepts of fairness and unfairness in machine learning.
diff --git a/1-Introduction/3-fairness/translations/README.id.md b/1-Introduction/3-fairness/translations/README.id.md
new file mode 100644
index 0000000000..980cbd88d2
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/README.id.md
@@ -0,0 +1,213 @@
+# Keadilan dalam Machine Learning
+
+![Ringkasan dari Keadilan dalam Machine Learning dalam sebuah catatan sketsa](../../../sketchnotes/ml-fairness.png)
+> Catatan sketsa oleh [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Quiz Pra-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/5/)
+
+## Pengantar
+
+Dalam kurikulum ini, kamu akan mulai mengetahui bagaimana Machine Learning bisa memengaruhi kehidupan kita sehari-hari. Bahkan sekarang, sistem dan model terlibat dalam tugas pengambilan keputusan sehari-hari, seperti diagnosis kesehatan atau mendeteksi penipuan. Jadi, penting bahwa model-model ini bekerja dengan baik untuk memberikan hasil yang adil bagi semua orang.
+
+Bayangkan apa yang bisa terjadi ketika data yang kamu gunakan untuk membangun model ini tidak memiliki demografi tertentu, seperti ras, jenis kelamin, pandangan politik, agama, atau secara tidak proporsional mewakili demografi tersebut. Bagaimana jika keluaran dari model diinterpretasikan lebih menyukai beberapa demografis tertentu? Apa konsekuensi untuk aplikasinya?
+
+Dalam pelajaran ini, kamu akan:
+
+- Meningkatkan kesadaran dari pentingnya keadilan dalam Machine Learning.
+- Mempelajari tentang berbagai kerugian terkait keadilan.
+- Mempelajari tentang mitigasi dan penilaian ketidakadilan.
+
+## Prasyarat
+
+Sebagai prasyarat, silakan ikuti jalur belajar "Prinsip AI yang Bertanggung Jawab" dan tonton video di bawah ini dengan topik:
+
+Pelajari lebih lanjut tentang AI yang Bertanggung Jawab dengan mengikuti [Jalur Belajar](https://docs.microsoft.com/learn/modules/responsible-ai-principles/?WT.mc_id=academic-15963-cxa) ini
+
+[![Pendekatan Microsoft untuk AI yang Bertanggung Jawab](https://img.youtube.com/vi/dnC8-uUZXSc/0.jpg)](https://youtu.be/dnC8-uUZXSc "Pendekatan Microsoft untuk AI yang Bertanggung Jawab")
+
+> 🎥 Klik gambar diatas untuk menonton video: Pendekatan Microsoft untuk AI yang Bertanggung Jawab
+
+## Ketidakadilan dalam data dan algoritma
+
+> "Jika Anda menyiksa data cukup lama, data itu akan mengakui apa pun " - Ronald Coase
+
+Pernyataan ini terdengar ekstrem, tetapi memang benar bahwa data dapat dimanipulasi untuk mendukung kesimpulan apa pun. Manipulasi semacam itu terkadang bisa terjadi secara tidak sengaja. Sebagai manusia, kita semua memiliki bias, dan seringkali sulit untuk secara sadar mengetahui kapan kamu memperkenalkan bias dalam data.
+
+Menjamin keadilan dalam AI dan machine learning tetap menjadi tantangan sosioteknik yang kompleks. Artinya, hal itu tidak bisa ditangani baik dari perspektif sosial atau teknis semata.
+
+### Kerugian Terkait Keadilan
+
+Apa yang dimaksud dengan ketidakadilan? "Ketidakadilan" mencakup dampak negatif atau "bahaya" bagi sekelompok orang, seperti yang didefinisikan dalam hal ras, jenis kelamin, usia, atau status disabilitas.
+
+Kerugian utama yang terkait dengan keadilan dapat diklasifikasikan sebagai:
+
+- **Alokasi**, jika suatu jenis kelamin atau etnisitas misalkan lebih disukai daripada yang lain.
+- **Kualitas layanan**. Jika kamu melatih data untuk satu skenario tertentu tetapi kenyataannya jauh lebih kompleks, hasilnya adalah layanan yang berkinerja buruk.
+- **Stereotip**. Mengaitkan grup tertentu dengan atribut yang ditentukan sebelumnya.
+- **Fitnah**. Untuk mengkritik dan melabeli sesuatu atau seseorang secara tidak adil.
+- **Representasi yang kurang atau berlebihan**. Idenya adalah bahwa kelompok tertentu tidak terlihat dalam profesi tertentu, dan layanan atau fungsi apa pun yang terus mempromosikan hal itu turut menambah kerugian.
+
+Mari kita lihat contoh-contohnya.
+
+### Alokasi
+
+Bayangkan sebuah sistem untuk menyaring pengajuan pinjaman. Sistem cenderung memilih pria kulit putih sebagai kandidat yang lebih baik daripada kelompok lain. Akibatnya, pinjaman ditahan dari pemohon tertentu.
+
+Contoh lain adalah alat perekrutan eksperimental yang dikembangkan oleh perusahaan besar untuk menyaring kandidat. Alat tersebut secara sistematis mendiskriminasi satu gender dengan menggunakan model yang dilatih untuk lebih memilih kata-kata yang terkait dengan gender lain. Hal ini mengakibatkan kandidat yang resumenya berisi kata-kata seperti "tim rugby wanita" tidak masuk kualifikasi.
+
+✅ Lakukan sedikit riset untuk menemukan contoh dunia nyata dari sesuatu seperti ini
+
+### Kualitas Layanan
+
+Para peneliti menemukan bahwa beberapa pengklasifikasi gender komersial memiliki tingkat kesalahan yang lebih tinggi di sekitar gambar wanita dengan warna kulit lebih gelap dibandingkan dengan gambar pria dengan warna kulit lebih terang. [Referensi](https://www.media.mit.edu/publications/gender-shades-intersectional-accuracy-disparities-in-commercial-gender-classification/)
+
+Contoh terkenal lainnya adalah dispenser sabun tangan yang sepertinya tidak bisa mendeteksi orang dengan kulit gelap. [Referensi](https://gizmodo.com/why-cant-this-soap-dispenser-identify-dark-skin-1797931773)
+
+### Stereotip
+
+Pandangan gender stereotip ditemukan dalam terjemahan mesin. Ketika menerjemahkan "dia (laki-laki) adalah seorang perawat dan dia (perempuan) adalah seorang dokter" ke dalam bahasa Turki, masalah muncul. Turki adalah bahasa tanpa gender yang memiliki satu kata ganti, "o" untuk menyampaikan orang ketiga tunggal, tetapi menerjemahkan kalimat kembali dari Turki ke Inggris menghasilkan stereotip dan salah sebagai "dia (perempuan) adalah seorang perawat dan dia (laki-laki) adalah seorang dokter".
+
+![terjemahan ke bahasa Turki](../images/gender-bias-translate-en-tr.png)
+
+![terjemahan kembali ke bahasa Inggris](../images/gender-bias-translate-tr-en.png)
+
+### Fitnah
+
+Sebuah teknologi pelabelan gambar yang terkenal pernah salah memberi label pada gambar orang berkulit gelap sebagai gorila. Pelabelan yang salah ini berbahaya bukan hanya karena sistem membuat kesalahan, tetapi juga karena sistem tersebut menerapkan label yang memiliki sejarah panjang dan sengaja digunakan untuk merendahkan orang kulit hitam.
+
+[![AI: Bukankah Aku Seorang Wanita?](https://img.youtube.com/vi/QxuyfWoVV98/0.jpg)](https://www.youtube.com/watch?v=QxuyfWoVV98 "Bukankah Aku Seorang Wanita?")
+> 🎥 Klik gambar diatas untuk sebuah video: AI, Bukankah Aku Seorang Wanita? - menunjukkan kerugian yang disebabkan oleh pencemaran nama baik yang menyinggung ras oleh AI
+
+### Representasi yang kurang atau berlebihan
+
+Hasil pencarian gambar yang condong ke hal tertentu (skewed) dapat menjadi contoh yang bagus dari bahaya ini. Saat menelusuri gambar profesi dengan persentase pria yang sama atau lebih tinggi daripada wanita, seperti teknik, atau CEO, perhatikan hasil yang lebih condong ke jenis kelamin tertentu.
+
+![Pencarian CEO di Bing](../images/ceos.png)
+> Pencarian di Bing untuk 'CEO' ini menghasilkan hasil yang cukup inklusif
+
+Lima jenis bahaya utama ini tidak saling eksklusif, dan satu sistem dapat menunjukkan lebih dari satu jenis bahaya. Selain itu, setiap kasus bervariasi dalam tingkat keparahannya. Misalnya, memberi label yang tidak adil kepada seseorang sebagai penjahat adalah bahaya yang jauh lebih parah daripada memberi label yang salah pada gambar. Namun, penting untuk diingat bahwa bahkan kerugian yang relatif tidak parah dapat membuat orang merasa terasing atau diasingkan dan dampak kumulatifnya bisa sangat menekan.
+
+✅ **Diskusi**: Tinjau kembali beberapa contoh dan lihat apakah mereka menunjukkan bahaya yang berbeda.
+
+| | Alokasi | Kualitas Layanan | Stereotip | Fitnah | Representasi yang kurang atau berlebihan |
+| -------------------------- | :-----: | :--------------: | :-------: | :----: | :--------------------------------------: |
+| Sistem perekrutan otomatis | x | x | x | | x |
+| Terjemahan mesin | | | | | |
+| Melabeli foto | | | | | |
+
+
+## Mendeteksi Ketidakadilan
+
+Ada banyak alasan mengapa sistem tertentu berperilaku tidak adil. Bias sosial, misalnya, mungkin tercermin dalam kumpulan data yang digunakan untuk melatih. Misalnya, ketidakadilan perekrutan mungkin telah diperburuk oleh ketergantungan yang berlebihan pada data historis. Dengan menggunakan pola dalam resume yang dikirimkan ke perusahaan selama periode 10 tahun, model tersebut menentukan bahwa pria lebih berkualitas karena mayoritas resume berasal dari pria, yang mencerminkan dominasi pria di masa lalu di industri teknologi.
+
+Data yang tidak memadai tentang sekelompok orang tertentu dapat menjadi alasan ketidakadilan. Misalnya, pengklasifikasi gambar memiliki tingkat kesalahan yang lebih tinggi untuk gambar orang berkulit gelap karena warna kulit yang lebih gelap kurang terwakili dalam data.
+
+Asumsi yang salah yang dibuat selama pengembangan menyebabkan ketidakadilan juga. Misalnya, sistem analisis wajah yang dimaksudkan untuk memprediksi siapa yang akan melakukan kejahatan berdasarkan gambar wajah orang dapat menyebabkan asumsi yang merusak. Hal ini dapat menyebabkan kerugian besar bagi orang-orang yang salah diklasifikasikan.
+
+## Pahami model kamu dan bangun dalam keadilan
+
+Meskipun banyak aspek keadilan tidak tercakup dalam metrik keadilan kuantitatif, dan tidak mungkin menghilangkan bias sepenuhnya dari sistem untuk menjamin keadilan, Kamu tetap bertanggung jawab untuk mendeteksi dan mengurangi masalah keadilan sebanyak mungkin.
+
+Saat Kamu bekerja dengan model pembelajaran mesin, penting untuk memahami model Kamu dengan cara memastikan interpretasinya dan dengan menilai serta mengurangi ketidakadilan.
+
+Mari kita gunakan contoh pemilihan pinjaman untuk mengisolasi kasus untuk mengetahui tingkat dampak setiap faktor pada prediksi.
+
+## Metode Penilaian
+
+1. **Identifikasi bahaya (dan manfaat)**. Langkah pertama adalah mengidentifikasi bahaya dan manfaat. Pikirkan tentang bagaimana tindakan dan keputusan dapat memengaruhi calon pelanggan dan bisnis itu sendiri.
+
+1. **Identifikasi kelompok yang terkena dampak**. Setelah Kamu memahami jenis kerugian atau manfaat apa yang dapat terjadi, identifikasi kelompok-kelompok yang mungkin terpengaruh. Apakah kelompok-kelompok ini ditentukan oleh jenis kelamin, etnis, atau kelompok sosial?
+
+1. **Tentukan metrik keadilan**. Terakhir, tentukan metrik sehingga Kamu memiliki sesuatu untuk diukur dalam pekerjaan Kamu untuk memperbaiki situasi.
+
+### Identifikasi bahaya (dan manfaat)
+
+Apa bahaya dan manfaat yang terkait dengan pinjaman? Pikirkan tentang skenario negatif palsu dan positif palsu:
+
+**False negatives** (ditolak, tapi Y=1) - dalam hal ini, pemohon yang akan mampu membayar kembali pinjaman ditolak. Ini adalah peristiwa yang merugikan karena sumber pinjaman ditahan dari pemohon yang memenuhi syarat.
+
+**False positives** (diterima, tapi Y=0) - dalam hal ini, pemohon memang mendapatkan pinjaman tetapi akhirnya wanprestasi. Akibatnya, kasus pemohon akan dikirim ke agen penagihan utang yang dapat mempengaruhi permohonan pinjaman mereka di masa depan.
+
+### Identifikasi kelompok yang terkena dampak
+
+Langkah selanjutnya adalah menentukan kelompok mana yang kemungkinan akan terpengaruh. Misalnya, dalam kasus permohonan kartu kredit, sebuah model mungkin menentukan bahwa perempuan harus menerima batas kredit yang jauh lebih rendah dibandingkan dengan pasangan mereka yang berbagi aset rumah tangga. Dengan demikian, seluruh demografi yang ditentukan berdasarkan jenis kelamin menjadi terpengaruh.
+
+### Tentukan metrik keadilan
+
+Kamu telah mengidentifikasi bahaya dan kelompok yang terpengaruh, dalam hal ini digambarkan berdasarkan jenis kelamin. Sekarang, gunakan faktor terukur (*quantified factors*) untuk memisahkan metriknya. Misalnya, dengan menggunakan data di bawah ini, Kamu dapat melihat bahwa wanita memiliki tingkat *false positive* terbesar dan pria memiliki yang terkecil, dan kebalikannya berlaku untuk *false negative*.
+
+✅ Dalam pelajaran selanjutnya tentang *Clustering*, Kamu akan melihat bagaimana membangun 'confusion matrix' ini dalam kode
+
+| | False positive rate | False negative rate | count |
+| ---------- | ------------------- | ------------------- | ----- |
+| Women | 0.37 | 0.27 | 54032 |
+| Men | 0.31 | 0.35 | 28620 |
+| Non-binary | 0.33 | 0.31 | 1266 |
+
+
+Tabel ini memberitahu kita beberapa hal. Pertama, kami mencatat bahwa ada sedikit orang non-biner dalam data. Datanya condong (*skewed*), jadi Kamu harus berhati-hati dalam menafsirkan angka-angka ini.
+
+Dalam hal ini, kita memiliki 3 grup dan 2 metrik. Ketika kita memikirkan tentang bagaimana sistem kita memengaruhi kelompok pelanggan dengan permohonan pinjaman mereka, ini mungkin cukup, tetapi ketika Kamu ingin menentukan jumlah grup yang lebih besar, Kamu mungkin ingin menyaringnya menjadi kumpulan ringkasan yang lebih kecil. Untuk melakukannya, Kamu dapat menambahkan lebih banyak metrik, seperti perbedaan terbesar atau rasio terkecil dari setiap *false negative* dan *false positive*.
+
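+Sebagai gambaran saja (sketsa minimal yang bukan bagian dari pelajaran asli, dengan data dan nama kolom yang sepenuhnya hipotetis), metrik yang dipisahkan per kelompok seperti pada tabel di atas dapat dihitung kira-kira seperti ini dengan pandas:
+
+```python
+import pandas as pd
+
+# Data contoh hipotetis: label sebenarnya (y_true), prediksi model (y_pred),
+# dan atribut sensitif (gender). Dalam praktiknya ini berasal dari dataset Anda.
+df = pd.DataFrame({
+    "gender": ["perempuan"] * 4 + ["laki-laki"] * 4 + ["non-biner"] * 2,
+    "y_true": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
+    "y_pred": [0, 1, 1, 0, 1, 0, 0, 1, 1, 1],
+})
+
+def false_positive_rate(g):
+    # FP / (FP + TN): proporsi kasus negatif yang keliru diprediksi positif
+    negatif = g[g["y_true"] == 0]
+    return (negatif["y_pred"] == 1).mean() if len(negatif) else float("nan")
+
+def false_negative_rate(g):
+    # FN / (FN + TP): proporsi kasus positif yang keliru diprediksi negatif
+    positif = g[g["y_true"] == 1]
+    return (positif["y_pred"] == 0).mean() if len(positif) else float("nan")
+
+baris = []
+for kelompok, g in df.groupby("gender"):
+    baris.append({
+        "kelompok": kelompok,
+        "false_positive_rate": false_positive_rate(g),
+        "false_negative_rate": false_negative_rate(g),
+        "count": len(g),
+    })
+
+print(pd.DataFrame(baris))
+```
+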
+✅ Berhenti dan Pikirkan: Kelompok apa lagi yang mungkin terpengaruh dalam pengajuan pinjaman?
+
+## Mengurangi ketidakadilan
+
+Untuk mengurangi ketidakadilan, jelajahi model untuk menghasilkan berbagai model yang dimitigasi dan bandingkan pengorbanan yang dibuat antara akurasi dan keadilan untuk memilih model yang paling adil.
+
+Pelajaran pengantar ini tidak membahas secara mendalam mengenai detail mitigasi ketidakadilan algoritmik, seperti pendekatan pasca-pemrosesan dan pengurangan (*post-processing and reductions approach*), tetapi berikut adalah *tool* yang mungkin ingin Kamu coba.
+
+### Fairlearn
+
+[Fairlearn](https://fairlearn.github.io/) adalah sebuah *package* Python open-source yang memungkinkan Kamu untuk menilai keadilan sistem Kamu dan mengurangi ketidakadilan.
+
+*Tool* ini membantu Kamu menilai bagaimana prediksi model memengaruhi kelompok yang berbeda, memungkinkan Kamu untuk membandingkan beberapa model dengan menggunakan metrik keadilan dan kinerja, dan menyediakan serangkaian algoritma untuk mengurangi ketidakadilan dalam klasifikasi dan regresi biner.
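+
+Sebagai gambaran kasar (sketsa minimal dengan data hipotetis, bukan bagian dari pelajaran asli; detail API dapat berbeda antar versi Fairlearn), `MetricFrame` dapat dipakai untuk membandingkan metrik antar kelompok sensitif:
+
+```python
+from fairlearn.metrics import MetricFrame, false_negative_rate, false_positive_rate
+
+# Data contoh hipotetis: label sebenarnya, prediksi model, dan atribut sensitif
+y_true = [1, 0, 1, 0, 1, 0, 1, 0]
+y_pred = [1, 0, 0, 1, 1, 0, 0, 0]
+gender = ["perempuan", "perempuan", "perempuan", "perempuan",
+          "laki-laki", "laki-laki", "laki-laki", "laki-laki"]
+
+# MetricFrame menghitung setiap metrik secara keseluruhan dan per kelompok sensitif
+mf = MetricFrame(
+    metrics={
+        "false_positive_rate": false_positive_rate,
+        "false_negative_rate": false_negative_rate,
+    },
+    y_true=y_true,
+    y_pred=y_pred,
+    sensitive_features=gender,
+)
+
+print(mf.overall)       # metrik untuk seluruh data
+print(mf.by_group)      # metrik per kelompok (mis. per jenis kelamin)
+print(mf.difference())  # selisih terbesar antar kelompok untuk tiap metrik
+```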
+
+- Pelajari bagaimana cara menggunakan komponen-komponen yang berbeda dengan mengunjungi [GitHub](https://github.com/fairlearn/fairlearn/) Fairlearn
+
+- Jelajahi [panduan pengguna](https://fairlearn.github.io/main/user_guide/index.html), [contoh-contoh](https://fairlearn.github.io/main/auto_examples/index.html)
+
+- Coba beberapa [sampel notebook](https://github.com/fairlearn/fairlearn/tree/master/notebooks).
+
+- Pelajari [bagaimana cara mengaktifkan penilaian keadilan](https://docs.microsoft.com/azure/machine-learning/how-to-machine-learning-fairness-aml?WT.mc_id=academic-15963-cxa) dari model machine learning di Azure Machine Learning.
+
+- Lihat [sampel notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/contrib/fairness) ini untuk skenario penilaian keadilan yang lebih banyak di Azure Machine Learning.
+
+---
+## 🚀 Tantangan
+
+Untuk mencegah kemunculan bias pada awalnya, kita harus:
+
+- memiliki keragaman latar belakang dan perspektif di antara orang-orang yang bekerja pada sistem
+- berinvestasi dalam dataset yang mencerminkan keragaman masyarakat kita
+- mengembangkan metode yang lebih baik untuk mendeteksi dan mengoreksi bias ketika itu terjadi
+
+Pikirkan tentang skenario kehidupan nyata di mana ketidakadilan terbukti dalam pembuatan dan penggunaan model. Apa lagi yang harus kita pertimbangkan?
+
+## [Quiz Pasca-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/6/)
+## Ulasan & Belajar Mandiri
+
+Dalam pelajaran ini, Kamu telah mempelajari beberapa dasar konsep keadilan dan ketidakadilan dalam pembelajaran mesin.
+
+Tonton workshop ini untuk menyelami lebih dalam kedalam topik:
+
+- YouTube: Kerugian terkait keadilan dalam sistem AI: Contoh, penilaian, dan mitigasi oleh Hanna Wallach dan Miro Dudik [Kerugian terkait keadilan dalam sistem AI: Contoh, penilaian, dan mitigasi - YouTube](https://www.youtube.com/watch?v=1RptHwfkx_k)
+
+Kamu juga dapat membaca:
+
+- Pusat sumber daya RAI Microsoft: [Responsible AI Resources – Microsoft AI](https://www.microsoft.com/ai/responsible-ai-resources?activetab=pivot1%3aprimaryr4)
+
+- Grup riset FATE Microsoft: [FATE: Fairness, Accountability, Transparency, and Ethics in AI - Microsoft Research](https://www.microsoft.com/research/theme/fate/)
+
+Jelajahi *toolkit* Fairlearn
+
+[Fairlearn](https://fairlearn.org/)
+
+Baca mengenai *tools* Azure Machine Learning untuk memastikan keadilan
+
+- [Azure Machine Learning](https://docs.microsoft.com/azure/machine-learning/concept-fairness-ml?WT.mc_id=academic-15963-cxa)
+
+## Tugas
+
+[Jelajahi Fairlearn](assignment.id.md)
diff --git a/1-Introduction/3-fairness/translations/README.it.md b/1-Introduction/3-fairness/translations/README.it.md
new file mode 100644
index 0000000000..e55fd6e7f9
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/README.it.md
@@ -0,0 +1,212 @@
+# Equità e machine learning
+
+![Riepilogo dell'equità in machine learning in uno sketchnote](../../../sketchnotes/ml-fairness.png)
+> Sketchnote di [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/5/)
+
+## Introduzione
+
+In questo programma di studi, si inizierà a scoprire come machine learning può e sta influenzando la vita quotidiana. Anche ora, sistemi e modelli sono coinvolti nelle attività decisionali quotidiane, come le diagnosi sanitarie o l'individuazione di frodi. Quindi è importante che questi modelli funzionino bene per fornire risultati equi per tutti.
+
+Si immagini cosa può accadere quando i dati che si stanno utilizzando per costruire questi modelli mancano di determinati dati demografici, come razza, genere, visione politica, religione, o rappresentano tali dati demografici in modo sproporzionato. E quando il risultato del modello viene interpretato per favorire alcuni gruppi demografici? Qual è la conseguenza per l'applicazione?
+
+In questa lezione, si dovrà:
+
+- Aumentare la propria consapevolezza sull'importanza dell'equità nel machine learning.
+- Informarsi sui danni legati all'equità.
+- Apprendere ulteriori informazioni sulla valutazione e la mitigazione dell'ingiustizia.
+
+## Prerequisito
+
+Come prerequisito, si segua il percorso di apprendimento "Principi di AI Responsabile" e si guardi il video qui sotto sull'argomento:
+
+Si scopra di più sull'AI Responsabile seguendo questo [percorso di apprendimento](https://docs.microsoft.com/learn/modules/responsible-ai-principles/?WT.mc_id=academic-15963-cxa)
+
+[![L'approccio di Microsoft all'AI Responsabile](https://img.youtube.com/vi/dnC8-uUZXSc/0.jpg)](https://youtu.be/dnC8-uUZXSc "L'approccio di Microsoft all'AI Responsabile")
+
+> 🎥 Fare clic sull'immagine sopra per un video: L'approccio di Microsoft all'AI Responsabile
+
+## Iniquità nei dati e negli algoritmi
+
+> "Se si torturano i dati abbastanza a lungo, essi confesseranno qualsiasi cosa" - Ronald Coase
+
+Questa affermazione suona estrema, ma è vero che i dati possono essere manipolati per supportare qualsiasi conclusione. Tale manipolazione a volte può avvenire involontariamente. Come esseri umani, abbiamo tutti dei pregiudizi, ed è spesso difficile sapere consapevolmente quando si introduce un pregiudizio nei dati.
+
+Garantire l'equità nell'intelligenza artificiale e machine learning rimane una sfida socio-tecnica complessa. Ciò significa che non può essere affrontata da prospettive puramente sociali o tecniche.
+
+### Danni legati all'equità
+
+Cosa si intende per ingiustizia? L'"ingiustizia" comprende gli impatti negativi, o "danni", per un gruppo di persone, come quelli definiti in termini di razza, genere, età o stato di disabilità.
+
+I principali danni legati all'equità possono essere classificati come:
+
+- **Allocazione**, se un genere o un'etnia, ad esempio, sono preferiti a un altro.
+- **Qualità di servizio**. Se si addestrano i dati per uno scenario specifico, ma la realtà è molto più complessa, si ottiene un servizio scadente.
+- **Stereotipi**. Associazione di un dato gruppo con attributi preassegnati.
+- **Denigrazione**. Criticare ed etichettare ingiustamente qualcosa o qualcuno.
+- **Sovra o sotto rappresentazione**. L'idea è che un certo gruppo non è visto in una certa professione, e qualsiasi servizio o funzione che continua a promuovere ciò, contribuisce al danno.
+
+Si dia un'occhiata agli esempi.
+
+### Allocazione
+
+Si consideri un ipotetico sistema per la scrematura delle domande di prestito. Il sistema tende a scegliere gli uomini bianchi come candidati migliori rispetto ad altri gruppi. Di conseguenza, i prestiti vengono negati ad alcuni richiedenti.
+
+Un altro esempio potrebbe essere uno strumento sperimentale di assunzione sviluppato da una grande azienda per selezionare i candidati. Lo strumento discriminava sistematicamente un genere utilizzando modelli che erano stati addestrati a preferire parole associate all'altro genere. Ciò ha portato a penalizzare i candidati i cui curricula contenevano parole come "squadra di rugby femminile".
+
+✅ Si compia una piccola ricerca per trovare un esempio reale di qualcosa del genere
+
+### Qualità di Servizio
+
+I ricercatori hanno scoperto che diversi classificatori di genere commerciali avevano tassi di errore più elevati intorno alle immagini di donne con tonalità della pelle più scura rispetto alle immagini di uomini con tonalità della pelle più chiare. [Riferimento](https://www.media.mit.edu/publications/gender-shades-intersectional-accuracy-disparities-in-commercial-gender-classification/)
+
+Un altro esempio infamante è un distributore di sapone per le mani che sembrava non essere in grado di percepire le persone con la pelle scura. [Riferimento](https://gizmodo.com/why-cant-this-soap-dispenser-identify-dark-skin-1797931773)
+
+### Stereotipi
+
+La visione stereotipata di genere è stata riscontrata nella traduzione automatica. Traducendo in turco "he is a nurse and she is a doctor" (lui è un infermiere e lei è un medico) sono emersi dei problemi. Il turco è una lingua senza genere, con un unico pronome "o" per la terza persona singolare; ritraducendo però la frase dal turco all'inglese si ottiene la versione stereotipata e scorretta "she is a nurse and he is a doctor" (lei è un'infermiera e lui è un medico).
+
+![traduzione in turco](../images/gender-bias-translate-en-tr.png)
+
+![Traduzione in inglese](../images/gender-bias-translate-tr-en.png)
+
+### Denigrazione
+
+Una tecnologia di etichettatura delle immagini ha contrassegnato in modo infamante le immagini di persone dalla pelle scura come gorilla. L'etichettatura errata è dannosa non solo perché il sistema ha commesso un errore, ma anche perché ha applicato specificamente un'etichetta che ha una lunga storia di essere intenzionalmente utilizzata per denigrare i neri.
+
+[![AI: Non sono una donna?](https://img.youtube.com/vi/QxuyfWoVV98/0.jpg)](https://www.youtube.com/watch?v=QxuyfWoVV98 "AI, non sono una donna?")
+> 🎥 Cliccare sull'immagine sopra per un video: AI, Ain't I a Woman - una performance che mostra il danno causato dalla denigrazione razzista da parte dell'AI
+
+### Sovra o sotto rappresentazione
+
+I risultati di ricerca di immagini distorti possono essere un buon esempio di questo danno. Quando si cercano immagini di professioni con una percentuale uguale o superiore di uomini rispetto alle donne, come l'ingegneria o CEO, si osserva che i risultati sono più fortemente distorti verso un determinato genere.
+
+![Ricerca CEO di Bing](../images/ceos.png)
+> Questa ricerca su Bing per "CEO" produce risultati piuttosto inclusivi
+
+Questi cinque principali tipi di danno non si escludono a vicenda e un singolo sistema può presentare più di un tipo di danno. Inoltre, ogni caso varia nella sua gravità. Ad esempio, etichettare ingiustamente qualcuno come criminale è un danno molto più grave che etichettare erroneamente un'immagine. È importante, tuttavia, ricordare che anche danni relativamente non gravi possono far sentire le persone alienate o emarginate e l'impatto cumulativo può essere estremamente opprimente.
+
+✅ **Discussione**: rivisitare alcuni degli esempi e vedere se mostrano danni diversi.
+
+| | Allocazione | Qualità di servizio | Stereotipo | Denigrazione | Sovra o sotto rappresentazione |
+| ----------------------------------- | :---------: | :-----------------: | :--------: | :----------: | :----------------------------: |
+| Sistema di assunzione automatizzato | x | x | x | | x |
+| Traduzione automatica | | | | | |
+| Etichettatura foto | | | | | |
+
+## Rilevare l'ingiustizia
+
+Ci sono molte ragioni per cui un dato sistema si comporta in modo scorretto. I pregiudizi sociali, ad esempio, potrebbero riflettersi negli insiemi di dati utilizzati per l'addestramento. L'ingiustizia nelle assunzioni, ad esempio, potrebbe essere stata esacerbata dall'eccessivo affidamento sui dati storici: sfruttando gli schemi ricorrenti nei curricula inviati all'azienda in un periodo di 10 anni, il modello ha stabilito che gli uomini fossero più qualificati, perché la maggior parte dei curricula proveniva da uomini, un riflesso del passato dominio maschile nell'industria tecnologica.
+
+Dati inadeguati su un determinato gruppo di persone possono essere motivo di ingiustizia. Ad esempio, i classificatori di immagini hanno un tasso di errore più elevato per le immagini di persone dalla pelle scura perché le tonalità della pelle più scure sono sottorappresentate nei dati.
+
+Anche le ipotesi errate fatte durante lo sviluppo causano iniquità. Ad esempio, un sistema di analisi facciale destinato a prevedere chi commetterà un crimine basato sulle immagini dei volti delle persone può portare a ipotesi dannose. Ciò potrebbe portare a danni sostanziali per le persone classificate erroneamente.
+
+## Si comprendano i propri modelli e si costruiscano in modo onesto
+
+Sebbene molti aspetti dell'equità non vengano catturati nelle metriche di equità quantitativa e non sia possibile rimuovere completamente i pregiudizi da un sistema per garantire l'equità, si è comunque responsabili di rilevare e mitigare il più possibile i problemi di equità.
+
+Quando si lavora con modelli di machine learning, è importante comprendere i propri modelli assicurandone l'interpretabilità e valutando e mitigando l'ingiustizia.
+
+Si utilizza l'esempio di selezione del prestito per isolare il caso e determinare il livello di impatto di ciascun fattore sulla previsione.
+
+## Metodi di valutazione
+
+1. **Identificare i danni (e benefici)**. Il primo passo è identificare danni e benefici. Si pensi a come azioni e decisioni possono influenzare sia i potenziali clienti che un'azienda stessa.
+
+1. **Identificare i gruppi interessati**. Una volta compreso il tipo di danni o benefici che possono verificarsi, identificare i gruppi che potrebbero essere interessati. Questi gruppi sono definiti per genere, etnia o gruppo sociale?
+
+1. **Definire le metriche di equità**. Infine, si definisca una metrica in modo da avere qualcosa su cui misurare il proprio lavoro per migliorare la situazione.
+
+### **Identificare danni (e benefici)**
+
+Quali sono i danni e i benefici associati al prestito? Si pensi agli scenari di falsi negativi e falsi positivi:
+
+**Falsi negativi** (rifiutato, ma Y=1) - in questo caso viene rifiutato un richiedente che sarà in grado di rimborsare un prestito. Questo è un evento avverso perché le risorse dei prestiti non sono erogate a richiedenti qualificati.
+
+**Falsi positivi** (accettato, ma Y=0) - in questo caso, il richiedente ottiene un prestito ma alla fine fallisce. Di conseguenza, il caso del richiedente verrà inviato a un'agenzia di recupero crediti che può influire sulle sue future richieste di prestito.
+
+### **Identificare i gruppi interessati**
+
+Il passo successivo è determinare quali gruppi potrebbero essere interessati. Ad esempio, nel caso di una richiesta di carta di credito, un modello potrebbe stabilire che le donne dovrebbero ricevere limiti di credito molto più bassi rispetto ai loro coniugi che condividono i beni familiari. Un intero gruppo demografico, definito in base al genere, è così interessato.
+
+### **Definire le metriche di equità**
+
+Si sono identificati i danni e un gruppo interessato, in questo caso, delineato per genere. Ora, si usino i fattori quantificati per disaggregare le loro metriche. Ad esempio, utilizzando i dati di seguito, si può vedere che le donne hanno il più alto tasso di falsi positivi e gli uomini il più piccolo, e che è vero il contrario per i falsi negativi.
+
+✅ In una futura lezione sul Clustering, si vedrà come costruire questa 'matrice di confusione' nel codice
+
+| | Percentuale di falsi positivi | Percentuale di falsi negativi | Conteggio |
+| ----------- | ----------------------------- | ----------------------------- | --------- |
+| Donna | 0,37 | 0,27 | 54032 |
+| Uomo | 0,31 | 0,35 | 28620 |
+| Non binario | 0,33 | 0,31 | 1266 |
+
+Questa tabella ci dice diverse cose. Innanzitutto, si nota che ci sono relativamente poche persone non binarie nei dati. I dati sono distorti, quindi si deve fare attenzione a come si interpretano questi numeri.
+
+In questo caso, ci sono 3 gruppi e 2 metriche. Quando si pensa a come il sistema influisce sul gruppo di clienti che richiedono un prestito, questo può essere sufficiente, ma quando si desidera definire un numero maggiore di gruppi, è possibile condensare il tutto in insiemi più piccoli di riepiloghi. Per fare ciò, si possono aggiungere altre metriche, come la differenza più grande o il rapporto più piccolo tra i falsi negativi e i falsi positivi dei vari gruppi.
+
+✅ Ci si fermi a pensare: quali altri gruppi potrebbero essere interessati dalla richiesta di prestito?
+
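+A titolo illustrativo, ecco uno schizzo minimale in Python che mostra come si potrebbero calcolare metriche disaggregate di questo tipo con il pacchetto Fairlearn (presentato più avanti in questa lezione); i valori di `y_true`, `y_pred` e `genere` sono puramente ipotetici.
+
+```python
+import pandas as pd
+from fairlearn.metrics import MetricFrame, false_negative_rate, false_positive_rate
+
+# Dati ipotetici: etichette reali, previsioni del modello e attributo sensibile
+dati = pd.DataFrame({
+    "y_true": [1, 0, 1, 1, 0, 0, 1, 0],
+    "y_pred": [1, 1, 0, 1, 0, 1, 1, 0],
+    "genere": ["donna", "donna", "donna", "uomo", "uomo", "uomo", "non binario", "non binario"],
+})
+
+# MetricFrame calcola ogni metrica sia sull'insieme completo sia per gruppo
+mf = MetricFrame(
+    metrics={
+        "falsi positivi": false_positive_rate,
+        "falsi negativi": false_negative_rate,
+    },
+    y_true=dati["y_true"],
+    y_pred=dati["y_pred"],
+    sensitive_features=dati["genere"],
+)
+
+print(mf.by_group)      # metriche disaggregate per genere
+print(mf.difference())  # differenza massima tra i gruppi per ciascuna metrica
+```
+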
+## Mitigare l'ingiustizia
+
+Per mitigare l'ingiustizia, si esplori il modello per generare vari modelli mitigati e si confrontino i compromessi tra accuratezza ed equità per selezionare il modello più equo.
+
+Questa lezione introduttiva non approfondisce i dettagli dell'algoritmo della mitigazione dell'ingiustizia, come l'approccio di post-elaborazione e riduzione, ma ecco uno strumento che si potrebbe voler provare.
+
+### Fairlearn
+
+[Fairlearn](https://fairlearn.github.io/) è un pacchetto Python open source che consente di valutare l'equità dei propri sistemi e mitigare l'ingiustizia.
+
+Lo strumento consente di valutare in che modo le previsioni di un modello influiscono su diversi gruppi, consentendo di confrontare più modelli utilizzando metriche di equità e prestazioni e fornendo una serie di algoritmi per mitigare l'ingiustizia nella classificazione binaria e nella regressione.
+
+- Si scopra come utilizzare i diversi componenti controllando il GitHub di [Fairlearn](https://github.com/fairlearn/fairlearn/)
+
+- Si esplori la [guida per l'utente](https://fairlearn.github.io/main/user_guide/index.html), e gli [esempi](https://fairlearn.github.io/main/auto_examples/index.html)
+
+- Si provino alcuni [notebook di esempio](https://github.com/fairlearn/fairlearn/tree/master/notebooks).
+
+- Si scopra [come abilitare le valutazioni dell'equità](https://docs.microsoft.com/azure/machine-learning/how-to-machine-learning-fairness-aml?WT.mc_id=academic-15963-cxa) dei modelli di Machine Learning in Azure Machine Learning.
+
+- Si dia un'occhiata a questi [notebook di esempio](https://github.com/Azure/MachineLearningNotebooks/tree/master/contrib/fairness) per ulteriori scenari di valutazione dell'equità in Azure Machine Learning.
+
+---
+
+## 🚀 Sfida
+
+Per evitare che vengano introdotti pregiudizi, in primo luogo, si dovrebbe:
+
+- avere una diversità di background e prospettive tra le persone che lavorano sui sistemi
+- investire in insiemi di dati che riflettano le diversità della società
+- sviluppare metodi migliori per rilevare e correggere i pregiudizi quando si verificano
+
+Si pensi a scenari di vita reale in cui l'ingiustizia è evidente nella creazione e nell'utilizzo del modello. Cos'altro si dovrebbe considerare?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/6/)
+
+## Revisione e Auto Apprendimento
+
+In questa lezione si sono apprese alcune nozioni di base sui concetti di equità e ingiustizia in machine learning.
+
+Si guardi questo workshop per approfondire gli argomenti:
+
+- YouTube: Danni correlati all'equità nei sistemi di IA: esempi, valutazione e mitigazione di Hanna Wallach e Miro Dudik [Danni correlati all'equità nei sistemi di IA: esempi, valutazione e mitigazione - YouTube](https://www.youtube.com/watch?v=1RptHwfkx_k)
+
+Si legga anche:
+
+- Centro risorse RAI di Microsoft: [risorse AI responsabili – Microsoft AI](https://www.microsoft.com/ai/responsible-ai-resources?activetab=pivot1%3aprimaryr4)
+
+- Gruppo di ricerca FATE di Microsoft: [FATE: equità, responsabilità, trasparenza ed etica nell'intelligenza artificiale - Microsoft Research](https://www.microsoft.com/research/theme/fate/)
+
+Si esplori il toolkit Fairlearn
+
+[Fairlearn](https://fairlearn.org/)
+
+Si scoprano gli strumenti di Azure Machine Learning per garantire l'equità
+
+- [Azure Machine Learning](https://docs.microsoft.com/azure/machine-learning/concept-fairness-ml?WT.mc_id=academic-15963-cxa)
+
+## Compito
+
+[Esplorare Fairlearn](assignment.it.md)
diff --git a/1-Introduction/3-fairness/translations/README.ja.md b/1-Introduction/3-fairness/translations/README.ja.md
new file mode 100644
index 0000000000..ffa878c171
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/README.ja.md
@@ -0,0 +1,204 @@
+# 機械学習における公平さ
+
+![機械学習における公平性をまとめたスケッチ](../../../sketchnotes/ml-fairness.png)
+> [Tomomi Imura](https://www.twitter.com/girlie_mac)によるスケッチ
+
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/5?loc=ja)
+
+## イントロダクション
+
+このカリキュラムでは、機械学習が私たちの日常生活にどのような影響を与えているかを知ることができます。たった今、医療の診断や不正の検出など、日常の意思決定にシステムやモデルが関わっています。そのため、誰もが公平な結果を得られるようにするためには、これらのモデルがうまく機能することが重要です。
+
+しかし、これらのモデルを構築するために使用しているデータに、人種、性別、政治的見解、宗教などの特定の属性が欠けていたり、そのような属性が偏っていたりすると、何が起こるか想像してみてください。また、モデルの出力が特定の層に有利になるように解釈された場合はどうでしょうか。その結果、アプリケーションはどのような影響を受けるのでしょうか?
+
+このレッスンでは、以下のことを行います:
+
+- 機械学習における公平性の重要性に対する意識を高める。
+- 公平性に関連する問題について学ぶ。
+- 公平性の評価と緩和について学ぶ。
+
+## 前提条件
+前提条件として、"Responsible AI Principles"のLearn Pathを受講し、このトピックに関する以下のビデオを視聴してください。
+
+こちらの[Learning Path](https://docs.microsoft.com/learn/modules/responsible-ai-principles/?WT.mc_id=academic-15963-cxa)より、責任のあるAIについて学ぶ。
+
+[![Microsoftの責任あるAIに対する取り組み](https://img.youtube.com/vi/dnC8-uUZXSc/0.jpg)](https://youtu.be/dnC8-uUZXSc "Microsoftの責任あるAIに対する取り組み")
+
+> 🎥 上の画像をクリックすると動画が表示されます:Microsoftの責任あるAIに対する取り組み
+
+## データやアルゴリズムの不公平さ
+
+> 「データを長く拷問すれば、何でも自白するようになる」 - Ronald Coase
+
+この言葉は極端に聞こえますが、データがどんな結論をも裏付けるように操作できることは事実です。しかし、そのような操作は、時に意図せずに行われることがあります。人間は誰でもバイアスを持っており、自分がいつデータにバイアスを導入しているかを意識的に知ることは難しいことが多いのです。
+
+AIや機械学習における公平性の保証は、依然として複雑な社会技術的課題です。つまり、純粋に社会的な視点や技術的な視点のどちらからも対処できないということです。
+
+### 公平性に関連した問題
+
+不公平とはどういう意味ですか?不公平とは、人種、性別、年齢、障害の有無などで定義された人々のグループに悪影響を与えること、あるいは、被害を与えることです。
+
+主な不公平に関連する問題は以下のように分類されます。:
+
+- **アロケーション**。ある性別や民族が他の性別や民族よりも優遇されている場合。
+- **サービスの質**。ある特定のシナリオのためにデータを訓練しても、現実がより複雑な場合にはサービスの質の低下につながります。
+- **固定観念**。特定のグループにあらかじめ割り当てられた属性を関連させること。
+- **誹謗中傷**。何かや誰かを不当に批判したり、レッテルを貼ること。
+- **過剰表現または過小表現**。特定のグループが特定の職業に就いている姿が見られず、それを宣伝し続けるサービスや機能は被害を助長しているという考え。
+
+それでは、いくつか例を見ていきましょう。
+
+### アロケーション
+
+ローン申請を審査する仮想的なシステムを考えてみましょう。このシステムでは、他のグループよりも白人男性を優秀な候補者として選ぶ傾向があります。その結果、特定の申請者にはローンが提供されませんでした。
+
+もう一つの例は、大企業が候補者を審査するために開発した実験的な採用ツールです。このツールは、ある性別に関連する言葉を好むように訓練されたモデルを使って、ある性別をシステム的に差別していました。その結果、履歴書に「女子ラグビーチーム」などの単語が含まれている候補者にペナルティを課すものとなっていました。
+
+✅ ここで、上記のような実例を少し調べてみてください。
+
+### サービスの質
+
+研究者は、いくつかの市販のジェンダー分類法は、明るい肌色の男性の画像と比較して、暗い肌色の女性の画像では高い不正解率を示したことを発見した。[参照](https://www.media.mit.edu/publications/gender-shades-intersectional-accuracy-disparities-in-commercial-gender-classification/)
+
+また、肌の色が暗い人を感知できなかったハンドソープディスペンサーの例も悪い意味で有名です。[参照](https://gizmodo.com/why-cant-this-soap-dispenser-identify-dark-skin-1797931773)
+
+### 固定観念
+
+機械翻訳には、ステレオタイプな性別観が見られます。「彼はナースで、彼女は医者です。(“he is a nurse and she is a doctor”)」という文をトルコ語に翻訳する際、問題が発生しました。トルコ語は単数の三人称を表す代名詞「o」が1つあるのみで、性別の区別のない言語で、この文章をトルコ語から英語に翻訳し直すと、「彼女はナースで、彼は医者です。(“she is a nurse and he is a doctor”)」というステレオタイプによる正しくない文章になってしまいます。
+
+![トルコ語に対する翻訳](../images/gender-bias-translate-en-tr.png)
+
+![英語に復元する翻訳](../images/gender-bias-translate-tr-en.png)
+
+### 誹謗中傷
+
+画像ラベリング技術により、肌の色が黒い人の画像をゴリラと誤表示したことが有名です。誤表示は、システムが単に間違いをしたというだけでなく、黒人を誹謗中傷するためにこの表現が意図的に使われてきた長い歴史があるため、有害です。
+
+[![AI: 自分は女性ではないの?](https://img.youtube.com/vi/QxuyfWoVV98/0.jpg)](https://www.youtube.com/watch?v=QxuyfWoVV98 "AI: 自分は女性ではないの?")
+> 🎥 上の画像をクリックすると動画が表示されます: AI: 自分は女性ではないの? - AIによる人種差別的な誹謗中傷による被害を示すパフォーマンス
+
+### 過剰表現または過小表現
+
+偏った画像検索の結果はこの問題の良い例です。エンジニアやCEOなど、男性と女性の割合が同じかそれ以上の職業の画像を検索すると、どちらかの性別に大きく偏った結果が表示されるので注意が必要です。
+
+![BingでCEOと検索](../images/ceos.png)
+> Bing での「CEO」の検索結果は包摂的な結果が表示されています
+
+これらの5つの主要なタイプの問題は、相互に排他的なものではなく、1つのシステムが複数のタイプの害を示すこともあります。さらに、それぞれのケースでは、その重大性が異なります。例えば、ある人に不当に犯罪者のレッテルを貼ることは、画像を誤って表示することよりもはるかに深刻な問題です。しかし、比較的深刻ではない被害であっても、人々が疎外感を感じたり、特別視されていると感じたりすることがあり、その累積的な影響は非常に抑圧的なものになりうることを覚えておくことは重要でしょう。
+
+✅ **ディスカッション**: いくつかの例を再検討し、異なる害を示しているかどうかを確認してください。
+
+| | アロケーション | サービスの質 | 固定観念 | 誹謗中傷 | 過剰表現/過小表現 |
+| -------------------- | :------------: | :----------: | :------: | :------: | :---------------: |
+| 採用システムの自動化 | x | x | x | | x |
+| 機械翻訳 | | | | | |
+| 写真のラベリング | | | | | |
+
+
+## 不公平の検出
+
+あるシステムが不公平な動作をする理由はさまざまです。例えば、社会的なバイアスが、学習に使われたデータセットに反映されているかもしれないですし、過去のデータに頼りすぎたために、採用の不公平が悪化したかもしれません。あるモデルは、10年間に会社に提出された履歴書のパターンを利用して、男性からの履歴書が大半を占めていたことから、男性の方が適格であると判断しました。
+
+特定のグループに関するデータが不十分であることも、不公平の原因となります。例えば、肌の色が濃い人のデータが少ないために、画像分類において肌の色が濃い人の画像のエラー率が高くなります。
+
+また、開発時の誤った仮定も不公平の原因となります。例えば、人の顔の画像から犯罪を犯す人を予測することを目的とした顔分析システムでは、有害な推測をしてしまうことがあります。その結果、誤った分類をされた人が大きな被害を受けることになりかねません。
+
+## モデルを理解し、公平性を構築する
+
+公平性の多くの側面は定量的な指標では捉えられず、公平性を保証するためにシステムからバイアスを完全に取り除くことは不可能ですが、公平性の問題を可能な限り検出し、軽減する責任があります。
+
+機械学習モデルを扱う際には、モデルの解釈可能性を保証し、不公平さを評価・軽減することで、モデルを理解することが重要です。
+
+ここでは、ローン選択の例を使ってケースを切り分け、各要素が予測に与える影響の度合いを把握してみましょう。
+
+## 評価方法
+
+1. **危害(と利益)を特定する**。最初のステップは、危害と利益を特定することです。行動や決定が、潜在的な顧客とビジネスそのものの両方にどのような影響を与えるかを考えてみましょう。
+
+1. **影響を受けるグループを特定する**。どのような害や利益が発生しうるかを理解したら、影響を受ける可能性のあるグループを特定します。これらのグループは、性別、民族、または社会的グループによって定義されるでしょうか。
+
+1. **公正さの測定基準を定義する**。最後に、状況を改善する際に何を基準にするかの指標を定義します。
+
+### 有害性(および利益)を特定する
+貸与に関連する有害性と利益は何か?偽陰性と偽陽性のシナリオについて考えてみましょう。
+
+**偽陰性(認可しないが、Y=1)** - この場合、ローンを返済できるであろう申請者が拒否されます。これは、融資が資格のある申請者になされなくなるため、不利な事象となります。
+
+**偽陽性(受け入れるが、Y=0)** - この場合、申請者は融資を受けたが、最終的には返済不能(デフォルト)になる。その結果、申請者の事例は債権回収会社に送られ、将来のローン申請に影響を与える可能性があります。
+
+### 影響を受けるグループの特定
+次のステップでは、どのグループが影響を受ける可能性があるかを判断します。例えば、クレジットカードの申請の場合、家計の資産を共有している配偶者と比較して、女性の与信限度額は大幅に低くすべきだとモデルが判断するかもしれません。これにより、ジェンダーで定義される層全体が影響を受けることになります。
+
+### 公正さの測定基準を定義する
+あなたは有害性と影響を受けるグループ(この場合は、性別で定義されている)をここまでに特定しました。次に、定量化された要素を使って、その評価基準を分解します。例えば、以下のデータを使用すると、女性の偽陽性率が最も大きく、男性が最も小さいこと、そしてその逆が偽陰性の場合に当てはまることがわかります。
+
+✅ 今後の"クラスタリング"のレッスンでは、この"混同行列"をコードで構築する方法をご紹介します。
+
+| | 偽陽性率 | 偽陰性率 | サンプル数 |
+| ------------------ | -------- | -------- | ---------- |
+| 女性 | 0.37 | 0.27 | 54032 |
+| 男性 | 0.31 | 0.35 | 28620 |
+| どちらにも属さない | 0.33 | 0.31 | 1266 |
+
+この表から、いくつかのことがわかります。まず、データに含まれる男性と女性どちらでもない人が比較的少ないことがわかります。従ってこのデータは歪んでおり、この数字をどう解釈するかに注意が必要です。
+
+今回の場合、3つのグループと2つの指標があります。このシステムがローン申請者であるお客様のグループにどのような影響を与えるかを考えるときにはこれで十分かもしれません。しかし、より多くのグループを定義したい場合は、これをより小さな要約のまとまりに抽出したいと思うかもしれません。そのためには、偽陰性と偽陽性それぞれの最大値の差や最小の比率など、より多くの要素を追加することができます。
+
+✅ 一旦ここで考えてみてください:ローン申請の際に影響を受けそうな他のグループは?
+
+## 不公平の緩和
+
+不公平を緩和するためには、モデルを探索して様々な緩和モデルを生成し、精度と公平性の間で行うトレードオフを比較して、最も公平なモデルを選択します。
+
+この入門編では、後処理やリダクションのアプローチといったアルゴリズムによる不公平の緩和の詳細については深く触れていませんが、試していきたいツールをここで紹介します。
+
+### Fairlearn
+[Fairlearn](https://fairlearn.github.io/)はオープンソースのPythonパッケージで、システムの公平性を評価し、不公平を緩和することができます。
+
+このツールは、モデルの予測が異なるグループにどのような影響を与えるかを評価し、公平性とパフォーマンスの指標を用いて複数のモデルを比較することを可能にし、二項分類(binary classification)と回帰(regression)における不公平さを緩和するためのアルゴリズムを提供します。
+
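+参考までに、上記のリダクション(reduction)アプローチの一例として、`ExponentiatedGradient` と `DemographicParity` 制約で不公平さを緩和する最小限のスケッチを以下に示します(`X`・`y`・性別の値はすべて説明用の架空データです)。
+
+```python
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from fairlearn.reductions import ExponentiatedGradient, DemographicParity
+
+# 架空のデータ: 特徴量 X、ラベル y、センシティブ属性(性別)
+X = pd.DataFrame({"income": [40, 80, 55, 30, 90, 60], "debt": [5, 2, 7, 9, 1, 4]})
+y = pd.Series([0, 1, 1, 0, 1, 0])
+gender = pd.Series(["女性", "男性", "女性", "女性", "男性", "男性"])
+
+# デモグラフィックパリティ制約の下で分類器を学習し直し、不公平さを緩和する
+mitigator = ExponentiatedGradient(
+    estimator=LogisticRegression(),
+    constraints=DemographicParity(),
+)
+mitigator.fit(X, y, sensitive_features=gender)
+
+print(mitigator.predict(X))  # 緩和後のモデルによる予測
+```
+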
+- Fairlearnの[GitHub](https://github.com/fairlearn/fairlearn/)では、各要素の使用方法を紹介しています。
+
+- [ユーザーガイド](https://fairlearn.github.io/main/user_guide/index.html)、[サンプル](https://fairlearn.github.io/main/auto_examples/index.html)を見る。
+
+- [サンプルノートブック](https://github.com/fairlearn/fairlearn/tree/master/notebooks)を試す。
+
+- Azure Machine Learningで機械学習モデルの[公平性評価を可能にする方法](https://docs.microsoft.com/azure/machine-learning/how-to-machine-learning-fairness-aml?WT.mc_id=academic-15963-cxa)を学ぶ。
+
+- Azure Machine Learningで[サンプルノートブック](https://github.com/Azure/MachineLearningNotebooks/tree/master/contrib/fairness)をチェックして、公平性評価の流れを確認する。
+
+---
+## 🚀 Challenge
+
+そもそも偏りが生じないようにするためには、次のようなことが必要です。
+
+- システムに携わる人たちの背景や考え方を多様化する。
+- 社会の多様性を反映したデータセットに投資する。
+- バイアスが発生したときに、それを検知して修正するためのより良い方法を開発する。
+
+モデルの構築や使用において、不公平が明らかになるような現実のシナリオを考えてみてください。他にどのようなことを考えるべきでしょうか?
+
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/6?loc=ja)
+## Review & Self Study
+
+このレッスンでは、機械学習における公平、不公平の概念の基礎を学びました。
+
+このワークショップを見て、トピックをより深く理解してください:
+
+- YouTube: AIシステムにおける公平性に関連した被害: Hanna Wallach、Miro Dudikによる、事例、評価、緩和策について[AIシステムにおける公平性に関連した被害: Hanna Wallach、Miro Dudikによる、事例、評価、緩和策について - YouTube](https://www.youtube.com/watch?v=1RptHwfkx_k)
+
+- MicrosoftのRAIリソースセンター: [責任あるAIリソース – Microsoft AI](https://www.microsoft.com/ai/responsible-ai-resources?activetab=pivot1%3aprimaryr4)
+
+- MicrosoftのFATE研究グループ: [AIにおけるFATE: Fairness(公平性), Accountability(説明責任), Transparency(透明性), and Ethics(倫理)- Microsoft Research](https://www.microsoft.com/research/theme/fate/)
+
+Fairlearnのツールキットを調べてみましょう
+
+- [Fairlearn](https://fairlearn.org/)
+
+Azure Machine Learningによる、公平性を確保するためのツールについて読む
+
+- [Azure Machine Learning](https://docs.microsoft.com/azure/machine-learning/concept-fairness-ml?WT.mc_id=academic-15963-cxa)
+
+## 課題
+
+[Fairlearnを調査する](./assignment.ja.md)
diff --git a/1-Introduction/3-fairness/translations/README.zh-cn.md b/1-Introduction/3-fairness/translations/README.zh-cn.md
new file mode 100644
index 0000000000..952e819165
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/README.zh-cn.md
@@ -0,0 +1,214 @@
+# 机器学习中的公平性
+
+![机器学习中的公平性概述](../../../sketchnotes/ml-fairness.png)
+> 作者[Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/5/)
+
+## 介绍
+
+在本课程中,你将开始了解机器学习如何影响我们的日常生活。截至目前,系统和模型已经参与到日常决策任务中,例如医疗诊断或发现欺诈。因此,这些模型运行良好,并为每个人提供公平的结果非常重要。
+
+想象一下,当你用于构建这些模型的数据缺少某些人口统计信息时会发生什么情况,例如种族、性别、政治观点、宗教,或者不成比例地代表了这些人口统计信息。当模型的输出被解释为有利于某些人群时又会怎样?这对应用程序会产生什么后果?
+
+在本课中,你将:
+
+- 提高你对机器学习中公平的重要性的认识。
+- 了解公平相关的危害。
+- 了解不公平评估和缓解措施。
+
+## 先决条件
+
+作为先决条件,请选择“负责任的人工智能原则”学习路径并观看以下主题视频:
+
+按照此[学习路径](https://docs.microsoft.com/learn/modules/responsible-ai-principles/?WT.mc_id=academic-15963-cxa)了解有关负责任 AI 的更多信息
+
+[![微软对负责任人工智能的做法](https://img.youtube.com/vi/dnC8-uUZXSc/0.jpg)](https://youtu.be/dnC8-uUZXSc "微软对负责任人工智能的做法")
+
+> 🎥 点击上图观看视频:微软对负责任人工智能的做法
+
+## 数据和算法的不公平性
+
+> “如果你折磨数据足够长的时间,它会坦白一切” - Ronald Coase
+
+这种说法听起来很极端,但数据确实可以被操纵以支持任何结论。这种操纵有时可能是无意中发生的。作为人类,我们都有偏见,当你在数据中引入偏见时,往往很难有意识地知道。
+
+保证人工智能和机器学习的公平性仍然是一项复杂的社会技术挑战。这意味着它不能从纯粹的社会或技术角度来解决。
+
+### 与公平相关的危害
+
+你说的不公平是什么意思?“不公平”包括对一群人的负面影响或“伤害”,例如根据种族、性别、年龄或残疾状况定义的那些人。
+
+与公平相关的主要危害可分为:
+
+- **分配**,如果一个性别或种族比另一个更受青睐。
+- **服务质量**。 如果你针对一种特定场景训练数据,但实际情况要复杂得多,则会导致服务性能不佳。
+- **刻板印象**。 将给定的组与预先分配的属性相关联。
+- **诋毁**。 不公平地批评和标记某事或某人。
+- **代表性过高或过低**。指的是某个群体在某种职业中不被看到,而任何持续强化这种现象的服务或功能都是在造成伤害。
+
+让我们来看看这些例子。
+
+### 分配
+
+考虑一个用于筛选贷款申请的假设系统。该系统倾向于选择白人男性作为比其他群体更好的候选人。因此,某些申请人的贷款被拒。
+
+另一个例子是一家大型公司开发的一种实验性招聘工具,用于筛选应聘者。该工具所使用的模型被训练为偏好与某一性别相关的词语,从而系统性地歧视了另一性别。这导致简历中含有“女子橄榄球队”等字样的候选人受到了不公正的对待。
+
+✅ 做一点研究,找出一个真实的例子
+
+### 服务质量
+
+研究人员发现,与肤色较浅的男性相比,一些商业性的性别分类工具在肤色较深的女性图像上的错误率更高。[参考](https://www.media.mit.edu/publications/gender-shades-intersectional-accuracy-disparities-in-commercial-gender-classification/)
+
+另一个臭名昭著的例子是洗手液分配器,它似乎无法感知皮肤黝黑的人。[参考](https://gizmodo.com/why-cant-this-soap-dispenser-identify-dark-skin-1797931773)
+
+### 刻板印象
+
+机器翻译中存在着刻板的性别观。在将“他是护士,她是医生”翻译成土耳其语时,遇到了一些问题。土耳其语是一种无性别的语言,它有一个代词“o”来表示单数第三人称,但把这个句子从土耳其语翻译成英语,会产生“她是护士,他是医生”这样的刻板印象和错误。
+
+![翻译成土耳其语](../images/gender-bias-translate-en-tr.png)
+
+![翻译成英语](../images/gender-bias-translate-tr-en.png)
+
+### 诋毁
+
+一种图像标记技术,臭名昭著地将深色皮肤的人的图像错误地标记为大猩猩。错误的标签是有害的,不仅仅是因为这个系统犯了一个错误,而且它还特别使用了一个长期以来被故意用来诋毁黑人的标签。
+
+[![AI: 我不是女人吗?](https://img.youtube.com/vi/QxuyfWoVV98/0.jpg)](https://www.youtube.com/watch?v=QxuyfWoVV98 "AI, 我不是女人吗?")
+> 🎥 点击上图观看视频:AI,我不是女人吗 - 一场展示AI种族主义诋毁造成的伤害的表演
+
+### 代表性过高或过低
+
+有倾向性的图像搜索结果就是一个很好的例子。在搜索男性比例等于或高于女性的职业的图片时,比如工程或首席执行官,要注意那些更倾向于特定性别的结果。
+
+![必应CEO搜索](../images/ceos.png)
+> 在Bing上搜索“CEO”会得到相当具有包容性的结果
+
+这五种主要类型的危害不是相互排斥的,一个单一的系统可以表现出一种以上的危害。此外,每个案例的严重程度各不相同。例如,不公平地给某人贴上罪犯的标签比给形象贴上错误的标签要严重得多。然而,重要的是要记住,即使是相对不严重的伤害也会让人感到疏远或被孤立,累积的影响可能会非常压抑。
+
+✅ **讨论**:重温一些例子,看看它们是否显示出不同的危害。
+
+| | 分配 | 服务质量 | 刻板印象 | 诋毁 | 代表性过高或过低 |
+| ------------ | :---: | :------: | :------: | :---: | :--------------: |
+| 自动招聘系统 | x | x | x | | x |
+| 机器翻译 | | | | | |
+| 照片加标签 | | | | | |
+
+
+## 检测不公平
+
+给定系统行为不公平的原因有很多。例如,社会偏见可能会反映在用于训练它们的数据集中。例如,过度依赖历史数据可能会加剧招聘不公平。通过使用过去10年提交给公司的简历中的模式,该模型确定男性更合格,因为大多数简历来自男性,这反映了过去男性在整个科技行业的主导地位。
+
+关于特定人群的数据不足可能是不公平的原因。例如,图像分类器对于深肤色人的图像具有较高的错误率,因为数据中没有充分代表较深的肤色。
+
+开发过程中做出的错误假设也会导致不公平。例如,旨在根据人脸图像预测谁将犯罪的面部分析系统可能会导致破坏性假设。这可能会对错误分类的人造成重大伤害。
+
+## 了解你的模型并建立公平性
+
+尽管公平性的许多方面都没有包含在量化公平性指标中,并且不可能从系统中完全消除偏见以保证公平性,但你仍然有责任尽可能多地检测和缓解公平性问题。
+
+当你使用机器学习模型时,通过确保模型的可解释性以及评估和减轻不公平性来理解模型非常重要。
+
+让我们使用贷款选择示例来作为分析案例,以确定每个因素对预测的影响程度。
+
+## 评价方法
+
+1. **识别危害(和好处)**。第一步是找出危害和好处。思考行动和决策如何影响潜在客户和企业本身。
+
+2. **确定受影响的群体**。一旦你了解了什么样的伤害或好处可能会发生,找出可能受到影响的群体。这些群体是按性别、种族或社会群体界定的吗?
+
+3. **定义公平性度量**。最后,定义一个度量标准,这样你就可以在工作中衡量一些东西来改善这种情况。
+
+### 识别危害(和好处)
+
+与贷款相关的危害和好处是什么?想想假阴性和假阳性的情况:
+
+**假阴性**(拒绝,但Y=1)-在这种情况下,将拒绝有能力偿还贷款的申请人。这是一个不利的事件,因为贷款资源没有发放给合格的申请人。
+
+**假阳性**(接受,但Y=0)-在这种情况下,申请人确实获得了贷款,但最终违约。因此,申请人的案件将被送往一个债务催收机构,这可能会影响他们未来的贷款申请。
+
+### 确定受影响的群体
+
+下一步是确定哪些群体可能受到影响。例如,在信用卡申请的情况下,模型可能会确定女性应获得比共享家庭资产的配偶低得多的信用额度。因此,由性别定义的整个人口统计数据都会受到影响。
+
+### 定义公平性度量
+
+你已经确定了伤害和受影响的群体,在本例中,是按性别划分的。现在,使用量化因子来分解它们的度量。例如,使用下面的数据,你可以看到女性的假阳性率最大,男性的假阳性率最小,而对于假阴性则相反。
+
+✅ 在以后关于聚类的课程中,你将看到如何在代码中构建这个“混淆矩阵”
+
+| | 假阳性率 | 假阴性率 | 数量 |
+| ---------- | -------- | -------- | ----- |
+| 女性 | 0.37 | 0.27 | 54032 |
+| 男性 | 0.31 | 0.35 | 28620 |
+| 未列出性别 | 0.33 | 0.31 | 1266 |
+
+
+这个表格告诉我们几件事。首先,我们注意到数据中的未列出性别的人相对较少。数据是有偏差的,所以你需要小心解释这些数字。
+
+在本例中,我们有3个组和2个度量。当我们考虑我们的系统如何影响贷款申请人的客户群时,这可能就足够了,但是当你想要定义更多的组时,你可能需要将其提取到更小的摘要集。为此,你可以添加更多的度量,例如每个假阴性和假阳性的最大差异或最小比率。
+
+✅ 停下来想一想:还有哪些群体可能会受到贷款申请的影响?
+
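+作为参考,下面给出一个极简的示意代码(数据纯属虚构),演示如何按群体计算混淆矩阵,并由此得到上表中的假阳性率和假阴性率。
+
+```python
+import pandas as pd
+from sklearn.metrics import confusion_matrix
+
+# 虚构数据:真实标签、模型预测以及每位申请人的性别
+df = pd.DataFrame({
+    "y_true": [1, 0, 1, 1, 0, 0, 1, 0],
+    "y_pred": [1, 1, 0, 1, 0, 1, 1, 0],
+    "gender": ["女性", "女性", "女性", "男性", "男性", "男性", "未列出性别", "未列出性别"],
+})
+
+# 按群体计算混淆矩阵,并由此得到假阳性率和假阴性率
+for group, sub in df.groupby("gender"):
+    tn, fp, fn, tp = confusion_matrix(sub["y_true"], sub["y_pred"], labels=[0, 1]).ravel()
+    fpr = fp / (fp + tn) if (fp + tn) else float("nan")
+    fnr = fn / (fn + tp) if (fn + tp) else float("nan")
+    print(group, "假阳性率:", round(fpr, 2), "假阴性率:", round(fnr, 2), "数量:", len(sub))
+```
+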
+## 减轻不公平
+
+为了缓解不公平,探索模型生成各种缓解模型,并比较其在准确性和公平性之间的权衡,以选择最公平的模型。
+
+这个介绍性的课程并没有深入探讨算法不公平缓解的细节,比如后处理和减少方法,但是这里有一个你可能想尝试的工具。
+
+### Fairlearn
+
+[Fairlearn](https://fairlearn.github.io/) 是一个开源Python包,可让你评估系统的公平性并减轻不公平性。
+
+该工具可帮助你评估模型的预测如何影响不同的组,使你能够通过使用公平性和性能指标来比较多个模型,并提供一组算法来减轻二元分类和回归中的不公平性。
+
+- 通过查看Fairlearn的[GitHub](https://github.com/fairlearn/fairlearn/)了解如何使用不同的组件
+
+- 浏览[用户指南](https://fairlearn.github.io/main/user_guide/index.html), [示例](https://fairlearn.github.io/main/auto_examples/index.html)
+
+- 尝试一些 [示例Notebook](https://github.com/fairlearn/fairlearn/tree/master/notebooks).
+
+- 了解Azure机器学习中机器学习模型[如何启用公平性评估](https://docs.microsoft.com/azure/machine-learning/how-to-machine-learning-fairness-aml?WT.mc_id=academic-15963-cxa)。
+
+- 看看这些[示例Notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/contrib/fairness)了解Azure机器学习中的更多公平性评估场景。
+
+---
+## 🚀 挑战
+
+为了防止首先引入偏见,我们应该:
+
+- 在系统工作人员中有不同的背景和观点
+
+- 获取反映我们社会多样性的数据集
+
+- 开发更好的方法来检测和纠正偏差
+
+想想现实生活中的场景,在模型构建和使用中明显存在不公平。我们还应该考虑什么?
+
+## [课后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/6/)
+## 复习与自学
+
+在本课中,你学习了机器学习中公平和不公平概念的一些基础知识。
+
+观看本次研讨会,深入探讨以下主题:
+
+- YouTube:人工智能系统中与公平相关的危害:示例、评估和缓解Hanna Wallach和Miro Dudik[人工智能系统中与公平相关的危害:示例、评估和缓解-YouTube](https://www.youtube.com/watch?v=1RptHwfkx_k)
+
+另外,请阅读:
+
+- 微软RAI资源中心:[负责人工智能资源-微软人工智能](https://www.microsoft.com/ai/responsible-ai-resources?activetab=pivot1%3aprimaryr4)
+
+- 微软FATE研究小组:[FATE:AI 中的公平、问责、透明和道德-微软研究院](https://www.microsoft.com/research/theme/fate/)
+
+探索Fairlearn工具箱
+
+[Fairlearn](https://fairlearn.org/)
+
+了解Azure机器学习的工具以确保公平性
+
+- [Azure机器学习](https://docs.microsoft.com/azure/machine-learning/concept-fairness-ml?WT.mc_id=academic-15963-cxa)
+
+## 任务
+
+[探索Fairlearn](assignment.zh-cn.md)
diff --git a/1-Introduction/3-fairness/translations/assignment.es.md b/1-Introduction/3-fairness/translations/assignment.es.md
new file mode 100644
index 0000000000..cf83256ef4
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/assignment.es.md
@@ -0,0 +1,11 @@
+# Explore Fairlearn
+
+## Instrucciones
+
+En esta lección, aprendió sobre Fairlearn, un "proyecto open-source impulsado por la comunidad para ayudar a los científicos de datos a mejorar la equidad de los sistemas de AI." Para esta tarea, explore uno de los [cuadernos](https://fairlearn.org/v0.6.2/auto_examples/index.html) de Fairlearn e informe sus hallazgos en un documento o presentación.
+
+## Rúbrica
+
+| Criterios | Ejemplar | Adecuado | Necesita mejorar |
+| -------- | --------- | -------- | ----------------- |
+| | Un documento o presentación powerpoint es presentado discutiendo los sistemas de Fairlearn, el cuaderno que fue ejecutado, y las conclusiones extraídas al ejecutarlo | Un documento es presentado sin conclusiones | No se presenta ningún documento |
diff --git a/1-Introduction/3-fairness/translations/assignment.id.md b/1-Introduction/3-fairness/translations/assignment.id.md
new file mode 100644
index 0000000000..90389a14de
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/assignment.id.md
@@ -0,0 +1,11 @@
+# Jelajahi Fairlearn
+
+## Instruksi
+
+Dalam pelajaran ini kamu telah belajar mengenai Fairlearn, sebuah "proyek *open-source* berbasis komunitas untuk membantu para *data scientist* meningkatkan keadilan dari sistem AI." Untuk penugasan kali ini, jelajahi salah satu dari [notebook](https://fairlearn.org/v0.6.2/auto_examples/index.html) yang disediakan Fairlearn dan laporkan penemuanmu dalam sebuah paper atau presentasi.
+
+## Rubrik
+
+| Kriteria | Sangat Bagus | Cukup | Perlu Peningkatan |
+| -------- | --------- | -------- | ----------------- |
+| | Sebuah *paper* atau presentasi powerpoint yang membahas sistem Fairlearn, *notebook* yang dijalankan, dan kesimpulan yang diambil dari hasil menjalankannya | Sebuah paper yang dipresentasikan tanpa kesimpulan | Tidak ada paper yang dipresentasikan |
diff --git a/1-Introduction/3-fairness/translations/assignment.it.md b/1-Introduction/3-fairness/translations/assignment.it.md
new file mode 100644
index 0000000000..4523fbb3d9
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Esplorare Fairlearn
+
+## Istruzioni
+
+In questa lezione si è appreso di Fairlearn, un "progetto open source guidato dalla comunità per aiutare i data scientist a migliorare l'equità dei sistemi di intelligenza artificiale". Per questo compito, esplorare uno dei [notebook](https://fairlearn.org/v0.6.2/auto_examples/index.html) di Fairlearn e riportare i propri risultati in un documento o in una presentazione.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | --------- | -------- | ----------------- |
+| | Viene presentato un documento o una presentazione powerpoint in cui si discutono i sistemi di Fairlearn, il notebook che è stato eseguito e le conclusioni tratte dall'esecuzione | Viene presentato un documento senza conclusioni | Non viene presentato alcun documento |
diff --git a/1-Introduction/3-fairness/translations/assignment.ja.md b/1-Introduction/3-fairness/translations/assignment.ja.md
new file mode 100644
index 0000000000..dbf7b2b46b
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/assignment.ja.md
@@ -0,0 +1,11 @@
+# Fairlearnを調査する
+
+## 指示
+
+このレッスンでは、「データサイエンティストがAIシステムの公平性を向上させるための、オープンソースでコミュニティ主導のプロジェクト」であるFairlearnについて学習しました。この課題では、Fairlearnの [ノートブック](https://fairlearn.org/v0.6.2/auto_examples/index.html) のうちのひとつを調査し、わかったことをレポートやプレゼンテーションの形で報告してください。
+
+## 評価基準
+
+| 基準 | 模範的 | 十分 | 要改善 |
+| ---- | --------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- | -------------------------- |
+| | Fairlearnのシステム・実行したノートブック・実行によって得られた結果が、レポートやパワーポイントのプレゼンテーションとして提示されている | 結論のないレポートが提示されている | レポートが提示されていない |
diff --git a/1-Introduction/3-fairness/translations/assignment.zh-cn.md b/1-Introduction/3-fairness/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..a812419948
--- /dev/null
+++ b/1-Introduction/3-fairness/translations/assignment.zh-cn.md
@@ -0,0 +1,11 @@
+# 探索 Fairlearn
+
+## 说明
+
+在这节课中,你了解了 Fairlearn,一个“开源的,社区驱动的项目,旨在帮助数据科学家们提高人工智能系统的公平性”。在这项作业中,探索 Fairlearn [笔记本](https://fairlearn.org/v0.6.2/auto_examples/index.html)中的一个例子,之后你可以用论文或者 ppt 的形式叙述你学习后的发现。
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | --------- | -------- | ----------------- |
+| | 提交了一篇讨论 Fairlearn 系统、所运行的示例以及运行后得出的结论的论文或 ppt | 提交了一篇没有结论的论文 | 没有提交论文 |
diff --git a/1-Introduction/4-techniques-of-ML/README.md b/1-Introduction/4-techniques-of-ML/README.md
index ec42fe705e..1b87fd7ce2 100644
--- a/1-Introduction/4-techniques-of-ML/README.md
+++ b/1-Introduction/4-techniques-of-ML/README.md
@@ -4,8 +4,9 @@ The process of building, using, and maintaining machine learning models and the
- Understand the processes underpinning machine learning at a high level.
- Explore base concepts such as 'models', 'predictions', and 'training data'.
-
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/7/)
+
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/7/)
+
## Introduction
On a high level, the craft of creating machine learning (ML) processes is comprised of a number of steps:
@@ -35,25 +36,31 @@ Before starting to build your model, there are several tasks you need to complet
To be able to answer your question with any kind of certainty, you need a good amount of data of the right type. There are two things you need to do at this point:
- **Collect data**. Keeping in mind the previous lesson on fairness in data analysis, collect your data with care. Be aware of the sources of this data, any inherent biases it might have, and document its origin.
-- **Prepare data**. There are several steps in the data preparation process. You might need to collate data and normalize it if it comes from diverse sources. You can improve the data's quality and quantity through various methods such as converting strings to numbers (as we do in [Clustering](../../5-Clustering/1-Visualize/README.md)). You might also generate new data, based on the original (as we do in [Classification](../../4-Classification/1-Introduction/README.md)). You can clean and edit the data (as we did prior to the [Web App](../3-Web-App/README.md) lesson). Finally, you might also need to randomize it and shuffle it, depending on your training techniques.
+- **Prepare data**. There are several steps in the data preparation process. You might need to collate data and normalize it if it comes from diverse sources. You can improve the data's quality and quantity through various methods such as converting strings to numbers (as we do in [Clustering](../../5-Clustering/1-Visualize/README.md)). You might also generate new data, based on the original (as we do in [Classification](../../4-Classification/1-Introduction/README.md)). You can clean and edit the data (as we will prior to the [Web App](../../3-Web-App/README.md) lesson). Finally, you might also need to randomize it and shuffle it, depending on your training techniques.
✅ After collecting and processing your data, take a moment to see if its shape will allow you to address your intended question. It may be that the data will not perform well in your given task, as we discover in our [Clustering](../../5-Clustering/1-Visualize/README.md) lessons!
-### Selecting your feature variable
+### Features and Target
+
+A [feature](https://www.datasciencecentral.com/profiles/blogs/an-introduction-to-variable-and-feature-selection) is a measurable property of your data. In many datasets it is expressed as a column heading like 'date', 'size' or 'color'. Your feature variable, usually represented as `X` in code, represents the input variable which will be used to train the model.
+
+A target is a thing you are trying to predict. The target, usually represented as `y` in code, represents the answer to the question you are trying to ask of your data: in December, what **color** pumpkins will be cheapest? in San Francisco, what neighborhoods will have the best real estate **price**? Sometimes the target is also referred to as the label attribute.
-A [feature](https://www.datasciencecentral.com/profiles/blogs/an-introduction-to-variable-and-feature-selection) is a measurable property of your data. In many datasets it is expressed as a column heading like 'date' 'size' or 'color'. Your feature variable, usually represented as `y` in code, represents the answer to the question you are trying to ask of your data: in December, what **color** pumpkins will be cheapest? in San Francisco, what neighborhoods will have the best real estate **price**?
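+
+As a quick illustration, with a small, made-up pumpkin dataframe, selecting the features and the target might look something like this:
+
+```python
+import pandas as pd
+
+# A small, made-up dataset of pumpkin sales
+pumpkins = pd.DataFrame({
+    "month": [9, 10, 11, 12],
+    "variety": ["pie", "pie", "fairytale", "fairytale"],
+    "price": [3.5, 3.0, 6.0, 5.5],
+})
+
+X = pumpkins[["month", "variety"]]  # feature variables: the model's input
+y = pumpkins["price"]               # target variable: what we want to predict
+```
+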
+### Selecting your feature variable
🎓 **Feature Selection and Feature Extraction** How do you know which variable to choose when building a model? You'll probably go through a process of feature selection or feature extraction to choose the right variables for the most performant model. They're not the same thing, however: "Feature extraction creates new features from functions of the original features, whereas feature selection returns a subset of the features." ([source](https://wikipedia.org/wiki/Feature_selection))
+
### Visualize your data
-An important aspect of the data scientist's toolkit is the power to visualize data using several excellent libraries such as Seaborn or MatPlotLib. Representing your data visually might allow you to uncover hidden correlations that you can leverage. Your visualizations might also help you to uncover bias or unbalanced data (as we discover in [Classification](../../4-Classification/2-Classifiers-1/README.md)).
+An important aspect of the data scientist's toolkit is the power to visualize data using several excellent libraries such as Seaborn or MatPlotLib. Representing your data visually might allow you to uncover hidden correlations that you can leverage. Your visualizations might also help you to uncover bias or unbalanced data (as we discover in [Classification](../../4-Classification/2-Classifiers-1/README.md)).
+
### Split your dataset
Prior to training, you need to split your dataset into two or more parts of unequal size that still represent the data well.
- **Training**. This part of the dataset is fit to your model to train it. This set constitutes the majority of the original dataset.
- **Testing**. A test dataset is an independent group of data, often gathered from the original data, that you use to confirm the performance of the built model.
-- **Validating**. A validation set is a smaller independent group of examples that you use to tune the model's hyperparameters, or architecture, to improve the model. Depending on your data's size and the question you are asking, you might not need to build this third set (as we note in [Time Series Forecasting](../7-TimeSeries/1-Introduction/README.md)).
+- **Validating**. A validation set is a smaller independent group of examples that you use to tune the model's hyperparameters, or architecture, to improve the model. Depending on your data's size and the question you are asking, you might not need to build this third set (as we note in [Time Series Forecasting](../../7-TimeSeries/1-Introduction/README.md)).
## Building a model
@@ -61,10 +68,12 @@ Using your training data, your goal is to build a model, or a statistical repres
### Decide on a training method
-Depending on your question and the nature of your data, your will choose a method to train it. Stepping through [Scikit-learn's documentation](https://scikit-learn.org/stable/user_guide.html) - which we use in this course - you can explore many ways to train a model. Depending on your experience, you might have to try several different methods to build the best model. You are likely to go through a process whereby data scientists evaluate the performance of a model by feeding it unseen data, checking for accuracy, bias, and other quality-degrading issues, and selecting the most appropriate training method for the task at hand.
+Depending on your question and the nature of your data, you will choose a method to train it. Stepping through [Scikit-learn's documentation](https://scikit-learn.org/stable/user_guide.html) - which we use in this course - you can explore many ways to train a model. Depending on your experience, you might have to try several different methods to build the best model. You are likely to go through a process whereby data scientists evaluate the performance of a model by feeding it unseen data, checking for accuracy, bias, and other quality-degrading issues, and selecting the most appropriate training method for the task at hand.
+
### Train a model
-Armed with your training data, you are ready to 'fit' it to create a model. You will notice that in many ML libraries you will find the code 'model.fit' - it is at this time that you send in your data as an array of values (usually 'X') and a feature variable (usually 'y').
+Armed with your training data, you are ready to 'fit' it to create a model. You will notice that in many ML libraries you will find the code 'model.fit' - it is at this time that you send in your feature variable as an array of values (usually 'X') and a target variable (usually 'y').
+
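+As a minimal sketch with made-up numbers, the split-then-fit flow might look something like this with Scikit-learn:
+
+```python
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+
+# Made-up numeric data: one feature column and a target
+X = np.array([[1], [2], [3], [4], [5], [6], [7], [8]])
+y = np.array([2.1, 4.0, 6.2, 8.1, 9.9, 12.2, 14.1, 15.8])
+
+# Hold back part of the data so the model can be tested on unseen examples
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
+
+model = LinearRegression()
+model.fit(X_train, y_train)         # 'fit' the model to the training data
+
+print(model.predict(X_test))        # predictions for the held-back test data
+print(model.score(X_test, y_test))  # a simple quality check (R^2) on unseen data
+```
+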
### Evaluate the model
Once the training process is complete (it can take many iterations, or 'epochs', to train a large model), you will be able to evaluate the model's quality by using test data to gauge its performance. This data is a subset of the original data that the model has not previously analyzed. You can print out a table of metrics about your model's quality.
@@ -94,7 +103,7 @@ In these lessons, you will discover how to use these steps to prepare, build, te
Draw a flow chart reflecting the steps of a ML practitioner. Where do you see yourself right now in the process? Where do you predict you will find difficulty? What seems easy to you?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/8/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/8/)
## Review & Self Study
diff --git a/1-Introduction/4-techniques-of-ML/translations/README.es.md b/1-Introduction/4-techniques-of-ML/translations/README.es.md
old mode 100644
new mode 100755
index e69de29bb2..0121527ef8
--- a/1-Introduction/4-techniques-of-ML/translations/README.es.md
+++ b/1-Introduction/4-techniques-of-ML/translations/README.es.md
@@ -0,0 +1,112 @@
+# Técnicas de Machine Learning
+
+El proceso de creación, uso y mantenimiento de modelos de machine learning, y los datos que se utilizan, es un proceso muy diferente de muchos otros flujos de trabajo de desarrollo. En esta lección, desmitificaremos el proceso, y describiremos las principales técnicas que necesita saber. Vas a:
+
+- Comprender los procesos que sustentan el machine learning a un alto nivel.
+- Explorar conceptos básicos como 'modelos', 'predicciones', y 'datos de entrenamiento'
+
+
+## [Cuestionario previo a la conferencia](https://white-water-09ec41f0f.azurestaticapps.net/quiz/7/)
+## Introducción
+
+A un alto nivel, el arte de crear procesos de machine learning (ML) se compone de una serie de pasos:
+
+1. **Decidir sobre la pregunta**. La mayoría de los procesos de ML, comienzan por hacer una pregunta que no puede ser respondida por un simple programa condicional o un motor basado en reglas. Esas preguntas a menudo giran en torno a predicciones basadas en una recopilación de datos.
+2. **Recopile y prepare datos**. Para poder responder a su pregunta, necesita datos. La calidad y, a veces, cantidad de sus datos determinarán qué tan bien puede responder a su pregunta inicial. La visualización de datos es un aspecto importante de esta fase. Esta fase también incluye dividir los datos en un grupo de entrenamiento y pruebas para construir un modelo.
+3. **Elige un método de entrenamiento**. Dependiendo de su pregunta y la naturaleza de sus datos, debe elegir cómo desea entrenar un modelo para reflejar mejor sus datos y hacer predicciones precisas contra ellos. Esta es la parte de su proceso de ML que requiere experiencia específica y, a menudo, una cantidad considerable de experimentación.
+4. **Entrena el modelo**. Usando sus datos de entrenamiento, usará varios algoritmos para entrenar un modelo para reconocer patrones en los datos. El modelo puede aprovechar las ponderaciones internas que se pueden ajustar para privilegiar ciertas partes de los datos sobre otras para construir un modelo mejor.
+5. **Evaluar el modelo**. Utiliza datos nunca antes vistos (sus datos de prueba) de su conjunto recopilado para ver cómo se está desempeñando el modelo.
+6. **Ajuste de parámetros**. Según el rendimiento de su modelo, puede rehacer el proceso utilizando diferentes parámetros, o variables, que controlan el comportamiento de los algoritmos utilizados para entrenar el modelo.
+7. **Predecir**. Utilice nuevas entradas para probar la precisión de su modelo.
+
+## Que pregunta hacer
+
+Las computadoras son particularmente hábiles para descubrir patrones ocultos en los datos. Esta capacidad es muy útil para los investigadores que tienen preguntas sobre un dominio determinado que no pueden responderse fácilmente mediante la creación de un motor de reglas basado en condicionales. Dada una tarea actuarial, por ejemplo, un científico de datos podría construir reglas artesanales sobre la mortalidad de los fumadores frente a los no fumadores.
+
+Sin embargo, cuando se incorporan muchas otras variables a la ecuación, un modelo de ML podría resultar más eficiente para predecir las tasas de mortalidad futuras en función de los antecedentes de salud. Un ejemplo más alegre podría ser hacer predicciones meteorológicas para el mes de abril en una ubicación determinada que incluya latitud, longitud, cambio climático, proximidad al océano, patrones de la corriente en chorro, y más.
+
+✅ Esta [presentación de diapositivas](https://www2.cisl.ucar.edu/sites/default/files/0900%20June%2024%20Haupt_0.pdf) sobre modelos meteorológicos ofrece una perspectiva histórica del uso de ML en el análisis meteorológico.
+
+## Tareas previas a la construcción
+
+Antes de comenzar a construir su modelo, hay varias tareas que debe completar. Para probar su pregunta y formar una hipótesis basada en las predicciones de su modelo, debe identificar y configurar varios elementos.
+
+### Datos
+
+Para poder responder su pregunta con cualquier tipo de certeza, necesita una buena cantidad de datos del tipo correcto.
+Hay dos cosas que debe hacer en este punto:
+
+- **Recolectar datos**. Teniendo en cuenta la lección anterior sobre la equidad en el análisis de datos, recopile sus datos con cuidado. Tenga en cuenta la fuente de estos datos, cualquier sesgo inherente que pueda tener y documente su origen.
+- **Preparar datos**. Hay varios pasos en el proceso de preparación de datos. Podría necesitar recopilar datos y normalizarlos si provienen de diversas fuentes. Puede mejorar la calidad y cantidad de los datos mediante varios métodos, como convertir strings en números (como hacemos en [Clustering](../../5-Clustering/1-Visualize/README.md)). También puede generar nuevos datos, basados en los originales (como hacemos en [Clasificación](../../4-Classification/1-Introduction/README.md)). Puede limpiar y editar los datos (como lo haremos antes de la lección [Web App](../../3-Web-App/README.md)). Por último, es posible que también deba aleatorizarlos y mezclarlos, según sus técnicas de entrenamiento.
+
+✅ Después de recopilar y procesar sus datos, tómese un momento para ver si su forma le permitirá responder a su pregunta. ¡Puede ser que los datos no funcionen bien en su tarea dada, como descubriremos en nuestras lecciones de [Clustering](../../5-Clustering/1-Visualize/README.md)!
+
+### Características y objetivo
+
+Una característica es una propiedad medible de los datos. En muchos conjuntos de datos se expresa como un encabezado de columna como 'date', 'size' o 'color'. La variable de característica, normalmente representada como `X` en el código, representa la variable de entrada que se utilizará para entrenar el modelo.
+
+Un objetivo (target) es aquello que está tratando de predecir. El objetivo, generalmente representado como `y` en el código, representa la respuesta a la pregunta que está tratando de hacerle a sus datos: en diciembre, ¿qué color de calabazas serán más baratas? en San Francisco, ¿qué barrios tendrán el mejor precio de bienes raíces? A veces, el objetivo también se conoce como atributo de etiqueta (label).
+
+### Seleccionando su variable característica
+
+🎓 **Selección y extracción de características** ¿Cómo sabe qué variable elegir al construir un modelo? Probablemente pasará por un proceso de selección o extracción de características para elegir las variables correctas para un mayor rendimiento del modelo. Sin embargo, no son lo mismo: "La extracción de características crea nuevas características a partir de funciones de las características originales, mientras que la selección de características devuelve un subconjunto de las características." ([fuente](https://wikipedia.org/wiki/Feature_selection))
+
+### Visualiza tus datos
+
+Un aspecto importante del conjunto de herramientas del científico de datos es el poder de visualizar datos utilizando varias bibliotecas excelentes como Seaborn o MatPlotLib. Representar sus datos visualmente puede permitirle descubrir correlaciones ocultas que puede aprovechar. Sus visualizaciones también pueden ayudarlo a descubrir sesgos o datos desequilibrados (como descubrimos en [Clasificación](../../4-Classification/2-Classifiers-1/README.md)).
+
+### Divide tu conjunto de datos
+
+Antes del entrenamiento, debe dividir su conjunto de datos en dos o más partes de tamaño desigual que aún representen bien los datos.
+
+- **Entrenamiento**. Esta parte del conjunto de datos se ajusta a su modelo para entrenarlo. Este conjunto constituye la mayor parte del conjunto de datos original.
+- **Pruebas**. Un conjunto de datos de pruebas es un grupo independiente de datos, a menudo recopilado a partir de los datos originales, que se utiliza para confirmar el rendimiento del modelo construido.
+- **Validación**. Un conjunto de validación es un pequeño grupo independiente de ejemplos que se usa para ajustar los hiperparámetros o la arquitectura del modelo para mejorar el modelo. Dependiendo del tamaño de su conjunto de datos y de la pregunta que se está haciendo, es posible que no necesite crear este tercer conjunto (como notamos en [Pronóstico de series de tiempo](../../7-TimeSeries/1-Introduction/README.md)).
+
+## Construye un modelo
+
+Usando sus datos de entrenamiento, su objetivo es construir un modelo, o una representación estadística de sus datos, usando varios algoritmos para **entrenarlo**. El entrenamiento de un modelo lo expone a los datos y le permite hacer suposiciones sobre los patrones percibidos que descubre, valida y rechaza.
+
+### Decide un método de entrenamiento
+
+Dependiendo de su pregunta y la naturaleza de sus datos, elegirá un método para entrenarlos. Pasando por la [documentación de Scikit-learn](https://scikit-learn.org/stable/user_guide.html) - que usamos en este curso - puede explorar muchas formas de entrenar un modelo. Dependiendo de su experiencia, es posible que deba probar varios métodos diferentes para construir el mejor modelo. Es probable que pase por un proceso en el que los científicos de datos evalúan el rendimiento de un modelo alimentándolo con datos no vistos anteriormente por el modelo, verificando la precisión, el sesgo, y otros problemas que degradan la calidad, y seleccionando el método de entrenamiento más apropiado para la tarea en cuestión.
+
+### Entrena un modelo
+
+Armado con sus datos de entrenamiento, está listo para "ajustarlos" y crear un modelo. Notará que en muchas bibliotecas de ML encontrará el código 'model.fit' - es en este momento que envía su variable de característica como una matriz de valores (generalmente `X`) y una variable objetivo (generalmente `y`).
+
+### Evaluar el modelo
+
+Una vez que se completa el proceso de entrenamiento (puede tomar muchas iteraciones, o 'épocas', entrenar un modelo de gran tamaño), podrá evaluar la calidad del modelo utilizando datos de prueba para medir su rendimiento. Estos datos son un subconjunto de los datos originales que el modelo no ha analizado previamente. Puede imprimir una tabla de métricas sobre la calidad de su modelo.
+
+🎓 **Ajuste del modelo (Model fitting)**
+
+En el contexto del machine learning, el ajuste del modelo se refiere a la precisión de la función subyacente del modelo cuando intenta analizar datos con los que no está familiarizado.
+
+🎓 **Ajuste insuficiente (Underfitting)** y **sobreajuste (overfitting)** son problemas comunes que degradan la calidad del modelo, ya que el modelo no encaja suficientemente bien, o encaja demasiado bien. Esto hace que el modelo haga predicciones demasiado estrechamente alineadas o demasiado poco alineadas con sus datos de entrenamiento. Un modelo sobreajustado (overfitting) predice demasiado bien los datos de entrenamiento porque ha aprendido demasiado bien los detalles de los datos y el ruido. Un modelo insuficientemente ajustado (underfitting) no es preciso, ya que ni puede analizar con precisión sus datos de entrenamiento ni los datos que aún no ha 'visto'.
+
+![overfitting model](../images/overfitting.png)
+> Infografía de [Jen Looper](https://twitter.com/jenlooper)
+
+## Ajuste de parámetros
+
+Una vez que haya completado su entrenamiento inicial, observe la calidad del modelo y considere mejorarlo ajustando sus 'hiperparámetros'. Lea más sobre el proceso [en la documentación](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?WT.mc_id=academic-15963-cxa).
+
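+Como esbozo mínimo (con datos inventados), un ajuste de hiperparámetros mediante búsqueda en cuadrícula podría verse, por ejemplo, así con `GridSearchCV` de Scikit-learn:
+
+```python
+import numpy as np
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KNeighborsClassifier
+
+# Datos inventados, solo para ilustrar
+X = np.arange(12).reshape(-1, 1)  # una sola característica numérica
+y = np.array([0] * 6 + [1] * 6)   # etiquetas binarias
+
+# Probar varios valores del hiperparámetro n_neighbors con validación cruzada
+busqueda = GridSearchCV(
+    KNeighborsClassifier(),
+    param_grid={"n_neighbors": [1, 3, 5]},
+    cv=3,
+)
+busqueda.fit(X, y)
+
+print(busqueda.best_params_)  # la mejor combinación de hiperparámetros encontrada
+print(busqueda.best_score_)   # su puntuación media de validación cruzada
+```
+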
+## Predicción
+
+Este es el momento en el que puede usar datos completamente nuevos para probar la precisión de su modelo. En una configuración de ML aplicada, donde está creando activos web para usar el modelo en producción, este proceso puede implicar la recopilación de la entrada del usuario (presionar un botón, por ejemplo) para establecer una variable y enviarla al modelo para la inferencia, o evaluación.
+
+En estas lecciones, descubrirá cómo utilizar estos pasos para preparar, construir, probar, evaluar, y predecir - todos los gestos de un científico de datos y más, a medida que avanza en su viaje para convertirse en un ingeniero de machine learning 'full stack'.
+
+---
+
+## 🚀Desafío
+
+Dibuje un diagrama de flujo que refleje los pasos de un practicante de ML. ¿Dónde se ve ahora mismo en el proceso? ¿Dónde predice que encontrará dificultades? ¿Qué le parece fácil?
+
+## [Cuestionario posterior a la conferencia](https://white-water-09ec41f0f.azurestaticapps.net/quiz/8/)
+
+## Revisión & Autoestudio
+
+Busque en línea entrevistas con científicos de datos que analicen su trabajo diario. Aquí está [uno](https://www.youtube.com/watch?v=Z3IjgbbCEfs).
+
+## Asignación
+
+[Entrevistar a un científico de datos](assignment.md)
diff --git a/1-Introduction/4-techniques-of-ML/translations/README.id.md b/1-Introduction/4-techniques-of-ML/translations/README.id.md
new file mode 100644
index 0000000000..699603702e
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/README.id.md
@@ -0,0 +1,111 @@
+# Teknik-teknik Machine Learning
+
+Proses membangun, menggunakan, dan memelihara model machine learning dan data yang digunakan adalah proses yang sangat berbeda dari banyak alur kerja pengembangan lainnya. Dalam pelajaran ini, kita akan mengungkap prosesnya dan menguraikan teknik utama yang perlu Kamu ketahui. Kamu akan:
+
+- Memahami gambaran dari proses yang mendasari machine learning.
+- Menjelajahi konsep dasar seperti '*models*', '*predictions*', dan '*training data*'.
+
+## [Quiz Pra-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/7/)
+## Pengantar
+
+Gambaran membuat proses machine learning (ML) terdiri dari sejumlah langkah:
+
+1. **Menentukan pertanyaan**. Sebagian besar proses ML dimulai dengan mengajukan pertanyaan yang tidak dapat dijawab oleh program kondisional sederhana atau mesin berbasis aturan (*rules-based engine*). Pertanyaan-pertanyaan ini sering berkisar seputar prediksi berdasarkan kumpulan data.
+2. **Mengumpulkan dan menyiapkan data**. Untuk dapat menjawab pertanyaanmu, Kamu memerlukan data. Bagaimana kualitas dan terkadang kuantitas data kamu akan menentukan seberapa baik kamu dapat menjawab pertanyaan awal kamu. Memvisualisasikan data merupakan aspek penting dari fase ini. Fase ini juga mencakup pemisahan data menjadi kelompok *training* dan *testing* untuk membangun model.
+3. **Memilih metode training**. Tergantung dari pertanyaan dan sifat datamu, Kamu perlu memilih bagaimana kamu ingin men-training sebuah model untuk mencerminkan data kamu dengan baik dan membuat prediksi yang akurat terhadapnya. Ini adalah bagian dari proses ML yang membutuhkan keahlian khusus dan seringkali perlu banyak eksperimen.
+4. **Melatih model**. Dengan menggunakan data *training*, kamu akan menggunakan berbagai algoritma untuk melatih model guna mengenali pola dalam data. Modelnya mungkin bisa memanfaatkan *internal weight* yang dapat disesuaikan untuk memberi hak istimewa pada bagian tertentu dari data dibandingkan bagian lainnya untuk membangun model yang lebih baik.
+5. **Mengevaluasi model**. Gunakan data yang belum pernah dilihat sebelumnya (data *testing*) untuk melihat bagaimana kinerja model.
+6. **Parameter tuning**. Berdasarkan kinerja modelmu, Kamu dapat mengulang prosesnya menggunakan parameter atau variabel yang berbeda, yang mengontrol perilaku algoritma yang digunakan untuk melatih model.
+7. **Prediksi**. Gunakan input baru untuk menguji keakuratan model kamu.
+
+## Pertanyaan apa yang harus ditanyakan?
+
+Komputer sangat ahli dalam menemukan pola tersembunyi dalam data. Hal ini sangat membantu peneliti yang memiliki pertanyaan tentang domain tertentu yang tidak dapat dijawab dengan mudah dari hanya membuat mesin berbasis aturan kondisional (*conditionally-based rules engine*). Untuk tugas aktuaria misalnya, seorang data scientist mungkin dapat membuat aturan secara manual seputar mortalitas perokok vs non-perokok.
+
+Namun, ketika banyak variabel lain dimasukkan ke dalam persamaan, model ML mungkin terbukti lebih efisien untuk memprediksi tingkat mortalitas di masa depan berdasarkan riwayat kesehatan masa lalu. Contoh yang lebih menyenangkan mungkin membuat prediksi cuaca untuk bulan April di lokasi tertentu berdasarkan data yang mencakup garis lintang, garis bujur, perubahan iklim, kedekatan dengan laut, pola aliran udara (Jet Stream), dan banyak lagi.
+
+✅ [Slide deck](https://www2.cisl.ucar.edu/sites/default/files/0900%20June%2024%20Haupt_0.pdf) ini menawarkan perspektif historis pada model cuaca dengan menggunakan ML dalam analisis cuaca.
+
+## Tugas Pra-Pembuatan
+
+Sebelum mulai membangun model kamu, ada beberapa tugas yang harus kamu selesaikan. Untuk menguji pertanyaan kamu dan membentuk hipotesis berdasarkan prediksi model, Kamu perlu mengidentifikasi dan mengonfigurasi beberapa elemen.
+
+### Data
+
+Untuk dapat menjawab pertanyaan kamu dengan kepastian, Kamu memerlukan sejumlah besar data dengan jenis yang tepat. Ada dua hal yang perlu kamu lakukan pada saat ini:
+
+- **Mengumpulkan data**. Ingat pelajaran sebelumnya tentang keadilan dalam analisis data, kumpulkan data kamu dengan hati-hati. Waspadai sumber datanya, bias bawaan apa pun yang mungkin dimiliki, dan dokumentasikan asalnya.
+- **Menyiapkan data**. Ada beberapa langkah dalam proses persiapan data. Kamu mungkin perlu menyusun data dan melakukan normalisasi jika berasal dari berbagai sumber. Kamu dapat meningkatkan kualitas dan kuantitas data melalui berbagai metode seperti mengonversi string menjadi angka (seperti yang kita lakukan di [Clustering](../../5-Clustering/1-Visualize/translations/README.id.md)). Kamu mungkin juga bisa membuat data baru berdasarkan data yang asli (seperti yang kita lakukan di [Classification](../../4-Classification/1-Introduction/translations/README.id.md)). Kamu bisa membersihkan dan mengubah data (seperti yang kita lakukan sebelum pelajaran [Web App](../3-Web-App/translations/README.id.md)). Terakhir, Kamu mungkin juga perlu mengacaknya dan mengubah urutannya, tergantung pada teknik *training* kamu.
+
+✅ Setelah mengumpulkan dan memproses data kamu, luangkan waktu sejenak untuk melihat apakah bentuknya memungkinkan kamu untuk menjawab pertanyaan yang kamu maksudkan. Mungkin data tidak akan berkinerja baik dalam tugas yang kamu berikan, seperti yang kita temukan dalam pelajaran [Clustering](../../5-Clustering/1-Visualize/translations/README.id.md).
+
+### Fitur dan Target
+
+Fitur adalah properti terukur dari datamu. Dalam banyak set data, fitur dinyatakan sebagai judul kolom seperti 'date', 'size', atau 'color'. Variabel fiturmu, biasanya direpresentasikan sebagai `X` dalam kode, mewakili variabel input yang akan digunakan untuk melatih model.
+
+Target adalah hal yang ingin kamu prediksi. Target, biasanya direpresentasikan sebagai `y` dalam kode, mewakili jawaban atas pertanyaan yang ingin kamu ajukan terhadap datamu: pada bulan Desember, labu dengan warna apa yang akan paling murah? Di San Francisco, lingkungan mana yang akan memiliki harga properti terbaik? Terkadang target juga disebut sebagai atribut label.
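+
+Sebagai ilustrasi kecil (dengan data hipotetis), memisahkan variabel fitur `X` dan target `y` dari sebuah DataFrame pandas bisa terlihat seperti ini:
+
+```python
+import pandas as pd
+
+# data contoh (hipotetis): memprediksi 'price' dari 'size' dan 'color'
+df = pd.DataFrame({
+    'size': [10.0, 12.5, 9.8],
+    'color': [0, 1, 0],
+    'price': [3.5, 4.0, 3.2]
+})
+
+X = df[['size', 'color']]  # variabel fitur (input)
+y = df['price']            # variabel target (yang ingin diprediksi)
+```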
+
+### Memilih variabel fiturmu
+
+🎓 **Feature Selection dan Feature Extraction** Bagaimana kamu tahu variabel mana yang harus dipilih saat membangun model? Kamu mungkin akan melalui proses pemilihan fitur (*Feature Selection*) atau ekstraksi fitur (*Feature Extraction*) untuk memilih variabel yang tepat untuk membuat model yang berkinerja paling baik. Namun, keduanya tidak sama: "Ekstraksi fitur membuat fitur baru dari fungsi fitur asli, sedangkan pemilihan fitur mengembalikan subset fitur." ([sumber](https://wikipedia.org/wiki/Feature_selection))
+### Visualisasikan datamu
+
+Aspek penting dari toolkit data scientist adalah kemampuan untuk memvisualisasikan data menggunakan beberapa *library* seperti Seaborn atau MatPlotLib. Merepresentasikan data kamu secara visual memungkinkan kamu mengungkap korelasi tersembunyi yang dapat kamu manfaatkan. Visualisasimu mungkin juga membantu kamu mengungkap data yang bias atau tidak seimbang (seperti yang kita temukan dalam [Classification](../../../4-Classification/2-Classifiers-1/translations/README.id.md)).
+### Membagi dataset
+
+Sebelum memulai *training*, Kamu perlu membagi dataset menjadi dua atau lebih bagian dengan ukuran yang tidak sama tapi masih mewakili data dengan baik.
+
+- **Training**. Bagian dataset ini digunakan untuk men-training model kamu. Bagian dataset ini merupakan mayoritas dari dataset asli.
+- **Testing**. Sebuah dataset tes adalah kelompok data independen, seringkali dikumpulkan dari data yang asli yang akan digunakan untuk mengkonfirmasi kinerja dari model yang dibuat.
+- **Validating**. Dataset validasi adalah kumpulan contoh mandiri yang lebih kecil yang kamu gunakan untuk menyetel hyperparameter atau arsitektur model untuk meningkatkan model. Tergantung dari ukuran data dan pertanyaan yang kamu ajukan, Kamu mungkin tidak perlu membuat dataset ketiga ini (seperti yang kita catat dalam [Time Series Forecasting](../../../7-TimeSeries/1-Introduction/translations/README.id.md)).
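+
+Sebagai ilustrasi, pembagian data *training* dan *testing* dengan Scikit-learn bisa disketsakan seperti ini (di sini memakai dataset diabetes bawaan Scikit-learn hanya sebagai contoh):
+
+```python
+from sklearn.datasets import load_diabetes
+from sklearn.model_selection import train_test_split
+
+X, y = load_diabetes(return_X_y=True)
+
+# 80% untuk training, 20% untuk testing
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+print(X_train.shape, X_test.shape)
+```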
+
+## Membuat sebuah model
+
+Dengan menggunakan data *training*, tujuan kamu adalah membuat model atau representasi statistik data kamu menggunakan berbagai algoritma untuk **melatihnya**. Melatih model berarti mengeksposnya dengan data dan mengizinkannya membuat asumsi tentang pola yang ditemukan, divalidasi, dan diterima atau ditolak.
+
+### Tentukan metode training
+
+Tergantung dari pertanyaan dan sifat datamu, Kamu akan memilih metode untuk melatihnya. Buka dokumentasi [Scikit-learn](https://scikit-learn.org/stable/user_guide.html) yang kita gunakan dalam pelajaran ini, kamu bisa menjelajahi banyak cara untuk melatih sebuah model. Tergantung dari pengalamanmu, kamu mungkin perlu mencoba beberapa metode yang berbeda untuk membuat model yang terbaik. Kemungkinan kamu akan melalui proses di mana data scientist mengevaluasi kinerja model dengan memasukkan data yang belum pernah dilihat, memeriksa akurasi, bias, dan masalah penurunan kualitas lainnya, dan memilih metode training yang paling tepat untuk tugas yang ada.
+
+### Melatih sebuah model
+
+Berbekal data *training* kamu, kamu siap untuk melakukan *fitting* data tersebut untuk membuat model. Kamu akan melihat bahwa di banyak *library* ML, kamu akan menemukan kode 'model.fit' - pada saat inilah kamu mengirim variabel fitur kamu sebagai array nilai (biasanya `X`) dan variabel target (biasanya `y`).
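+
+Sebagai sketsa sederhana, melanjutkan contoh pembagian data di atas dan dengan asumsi regresi linear dipilih sebagai metodenya:
+
+```python
+from sklearn.linear_model import LinearRegression
+
+model = LinearRegression()
+model.fit(X_train, y_train)  # X = array nilai fitur, y = target
+```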
+
+### Mengevaluasi model
+
+Setelah proses *training* selesai (ini mungkin membutuhkan banyak iterasi, atau 'epoch', untuk melatih model besar), Kamu akan dapat mengevaluasi kualitas model dengan menggunakan data tes untuk mengukur kinerjanya. Data ini merupakan subset dari data asli yang modelnya belum pernah dianalisis sebelumnya. Kamu dapat mencetak tabel metrik tentang kualitas model kamu.
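+
+Sebagai gambaran, untuk model regresi pada sketsa di atas Kamu bisa menghitung beberapa metrik pada data tes, misalnya MSE dan R²:
+
+```python
+from sklearn.metrics import mean_squared_error, r2_score
+
+y_pred = model.predict(X_test)
+print('MSE:', mean_squared_error(y_test, y_pred))
+print('R2 :', r2_score(y_test, y_pred))
+```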
+
+🎓 **Model fitting**
+
+Dalam konteks machine learning, *model fitting* mengacu pada keakuratan dari fungsi yang mendasari model saat mencoba menganalisis data yang tidak familiar.
+
+🎓 **Underfitting** dan **overfitting** adalah masalah umum yang menurunkan kualitas model, karena model tidak cukup akurat atau terlalu akurat. Hal ini menyebabkan model membuat prediksi yang terlalu selaras atau tidak cukup selaras dengan data trainingnya. Model overfit memprediksi data *training* terlalu baik karena telah mempelajari detail dan noise data dengan terlalu baik. Model underfit tidak akurat karena tidak dapat menganalisis data *training* atau data yang belum pernah dilihat sebelumnya secara akurat.
+
+![overfitting model](../images/overfitting.png)
+> Infografis oleh [Jen Looper](https://twitter.com/jenlooper)
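+
+Salah satu cara sederhana (bukan satu-satunya) untuk melihat gejala overfitting adalah membandingkan skor model pada data *training* dan data *testing*; melanjutkan sketsa sebelumnya:
+
+```python
+# selisih yang besar antara kedua skor ini bisa menjadi tanda overfitting
+print('Skor training:', model.score(X_train, y_train))
+print('Skor testing :', model.score(X_test, y_test))
+```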
+
+## Parameter tuning
+
+Setelah *training* awal selesai, amati kualitas model dan pertimbangkan untuk meningkatkannya dengan mengubah 'hyperparameter' nya. Baca lebih lanjut tentang prosesnya [di dalam dokumentasi](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?WT.mc_id=academic-15963-cxa).
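+
+Sebagai gambaran saja, melanjutkan sketsa sebelumnya, pencarian hyperparameter dengan `GridSearchCV` dari Scikit-learn bisa terlihat seperti ini (model Ridge dan nilai `alpha` di bawah hanyalah contoh hipotetis):
+
+```python
+from sklearn.linear_model import Ridge
+from sklearn.model_selection import GridSearchCV
+
+param_grid = {'alpha': [0.1, 1.0, 10.0]}  # hyperparameter yang ingin dicoba
+search = GridSearchCV(Ridge(), param_grid, cv=5)
+search.fit(X_train, y_train)
+print(search.best_params_)
+```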
+
+## Prediksi
+
+Ini adalah saat di mana Kamu dapat menggunakan data yang sama sekali baru untuk menguji akurasi model kamu. Dalam setelan ML 'terapan', di mana kamu membangun aset web untuk menggunakan modelnya dalam produksi, proses ini mungkin melibatkan pengumpulan input pengguna (misalnya menekan tombol) untuk menyetel variabel dan mengirimkannya ke model untuk inferensi, atau evaluasi.
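+
+Melanjutkan sketsa di atas, melakukan inferensi pada input yang benar-benar baru pada dasarnya hanyalah memanggil `predict`:
+
+```python
+import numpy as np
+
+# satu baris input baru dengan jumlah fitur yang sama seperti data training
+data_baru = np.array([X_test[0]])
+print(model.predict(data_baru))
+```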
+
+Dalam pelajaran ini, Kamu akan menemukan cara untuk menggunakan langkah-langkah ini untuk mempersiapkan, membangun, menguji, mengevaluasi, dan memprediksi - semua gestur data scientist dan banyak lagi, seiring kemajuanmu dalam perjalanan menjadi 'full stack' ML engineer.
+
+---
+
+## 🚀Tantangan
+
+Gambarlah sebuah flow chart yang mencerminkan langkah-langkah seorang praktisi ML. Di mana kamu melihat diri kamu saat ini dalam prosesnya? Di mana kamu memprediksi kamu akan menemukan kesulitan? Apa yang tampak mudah bagi kamu?
+
+## [Quiz Pra-Pelajaran](https://white-water-09ec41f0f.azurestaticapps.net/quiz/8/)
+
+## Ulasan & Belajar Mandiri
+
+Cari di Internet mengenai wawancara dengan data scientist yang mendiskusikan pekerjaan sehari-hari mereka. Ini [salah satunya](https://www.youtube.com/watch?v=Z3IjgbbCEfs).
+
+## Tugas
+
+[Wawancara dengan data scientist](assignment.id.md)
diff --git a/1-Introduction/4-techniques-of-ML/translations/README.it.md b/1-Introduction/4-techniques-of-ML/translations/README.it.md
new file mode 100644
index 0000000000..ecfb35e574
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/README.it.md
@@ -0,0 +1,114 @@
+# Tecniche di Machine Learning
+
+Il processo di creazione, utilizzo e mantenimento dei modelli di machine learning e dei dati che utilizzano è un processo molto diverso da molti altri flussi di lavoro di sviluppo. In questa lezione si demistifica il processo, e si delineano le principali tecniche che occorre conoscere. Si dovrà:
+
+- Comprendere i processi ad alto livello alla base di machine learning.
+- Esplorare concetti di base come "modelli", "previsioni" e "dati di addestramento".
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/7/)
+
+## Introduzione
+
+Ad alto livello, il mestiere di creare processi di apprendimento automatico (ML) comprende una serie di passaggi:
+
+1. **Decidere circa la domanda**. La maggior parte dei processi ML inizia ponendo una domanda alla quale non è possibile ottenere risposta da un semplice programma condizionale o da un motore basato su regole. Queste domande spesso ruotano attorno a previsioni basate su una raccolta di dati.
+2. **Raccogliere e preparare i dati**. Per poter rispondere alla domanda, servono dati. La qualità e, a volte, la quantità dei dati determineranno quanto bene sarà possibile rispondere alla domanda iniziale. La visualizzazione dei dati è un aspetto importante di questa fase. Questa fase include anche la suddivisione dei dati in un gruppo di addestramento (training) e test per costruire un modello.
+3. **Scegliere un metodo di addestramento**. A seconda della domanda e della natura dei dati, è necessario scegliere come si desidera addestrare un modello per riflettere al meglio i dati e fare previsioni accurate su di essi. Questa è la parte del processo di ML che richiede competenze specifiche e, spesso, una notevole quantità di sperimentazione.
+4. **Addestrare il modello**. Usando i dati di addestramento, si utilizzeranno vari algoritmi per addestrare un modello a riconoscere modelli nei dati. Il modello potrebbe sfruttare pesi interni che possono essere regolati per privilegiare alcune parti dei dati rispetto ad altre per costruire un modello migliore.
+5. **Valutare il modello**. Si utilizzano dati mai visti prima (i dati di test) da quelli raccolti per osservare le prestazioni del modello.
+6. **Regolazione dei parametri**. In base alle prestazioni del modello, si può ripetere il processo utilizzando parametri differenti, o variabili, che controllano il comportamento degli algoritmi utilizzati per addestrare il modello.
+7. **Prevedere**. Usare nuovi input per testare la precisione del modello.
+
+## Che domanda fare
+
+I computer sono particolarmente abili nello scoprire modelli nascosti nei dati. Questa caratteristica è molto utile per i ricercatori che hanno domande su un determinato campo a cui non è possibile rispondere facilmente creando un motore di regole basato su condizioni. Dato un compito attuariale, ad esempio, un data scientist potrebbe essere in grado di costruire manualmente regole sulla mortalità dei fumatori rispetto ai non fumatori.
+
+Quando molte altre variabili vengono introdotte nell'equazione, tuttavia, un modello ML potrebbe rivelarsi più efficiente per prevedere i tassi di mortalità futuri in base alla storia sanitaria passata. Un esempio più allegro potrebbe essere fare previsioni meteorologiche per il mese di aprile in una determinata località sulla base di dati che includono latitudine, longitudine, cambiamento climatico, vicinanza all'oceano, modelli della corrente a getto e altro ancora.
+
+✅ Questa [presentazione](https://www2.cisl.ucar.edu/sites/default/files/0900%20June%2024%20Haupt_0.pdf) sui modelli meteorologici offre una prospettiva storica per l'utilizzo di ML nell'analisi meteorologica.
+
+## Attività di pre-costruzione
+
+Prima di iniziare a costruire il proprio modello, ci sono diverse attività da completare. Per testare la domanda e formare un'ipotesi basata sulle previsioni di un modello, occorre identificare e configurare diversi elementi.
+
+### Dati
+
+Per poter rispondere con sicurezza alla domanda, serve una buona quantità di dati del tipo giusto. Ci sono due cose da fare a questo punto:
+
+- **Raccogliere dati**. Tenendo presente la lezione precedente sull'equità nell'analisi dei dati, si raccolgano i dati con cura. Ci sia consapevolezza delle fonti di questi dati, di eventuali pregiudizi intrinseci che potrebbero avere e si documenti la loro origine.
+- **Preparare i dati**. Ci sono diversi passaggi nel processo di preparazione dei dati. Potrebbe essere necessario raccogliere i dati e normalizzarli se provengono da fonti diverse. Si può migliorare la qualità e la quantità dei dati attraverso vari metodi come la conversione di stringhe in numeri (come si fa in [Clustering](../../../5-Clustering/1-Visualize/translations/README.it.md)). Si potrebbero anche generare nuovi dati, basati sull'originale (come si fa in [Classificazione](../../../4-Classification/1-Introduction/translations/README.it.md)). Si possono pulire e modificare i dati (come verrà fatto prima della lezione sull'[app Web](../../../3-Web-App/translations/README.it.md)). Infine, si potrebbe anche aver bisogno di renderli casuali e mescolarli, a seconda delle proprie tecniche di addestramento.
+
+✅ Dopo aver raccolto ed elaborato i propri dati, si prenda un momento per vedere se la loro forma consentirà di rispondere alla domanda prevista. Potrebbe essere che i dati non funzionino bene nello svolgere il compito assegnato, come si scopre nelle lezioni di [Clustering](../../../5-Clustering/1-Visualize/translations/README.it.md)!
+
+### Caratteristiche e destinazione
+
+Una caratteristica (feature) è una proprietà misurabile dei dati. In molti set di dati è espressa come intestazione di colonna, ad esempio 'date', 'size' o 'color'. La variabile caratteristica, solitamente rappresentata come `X` nel codice, rappresenta la variabile di input che verrà utilizzata per addestrare il modello.
+
+L'obiettivo (target) è la cosa che si sta cercando di prevedere. L'obiettivo, solitamente rappresentato come `y` nel codice, rappresenta la risposta alla domanda che si sta cercando di porre ai propri dati: a dicembre, di che colore saranno le zucche più economiche? A San Francisco, quali quartieri avranno il miglior prezzo immobiliare? A volte l'obiettivo viene anche definito attributo label.
+
+### Selezione della variabile caratteristica
+
+🎓 **Selezione ed estrazione della caratteristica** Come si fa a sapere quale variabile scegliere quando si costruisce un modello? Probabilmente si dovrà passare attraverso un processo di selezione o estrazione delle caratteristiche per scegliere le variabili giuste per il modello più efficace. Tuttavia, non è la stessa cosa: "L'estrazione delle caratteristiche crea nuove caratteristiche dalle funzioni delle caratteristiche originali, mentre la selezione delle caratteristiche restituisce un sottoinsieme delle caratteristiche". ([fonte](https://it.wikipedia.org/wiki/Selezione_delle_caratteristiche))
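+
+A puro titolo illustrativo (non fa parte della lezione), un piccolo schizzo di selezione delle caratteristiche con `SelectKBest` di Scikit-learn, usando come esempio l'insieme di dati sul diabete incluso nella libreria, potrebbe apparire così:
+
+```python
+from sklearn.datasets import load_diabetes
+from sklearn.feature_selection import SelectKBest, f_regression
+
+X, y = load_diabetes(return_X_y=True)
+
+# seleziona le 3 caratteristiche più correlate con l'obiettivo
+selettore = SelectKBest(score_func=f_regression, k=3)
+X_ridotto = selettore.fit_transform(X, y)
+print(X.shape, '->', X_ridotto.shape)
+```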
+
+### Visualizzare i dati
+
+Un aspetto importante del bagaglio del data scientist è la capacità di visualizzare i dati utilizzando diverse eccellenti librerie come Seaborn o MatPlotLib. Rappresentare visivamente i propri dati potrebbe consentire di scoprire correlazioni nascoste che si possono sfruttare. Le visualizzazioni potrebbero anche aiutare a scoprire pregiudizi o dati sbilanciati (come si scopre in [Classificazione](../../../4-Classification/2-Classifiers-1/translations/README.it.md)).
+
+### Dividere l'insieme di dati
+
+Prima dell'addestramento, è necessario dividere l'insieme di dati in due o più parti di dimensioni diverse che rappresentano comunque bene i dati.
+
+- **Addestramento**. Questa parte dell'insieme di dati è adatta al proprio modello per addestrarlo. Questo insieme costituisce la maggior parte dell'insieme di dati originale.
+- **Test**. Un insieme di dati di test è un gruppo indipendente di dati, spesso raccolti dai dati originali, che si utilizzano per confermare le prestazioni del modello creato.
+- **Convalida**. Un insieme di convalida è un gruppo indipendente più piccolo di esempi da usare per ottimizzare gli iperparametri, o architettura, del modello per migliorarlo. A seconda delle dimensioni dei propri dati e della domanda che si sta ponendo, si potrebbe non aver bisogno di creare questo terzo insieme (come si nota in [Previsione delle Serie Temporali](../../../7-TimeSeries/1-Introduction/translations/README.it.md)).
+
+## Costruire un modello
+
+Utilizzando i dati di addestramento, l'obiettivo è costruire un modello o una rappresentazione statistica dei propri dati, utilizzando vari algoritmi per **addestrarlo** . L'addestramento di un modello lo espone ai dati e consente di formulare ipotesi sui modelli percepiti che scopre, convalida e accetta o rifiuta.
+
+### Decidere un metodo di addestramento
+
+A seconda della domanda e della natura dei dati, si sceglierà un metodo per addestrarlo. Passando attraverso [la documentazione di Scikit-learn](https://scikit-learn.org/stable/user_guide.html), che si usa in questo corso, si possono esplorare molti modi per addestrare un modello. A seconda della propria esperienza, si potrebbe dover provare diversi metodi per creare il modello migliore. È probabile che si attraversi un processo in cui i data scientist valutano le prestazioni di un modello fornendogli dati non visti, verificandone l'accuratezza, i pregiudizi e altri problemi che degradano la qualità e selezionando il metodo di addestramento più appropriato per l'attività da svolgere.
+
+### Allenare un modello
+
+Armati dei propri dati di addestramento, si è pronti ad "adattarli" (fit) per creare un modello. Si noterà che in molte librerie ML è presente il codice "model.fit": è in questo momento che si invia la variabile delle caratteristiche come matrice di valori (in genere `X`) e la variabile di destinazione (di solito `y`).
+
+### Valutare il modello
+
+Una volta completato il processo di addestramento (potrebbero essere necessarie molte iterazioni, o "epoche", per addestrare un modello di grandi dimensioni), si sarà in grado di valutare la qualità del modello utilizzando i dati di test per valutarne le prestazioni. Questi dati sono un sottoinsieme dei dati originali che il modello non ha analizzato in precedenza. Si può stampare una tabella di metriche sulla qualità del proprio modello.
+
+🎓 **Adattamento del modello**
+
+Nel contesto di machine learning, l'adattamento del modello si riferisce all'accuratezza della funzione sottostante del modello mentre tenta di analizzare dati con cui non ha familiarità.
+
+🎓 **Inadeguatezza** o **sovraadattamento** sono problemi comuni che degradano la qualità del modello, poiché il modello non si adatta abbastanza bene o troppo bene. Ciò fa sì che il modello esegua previsioni troppo allineate o troppo poco allineate con i suoi dati di addestramento. Un modello overfit (sovraaddestrato) prevede troppo bene i dati di addestramento perché ha appreso troppo bene i dettagli e il rumore dei dati. Un modello underfit (inadeguato) non è accurato in quanto non può né analizzare accuratamente i suoi dati di allenamento né i dati che non ha ancora "visto".
+
+![modello sovraaddestrato](../images/overfitting.png)
+> Infografica di [Jen Looper](https://twitter.com/jenlooper)
+
+## Sintonia dei parametri
+
+Una volta completato l'addestramento iniziale, si osservi la qualità del modello e si valuti di migliorarlo modificando i suoi "iperparametri". Maggiori informazioni sul processo [nella documentazione](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?WT.mc_id=academic-15963-cxa).
+
+## Previsione
+
+Questo è il momento in cui si possono utilizzare dati completamente nuovi per testare l'accuratezza del proprio modello. In un'impostazione ML "applicata", in cui si creano risorse Web per utilizzare il modello in produzione, questo processo potrebbe comportare la raccolta dell'input dell'utente (ad esempio, la pressione di un pulsante) per impostare una variabile e inviarla al modello per l'inferenza, oppure valutazione.
+
+In queste lezioni si scoprirà come utilizzare questi passaggi per preparare, costruire, testare, valutare e prevedere - tutti gesti di un data scientist e altro ancora, mentre si avanza nel proprio viaggio per diventare un ingegnere ML "full stack".
+
+---
+
+## 🚀 Sfida
+
+Disegnare un diagramma di flusso che rifletta i passaggi di un professionista di ML. Dove ci si vede in questo momento nel processo? Dove si prevede che sorgeranno difficoltà? Cosa sembra facile?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/8/)
+
+## Revisione e Auto Apprendimento
+
+Cercare online le interviste con i data scientist che discutono del loro lavoro quotidiano. Eccone [una](https://www.youtube.com/watch?v=Z3IjgbbCEfs).
+
+## Compito
+
+[Intervista a un data scientist](assignment.it.md)
diff --git a/1-Introduction/4-techniques-of-ML/translations/README.ja.md b/1-Introduction/4-techniques-of-ML/translations/README.ja.md
new file mode 100644
index 0000000000..13dcdb3127
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/README.ja.md
@@ -0,0 +1,114 @@
+# 機械学習の手法
+
+機械学習モデルやそのモデルが使用するデータを構築・使用・管理するプロセスは、他の多くの開発ワークフローとは全く異なるものです。このレッスンでは、このプロセスを明快にして、知っておくべき主な手法の概要をまとめます。あなたは、
+
+- 機械学習を支えるプロセスを高い水準で理解します。
+- 「モデル」「予測」「訓練データ」などの基本的な概念を調べます。
+
+## [講義前の小テスト](https://white-water-09ec41f0f.azurestaticapps.net/quiz/7?loc=ja)
+
+## 導入
+
+大まかに言うと、機械学習 (Machine Learning: ML) プロセスを作成する技術はいくつかのステップで構成されています。
+
+1. **質問を決める**。ほとんどの機械学習プロセスは、単純な条件のプログラムやルールベースのエンジンでは答えられないような質問をすることから始まります。このような質問は、データの集合を使った予測を中心にされることが多いです。
+2. **データを集めて準備する**。質問に答えるためにはデータが必要です。データの質と、ときには量が、最初の質問にどれだけうまく答えられるかを決めます。データの可視化がこのフェーズの重要な側面です。モデルを構築するためにデータを訓練グループとテストグループに分けることもこのフェーズに含みます。
+3. **学習方法を選ぶ**。質問の内容やデータの性質に応じて、データを最も良く反映して正確に予測できるモデルを、どのように学習するかを選ぶ必要があります。これは機械学習プロセスの中でも、特定の専門知識と、多くの場合はかなりの試行回数が必要になる部分です。
+4. **モデルを学習する**。データのパターンを認識するモデルを学習するために、訓練データと様々なアルゴリズムを使います。モデルはより良いモデルを構築するために、データの特定の部分を優先するように調整できる内部の重みを活用するかもしれません。
+5. **モデルを評価する**。モデルがどのように動作しているかを確認するために、集めたデータの中からまだ見たことのないもの(テストデータ)を使います。
+6. **パラメータチューニング**。モデルの性能によっては、モデルを学習するために使われる、各アルゴリズムの挙動を制御するパラメータや変数を変更してプロセスをやり直すこともできます。
+7. **予測する**。モデルの精度をテストするために新しい入力を使います。
+
+## どのような質問をすれば良いか
+
+コンピュータはデータの中に隠れているパターンを見つけることがとても得意です。この有用性は、条件ベースのルールエンジンを作っても簡単には答えられないような、特定の領域に関する質問を持っている研究者にとって非常に役立ちます。たとえば、ある保険数理の問題があったとして、データサイエンティストは喫煙者と非喫煙者の死亡率に関する法則を自分の手だけでも作れるかもしれません。
+
+しかし、他にも多くの変数が方程式に含まれる場合、過去の健康状態から将来の死亡率を予測する機械学習モデルの方が効率的かもしれません。もっと明るいテーマの例としては、緯度、経度、気候変動、海への近さ、ジェット気流のパターンなどのデータに基づいて、特定の場所における4月の天気を予測することができます。
+
+✅ 気象モデルに関するこの [スライド](https://www2.cisl.ucar.edu/sites/default/files/0900%20June%2024%20Haupt_0.pdf) は、気象解析に機械学習を使う際の歴史的な考え方を示しています。
+
+## 構築前のタスク
+
+モデルの構築を始める前に、いくつかのタスクを完了させる必要があります。質問をテストしたりモデルの予測に基づいた仮説を立てたりするためには、いくつかの要素を特定して設定する必要があります。
+
+### データ
+
+質問に確実に答えるためには、適切な種類のデータが大量に必要になります。ここではやるべきことが2つあります。
+
+- **データを集める**。データ解析における公平性に関する前回の講義を思い出しながら、慎重にデータを集めてください。特定のバイアスを持っているかもしれないデータのソースに注意し、それを記録しておいてください。
+- **データを準備する**。データを準備するプロセスにはいくつかのステップがあります。異なるソースからデータを集めた場合、照合と正規化が必要になるかもしれません。([クラスタリング](../../../5-Clustering/1-Visualize/README.md) で行っているように、)文字列を数値に変換するなどの様々な方法でデータの質と量を向上させることができます。([分類](../../../4-Classification/1-Introduction/README.md) で行っているように、)元のデータから新しいデータを生成することもできます。([Webアプリ](../../../3-Web-App/README.md) の講義の前に行うように、)データをクリーニングしたり編集したりすることができます。最後に、学習の手法によっては、ランダムにしたりシャッフルしたりする必要もあるかもしれません。
+
+✅ データを集めて処理した後は、その形で意図した質問に対応できるかどうかを確認してみましょう。[クラスタリング](../../../5-Clustering/1-Visualize/README.md) の講義でわかるように、データは与えられたタスクに対して上手く機能しないかもしれません!
+
+### 特徴量とターゲット
+
+特徴量(フィーチャ)は、データの測定可能なプロパティです。多くのデータセットでは、'日付'、'サイズ'、'色' のような列見出しとして表現されます。通常、コードでは `X` として表される特徴量の変数は、モデルの学習に使用される入力変数を表します。
+
+ターゲットは、予測しようとしているものです。ターゲットは通常、コードで`y`として表され、あなたのデータを尋ねようとしている質問に対する答えを表します:12月に、どの色のカボチャが最も安くなりますか?サンフランシスコでは、どの地域が最高の不動産価格を持つでしょうか?ターゲットはラベル属性とも呼ばれることもあります。
+
+### 特徴量の選択
+
+🎓 **特徴選択と特徴抽出** モデルを構築する際にどの変数を選ぶべきかは、どうすればわかるでしょうか?最も性能の高いモデルのためには、適した変数を選択する特徴選択や特徴抽出のプロセスをたどることになるでしょう。しかし、これらは同じものではありません。「特徴抽出は元の特徴の機能から新しい特徴を作成するのに対し、特徴選択は特徴の一部を返すものです。」 ([出典](https://wikipedia.org/wiki/Feature_selection))
+
+### データを可視化する
+
+データサイエンティストの道具に関する重要な側面は、Seaborn や MatPlotLib などの優れたライブラリを使ってデータを可視化する力です。データを視覚的に表現することで、隠れた相関関係を見つけて活用できるかもしれません。また、([分類](../../../4-Classification/2-Classifiers-1/README.md) でわかるように、)視覚化することで、バイアスやバランシングされていないデータを見つけられるかもしれません。
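+
+あくまで一例として(このレッスンの内容そのものではありません)、Seaborn で簡単な散布図を描くスケッチは次のようになります。ここでは例として scikit-learn 付属の糖尿病データセットを使っています。
+
+```python
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.datasets import load_diabetes
+
+# データを DataFrame として読み込む
+data = load_diabetes(as_frame=True).frame
+
+# BMI と疾患進行度 (target) の関係を可視化する
+sns.scatterplot(data=data, x='bmi', y='target')
+plt.show()
+```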
+
+### データセットを分割する
+
+学習の前にデータセットを2つ以上に分割して、それぞれがデータを表すのに十分かつ不均等な大きさにする必要があります。
+
+- **学習**。データセットのこの部分は、モデルを学習するために適合させます。これは元のデータセットの大部分を占めます。
+- **テスト**。テストデータセットとは、構築したモデルの性能を確認するために使用する独立したデータグループのことで、多くの場合は元のデータから集められます。
+- **検証**。検証セットとは、さらに小さくて独立したサンプルの集合のことで、モデルを改善するためにハイパーパラメータや構造を調整する際に使用されます。([時系列予測](../../../7-TimeSeries/1-Introduction/README.md) に記載しているように、)データの大きさや質問の内容によっては、この3つ目のセットを作る必要はありません。
+
+## モデルの構築
+
+訓練データと様々なアルゴリズムを使った **学習** によって、モデルもしくはデータの統計的な表現を構築することが目標です。モデルを学習することで、データを扱えるようになったり、発見、検証、肯定または否定したパターンに関する仮説を立てることができたりします。
+
+### 学習方法を決める
+
+質問の内容やデータの性質に応じて、モデルを学習する方法を選択します。このコースで使用する [Scikit-learn のドキュメント](https://scikit-learn.org/stable/user_guide.html) を見ると、モデルを学習する様々な方法を調べられます。経験次第では、最適なモデルを構築するためにいくつかの異なる方法を試す必要があるかもしれません。また、モデルが見たことのないデータを与えたり、質を下げている問題、精度、バイアスについて調べたり、タスクに対して最適な学習方法を選んだりすることで、データサイエンティストが行っている、モデルの性能を評価するプロセスを踏むことになるでしょう。
+
+### モデルを学習する
+
+訓練データが揃ったら、それを「フィット (fit)」させてモデルを作成する準備が整いました。多くの ML ライブラリには 'model.fit' というコードがあります。この時点で、特徴量の変数を値の配列 (通常は `X`) として、ターゲット変数 (通常は `y`) とともに渡します。
+
+### モデルを評価する
+
+(大きなモデルを学習するには多くの反復(エポック)が必要になりますが、)学習プロセスが完了したら、テストデータを使ってモデルの質を評価することができます。このデータは元のデータのうち、モデルがそれまでに分析していないものです。モデルの質を表す指標の表を出力することができます。
+
+🎓 **モデルフィッティング**
+
+機械学習の文脈におけるモデルフィッティングとは、モデルがまだ知らないデータを分析しようとする際の、モデルの基礎となる関数の正確さを指します。
+
+🎓 **未学習** と **過学習** はモデルの質を下げる一般的な問題で、モデルが十分に適合していないか、または適合しすぎています。これによってモデルは訓練データに近すぎたり遠すぎたりする予測を行います。過学習モデルは、データの詳細やノイズもよく学習しているため、訓練データを上手く予測しすぎてしまいます。未学習モデルは、訓練データやまだ「見たことのない」データを正確に分析することができないため、精度が高くないです。
+
+![過学習モデル](../images/overfitting.png)
+> [Jen Looper](https://twitter.com/jenlooper) さんによる解説画像
+
+## パラメータチューニング
+
+最初のトレーニングが完了したら、モデルの質を観察して、「ハイパーパラメータ」の調整によるモデルの改善を検討しましょう。このプロセスについては [ドキュメント](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?WT.mc_id=academic-15963-cxa) を読んでください。
+
+## 予測
+
+全く新しいデータを使ってモデルの精度をテストする瞬間です。本番環境でモデルを使用するためにWebアセットを構築するよう「適用された」機械学習の設定においては、推論や評価のためにモデルに渡したり、変数を設定したりするためにユーザの入力(ボタンの押下など)を収集することがこのプロセスに含まれるかもしれません。
+
+この講義では、「フルスタック」の機械学習エンジニアになるための旅をしながら、準備・構築・テスト・評価・予測などのデータサイエンティストが行うすべてのステップの使い方を学びます。
+
+---
+
+## 🚀チャレンジ
+
+機械学習の学習者のステップを反映したフローチャートを描いてください。今の自分はこのプロセスのどこにいると思いますか?どこに困難があると予想しますか?あなたにとって簡単そうなことは何ですか?
+
+## [講義後の小テスト](https://white-water-09ec41f0f.azurestaticapps.net/quiz/8?loc=ja)
+
+## 振り返りと自主学習
+
+データサイエンティストが日々の仕事について話しているインタビューをネットで検索してみましょう。ひとつは [これ](https://www.youtube.com/watch?v=Z3IjgbbCEfs) です。
+
+## 課題
+
+[データサイエンティストにインタビューする](assignment.ja.md)
diff --git a/1-Introduction/4-techniques-of-ML/translations/README.zh-cn.md b/1-Introduction/4-techniques-of-ML/translations/README.zh-cn.md
new file mode 100644
index 0000000000..2590b99a11
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/README.zh-cn.md
@@ -0,0 +1,112 @@
+
+# 机器学习技术
+
+构建、使用和维护机器学习模型及其使用的数据的过程与许多其他开发工作流程截然不同。 在本课中,我们将揭开该过程的神秘面纱,并概述你需要了解的主要技术。 你会:
+
+- 在高层次上理解支持机器学习的过程。
+- 探索基本概念,例如“模型”、“预测”和“训练数据”。
+
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/7/)
+## 介绍
+
+在较高的层次上,创建机器学习(ML)过程的工艺包括许多步骤:
+
+1. **决定问题**。 大多数机器学习过程都是从提出一个简单的条件程序或基于规则的引擎无法回答的问题开始的。 这些问题通常围绕基于数据集合的预测展开。
+2. **收集和准备数据**。为了能够回答你的问题,你需要数据。数据的质量(有时是数量)将决定你回答最初问题的能力。可视化数据是这个阶段的一个重要方面。此阶段还包括将数据拆分为训练和测试组以构建模型。
+3. **选择一种训练方法**。根据你的问题和数据的性质,你需要选择如何训练模型以最好地反映你的数据并对其进行准确预测。这是你的ML过程的一部分,需要特定的专业知识,并且通常需要大量的实验。
+4. **训练模型**。使用你的训练数据,你将使用各种算法来训练模型以识别数据中的模式。该模型可能会利用可以调整的内部权重来使数据的某些部分优于其他部分,从而构建更好的模型。
+5. **评估模型**。你使用收集到的集合中从未见过的数据(你的测试数据)来查看模型的性能。
+6. **参数调整**。根据模型的性能,你可以使用不同的参数或变量重做该过程,这些参数或变量控制用于训练模型的算法的行为。
+7. **预测**。使用新输入来测试模型的准确性。
+
+## 要问什么问题
+
+计算机特别擅长发现数据中的隐藏模式。此实用程序对于对给定领域有疑问的研究人员非常有帮助,这些问题无法通过创建基于条件的规则引擎来轻松回答。例如,给定一项精算任务,数据科学家可能能够围绕吸烟者与非吸烟者的死亡率构建手工规则。
+
+然而,当将许多其他变量纳入等式时,ML模型可能会更有效地根据过去的健康史预测未来的死亡率。一个更令人愉快的例子可能是根据包括纬度、经度、气候变化、与海洋的接近程度、急流模式等在内的数据对给定位置的4月份进行天气预报。
+
+✅ 这个关于天气模型的[幻灯片](https://www2.cisl.ucar.edu/sites/default/files/0900%20June%2024%20Haupt_0.pdf)为在天气分析中使用机器学习提供了一个历史视角。
+
+## 预构建任务
+
+在开始构建模型之前,你需要完成多项任务。要测试你的问题并根据模型的预测形成假设,你需要识别和配置多个元素。
+
+### 数据
+
+为了能够确定地回答你的问题,你需要大量正确类型的数据。 此时你需要做两件事:
+
+- **收集数据**。记住之前关于数据分析公平性的课程,小心收集数据。请注意此数据的来源、它可能具有的任何固有偏见,并记录其来源。
+- **准备数据**。数据准备过程有几个步骤。如果数据来自不同的来源,你可能需要整理数据并对其进行标准化。你可以通过各种方法提高数据的质量和数量,例如将字符串转换为数字(就像我们在[聚类](../../../5-Clustering/1-Visualize/README.md)中所做的那样)。你还可以根据原始数据生成新数据(正如我们在[分类](../../../4-Classification/1-Introduction/README.md)中所做的那样)。你可以清理和编辑数据(就像我们在 [Web App](../../../3-Web-App/README.md)课程之前所做的那样)。最后,你可能还需要对其进行随机化和打乱,具体取决于你的训练技术。
+
+✅ 在收集和处理你的数据后,花点时间看看它的形状是否能让你解决你的预期问题。正如我们在[聚类](../../../5-Clustering/1-Visualize/README.md)课程中发现的那样,数据可能在你的给定任务中表现不佳!
+
+### 特征和目标
+
+特征是数据的可测量属性。在许多数据集中,它表示为"日期"、"大小"或"颜色"这样的列标题。你的特征变量(在代码中通常表示为 `X`)代表用于训练模型的输入变量。
+
+目标就是你试图预测的事情。目标通常表示为代码中的 `y`,代表您试图询问数据的问题的答案:在 12 月,什么颜色的南瓜最便宜?在旧金山,哪些街区的房地产价格最好?有时目标也称为标签属性。
+
+### 选择特征变量
+
+🎓 **特征选择和特征提取** 构建模型时如何知道选择哪个变量?你可能会经历一个特征选择或特征提取的过程,以便为性能最好的模型选择正确的变量。然而,它们不是一回事:“特征提取是从基于原始特征的函数中创建新特征,而特征选择返回特征的一个子集。”([来源](https://wikipedia.org/wiki/Feature_selection))
+### 可视化数据
+
+数据科学家工具包的一个重要方面是能够使用多个优秀的库(例如 Seaborn 或 MatPlotLib)将数据可视化。直观地表示你的数据可能会让你发现可以利用的隐藏关联。 你的可视化还可以帮助你发现偏见或不平衡的数据(正如我们在 [分类](../../../4-Classification/2-Classifiers-1/README.md)中发现的那样)。
+### 拆分数据集
+
+在训练之前,你需要将数据集拆分为两个或多个大小不等但仍能很好地代表数据的部分。
+
+- **训练**。这部分数据集适合你的模型进行训练。这个集合构成了原始数据集的大部分。
+- **测试**。测试数据集是一组独立的数据,通常从原始数据中收集,用于确认构建模型的性能。
+- **验证**。验证集是一个较小的独立示例组,用于调整模型的超参数或架构,以改进模型。根据你的数据大小和你提出的问题,你可能不需要构建第三组(正如我们在[时间序列预测](../../../7-TimeSeries/1-Introduction/README.md)中所述)。
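+
+仅作示意(并非本课的正式内容),用 Scikit-learn 把数据拆分成训练集、验证集和测试集的一种写法大致如下,这里以 Scikit-learn 自带的糖尿病数据集为例:
+
+```python
+from sklearn.datasets import load_diabetes
+from sklearn.model_selection import train_test_split
+
+X, y = load_diabetes(return_X_y=True)
+
+# 先划分出 20% 作为测试集
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# 再从剩余数据中划分出 25% 作为验证集(相当于原始数据的 20%)
+X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
+
+print(len(X_train), len(X_val), len(X_test))
+```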
+
+## 建立模型
+
+使用你的训练数据,你的目标是构建模型或数据的统计表示,并使用各种算法对其进行**训练**。训练模型将其暴露给数据,并允许它对其发现、验证和接受或拒绝的感知模式做出假设。
+
+### 决定一种训练方法
+
+根据你的问题和数据的性质,你将选择一种方法来训练它。逐步完成 [Scikit-learn的文档](https://scikit-learn.org/stable/user_guide.html) - 我们在本课程中使用 - 你可以探索多种训练模型的方法。 根据你的经验,你可能需要尝试多种不同的方法来构建最佳模型。你可能会经历一个过程,在该过程中,数据科学家通过提供未见过的数据来评估模型的性能,检查准确性、偏差和其他降低质量的问题,并为手头的任务选择最合适的训练方法。
+
+### 训练模型
+
+有了训练数据,你就可以"拟合"(fit)它来创建模型。你会注意到,在许多 ML 库中都能找到"model.fit"这样的代码,此时你将特征变量作为值的数组(通常是 `X`)连同目标变量(通常是 `y`)一起传入。
+
+### 评估模型
+
+训练过程完成后(训练大型模型可能需要多次迭代或“时期”),你将能够通过使用测试数据来衡量模型的性能来评估模型的质量。此数据是模型先前未分析的原始数据的子集。 你可以打印出有关模型质量的指标表。
+
+🎓 **模型拟合**
+
+在机器学习的背景下,模型拟合是指模型在尝试分析不熟悉的数据时其底层功能的准确性。
+
+🎓 **欠拟合**和**过拟合**是降低模型质量的常见问题,因为模型拟合得不够好或太好。这会导致模型做出与其训练数据过于紧密对齐或过于松散对齐的预测。 过拟合模型对训练数据的预测太好,因为它已经很好地了解了数据的细节和噪声。欠拟合模型并不准确,因为它既不能准确分析其训练数据,也不能准确分析尚未“看到”的数据。
+
+![过拟合模型 ](../images/overfitting.png)
+> 作者[Jen Looper](https://twitter.com/jenlooper)
+
+## 参数调优
+
+初始训练完成后,观察模型的质量并考虑通过调整其“超参数”来改进它。[在此文档中](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?WT.mc_id=academic-15963-cxa)阅读有关该过程的更多信息。
+
+## 预测
+
+这是你可以使用全新数据来测试模型准确性的时刻。在“应用”ML设置中,你正在构建Web资源以在生产中使用模型,此过程可能涉及收集用户输入(例如按下按钮)以设置变量并将其发送到模型进行推理,或者评估。
+
+在这些课程中,你将了解如何使用这些步骤来准备、构建、测试、评估和预测—所有这些都是数据科学家的姿态,而且随着你在成为一名“全栈”ML工程师的旅程中取得进展,你将了解更多。
+
+---
+
+## 🚀挑战
+
+画一个流程图,反映ML的步骤。在这个过程中,你认为自己现在在哪里?你预测你在哪里会遇到困难?什么对你来说很容易?
+
+## [阅读后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/8/)
+
+## 复习与自学
+
+在线搜索对讨论日常工作的数据科学家的采访。 这是[其中之一](https://www.youtube.com/watch?v=Z3IjgbbCEfs)。
+
+## 任务
+
+[采访一名数据科学家](assignment.zh-cn.md)
diff --git a/1-Introduction/4-techniques-of-ML/translations/assignment.id.md b/1-Introduction/4-techniques-of-ML/translations/assignment.id.md
new file mode 100644
index 0000000000..9f7b23be77
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/assignment.id.md
@@ -0,0 +1,11 @@
+# Wawancara seorang data scientist
+
+## Instruksi
+
+Di perusahaan Kamu, dalam user group, atau di antara teman atau sesama siswa, berbicaralah dengan seseorang yang bekerja secara profesional sebagai data scientist. Tulis makalah singkat (500 kata) tentang pekerjaan sehari-hari mereka. Apakah mereka spesialis, atau apakah mereka bekerja 'full stack'?
+
+## Rubrik
+
+| Kriteria | Sangat Bagus | Cukup | Perlu Peningkatan |
+| -------- | ------------------------------------------------------------------------------------ | ------------------------------------------------------------------ | --------------------- |
+| | Sebuah esai dengan panjang yang sesuai, dengan sumber yang dikaitkan, disajikan sebagai file .doc | Esai dikaitkan dengan buruk atau lebih pendek dari panjang yang dibutuhkan | Tidak ada esai yang disajikan |
diff --git a/1-Introduction/4-techniques-of-ML/translations/assignment.it.md b/1-Introduction/4-techniques-of-ML/translations/assignment.it.md
new file mode 100644
index 0000000000..41c597f40e
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Intervista a un data scientist
+
+## Istruzioni
+
+Nella propria azienda, in un gruppo di utenti, o tra amici o compagni di studio, si parli con qualcuno che lavora professionalmente come data scientist. Si scriva un breve documento (500 parole) sulle loro occupazioni quotidiane. Sono specialisti o lavorano "full stack"?
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ------------------------------------------------------------------------------------ | ------------------------------------------------------------------ | --------------------- |
+| | Un saggio della lunghezza corretta, con fonti attribuite, è presentato come file .doc | Il saggio è attribuito male o più corto della lunghezza richiesta | Non viene presentato alcun saggio |
diff --git a/1-Introduction/4-techniques-of-ML/translations/assignment.ja.md b/1-Introduction/4-techniques-of-ML/translations/assignment.ja.md
new file mode 100644
index 0000000000..b3690e770f
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/assignment.ja.md
@@ -0,0 +1,11 @@
+# データサイエンティストにインタビューする
+
+## 指示
+
+会社・ユーザグループ・友人・学生仲間の中で、データサイエンティストとして専門的に働いている人に話を聞いてみましょう。その人の日々の仕事について短いレポート(500語)を書いてください。その人は専門家でしょうか?それとも「フルスタック」として働いているでしょうか?
+
+## 評価基準
+
+| 基準 | 模範的 | 十分 | 要改善 |
+| ---- | ---------------------------------------------------------------------- | -------------------------------------------------------------- | -------------------------- |
+| | 出典が明記された適切な長さのレポートが.docファイルとして提示されている | レポートに出典が明記されていない、もしくは必要な長さよりも短い | レポートが提示されていない |
diff --git a/1-Introduction/4-techniques-of-ML/translations/assignment.zh-cn.md b/1-Introduction/4-techniques-of-ML/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..ba28b55497
--- /dev/null
+++ b/1-Introduction/4-techniques-of-ML/translations/assignment.zh-cn.md
@@ -0,0 +1,11 @@
+# 采访一位数据科学家
+
+## 说明
+
+在你的公司、你所在的社群、或者在你的朋友和同学中,找到一位从事数据科学专业工作的人,与他或她交流一下。写一篇关于他们工作日常的小短文(500字左右)。他们是专家,还是说他们是“全栈”开发者?
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | ------------------------------------------------------------------------------------ | ------------------------------------------------------------------ | --------------------- |
+| | 提交一篇清晰描述了职业属性且字数符合规范的word文档 | 提交的文档职业属性描述得不清晰或者字数不合规范 | 啥都没有交 |
diff --git a/1-Introduction/README.es.md b/1-Introduction/translations/README.es.md
similarity index 100%
rename from 1-Introduction/README.es.md
rename to 1-Introduction/translations/README.es.md
diff --git a/1-Introduction/translations/README.fr.md b/1-Introduction/translations/README.fr.md
new file mode 100644
index 0000000000..462dea70e8
--- /dev/null
+++ b/1-Introduction/translations/README.fr.md
@@ -0,0 +1,22 @@
+# Introduction au machine learning
+
+Dans cette section du programme, vous découvrirez les concepts de base sous-jacents au domaine du machine learning, ce qu’il est, et vous découvrirez son histoire et les techniques que les chercheurs utilisent pour travailler avec lui. Explorons ensemble ce nouveau monde de ML !
+
+![globe](../images/globe.jpg)
+> Photo par Bill Oxford sur Unsplash
+
+### Leçons
+
+1. [Introduction au machine learning](../1-intro-to-ML/translations/README.fr.md)
+1. [L’histoire du machine learning et de l’IA](../2-history-of-ML/translations/README.fr.md)
+1. [Équité et machine learning](../3-fairness/translations/README.fr.md)
+1. [Techniques de machine learning](../4-techniques-of-ML/translations/README.fr.md)
+### Crédits
+
+"Introduction au machine learning" a été écrit avec ♥️ par une équipe de personnes comprenant [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan), [Ornella Altunyan](https://twitter.com/ornelladotcom) et [Jen Looper](https://twitter.com/jenlooper)
+
+"L’histoire du machine learning" a été écrit avec ♥️ par [Jen Looper](https://twitter.com/jenlooper) et [Amy Boyd](https://twitter.com/AmyKateNicho)
+
+"Équité et machine learning" a été écrit avec ♥️ par [Tomomi Imura](https://twitter.com/girliemac)
+
+"Techniques de machine learning" a été écrit avec ♥️ par [Jen Looper](https://twitter.com/jenlooper) et [Chris Noring](https://twitter.com/softchris)
diff --git a/1-Introduction/translations/README.id.md b/1-Introduction/translations/README.id.md
new file mode 100644
index 0000000000..0e6cc55750
--- /dev/null
+++ b/1-Introduction/translations/README.id.md
@@ -0,0 +1,23 @@
+# Pengantar Machine Learning
+
+Di bagian kurikulum ini, Kamu akan berkenalan dengan konsep yang mendasari bidang Machine Learning, apa itu Machine Learning, dan belajar mengenai
+sejarah serta teknik-teknik yang digunakan oleh para peneliti. Ayo jelajahi dunia baru Machine Learning bersama!
+
+![bola dunia](../images/globe.jpg)
+> Foto oleh Bill Oxford di Unsplash
+
+### Pelajaran
+
+1. [Pengantar Machine Learning](../1-intro-to-ML/translations/README.id.md)
+1. [Sejarah dari Machine Learning dan AI](../2-history-of-ML/translations/README.id.md)
+1. [Keadilan dan Machine Learning](../3-fairness/translations/README.id.md)
+1. [Teknik-Teknik Machine Learning](../4-techniques-of-ML/translations/README.id.md)
+### Penghargaan
+
+"Pengantar Machine Learning" ditulis dengan ♥️ oleh sebuah tim yang terdiri dari [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan), [Ornella Altunyan](https://twitter.com/ornelladotcom) dan [Jen Looper](https://twitter.com/jenlooper)
+
+"Sejarah dari Machine Learning dan AI" ditulis dengan ♥️ oleh [Jen Looper](https://twitter.com/jenlooper) dan [Amy Boyd](https://twitter.com/AmyKateNicho)
+
+"Keadilan dan Machine Learning" ditulis dengan ♥️ oleh [Tomomi Imura](https://twitter.com/girliemac)
+
+"Teknik-Teknik Machine Learning" ditulis dengan ♥️ oleh [Jen Looper](https://twitter.com/jenlooper) dan [Chris Noring](https://twitter.com/softchris)
diff --git a/1-Introduction/translations/README.it.md b/1-Introduction/translations/README.it.md
new file mode 100644
index 0000000000..a9460c2659
--- /dev/null
+++ b/1-Introduction/translations/README.it.md
@@ -0,0 +1,22 @@
+# Introduzione a machine learning
+
+In questa sezione del programma di studi, verranno presentati i concetti di base sottostanti machine learning, di cosa si tratta, e si imparerà la sua storia e le tecniche utilizzate dai ricercatori per lavorarci. Si esplorerà insieme questo nuovo mondo di ML!
+
+![globo](../images/globe.jpg)
+> Foto di Bill Oxford su Unsplash
+
+### Lezioni
+
+1. [Introduzione a machine learning](../1-intro-to-ML/translations/README.it.md)
+1. [La storia di machine learning e dell'AI](../2-history-of-ML/translations/README.it.md)
+1. [Equità e machine learning](../3-fairness/translations/README.it.md)
+1. [Tecniche di machine learning](../4-techniques-of-ML/translations/README.it.md)
+### Crediti
+
+"Introduzione a Machine Learning" scritto con ♥️ da un team di persone tra cui [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan), [Ornella Altunyan](https://twitter.com/ornelladotcom) e [Jen Looper](https://twitter.com/jenlooper)
+
+"La Storia di Machine Learning" scritto con ♥️ da [Jen Looper](https://twitter.com/jenlooper) e [Amy Boyd](https://twitter.com/AmyKateNicho)
+
+"Equità e Machine Learning" scritto con ♥️ da [Tomomi Imura](https://twitter.com/girliemac)
+
+"Tecniche di Machine Learning" scritto con ♥️ da [Jen Looper](https://twitter.com/jenlooper) e [Chris Noring](https://twitter.com/softchris)
\ No newline at end of file
diff --git a/1-Introduction/translations/README.ja.md b/1-Introduction/translations/README.ja.md
new file mode 100644
index 0000000000..f2f64c2c63
--- /dev/null
+++ b/1-Introduction/translations/README.ja.md
@@ -0,0 +1,22 @@
+# 機械学習への導入
+
+このセクションでは、機械学習の分野の基礎となる概念、機械学習とは何かを紹介し、その歴史や研究者が機械学習を扱う際に使用する技術について学びます。 新しいMLの世界を一緒に探求していきましょう!
+
+![地球](../images/globe.jpg)
+> UnsplashのBill Oxfordによる写真
+
+### Lessons
+
+1. [機械学習への導入](../1-intro-to-ML/translations/README.ja.md)
+1. [機械学習とAIの歴史](../2-history-of-ML/translations/README.ja.md)
+1. [機械学習における公平さ](../3-fairness/translations/README.ja.md)
+1. [機械学習の技術](../4-techniques-of-ML/translations/README.ja.md)
+### Credits
+
+"機械学習への導入 "は、[Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan)、[Ornella Altunyan](https://twitter.com/ornelladotcom)、[Jen Looper](https://twitter.com/jenlooper)などのチームによって制作されました。
+
+"機械学習とAIの歴史" は[Jen Looper](https://twitter.com/jenlooper)、[Amy Boyd](https://twitter.com/AmyKateNicho)によって制作されました。
+
+"公平性と機械学習"は[Tomomi Imura](https://twitter.com/girliemac) によって制作されました。
+
+"機械学習の技術"は[Jen Looper](https://twitter.com/jenlooper)と[Chris Noring](https://twitter.com/softchris) によって制作されました。
diff --git a/1-Introduction/translations/README.ru.md b/1-Introduction/translations/README.ru.md
new file mode 100644
index 0000000000..887fdacdbf
--- /dev/null
+++ b/1-Introduction/translations/README.ru.md
@@ -0,0 +1,22 @@
+# Введение в машинное обучение
+
+В этом разделе учебной программы вы познакомитесь с базовыми концепциями, лежащими в основе области машинного обучения, что это такое, и узнаете о его истории и методах, которые исследователи используют для работы с ним. Давайте вместе исследуем этот новый мир машинного обучения!
+
+![глобус](../images/globe.jpg)
+> Фото Билла Оксфорда на Unsplash
+
+### Уроки
+
+1. [Введение в машинное обучение](../1-intro-to-ML/README.md)
+1. [История машинного обучения и искусственного интеллекта](../2-history-of-ML/README.md)
+1. [Справедливость и машинное обучение](../3-fairness/README.md)
+1. [Приемы машинного обучения](../4-techniques-of-ML/README.md)
+### Благодарности
+
+«Введение в машинное обучение» было написано с ♥ ️группой людей, включая [Мухаммад Сакиб Хан Инан](https://twitter.com/Sakibinan), [Орнелла Алтунян](https://twitter.com/ornelladotcom) и [Джен Лупер](https://twitter.com/jenlooper)
+
+«История машинного обучения» была написана с ♥ ️[Джен Лупер](https://twitter.com/jenlooper) и [Эми Бойд](https://twitter.com/AmyKateNicho)
+
+«Справедливость и машинное обучение» написано с ♥ ️[Томоми Имура](https://twitter.com/girliemac)
+
+«Методы машинного обучения» были написаны с ♥ ️[Джен Лупер](https://twitter.com/jenlooper) и [Крис Норинг](https://twitter.com/softchris)
\ No newline at end of file
diff --git a/1-Introduction/translations/README.zh-cn.md b/1-Introduction/translations/README.zh-cn.md
new file mode 100644
index 0000000000..f1ad8e1ebe
--- /dev/null
+++ b/1-Introduction/translations/README.zh-cn.md
@@ -0,0 +1,22 @@
+# 机器学习入门
+
+课程的本章节将为您介绍机器学习领域背后的基本概念、什么是机器学习,并学习它的历史以及曾为此做出贡献的技术研究者门。让我们一起开始探索机器学习的全新世界吧!
+
+![globe](../images/globe.jpg)
+> 图片由 Bill Oxford提供,来自 Unsplash
+
+### 课程安排
+
+1. [机器学习简介](../1-intro-to-ML/translations/README.zh-cn.md)
+1. [机器学习的历史](../2-history-of-ML/translations/README.zh-cn.md)
+1. [机器学习中的公平性](../3-fairness/translations/README.zh-cn.md)
+1. [机器学习技术](../4-techniques-of-ML/translations/README.zh-cn.md)
+### 致谢
+
+"机器学习简介"由 [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan), [Ornella Altunyan](https://twitter.com/ornelladotcom) 及 [Jen Looper](https://twitter.com/jenlooper),共同倾 ♥️ 而作
+
+"机器学习及人工智能历史" 由 [Jen Looper](https://twitter.com/jenlooper) 及 [Amy Boyd](https://twitter.com/AmyKateNicho)倾 ♥️ 而作
+
+"公平性与机器学习" 由 [Tomomi Imura](https://twitter.com/girliemac) 倾 ♥️ 而作
+
+"机器学习的技术" 由 [Jen Looper](https://twitter.com/jenlooper) 及 [Chris Noring](https://twitter.com/softchris) 倾 ♥️ 而作
diff --git a/2-Regression/1-Tools/README.md b/2-Regression/1-Tools/README.md
index d18f529587..275e79f567 100644
--- a/2-Regression/1-Tools/README.md
+++ b/2-Regression/1-Tools/README.md
@@ -4,7 +4,10 @@
> Sketchnote by [Tomomi Imura](https://www.twitter.com/girlie_mac)
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/9/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/9/)
+
+> ### [This lesson is available in R!](./solution/lesson_1-R.ipynb)
+
## Introduction
In these four lessons, you will discover how to build regression models. We will discuss what these are for shortly. But before you do anything, make sure you have the right tools in place to start the process!
@@ -52,7 +55,7 @@ In this folder, you will find the file _notebook.ipynb_.
Next, add some Python code.
-1. Type **print("hello notebook'")** in the code block.
+1. Type **print('hello notebook')** in the code block.
1. Select the arrow to run the code.
You should see the printed statement:
@@ -95,7 +98,7 @@ For this task we will import some libraries:
- **matplotlib**. It's a useful [graphing tool](https://matplotlib.org/) and we will use it to create a line plot.
- **numpy**. [numpy](https://numpy.org/doc/stable/user/whatisnumpy.html) is a useful library for handling numeric data in Python.
-- **sklearn**. This is the Scikit-learn library.
+- **sklearn**. This is the [Scikit-learn](https://scikit-learn.org/stable/user_guide.html) library.
Import some libraries to help with your tasks.
@@ -180,6 +183,9 @@ In a new code cell, load the diabetes dataset by calling `load_diabetes()`. The
```python
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
+ plt.xlabel('Scaled BMIs')
+ plt.ylabel('Disease Progression')
+ plt.title('A Graph Plot Showing Diabetes Progression Against BMI')
plt.show()
```
@@ -193,7 +199,7 @@ Congratulations, you built your first linear regression model, created a predict
## 🚀Challenge
Plot a different variable from this dataset. Hint: edit this line: `X = X[:, np.newaxis, 2]`. Given this dataset's target, what are you able to discover about the progression of diabetes as a disease?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/10/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/10/)
## Review & Self Study
diff --git a/2-Regression/1-Tools/assignment.md b/2-Regression/1-Tools/assignment.md
index dd58a16970..de37856c51 100644
--- a/2-Regression/1-Tools/assignment.md
+++ b/2-Regression/1-Tools/assignment.md
@@ -2,7 +2,7 @@
## Instructions
-Take a look at the [Linnerud dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_linnerud.html#sklearn.datasets.load_linnerud) in Scikit-learn. This dataset has multiple [targets](https://scikit-learn.org/stable/datasets/toy_dataset.html#linnerrud-dataset): 'It consists of three excercise (data) and three physiological (target) variables collected from twenty middle-aged men in a fitness club'.
+Take a look at the [Linnerud dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_linnerud.html#sklearn.datasets.load_linnerud) in Scikit-learn. This dataset has multiple [targets](https://scikit-learn.org/stable/datasets/toy_dataset.html#linnerrud-dataset): 'It consists of three exercise (data) and three physiological (target) variables collected from twenty middle-aged men in a fitness club'.
In your own words, describe how to create a Regression model that would plot the relationship between the waistline and how many situps are accomplished. Do the same for the other datapoints in this dataset.
diff --git a/2-Regression/1-Tools/images/encouRage.jpg b/2-Regression/1-Tools/images/encouRage.jpg
new file mode 100644
index 0000000000..e1d08fc267
Binary files /dev/null and b/2-Regression/1-Tools/images/encouRage.jpg differ
diff --git a/2-Regression/1-Tools/images/notebook.png b/2-Regression/1-Tools/images/notebook.png
index 27a187118d..bea9e0e9ff 100644
Binary files a/2-Regression/1-Tools/images/notebook.png and b/2-Regression/1-Tools/images/notebook.png differ
diff --git a/2-Regression/1-Tools/images/scatterplot.png b/2-Regression/1-Tools/images/scatterplot.png
index ba9f1610c7..446529a587 100644
Binary files a/2-Regression/1-Tools/images/scatterplot.png and b/2-Regression/1-Tools/images/scatterplot.png differ
diff --git a/2-Regression/1-Tools/solution/lesson_1-R.ipynb b/2-Regression/1-Tools/solution/lesson_1-R.ipynb
new file mode 100644
index 0000000000..ef28c30d16
--- /dev/null
+++ b/2-Regression/1-Tools/solution/lesson_1-R.ipynb
@@ -0,0 +1,441 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+ "colab": {
+ "name": "lesson_1-R.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "name": "ir",
+ "display_name": "R"
+ },
+ "language_info": {
+ "name": "R"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+        "# Build a regression model: Get started with R and Tidymodels for regression models"
+ ],
+ "metadata": {
+ "id": "YJUHCXqK57yz"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Introduction to Regression - Lesson 1\r\n",
+ "\r\n",
+ "#### Putting it into perspective\r\n",
+ "\r\n",
+ "✅ There are many types of regression methods, and which one you pick depends on the answer you're looking for. If you want to predict the probable height for a person of a given age, you'd use `linear regression`, as you're seeking a **numeric value**. If you're interested in discovering whether a type of cuisine should be considered vegan or not, you're looking for a **category assignment** so you would use `logistic regression`. You'll learn more about logistic regression later. Think a bit about some questions you can ask of data, and which of these methods would be more appropriate.\r\n",
+ "\r\n",
+ "In this section, you will work with a [small dataset about diabetes](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html). Imagine that you wanted to test a treatment for diabetic patients. Machine Learning models might help you determine which patients would respond better to the treatment, based on combinations of variables. Even a very basic regression model, when visualized, might show information about variables that would help you organize your theoretical clinical trials.\r\n",
+ "\r\n",
+ "That said, let's get started on this task!\r\n",
+ "\r\n",
+        "![Artwork by @allison_horst](../images/encouRage.jpg)\r\n",
+ " \r\n",
+ " Artwork by @allison_horst\r\n",
+ "\r\n",
+ ""
+ ],
+ "metadata": {
+ "id": "LWNNzfqd6feZ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Loading up our tool set\n",
+ "\n",
+ "For this task, we'll require the following packages:\n",
+ "\n",
+        "- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!\n",
+ "\n",
+ "- `tidymodels`: The [tidymodels](https://www.tidymodels.org/) framework is a [collection of packages](https://www.tidymodels.org/packages/) for modeling and machine learning.\n",
+ "\n",
+ "You can have them installed as:\n",
+ "\n",
+ "`install.packages(c(\"tidyverse\", \"tidymodels\"))`\n",
+ "\n",
+ "The script below checks whether you have the packages required to complete this module and installs them for you in case some are missing."
+ ],
+ "metadata": {
+ "id": "FIo2YhO26wI9"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "source": [
+ "suppressWarnings(if(!require(\"pacman\")) install.packages(\"pacman\"))\r\n",
+ "pacman::p_load(tidyverse, tidymodels)"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "Loading required package: pacman\n",
+ "\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "id": "cIA9fz9v7Dss",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "2df7073b-86b2-4b32-cb86-0da605a0dc11"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+        "Now, let's load these awesome packages and make them available in our current R session. (This is just for illustration; `pacman::p_load()` has already done that for you.)"
+ ],
+ "metadata": {
+ "id": "gpO_P_6f9WUG"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# load the core Tidyverse packages\r\n",
+ "library(tidyverse)\r\n",
+ "\r\n",
+ "# load the core Tidymodels packages\r\n",
+ "library(tidymodels)\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "NLMycgG-9ezO"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. The diabetes dataset\n",
+ "\n",
+        "In this exercise, we'll put our regression skills on display by making predictions on a diabetes dataset. The [diabetes dataset](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.rwrite1.txt) includes `442 samples` of data around diabetes, with 10 predictor feature variables, `age`, `sex`, `body mass index`, `average blood pressure`, and `six blood serum measurements` as well as an outcome variable `y`: a quantitative measure of disease progression one year after baseline.\n",
+ "\n",
+ "|Number of observations|442|\n",
+ "|----------------------|:---|\n",
+ "|Number of predictors|First 10 columns are numeric predictive|\n",
+ "|Outcome/Target|Column 11 is a quantitative measure of disease progression one year after baseline|\n",
+ "|Predictor Information|- age in years\n",
+ "||- sex\n",
+ "||- bmi body mass index\n",
+ "||- bp average blood pressure\n",
+ "||- s1 tc, total serum cholesterol\n",
+ "||- s2 ldl, low-density lipoproteins\n",
+ "||- s3 hdl, high-density lipoproteins\n",
+ "||- s4 tch, total cholesterol / HDL\n",
+ "||- s5 ltg, possibly log of serum triglycerides level\n",
+ "||- s6 glu, blood sugar level|\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "> 🎓 Remember, this is supervised learning, and we need a named 'y' target.\n",
+ "\n",
+ "Before you can manipulate data with R, you need to import the data into R's memory, or build a connection to the data that R can use to access the data remotely.\n",
+ "\n",
+ "> The [readr](https://readr.tidyverse.org/) package, which is part of the Tidyverse, provides a fast and friendly way to read rectangular data into R.\n",
+ "\n",
+        "Now, let's load the diabetes dataset provided in this source URL: <https://www4.stat.ncsu.edu/~boos/var.select/diabetes.rwrite1.txt>\n",
+ "\n",
+        "Also, we'll perform a sanity check on our data using `glimpse()` and display the first 5 rows using `slice()`.\n",
+ "\n",
+ "Before going any further, let's also introduce something you will encounter often in R code 🥁🥁: the pipe operator `%>%`\n",
+ "\n",
+ "The pipe operator (`%>%`) performs operations in logical sequence by passing an object forward into a function or call expression. You can think of the pipe operator as saying \"and then\" in your code."
+ ],
+ "metadata": {
+ "id": "KM6iXLH996Cl"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Import the data set\r\n",
+ "diabetes <- read_table2(file = \"https://www4.stat.ncsu.edu/~boos/var.select/diabetes.rwrite1.txt\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Get a glimpse and dimensions of the data\r\n",
+ "glimpse(diabetes)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Select the first 5 rows of the data\r\n",
+ "diabetes %>% \r\n",
+ " slice(1:5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "Z1geAMhM-bSP"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "`glimpse()` shows us that this data has 442 rows and 11 columns with all the columns being of data type `double` \n",
+ "\n",
+ " \n",
+ "\n",
+ "\n",
+ "\n",
+ "> glimpse() and slice() are functions in [`dplyr`](https://dplyr.tidyverse.org/). Dplyr, part of the Tidyverse, is a grammar of data manipulation that provides a consistent set of verbs that help you solve the most common data manipulation challenges\n",
+ "\n",
+ " \n",
+ "\n",
+ "Now that we have the data, let's narrow down to one feature (`bmi`) to target for this exercise. This will require us to select the desired columns. So, how do we do this?\n",
+ "\n",
+ "[`dplyr::select()`](https://dplyr.tidyverse.org/reference/select.html) allows us to *select* (and optionally rename) columns in a data frame."
+ ],
+ "metadata": {
+ "id": "UwjVT1Hz-c3Z"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Select predictor feature `bmi` and outcome `y`\r\n",
+ "diabetes_select <- diabetes %>% \r\n",
+ " select(c(bmi, y))\r\n",
+ "\r\n",
+ "# Print the first 5 rows\r\n",
+ "diabetes_select %>% \r\n",
+ " slice(1:10)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "RDY1oAKI-m80"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Training and Testing data\n",
+ "\n",
+ "It's common practice in supervised learning to *split* the data into two subsets; a (typically larger) set with which to train the model, and a smaller \"hold-back\" set with which to see how the model performed.\n",
+ "\n",
+ "Now that we have data ready, we can see if a machine can help determine a logical split between the numbers in this dataset. We can use the [rsample](https://tidymodels.github.io/rsample/) package, which is part of the Tidymodels framework, to create an object that contains the information on *how* to split the data, and then two more rsample functions to extract the created training and testing sets:\n"
+ ],
+ "metadata": {
+ "id": "SDk668xK-tc3"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "set.seed(2056)\r\n",
+        "# Split 67% of the data for training and the rest for testing\r\n",
+ "diabetes_split <- diabetes_select %>% \r\n",
+ " initial_split(prop = 0.67)\r\n",
+ "\r\n",
+ "# Extract the resulting train and test sets\r\n",
+ "diabetes_train <- training(diabetes_split)\r\n",
+ "diabetes_test <- testing(diabetes_split)\r\n",
+ "\r\n",
+ "# Print the first 3 rows of the training set\r\n",
+ "diabetes_train %>% \r\n",
+ " slice(1:10)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "EqtHx129-1h-"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 4. Train a linear regression model with Tidymodels\n",
+ "\n",
+ "Now we are ready to train our model!\n",
+ "\n",
+ "In Tidymodels, you specify models using `parsnip()` by specifying three concepts:\n",
+ "\n",
+ "- Model **type** differentiates models such as linear regression, logistic regression, decision tree models, and so forth.\n",
+ "\n",
+ "- Model **mode** includes common options like regression and classification; some model types support either of these while some only have one mode.\n",
+ "\n",
+ "- Model **engine** is the computational tool which will be used to fit the model. Often these are R packages, such as **`\"lm\"`** or **`\"ranger\"`**\n",
+ "\n",
+ "This modeling information is captured in a model specification, so let's build one!"
+ ],
+ "metadata": {
+ "id": "sBOS-XhB-6v7"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Build a linear model specification\r\n",
+ "lm_spec <- \r\n",
+ " # Type\r\n",
+ " linear_reg() %>% \r\n",
+ " # Engine\r\n",
+ " set_engine(\"lm\") %>% \r\n",
+ " # Mode\r\n",
+ " set_mode(\"regression\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print the model specification\r\n",
+ "lm_spec"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "20OwEw20--t3"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "After a model has been *specified*, the model can be *estimated* or *trained* using the [`fit()`](https://parsnip.tidymodels.org/reference/fit.html) function, typically using a formula and some data.\n",
+ "\n",
+ "`y ~ .` means we'll fit `y` as the predicted quantity/target, explained by all the predictors/features, i.e. `.` (in this case, we only have one predictor: `bmi`)."
+ ],
+ "metadata": {
+ "id": "_oDHs89k_CJj"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Build a linear model specification\r\n",
+ "lm_spec <- linear_reg() %>% \r\n",
+ " set_engine(\"lm\") %>%\r\n",
+ " set_mode(\"regression\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Train a linear regression model\r\n",
+ "lm_mod <- lm_spec %>% \r\n",
+ " fit(y ~ ., data = diabetes_train)\r\n",
+ "\r\n",
+ "# Print the model\r\n",
+ "lm_mod"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "YlsHqd-q_GJQ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "From the model output, we can see the coefficients learned during training. They define the line of best fit that gives us the lowest overall error between the actual and predicted values.\n",
+ " \n",
+ "\n",
+ "## 5. Make predictions on the test set\n",
+ "\n",
+ "Now that we've trained a model, we can use it to predict the disease progression y for the test dataset using [parsnip::predict()](https://parsnip.tidymodels.org/reference/predict.model_fit.html). This will be used to draw the line between data groups."
+ ],
+ "metadata": {
+ "id": "kGZ22RQj_Olu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make predictions for the test set\r\n",
+ "predictions <- lm_mod %>% \r\n",
+ " predict(new_data = diabetes_test)\r\n",
+ "\r\n",
+ "# Print out some of the predictions\r\n",
+ "predictions %>% \r\n",
+ " slice(1:5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "nXHbY7M2_aao"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Woohoo! 💃🕺 We just trained a model and used it to make predictions!\n",
+ "\n",
+ "When making predictions, the tidymodels convention is to always produce a tibble/data frame of results with standardized column names. This makes it easy to combine the original data and the predictions in a usable format for subsequent operations such as plotting.\n",
+ "\n",
+ "`dplyr::bind_cols()` efficiently binds multiple data frames by column."
+ ],
+ "metadata": {
+ "id": "R_JstwUY_bIs"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Combine the predictions and the original test set\r\n",
+ "results <- diabetes_test %>% \r\n",
+ " bind_cols(predictions)\r\n",
+ "\r\n",
+ "\r\n",
+ "results %>% \r\n",
+ " slice(1:5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "RybsMJR7_iI8"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 6. Plot modelling results\n",
+ "\n",
+ "Now, it's time to see this visually 📈. We'll create a scatter plot of all the `y` and `bmi` values of the test set, then use the predictions to draw a line in the most appropriate place between the model's data groupings.\n",
+ "\n",
+ "R has several systems for making graphs, but `ggplot2` is one of the most elegant and most versatile. It allows you to compose graphs by **combining independent components**."
+ ],
+ "metadata": {
+ "id": "XJbYbMZW_n_s"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Set a theme for the plot\r\n",
+ "theme_set(theme_light())\r\n",
+ "# Create a scatter plot\r\n",
+ "results %>% \r\n",
+ " ggplot(aes(x = bmi)) +\r\n",
+ " # Add a scatter plot\r\n",
+ " geom_point(aes(y = y), size = 1.6) +\r\n",
+ " # Add a line plot\r\n",
+ " geom_line(aes(y = .pred), color = \"blue\", size = 1.5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "R9tYp3VW_sTn"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "> ✅ Think a bit about what's going on here. A straight line is running through many small dots of data, but what is it doing exactly? Can you see how you should be able to use this line to predict where a new, unseen data point should fit in relationship to the plot's y axis? Try to put into words the practical use of this model.\n",
+ "\n",
+ "Congratulations, you built your first linear regression model, created a prediction with it, and displayed it in a plot!\n"
+ ],
+ "metadata": {
+ "id": "zrPtHIxx_tNI"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/2-Regression/1-Tools/solution/lesson_1.Rmd b/2-Regression/1-Tools/solution/lesson_1.Rmd
new file mode 100644
index 0000000000..d6a0c0eac6
--- /dev/null
+++ b/2-Regression/1-Tools/solution/lesson_1.Rmd
@@ -0,0 +1,250 @@
+---
+title: 'Build a regression model: Get started with R and Tidymodels for regression models'
+output:
+ html_document:
+ df_print: paged
+ theme: flatly
+ highlight: breezedark
+ toc: yes
+ toc_float: yes
+ code_download: yes
+---
+
+## Introduction to Regression - Lesson 1
+
+#### Putting it into perspective
+
+✅ There are many types of regression methods, and which one you pick depends on the answer you're looking for. If you want to predict the probable height for a person of a given age, you'd use `linear regression`, as you're seeking a **numeric value**. If you're interested in discovering whether a type of cuisine should be considered vegan or not, you're looking for a **category assignment** so you would use `logistic regression`. You'll learn more about logistic regression later. Think a bit about some questions you can ask of data, and which of these methods would be more appropriate.
+
+In this section, you will work with a [small dataset about diabetes](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html). Imagine that you wanted to test a treatment for diabetic patients. Machine Learning models might help you determine which patients would respond better to the treatment, based on combinations of variables. Even a very basic regression model, when visualized, might show information about variables that would help you organize your theoretical clinical trials.
+
+That said, let's get started on this task!
+
+![Artwork by \@allison_horst](../images/encouRage.jpg){width="630"}
+
+## 1. Loading up our tool set
+
+For this task, we'll require the following packages:
+
+- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!
+
+- `tidymodels`: The [tidymodels](https://www.tidymodels.org/) framework is a [collection of packages](https://www.tidymodels.org/packages/) for modeling and machine learning.
+
+You can install them with:
+
+`install.packages(c("tidyverse", "tidymodels"))`
+
+The script below checks whether you have the packages required to complete this module and installs them for you in case they are missing.
+
+```{r, message=F, warning=F}
+if (!require("pacman")) install.packages("pacman")
+pacman::p_load(tidyverse, tidymodels)
+```
+
+Now, let's load these awesome packages and make them available in our current R session. (This is just for illustration; `pacman::p_load()` has already done this for you.)
+
+```{r load_tidy_verse_models, message=F, warning=F}
+# load the core Tidyverse packages
+library(tidyverse)
+
+# load the core Tidymodels packages
+library(tidymodels)
+
+
+```
+
+## 2. The diabetes dataset
+
+In this exercise, we'll put our regression skills on display by making predictions on a diabetes dataset. The [diabetes dataset](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.rwrite1.txt) includes 442 samples of data around diabetes, with 10 predictor feature variables (`age`, `sex`, `body mass index`, `average blood pressure`, and six blood serum measurements) as well as an outcome variable `y`: a quantitative measure of disease progression one year after baseline.
+
++----------------------------+------------------------------------------------------------------------------------+
+| **Number of observations** | **442** |
++============================+====================================================================================+
+| **Number of predictors** | First 10 columns are numeric predictive values |
++----------------------------+------------------------------------------------------------------------------------+
+| **Outcome/Target** | Column 11 is a quantitative measure of disease progression one year after baseline |
++----------------------------+------------------------------------------------------------------------------------+
+| **Predictor Information** | - age age in years |
+| | - sex |
+| | - bmi body mass index |
+| | - bp average blood pressure |
+| | - s1 tc, total serum cholesterol |
+| | - s2 ldl, low-density lipoproteins |
+| | - s3 hdl, high-density lipoproteins |
+| | - s4 tch, total cholesterol / HDL |
+| | - s5 ltg, possibly log of serum triglycerides level |
+| | - s6 glu, blood sugar level |
++----------------------------+------------------------------------------------------------------------------------+
+
+> 🎓 Remember, this is supervised learning, and we need a named 'y' target.
+
+Before you can manipulate data with R, you need to import the data into R's memory, or build a connection to the data that R can use to access the data remotely.\
+
+> The [readr](https://readr.tidyverse.org/) package, which is part of the Tidyverse, provides a fast and friendly way to read rectangular data into R.
+
+Now, let's load the diabetes dataset from its source URL. We'll also perform a sanity check on the data using `glimpse()` and display the first 5 rows using `slice()`.
+
+Before going any further, let's introduce something you will encounter quite often in R code: the pipe operator `%>%`.
+
+The pipe operator (`%>%`) performs operations in logical sequence by passing an object forward into a function or call expression. You can think of the pipe operator as saying "and then" in your code.\
+
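+To see the idea in isolation, here is a minimal sketch (it uses a throwaway vector rather than the lesson's data) showing that piping a value into a function is equivalent to a nested call:
+
+```{r pipe_demo, message=F, warning=F}
+# These two expressions are equivalent: the pipe passes the vector
+# forward as the first argument of sqrt()
+sqrt(c(1, 4, 9))
+
+c(1, 4, 9) %>%
+  sqrt()
+```
+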
+```{r load_dataset, message=F, warning=F}
+# Import the data set
+diabetes <- read_table2(file = "https://www4.stat.ncsu.edu/~boos/var.select/diabetes.rwrite1.txt")
+
+
+# Get a glimpse and dimensions of the data
+glimpse(diabetes)
+
+
+# Select the first 5 rows of the data
+diabetes %>%
+ slice(1:5)
+
+```
+
+`glimpse()` shows us that this data has 442 rows and 11 columns, with all the columns being of data type `double`.
+
+> `glimpse()` and `slice()` are functions in [`dplyr`](https://dplyr.tidyverse.org/). dplyr, part of the Tidyverse, is a grammar of data manipulation that provides a consistent set of verbs to help you solve the most common data manipulation challenges.
+
+Now that we have the data, let's narrow down to one feature (`bmi`) to target for this exercise. This will require us to select the desired columns. So, how do we do this?
+
+[`dplyr::select()`](https://dplyr.tidyverse.org/reference/select.html) allows us to *select* (and optionally rename) columns in a data frame.
+
+```{r select, message=F, warning=F}
+# Select predictor feature `bmi` and outcome `y`
+diabetes_select <- diabetes %>%
+ select(c(bmi, y))
+
+# Print the first 5 rows
+diabetes_select %>%
+ slice(1:5)
+```
+
+## 3. Training and Testing data
+
+It's common practice in supervised learning to *split* the data into two subsets: a (typically larger) set with which to train the model, and a smaller "hold-back" set with which to see how the model performed.
+
+Now that we have data ready, we can see if a machine can help determine a logical split between the numbers in this dataset. We can use the [rsample](https://tidymodels.github.io/rsample/) package, which is part of the Tidymodels framework, to create an object that contains the information on *how* to split the data, and then two more rsample functions to extract the created training and testing sets:
+
+```{r split, message=F, warning=F}
+set.seed(2056)
+# Split 67% of the data for training and the rest for testing
+diabetes_split <- diabetes_select %>%
+ initial_split(prop = 0.67)
+
+# Extract the resulting train and test sets
+diabetes_train <- training(diabetes_split)
+diabetes_test <- testing(diabetes_split)
+
+# Print the first 3 rows of the training set
+diabetes_train %>%
+ slice(1:3)
+
+```
+
+## 4. Train a linear regression model with Tidymodels
+
+Now we are ready to train our model!
+
+In Tidymodels, you specify models with the `parsnip` package by defining three concepts:
+
+- Model **type** differentiates models such as linear regression, logistic regression, decision tree models, and so forth.
+
+- Model **mode** includes common options like regression and classification; some model types support either of these while some only have one mode.
+
+- Model **engine** is the computational tool which will be used to fit the model. Often these are R packages, such as **`"lm"`** or **`"ranger"`**
+
+This modeling information is captured in a model specification, so let's build one!
+
+```{r lm_model_spec, message=F, warning=F}
+# Build a linear model specification
+lm_spec <-
+ # Type
+ linear_reg() %>%
+ # Engine
+ set_engine("lm") %>%
+ # Mode
+ set_mode("regression")
+
+
+# Print the model specification
+lm_spec
+
+```
+
+After a model has been *specified*, the model can be *estimated* or *trained* using the [`fit()`](https://parsnip.tidymodels.org/reference/fit.html) function, typically using a formula and some data.
+
+`y ~ .` means we'll fit `y` as the predicted quantity/target, explained by all the predictors/features, i.e. `.` (in this case, we only have one predictor: `bmi`).
+
+```{r train, message=F, warning=F}
+# Build a linear model specification
+lm_spec <- linear_reg() %>%
+ set_engine("lm") %>%
+ set_mode("regression")
+
+
+# Train a linear regression model
+lm_mod <- lm_spec %>%
+ fit(y ~ ., data = diabetes_train)
+
+# Print the model
+lm_mod
+```
+
+From the model output, we can see the coefficients learned during training. They define the line of best fit that gives us the lowest overall error between the actual and predicted values.
+
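+If you'd like those coefficients as a data frame rather than printed output, one option (a sketch; it assumes broom's `tidy()` method, which `library(tidymodels)` loads and which works on parsnip model fits) is:
+
+```{r tidy_coefs, message=F, warning=F}
+# Summarise the fitted model as a tibble: one row per term,
+# with its estimated coefficient and standard error
+lm_mod %>%
+  tidy()
+```
+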
+## 5. Make predictions on the test set
+
+Now that we've trained a model, we can use it to predict the disease progression y for the test dataset using [parsnip::predict()](https://parsnip.tidymodels.org/reference/predict.model_fit.html). This will be used to draw the line between data groups.
+
+```{r test, message=F, warning=F}
+# Make predictions for the test set
+predictions <- lm_mod %>%
+ predict(new_data = diabetes_test)
+
+# Print out some of the predictions
+predictions %>%
+ slice(1:5)
+```
+
+Woohoo! 💃🕺 We just trained a model and used it to make predictions!
+
+When making predictions, the tidymodels convention is to always produce a tibble/data frame of results with standardized column names. This makes it easy to combine the original data and the predictions in a usable format for subsequent operations such as plotting.
+
+`dplyr::bind_cols()` efficiently binds multiple data frames by column.
+
+```{r test_pred, message=F, warning=F}
+# Combine the predictions and the original test set
+results <- diabetes_test %>%
+ bind_cols(predictions)
+
+
+results %>%
+ slice(1:5)
+```
+
+## 6. Plot modelling results
+
+Now, it's time to see this visually 📈. We'll create a scatter plot of all the `y` and `bmi` values of the test set, then use the predictions to draw a line in the most appropriate place between the model's data groupings.
+
+R has several systems for making graphs, but `ggplot2` is one of the most elegant and most versatile. It allows you to compose graphs by **combining independent components**.
+
+```{r plot_pred, message=F, warning=F}
+# Set a theme for the plot
+theme_set(theme_light())
+# Create a scatter plot
+results %>%
+ ggplot(aes(x = bmi)) +
+ # Add a scatter plot
+ geom_point(aes(y = y), size = 1.6) +
+ # Add a line plot
+ geom_line(aes(y = .pred), color = "blue", size = 1.5)
+
+```
+
+> ✅ Think a bit about what's going on here. A straight line is running through many small dots of data, but what is it doing exactly? Can you see how you should be able to use this line to predict where a new, unseen data point should fit in relationship to the plot's y axis? Try to put into words the practical use of this model.
+
+Congratulations, you built your first linear regression model, created a prediction with it, and displayed it in a plot!
diff --git a/2-Regression/1-Tools/solution/notebook.ipynb b/2-Regression/1-Tools/solution/notebook.ipynb
index e7d80492aa..b70176240f 100644
--- a/2-Regression/1-Tools/solution/notebook.ipynb
+++ b/2-Regression/1-Tools/solution/notebook.ipynb
@@ -182,13 +182,6 @@
"plt.plot(X_test, y_pred, color='blue', linewidth=3)\n",
"plt.show()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
]
-}
\ No newline at end of file
+}
diff --git a/2-Regression/1-Tools/translations/README.id.md b/2-Regression/1-Tools/translations/README.id.md
new file mode 100644
index 0000000000..87b2be53b4
--- /dev/null
+++ b/2-Regression/1-Tools/translations/README.id.md
@@ -0,0 +1,208 @@
+# Memulai dengan Python dan Scikit-learn untuk model regresi
+
+![Ringkasan regresi dalam sebuah catatan sketsa](../../../sketchnotes/ml-regression.png)
+
+> Catatan sketsa oleh [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Kuis Pra-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/9/)
+## Pembukaan
+
+Dalam keempat pelajaran ini, kamu akan belajar bagaimana membangun model regresi. Kita akan berdiskusi apa fungsi model tersebut dalam sejenak. Tetapi sebelum kamu melakukan apapun, pastikan bahwa kamu sudah mempunyai alat-alat yang diperlukan untuk memulai!
+
+Dalam pelajaran ini, kamu akan belajar bagaimana untuk:
+
+- Konfigurasi komputermu untuk tugas pembelajaran.
+- Bekerja dengan Jupyter notebooks.
+- Menggunakan Scikit-learn, termasuk instalasi.
+- Menjelajahi regresi linear dengan latihan *hands-on*.
+
+
+## Instalasi dan konfigurasi
+
+[![Menggunakan Python dalam Visual Studio Code](https://img.youtube.com/vi/7EXd4_ttIuw/0.jpg)](https://youtu.be/7EXd4_ttIuw "Menggunakan Python dalam Visual Studio Code")
+
+> 🎥 Klik foto di atas untuk sebuah video: menggunakan Python dalam VS Code
+
+1. **Pasang Python**. Pastikan bahwa [Python](https://www.python.org/downloads/) telah dipasang di komputermu. Kamu akan menggunakan Python untuk banyak tugas *data science* dan *machine learning*. Python sudah dipasang di kebanyakan sistem komputer. Adapula *[Python Coding Packs](https://code.visualstudio.com/learn/educators/installers?WT.mc_id=academic-15963-cxa)* yang berguna untuk membantu proses pemasangan untuk beberapa pengguna.
+
+ Beberapa penggunaan Python memerlukan satu versi perangkat lunak tersebut, sedangkan beberapa penggunaan lainnya mungkin memerlukan versi Python yang beda lagi. Oleh sebab itulah akan sangat berguna untuk bekerja dalam sebuah *[virtual environment](https://docs.python.org/3/library/venv.html)* (lingkungan virtual).
+
+2. **Pasang Visual Studio Code**. Pastikan kamu sudah memasangkan Visual Studio Code di komputermu. Ikuti instruksi-instruksi ini untuk [memasangkan Visual Studio Code](https://code.visualstudio.com/) untuk instalasi dasar. Kamu akan menggunakan Python dalam Visual Studio Code dalam kursus ini, jadi kamu mungkin akan ingin mencari tahu cara [mengkonfigurasi Visual Studio Code](https://docs.microsoft.com/learn/modules/python-install-vscode?WT.mc_id=academic-15963-cxa) untuk menggunakan Python.
+
+ > Nyamankan diri dengan Python dengan mengerjakan [koleksi modul pembelajaran ini](https://docs.microsoft.com/users/jenlooper-2911/collections/mp1pagggd5qrq7?WT.mc_id=academic-15963-cxa)
+
+3. **Pasang Scikit-learn**, dengan mengikuti [instruksi di sini](https://scikit-learn.org/stable/install.html). Karena harus dipastikan bahwa kamu sedang menggunakan Python 3, kami anjurkan kamu menggunakan sebuah *virtual environment*. Ingatlah juga bahwa jika kamu ingin memasangkan ini di sebuah M1 Mac, ada instruksi khusus dalam laman yang ditautkan di atas.
+
+4. **Pasang Jupyter Notebook**. Kamu akan harus [memasang paket Jupyter](https://pypi.org/project/jupyter/).
+
+## Lingkungan penulisan ML-mu
+
+Kamu akan menggunakan ***notebooks*** untuk bekerja dengan kode Python-mu dan membuat model *machine learning*-mu. Jenis file ini adalah alat yang sering digunakan *data scientists* dan dapat diidentifikasikan dengan akhiran/ekstensi `.ipynb`.
+
+*Notebook* adalah sebuah lingkungan interaktif yang memungkinkan seorang *developer* untuk menulis kode, catatan, dan dokumentasi mengenai kode tersebut sehingga menjadi sangat berguna untuk proyek eksperimental ataupun riset.
+
+### Latihan - bekerja dengan sebuah *notebook*
+
+Dalam folder ini, kamu akan menemukan file _notebook.ipynb_.
+
+1. Buka _notebook.ipynb_ dalam Visual Studio Code.
+
+ Sebuah server Jupyter akan mulai dengan Python 3+. Kamu akan menemukan bagian-bagian *notebook* yang dapat di-`run` (dijalankan). Bagian-bagian tersebut adalah carikan-carikan kode. Kamu bisa menjalankan secarik kode dengan mengklik tombol ▶.
+
+2. Pilih ikon `md` dan tambahlah sedikit *markdown*: **# Selamat datang di *notebook* saya!**
+
+ Lalu, tambahlah sedikit kode Python.
+
+3. Ketik **print('hello notebook')** dalam blok kode.
+
+4. Klik ▶ untuk menjalankan kode.
+
+ Hasilnya harusnya ini:
+
+ ```output
+ hello notebook
+ ```
+
+![Sebuah *notebook* yang dibuka di VS Code](../images/notebook.png)
+
+Kamu bisa menyelipkan catatan-catatan antara kodemu untuk mendokumentasi *notebook*-nya.
+
+✅ Pikirkanlah sejenak bagaimana lingkungan seorang *web developer* berbeda dengan lingkungan seorang *data scientist*.
+
+## Berjalan dengan Scikit-learn
+
+Sekarang, Python sudah siap dalam lingkungan lokalmu, dan kamu sudah nyaman bekerja dengan *Jupyter notebook*. Marilah membiasakan diri dengan Scikit-learn (dilafalkan `saikit lern`; huruf `e` dalam `lern` seperti `e` dalam kata `Perancis`). Scikit-learn menyediakan sebuah [API ekstensif](https://scikit-learn.org/stable/modules/classes.html#api-ref) untuk membantu kamu mengerjakan tugas ML.
+
+Berdasarkan [situs mereka](https://scikit-learn.org/stable/getting_started.html), "Scikit-learn" adalah sebuah *open-source library* untuk *machine-learning* yang dapat digunakan untuk *supervised* dan *unsupervised learning*. Scikit-learn juga menyediakan beragam alat untuk *model-fitting*, *data preprocessing*, seleksi dan evaluasi model, dll.
+
+Dalam kursus ini, kamu akan menggunakan Scikit-learn dan beberapa alat lainnya untuk membangun model *machine-learning* untuk mengerjakan apa yang kami panggil tugas '*machine-learning* tradisional'. Kami sengaja menghindari *neural networks* dan *deep learning* sebab mereka akan dibahas dengan lebih lengkap dalam kurikulum 'AI untuk pemula' nanti.
+
+Scikit-learn memudahkan pembangunan dan evaluasi model-model. Dia terutama fokus pada menggunakan data numerik dan mempunyai beberapa *dataset* yang siap sedia untuk digunakan sebagai alat belajar. Dia juga mempunyai model yang sudah dibangun untuk murid-murid untuk langsung coba. Mari menjelajahi proses memuat data yang sudah disiapkan dan menggunakan model ML pengestimasian pertama menggunakan Scikit-learn dengan data sederhana.
+
+
+## Latihan - Scikit-learn *notebook* pertamamu
+
+> Tutorial ini terinspirasi [contoh regresi linear ini](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py) di situs Scikit-learn.
+
+Dalam file _notebook.ipynb_ dari pelajaran ini, kosongkan semua sel dengan mengklik tombol berlambang 'tempat sampah'.
+
+Dalam bagian ini, kamu akan bekerja dengan sebuah *dataset* kecil tentang diabetes yang datang bersama dengan Scikit-learn yang dimaksud sebagai bahan ajaran. Bayangkan bahwa kamu ingin mencoba sebuah cara pengobatan untuk pasien diabetes. Model *machine learning* mungkin dapat membantu kamu menentukan pasien mana merespon lebih baik pada pengobatan tersebut berdasarkan kombinasi-kombinasi variabel. Bahkan sebuah model regresi yang sederhana, saat divisualisasikan, dapat menunjukkanmu informasi tentang variabel-variabel yang dapat membantu kamu mengorganisasikan uji-uji klinis teoritismu.
+
+✅ Ada banyak macam metode regresi, dan yang mana yang kamu pilih tergantung pada jawaban yang kamu sedang cari. Kalau kamu ingin memprediksi tinggi badan seseorang dari usianya, kamu bisa menggunakan regresi linear, karena kamu mencari sebuah **nilai numerik**. Kalau kamu tertarik pada apa sebuah jenis kuliner sebaiknya dianggap sebagai vegan atau tidak, kamu sedang mencari sebuah **kategorisasi/klasifikasi**; kamu bisa menggunakan regresi logistik. Kamu akan belajar lebih banyak tentang regresi logistik nanti. Pikirkan dahulu beberapa pertanyaan yang kamu bisa tanyakan dari data yang ada dan metode yang mana akan paling cocok.
+
+Mari mulai mengerjakan tugas ini.
+
+### Impor *library*
+
+Untuk tugas ini, kita akan mengimpor beberapa *library*:
+
+- **matplotlib**. Sebuah [alat untuk membuat grafik](https://matplotlib.org/) yang kita akan gunakan untuk membuat sebuah grafik garis.
+- **numpy**. [numpy](https://numpy.org/doc/stable/user/whatisnumpy.html) adalah sebuah *library* berguna untuk menangani data numerik di Python.
+- **sklearn**. Ini adalah *library* Scikit-learn.
+
+Imporlah *library-library* yang akan membantu dengan tugasmu.
+
+1. Tambahlah impor dengan mengetik kode berikut:
+
+ ```python
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from sklearn import datasets, linear_model, model_selection
+ ```
+
+    Di atas, kamu sedang mengimpor `matplotlib`, `numpy`. Selain itu, kamu juga sedang mengimpor `datasets`, `linear_model` dan `model_selection` dari `sklearn`. `model_selection` digunakan untuk memisahkan data menjadi set latihan dan set ujian.
+
+### *Dataset* diabetes
+
+[*Dataset* diabetes](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) mencakupi 442 sampel data mengenai diabetes dengan 10 variabel utama, termasuk:
+
+- age: usia dalam tahun
+- bmi: *body mass index*
+- bp: tekanan darah rata-rata
+- s1 tc: Sel T (sejenis cel darah putih)
+
+✅ *Dataset* ini juga mempunyai konsep 'jenis kelamin' sebagai sebuah variabel utama yang penting dalam riset diabetes. Banyak *dataset* medis mempunyai klasifikasi binari ini. Pikirkan sejenak bagaimana kategorisasi seperti yang ini dapat mengecualikan bagian tertentu dari sebuah populasi dari pengobatan.
+
+Sekarang, muatkan data X dan y.
+
+> 🎓 Ingatlah, ini adalah *supervised learning*, jadi kita perlu sebuah target (dinamakan 'y').
+
+Dalam sebuah sel kode yang baru, muatkan *dataset* diabetes dengan memanggil `load_diabetes()`. Input `return_X_y=True` menunjukkan bahwa `X` adalah sebuah matriks data dan `y` adalah target regresi.
+
+1. Tambah beberapa instruksi *print* untuk menunjukkan ukuran data matriks dan elemen pertama matriks tersebut.
+ ```python
+ X, y = datasets.load_diabetes(return_X_y=True)
+ print(X.shape)
+ print(X[0])
+ ```
+
+    Respon yang didapati adalah sebuah *tuple*. Kamu sedang menetapkan kedua nilai pertama dalam *tuple* itu ke dalam `X` dan `y` secara berturut. Pelajari lebih banyak [tentang *tuples*](https://wikipedia.org/wiki/Tuple).
+
+ Kamu bisa melihat bahwa data ini berupa 422 nilai yang disusun menjadi beberapa `array` dengan 10 elemen:
+
+ ```text
+ (442, 10)
+ [ 0.03807591 0.05068012 0.06169621 0.02187235 -0.0442235 -0.03482076
+ -0.04340085 -0.00259226 0.01990842 -0.01764613]
+ ```
+
+ ✅ Pikirkan sejenak tentang hubungan antara data dan target regresi. Apa kamu bisa menemukan [targetnya](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) untuk *database* diabetes dalam dokumentasinya? Mengetahui targetnya, apa yang *dataset* ini sedang mendemonstrasikan?
+
+2. Lalu, pilih sebuah porsi dataset ini untuk digambarkan dengan menyusuninya menjadi sebuah `array` baru dengan fungsi `newaxis` dari `numpy`. Kita akan menggunakan regresi linear untuk menggambar sebuah garis antara nilai-nilai dalam data ini sesuai dengan pola yang ditentukannya.
+
+ ```python
+ X = X[:, np.newaxis, 2]
+ ```
+
+ ✅ Kapanpun, *print*-lah datanya untuk memeriksa bentuknya.
+
+3. Sekarang datanya sudah siap untuk digambar. Kamu bisa melihat jikalau sebuah mesin dapat menentukan sebuah perpisahan logika dari nomor-nomor *dataset* ini. Untuk melakukan itu, kamu harus memisahkan data (X) dan target (y) menjadi set latihan dan set ujian. Scikit-learn juga memberi cara untuk melakukan itu; kamu bisa memisahkan data ujianmu pada titik tertentu.
+
+ ```python
+ X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33)
+ ```
+
+4. Sekarang, kamu sudah siap untuk melatihkan modelmu! Muatkan dahulu data regresi linear dan latihkan modelmu dengan set X dan y-mu dengan `model.fit()`:
+
+ ```python
+ model = linear_model.LinearRegression()
+ model.fit(X_train, y_train)
+ ```
+
+ ✅ `model.fit()` adalah sebuah fungsi yang akan kamu temukan dalam banyak *library* ML seperti TensorFlow.
+
+5. Lalu, buatlah sebuah prediksi dengan data ujianmu dengan fungsi `predict()`. Ini akan digunakan untuk menggambar sebuah garis antara grup-grup data.
+
+ ```python
+ y_pred = model.predict(X_test)
+ ```
+
+6. Sekarang waktunya untuk menggambar data dalam sebuah grafik. Matplotlib adalah sebuah alat yang sangat berguna untuk melakukan itu. Buatlah sebuah petak sebar dari semua X dan y dari set ujian dan gunakan prediksi yang dihasilkan untuk menggambar sebuah garis di tempat yang paling cocok antara grup-grup data modelnya.
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, y_pred, color='blue', linewidth=3)
+ plt.show()
+ ```
+
+    ![sebuah petak sebar yang menunjukkan titik-titik data sekitar diabetes](../images/scatterplot.png)
+
+ ✅ Pikirkan sejenak tentang apa yang sedang terjadi di sini. Sebuah garis lurus membentang tengah-tengah titik-titik kecil data. Tetapi apa yang sedang garis itu lakukan? Apa kamu bisa melihat bagaimana kamu bisa menggunakan garis ini untuk memprediksi di mana sebuah titik data baru yang tidak pernah dilihat sebelumnya kemungkinan besar akan terletak berhubungan dengan sumbu y grafik ini? Coba jelaskan dalam kata-kata kegunaan praktis model ini.
+
+Selamat, kamu telah membangun model regresi linear pertamamu, membuat sebuah prediksi darinya, dan menunjukkannya dalam sebuah grafik!
+
+---
+## Tantangan
+
+Gambarkan sebuah variabel yang beda dari *dataset* ini. Petunjuk: edit baris ini: `X = X[:, np.newaxis, 2]`. Mengetahui target *dataset* ini, apa yang kamu bisa menemukan tentang kemajuan diabetes sebagai sebuah penyakit?
+## [Kuis pasca-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/10/)
+
+## Review & Pembelajaran Mandiri
+
+Dalam tutorial ini, kamu bekerja dengan sebuah model regresi linear yang sederhana daripada regresi linear univariat atau berganda. Bacalah sedikit tentang perbedaan antara metode-metode ini atau tontonlah [video ini](https://www.coursera.org/lecture/quantifying-relationships-regression-models/linear-vs-nonlinear-categorical-variables-ai2Ef).
+
+Bacalah lebih banyak tentang konsep regresi dan pikirkanlah tentang jenis pertanyaan apa saja yang bisa dijawab teknik ini. Cobalah [tutorial ini](https://docs.microsoft.com/learn/modules/train-evaluate-regression-models?WT.mc_id=academic-15963-cxa) untuk memperdalam pemahamanmu.
+
+## Tugas
+
+[*Dataset* yang beda](assignment.md)
diff --git a/2-Regression/1-Tools/translations/README.it.md b/2-Regression/1-Tools/translations/README.it.md
new file mode 100644
index 0000000000..a5e2e3842e
--- /dev/null
+++ b/2-Regression/1-Tools/translations/README.it.md
@@ -0,0 +1,211 @@
+# Iniziare con Python e Scikit-learn per i modelli di regressione
+
+![Sommario delle regressioni in uno sketchnote](../../../sketchnotes/ml-regression.png)
+
+> Sketchnote di [Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [Quiz Pre-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/9/)
+
+## Introduzione
+
+In queste quattro lezioni, si scoprirà come costruire modelli di regressione. Si discuterà di cosa siano fra breve.
+Prima di tutto, ci si deve assicurare di avere a disposizione gli strumenti adatti per far partire il processo!
+
+In questa lezione, si imparerà come:
+
+- Configurare il proprio computer per attività locali di machine learning.
+- Lavorare con i Jupyter notebook.
+- Usare Scikit-learn, compresa l'installazione.
+- Esplorare la regressione lineare con un esercizio pratico.
+
+## Installazioni e configurazioni
+
+[![Usare Python con Visual Studio Code](https://img.youtube.com/vi/7EXd4_ttIuw/0.jpg)](https://youtu.be/7EXd4_ttIuw "Using Python with Visual Studio Code")
+
+> 🎥 Fare click sull'immagine qui sopra per un video: usare Python all'interno di VS Code.
+
+1. **Installare Python**. Assicurarsi che [Python](https://www.python.org/downloads/) sia installato nel proprio computer. Si userà Python per molte attività di data science e machine learning. La maggior parte dei sistemi già include una installazione di Python. Ci sono anche utili [Pacchetti di Codice Python](https://code.visualstudio.com/learn/educators/installers?WT.mc_id=academic-15963-cxa) disponibili, per facilitare l'installazione per alcuni utenti.
+
+ Alcuni utilizzi di Python, tuttavia, richiedono una versione del software, laddove altri ne richiedono un'altra differente. Per questa ragione, è utile lavorare con un [ambiente virtuale](https://docs.python.org/3/library/venv.html).
+
+2. **Installare Visual Studio Code**. Assicurarsi di avere installato Visual Studio Code sul proprio computer. Si seguano queste istruzioni per [installare Visual Studio Code](https://code.visualstudio.com/) per l'installazione basica. Si userà Python in Visual Studio Code in questo corso, quindi meglio rinfrescarsi le idee su come [configurare Visual Studio Code](https://docs.microsoft.com/learn/modules/python-install-vscode?WT.mc_id=academic-15963-cxa) per lo sviluppo in Python.
+
+ > Si prenda confidenza con Python tramite questa collezione di [moduli di apprendimento](https://docs.microsoft.com/users/jenlooper-2911/collections/mp1pagggd5qrq7?WT.mc_id=academic-15963-cxa)
+
+3. **Installare Scikit-learn**, seguendo [queste istruzioni](https://scikit-learn.org/stable/install.html). Visto che ci si deve assicurare di usare Python 3, ci si raccomanda di usare un ambiente virtuale. Si noti che se si installa questa libreria in un M1 Mac, ci sono istruzioni speciali nella pagina di cui al riferimento qui sopra.
+
+1. **Installare Jupyter Notebook**. Servirà [installare il pacchetto Jupyter](https://pypi.org/project/jupyter/).
+
+## Ambiente di creazione ML
+
+Si useranno **notebook** per sviluppare il codice Python e creare modelli di machine learning. Questo tipo di file è uno strumento comune per i data scientist, e viene identificato dal suffisso o estensione `.ipynb`.
+
+I notebook sono un ambiente interattivo che consente allo sviluppatore di scrivere codice, aggiungere note e scrivere documentazione attorno al codice il che è particolarmente utile per progetti sperimentali o orientati alla ricerca.
+
+### Esercizio - lavorare con un notebook
+
+In questa cartella, si troverà il file _notebook.ipynb_.
+
+1. Aprire _notebook.ipynb_ in Visual Studio Code.
+
+ Un server Jupyter verrà lanciato con Python 3+. Si troveranno aree del notebook che possono essere `eseguite`, pezzi di codice. Si può eseguire un blocco di codice selezionando l'icona che assomiglia a un bottone di riproduzione.
+
+1. Selezionare l'icona `md` e aggiungere un poco di markdown, e il seguente testo **# Benvenuto nel tuo notebook**.
+
+ Poi, aggiungere un blocco di codice Python.
+
+1. Digitare **print('hello notebook')** nell'area riservata al codice.
+1. Selezionare la freccia per eseguire il codice.
+
+ Si dovrebbe vedere stampata la seguente frase:
+
+ ```output
+ hello notebook
+ ```
+
+![VS Code con un notebook aperto](../images/notebook.png)
+
+Si può inframezzare il codice con commenti per auto documentare il notebook.
+
+✅ Si pensi per un minuto all'ambiente di lavoro di uno sviluppatore web rispetto a quello di un data scientist.
+
+## Scikit-learn installato e funzionante
+
+Adesso che Python è impostato nel proprio ambiente locale, e si è familiari con i notebook Jupyter, si acquisterà ora confidenza con Scikit-learn (si pronuncia con la `si` della parola inglese `science`). Scikit-learn fornisce una [API estensiva](https://scikit-learn.org/stable/modules/classes.html#api-ref) che aiuta a eseguire attività ML.
+
+Stando al loro [sito web](https://scikit-learn.org/stable/getting_started.html), "Scikit-learn è una libreria di machine learning open source che supporta l'apprendimento assistito (supervised learning) e non assistito (unsupervised learning). Fornisce anche strumenti vari per l'adattamento del modello, la pre-elaborazione dei dati, la selezione e la valutazione dei modelli e molte altre utilità."
+
+In questo corso, si userà Scikit-learn e altri strumenti per costruire modelli di machine learning per eseguire quelle che vengono chiamate attività di 'machine learning tradizionale'. Si sono deliberatamente evitate le reti neurali e il deep learning visto che saranno meglio trattati nel prossimo programma di studi 'AI per Principianti'.
+
+Scikit-learn rende semplice costruire modelli e valutarli per l'uso. Si concentra principalmente sull'utilizzo di dati numerici e contiene diversi insiemi di dati già pronti per l'uso come strumenti di apprendimento. Include anche modelli pre-costruiti per gli studenti da provare. Si esplora ora il processo di caricamento dei dati preconfezionati, e, utilizzando un modello di stimatore incorporato, un primo modello ML con Scikit-Learn con alcuni dati di base.
+
+## Esercizio - Il Primo notebook Scikit-learn
+
+> Questo tutorial è stato ispirato dall'[esempio di regressione lineare](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py) nel sito web di Scikit-learn.
+
+Nel file _notebook.ipynb_ associato a questa lezione, svuotare tutte le celle usando l'icona cestino ('trash can').
+
+In questa sezione, di lavorerà con un piccolo insieme di dati sul diabete che è incorporato in Scikit-learn per scopi di apprendimento. Si immagini di voler testare un trattamento per i pazienti diabetici. I modelli di machine learning potrebbero essere di aiuto nel determinare quali pazienti risponderebbero meglio al trattamento, in base a combinazioni di variabili. Anche un modello di regressione molto semplice, quando visualizzato, potrebbe mostrare informazioni sulle variabili che aiuteranno a organizzare le sperimentazioni cliniche teoriche.
+
+✅ Esistono molti tipi di metodi di regressione e quale scegliere dipende dalla risposta che si sta cercando. Se si vuole prevedere l'altezza probabile per una persona di una data età, si dovrebbe usare la regressione lineare, visto che si sta cercando un **valore numerico**. Se si è interessati a scoprire se un tipo di cucina dovrebbe essere considerato vegano o no, si sta cercando un'**assegnazione di categoria** quindi si dovrebbe usare la regressione logistica. Si imparerà di più sulla regressione logistica in seguito. Si pensi ad alcune domande che si possono chiedere ai dati e quale di questi metodi sarebbe più appropriato.
+
+Si inizia con questa attività.
+
+### Importare le librerie
+
+Per questo compito verranno importate alcune librerie:
+
+- **matplotlib**. E' un utile [strumento grafico](https://matplotlib.org/) e verrà usato per creare una trama a linee.
+- **numpy**. [numpy](https://numpy.org/doc/stable/user/whatisnumpy.html) è una libreria utile per gestire i dati numerici in Python.
+- **sklearn**. Questa è la libreria Scikit-learn.
+
+Importare alcune librerie che saranno di aiuto per le proprie attività.
+
+1. Con il seguente codice si aggiungono le importazioni:
+
+ ```python
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from sklearn import datasets, linear_model, model_selection
+ ```
+
+   Qui sopra vengono importati `matplotlib` e `numpy`, da `sklearn` si importa `datasets`, `linear_model` e `model_selection`. `model_selection` viene usato per dividere i dati negli insiemi di addestramento e test.
+
+### L'insieme di dati riguardante il diabete
+
+L'[insieme dei dati sul diabete](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) include 442 campioni di dati sul diabete, con 10 variabili caratteristiche, alcune delle quali includono:
+
+- age (età): età in anni
+- bmi: indice di massa corporea (body mass index)
+- bp: media pressione sanguinea
+- s1 tc: Cellule T (un tipo di leucocito)
+
+✅ Questo insieme di dati include il concetto di "sesso" come caratteristica variabile importante per la ricerca sul diabete. Molti insiemi di dati medici includono questo tipo di classificazione binaria. Si rifletta su come categorizzazioni come questa potrebbe escludere alcune parti di una popolazione dai trattamenti.
+
+Ora si caricano i dati di X e y.
+
+> 🎓 Si ricordi, questo è apprendimento supervisionato (supervised learning), e serve dare un nome all'obiettivo 'y'.
+
+In una nuova cella di codice, caricare l'insieme di dati sul diabete chiamando `load_diabetes()`. Il parametro `return_X_y=True` segnala che `X` sarà una matrice di dati e `y` sarà l'obiettivo della regressione.
+
+1. Si aggiungono alcuni comandi di stampa per mostrare la forma della matrice di dati e i suoi primi elementi:
+
+ ```python
+ X, y = datasets.load_diabetes(return_X_y=True)
+ print(X.shape)
+ print(X[0])
+ ```
+
+   Quella che viene ritornata è una tupla. Quello che si sta facendo è assegnare i primi due valori della tupla a `X` e `y` rispettivamente. Per saperne di più sulle [tuple](https://wikipedia.org/wiki/Tuple).
+
+ Si può vedere che questi dati hanno 442 elementi divisi in array di 10 elementi:
+
+ ```text
+ (442, 10)
+ [ 0.03807591 0.05068012 0.06169621 0.02187235 -0.0442235 -0.03482076
+ -0.04340085 -0.00259226 0.01990842 -0.01764613]
+ ```
+
+ ✅ Si rifletta sulla relazione tra i dati e l'obiettivo di regressione. La regressione lineare prevede le relazioni tra la caratteristica X e la variabile di destinazione y. Si può trovare l'[obiettivo](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) per l'insieme di dati sul diabete nella documentazione? Cosa dimostra questo insieme di dati, dato quell'obiettivo?
+
+2. Successivamente, selezionare una porzione di questo insieme di dati da tracciare sistemandola in un nuovo array usando la funzione di numpy's `newaxis`. Verrà usata la regressione lineare per generare una linea tra i valori in questi dati secondo il modello che determina.
+
+ ```python
+ X = X[:, np.newaxis, 2]
+ ```
+
+ ✅ A piacere, stampare i dati per verificarne la forma.
+
+3. Ora che si hanno dei dati pronti per essere tracciati, è possibile vedere se una macchina può aiutare a determinare una divisione logica tra i numeri in questo insieme di dati. Per fare ciò, è necessario dividere sia i dati (X) che l'obiettivo (y) in insiemi di test e addestramento. Scikit-learn ha un modo semplice per farlo; si possono dividere i dati di prova in un determinato punto.
+
+ ```python
+ X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33)
+ ```
+
+4. Ora si è pronti ad addestrare il modello! Caricare il modello di regressione lineare e addestrarlo con i propri insiemi di addestramento X e y usando `model.fit()`:
+
+ ```python
+ model = linear_model.LinearRegression()
+ model.fit(X_train, y_train)
+ ```
+
+ ✅ `model.fit()` è una funzione che si vedrà in molte librerie ML tipo TensorFlow
+
+5. Successivamente creare una previsione usando i dati di test, con la funzione `predict()`. Questo servirà per tracciare la linea tra i gruppi di dati
+
+ ```python
+ y_pred = model.predict(X_test)
+ ```
+
+6. Ora è il momento di mostrare i dati in un tracciato. Matplotlib è uno strumento molto utile per questo compito. Si crei un grafico a dispersione (scatterplot) di tutti i dati del test X e y e si utilizzi la previsione per disegnare una linea nel luogo più appropriato, tra i raggruppamenti dei dati del modello.
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, y_pred, color='blue', linewidth=3)
+ plt.show()
+ ```
+
+ ![un grafico a dispersione che mostra i punti dati sul diabete](../images/scatterplot.png)
+
+ ✅ Si pensi a cosa sta succedendo qui. Una linea retta scorre attraverso molti piccoli punti dati, ma cosa sta facendo esattamente? Si può capire come si dovrebbe utilizzare questa linea per prevedere dove un nuovo punto di dati non noto dovrebbe adattarsi alla relazione con l'asse y del tracciato? Si cerchi di mettere in parole l'uso pratico di questo modello.
+
+Congratulazioni, si è costruito il primo modello di regressione lineare, creato una previsione con esso, e visualizzata in una tracciato!
+
+---
+
+## 🚀Sfida
+
+Tracciare una variabile diversa da questo insieme di dati. Suggerimento: modificare questa riga: `X = X[:, np.newaxis, 2]`. Dato l'obiettivo di questo insieme di dati, cosa si potrebbe riuscire a scoprire circa la progressione del diabete come malattia?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/10/)
+
+## Riepilogo e Auto Apprendimento
+
+In questo tutorial, si è lavorato con una semplice regressione lineare, piuttosto che una regressione univariata o multipla. Ci si informi circa le differenze tra questi metodi oppure si dia uno sguardo a [questo video](https://www.coursera.org/lecture/quantifying-relationships-regression-models/linear-vs-nonlinear-categorical-variables-ai2Ef)
+
+Si legga di più sul concetto di regressione e si pensi a quale tipo di domande potrebbero trovare risposta con questa tecnica. Seguire questo [tutorial](https://docs.microsoft.com/learn/modules/train-evaluate-regression-models?WT.mc_id=academic-15963-cxa) per approfondire la propria conoscenza.
+
+## Compito
+
+[Un insieme di dati diverso](assignment.it.md)
+
diff --git a/2-Regression/1-Tools/translations/README.ja.md b/2-Regression/1-Tools/translations/README.ja.md
new file mode 100644
index 0000000000..fcb5ce0f7a
--- /dev/null
+++ b/2-Regression/1-Tools/translations/README.ja.md
@@ -0,0 +1,218 @@
+# 回帰モデルについてPythonとScikit-learnから始めましょう
+
+![回帰の要約についてのスケッチノート](../../../sketchnotes/ml-regression.png)
+
+> [Tomomi Imura](https://www.twitter.com/girlie_mac) によって制作されたスケッチノート
+
+## [講義前クイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/9?loc=ja)
+
+## イントロダクション
+
+この章で用意されている4つのレッスンでは、回帰モデルを構築する方法について学びます。回帰モデルが何をするためのものなのかは、後ほど説明します。しかし、何かを始める前にプロセスを開始するための適切なツールが用意されていることを確認してください!
+
+このレッスンでは、以下のことを学びます。
+
+- ローカル環境で機械学習タスクを実行するための設定
+- Jupyter Notebookの使い方
+- Scikit-learnのインストールと使い方
+- 線形回帰に関するハンズオン
+
+## インストールと設定
+
+[![VisualStudioCodeでのPythonの使用方法](https://img.youtube.com/vi/7EXd4_ttIuw/0.jpg)](https://youtu.be/7EXd4_ttIuw "VisualStudioCodeでのPythonの使用方法")
+
+> 🎥 上の画像をクリックするとビデオが再生されます: VisualStudioCodeでのPythonの使用方法
+
+1. **Pythonのインストール**: [Python](https://www.python.org/downloads/) がコンピュータにインストールされていることを確認してください。Pythonは多くのデータサイエンス、機械学習のタスクで使用します。ほとんどのコンピュータシステムにはPythonがすでにインストールされています。一部のユーザのセットアップを簡単にするために [Python Coding Packs](https://code.visualstudio.com/learn/educators/installers?WT.mc_id=academic-15963-cxa) を利用することもできます。
+
+ しかし、Pythonを使っていると時に異なるバージョンを必要とする場合があります。そのため、[仮想環境](https://docs.python.org/3/library/venv.html) を利用すると便利です。
+
+
+2. **Visual Studio Codeのインストール**: Visual Studio Codeがコンピュータにインストールされていることを確認してください。[こちらの手順](https://code.visualstudio.com/) でVisual Studio Codeをインストールしてください。このコースでは、Visual Studio CodeでPythonを使用しますので [Visual Studio Codeの設定](https://docs.microsoft.com/learn/modules/python-install-vscode?WT.mc_id=academic-15963-cxa) をブラッシュアップしておくといいです。
+
+ > この [学習モジュール](https://docs.microsoft.com/users/jenlooper-2911/collections/mp1pagggd5qrq7?WT.mc_id=academic-15963-cxa) を利用して、Pythonの使い方に慣れてください。
+
+
+3. **Scikit-learnのインストール**: [こちらの手順](https://scikit-learn.org/stable/install.html) に従ってインストールしてください。Python3の環境で実行する必要があるので、仮想環境を使用することをおすすめします。なお、このライブラリをM1のMacにインストールする場合は、上記リンク先のページに特別な説明があります。
+
+
+4. **Jupyter Notebookのインストール**: [Jupyter package](https://pypi.org/project/jupyter/) をインストールする必要があります。
+
+## MLのオーサリング環境
+
+**ノートブック**を利用して、Pythonコードでの開発や機械学習モデルの作成を行います。このような種類のファイルは、データサイエンティストにとって一般的なツールであり、接尾辞または拡張子が `.ipynb` であることで識別できます。
+
+ノートブックは、開発者がコードを書くと同時に、そのコードにメモを加えたり、文書を書いたりすることができるインタラクティブな環境です。そのため、実験や研究を目的としたプロジェクトに非常に役立ちます。
+
+### エクササイズ - ノートブックでの作業
+
+フォルダの中に _notebook.ipynb_ というファイルが入っています。
+
+1. Visual Studio Codeで _notebook.ipynb_ を開いてください。
+
+ JupyterサーバーはPython3+が起動した状態でスタートします。ノートブックの中には、コードを「実行」できる部分があります。再生ボタンのようなアイコンを選択すると、コードブロックを実行することができます。
+
+
+2. `md`アイコンを選択して、**# Welcome to your notebook** というテキストの簡単なマークダウンを追加してみましょう.
+
+ 次に、Pythonのコードを追加します。
+
+
+3. コードブロックで **print('hello notebook')** と入力してください。
+
+
+4. 矢印を選択するとコードが実行されます。
+
+ 以下のような結果が出力されます:
+
+ ```output
+ hello notebook
+ ```
+
+![VS Codeで開いたノートブック](../images/notebook.png)
+
+コードにコメントを追加することで、ノートブックをセルフドキュメント化することができます。
+
+✅ ウェブ開発者とデータサイエンティストの開発環境がどれほど違うか、ちょっと考えてみてください。
+
+## Scikit-learnを使ってみましょう
+
+さて、ローカル環境にPythonがセットアップされ、Jupyter notebookに慣れてきたところで、Scikit-learn(「science」のように「sci」と発音してください)にも同様に慣れていきましょう。Scikit-learnは、MLタスクを実行するための [広範なAPI](https://scikit-learn.org/stable/modules/classes.html#api-ref) を提供しています。
+
+同社の [Webサイト](https://scikit-learn.org/stable/getting_started.html) には次のような説明があります。"Scikit-learnは、教師あり、教師なしの学習をサポートするオープンソースの機械学習ライブラリです。また、モデルのフィッティング、データの前処理、モデルの選択と評価、その他多くのユーティリティーのための様々なツールを提供しています。"
+
+このコースでは、Scikit-learnやその他のツールを使って機械学習モデルを構築し、私たちが「伝統的な機械学習」と呼ぶタスクを実行します。ニューラルネットワークやディープラーニングについては、近日公開予定の「AI for Beginners」カリキュラムで詳しく解説しますので、意図的に避けています。
+
+Scikit-learnは、モデルを構築し、評価を行って実際に利用するということが簡単にできます。主に数値データの使用に焦点を当てており、学習ツールとして使用するための既製のデータセットがいくつか含まれています。また、事前に構築済みのモデルもいくつか含まれています。では、Scikit-learnであらかじめ用意されているデータを使って、最初のMLモデルを構築するプロセスを体験しましょう。
+
+## エクササイズ - 初めてのScikit-learnノートブック
+
+> このチュートリアルはScikit-learnのWebサイトに掲載されている [linear regression example](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py) を参考にしています。
+
+このレッスンに関連する _notebook.ipynb_ ファイルで、「ゴミ箱」アイコンを押して全てのセルを消去します。
+
+このセクションでは、学習用にScikit-learnに組み込まれた、糖尿病に関する小さなデータセットを扱います。糖尿病患者の治療法をテストしたい場合を想像してみてください。機械学習モデルは、変数の組み合わせに基づいて、どの患者がその治療法によく反応するかを判断するのに役立つかもしれません。非常に基本的な回帰モデルであっても、視覚化することで、理論的な臨床試験を整理するのに役立つ変数に関する情報が得られるかもしれません。
+
+✅ 回帰には様々な種類があり、求める答えによってどの手法を選ぶかが異なります。例えば、ある人の年齢が与えられて身長を予測したい場合には、**数値**を求めたいので線形回帰を利用します。ある種類の料理がビーガンとみなされるべきかどうかを発見することに興味がある場合は、**カテゴリーの割り当て**を求めているので、ロジスティック回帰を使用するでしょう。ロジスティック回帰については後ほど詳しく説明します。いくつかのデータについて、どのような手法がより適切であるかを少し考えてみてください。
+
+では、今回のタスクに取り掛かりましょう。
+
+### ライブラリのインポート
+
+このタスクでは、いくつかのライブラリをインポートします。
+
+- **matplotlib**: 便利な [グラフ作成ツール](https://matplotlib.org/) です。今回はこれを使って折れ線グラフを作成します。
+- **numpy**: [numpy](https://numpy.org/doc/stable/user/whatisnumpy.html) はPythonで数値データを扱うための便利なライブラリです。
+- **sklearn**: Scikit-learnのライブラリです。
+
+作業に役立つライブラリをいくつか紹介します。
+
+1. 以下のコードを入力してインポートを追加します。
+
+ ```python
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from sklearn import datasets, linear_model, model_selection
+ ```
+
+    上記では、`matplotlib`と`numpy`をインポートし、`sklearn`から`datasets`、`linear_model`、`model_selection`をインポートしています。`model_selection`はデータをトレーニングセットとテストセットに分割する際に使用します。
+
+### 糖尿病のデータセット
+
+組み込みの [diabetes dataset](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) には、糖尿病に関する442サンプルのデータが含まれており、10個の変数が含まれています。
+
+- age: 年齢
+- bmi: ボディマス指数
+- bp: 平均血圧
+- s1 tc: T細胞(白血球の一種)
+
+✅ このデータセットには、糖尿病に関する研究に重要な変数として「性別」の概念が含まれています。多くの医療データセットには、このようなバイナリ分類が含まれています。このような分類が、人口のある部分を治療から排除する可能性があることについて、少し考えてみましょう。
+
+では、Xとyのデータを読み込みます。
+
+> 🎓 今回は教師あり学習なので、「y」で表されるターゲットが必要なことを覚えておいてください。
+
+新しいコードセルで、`load_diabetes()` を呼び出して糖尿病データセットを読み込みます。入力の `return_X_y=True` は、`X` がデータ行列であり、`y` が回帰の対象であることを示しています。
+
+1. データ行列の形とその最初の要素を表示するために、いくつかのprintコマンドを追加します。
+
+ ```python
+ X, y = datasets.load_diabetes(return_X_y=True)
+ print(X.shape)
+ print(X[0])
+ ```
+
+ `load_diabetes()` のレスポンスとしてタプルが返ってきます。タプルの2つの値をそれぞれ `X`と` y`に割り当てます。詳しくは、 [タプルについて](https://wikipedia.org/wiki/Tuple) を確認してください。
+
+ このデータは、442個のアイテムで構成されており、一つのアイテムは10個の要素を持つ配列であることがわかります。
+
+ ```text
+ (442, 10)
+ [ 0.03807591 0.05068012 0.06169621 0.02187235 -0.0442235 -0.03482076
+ -0.04340085 -0.00259226 0.01990842 -0.01764613]
+ ```
+
+ ✅ データと回帰対象の関係について少し考えてみましょう。線形回帰では、特徴量Xとターゲット変数yの関係を予測します。ドキュメントで、糖尿病データセットの [ターゲット](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset) を見つけることができますか?そのターゲットを踏まえて、このデータセットは何を示していますか?
+
+
+2. 次に、numpyの`newaxis`関数を使って新しい配列を作成することで、プロットするためのデータセットの一部を選択します。最終的には線形回帰を使用することで、決まったパターンに従って、このデータの値の間に線を生成します。
+
+ ```python
+ X = X[:, np.newaxis, 2]
+ ```
+
+ ✅ いつでも、データをprintして、その形を確認することができます。
+
+
+3. データをプロットする準備ができたので、このデータセットの数字の論理的な分割を機械が判断できるかどうかを確認してみましょう。そのためには、データ(X)とターゲット(y)の両方をトレーニングセットとテストセットに分ける必要があります。Scikit-learnには、これを行うための簡単な方法が用意されています。
+
+ ```python
+ X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33)
+ ```
+
+
+4. これで、モデルをトレーニングする準備が整いました!線形回帰モデルを読み込み、Xとyのトレーニングセットに `model.fit()` を適用して、モデルの学習を行います。
+
+ ```python
+ model = linear_model.LinearRegression()
+ model.fit(X_train, y_train)
+ ```
+
+ ✅ `model.fit()` は、TensorFlowなどのMLライブラリでよく見かける関数です。
+
+
+5. 次に、`predict()` 関数を使って、テストデータに対する予測を行います。これは、データグループ間の線を引くために使用されます。
+
+ ```python
+ y_pred = model.predict(X_test)
+ ```
+
+6. さて、いよいよデータをプロットして表示してみましょう。Matplotlibはこの作業にとても便利なツールです。テストデータの全てのXとyに関する散布図を作成し、モデルの予測を使用することでデータグループ間の最も適切な場所に線を引きます。
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, y_pred, color='blue', linewidth=3)
+ plt.show()
+ ```
+
+ ![糖尿病周辺のデータポイントを示す散布図](../images/scatterplot.png)
+
+ ✅ ここで何が起こっているのか、少し考えてみましょう。直線がたくさんの小さなデータの点を通っていますが、正確には何をしているのでしょうか?この直線を使って、見たことのない新しいデータポイントがプロットのy軸との関係でどこに当てはまるかを予測することができるはずだということがわかりますか?このモデルの実用的な使い方を言葉にしてみてください。
+
+おめでとうございます!初めて線形回帰モデルを構築し、それを使って予測を行い、結果をプロットで表示しました!
+
+---
+## 🚀チャレンジ
+
+このデータセットから別の変数を選択してプロットしてください。ヒント: `X = X[:, np.newaxis, 2]` の行を編集する。今回のデータセットのターゲットである、糖尿病という病気の進行について、どのような発見があるのでしょうか?
+## [講義後クイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/10?loc=ja)
+
+## レビュー & 自主学習
+
+このチュートリアルでは、単変量線形回帰や多変量線形回帰ではなく、単純線形回帰を扱いました。これらの手法の違いについて少し調べてみるか、この [ビデオ](https://www.coursera.org/lecture/quantifying-relationships-regression-models/linear-vs-nonlinear-categorical-variables-ai2Ef) を見てみましょう。
+
+回帰の概念について詳しく調べ、この手法でどのような質問に答えられるかを考えてみましょう。この [チュートリアル](https://docs.microsoft.com/learn/modules/train-evaluate-regression-models?WT.mc_id=academic-15963-cxa) で理解を深めることもできます。
+
+## 課題
+
+[異なるデータセット](./assignment.ja.md)
diff --git a/2-Regression/1-Tools/translations/README.zh-cn.md b/2-Regression/1-Tools/translations/README.zh-cn.md
new file mode 100644
index 0000000000..019eb06f60
--- /dev/null
+++ b/2-Regression/1-Tools/translations/README.zh-cn.md
@@ -0,0 +1,205 @@
+# 开始使用Python和Scikit学习回归模型
+
+![回归](../../../sketchnotes/ml-regression.png)
+
+> 作者[Tomomi Imura](https://www.twitter.com/girlie_mac)
+
+## [课前测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/9/)
+## 介绍
+
+在这四节课中,你将了解如何构建回归模型。我们将很快讨论这些是什么。但在你做任何事情之前,请确保你有合适的工具来开始这个过程!
+
+在本课中,你将学习如何:
+
+- 为本地机器学习任务配置你的计算机。
+- 使用Jupyter notebooks。
+- 使用Scikit-learn,包括安装。
+- 通过动手练习探索线性回归。
+
+## 安装和配置
+
+[![在 Visual Studio Code中使用 Python](https://img.youtube.com/vi/7EXd4_ttIuw/0.jpg)](https://youtu.be/7EXd4_ttIuw "在 Visual Studio Code中使用 Python")
+
+> 🎥 单击上图观看视频:在VS Code中使用Python。
+
+1. **安装 Python**。确保你的计算机上安装了[Python](https://www.python.org/downloads/)。你将在许多数据科学和机器学习任务中使用 Python。大多数计算机系统已经安装了Python。也有一些有用的[Python编码包](https://code.visualstudio.com/learn/educators/installers?WT.mc_id=academic-15963-cxa)可用于简化某些用户的设置。
+
+ 然而,Python的某些用法需要一个版本的软件,而其他用法则需要另一个不同的版本。 因此,在[虚拟环境](https://docs.python.org/3/library/venv.html)中工作很有用。
+
+2. **安装 Visual Studio Code**。确保你的计算机上安装了Visual Studio Code。按照这些说明[安装 Visual Studio Code](https://code.visualstudio.com/)进行基本安装。在本课程中,你将在Visual Studio Code中使用Python,因此你可能想复习如何[配置 Visual Studio Code](https://docs.microsoft.com/learn/modules/python-install-vscode?WT.mc_id=academic-15963-cxa)用于Python开发。
+
+ > 通过学习这一系列的 [学习模块](https://docs.microsoft.com/users/jenlooper-2911/collections/mp1pagggd5qrq7?WT.mc_id=academic-15963-cxa)熟悉Python
+
+3. **按照[这些说明](https://scikit-learn.org/stable/install.html)安装Scikit-learn**。由于你需要确保使用Python 3,因此建议你使用虚拟环境。注意,如果你是在M1 Mac上安装这个库,在上面链接的页面上有特别的说明。
+
+4. **安装Jupyter Notebook**。你需要[安装Jupyter包](https://pypi.org/project/jupyter/)。
+
+## 你的ML工作环境
+
+你将使用**notebooks**开发Python代码并创建机器学习模型。这种类型的文件是数据科学家的常用工具,可以通过后缀或扩展名`.ipynb`来识别它们。
+
+Notebooks是一个交互式环境,允许开发人员编写代码并添加注释并围绕代码编写文档,这对于实验或面向研究的项目非常有帮助。
+
+### 练习 - 使用notebook
+
+1. 在Visual Studio Code中打开_notebook.ipynb_。
+
+   Jupyter服务器将以Python 3+启动。你会在notebook中看到一些可以“运行”的区域,也就是代码块。你可以通过选择看起来像播放按钮的图标来运行代码块。
+
+2. 选择`md`图标并添加一点markdown,输入文字 **# Welcome to your notebook**。
+
+ 接下来,添加一些Python代码。
+
+1. 在代码块中输入**print("hello notebook")**。
+
+2. 选择箭头运行代码。
+
+ 你应该看到打印的语句:
+
+ ```output
+ hello notebook
+ ```
+
+![打开notebook的VS Code](../images/notebook.png)
+
+你可以为你的代码添加注释,以便notebook可以自描述。
+
+✅ 想一想web开发人员的工作环境与数据科学家的工作环境有多大的不同。
+
+## 启动并运行Scikit-learn
+
+现在Python已在你的本地环境中设置好,并且你对Jupyter notebook感到满意,让我们同样熟悉Scikit-learn(在“science”中发音为“sci”)。 Scikit-learn提供了[大量的API](https://scikit-learn.org/stable/modules/classes.html#api-ref)来帮助你执行ML任务。
+
+根据他们的[网站](https://scikit-learn.org/stable/getting_started.html),“Scikit-learn是一个开源机器学习库,支持有监督和无监督学习。它还提供了各种模型拟合工具、数据预处理、模型选择和评估以及许多其他实用程序。”
+
+在本课程中,你将使用Scikit-learn和其他工具来构建机器学习模型,以执行我们所谓的“传统机器学习”任务。我们特意避免了神经网络和深度学习,因为它们在我们即将推出的“面向初学者的人工智能”课程中得到了更好的介绍。
+
+Scikit-learn让构建模型和评估模型变得简单。它主要侧重于处理数字数据,并包含几个现成的数据集供学习使用,还包括供学生尝试的预建模型。让我们来体验一下这个流程:加载预先打包好的数据,并使用Scikit-learn的内置估计器(estimator)和一些基本数据来构建第一个机器学习模型。
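+
+下面是一个可选的小示例(仅作演示,不是本课的必需步骤),用来列出 Scikit-learn 内置的玩具数据集加载函数:
+
+```python
+from sklearn import datasets
+
+# 仅作演示:列出以 load_ 开头的内置数据集加载函数
+print([name for name in dir(datasets) if name.startswith("load_")])
+```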
+
+## 练习 - 你的第一个Scikit-learn notebook
+
+> 本教程的灵感来自Scikit-learn网站上的[线性回归示例](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py)。
+
+在与本课程相关的 _notebook.ipynb_ 文件中,通过点击“垃圾桶”图标清除所有单元格。
+
+在本节中,你将使用一个关于糖尿病的小数据集,该数据集内置于Scikit-learn中以用于学习目的。想象一下,你想为糖尿病患者测试一种治疗方法。机器学习模型可能会帮助你根据变量组合确定哪些患者对治疗反应更好。即使是非常基本的回归模型,在可视化时,也可能会显示有助于组织理论临床试验的变量信息。
+
+✅ 回归方法有很多种,你选择哪一种取决于你正在寻找的答案。如果你想预测给定年龄的人的可能身高,你可以使用线性回归,因为你正在寻找**数值**。如果你有兴趣了解某种菜肴是否应被视为素食主义者,那么你正在寻找**类别分配**,以便使用逻辑回归。稍后你将了解有关逻辑回归的更多信息。想一想你可以对数据提出的一些问题,以及这些方法中的哪一个更合适。
+
+让我们开始这项任务。
+
+### 导入库
+
+对于此任务,我们将导入一些库:
+
+- **matplotlib**。这是一个有用的[绘图工具](https://matplotlib.org/),我们将使用它来创建线图。
+- **numpy**。 [numpy](https://numpy.org/doc/stable/user/whatisnumpy.html)是一个有用的库,用于在Python中处理数字数据。
+- **sklearn**。这是Scikit-learn库。
+
+导入一些库来帮助你完成任务。
+
+1. 通过输入以下代码添加导入:
+
+ ```python
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from sklearn import datasets, linear_model, model_selection
+ ```
+
+   在上面的代码中,你正在导入`matplotlib`、`numpy`,并从`sklearn`导入`datasets`、`linear_model`和`model_selection`。`model_selection`用于将数据拆分为训练集和测试集。
+
+### 糖尿病数据集
+
+内置的[糖尿病数据集](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset)包含442个围绕糖尿病的数据样本,具有10个特征变量,其中包括:
+
+- age:岁数
+- bmi:体重指数
+- bp:平均血压
+- s1 tc:T细胞(一种白细胞)
+
+✅ 该数据集包括“性别”的概念,作为对糖尿病研究很重要的特征变量。许多医学数据集包括这种类型的二元分类。想一想诸如此类的分类如何将人群的某些部分排除在治疗之外。
+
+现在,加载X和y数据。
+
+> 🎓 请记住,这是监督学习,我们需要一个命名为“y”的目标。
+
+在新的代码单元中,通过调用`load_diabetes()`加载糖尿病数据集。输入`return_X_y=True`表示`X`将是一个数据矩阵,而`y`将是回归目标。
+
+1. 添加一些打印命令来显示数据矩阵的形状及其第一个元素:
+
+ ```python
+ X, y = datasets.load_diabetes(return_X_y=True)
+ print(X.shape)
+ print(X[0])
+ ```
+
+ 作为响应返回的是一个元组。你正在做的是将元组的前两个值分别分配给`X`和`y`。了解更多 [关于元组](https://wikipedia.org/wiki/Tuple)。
+
+ 你可以看到这个数据有442个项目,组成了10个元素的数组:
+
+ ```text
+ (442, 10)
+ [ 0.03807591 0.05068012 0.06169621 0.02187235 -0.0442235 -0.03482076
+ -0.04340085 -0.00259226 0.01990842 -0.01764613]
+ ```
+
+ ✅ 稍微思考一下数据和回归目标之间的关系。线性回归预测特征X和目标变量y之间的关系。你能在文档中找到糖尿病数据集的[目标](https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset)吗?鉴于该目标,该数据集展示了什么?
+
+2. 接下来,通过使用numpy的`newaxis`函数将其排列到一个新数组中来选择要绘制的该数据集的一部分。我们将使用线性回归根据它确定的模式在此数据中的值之间生成一条线。
+
+ ```python
+ X = X[:, np.newaxis, 2]
+ ```
+
+ ✅ 随时打印数据以检查其形状。
+
+3. 现在你已准备好绘制数据,你可以查看机器是否可以帮助确定此数据集中数字之间的逻辑分割。为此你需要将数据(X)和目标(y)拆分为测试集和训练集。Scikit-learn有一个简单的方法来做到这一点;你可以在给定点拆分测试数据。
+
+ ```python
+ X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33)
+ ```
+
+4. 现在你已准备好训练你的模型!加载线性回归模型并使用`model.fit()`使用X和y训练集对其进行训练:
+
+ ```python
+ model = linear_model.LinearRegression()
+ model.fit(X_train, y_train)
+ ```
+
+ ✅ `model.fit()`是一个你会在许多机器学习库(例如 TensorFlow)中看到的函数
+
+5. 然后,使用函数`predict()`,使用测试数据创建预测。这将用于绘制数据组之间的线
+
+ ```python
+ y_pred = model.predict(X_test)
+ ```
+
+6. 现在是时候在图中显示数据了。Matplotlib是完成此任务的非常有用的工具。创建所有X和y测试数据的散点图,并使用预测在模型的数据分组之间最合适的位置画一条线。
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, y_pred, color='blue', linewidth=3)
+ plt.show()
+ ```
+
+ ![显示糖尿病周围数据点的散点图](../images/scatterplot.png)
+
+ ✅ 想一想这里发生了什么。一条直线穿过许多小数据点,但它到底在做什么?你能看到你应该如何使用这条线来预测一个新的、未见过的数据点对应的y轴值吗?尝试用语言描述该模型的实际用途。
+
+恭喜,你构建了第一个线性回归模型,使用它创建了预测,并将其显示在绘图中!
+
+---
+## 🚀挑战
+
+从这个数据集中绘制一个不同的变量。提示:编辑这一行:`X = X[:, np.newaxis, 2]`。鉴于此数据集的目标,你能够发现糖尿病作为一种疾病的进展情况吗?
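+
+下面是一个仅供参考的示意(这里假设按照 scikit-learn 文档的特征顺序,索引 3 对应平均血压 `bp`):
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn import datasets, linear_model, model_selection
+
+# 示意:换成索引为 3 的特征(假设为平均血压 bp),其余步骤不变
+X, y = datasets.load_diabetes(return_X_y=True)
+X = X[:, np.newaxis, 3]
+
+X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33)
+model = linear_model.LinearRegression()
+model.fit(X_train, y_train)
+
+plt.scatter(X_test, y_test, color='black')
+plt.plot(X_test, model.predict(X_test), color='blue', linewidth=3)
+plt.show()
+```
+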
+## [课后测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/10/)
+
+## 复习与自学
+
+在本教程中,你使用了简单线性回归,而不是单变量或多元线性回归。阅读一些关于这些方法之间差异的信息,或查看[此视频](https://www.coursera.org/lecture/quantifying-relationships-regression-models/linear-vs-nonlinear-categorical-variables-ai2Ef)
+
+阅读有关回归概念的更多信息,并思考这种技术可以回答哪些类型的问题。用这个[教程](https://docs.microsoft.com/learn/modules/train-evaluate-regression-models?WT.mc_id=academic-15963-cxa)加深你的理解。
+
+## 任务
+
+[不同的数据集](../assignment.md)
diff --git a/2-Regression/1-Tools/translations/assignment.it.md b/2-Regression/1-Tools/translations/assignment.it.md
new file mode 100644
index 0000000000..51fa1663cb
--- /dev/null
+++ b/2-Regression/1-Tools/translations/assignment.it.md
@@ -0,0 +1,13 @@
+# Regressione con Scikit-learn
+
+## Istruzioni
+
+Dare un'occhiata all'[insieme di dati Linnerud](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_linnerud.html#sklearn.datasets.load_linnerud) in Scikit-learn. Questo insieme di dati ha [obiettivi](https://scikit-learn.org/stable/datasets/toy_dataset.html#linnerrud-dataset) multipli: "Consiste di tre variabili di esercizio (dati) e tre variabili fisiologiche (obiettivo) raccolte da venti uomini di mezza età in un fitness club".
+
+Con parole proprie, descrivere come creare un modello di Regressione che tracci la relazione tra il punto vita e il numero di addominali realizzati. Fare lo stesso per gli altri punti dati in questo insieme di dati.
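+
+Se si vuole prima dare un'occhiata all'insieme di dati nel codice (facoltativo, non fa parte della consegna), ecco uno schizzo indicativo; i nomi degli attributi sono quelli indicati dalla documentazione di scikit-learn:
+
+```python
+from sklearn import datasets
+
+# Solo uno schizzo facoltativo: caricare Linnerud e ispezionare variabili e obiettivi
+linnerud = datasets.load_linnerud()
+print(linnerud.feature_names)  # dovrebbe contenere 'Chins', 'Situps', 'Jumps'
+print(linnerud.target_names)   # dovrebbe contenere 'Weight', 'Waist', 'Pulse'
+```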
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| ------------------------------ | ----------------------------------- | ----------------------------- | -------------------------- |
+| Inviare un paragrafo descrittivo | Viene presentato un paragrafo ben scritto | Vengono inviate alcune frasi | Non viene fornita alcuna descrizione |
diff --git a/2-Regression/1-Tools/translations/assignment.ja.md b/2-Regression/1-Tools/translations/assignment.ja.md
new file mode 100644
index 0000000000..6f7d9ef079
--- /dev/null
+++ b/2-Regression/1-Tools/translations/assignment.ja.md
@@ -0,0 +1,13 @@
+# Scikit-learnを用いた回帰
+
+## 課題の指示
+
+Scikit-learnで[Linnerud dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_linnerud.html#sklearn.datasets.load_linnerud) を見てみましょう。このデータセットは複数の[ターゲット](https://scikit-learn.org/stable/datasets/toy_dataset.html#linnerrud-dataset) を持っています。フィットネスクラブで20人の中年男性から収集した3つの運動変数(data)と3つの生理変数(target)で構成されています。
+
+あなた自身の言葉で、ウエストラインと腹筋の回数との関係をプロットする回帰モデルの作成方法を説明してください。このデータセットの他のデータポイントについても同様に説明してみてください。
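+
+(任意)先にコードでデータセットの中身を確認したい場合は、次のようなスケッチが参考になるかもしれません(属性名は scikit-learn のドキュメントに基づく想定です):
+
+```python
+from sklearn import datasets
+
+# 参考用のスケッチ: Linnerud データセットを読み込み、特徴量とターゲットを確認する
+linnerud = datasets.load_linnerud()
+print(linnerud.feature_names)  # 'Chins', 'Situps', 'Jumps' が含まれるはず
+print(linnerud.target_names)   # 'Weight', 'Waist', 'Pulse' が含まれるはず
+```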
+
+## ルーブリック
+
+| 指標 | 模範的 | 適切 | 要改善 |
+| ------------------------------ | ----------------------------------- | ----------------------------- | -------------------------- |
+| 説明文を提出してください。 | よく書けた文章が提出されている。 | いくつかの文章が提出されている。 | 文章が提出されていません。 |
diff --git a/2-Regression/1-Tools/translations/assignment.zh-cn.md b/2-Regression/1-Tools/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..c296c8cabf
--- /dev/null
+++ b/2-Regression/1-Tools/translations/assignment.zh-cn.md
@@ -0,0 +1,14 @@
+# 用 Scikit-learn 实现一次回归算法
+
+## 说明
+
+先看看 Scikit-learn 中的 [Linnerud 数据集](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_linnerud.html#sklearn.datasets.load_linnerud)。
+这个数据集中有多个[目标变量(target)](https://scikit-learn.org/stable/datasets/toy_dataset.html#linnerrud-dataset):它由三种运动变量(训练数据)和三个生理指标(目标变量)组成,这些数据都是从一个健身俱乐部中的20名中年男子身上收集到的。
+
+然后用你自己的话,描述如何创建一个能够刻画腰围与仰卧起坐次数之间关系的回归模型。再用同样的方式描述这个数据集中其它数据点之间的关系。
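+
+如果你想先在代码里大致看看这个数据集(可选,不是作业要求的一部分),可以参考下面的示意;其中的属性名以 scikit-learn 文档为准:
+
+```python
+from sklearn import datasets
+
+# 可选的小示例:加载 Linnerud 数据集,查看特征与目标
+linnerud = datasets.load_linnerud()
+print(linnerud.feature_names)  # 预期包含 'Chins'、'Situps'、'Jumps'
+print(linnerud.target_names)   # 预期包含 'Weight'、'Waist'、'Pulse'
+```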
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| ------------------------------ | ----------------------------------- | ----------------------------- | -------------------------- |
+| 需要提交一段能描述数据集中关系的文字 | 很好的描述了数据集中的关系 | 只能描述少部分的关系 | 啥都没有提交 |
diff --git a/2-Regression/2-Data/README.md b/2-Regression/2-Data/README.md
index 2f36b15a89..a647eb6b42 100644
--- a/2-Regression/2-Data/README.md
+++ b/2-Regression/2-Data/README.md
@@ -1,9 +1,12 @@
# Build a regression model using Scikit-learn: prepare and visualize data
-> ![Data visualization infographic](./images/data-visualization.png)
-> Infographic by [Dasani Madipalli](https://twitter.com/dasani_decoded)
+![Data visualization infographic](./images/data-visualization.png)
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/11/)
+Infographic by [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/11/)
+
+> ### [This lesson is available in R!](./solution/lesson_2-R.ipynb)
## Introduction
@@ -52,7 +55,7 @@ Open the _notebook.ipynb_ file in Visual Studio Code and import the spreadsheet
```python
import pandas as pd
- pumpkins = pd.read_csv('../../data/US-pumpkins.csv')
+ pumpkins = pd.read_csv('../data/US-pumpkins.csv')
pumpkins.head()
```
@@ -187,9 +190,9 @@ To get charts to display useful data, you usually need to group the data somehow
## 🚀Challenge
-Explore the different types of visualization that M Matplotlib offers. Which types are most appropriate for regression problems?
+Explore the different types of visualization that Matplotlib offers. Which types are most appropriate for regression problems?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/12/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/12/)
## Review & Self Study
diff --git a/2-Regression/2-Data/images/dplyr_wrangling.png b/2-Regression/2-Data/images/dplyr_wrangling.png
new file mode 100644
index 0000000000..06c50bb337
Binary files /dev/null and b/2-Regression/2-Data/images/dplyr_wrangling.png differ
diff --git a/2-Regression/2-Data/images/unruly_data.jpg b/2-Regression/2-Data/images/unruly_data.jpg
new file mode 100644
index 0000000000..54943ca9f0
Binary files /dev/null and b/2-Regression/2-Data/images/unruly_data.jpg differ
diff --git a/2-Regression/2-Data/solution/lesson_2-R.ipynb b/2-Regression/2-Data/solution/lesson_2-R.ipynb
new file mode 100644
index 0000000000..2409c37d82
--- /dev/null
+++ b/2-Regression/2-Data/solution/lesson_2-R.ipynb
@@ -0,0 +1,664 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+ "colab": {
+ "name": "lesson_2-R.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "name": "ir",
+ "display_name": "R"
+ },
+ "language_info": {
+ "name": "R"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Build a regression model: prepare and visualize data\r\n",
+ "\r\n",
+ "## **Linear Regression for Pumpkins - Lesson 2**\r\n",
+ "#### Introduction\r\n",
+ "\r\n",
+ "Now that you are set up with the tools you need to start tackling machine learning model building with Tidymodels and the Tidyverse, you are ready to start asking questions of your data. As you work with data and apply ML solutions, it's very important to understand how to ask the right question to properly unlock the potentials of your dataset.\r\n",
+ "\r\n",
+ "In this lesson, you will learn:\r\n",
+ "\r\n",
+ "- How to prepare your data for model-building.\r\n",
+ "\r\n",
+ "- How to use `ggplot2` for data visualization.\r\n",
+ "\r\n",
+ "The question you need answered will determine what type of ML algorithms you will leverage. And the quality of the answer you get back will be heavily dependent on the nature of your data.\r\n",
+ "\r\n",
+ "Let's see this by working through a practical exercise.\r\n",
+ "\r\n",
+ "\r\n",
+        "![Artwork by @allison_horst](../images/unruly_data.jpg)\r\n",
+        ""
+ ],
+ "metadata": {
+ "id": "Pg5aexcOPqAZ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Importing pumpkins data and summoning the Tidyverse\n",
+ "\n",
+ "We'll require the following packages to slice and dice this lesson:\n",
+ "\n",
+        "- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!\n",
+ "\n",
+ "You can have them installed as:\n",
+ "\n",
+ "`install.packages(c(\"tidyverse\"))`\n",
+ "\n",
+ "The script below checks whether you have the packages required to complete this module and installs them for you in case some are missing."
+ ],
+ "metadata": {
+ "id": "dc5WhyVdXAjR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "suppressWarnings(if(!require(\"pacman\")) install.packages(\"pacman\"))\r\n",
+ "pacman::p_load(tidyverse)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "GqPYUZgfXOBt"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now, let's fire up some packages and load the [data](https://github.com/microsoft/ML-For-Beginners/blob/main/2-Regression/data/US-pumpkins.csv) provided for this lesson!"
+ ],
+ "metadata": {
+ "id": "kvjDTPDSXRr2"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Load the core Tidyverse packages\r\n",
+ "library(tidyverse)\r\n",
+ "\r\n",
+ "# Import the pumpkins data\r\n",
+ "pumpkins <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Get a glimpse and dimensions of the data\r\n",
+ "glimpse(pumpkins)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print the first 50 rows of the data set\r\n",
+ "pumpkins %>% \r\n",
+ " slice_head(n =50)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "VMri-t2zXqgD"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "A quick `glimpse()` immediately shows that there are blanks and a mix of strings (`chr`) and numeric data (`dbl`). The `Date` is of type character and there's also a strange column called `Package` where the data is a mix between `sacks`, `bins` and other values. The data, in fact, is a bit of a mess 😤.\n",
+ "\n",
+ "In fact, it is not very common to be gifted a dataset that is completely ready to use to create a ML model out of the box. But worry not, in this lesson, you will learn how to prepare a raw dataset using standard R libraries 🧑🔧. You will also learn various techniques to visualize the data.📈📊\n",
+ " \n",
+ "\n",
+ "> A refresher: The pipe operator (`%>%`) performs operations in logical sequence by passing an object forward into a function or call expression. You can think of the pipe operator as saying \"and then\" in your code.\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "REWcIv9yX29v"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. Check for missing data\n",
+ "\n",
+ "One of the most common issues data scientists need to deal with is incomplete or missing data. R represents missing, or unknown values, with special sentinel value: `NA` (Not Available).\n",
+ "\n",
+ "So how would we know that the data frame contains missing values?\n",
+ " \n",
+        "- One straightforward way would be to use the base R function `anyNA` which returns the logical objects `TRUE` or `FALSE`"
+ ],
+ "metadata": {
+ "id": "Zxfb3AM5YbUe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "pumpkins %>% \r\n",
+ " anyNA()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "G--DQutAYltj"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Great, there seems to be some missing data! That's a good place to start.\n",
+ "\n",
+ "- Another way would be to use the function `is.na()` that indicates which individual column elements are missing with a logical `TRUE`."
+ ],
+ "metadata": {
+ "id": "mU-7-SB6YokF"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "pumpkins %>% \r\n",
+ " is.na() %>% \r\n",
+ " head(n = 7)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "W-DxDOR4YxSW"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Okay, got the job done but with a large data frame such as this, it would be inefficient and practically impossible to review all of the rows and columns individually😴.\n",
+ "\n",
+ "- A more intuitive way would be to calculate the sum of the missing values for each column:"
+ ],
+ "metadata": {
+ "id": "xUWxipKYY0o7"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "pumpkins %>% \r\n",
+ " is.na() %>% \r\n",
+ " colSums()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "ZRBWV6P9ZArL"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Much better! There is missing data, but maybe it won't matter for the task at hand. Let's see what further analysis brings forth.\n",
+ "\n",
+ "> Along with the awesome sets of packages and functions, R has a very good documentation. For instance, use `help(colSums)` or `?colSums` to find out more about the function."
+ ],
+ "metadata": {
+ "id": "9gv-crB6ZD1Y"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Dplyr: A Grammar of Data Manipulation\r\n",
+ "\r\n",
+ "\r\n",
+        "![Artwork by @allison_horst](../images/dplyr_wrangling.png)\r\n",
+        ""
+ ],
+ "metadata": {
+ "id": "o4jLY5-VZO2C"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "[`dplyr`](https://dplyr.tidyverse.org/), a package in the Tidyverse, is a grammar of data manipulation that provides a consistent set of verbs that help you solve the most common data manipulation challenges. In this section, we'll explore some of dplyr's verbs!\n",
+ " \n"
+ ],
+ "metadata": {
+ "id": "i5o33MQBZWWw"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### dplyr::select()\n",
+ "\n",
+ "`select()` is a function in the package `dplyr` which helps you pick columns to keep or exclude.\n",
+ "\n",
+ "To make your data frame easier to work with, drop several of its columns, using `select()`, keeping only the columns you need.\n",
+ "\n",
+ "For instance, in this exercise, our analysis will involve the columns `Package`, `Low Price`, `High Price` and `Date`. Let's select these columns."
+ ],
+ "metadata": {
+ "id": "x3VGMAGBZiUr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Select desired columns\r\n",
+ "pumpkins <- pumpkins %>% \r\n",
+ " select(Package, `Low Price`, `High Price`, Date)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print data set\r\n",
+ "pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "F_FgxQnVZnM0"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### dplyr::mutate()\n",
+ "\n",
+ "`mutate()` is a function in the package `dplyr` which helps you create or modify columns, while keeping the existing columns.\n",
+ "\n",
+ "The general structure of mutate is:\n",
+ "\n",
+ "`data %>% mutate(new_column_name = what_it_contains)`\n",
+ "\n",
+ "Let's take `mutate` out for a spin using the `Date` column by doing the following operations:\n",
+ "\n",
+ "1. Convert the dates (currently of type character) to a month format (these are US dates, so the format is `MM/DD/YYYY`).\n",
+ "\n",
+ "2. Extract the month from the dates to a new column.\n",
+ "\n",
+ "In R, the package [lubridate](https://lubridate.tidyverse.org/) makes it easier to work with Date-time data. So, let's use `dplyr::mutate()`, `lubridate::mdy()`, `lubridate::month()` and see how to achieve the above objectives. We can drop the Date column since we won't be needing it again in subsequent operations."
+ ],
+ "metadata": {
+ "id": "2KKo0Ed9Z1VB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Load lubridate\r\n",
+ "library(lubridate)\r\n",
+ "\r\n",
+ "pumpkins <- pumpkins %>% \r\n",
+ " # Convert the Date column to a date object\r\n",
+ " mutate(Date = mdy(Date)) %>% \r\n",
+ " # Extract month from Date\r\n",
+ " mutate(Month = month(Date)) %>% \r\n",
+ " # Drop Date column\r\n",
+ " select(-Date)\r\n",
+ "\r\n",
+ "# View the first few rows\r\n",
+ "pumpkins %>% \r\n",
+ " slice_head(n = 7)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "5joszIVSZ6xe"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Woohoo! 🤩\n",
+ "\n",
+ "Next, let's create a new column `Price`, which represents the average price of a pumpkin. Now, let's take the average of the `Low Price` and `High Price` columns to populate the new Price column.\n",
+ " "
+ ],
+ "metadata": {
+ "id": "nIgLjNMCZ-6Y"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Create a new column Price\r\n",
+ "pumpkins <- pumpkins %>% \r\n",
+ " mutate(Price = (`Low Price` + `High Price`)/2)\r\n",
+ "\r\n",
+ "# View the first few rows of the data\r\n",
+ "pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "Zo0BsqqtaJw2"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Yeees!💪\n",
+ "\n",
+ "\"But wait!\", you'll say after skimming through the whole data set with `View(pumpkins)`, \"There's something odd here!\"🤔\n",
+ "\n",
+ "If you look at the `Package` column, pumpkins are sold in many different configurations. Some are sold in `1 1/9 bushel` measures, and some in `1/2 bushel` measures, some per pumpkin, some per pound, and some in big boxes with varying widths.\n",
+ "\n",
+ "Let's verify this:"
+ ],
+ "metadata": {
+ "id": "p77WZr-9aQAR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Verify the distinct observations in Package column\r\n",
+ "pumpkins %>% \r\n",
+ " distinct(Package)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "XISGfh0IaUy6"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Amazing!👏\n",
+ "\n",
+ "Pumpkins seem to be very hard to weigh consistently, so let's filter them by selecting only pumpkins with the string *bushel* in the `Package` column and put this in a new data frame `new_pumpkins`.\n",
+ " "
+ ],
+ "metadata": {
+ "id": "7sMjiVujaZxY"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### dplyr::filter() and stringr::str_detect()\n",
+ "\n",
+ "[`dplyr::filter()`](https://dplyr.tidyverse.org/reference/filter.html): creates a subset of the data only containing **rows** that satisfy your conditions, in this case, pumpkins with the string *bushel* in the `Package` column.\n",
+ "\n",
+ "[stringr::str_detect()](https://stringr.tidyverse.org/reference/str_detect.html): detects the presence or absence of a pattern in a string.\n",
+ "\n",
+ "The [`stringr`](https://github.com/tidyverse/stringr) package provides simple functions for common string operations."
+ ],
+ "metadata": {
+ "id": "L8Qfcs92ageF"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Retain only pumpkins with \"bushel\"\r\n",
+ "new_pumpkins <- pumpkins %>% \r\n",
+ " filter(str_detect(Package, \"bushel\"))\r\n",
+ "\r\n",
+ "# Get the dimensions of the new data\r\n",
+ "dim(new_pumpkins)\r\n",
+ "\r\n",
+ "# View a few rows of the new data\r\n",
+ "new_pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "hy_SGYREampd"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "You can see that we have narrowed down to 415 or so rows of data containing pumpkins by the bushel.🤩\n",
+ " "
+ ],
+ "metadata": {
+ "id": "VrDwF031avlR"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### dplyr::case_when()\n",
+ "\n",
+ "**But wait! There's one more thing to do**\n",
+ "\n",
+ "Did you notice that the bushel amount varies per row? You need to normalize the pricing so that you show the pricing per bushel, not per 1 1/9 or 1/2 bushel. Time to do some math to standardize it.\n",
+ "\n",
+        "We'll use the function [`case_when()`](https://dplyr.tidyverse.org/reference/case_when.html) to *mutate* the Price column depending on some conditions. `case_when` allows you to vectorise multiple `if_else()` statements.\n"
+ ],
+ "metadata": {
+ "id": "mLpw2jH4a0tx"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Convert the price if the Package contains fractional bushel values\r\n",
+ "new_pumpkins <- new_pumpkins %>% \r\n",
+ " mutate(Price = case_when(\r\n",
+ " str_detect(Package, \"1 1/9\") ~ Price/(1 + 1/9),\r\n",
+ " str_detect(Package, \"1/2\") ~ Price/(1/2),\r\n",
+ " TRUE ~ Price))\r\n",
+ "\r\n",
+ "# View the first few rows of the data\r\n",
+ "new_pumpkins %>% \r\n",
+ " slice_head(n = 30)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "P68kLVQmbM6I"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now, we can analyze the pricing per unit based on their bushel measurement. All this study of bushels of pumpkins, however, goes to show how very `important` it is to `understand the nature of your data`!\n",
+ "\n",
+ "> ✅ According to [The Spruce Eats](https://www.thespruceeats.com/how-much-is-a-bushel-1389308), a bushel's weight depends on the type of produce, as it's a volume measurement. \"A bushel of tomatoes, for example, is supposed to weigh 56 pounds... Leaves and greens take up more space with less weight, so a bushel of spinach is only 20 pounds.\" It's all pretty complicated! Let's not bother with making a bushel-to-pound conversion, and instead price by the bushel. All this study of bushels of pumpkins, however, goes to show how very important it is to understand the nature of your data!\n",
+ ">\n",
+ "> ✅ Did you notice that pumpkins sold by the half-bushel are very expensive? Can you figure out why? Hint: little pumpkins are way pricier than big ones, probably because there are so many more of them per bushel, given the unused space taken by one big hollow pie pumpkin.\n",
+ " \n"
+ ],
+ "metadata": {
+ "id": "pS2GNPagbSdb"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now lastly, for the sheer sake of adventure 💁♀️, let's also move the Month column to the first position i.e `before` column `Package`.\n",
+ "\n",
+ "`dplyr::relocate()` is used to change column positions."
+ ],
+ "metadata": {
+ "id": "qql1SowfbdnP"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Create a new data frame new_pumpkins\r\n",
+ "new_pumpkins <- new_pumpkins %>% \r\n",
+ " relocate(Month, .before = Package)\r\n",
+ "\r\n",
+ "new_pumpkins %>% \r\n",
+ " slice_head(n = 7)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "JJ1x6kw8bixF"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Good job!👌 You now have a clean, tidy dataset on which you can build your new regression model!\n",
+ " "
+ ],
+ "metadata": {
+ "id": "y8TJ0Za_bn5Y"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 4. Data visualization with ggplot2\r\n",
+ "\r\n",
+        "![Infographic by Dasani Madipalli](../images/data-visualization.png)\r\n",
+ "\r\n",
+ "There is a *wise* saying that goes like this:\r\n",
+ "\r\n",
+ "> \"The simple graph has brought more information to the data analyst's mind than any other device.\" --- John Tukey\r\n",
+ "\r\n",
+ "Part of the data scientist's role is to demonstrate the quality and nature of the data they are working with. To do this, they often create interesting visualizations, or plots, graphs, and charts, showing different aspects of data. In this way, they are able to visually show relationships and gaps that are otherwise hard to uncover.\r\n",
+ "\r\n",
+ "Visualizations can also help determine the machine learning technique most appropriate for the data. A scatterplot that seems to follow a line, for example, indicates that the data is a good candidate for a linear regression exercise.\r\n",
+ "\r\n",
+        "R offers several systems for making graphs, but [`ggplot2`](https://ggplot2.tidyverse.org/index.html) is one of the most elegant and most versatile. `ggplot2` allows you to compose graphs by **combining independent components**.\r\n",
+ "\r\n",
+ "Let's start with a simple scatter plot for the Price and Month columns.\r\n",
+ "\r\n",
+        "So in this case, we'll start with [`ggplot()`](https://ggplot2.tidyverse.org/reference/ggplot.html), supply a dataset and aesthetic mapping (with [`aes()`](https://ggplot2.tidyverse.org/reference/aes.html)) and then add layers (like [`geom_point()`](https://ggplot2.tidyverse.org/reference/geom_point.html)) for scatter plots.\r\n"
+ ],
+ "metadata": {
+ "id": "mYSH6-EtbvNa"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Set a theme for the plots\r\n",
+ "theme_set(theme_light())\r\n",
+ "\r\n",
+ "# Create a scatter plot\r\n",
+ "p <- ggplot(data = new_pumpkins, aes(x = Price, y = Month))\r\n",
+ "p + geom_point()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "g2YjnGeOcLo4"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Is this a useful plot 🤷? Does anything about it surprise you?\n",
+ "\n",
+        "It's not particularly useful as all it does is display your data as a spread of points in a given month.\n",
+ " "
+ ],
+ "metadata": {
+ "id": "Ml7SDCLQcPvE"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### **How do we make it useful?**\n",
+ "\n",
+ "To get charts to display useful data, you usually need to group the data somehow. For instance in our case, finding the average price of pumpkins for each month would provide more insights to the underlying patterns in our data. This leads us to one more **dplyr** flyby:\n",
+ "\n",
+ "#### `dplyr::group_by() %>% summarize()`\n",
+ "\n",
+ "Grouped aggregation in R can be easily computed using\n",
+ "\n",
+ "`dplyr::group_by() %>% summarize()`\n",
+ "\n",
+ "- `dplyr::group_by()` changes the unit of analysis from the complete dataset to individual groups such as per month.\n",
+ "\n",
+ "- `dplyr::summarize()` creates a new data frame with one column for each grouping variable and one column for each of the summary statistics that you have specified.\n",
+ "\n",
+ "For example, we can use the `dplyr::group_by() %>% summarize()` to group the pumpkins into groups based on the **Month** columns and then find the **mean price** for each month."
+ ],
+ "metadata": {
+ "id": "jMakvJZIcVkh"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Find the average price of pumpkins per month\r\n",
+ "new_pumpkins %>%\r\n",
+ " group_by(Month) %>% \r\n",
+ " summarise(mean_price = mean(Price))"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "6kVSUa2Bcilf"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Succinct!✨\n",
+ "\n",
+ "Categorical features such as months are better represented using a bar plot 📊. The layers responsible for bar charts are `geom_bar()` and `geom_col()`. Consult `?geom_bar` to find out more.\n",
+ "\n",
+ "Let's whip up one!"
+ ],
+ "metadata": {
+ "id": "Kds48GUBcj3W"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Find the average price of pumpkins per month then plot a bar chart\r\n",
+ "new_pumpkins %>%\r\n",
+ " group_by(Month) %>% \r\n",
+ " summarise(mean_price = mean(Price)) %>% \r\n",
+ " ggplot(aes(x = Month, y = mean_price)) +\r\n",
+ " geom_col(fill = \"midnightblue\", alpha = 0.7) +\r\n",
+ " ylab(\"Pumpkin Price\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "VNbU1S3BcrxO"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "🤩🤩This is a more useful data visualization! It seems to indicate that the highest price for pumpkins occurs in September and October. Does that meet your expectation? Why or why not?\n",
+ "\n",
+ "Congratulations on finishing the second lesson 👏! You prepared your data for model building, then uncovered more insights using visualizations!"
+ ],
+ "metadata": {
+ "id": "zDm0VOzzcuzR"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/2-Regression/2-Data/solution/lesson_2.Rmd b/2-Regression/2-Data/solution/lesson_2.Rmd
new file mode 100644
index 0000000000..7f72b1a84e
--- /dev/null
+++ b/2-Regression/2-Data/solution/lesson_2.Rmd
@@ -0,0 +1,345 @@
+---
+title: 'Build a regression model: prepare and visualize data'
+output:
+ html_document:
+ df_print: paged
+ theme: flatly
+ highlight: breezedark
+ toc: yes
+ toc_float: yes
+ code_download: yes
+---
+
+## **Linear Regression for Pumpkins - Lesson 2**
+
+#### Introduction
+
+Now that you are set up with the tools you need to start tackling machine learning model building with Tidymodels and the Tidyverse, you are ready to start asking questions of your data. As you work with data and apply ML solutions, it's very important to understand how to ask the right question to properly unlock the potentials of your dataset.
+
+In this lesson, you will learn:
+
+- How to prepare your data for model-building.
+
+- How to use `ggplot2` for data visualization.
+
+The question you need answered will determine what type of ML algorithms you will leverage. And the quality of the answer you get back will be heavily dependent on the nature of your data.
+
+Let's see this by working through a practical exercise.
+
+![Artwork by \@allison_horst](../images/unruly_data.jpg){width="700"}
+
+## 1. Importing pumpkins data and summoning the Tidyverse
+
+We'll require the following packages to slice and dice this lesson:
+
+- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!
+
+You can have them installed as:
+
+`install.packages(c("tidyverse"))`
+
+The script below checks whether you have the packages required to complete this module and installs them for you in case they are missing.
+
+```{r, message=F, warning=F}
+if (!require("pacman")) install.packages("pacman")
+pacman::p_load(tidyverse)
+```
+
+Now, let's fire up some packages and load the [data](https://github.com/microsoft/ML-For-Beginners/blob/main/2-Regression/data/US-pumpkins.csv) provided for this lesson!
+
+```{r load_tidy_verse_models, message=F, warning=F}
+# Load the core Tidyverse packages
+library(tidyverse)
+
+# Import the pumpkins data
+pumpkins <- read_csv(file = "https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv")
+
+
+# Get a glimpse and dimensions of the data
+glimpse(pumpkins)
+
+
+# Print the first 50 rows of the data set
+pumpkins %>%
+ slice_head(n =50)
+
+```
+
+A quick `glimpse()` immediately shows that there are blanks and a mix of strings (`chr`) and numeric data (`dbl`). The `Date` is of type character and there's also a strange column called `Package` where the data is a mix between `sacks`, `bins` and other values. The data, in fact, is a bit of a mess 😤.
+
+In fact, it is not very common to be gifted a dataset that is completely ready to use to create a ML model out of the box. But worry not, in this lesson, you will learn how to prepare a raw dataset using standard R libraries 🧑🔧. You will also learn various techniques to visualize the data.📈📊
+
+
+
+> A refresher: The pipe operator (`%>%`) performs operations in logical sequence by passing an object forward into a function or call expression. You can think of the pipe operator as saying "and then" in your code.
+
+
+## 2. Check for missing data
+
+One of the most common issues data scientists need to deal with is incomplete or missing data. R represents missing, or unknown values, with special sentinel value: `NA` (Not Available).
+
+So how would we know that the data frame contains missing values?
+
+- One straightforward way would be to use the base R function `anyNA` which returns the logical objects `TRUE` or `FALSE`
+
+```{r anyNA, message=F, warning=F}
+pumpkins %>%
+ anyNA()
+```
+
+Great, there seems to be some missing data! That's a good place to start.
+
+- Another way would be to use the function `is.na()` that indicates which individual column elements are missing with a logical `TRUE`.
+
+```{r is_na, message=F, warning=F}
+pumpkins %>%
+ is.na() %>%
+ head(n = 7)
+```
+
+Okay, got the job done but with a large data frame such as this, it would be inefficient and practically impossible to review all of the rows and columns individually😴.
+
+- A more intuitive way would be to calculate the sum of the missing values for each column:
+
+```{r colSum_NA, message=F, warning=F}
+pumpkins %>%
+ is.na() %>%
+ colSums()
+```
+
+Much better! There is missing data, but maybe it won't matter for the task at hand. Let's see what further analysis brings forth.
+
+> Along with the awesome sets of packages and functions, R has a very good documentation. For instance, use `help(colSums)` or `?colSums` to find out more about the function.
+
+## 3. Dplyr: A Grammar of Data Manipulation
+
+![Artwork by \@allison_horst](../images/dplyr_wrangling.png){width="569"}
+
+[`dplyr`](https://dplyr.tidyverse.org/), a package in the Tidyverse, is a grammar of data manipulation that provides a consistent set of verbs that help you solve the most common data manipulation challenges. In this section, we'll explore some of dplyr's verbs!
+
+#### dplyr::select()
+
+`select()` is a function in the package `dplyr` which helps you pick columns to keep or exclude.
+
+To make your data frame easier to work with, drop several of its columns, using `select()`, keeping only the columns you need.
+
+For instance, in this exercise, our analysis will involve the columns `Package`, `Low Price`, `High Price` and `Date`. Let's select these columns.
+
+```{r select, message=F, warning=F}
+# Select desired columns
+pumpkins <- pumpkins %>%
+ select(Package, `Low Price`, `High Price`, Date)
+
+
+# Print data set
+pumpkins %>%
+ slice_head(n = 5)
+```
+
+#### dplyr::mutate()
+
+`mutate()` is a function in the package `dplyr` which helps you create or modify columns, while keeping the existing columns.
+
+The general structure of mutate is:
+
+`data %>% mutate(new_column_name = what_it_contains)`
+
+Let's take `mutate` out for a spin using the `Date` column by doing the following operations:
+
+1. Convert the dates (currently of type character) to a month format (these are US dates, so the format is `MM/DD/YYYY`).
+
+2. Extract the month from the dates to a new column.
+
+In R, the package [lubridate](https://lubridate.tidyverse.org/) makes it easier to work with Date-time data. So, let's use `dplyr::mutate()`, `lubridate::mdy()`, `lubridate::month()` and see how to achieve the above objectives. We can drop the Date column since we won't be needing it again in subsequent operations.
+
+```{r mut_date, message=F, warning=F}
+# Load lubridate
+library(lubridate)
+
+pumpkins <- pumpkins %>%
+ # Convert the Date column to a date object
+ mutate(Date = mdy(Date)) %>%
+ # Extract month from Date
+ mutate(Month = month(Date)) %>%
+ # Drop Date column
+ select(-Date)
+
+# View the first few rows
+pumpkins %>%
+ slice_head(n = 7)
+```
+
+Woohoo! 🤩
+
+Next, let's create a new column `Price`, which represents the average price of a pumpkin. Now, let's take the average of the `Low Price` and `High Price` columns to populate the new Price column.
+
+```{r price, message=F, warning=F}
+# Create a new column Price
+pumpkins <- pumpkins %>%
+ mutate(Price = (`Low Price` + `High Price`)/2)
+
+# View the first few rows of the data
+pumpkins %>%
+ slice_head(n = 5)
+```
+
+Yeees!💪
+
+"But wait!", you'll say after skimming through the whole data set with `View(pumpkins)`, "There's something odd here!"🤔
+
+If you look at the `Package` column, pumpkins are sold in many different configurations. Some are sold in `1 1/9 bushel` measures, and some in `1/2 bushel` measures, some per pumpkin, some per pound, and some in big boxes with varying widths.
+
+Let's verify this:
+
+```{r Package, message=F, warning=F}
+# Verify the distinct observations in Package column
+pumpkins %>%
+ distinct(Package)
+
+```
+
+Amazing!👏
+
+Pumpkins seem to be very hard to weigh consistently, so let's filter them by selecting only pumpkins with the string *bushel* in the `Package` column and put this in a new data frame `new_pumpkins`.
+
+#### dplyr::filter() and stringr::str_detect()
+
+[`dplyr::filter()`](https://dplyr.tidyverse.org/reference/filter.html): creates a subset of the data only containing **rows** that satisfy your conditions, in this case, pumpkins with the string *bushel* in the `Package` column.
+
+[stringr::str_detect()](https://stringr.tidyverse.org/reference/str_detect.html): detects the presence or absence of a pattern in a string.
+
+The [`stringr`](https://github.com/tidyverse/stringr) package provides simple functions for common string operations.
+
+```{r filter, message=F, warning=F}
+# Retain only pumpkins with "bushel"
+new_pumpkins <- pumpkins %>%
+ filter(str_detect(Package, "bushel"))
+
+# Get the dimensions of the new data
+dim(new_pumpkins)
+
+# View a few rows of the new data
+new_pumpkins %>%
+ slice_head(n = 5)
+```
+
+You can see that we have narrowed down to 415 or so rows of data containing pumpkins by the bushel.🤩
+
+#### dplyr::case_when()
+
+**But wait! There's one more thing to do**
+
+Did you notice that the bushel amount varies per row? You need to normalize the pricing so that you show the pricing per bushel, not per 1 1/9 or 1/2 bushel. Time to do some math to standardize it.
+
+We'll use the function [`case_when()`](https://dplyr.tidyverse.org/reference/case_when.html) to *mutate* the Price column depending on some conditions. `case_when` allows you to vectorise multiple `if_else()` statements.
+
+```{r normalize_price, message=F, warning=F}
+# Convert the price if the Package contains fractional bushel values
+new_pumpkins <- new_pumpkins %>%
+ mutate(Price = case_when(
+ str_detect(Package, "1 1/9") ~ Price/(1 + 1/9),
+ str_detect(Package, "1/2") ~ Price/(1/2),
+ TRUE ~ Price))
+
+# View the first few rows of the data
+new_pumpkins %>%
+ slice_head(n = 30)
+```
+
+Now, we can analyze the pricing per unit based on their bushel measurement. All this study of bushels of pumpkins, however, goes to show how very `important` it is to `understand the nature of your data`!
+
+> ✅ According to [The Spruce Eats](https://www.thespruceeats.com/how-much-is-a-bushel-1389308), a bushel's weight depends on the type of produce, as it's a volume measurement. "A bushel of tomatoes, for example, is supposed to weigh 56 pounds... Leaves and greens take up more space with less weight, so a bushel of spinach is only 20 pounds." It's all pretty complicated! Let's not bother with making a bushel-to-pound conversion, and instead price by the bushel. All this study of bushels of pumpkins, however, goes to show how very important it is to understand the nature of your data!
+>
+> ✅ Did you notice that pumpkins sold by the half-bushel are very expensive? Can you figure out why? Hint: little pumpkins are way pricier than big ones, probably because there are so many more of them per bushel, given the unused space taken by one big hollow pie pumpkin.
+
+Now lastly, for the sheer sake of adventure 💁♀️, let's also move the Month column to the first position i.e `before` column `Package`.
+
+`dplyr::relocate()` is used to change column positions.
+
+```{r new_pumpkins, message=F, warning=F}
+# Create a new data frame new_pumpkins
+new_pumpkins <- new_pumpkins %>%
+ relocate(Month, .before = Package)
+
+new_pumpkins %>%
+ slice_head(n = 7)
+
+```
+
+Good job!👌 You now have a clean, tidy dataset on which you can build your new regression model!
+
+## 4. Data visualization with ggplot2
+
+![Infographic by Dasani Madipalli](../images/data-visualization.png){width="600"}
+
+There is a *wise* saying that goes like this:
+
+> "The simple graph has brought more information to the data analyst's mind than any other device." --- John Tukey
+
+Part of the data scientist's role is to demonstrate the quality and nature of the data they are working with. To do this, they often create interesting visualizations, or plots, graphs, and charts, showing different aspects of data. In this way, they are able to visually show relationships and gaps that are otherwise hard to uncover.
+
+Visualizations can also help determine the machine learning technique most appropriate for the data. A scatterplot that seems to follow a line, for example, indicates that the data is a good candidate for a linear regression exercise.
+
+R offers several systems for making graphs, but [`ggplot2`](https://ggplot2.tidyverse.org/index.html) is one of the most elegant and most versatile. `ggplot2` allows you to compose graphs by **combining independent components**.
+
+Let's start with a simple scatter plot for the Price and Month columns.
+
+So in this case, we'll start with [`ggplot()`](https://ggplot2.tidyverse.org/reference/ggplot.html), supply a dataset and aesthetic mapping (with [`aes()`](https://ggplot2.tidyverse.org/reference/aes.html)) and then add layers (like [`geom_point()`](https://ggplot2.tidyverse.org/reference/geom_point.html)) for scatter plots.
+
+```{r scatter_plt, message=F, warning=F}
+# Set a theme for the plots
+theme_set(theme_light())
+
+# Create a scatter plot
+p <- ggplot(data = new_pumpkins, aes(x = Price, y = Month))
+p + geom_point()
+```
+
+Is this a useful plot 🤷? Does anything about it surprise you?
+
+It's not particularly useful as all it does is display your data as a spread of points in a given month.
+
+### **How do we make it useful?**
+
+To get charts to display useful data, you usually need to group the data somehow. For instance in our case, finding the average price of pumpkins for each month would provide more insights to the underlying patterns in our data. This leads us to one more **dplyr** flyby:
+
+#### `dplyr::group_by() %>% summarize()`
+
+Grouped aggregation in R can be easily computed using
+
+`dplyr::group_by() %>% summarize()`
+
+- `dplyr::group_by()` changes the unit of analysis from the complete dataset to individual groups such as per month.
+
+- `dplyr::summarize()` creates a new data frame with one column for each grouping variable and one column for each of the summary statistics that you have specified.
+
+For example, we can use the `dplyr::group_by() %>% summarize()` to group the pumpkins into groups based on the **Month** columns and then find the **mean price** for each month.
+
+```{r grp_sumry, message=F, warning=F}
+# Find the average price of pumpkins per month
+new_pumpkins %>%
+ group_by(Month) %>%
+ summarise(mean_price = mean(Price))
+```
+
+Succinct!✨
+
+Categorical features such as months are better represented using a bar plot 📊. The layers responsible for bar charts are `geom_bar()` and `geom_col()`. Consult `?geom_bar` to find out more.
+
+Let's whip up one!
+
+```{r bar_plt, message=F, warning=F}
+# Find the average price of pumpkins per month then plot a bar chart
+new_pumpkins %>%
+ group_by(Month) %>%
+ summarise(mean_price = mean(Price)) %>%
+ ggplot(aes(x = Month, y = mean_price)) +
+ geom_col(fill = "midnightblue", alpha = 0.7) +
+ ylab("Pumpkin Price")
+```
+
+🤩🤩This is a more useful data visualization! It seems to indicate that the highest price for pumpkins occurs in September and October. Does that meet your expectation? Why or why not?
+
+Congratulations on finishing the second lesson 👏! You prepared your data for model building, then uncovered more insights using visualizations!
diff --git a/2-Regression/2-Data/translations/README.id.md b/2-Regression/2-Data/translations/README.id.md
new file mode 100644
index 0000000000..9d8b5f18da
--- /dev/null
+++ b/2-Regression/2-Data/translations/README.id.md
@@ -0,0 +1,202 @@
+# Membangun sebuah model regresi dengan Scikit-learn: siapkan dan visualisasikan data
+
+![Infografik visualisasi data](../images/data-visualization.png)
+> Infografik oleh [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [Kuis pra-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/11/)
+
+## Pembukaan
+
+Karena sekarang kamu sudah siap dengan alat-alat yang akan diperlukan untuk mulai melampiaskan pembangunan model *machine learning* dengan Scikit-learn, kamu juga siap untuk mulai membuat pertanyaan dari datamu. Selagi kamu bekerja dengan data dan mengaplikasikan solusi ML, sangatlah penting untuk mengerti bagaimana menanyakan cara yang benar dan tepat untuk mengemukakan potensial *dataset*-mu.
+
+Dalam pelajaran ini, kamu akan belajar:
+
+- Cara mempersiapkan datamu untuk pembangunan model.
+- Cara menggunakan Matplotlib untuk memvisualisasikan data.
+
+## Menanyakan pertanyaan yang tepat dari datamu
+
+Pertanyaan yang perlu dijawab akan menentukan jenis algoritma ML yang kamu akan memanfaatkan. Lalu, kualitas jawaban yang kamu akan dapat sangat tergantung pada sifat datamu (*the nature of your data*).
+
+Lihatlah [data](../data/US-pumpkins.csv) yang disiapkan untuk pelajaran ini. Kamu bisa membuka file .csv ini di VS Code. Membaca dengan cepat, *dataset* ini ada yang kosong dan ada yang campuran data *string* dan data numerik. Adapula sebuah kolom 'Package' yang aneh dan mengandung data antara 'sacks', 'bins', dll. Terus terang, data ini amburadul.
+
+Faktanya adalah tidak sering kita dihadiahkan sebuah *dataset* yang langsung bisa digunakan untuk membuat sebuah model ML. Dalam pelajaran ini, kamu akan belajar bagaimana menyiapkan sebuah *dataset* 'mentah' menggunakan *library* standar Python. Kamu juga akan belajar aneka teknik untuk memvisualisasikan datanya.
+
+## Studi kasus: 'pasar labu'
+
+Dalam folder ini kamu akan menemukan sebuah file .csv dalam folder `data` bernama [US-pumpkins.csv](../data/US-pumpkins.csv) yang mempunyai 1757 baris data tentang pasar labu disortir dalam pengelompokkan berdasarkan kota. Ini adalah data mentah yang diambil dari [Specialty Crops Terminal Markets Standard Reports (Laporan Standar Pasar Terminal Tanaman Khusus)](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) yang didistribusi Departemen Agrikultur Amerika Serikat.
+
+### Menyiapkan data
+
+Data ini terbuka untuk umum (*publik domain*) dan bisa diunduh sebagai banyak file terpisah berdasarkan kota dari situs internet Departemen Agrikultur Amerika Serikat. Supaya tidak berurusan dengan terlalu banyak file, kami telah menggabungkan data dari semua kota menjadi satu *spreadsheet* (file Excel). Jadi kamu sudah _menyiapkan_ datanya sedikit. Selanjutnya, mari kita lihat datanya.
+
+### Data labu - kesimpulan-kesimpulan awal
+
+Apa yang kamu cermati tentang data ini? Kamu sudah melihat bahwa ada campuran *string*, nomor, kekosongan, dan nilai-nilai aneh yang harus diartikan.
+
+Pertanyaan apa yang kamu bisa tanyakan dari data data ini menggunakan teknik regresi? Kalau "Prediksikan harga jual sebuah labu pada bulan tertentu" bagaimana? Melihat datanya sekali lagi, ada beberapa perubahan yang kamu harus terapkan untuk membuat struktur data yang diperlukan untuk tugas ini.
+
+## Latihan - analisiskan data labu
+
+Mari menggunakan [Pandas](https://pandas.pydata.org/) (singkatan dari `Python Data Analysis`), sebuah alat yang sangat berguna untuk membentuk, menganalisis, dan menyiapkan data labu ini.
+
+### Pertama, carilah tanggal yang hilang
+
+Kamu harus mengambil langkah untuk mencari tanggal-tanggal yang hilang terlebih dahulu:
+
+1. Konversi tanggal-tanggalnya menjadi format bulan (tanggal-tanggal ini dalam format Amerika Serikat, yaitu `BULAN/TANGGAL/TAHUN`).
+2. Jadikan data bulan menjadi kolom baru
+
+Buka file _notebook.ipynb_ dalam Visual Studio Code dan impor *spreadsheet*-nya menjadi sebuah *dataframe* Pandas.
+
+1. Gunakan fungsi `head()` untuk melihat lima baris pertama.
+
+ ```python
+ import pandas as pd
+    pumpkins = pd.read_csv('../data/US-pumpkins.csv')
+ pumpkins.head()
+ ```
+
+ ✅ Fungsi apa yang akan kamu gunakan untuk melihat lima baris **terakhir**?
+
+2. Periksa apa ada data yang hilang dalam *dataframe* ini:
+
+ ```python
+ pumpkins.isnull().sum()
+ ```
+
+ Ada data yang hilang, namun mungkin tidak akan diperlukan untuk tugas ini.
+
+3. Untuk menjadikan *dataframe* kamu lebih mudah untuk digunakan, buanglah beberapa kolom menggunakan `drop()` dan simpanlah kolom-kolom yang diperlukan saja:
+
+ ```python
+ new_columns = ['Package', 'Month', 'Low Price', 'High Price', 'Date']
+ pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+ ```
+
+### Kedua, tentukan harga rata-rata labu
+
+Pikirkan bagaimana caranya menentukan harga rata-rata sebuah labu pada bulan tertentu. Kamu akan pilih kolom apa saja untuk tugas ini? Petunjuk: kamu akan perlu 3 kolom.
+
+Solusi: Ambil rata-rata kolom `Low Price` dan `High Price` untuk mengisi kolom `Price` yang baru. Terus, konversikan kolom `Date` untuk hanya menunjukkan bulan saja. Untungnya, berdasarkan pemeriksaan di atas, tidak ada data tanggal atau harga yang hilang.
+
+1. Untuk mengkalkulasi rata-rata, tambahlah kode berikut:
+
+ ```python
+ price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2
+
+ month = pd.DatetimeIndex(pumpkins['Date']).month
+
+ ```
+
+ ✅ Jangan ragu untuk mem-*print* data apapun yang kamu ingin periksa menggunakan `print(month)`.
+
+2. Sekarang, salinlah data yang telah dikonversi ke sebuah *dataframe* Pandas yang baru:
+
+ ```python
+ new_pumpkins = pd.DataFrame({'Month': month, 'Package': pumpkins['Package'], 'Low Price': pumpkins['Low Price'],'High Price': pumpkins['High Price'], 'Price': price})
+ ```
+
+ Jika *dataframe* baru ini di-*print*, kamu akan lihat sebuah *dataset* yang rapih darimana kamu bisa membangun model regresi barumu.
+
+### Tunggu! Ada yang aneh di sini
+
+Kalau kamu lihat kolom `Package`, labu dijual dalam berbagai konfigurasi. Beberapa dijual dalam satuan '1 1/9 bushel', beberapa per labu, beberapa per pon, dan beberapa dalam dus-dus besar dengan kelebaran yang berbeda-beda.
+
+> Kelihatannya susah untuk menimbang labu secara konsisten
+
+Menggali data orisinal lebih dalam, sangatlah menarik untuk melihat apapun dengan `Unit of Sale` (satuan penjualan) yang sama dengan 'EACH' atau 'PER BIN' akan mempunyai jenis `Package` yang per inci, per bin, atau 'each'. Kelihatannya susah untuk menimbang labu secara konsisten, jadi mari memilah datanya dengan hanya memilih labu yang kolom `Package`-nya sama dengan *string* 'bushel'.
+
+1. Tambah sebuah filter di atas file tetapi dibawah impor .csv yang di awal
+
+ ```python
+ pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]
+ ```
+
+ Kalau kamu *print* datanya sekarang, kamu bisa lihat bahwa kamu hanya mendapatkan sekitar 415 baris data yang mengandung data labu per bushel.
+
+### Tunggu! Masih ada satu lagi
+
+Apa kamu sadar bahwa jumlah bushel berbeda-beda per baris? Kamu harus menormalisasi harganya supaya kamu menunjukkan harga per bushel. Gunakanlah sedikit matematika untuk menstandarisasinya.
+
+1. Tambahlah beberapa baris ini setelah blok yang membuat *dataframe* new_pumpkins:
+
+ ```python
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/(1 + 1/9)
+
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price/(1/2)
+ ```
+
+✅ Berdasarkan [The Spruce Eats](https://www.thespruceeats.com/how-much-is-a-bushel-1389308), berat satu bushel tergantung jenis hasil bumi sebab bushel adalah satuan volume. "Satu bushel tomat, sebagai contoh, seharusnya seberat 56 pon (25.4 kg)... Dedaunan mengambil lebih banyak ruang tetapi lebih ringan, jadi satu bushel bayam hanya seberat 20 pon (9.1 kg)" (diterjemah). Lumayan rumit ya! Kita tidak usah mengkonversi bushel ke pon saja bagaimana, jadi kita gunakan satuan harga per bushel? Namun, semua riset ini tentang bushel labu menunjukkan sebagaimana pentingnya untuk mengerti sifat datamu!
+
+Sekarang, kamu bisa meneliti harga per satuan berdasarkan hitungan bushel mereka. Jika kamu *print* datanya sekali lagi, kamu bisa lihat bagaimana telah distandarisasi.
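+
+Misalnya (sekadar sketsa, menggunakan *dataframe* `new_pumpkins` dari langkah-langkah di atas):
+
+```python
+# Sekadar contoh: lihat beberapa baris pertama setelah harga dinormalisasi
+print(new_pumpkins.head())
+```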
+
+✅ Apa kamu sadar bahwa labu yang dijual per setengah bushel sangat mahal? Kira-kira mengapa ya? Petunjuk: labu kecil jauh lebih mahal daripada labu besar, mungkin karena ada lebih banyak per bushel, apalagi mengingat pula bahwa satu labu besar mempunyai rongga kosong yang besar di dalamnya.
+
+## Strategi Visualisasi
+
+Sebagian dari peran seorang *data scientist* adalah untuk mendemonstrasikan kualitas dan sifat data yang sedang digunakan. Untuk melakukan ini, mereka seringkali membuat visualisasi-visualisasi atau grafik menarik yang menunjukkan aspek-aspek berbeda tentang datanya. Dengan cara ini, mereka dapat menunjukkan hubungan-hubungan dan celah-celah secara visual. Kalau tidak secara visual, akan susah untuk menemukan pola-pola tersebut.
+
+Visualisasi juga bisa membantu menentukan teknik *machine learning* yang paling cocok untuk datanya. Sebagai contoh, sebuah petak sebar yang kelihatannya mengikuti sebuah garis mengindikasikan bahwa data ini adalah kandidat yang baik untuk latihan regresi linear.
+
+Satu *library* visualisasi data yang bekerja dengan baik dalam sebuah *Jupyter notebook* adalah [Matplotlib](https://matplotlib.org/) (yang kamu juga lihat dalam pelajaran sebelumnya).
+
+> Carilah pengalaman dalam memvisualisasi data dengan [tutorial-tutorial ini](https://docs.microsoft.com/learn/modules/explore-analyze-data-with-python?WT.mc_id=academic-15963-cxa).
+
+## Latihan - sebuah eksperimen dengan Matplotlib
+
+Coba membuat beberapa grafik sederhana untuk menunjukkan *dataframe* baru yang baru kamu buat. Kira-kira, sebuah plot garis akan menunjukkan apa ya?
+
+1. Impor Matplotlib di atas file tetapi di bawah impor Pandas:
+
+ ```python
+ import matplotlib.pyplot as plt
+ ```
+
+2. Jalankan ulang keseluruhan *notebook*-nya.
+3. Di bagian bawah *notebook*-nya, tambahkan sebuah sel untuk menggambarkan datanya sebagai sebuah kotak.
+
+ ```python
+ price = new_pumpkins.Price
+ month = new_pumpkins.Month
+ plt.scatter(price, month)
+ plt.show()
+ ```
+
+ ![Sebuah petak sebar yang menunjukkan hubungan antara harga dan bulan](../images/scatterplot.png)
+
+ Apakah grafik ini berguna? Apa ada yang mengejutkanmu?
+
+ Sebenarnya tidak terlalu berguna karena dia hanya menunjukkan datamu sebagai sebuah penyebaran poin pada bulan tertentu.
+
+### Jadikan berguna
+
+Untuk menjadikan sebuah grafik berguna, biasanya datanya harus dikelompokkan dengan suatu cara. Yuk, kita coba membuat suatu plot di mana sumbu x menunjukkan bulan dan datanya mendemonstrasikan distribusi harga!
+
+1. Tambah sebuah sel untuk membuat sebuah diagram batang berkelompok:
+
+ ```python
+ new_pumpkins.groupby(['Month'])['Price'].mean().plot(kind='bar')
+ plt.ylabel("Pumpkin Price")
+ ```
+
+ ![Sebuah diagram batang yang menunjukkan hubungan antara harga dan bulan](../images/barchart.png)
+
+    Nah, ini lebih berguna! Kelihatannya visualisasi ini mengindikasikan bahwa labu paling mahal pada bulan September dan Oktober. Apa itu sesuai ekspektasimu? Mengapa?
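+
+    Sebagai pelengkap (sekadar sketsa, dengan asumsi *dataframe* `new_pumpkins` masih tersedia dari langkah-langkah sebelumnya), kamu juga bisa memeriksa kesimpulan ini secara numerik:
+
+    ```python
+    # Sketsa: urutkan rata-rata harga per bulan, dari yang termahal
+    print(new_pumpkins.groupby('Month')['Price'].mean().sort_values(ascending=False))
+    ```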
+
+---
+
+## 🚀Tantangan
+
+Jelajahi jenis-jenis visualisasi yang beda dan yang disediakan Matplotlib. Jenis mana yang paling cocok untuk kasus regresi?
+
+## [Kuis pasca-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/12/)
+
+## Review & Pembelajaran Mandiri
+
+Lihatlah beragam cara memvisualisasi data. Buatlah sebuah daftar dari aneka *library* yang tersedia dan catatlah yang mana yang paling baik untuk jenis-jenis tugas tertentu. Sebagai contoh, bagaimana dengan visualisasi 2D vs. 3D? Apa yang kamu temukan?
+
+## Tugas
+
+[Menjelajahi visualisasi](../assignment.md)
diff --git a/2-Regression/2-Data/translations/README.it.md b/2-Regression/2-Data/translations/README.it.md
new file mode 100644
index 0000000000..ba57ee11b3
--- /dev/null
+++ b/2-Regression/2-Data/translations/README.it.md
@@ -0,0 +1,201 @@
+# Costruire un modello di regressione usando Scikit-learn: preparare e visualizzare i dati
+
+> ![Infografica sulla visualizzazione dei dati](../images/data-visualization.png)
+> Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/11/)
+
+## Introduzione
+
+Ora che si hanno a disposizione gli strumenti necessari per iniziare ad affrontare la creazione di modelli di machine learning con Scikit-learn, si è pronti per iniziare a porre domande sui propri dati. Mentre si lavora con i dati e si applicano soluzioni ML, è molto importante capire come porre la domanda giusta per sbloccare correttamente le potenzialità del proprio insieme di dati.
+
+In questa lezione, si imparerà:
+
+- Come preparare i dati per la creazione del modello.
+- Come utilizzare Matplotlib per la visualizzazione dei dati.
+
+## Fare la domanda giusta ai propri dati
+
+La domanda a cui si deve rispondere determinerà il tipo di algoritmi ML che verranno utilizzati. La qualità della risposta che si riceverà dipenderà fortemente dalla natura dei propri dati.
+
+Si dia un'occhiata ai [dati](../../data/US-pumpkins.csv) forniti per questa lezione. Si può aprire questo file .csv in VS Code. Una rapida scrematura mostra immediatamente che ci sono spazi vuoti e un mix di stringhe e dati numerici. C'è anche una strana colonna chiamata "Package" (pacchetto) in cui i dati sono un mix tra "sacks" (sacchi), "bins" (contenitori) e altri valori. I dati, infatti, sono un po' un pasticcio.
+
+In effetti, non è molto comune ricevere un insieme di dati completamente pronto per creare un modello ML pronto all'uso. In questa lezione si imparerà come preparare un insieme di dati non elaborato utilizzando le librerie standard di Python. Si impareranno anche varie tecniche per visualizzare i dati.
+
+## Caso di studio: 'il mercato della zucca'
+
+In questa cartella si troverà un file .csv nella cartella `data` radice chiamato [US-pumpkins.csv](../../data/US-pumpkins.csv) che include 1757 righe di dati sul mercato delle zucche, ordinate in raggruppamenti per città. Si tratta di dati grezzi estratti dai [Report Standard dei Mercati Terminali delle Colture Speciali](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) distribuiti dal Dipartimento dell'Agricoltura degli Stati Uniti.
+
+### Preparazione dati
+
+Questi dati sono di pubblico dominio. Possono essere scaricati in molti file separati, per città, dal sito web dell'USDA. Per evitare troppi file separati, sono stati concatenati tutti i dati della città in un unico foglio di calcolo, quindi un po' i dati sono già stati _preparati_ . Successivamente, si darà un'occhiata più da vicino ai dati.
+
+### I dati della zucca - prime conclusioni
+
+Cosa si nota riguardo a questi dati? Si è già visto che c'è un mix di stringhe, numeri, spazi e valori strani a cui occorre dare un senso.
+
+Che domanda si può fare a questi dati, utilizzando una tecnica di Regressione? Che dire di "Prevedere il prezzo di una zucca in vendita durante un dato mese". Esaminando nuovamente i dati, ci sono alcune modifiche da apportare per creare la struttura dati necessaria per l'attività.
+
+## Esercizio: analizzare i dati della zucca
+
+Si usa [Pandas](https://pandas.pydata.org/), (il nome sta per `Python Data Analysis`) uno strumento molto utile per dare forma ai dati, per analizzare e preparare questi dati sulla zucca.
+
+### Innanzitutto, controllare le date mancanti
+
+Prima si dovranno eseguire i passaggi per verificare le date mancanti:
+
+1. Convertire le date in un formato mensile (queste sono date statunitensi, quindi il formato è `MM/GG/AAAA`).
+2. Estrarre il mese in una nuova colonna.
+
+Aprire il file _notebook.ipynb_ in Visual Studio Code e importare il foglio di calcolo in un nuovo dataframe Pandas.
+
+1. Usare la funzione `head()` per visualizzare le prime cinque righe.
+
+ ```python
+ import pandas as pd
+ pumpkins = pd.read_csv('../data/US-pumpkins.csv')
+ pumpkins.head()
+ ```
+
+ ✅ Quale funzione si userebbe per visualizzare le ultime cinque righe?
+
+1. Controllare se mancano dati nel dataframe corrente:
+
+ ```python
+ pumpkins.isnull().sum()
+ ```
+
+ Ci sono dati mancanti, ma forse non avrà importanza per l'attività da svolgere.
+
+1. Per rendere più facile lavorare con il dataframe, si scartano molte delle sue colonne, usando `drop()`, mantenendo solo le colonne di cui si ha bisogno:
+
+ ```python
+ new_columns = ['Package', 'Month', 'Low Price', 'High Price', 'Date']
+ pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+ ```
+
+### Secondo, determinare il prezzo medio della zucca
+
+Si pensi a come determinare il prezzo medio di una zucca in un dato mese. Quali colonne si sceglierebbero per questa attività? Suggerimento: serviranno 3 colonne.
+
+Soluzione: prendere la media delle colonne `Low Price` e `High Price` per popolare la nuova colonna Price e convertire la colonna Date per mostrare solo il mese. Fortunatamente, secondo il controllo di cui sopra, non mancano dati per date o prezzi.
+
+1. Per calcolare la media, aggiungere il seguente codice:
+
+ ```python
+ price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2
+
+ month = pd.DatetimeIndex(pumpkins['Date']).month
+
+ ```
+
+    ✅ Si possono stampare tutti i dati che si desidera controllare utilizzando `print(month)`.
+
+2. Ora copiare i dati convertiti in un nuovo dataframe Pandas:
+
+ ```python
+ new_pumpkins = pd.DataFrame({'Month': month, 'Package': pumpkins['Package'], 'Low Price': pumpkins['Low Price'],'High Price': pumpkins['High Price'], 'Price': price})
+ ```
+
+ La stampa del dataframe mostrerà un insieme di dati pulito e ordinato su cui si può costruire il nuovo modello di regressione.
+
+### Ma non è finita qui! C'è qualcosa di strano qui.
+
+Osservando la colonna `Package`, le zucche sono vendute in molte configurazioni diverse. Alcune sono vendute in misure '1 1/9 bushel' (bushel = staio), alcune in misure '1/2 bushel', alcune per zucca, alcune per libbra e alcune in grandi scatole con larghezze variabili.
+
+> Le zucche sembrano molto difficili da pesare in modo coerente
+
+Scavando nei dati originali, è interessante notare che qualsiasi cosa con `Unit of Sale` (Unità di vendita) uguale a 'EACH' o 'PER BIN' ha anche il tipo di `Package` per 'inch' (pollice), per 'bin' (contenitore) o 'each' (ciascuna). Le zucche sembrano essere molto difficili da pesare in modo coerente, quindi si filtrano selezionando solo zucche con la stringa "bushel" nella colonna `Package`.
+
+1. Aggiungere un filtro nella parte superiore del file, sotto l'importazione .csv iniziale:
+
+ ```python
+ pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]
+ ```
+
+ Se si stampano i dati ora, si può vedere che si stanno ricevendo solo le circa 415 righe di dati contenenti zucche per bushel.
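+
+    Per un rapido controllo (solo uno schizzo, supponendo che il dataframe `pumpkins` sia già stato caricato e filtrato come sopra), si possono verificare il numero di righe e i tipi di `Package` rimasti:
+
+    ```python
+    # Schizzo: verificare il risultato del filtro sui bushel
+    print(len(pumpkins))
+    print(pumpkins['Package'].unique())
+    ```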
+
+### Ma non è finita qui! C'è un'altra cosa da fare.
+
+Si è notato che la quantità di bushel varia per riga? Si deve normalizzare il prezzo in modo da mostrare il prezzo per bushel, quindi si facciano un po' di calcoli per standardizzarlo.
+
+1. Aggiungere queste righe dopo il blocco che crea il dataframe new_pumpkins:
+
+ ```python
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/(1 + 1/9)
+
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price/(1/2)
+ ```
+
+✅ Secondo [The Spruce Eats](https://www.thespruceeats.com/how-much-is-a-bushel-1389308), il peso di un bushel dipende dal tipo di prodotto, poiché è una misura di volume. "Un bushel di pomodori, per esempio, dovrebbe pesare 56 libbre... Foglie e verdure occupano più spazio con meno peso, quindi un bushel di spinaci è solo 20 libbre". È tutto piuttosto complicato! Non occorre preoccuparsi di fare una conversione da bushel a libbra, e invece si valuta a bushel. Tutto questo studio sui bushel di zucche, però, dimostra quanto sia importante capire la natura dei propri dati!
+
+Ora si può analizzare il prezzo per unità in base alla misurazione del bushel. Se si stampano i dati ancora una volta, si può vedere come sono standardizzati.
+
+✅ Si è notato che le zucche vendute a metà bushel sono molto costose? Si riesce a capire perché? Suggerimento: le zucche piccole sono molto più costose di quelle grandi, probabilmente perché ce ne sono molte di più per bushel, dato lo spazio inutilizzato occupato da una grande zucca cava.
+
+## Strategie di Visualizzazione
+
+Parte del ruolo del data scientist è dimostrare la qualità e la natura dei dati con cui sta lavorando. Per fare ciò, si creano spesso visualizzazioni interessanti o tracciati, grafici e diagrammi, che mostrano diversi aspetti dei dati. In questo modo, sono in grado di mostrare visivamente relazioni e lacune altrimenti difficili da scoprire.
+
+Le visualizzazioni possono anche aiutare a determinare la tecnica di machine learning più appropriata per i dati. Un grafico a dispersione che sembra seguire una linea, ad esempio, indica che i dati sono un buon candidato per un esercizio di regressione lineare.
+
+Una libreria di visualizzazione dei dati che funziona bene nei notebook Jupyter è [Matplotlib](https://matplotlib.org/) (che si è visto anche nella lezione precedente).
+
+> Per fare più esperienza con la visualizzazione dei dati si seguano [questi tutorial](https://docs.microsoft.com/learn/modules/explore-analyze-data-with-python?WT.mc_id=academic-15963-cxa).
+
+## Esercizio - sperimentare con Matplotlib
+
+Provare a creare alcuni grafici di base per visualizzare il nuovo dataframe appena creato. Cosa mostrerebbe un grafico a linee di base?
+
+1. Importare Matplotlib nella parte superiore del file, sotto l'importazione di Pandas:
+
+ ```python
+ import matplotlib.pyplot as plt
+ ```
+
+1. Rieseguire l'intero notebook per aggiornare.
+1. Nella parte inferiore del notebook, aggiungere una cella per tracciare i dati come una casella:
+
+ ```python
+ price = new_pumpkins.Price
+ month = new_pumpkins.Month
+ plt.scatter(price, month)
+ plt.show()
+ ```
+
+ ![Un grafico a dispersione che mostra la relazione tra prezzo e mese](../images/scatterplot.png)
+
+ È un tracciato utile? C'è qualcosa che sorprende?
+
+    Non è particolarmente utile in quanto tutto ciò che fa è visualizzare i propri dati come una diffusione di punti in un dato mese.
+
+### Renderlo utile
+
+Per fare in modo che i grafici mostrino dati utili, di solito è necessario raggruppare i dati in qualche modo. Si prova a creare un grafico che mostra la distribuzione dei dati dove l'asse x mostra i mesi.
+
+1. Aggiungere una cella per creare un grafico a barre raggruppato:
+
+ ```python
+ new_pumpkins.groupby(['Month'])['Price'].mean().plot(kind='bar')
+ plt.ylabel("Pumpkin Price")
+ ```
+
+ ![Un grafico a barre che mostra la relazione tra prezzo e mese](../images/barchart.png)
+
+ Questa è una visualizzazione dei dati più utile! Sembra indicare che il prezzo più alto per le zucche si verifica a settembre e ottobre. Questo soddisfa le proprie aspettative? Perché o perché no?
+
+---
+
+## 🚀 Sfida
+
+Esplorare i diversi tipi di visualizzazione offerti da Matplotlib. Quali tipi sono più appropriati per i problemi di regressione?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/12/)
+
+## Revisione e Auto Apprendimento
+
+Dare un'occhiata ai molti modi per visualizzare i dati. Fare un elenco delle varie librerie disponibili e annotare quali sono le migliori per determinati tipi di attività, ad esempio visualizzazioni 2D rispetto a visualizzazioni 3D. Cosa si è scoperto?
+
+## Compito
+
+[Esplorazione della visualizzazione](assignment.it.md)
diff --git a/2-Regression/2-Data/translations/README.ja.md b/2-Regression/2-Data/translations/README.ja.md
new file mode 100644
index 0000000000..ddd01a775e
--- /dev/null
+++ b/2-Regression/2-Data/translations/README.ja.md
@@ -0,0 +1,206 @@
+# Scikit-learnを用いた回帰モデルの構築: データの準備と可視化
+
+> ![データの可視化に関するインフォグラフィック](../images/data-visualization.png)
+>
+> [Dasani Madipalli](https://twitter.com/dasani_decoded) によるインフォグラフィック
+
+## [講義前のクイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/11?loc=ja)
+
+## イントロダクション
+
+Scikit-learnを使って機械学習モデルの構築を行うために必要なツールの用意ができたところで、データに対する問いかけを始める準備が整いました。データを扱いMLソリューションを適用する際には、データセットの潜在能力を適切に引き出すために正しい問いかけをすることが非常に重要です。
+
+このレッスンでは、以下のことを学びます。
+
+- モデルを構築するためのデータ処理方法について
+- データの可視化におけるMatplotlibの使い方について
+
+## データに対して正しい問いかけをする
+
+どのような質問に答えるかによって、どのようなMLアルゴリズムを活用するかが決まります。また、返ってくる回答の質は、データの性質に大きく依存します。
+
+このレッスンのために用意された[データ](../../data/US-pumpkins.csv)を見てみましょう。この.csvファイルは、VS Codeで開くことができます。ざっと確認してみると、空欄があったり、文字列や数値データが混在していることがわかります。また、「Package」という奇妙な列では「sacks」や「bins」などの異なる単位の値が混在しています。このように、データはちょっとした混乱状態にあります。
+
+実際のところ、MLモデルの作成にすぐに使えるような整ったデータセットをそのまま受け取ることはあまりありません。このレッスンでは、Pythonの標準ライブラリを使って生のデータセットを準備する方法を学びます。また、データを可視化するための様々なテクニックを学びます。
+
+## ケーススタディ: カボチャの市場
+
+ルートの`data`フォルダの中に [US-pumpkins.csv](../../data/US-pumpkins.csv) という名前の.csvファイルがあります。このファイルには、カボチャの市場に関する1757行のデータが、都市ごとにグループ分けされて入っています。これは、米国農務省が配布している [Specialty Crops Terminal Markets Standard Reports](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) から抽出した生データです。
+
+### データの準備
+
+このデータはパブリックドメインです。米国農務省のウェブサイトから、都市ごとに個別ファイルをダウンロードすることができます。ファイルが多くなりすぎないように、すべての都市のデータを1つのスプレッドシートに連結しました。次に、データを詳しく見てみましょう。
+
+### カボチャのデータ - 初期の結論
+
+このデータについて何か気付いたことはありますか?文字列、数字、空白、奇妙な値が混在していて、意味を理解しなければならないことに気付いたと思います。
+
+回帰を使って、このデータにどのような問いかけができますか?「ある月に販売されるカボチャの価格を予測する」というのはどうでしょうか?データをもう一度見てみると、この課題に必要なデータ構造を作るために、いくつかの変更が必要です。
+
+## エクササイズ - カボチャのデータを分析
+
+データを整形するのに非常に便利な [Pandas](https://pandas.pydata.org/) (Python Data Analysisの略) を使って、このカボチャのデータを分析したり整えてみましょう。
+
+### 最初に、日付が欠損していないか確認する
+
+日付が欠損していないか確認するために、いくつかのステップがあります:
+
+1. 日付を月の形式に変換する(これは米国の日付なので、形式は `MM/DD/YYYY` となる)。
+2. 新しい列として月を抽出する。
+
+Visual Studio Codeで _notebook.ipynb_ ファイルを開き、スプレッドシートを Pandas DataFrame としてインポートします。
+
+1. `head()` 関数を使って最初の5行を確認します。
+
+ ```python
+ import pandas as pd
+ pumpkins = pd.read_csv('../data/US-pumpkins.csv')
+ pumpkins.head()
+ ```
+
+ ✅ 最後の5行を表示するには、どのような関数を使用しますか?
+
+
+2. 現在のデータフレームに欠損データがあるかどうかをチェックします。
+
+ ```python
+ pumpkins.isnull().sum()
+ ```
+
+ 欠損データがありましたが、今回のタスクには影響がなさそうです。
+
+
+3. データフレームを扱いやすくするために、`drop()` 関数を使っていくつかの列を削除し、必要な列だけを残すようにします。
+
+ ```python
+ new_columns = ['Package', 'Month', 'Low Price', 'High Price', 'Date']
+ pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+ ```
+
+### 次に、カボチャの平均価格を決定します。
+
+ある月のかぼちゃの平均価格を決定する方法を考えてみましょう。このタスクのために、どの列が必要ですか?ヒント:3つの列が必要になります。
+
+解決策:「最低価格」と「最高価格」の平均値を取って新しい「price」列を作成し、「日付」列を月のみ表示するように変換します。幸いなことに、上記で確認した結果によると日付や価格に欠損データはありませんでした。
+
+1. 平均値を算出するために、以下のコードを追加します。
+
+ ```python
+ price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2
+
+ month = pd.DatetimeIndex(pumpkins['Date']).month
+
+ ```
+
+ ✅ `print(month)` などを使って自由にデータを確認してみてください。
+
+
+2. 変換したデータをPandasの新しいデータフレームにコピーします。
+
+ ```python
+ new_pumpkins = pd.DataFrame({'Month': month, 'Package': pumpkins['Package'], 'Low Price': pumpkins['Low Price'],'High Price': pumpkins['High Price'], 'Price': price})
+ ```
+
+ データフレームを出力すると、新しい回帰モデルを構築するための綺麗に整頓されたデータセットが表示されます。
+
+### でも、待ってください!なにかおかしいです。
+
+`Package` 列をみると、カボチャは様々な形で販売されています。「1 1/9ブッシェル」で売られているもの、「1/2ブッシェル」で売られているもの、かぼちゃ1個単位で売られているもの、1ポンド単位で売られているもの、幅の違う大きな箱で売られているものなど様々です。
+
+
+> かぼちゃの重さを一定にするのはとても難しいようです。
+
+元のデータを調べてみると、「Unit of Sale」が「EACH」または「PER BIN」となっているものは、「Package」が「per inch」、「per bin」、「each」となっているのが興味深いです。カボチャの計量単位に一貫性を持たせるのが非常に難しいようなので、`Package`列に「bushel」という文字列を持つカボチャだけを選択してフィルタリングしてみましょう。
+
+1. ファイルの一番上、最初の.csvインポートのすぐ下にフィルタを追加します。
+
+ ```python
+ pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]
+ ```
+
+ 今、データを出力してみると、ブッシェル単位のカボチャを含む415行ほどのデータしか得られていないことがわかります。
+
+### でも、待ってください!もうひとつ、やるべきことがあります。
+
+行ごとにブッシェルの量が異なることに気付きましたか?1ブッシェルあたりの価格を表示するためには、計算して価格を標準化する必要があります。
+
+1. new_pumpkinsデータフレームを作成するブロックの後に以下の行を追加します。
+
+ ```python
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/(1 + 1/9)
+
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price/(1/2)
+ ```
+
+✅ [The Spruce Eats](https://www.thespruceeats.com/how-much-is-a-bushel-1389308) によると、ブッシェルの重さは体積を測るものなので、農産物の種類によって異なります。例えば、トマトの1ブッシェルは、56ポンドの重さになるとされています。葉っぱや野菜は重量が少なくてもスペースを取るので、ほうれん草の1ブッシェルはたったの20ポンドです。なんだか複雑ですね!ブッシェルからポンドへの換算は面倒なのでやめて、ブッシェル単位で価格を決めましょう。しかし、カボチャのブッシェルについての議論は、データの性質を理解することがいかに重要であるかを示しています。
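+
+参考までに、もし仮にカボチャ1ブッシェルの重さを45ポンドとするなら(この45という数字はあくまで説明用の仮の値で、実際の換算値ではありません)、1ポンドあたりの価格は次のような簡単な計算で求められます。これはあくまでスケッチであり、このレッスンの本来の手順ではありません。
+
+```python
+# スケッチ: 1ブッシェル = 45ポンドという仮定のもとで1ポンドあたりの価格を計算する例
+# (new_pumpkins は前の手順で作成済みであることを前提としています)
+ASSUMED_POUNDS_PER_BUSHEL = 45  # 説明用の仮定値
+price_per_pound = new_pumpkins['Price'] / ASSUMED_POUNDS_PER_BUSHEL
+print(price_per_pound.head())
+```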
+
+これで、ブッシェルの測定値に基づいて、ユニットごとの価格を分析することができます。もう1度データを出力してみると、標準化されていることがわかります。
+
+✅ ハーフブッシェルで売られているカボチャがとても高価なことに気付きましたか?なぜだかわかりますか?小さなカボチャは大きなカボチャよりもはるかに高価です。おそらく大きなカボチャの中身には、体積あたりで考えると空洞な部分が多く含まれると考えられます。
+
+## 可視化戦略
+
+データサイエンティストの役割の一つは、扱うデータの質や性質を示すことです。そのために、データのさまざまな側面を示す興味深いビジュアライゼーション(プロット、グラフ、チャート)を作成することがよくあります。そうすることで、他の方法では発見しにくい関係性やギャップを視覚的に示すことができます。
+
+また、可視化することでデータに適した機械学習の手法を判断することができます。例えば、散布図が直線に沿っているように見える場合は、適用する手法の候補の一つとして線形回帰が考えられます。
+
+Jupyter notebookでうまく利用できるデータ可視化ライブラリの一つに [Matplotlib](https://matplotlib.org/) があります (前のレッスンでも紹介しています)。
+
+> [こちらのチュートリアル](https://docs.microsoft.com/learn/modules/explore-analyze-data-with-python?WT.mc_id=academic-15963-cxa) でデータの可視化についてより深く体験することができます。
+
+## エクササイズ - Matplotlibの実験
+
+先ほど作成したデータフレームを表示するために、いくつか基本的なプロットを作成してみてください。折れ線グラフから何が読み取れるでしょうか?
+
+1. ファイルの先頭、Pandasのインポートの下で Matplotlibをインポートします。
+
+ ```python
+ import matplotlib.pyplot as plt
+ ```
+
+1. ノートブック全体を再実行してリフレッシュします。
+2. ノートブックの下部に、データをプロットするためのセルを追加します。
+
+ ```python
+ price = new_pumpkins.Price
+ month = new_pumpkins.Month
+ plt.scatter(price, month)
+ plt.show()
+ ```
+
+ ![価格と月の関係を示す散布図](../images/scatterplot.png)
+
+ これは役に立つプロットですか?なにか驚いたことはありますか?
+
+ これはデータをある月について、データの広がりとして表示しているだけなので、特に役に立つものではありません。
+
+### 活用できるようにする
+
+グラフに有用なデータを表示するには、通常、データを何らかの方法でグループ化する必要があります。ここでは、X軸を月として、データの分布を示すようなプロットを作ってみましょう。
+
+1. セルを追加してグループ化された棒グラフを作成します。
+
+ ```python
+ new_pumpkins.groupby(['Month'])['Price'].mean().plot(kind='bar')
+ plt.ylabel("Pumpkin Price")
+ ```
+
+ ![値段と月の関係を表した棒グラフ](../images/barchart.png)
+
+ このプロットの方が、より有用なデータを可視化しています!カボチャの価格が最も高くなるのは、9月と10月であることを示しているようです。このプロットはあなたの期待に応えるものですか?どのような点で期待通りですか?また、どのような点で期待に答えられていませんか?
+
+---
+
+## 🚀チャレンジ
+
+Matplotlibが提供する様々なタイプのビジュアライゼーションを探ってみましょう。回帰の問題にはどのタイプが最も適しているでしょうか?
+
+## [講義後クイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/12?loc=ja)
+
+## レビュー & 自主学習
+
+データを可視化するための様々な方法を見てみましょう。様々なライブラリをリストアップし、例えば2Dビジュアライゼーションと3Dビジュアライゼーションのように、特定のタイプのタスクに最適なものをメモします。どのような発見がありましたか?
+
+## 課題
+
+[ビジュアライゼーションの探求](./assignment.ja.md)
diff --git a/2-Regression/2-Data/translations/README.zh-cn.md b/2-Regression/2-Data/translations/README.zh-cn.md
new file mode 100644
index 0000000000..df542b15bf
--- /dev/null
+++ b/2-Regression/2-Data/translations/README.zh-cn.md
@@ -0,0 +1,202 @@
+# 使用Scikit-learn构建回归模型:准备和可视化数据
+
+> ![数据可视化信息图](../images/data-visualization.png)
+> 作者[Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [课前测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/11/)
+
+## 介绍
+
+既然你已经设置了开始使用Scikit-learn处理机器学习模型构建所需的工具,你就可以开始对数据提出问题了。当你处理数据并应用ML解决方案时,了解如何提出正确的问题以正确释放数据集的潜力非常重要。
+
+在本课中,你将学习:
+
+- 如何为模型构建准备数据。
+- 如何使用Matplotlib进行数据可视化。
+
+## 对你的数据提出正确的问题
+
+你需要回答的问题将决定你将使用哪种类型的ML算法。你得到的答案的质量将在很大程度上取决于你的数据的性质。
+
+查看为本课程提供的[数据](../../data/US-pumpkins.csv)。你可以在VS Code中打开这个.csv文件。快速浏览一下就会发现有空格,还有字符串和数字数据的混合。还有一个奇怪的列叫做“Package”,其中的数据是“sacks”、“bins”和其他值的混合。事实上,数据有点乱。
+
+事实上,获得一个完全准备好用于创建开箱即用的ML模型的数据集并不是很常见。在本课中,你将学习如何使用标准Python库准备原始数据集。你还将学习各种技术来可视化数据。
+
+## 案例研究:“南瓜市场”
+
+你将在`data`文件夹中找到一个名为[US-pumpkins.csv](../../data/US-pumpkins.csv)的.csv文件,其中包含有关南瓜市场的1757行数据,已按城市分组排序。这是从美国农业部分发的[特种作物终端市场标准报告](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice)中提取的原始数据。
+
+### 准备数据
+
+这些数据属于公共领域。它可以从美国农业部网站下载,每个城市有许多不同的文件。为了避免太多单独的文件,我们将所有城市数据合并到一个电子表格中,因此我们已经准备了一些数据。接下来,让我们仔细看看数据。
+
+### 南瓜数据 - 早期结论
+
+你对这些数据有什么看法?你已经看到了无法理解的字符串、数字、空格和奇怪值的混合体。
+
+你可以使用回归技术对这些数据提出什么问题?“预测给定月份内待售南瓜的价格”怎么样?再次查看数据,你需要进行一些更改才能创建任务所需的数据结构。
+
+## 练习 - 分析南瓜数据
+
+让我们使用[Pandas](https://pandas.pydata.org/),(“Python 数据分析”的意思)一个非常有用的工具,用于分析和准备南瓜数据。
+
+### 首先,检查遗漏的日期
+
+你首先需要采取以下步骤来检查缺少的日期:
+
+1. 将日期转换为月份格式(这些是美国日期,因此格式为`MM/DD/YYYY`)。
+
+2. 将月份提取到新列。
+
+在 Visual Studio Code 中打开notebook.ipynb文件,并将电子表格导入到新的Pandas dataframe中。
+
+1. 使用 `head()`函数查看前五行。
+
+ ```python
+ import pandas as pd
+    pumpkins = pd.read_csv('../data/US-pumpkins.csv')
+ pumpkins.head()
+ ```
+
+ ✅ 使用什么函数来查看最后五行?
+
+2. 检查当前dataframe中是否缺少数据:
+
+ ```python
+ pumpkins.isnull().sum()
+ ```
+
+ 有数据丢失,但可能对手头的任务来说无关紧要。
+
+3. 为了让你的dataframe更容易使用,使用`drop()`删除它的几个列,只保留你需要的列:
+
+ ```python
+ new_columns = ['Package', 'Month', 'Low Price', 'High Price', 'Date']
+ pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+ ```
+
+### 然后,确定南瓜的平均价格
+
+考虑如何确定给定月份南瓜的平均价格。你会为此任务选择哪些列?提示:你需要3列。
+
+解决方案:取`Low Price`和`High Price`列的平均值来填充新的Price列,将Date列转换成只显示月份。幸运的是,根据上面的检查,没有丢失日期或价格的数据。
+
+1. 要计算平均值,请添加以下代码:
+
+ ```python
+ price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2
+
+ month = pd.DatetimeIndex(pumpkins['Date']).month
+
+ ```
+
+ ✅ 请随意使用`print(month)`打印你想检查的任何数据。
+
+2. 现在,将转换后的数据复制到新的Pandas dataframe中:
+
+ ```python
+ new_pumpkins = pd.DataFrame({'Month': month, 'Package': pumpkins['Package'], 'Low Price': pumpkins['Low Price'],'High Price': pumpkins['High Price'], 'Price': price})
+ ```
+
+ 打印出的dataframe将向你展示一个干净整洁的数据集,你可以在此数据集上构建新的回归模型。
+
+### 但是等等!这里有点奇怪
+
+如果你看看`Package`(包装)一栏,南瓜有很多不同的配置。有的以1 1/9蒲式耳的尺寸出售,有的以1/2蒲式耳的尺寸出售,有的以每只南瓜出售,有的以每磅出售,有的以不同宽度的大盒子出售。
+
+> 南瓜似乎很难统一称重方式
+
+深入研究原始数据,有趣的是,任何`Unit of Sale`等于“EACH”或“PER BIN”的东西也具有每英寸、每箱或“每个”的`Package`类型。南瓜似乎很难采用统一称重方式,因此让我们通过仅选择`Package`列中带有字符串“蒲式耳”的南瓜来过滤它们。
+
+1. 在初始.csv导入下添加过滤器:
+
+ ```python
+ pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]
+ ```
+
+ 如果你现在打印数据,你可以看到你只获得了 415 行左右包含按蒲式耳计算的南瓜的数据。
+
+### 可是等等! 还有一件事要做
+
+你是否注意到每行的蒲式耳数量不同?你需要对定价进行标准化,以便显示每蒲式耳的定价,因此请进行一些数学计算以对其进行标准化。
+
+1. 在创建 new_pumpkins dataframe的代码块之后添加这些行:
+
+ ```python
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'), 'Price'] = price/(1 + 1/9)
+
+ new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'), 'Price'] = price/(1/2)
+ ```
+
+✅ 根据 [The Spruce Eats](https://www.thespruceeats.com/how-much-is-a-bushel-1389308),蒲式耳的重量取决于产品的类型,因为它是一种体积测量。“例如,一蒲式耳西红柿应该重56 磅……叶子和蔬菜占据更多空间,重量更轻,所以一蒲式耳菠菜只有20磅。” 这一切都相当复杂!让我们不要费心进行蒲式耳到磅的转换,而是按蒲式耳定价。然而,所有这些对蒲式耳南瓜的研究表明,了解数据的性质是多么重要!
+
+现在,你可以根据蒲式耳测量来分析每单位的定价。如果你再打印一次数据,你可以看到它是如何标准化的。
+
+✅ 你有没有注意到半蒲式耳卖的南瓜很贵?你能弄清楚为什么吗?提示:小南瓜比大南瓜贵得多,这可能是因为考虑到一个大的空心馅饼南瓜占用的未使用空间,每蒲式耳的南瓜要多得多。
+
+## 可视化策略
+
+数据科学家的部分职责是展示他们使用的数据的质量和性质。为此,他们通常会创建有趣的可视化或绘图、图形和图表,以显示数据的不同方面。通过这种方式,他们能够直观地展示难以发现的关系和差距。
+
+可视化还可以帮助确定最适合数据的机器学习技术。例如,似乎沿着一条线的散点图表明该数据是线性回归练习的良好候选者。
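+
+作为补充(这只是一个说明性的小示例,并非本课的必做步骤;假设前面创建的 new_pumpkins dataframe 仍然可用),也可以用相关系数来粗略地量化两列之间的线性关系:
+
+```python
+# 示例:计算 Month 与 Price 之间的皮尔逊相关系数
+# (假设 new_pumpkins 已在前面的步骤中创建)
+correlation = new_pumpkins['Month'].corr(new_pumpkins['Price'])
+print(correlation)
+```
+
+相关系数越接近 1 或 -1,说明两列之间的线性关系越强;接近 0 则说明线性关系很弱。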
+
+一个在Jupyter notebooks中运行良好的数据可视化库是[Matplotlib](https://matplotlib.org/)(你在上一课中也看到过)。
+
+> 在[这些教程](https://docs.microsoft.com/learn/modules/explore-analyze-data-with-python?WT.mc_id=academic-15963-cxa)中获得更多数据可视化经验。
+
+## 练习 - 使用 Matplotlib 进行实验
+
+尝试创建一些基本图形来显示你刚刚创建的新dataframe。基本线图会显示什么?
+
+1. 在文件顶部、Pandas导入语句的下方导入Matplotlib:
+
+ ```python
+ import matplotlib.pyplot as plt
+ ```
+
+2. 重新运行整个notebook以刷新。
+
+3. 在notebook底部,添加一个单元格以绘制数据:
+
+ ```python
+ price = new_pumpkins.Price
+ month = new_pumpkins.Month
+ plt.scatter(price, month)
+ plt.show()
+ ```
+
+ ![显示价格与月份关系的散点图](../images/scatterplot.png)
+
+ 这是一个有用的图吗?有什么让你吃惊的吗?
+
+    它并不是特别有用,因为它所做的只是把你的数据显示为给定月份里的一些散点分布。
+
+### 让它有用
+
+为了让图表显示有用的数据,你通常需要以某种方式对数据进行分组。让我们尝试创建一个图,其中x轴显示月份,数据展示价格的分布。
+
+1. 添加单元格以创建分组柱状图:
+
+ ```python
+ new_pumpkins.groupby(['Month'])['Price'].mean().plot(kind='bar')
+ plt.ylabel("Pumpkin Price")
+ ```
+
+ ![显示价格与月份关系的柱状图](../images/barchart.png)
+
+ 这是一个更有用的数据可视化!似乎表明南瓜的最高价格出现在9月和10月。这符合你的期望吗?为什么?为什么不?
+
+---
+
+## 🚀挑战
+
+探索Matplotlib提供的不同类型的可视化。哪种类型最适合回归问题?
+
+## [课后测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/12/)
+
+## 复习与自学
+
+请看一下可视化数据的多种方法。列出各种可用的库,并注意哪些库最适合给定类型的任务,例如2D可视化与3D可视化。你发现了什么?
+
+## 任务
+
+[探索可视化](../assignment.md)
diff --git a/2-Regression/2-Data/translations/assignment.it.md b/2-Regression/2-Data/translations/assignment.it.md
new file mode 100644
index 0000000000..14527fcae7
--- /dev/null
+++ b/2-Regression/2-Data/translations/assignment.it.md
@@ -0,0 +1,9 @@
+# Esplorazione delle visualizzazioni
+
+Sono disponibili diverse librerie per la visualizzazione dei dati. Creare alcune visualizzazioni utilizzando i dati della zucca in questa lezione con matplotlib e seaborn in un notebook di esempio. Con quali librerie è più facile lavorare?
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | --------- | -------- | ----------------- |
+| | Viene inviato un notebook con due esplorazioni/visualizzazioni | Viene inviato un notebook con una esplorazione/visualizzazione | Non è stato inviato un notebook |
diff --git a/2-Regression/2-Data/translations/assignment.ja.md b/2-Regression/2-Data/translations/assignment.ja.md
new file mode 100644
index 0000000000..09f344d61f
--- /dev/null
+++ b/2-Regression/2-Data/translations/assignment.ja.md
@@ -0,0 +1,9 @@
+# ビジュアライゼーションの探求
+
+データのビジュアライゼーションには、いくつかの異なるライブラリがあります。このレッスンのPumpkinデータを使って、matplotlibとseabornを使って、サンプルノートブックでいくつかのビジュアライゼーションを作ってみましょう。どのライブラリが作業しやすいでしょうか?
+
+## ルーブリック
+
+| 指標 | 模範的 | 適切 | 要改善 |
+| -------- | --------- | -------- | ----------------- |
+| | ノートブックには2つの活用法/可視化方法が示されている。 | ノートブックには1つの活用法/可視化方法が示されている。 | ノートブックが提出されていない。 |
diff --git a/2-Regression/2-Data/translations/assignment.zh-cn.md b/2-Regression/2-Data/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..e9c0f1c275
--- /dev/null
+++ b/2-Regression/2-Data/translations/assignment.zh-cn.md
@@ -0,0 +1,9 @@
+# 探索数据可视化
+
+有好几个库都可以进行数据可视化。用 matplotlib 和 seaborn 对本课中涉及的 Pumpkin 数据集创建一些数据可视化的图标。并思考哪个库更容易使用?
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | --------- | -------- | ----------------- |
+| | 提交了含有两种探索可视化方法的notebook工程文件 | 提交了只包含有一种探索可视化方法的notebook工程文件 | 没提交 notebook 工程文件 |
diff --git a/2-Regression/3-Linear/README.md b/2-Regression/3-Linear/README.md
index 9e0b950271..0a4e09b497 100644
--- a/2-Regression/3-Linear/README.md
+++ b/2-Regression/3-Linear/README.md
@@ -2,7 +2,7 @@
![Linear vs polynomial regression infographic](./images/linear-polynomial.png)
> Infographic by [Dasani Madipalli](https://twitter.com/dasani_decoded)
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/13/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/13/)
### Introduction
So far you have explored what regression is with sample data gathered from the pumpkin pricing dataset that we will use throughout this lesson. You have also visualized it using Matplotlib.
@@ -81,7 +81,6 @@ Since you'll use Scikit-learn, there's no reason to do this by hand (although yo
```python
from sklearn.preprocessing import LabelEncoder
-new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)
new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)
```
@@ -321,7 +320,7 @@ It does make sense, given the plot! And, if this is a better model than the prev
Test several different variables in this notebook to see how correlation corresponds to model accuracy.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/14/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)
## Review & Self Study
@@ -329,4 +328,4 @@ In this lesson we learned about Linear Regression. There are other important typ
## Assignment
-[Build a Model](assignment.md)
\ No newline at end of file
+[Build a Model](assignment.md)
diff --git a/2-Regression/3-Linear/images/janitor.jpg b/2-Regression/3-Linear/images/janitor.jpg
new file mode 100644
index 0000000000..93e6f011c5
Binary files /dev/null and b/2-Regression/3-Linear/images/janitor.jpg differ
diff --git a/2-Regression/3-Linear/images/recipes.png b/2-Regression/3-Linear/images/recipes.png
new file mode 100644
index 0000000000..7fd24b06b0
Binary files /dev/null and b/2-Regression/3-Linear/images/recipes.png differ
diff --git a/2-Regression/3-Linear/solution/lesson_3-R.ipynb b/2-Regression/3-Linear/solution/lesson_3-R.ipynb
new file mode 100644
index 0000000000..09fadc9926
--- /dev/null
+++ b/2-Regression/3-Linear/solution/lesson_3-R.ipynb
@@ -0,0 +1,1082 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+ "colab": {
+ "name": "lesson_3-R.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "name": "ir",
+ "display_name": "R"
+ },
+ "language_info": {
+ "name": "R"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Build a regression model: linear and polynomial regression models"
+ ],
+ "metadata": {
+ "id": "EgQw8osnsUV-"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Linear and Polynomial Regression for Pumpkin Pricing - Lesson 3\r\n",
+ "
\r\n",
+ " \r\n",
+ " Infographic by Dasani Madipalli\r\n",
+ "\r\n",
+ "\r\n",
+ "\r\n",
+ "\r\n",
+ "#### Introduction\r\n",
+ "\r\n",
+ "So far you have explored what regression is with sample data gathered from the pumpkin pricing dataset that we will use throughout this lesson. You have also visualized it using `ggplot2`.💪\r\n",
+ "\r\n",
+ "Now you are ready to dive deeper into regression for ML. In this lesson, you will learn more about two types of regression: *basic linear regression* and *polynomial regression*, along with some of the math underlying these techniques.\r\n",
+ "\r\n",
+ "> Throughout this curriculum, we assume minimal knowledge of math, and seek to make it accessible for students coming from other fields, so watch for notes, 🧮 callouts, diagrams, and other learning tools to aid in comprehension.\r\n",
+ "\r\n",
+ "#### Preparation\r\n",
+ "\r\n",
+ "As a reminder, you are loading this data so as to ask questions of it.\r\n",
+ "\r\n",
+ "- When is the best time to buy pumpkins?\r\n",
+ "\r\n",
+ "- What price can I expect of a case of miniature pumpkins?\r\n",
+ "\r\n",
+ "- Should I buy them in half-bushel baskets or by the 1 1/9 bushel box? Let's keep digging into this data.\r\n",
+ "\r\n",
+ "In the previous lesson, you created a `tibble` (a modern reimagining of the data frame) and populated it with part of the original dataset, standardizing the pricing by the bushel. By doing that, however, you were only able to gather about 400 data points and only for the fall months. Maybe we can get a little more detail about the nature of the data by cleaning it more? We'll see... 🕵️♀️\r\n",
+ "\r\n",
+ "For this task, we'll require the following packages:\r\n",
+ "\r\n",
+    "- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!\r\n",
+ "\r\n",
+ "- `tidymodels`: The [tidymodels](https://www.tidymodels.org/) framework is a [collection of packages](https://www.tidymodels.org/packages/) for modeling and machine learning.\r\n",
+ "\r\n",
+ "- `janitor`: The [janitor package](https://github.com/sfirke/janitor) provides simple little tools for examining and cleaning dirty data.\r\n",
+ "\r\n",
+ "- `corrplot`: The [corrplot package](https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html) provides a visual exploratory tool on correlation matrix that supports automatic variable reordering to help detect hidden patterns among variables.\r\n",
+ "\r\n",
+ "You can have them installed as:\r\n",
+ "\r\n",
+ "`install.packages(c(\"tidyverse\", \"tidymodels\", \"janitor\", \"corrplot\"))`\r\n",
+ "\r\n",
+ "The script below checks whether you have the packages required to complete this module and installs them for you in case they are missing."
+ ],
+ "metadata": {
+ "id": "WqQPS1OAsg3H"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "suppressWarnings(if (!require(\"pacman\")) install.packages(\"pacman\"))\r\n",
+ "\r\n",
+ "pacman::p_load(tidyverse, tidymodels, janitor, corrplot)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "tA4C2WN3skCf",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "c06cd805-5534-4edc-f72b-d0d1dab96ac0"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We'll later load these awesome packages and make them available in our current R session. (This is for mere illustration, `pacman::p_load()` already did that for you)\r\n",
+ "\r\n",
+ "## 1. A linear regression line\r\n",
+ "\r\n",
+ "As you learned in Lesson 1, the goal of a linear regression exercise is to be able to plot a *line* *of* *best fit* to:\r\n",
+ "\r\n",
+ "- **Show variable relationships**. Show the relationship between variables\r\n",
+ "\r\n",
+ "- **Make predictions**. Make accurate predictions on where a new data point would fall in relationship to that line.\r\n",
+ "\r\n",
+ "To draw this type of line, we use a statistical technique called **Least-Squares Regression**. The term `least-squares` means that all the data points surrounding the regression line are squared and then added up. Ideally, that final sum is as small as possible, because we want a low number of errors, or `least-squares`. As such, the line of best fit is the line that gives us the lowest value for the sum of the squared errors - hence the name *least squares regression*.\r\n",
+ "\r\n",
+ "We do so since we want to model a line that has the least cumulative distance from all of our data points. We also square the terms before adding them since we are concerned with its magnitude rather than its direction.\r\n",
+ "\r\n",
+ "> **🧮 Show me the math**\r\n",
+ ">\r\n",
+ "> This line, called the *line of best fit* can be expressed by [an equation](https://en.wikipedia.org/wiki/Simple_linear_regression):\r\n",
+ ">\r\n",
+ "> Y = a + bX\r\n",
+ ">\r\n",
+ "> `X` is the '`explanatory variable` or `predictor`'. `Y` is the '`dependent variable` or `outcome`'. The slope of the line is `b` and `a` is the y-intercept, which refers to the value of `Y` when `X = 0`.\r\n",
+ ">\r\n",
+ "\r\n",
+ "> ![](../images/slope.png \"slope = $y/x$\")\r\n",
+ " Infographic by Jen Looper\r\n",
+ ">\r\n",
+ "> First, calculate the slope `b`.\r\n",
+ ">\r\n",
+ "> In other words, and referring to our pumpkin data's original question: \"predict the price of a pumpkin per bushel by month\", `X` would refer to the price and `Y` would refer to the month of sale.\r\n",
+ ">\r\n",
+ "> ![](../images/calculation.png)\r\n",
+ " Infographic by Jen Looper\r\n",
+ "> \r\n",
+ "> Calculate the value of Y. If you're paying around \\$4, it must be April!\r\n",
+ ">\r\n",
+ "> The math that calculates the line must demonstrate the slope of the line, which is also dependent on the intercept, or where `Y` is situated when `X = 0`.\r\n",
+ ">\r\n",
+ "> You can observe the method of calculation for these values on the [Math is Fun](https://www.mathsisfun.com/data/least-squares-regression.html) web site. Also visit [this Least-squares calculator](https://www.mathsisfun.com/data/least-squares-calculator.html) to watch how the numbers' values impact the line.\r\n",
+ "\r\n",
+ "Not so scary, right? 🤓\r\n",
+ "\r\n",
+ "#### Correlation\r\n",
+ "\r\n",
+ "One more term to understand is the **Correlation Coefficient** between given X and Y variables. Using a scatterplot, you can quickly visualize this coefficient. A plot with datapoints scattered in a neat line have high correlation, but a plot with datapoints scattered everywhere between X and Y have a low correlation.\r\n",
+ "\r\n",
+ "A good linear regression model will be one that has a high (nearer to 1 than 0) Correlation Coefficient using the Least-Squares Regression method with a line of regression.\r\n",
+ "\r\n"
+ ],
+ "metadata": {
+ "id": "cdX5FRpvsoP5"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## **2. A dance with data: creating a data frame that will be used for modelling**\r\n",
+ "\r\n",
+ "
\r\n",
+ " \r\n",
+ " Artwork by @allison_horst\r\n",
+ "\r\n",
+ "\r\n",
+ ""
+ ],
+ "metadata": {
+ "id": "WdUKXk7Bs8-V"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Load up required libraries and dataset. Convert the data to a data frame containing a subset of the data:\n",
+ "\n",
+ "- Only get pumpkins priced by the bushel\n",
+ "\n",
+ "- Convert the date to a month\n",
+ "\n",
+ "- Calculate the price to be an average of high and low prices\n",
+ "\n",
+ "- Convert the price to reflect the pricing by bushel quantity\n",
+ "\n",
+ "> We covered these steps in the [previous lesson](https://github.com/microsoft/ML-For-Beginners/blob/main/2-Regression/2-Data/solution/lesson_2-R.ipynb)."
+ ],
+ "metadata": {
+ "id": "fMCtu2G2s-p8"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Load the core Tidyverse packages\r\n",
+ "library(tidyverse)\r\n",
+ "library(lubridate)\r\n",
+ "\r\n",
+ "# Import the pumpkins data\r\n",
+ "pumpkins <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Get a glimpse and dimensions of the data\r\n",
+ "glimpse(pumpkins)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print the first 50 rows of the data set\r\n",
+ "pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "ryMVZEEPtERn"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+    "In the spirit of sheer adventure, let's explore the [`janitor package`](https://github.com/sfirke/janitor) that provides simple functions for examining and cleaning dirty data. For instance, let's take a look at the column names for our data:"
+ ],
+ "metadata": {
+ "id": "xcNxM70EtJjb"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Return column names\r\n",
+ "pumpkins %>% \r\n",
+ " names()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "5XtpaIigtPfW"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "🤔 We can do better. Let's make these column names `friendR` by converting them to the [snake_case](https://en.wikipedia.org/wiki/Snake_case) convention using `janitor::clean_names`. To find out more about this function: `?clean_names`"
+ ],
+ "metadata": {
+ "id": "IbIqrMINtSHe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Clean names to the snake_case convention\r\n",
+ "pumpkins <- pumpkins %>% \r\n",
+ " clean_names(case = \"snake\")\r\n",
+ "\r\n",
+ "# Return column names\r\n",
+ "pumpkins %>% \r\n",
+ " names()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "a2uYvclYtWvX"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Much tidyR 🧹! Now, a dance with the data using `dplyr` as in the previous lesson! 💃\n"
+ ],
+ "metadata": {
+ "id": "HfhnuzDDtaDd"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Select desired columns\r\n",
+ "pumpkins <- pumpkins %>% \r\n",
+ " select(variety, city_name, package, low_price, high_price, date)\r\n",
+ "\r\n",
+ "\r\n",
+ "\r\n",
+ "# Extract the month from the dates to a new column\r\n",
+ "pumpkins <- pumpkins %>%\r\n",
+ " mutate(date = mdy(date),\r\n",
+ " month = month(date)) %>% \r\n",
+ " select(-date)\r\n",
+ "\r\n",
+ "\r\n",
+ "\r\n",
+ "# Create a new column for average Price\r\n",
+ "pumpkins <- pumpkins %>% \r\n",
+ " mutate(price = (low_price + high_price)/2)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Retain only pumpkins with the string \"bushel\"\r\n",
+ "new_pumpkins <- pumpkins %>% \r\n",
+ " filter(str_detect(string = package, pattern = \"bushel\"))\r\n",
+ "\r\n",
+ "\r\n",
+ "# Normalize the pricing so that you show the pricing per bushel, not per 1 1/9 or 1/2 bushel\r\n",
+ "new_pumpkins <- new_pumpkins %>% \r\n",
+ " mutate(price = case_when(\r\n",
+ " str_detect(package, \"1 1/9\") ~ price/(1.1),\r\n",
+ " str_detect(package, \"1/2\") ~ price*2,\r\n",
+ " TRUE ~ price))\r\n",
+ "\r\n",
+ "# Relocate column positions\r\n",
+ "new_pumpkins <- new_pumpkins %>% \r\n",
+ " relocate(month, .before = variety)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Display the first 5 rows\r\n",
+ "new_pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "X0wU3gQvtd9f"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Good job!👌 You now have a clean, tidy data set on which you can build your new regression model!\n",
+ "\n",
+ "Mind a scatter plot?\n"
+ ],
+ "metadata": {
+ "id": "UpaIwaxqth82"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Set theme\r\n",
+ "theme_set(theme_light())\r\n",
+ "\r\n",
+ "# Make a scatter plot of month and price\r\n",
+ "new_pumpkins %>% \r\n",
+ " ggplot(mapping = aes(x = month, y = price)) +\r\n",
+ " geom_point(size = 1.6)\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "DXgU-j37tl5K"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "A scatter plot reminds us that we only have month data from August through December. We probably need more data to be able to draw conclusions in a linear fashion.\n",
+ "\n",
+ "Let's take a look at our modelling data again:"
+ ],
+ "metadata": {
+ "id": "Ve64wVbwtobI"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Display first 5 rows\r\n",
+ "new_pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "HFQX2ng1tuSJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "What if we wanted to predict the `price` of a pumpkin based on the `city` or `package` columns which are of type character? Or even more simply, how could we find the correlation (which requires both of its inputs to be numeric) between, say, `package` and `price`? 🤷🤷\n",
+ "\n",
+ "Machine learning models work best with numeric features rather than text values, so you generally need to convert categorical features into numeric representations.\n",
+ "\n",
+ "This means that we have to find a way to reformat our predictors to make them easier for a model to use effectively, a process known as `feature engineering`."
+ ],
+ "metadata": {
+ "id": "7hsHoxsStyjJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Preprocessing data for modelling with recipes 👩🍳👨🍳\n",
+ "\n",
+ "Activities that reformat predictor values to make them easier for a model to use effectively has been termed `feature engineering`.\n",
+ "\n",
+ "Different models have different preprocessing requirements. For instance, least squares requires `encoding categorical variables` such as month, variety and city_name. This simply involves `translating` a column with `categorical values` into one or more `numeric columns` that take the place of the original.\n",
+ "\n",
+ "For example, suppose your data includes the following categorical feature:\n",
+ "\n",
+ "| city |\n",
+ "|:-------:|\n",
+ "| Denver |\n",
+ "| Nairobi |\n",
+ "| Tokyo |\n",
+ "\n",
+ "You can apply *ordinal encoding* to substitute a unique integer value for each category, like this:\n",
+ "\n",
+ "| city |\n",
+ "|:----:|\n",
+ "| 0 |\n",
+ "| 1 |\n",
+ "| 2 |\n",
+ "\n",
+ "And that's what we'll do to our data!\n",
+ "\n",
+ "In this section, we'll explore another amazing Tidymodels package: [recipes](https://tidymodels.github.io/recipes/) - which is designed to help you preprocess your data **before** training your model. At its core, a recipe is an object that defines what steps should be applied to a data set in order to get it ready for modelling.\n",
+ "\n",
+ "Now, let's create a recipe that prepares our data for modelling by substituting a unique integer for all the observations in the predictor columns:"
+ ],
+ "metadata": {
+ "id": "AD5kQbcvt3Xl"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Specify a recipe\r\n",
+ "pumpkins_recipe <- recipe(price ~ ., data = new_pumpkins) %>% \r\n",
+ " step_integer(all_predictors(), zero_based = TRUE)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print out the recipe\r\n",
+ "pumpkins_recipe"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "BNaFKXfRt9TU"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Awesome! 👏 We just created our first recipe that specifies an outcome (price) and its corresponding predictors and that all the predictor columns should be encoded into a set of integers 🙌! Let's quickly break it down:\r\n",
+ "\r\n",
+ "- The call to `recipe()` with a formula tells the recipe the *roles* of the variables using `new_pumpkins` data as the reference. For instance the `price` column has been assigned an `outcome` role while the rest of the columns have been assigned a `predictor` role.\r\n",
+ "\r\n",
+ "- `step_integer(all_predictors(), zero_based = TRUE)` specifies that all the predictors should be converted into a set of integers with the numbering starting at 0.\r\n",
+ "\r\n",
+ "We are sure you may be having thoughts such as: \"This is so cool!! But what if I needed to confirm that the recipes are doing exactly what I expect them to do? 🤔\"\r\n",
+ "\r\n",
+ "That's an awesome thought! You see, once your recipe is defined, you can estimate the parameters required to actually preprocess the data, and then extract the processed data. You don't typically need to do this when you use Tidymodels (we'll see the normal convention in just a minute-\\> `workflows`) but it can come in handy when you want to do some kind of sanity check for confirming that recipes are doing what you expect.\r\n",
+ "\r\n",
+ "For that, you'll need two more verbs: `prep()` and `bake()` and as always, our little R friends by [`Allison Horst`](https://github.com/allisonhorst/stats-illustrations) help you in understanding this better!\r\n",
+ "\r\n",
+ "
\r\n",
+ " \r\n",
+ " Artwork by @allison_horst\r\n",
+ "\r\n",
+ "\r\n",
+ ""
+ ],
+ "metadata": {
+ "id": "KEiO0v7kuC9O"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "[`prep()`](https://recipes.tidymodels.org/reference/prep.html): estimates the required parameters from a training set that can be later applied to other data sets. For instance, for a given predictor column, what observation will be assigned integer 0 or 1 or 2 etc\n",
+ "\n",
+ "[`bake()`](https://recipes.tidymodels.org/reference/bake.html): takes a prepped recipe and applies the operations to any data set.\n",
+ "\n",
+ "That said, lets prep and bake our recipes to really confirm that under the hood, the predictor columns will be first encoded before a model is fit."
+ ],
+ "metadata": {
+ "id": "Q1xtzebuuTCP"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Prep the recipe\r\n",
+ "pumpkins_prep <- prep(pumpkins_recipe)\r\n",
+ "\r\n",
+ "# Bake the recipe to extract a preprocessed new_pumpkins data\r\n",
+ "baked_pumpkins <- bake(pumpkins_prep, new_data = NULL)\r\n",
+ "\r\n",
+ "# Print out the baked data set\r\n",
+ "baked_pumpkins %>% \r\n",
+ " slice_head(n = 10)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "FGBbJbP_uUUn"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+    "Woo-hoo!🥳 The processed data `baked_pumpkins` has all its predictors encoded, confirming that indeed the preprocessing steps defined as our recipe will work as expected. This makes it harder for you to read but much more intelligible for Tidymodels! Take some time to find out what observation has been mapped to a corresponding integer.\n",
+ "\n",
+ "It is also worth mentioning that `baked_pumpkins` is a data frame that we can perform computations on.\n",
+ "\n",
+ "For instance, let's try to find a good correlation between two points of your data to potentially build a good predictive model. We'll use the function `cor()` to do this. Type `?cor()` to find out more about the function."
+ ],
+ "metadata": {
+ "id": "1dvP0LBUueAW"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Find the correlation between the city_name and the price\r\n",
+ "cor(baked_pumpkins$city_name, baked_pumpkins$price)\r\n",
+ "\r\n",
+ "# Find the correlation between the package and the price\r\n",
+ "cor(baked_pumpkins$package, baked_pumpkins$price)\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "3bQzXCjFuiSV"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "As it turns out, there's only weak correlation between the City and Price. However there's a bit better correlation between the Package and its Price. That makes sense, right? Normally, the bigger the produce box, the higher the price.\n",
+ "\n",
+ "While we are at it, let's also try and visualize a correlation matrix of all the columns using the `corrplot` package."
+ ],
+ "metadata": {
+ "id": "BToPWbgjuoZw"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Load the corrplot package\r\n",
+ "library(corrplot)\r\n",
+ "\r\n",
+ "# Obtain correlation matrix\r\n",
+ "corr_mat <- cor(baked_pumpkins %>% \r\n",
+ " # Drop columns that are not really informative\r\n",
+ " select(-c(low_price, high_price)))\r\n",
+ "\r\n",
+ "# Make a correlation plot between the variables\r\n",
+ "corrplot(corr_mat, method = \"shade\", shade.col = NA, tl.col = \"black\", tl.srt = 45, addCoef.col = \"black\", cl.pos = \"n\", order = \"original\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "ZwAL3ksmutVR"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "🤩🤩 Much better.\r\n",
+ "\r\n",
+ "A good question to now ask of this data will be: '`What price can I expect of a given pumpkin package?`' Let's get right into it!\r\n",
+ "\r\n",
+ "> Note: When you **`bake()`** the prepped recipe **`pumpkins_prep`** with **`new_data = NULL`**, you extract the processed (i.e. encoded) training data. If you had another data set for example a test set and would want to see how a recipe would pre-process it, you would simply bake **`pumpkins_prep`** with **`new_data = test_set`**\r\n",
+ "\r\n",
+ "## 4. Build a linear regression model\r\n",
+ "\r\n",
+ "
\r\n",
+ " \r\n",
+ " Infographic by Dasani Madipalli\r\n",
+ "\r\n",
+ "\r\n",
+ ""
+ ],
+ "metadata": {
+ "id": "YqXjLuWavNxW"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+    "Now that we have built a recipe, and actually confirmed that the data will be pre-processed appropriately, let's build a regression model to answer the question: `What price can I expect of a given pumpkin package?`\n",
+ "\n",
+ "#### Train a linear regression model using the training set\n",
+ "\n",
+ "As you may have already figured out, the column *price* is the `outcome` variable while the *package* column is the `predictor` variable.\n",
+ "\n",
+ "To do this, we'll first split the data such that 80% goes into training and 20% into test set, then define a recipe that will encode the predictor column into a set of integers, then build a model specification. We won't prep and bake our recipe since we already know it will preprocess the data as expected."
+ ],
+ "metadata": {
+ "id": "Pq0bSzCevW-h"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "set.seed(2056)\r\n",
+ "# Split the data into training and test sets\r\n",
+ "pumpkins_split <- new_pumpkins %>% \r\n",
+ " initial_split(prop = 0.8)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Extract training and test data\r\n",
+ "pumpkins_train <- training(pumpkins_split)\r\n",
+ "pumpkins_test <- testing(pumpkins_split)\r\n",
+ "\r\n",
+ "\r\n",
+ "\r\n",
+ "# Create a recipe for preprocessing the data\r\n",
+ "lm_pumpkins_recipe <- recipe(price ~ package, data = pumpkins_train) %>% \r\n",
+ " step_integer(all_predictors(), zero_based = TRUE)\r\n",
+ "\r\n",
+ "\r\n",
+ "\r\n",
+ "# Create a linear model specification\r\n",
+ "lm_spec <- linear_reg() %>% \r\n",
+ " set_engine(\"lm\") %>% \r\n",
+ " set_mode(\"regression\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "CyoEh_wuvcLv"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Good job! Now that we have a recipe and a model specification, we need to find a way of bundling them together into an object that will first preprocess the data (prep+bake behind the scenes), fit the model on the preprocessed data and also allow for potential post-processing activities. How's that for your peace of mind!🤩\n",
+ "\n",
+ "In Tidymodels, this convenient object is called a [`workflow`](https://workflows.tidymodels.org/) and conveniently holds your modeling components! This is what we'd call *pipelines* in *Python*.\n",
+ "\n",
+ "So let's bundle everything up into a workflow!📦"
+ ],
+ "metadata": {
+ "id": "G3zF_3DqviFJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Hold modelling components in a workflow\r\n",
+ "lm_wf <- workflow() %>% \r\n",
+ " add_recipe(lm_pumpkins_recipe) %>% \r\n",
+ " add_model(lm_spec)\r\n",
+ "\r\n",
+ "# Print out the workflow\r\n",
+ "lm_wf"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "T3olroU3v-WX"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "👌 Into the bargain, a workflow can be fit/trained in much the same way a model can."
+ ],
+ "metadata": {
+ "id": "zd1A5tgOwEPX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Train the model\r\n",
+ "lm_wf_fit <- lm_wf %>% \r\n",
+ " fit(data = pumpkins_train)\r\n",
+ "\r\n",
+ "# Print the model coefficients learned \r\n",
+ "lm_wf_fit"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "NhJagFumwFHf"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "From the model output, we can see the coefficients learned during training. They represent the coefficients of the line of best fit that gives us the lowest overall error between the actual and predicted variable.\n",
+ "\n",
+ "\n",
+ "#### Evaluate model performance using the test set\n",
+ "\n",
+ "It's time to see how the model performed 📏! How do we do this?\n",
+ "\n",
+ "Now that we've trained the model, we can use it to make predictions for the test_set using `parsnip::predict()`. Then we can compare these predictions to the actual label values to evaluate how well (or not!) the model is working.\n",
+ "\n",
+ "Let's start with making predictions for the test set then bind the columns to the test set."
+ ],
+ "metadata": {
+ "id": "_4QkGtBTwItF"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make predictions for the test set\r\n",
+ "predictions <- lm_wf_fit %>% \r\n",
+ " predict(new_data = pumpkins_test)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Bind predictions to the test set\r\n",
+ "lm_results <- pumpkins_test %>% \r\n",
+ " select(c(package, price)) %>% \r\n",
+ " bind_cols(predictions)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print the first ten rows of the tibble\r\n",
+ "lm_results %>% \r\n",
+ " slice_head(n = 10)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "UFZzTG0gwTs9"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "Yes, you have just trained a model and used it to make predictions!🔮 Is it any good, let's evaluate the model's performance!\n",
+ "\n",
+ "In Tidymodels, we do this using `yardstick::metrics()`! For linear regression, let's focus on the following metrics:\n",
+ "\n",
+ "- `Root Mean Square Error (RMSE)`: The square root of the [MSE](https://en.wikipedia.org/wiki/Mean_squared_error). This yields an absolute metric in the same unit as the label (in this case, the price of a pumpkin). The smaller the value, the better the model (in a simplistic sense, it represents the average price by which the predictions are wrong!)\n",
+ "\n",
+ "- `Coefficient of Determination (usually known as R-squared or R2)`: A relative metric in which the higher the value, the better the fit of the model. In essence, this metric represents how much of the variance between predicted and actual label values the model is able to explain."
+ ],
+ "metadata": {
+ "id": "0A5MjzM7wW9M"
+ }
+ },
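+ {
+ "cell_type": "markdown",
+ "source": [
+ "To make those definitions concrete, here's a quick sketch that computes both quantities by hand from `lm_results`; `yardstick::metrics()` in the next cell should report closely matching numbers."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Sketch: compute RMSE and R-squared by hand as a sanity check\r\n",
+ "lm_results %>% \r\n",
+ "  summarise(rmse = sqrt(mean((price - .pred)^2)),\r\n",
+ "            rsq = cor(price, .pred)^2)"
+ ],
+ "outputs": [],
+ "metadata": {}
+ },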
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Evaluate performance of linear regression\r\n",
+ "metrics(data = lm_results,\r\n",
+ " truth = price,\r\n",
+ " estimate = .pred)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "reJ0UIhQwcEH"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "There goes the model performance. Let's see if we can get a better indication by visualizing a scatter plot of the package and price then use the predictions made to overlay a line of best fit.\n",
+ "\n",
+ "This means we'll have to prep and bake the test set in order to encode the package column then bind this to the predictions made by our model."
+ ],
+ "metadata": {
+ "id": "fdgjzjkBwfWt"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Encode package column\r\n",
+ "package_encode <- lm_pumpkins_recipe %>% \r\n",
+ " prep() %>% \r\n",
+ " bake(new_data = pumpkins_test) %>% \r\n",
+ " select(package)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Bind encoded package column to the results\r\n",
+ "lm_results <- lm_results %>% \r\n",
+ " bind_cols(package_encode %>% \r\n",
+ " rename(package_integer = package)) %>% \r\n",
+ " relocate(package_integer, .after = package)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print new results data frame\r\n",
+ "lm_results %>% \r\n",
+ " slice_head(n = 5)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Make a scatter plot\r\n",
+ "lm_results %>% \r\n",
+ " ggplot(mapping = aes(x = package_integer, y = price)) +\r\n",
+ " geom_point(size = 1.6) +\r\n",
+ " # Overlay a line of best fit\r\n",
+ " geom_line(aes(y = .pred), color = \"orange\", size = 1.2) +\r\n",
+ " xlab(\"package\")\r\n",
+ " \r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "R0nw719lwkHE"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Great! As you can see, the linear regression model does not really well generalize the relationship between a package and its corresponding price.\r\n",
+ "\r\n",
+ "🎃 Congratulations, you just created a model that can help predict the price of a few varieties of pumpkins. Your holiday pumpkin patch will be beautiful. But you can probably create a better model!\r\n",
+ "\r\n",
+ "## 5. Build a polynomial regression model\r\n",
+ "\r\n",
+ "
\r\n",
+ " \r\n",
+ " Infographic by Dasani Madipalli\r\n",
+ "\r\n",
+ "\r\n",
+ ""
+ ],
+ "metadata": {
+ "id": "HOCqJXLTwtWI"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Sometimes our data may not have a linear relationship, but we still want to predict an outcome. Polynomial regression can help us make predictions for more complex non-linear relationships.\n",
+ "\n",
+ "Take for instance the relationship between the package and price for our pumpkins data set. While sometimes there's a linear relationship between variables - the bigger the pumpkin in volume, the higher the price - sometimes these relationships can't be plotted as a plane or straight line.\n",
+ "\n",
+ "> ✅ Here are [some more examples](https://online.stat.psu.edu/stat501/lesson/9/9.8) of data that could use polynomial regression\n",
+ ">\n",
+ "> Take another look at the relationship between Variety to Price in the previous plot. Does this scatterplot seem like it should necessarily be analyzed by a straight line? Perhaps not. In this case, you can try polynomial regression.\n",
+ ">\n",
+ "> ✅ Polynomials are mathematical expressions that might consist of one or more variables and coefficients\n",
+ "\n",
+ "#### Train a polynomial regression model using the training set\n",
+ "\n",
+ "Polynomial regression creates a *curved line* to better fit nonlinear data.\n",
+ "\n",
+ "Let's see whether a polynomial model will perform better in making predictions. We'll follow a somewhat similar procedure as we did before:\n",
+ "\n",
+ "- Create a recipe that specifies the preprocessing steps that should be carried out on our data to get it ready for modelling i.e: encoding predictors and computing polynomials of degree *n*\n",
+ "\n",
+ "- Build a model specification\n",
+ "\n",
+ "- Bundle the recipe and model specification into a workflow\n",
+ "\n",
+ "- Create a model by fitting the workflow\n",
+ "\n",
+ "- Evaluate how well the model performs on the test data\n",
+ "\n",
+ "Let's get right into it!\n"
+ ],
+ "metadata": {
+ "id": "VcEIpRV9wzYr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Specify a recipe\r\n",
+ "poly_pumpkins_recipe <-\r\n",
+ " recipe(price ~ package, data = pumpkins_train) %>%\r\n",
+ " step_integer(all_predictors(), zero_based = TRUE) %>% \r\n",
+ " step_poly(all_predictors(), degree = 4)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Create a model specification\r\n",
+ "poly_spec <- linear_reg() %>% \r\n",
+ " set_engine(\"lm\") %>% \r\n",
+ " set_mode(\"regression\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Bundle recipe and model spec into a workflow\r\n",
+ "poly_wf <- workflow() %>% \r\n",
+ " add_recipe(poly_pumpkins_recipe) %>% \r\n",
+ " add_model(poly_spec)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Create a model\r\n",
+ "poly_wf_fit <- poly_wf %>% \r\n",
+ " fit(data = pumpkins_train)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print learned model coefficients\r\n",
+ "poly_wf_fit\r\n",
+ "\r\n",
+ " "
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "63n_YyRXw3CC"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### Evaluate model performance\n",
+ "\n",
+ "👏👏You've built a polynomial model let's make predictions on the test set!"
+ ],
+ "metadata": {
+ "id": "-LHZtztSxDP0"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make price predictions on test data\r\n",
+ "poly_results <- poly_wf_fit %>% predict(new_data = pumpkins_test) %>% \r\n",
+ " bind_cols(pumpkins_test %>% select(c(package, price))) %>% \r\n",
+ " relocate(.pred, .after = last_col())\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print the results\r\n",
+ "poly_results %>% \r\n",
+ " slice_head(n = 10)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "YUFpQ_dKxJGx"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Woo-hoo, let's evaluate how the model performed on the test_set using `yardstick::metrics()`."
+ ],
+ "metadata": {
+ "id": "qxdyj86bxNGZ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "metrics(data = poly_results, truth = price, estimate = .pred)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "8AW5ltkBxXDm"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "🤩🤩 Much better performance.\n",
+ "\n",
+ "The `rmse` decreased from about 7. to about 3. an indication that of a reduced error between the actual price and the predicted price. You can *loosely* interpret this as meaning that on average, incorrect predictions are wrong by around \\$3. The `rsq` increased from about 0.4 to 0.8.\n",
+ "\n",
+ "All these metrics indicate that the polynomial model performs way better than the linear model. Good job!\n",
+ "\n",
+ "Let's see if we can visualize this!"
+ ],
+ "metadata": {
+ "id": "6gLHNZDwxYaS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Bind encoded package column to the results\r\n",
+ "poly_results <- poly_results %>% \r\n",
+ " bind_cols(package_encode %>% \r\n",
+ " rename(package_integer = package)) %>% \r\n",
+ " relocate(package_integer, .after = package)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print new results data frame\r\n",
+ "poly_results %>% \r\n",
+ " slice_head(n = 5)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Make a scatter plot\r\n",
+ "poly_results %>% \r\n",
+ " ggplot(mapping = aes(x = package_integer, y = price)) +\r\n",
+ " geom_point(size = 1.6) +\r\n",
+ " # Overlay a line of best fit\r\n",
+ " geom_line(aes(y = .pred), color = \"midnightblue\", size = 1.2) +\r\n",
+ " xlab(\"package\")\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "A83U16frxdF1"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "You can see a curved line that fits your data better! 🤩\n",
+ "\n",
+ "You can make this more smoother by passing a polynomial formula to `geom_smooth` like this:"
+ ],
+ "metadata": {
+ "id": "4U-7aHOVxlGU"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make a scatter plot\r\n",
+ "poly_results %>% \r\n",
+ " ggplot(mapping = aes(x = package_integer, y = price)) +\r\n",
+ " geom_point(size = 1.6) +\r\n",
+ " # Overlay a line of best fit\r\n",
+ " geom_smooth(method = lm, formula = y ~ poly(x, degree = 4), color = \"midnightblue\", size = 1.2, se = FALSE) +\r\n",
+ " xlab(\"package\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "5vzNT0Uexm-w"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Much like a smooth curve!🤩\n",
+ "\n",
+ "Here's how you would make a new prediction:"
+ ],
+ "metadata": {
+ "id": "v9u-wwyLxq4G"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make a hypothetical data frame\r\n",
+ "hypo_tibble <- tibble(package = \"bushel baskets\")\r\n",
+ "\r\n",
+ "# Make predictions using linear model\r\n",
+ "lm_pred <- lm_wf_fit %>% predict(new_data = hypo_tibble)\r\n",
+ "\r\n",
+ "# Make predictions using polynomial model\r\n",
+ "poly_pred <- poly_wf_fit %>% predict(new_data = hypo_tibble)\r\n",
+ "\r\n",
+ "# Return predictions in a list\r\n",
+ "list(\"linear model prediction\" = lm_pred, \r\n",
+ " \"polynomial model prediction\" = poly_pred)\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "jRPSyfQGxuQv"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The `polynomial model` prediction does make sense, given the scatter plots of `price` and `package`! And, if this is a better model than the previous one, looking at the same data, you need to budget for these more expensive pumpkins!\n",
+ "\n",
+ "🏆 Well done! You created two regression models in one lesson. In the final section on regression, you will learn about logistic regression to determine categories.\n",
+ "\n",
+ "## **🚀Challenge**\n",
+ "\n",
+ "Test several different variables in this notebook to see how correlation corresponds to model accuracy.\n",
+ "\n",
+ "## [**Post-lecture quiz**](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)\n",
+ "\n",
+ "## **Review & Self Study**\n",
+ "\n",
+ "In this lesson we learned about Linear Regression. There are other important types of Regression. Read about Stepwise, Ridge, Lasso and Elasticnet techniques. A good course to study to learn more is the [Stanford Statistical Learning course](https://online.stanford.edu/courses/sohs-ystatslearning-statistical-learning)\n",
+ "\n",
+ "If you want to learn more about how to use the amazing Tidymodels framework, please check out the following resources:\n",
+ "\n",
+ "- Tidymodels website: [Get started with Tidymodels](https://www.tidymodels.org/start/)\n",
+ "\n",
+ "- Max Kuhn and Julia Silge, [*Tidy Modeling with R*](https://www.tmwr.org/)*.*\n",
+ "\n",
+ "###### **THANK YOU TO:**\n",
+ "\n",
+ "[Allison Horst](https://twitter.com/allison_horst?lang=en) for creating the amazing illustrations that make R more welcoming and engaging. Find more illustrations at her [gallery](https://www.google.com/url?q=https://github.com/allisonhorst/stats-illustrations&sa=D&source=editors&ust=1626380772530000&usg=AOvVaw3zcfyCizFQZpkSLzxiiQEM).\n"
+ ],
+ "metadata": {
+ "id": "8zOLOWqMxzk5"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/2-Regression/3-Linear/solution/lesson_3.Rmd b/2-Regression/3-Linear/solution/lesson_3.Rmd
new file mode 100644
index 0000000000..91968a0656
--- /dev/null
+++ b/2-Regression/3-Linear/solution/lesson_3.Rmd
@@ -0,0 +1,679 @@
+---
+title: 'Build a regression model: linear and polynomial regression models'
+output:
+ html_document:
+ df_print: paged
+ theme: flatly
+ highlight: breezedark
+ toc: yes
+ toc_float: yes
+ code_download: yes
+---
+
+## Linear and Polynomial Regression for Pumpkin Pricing - Lesson 3
+
+![Infographic by Dasani Madipalli](../images/linear-polynomial.png){width="800"}
+
+#### Introduction
+
+So far you have explored what regression is with sample data gathered from the pumpkin pricing dataset that we will use throughout this lesson. You have also visualized it using `ggplot2`.💪
+
+Now you are ready to dive deeper into regression for ML. In this lesson, you will learn more about two types of regression: *basic linear regression* and *polynomial regression*, along with some of the math underlying these techniques.
+
+> Throughout this curriculum, we assume minimal knowledge of math, and seek to make it accessible for students coming from other fields, so watch for notes, 🧮 callouts, diagrams, and other learning tools to aid in comprehension.
+
+#### Preparation
+
+As a reminder, you are loading this data so as to ask questions of it.
+
+- When is the best time to buy pumpkins?
+
+- What price can I expect of a case of miniature pumpkins?
+
+- Should I buy them in half-bushel baskets or by the 1 1/9 bushel box? Let's keep digging into this data.
+
+In the previous lesson, you created a `tibble` (a modern reimagining of the data frame) and populated it with part of the original dataset, standardizing the pricing by the bushel. By doing that, however, you were only able to gather about 400 data points and only for the fall months. Maybe we can get a little more detail about the nature of the data by cleaning it more? We'll see... 🕵️♀️
+
+For this task, we'll require the following packages:
+
+- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!
+
+- `tidymodels`: The [tidymodels](https://www.tidymodels.org/) framework is a [collection of packages](https://www.tidymodels.org/packages/) for modeling and machine learning.
+
+- `janitor`: The [janitor package](https://github.com/sfirke/janitor) provides simple little tools for examining and cleaning dirty data.
+
+- `corrplot`: The [corrplot package](https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html) provides a visual exploratory tool for correlation matrices that supports automatic variable reordering to help detect hidden patterns among variables.
+
+You can install them with:
+
+`install.packages(c("tidyverse", "tidymodels", "janitor", "corrplot"))`
+
+The script below checks whether you have the packages required to complete this module and installs them for you in case they are missing.
+
+```{r, message=F, warning=F}
+suppressWarnings(if (!require("pacman")) install.packages("pacman"))
+
+pacman::p_load(tidyverse, tidymodels, janitor, corrplot)
+```
+
+We'll load these awesome packages later and make them available in our current R session. (This is just for illustration; `pacman::p_load()` already did that for you.)
+
+## 1. A linear regression line
+
+As you learned in Lesson 1, the goal of a linear regression exercise is to be able to plot a *line* *of* *best fit* to:
+
+- **Show variable relationships**. Show the relationship between variables
+
+- **Make predictions**. Make accurate predictions on where a new data point would fall in relation to that line.
+
+To draw this type of line, we use a statistical technique called **Least-Squares Regression**. The term `least-squares` means that the squared distances of all the data points from the regression line are added up. Ideally, that final sum is as small as possible, because we want a low number of errors, or `least-squares`. As such, the line of best fit is the line that gives us the lowest value for the sum of the squared errors - hence the name *least squares regression*.
+
+We do so since we want to model a line that has the least cumulative distance from all of our data points. We also square the terms before adding them since we are concerned with their magnitude rather than their direction.
+
+> **🧮 Show me the math**
+>
+> This line, called the *line of best fit* can be expressed by [an equation](https://en.wikipedia.org/wiki/Simple_linear_regression):
+>
+> Y = a + bX
+>
+> `X` is the '`explanatory variable` or `predictor`'. `Y` is the '`dependent variable` or `outcome`'. The slope of the line is `b` and `a` is the y-intercept, which refers to the value of `Y` when `X = 0`.
+>
+> ![Infographic by Jen Looper](../images/slope.png){width="400"}
+>
+> First, calculate the slope `b`.
+>
+> In other words, and referring to our pumpkin data's original question: "predict the price of a pumpkin per bushel by month", `X` would refer to the price and `Y` would refer to the month of sale.
+>
+> ![Infographic by Jen Looper](../images/calculation.png)
+>
+> Calculate the value of Y. If you're paying around \$4, it must be April!
+>
+> The math that calculates the line must demonstrate the slope of the line, which is also dependent on the intercept, or where `Y` is situated when `X = 0`.
+>
+> You can observe the method of calculation for these values on the [Math is Fun](https://www.mathsisfun.com/data/least-squares-regression.html) web site. Also visit [this Least-squares calculator](https://www.mathsisfun.com/data/least-squares-calculator.html) to watch how the numbers' values impact the line.
+
+Not so scary, right? 🤓
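+
+To make the least-squares idea concrete, here's a small sketch on made-up numbers (toy data, not the pumpkin set): it computes the slope and intercept by hand and checks them against R's built-in `lm()`.
+
+```{r least_squares_by_hand}
+# Toy data, purely for illustration
+x <- c(1, 2, 3, 4, 5)
+y <- c(4, 6, 5, 8, 9)
+
+# Least-squares estimates of the slope (b) and intercept (a)
+b <- sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))^2)
+a <- mean(y) - b * mean(x)
+
+c(intercept = a, slope = b)
+
+# lm() should give the same estimates
+coef(lm(y ~ x))
+```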
+
+#### Correlation
+
+One more term to understand is the **Correlation Coefficient** between given X and Y variables. Using a scatterplot, you can quickly visualize this coefficient. A plot with datapoints scattered in a neat line has high correlation, but a plot with datapoints scattered everywhere between X and Y has low correlation.
+
+A good linear regression model will be one that has a high (nearer to 1 than 0) Correlation Coefficient using the Least-Squares Regression method with a line of regression.
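+
+If you'd like to see this for yourself, here's a tiny sketch (again on toy data, not the pumpkin set) comparing the correlation coefficient for points that hug a line with points scattered at random:
+
+```{r correlation_sketch}
+set.seed(123)
+x <- 1:50
+
+# Points that fall roughly on a line vs. points unrelated to x
+neat_y <- 2 * x + rnorm(50, sd = 5)
+noisy_y <- rnorm(50, mean = 50, sd = 20)
+
+c(neat = cor(x, neat_y), noisy = cor(x, noisy_y))
+```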
+
+## **2. A dance with data: creating a data frame that will be used for modelling**
+
+![Artwork by \@allison_horst](../images/janitor.jpg){width="700"}
+
+Load up required libraries and dataset. Convert the data to a data frame containing a subset of the data:
+
+- Only get pumpkins priced by the bushel
+
+- Convert the date to a month
+
+- Calculate the price to be an average of high and low prices
+
+- Convert the price to reflect the pricing by bushel quantity
+
+> We covered these steps in the [previous lesson](https://github.com/microsoft/ML-For-Beginners/blob/main/2-Regression/2-Data/solution/lesson_2-R.ipynb).
+
+```{r load_tidy_verse_models, message=F, warning=F}
+# Load the core Tidyverse packages
+library(tidyverse)
+library(lubridate)
+
+# Import the pumpkins data
+pumpkins <- read_csv(file = "https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv")
+
+
+# Get a glimpse and dimensions of the data
+glimpse(pumpkins)
+
+
+# Print the first 5 rows of the data set
+pumpkins %>%
+ slice_head(n = 5)
+
+
+```
+
+In the spirit of sheer adventure, let's explore the [`janitor package`](https://github.com/sfirke/janitor) that provides simple functions for examining and cleaning dirty data. For instance, let's take a look at the column names for our data:
+
+```{r col_names}
+# Return column names
+pumpkins %>%
+ names()
+
+```
+
+🤔 We can do better. Let's make these column names `friendR` by converting them to the [snake_case](https://en.wikipedia.org/wiki/Snake_case) convention using `janitor::clean_names`. To find out more about this function: `?clean_names`
+
+```{r friendR}
+# Clean names to the snake_case convention
+pumpkins <- pumpkins %>%
+ clean_names(case = "snake")
+
+# Return column names
+pumpkins %>%
+ names()
+
+```
+
+Much tidyR 🧹! Now, a dance with the data using `dplyr` as in the previous lesson! 💃
+
+```{r prep_data, message=F, warning=F}
+# Select desired columns
+pumpkins <- pumpkins %>%
+ select(variety, city_name, package, low_price, high_price, date)
+
+
+
+# Extract the month from the dates to a new column
+pumpkins <- pumpkins %>%
+ mutate(date = mdy(date),
+ month = month(date)) %>%
+ select(-date)
+
+
+
+# Create a new column for average Price
+pumpkins <- pumpkins %>%
+ mutate(price = (low_price + high_price)/2)
+
+
+# Retain only pumpkins with the string "bushel"
+new_pumpkins <- pumpkins %>%
+ filter(str_detect(string = package, pattern = "bushel"))
+
+
+# Normalize the pricing so that you show the pricing per bushel, not per 1 1/9 or 1/2 bushel
+new_pumpkins <- new_pumpkins %>%
+ mutate(price = case_when(
+ str_detect(package, "1 1/9") ~ price/(1.1),
+ str_detect(package, "1/2") ~ price*2,
+ TRUE ~ price))
+
+# Relocate column positions
+new_pumpkins <- new_pumpkins %>%
+ relocate(month, .before = variety)
+
+
+# Display the first 5 rows
+new_pumpkins %>%
+ slice_head(n = 5)
+```
+
+Good job!👌 You now have a clean, tidy data set on which you can build your new regression model!
+
+Mind a scatter plot?
+
+```{r scatter_price_month}
+# Set theme
+theme_set(theme_light())
+
+# Make a scatter plot of month and price
+new_pumpkins %>%
+ ggplot(mapping = aes(x = month, y = price)) +
+ geom_point(size = 1.6)
+
+```
+
+A scatter plot reminds us that we only have month data from August through December. We probably need more data to be able to draw conclusions in a linear fashion.
+
+Let's take a look at our modelling data again:
+
+```{r modelling data}
+# Display first 5 rows
+new_pumpkins %>%
+ slice_head(n = 5)
+
+```
+
+What if we wanted to predict the `price` of a pumpkin based on the `city` or `package` columns which are of type character? Or even more simply, how could we find the correlation (which requires both of its inputs to be numeric) between, say, `package` and `price`? 🤷🤷
+
+Machine learning models work best with numeric features rather than text values, so you generally need to convert categorical features into numeric representations.
+
+This means that we have to find a way to reformat our predictors to make them easier for a model to use effectively, a process known as `feature engineering`.
+
+## 3. Preprocessing data for modelling with recipes 👩🍳👨🍳
+
+Activities that reformat predictor values to make them easier for a model to use effectively have been termed `feature engineering`.
+
+Different models have different preprocessing requirements. For instance, least squares requires `encoding categorical variables` such as month, variety and city_name. This simply involves `translating` a column with `categorical values` into one or more `numeric columns` that take the place of the original.
+
+For example, suppose your data includes the following categorical feature:
+
+| city |
+|:-------:|
+| Denver |
+| Nairobi |
+| Tokyo |
+
+You can apply *ordinal encoding* to substitute a unique integer value for each category, like this:
+
+| city |
+|:----:|
+| 0 |
+| 1 |
+| 2 |
+
+And that's what we'll do to our data!
+
+In this section, we'll explore another amazing Tidymodels package: [recipes](https://tidymodels.github.io/recipes/) - which is designed to help you preprocess your data **before** training your model. At its core, a recipe is an object that defines what steps should be applied to a data set in order to get it ready for modelling.
+
+Now, let's create a recipe that prepares our data for modelling by substituting a unique integer for all the observations in the predictor columns:
+
+```{r pumpkins_recipe}
+# Specify a recipe
+pumpkins_recipe <- recipe(price ~ ., data = new_pumpkins) %>%
+ step_integer(all_predictors(), zero_based = TRUE)
+
+
+# Print out the recipe
+pumpkins_recipe
+
+```
+
+Awesome! 👏 We just created our first recipe that specifies an outcome (price) and its corresponding predictors and that all the predictor columns should be encoded into a set of integers 🙌! Let's quickly break it down:
+
+- The call to `recipe()` with a formula tells the recipe the *roles* of the variables using `new_pumpkins` data as the reference. For instance the `price` column has been assigned an `outcome` role while the rest of the columns have been assigned a `predictor` role.
+
+- `step_integer(all_predictors(), zero_based = TRUE)` specifies that all the predictors should be converted into a set of integers with the numbering starting at 0.
+
+We are sure you may be having thoughts such as: "This is so cool!! But what if I needed to confirm that the recipes are doing exactly what I expect them to do? 🤔"
+
+That's an awesome thought! You see, once your recipe is defined, you can estimate the parameters required to actually preprocess the data, and then extract the processed data. You don't typically need to do this when you use Tidymodels (we'll see the normal convention in just a minute: `workflows`) but it can come in handy when you want to do some kind of sanity check for confirming that recipes are doing what you expect.
+
+For that, you'll need two more verbs: `prep()` and `bake()` and as always, our little R friends by [`Allison Horst`](https://github.com/allisonhorst/stats-illustrations) help you in understanding this better!
+
+![Artwork by \@allison_horst](../images/recipes.png){width="550"}
+
+[`prep()`](https://recipes.tidymodels.org/reference/prep.html): estimates the required parameters from a training set that can be later applied to other data sets. For instance, for a given predictor column, what observation will be assigned integer 0 or 1 or 2 etc
+
+[`bake()`](https://recipes.tidymodels.org/reference/bake.html): takes a prepped recipe and applies the operations to any data set.
+
+That said, let's prep and bake our recipe to confirm that, under the hood, the predictor columns will first be encoded before a model is fit.
+
+```{r prep_bake}
+# Prep the recipe
+pumpkins_prep <- prep(pumpkins_recipe)
+
+# Bake the recipe to extract a preprocessed new_pumpkins data
+baked_pumpkins <- bake(pumpkins_prep, new_data = NULL)
+
+# Print out the baked data set
+baked_pumpkins %>%
+ slice_head(n = 10)
+```
+
+Woo-hoo!🥳 The processed data `baked_pumpkins` has all its predictors encoded, confirming that the preprocessing steps defined in our recipe will work as expected. This makes it harder for you to read but much more intelligible for Tidymodels! Take some time to find out what observation has been mapped to a corresponding integer.
+
+It is also worth mentioning that `baked_pumpkins` is a data frame that we can perform computations on.
+
+For instance, let's try to find a good correlation between two variables in your data to potentially build a good predictive model. We'll use the function `cor()` to do this. Type `?cor()` to find out more about the function.
+
+```{r corr}
+# Find the correlation between the city_name and the price
+cor(baked_pumpkins$city_name, baked_pumpkins$price)
+
+# Find the correlation between the package and the price
+cor(baked_pumpkins$package, baked_pumpkins$price)
+
+```
+
+As it turns out, there's only weak correlation between the City and Price. However, there's a bit better correlation between the Package and its Price. That makes sense, right? Normally, the bigger the produce box, the higher the price.
+
+While we are at it, let's also try and visualize a correlation matrix of all the columns using the `corrplot` package.
+
+```{r corrplot}
+# Load the corrplot package
+library(corrplot)
+
+# Obtain correlation matrix
+corr_mat <- cor(baked_pumpkins %>%
+ # Drop columns that are not really informative
+ select(-c(low_price, high_price)))
+
+# Make a correlation plot between the variables
+corrplot(corr_mat, method = "shade", shade.col = NA, tl.col = "black", tl.srt = 45, addCoef.col = "black", cl.pos = "n", order = "original")
+
+```
+
+🤩🤩 Much better.
+
+A good question to now ask of this data will be: '`What price can I expect of a given pumpkin package?`' Let's get right into it!
+
+> Note: When you **`bake()`** the prepped recipe **`pumpkins_prep`** with **`new_data = NULL`**, you extract the processed (i.e. encoded) training data. If you had another data set for example a test set and would want to see how a recipe would pre-process it, you would simply bake **`pumpkins_prep`** with **`new_data = test_set`**
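+
+As a quick illustration of that note, here's a sketch that bakes the prepped recipe on a few rows of `new_pumpkins`, treating them as if they were unseen data:
+
+```{r bake_other_data}
+# For illustration only: bake the prepped recipe on a small slice of the data
+bake(pumpkins_prep, new_data = new_pumpkins %>% slice_head(n = 5))
+```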
+
+## 4. Build a linear regression model
+
+![Infographic by Dasani Madipalli](../images/linear-polynomial.png){width="800"}
+
+Now that we have built a recipe and confirmed that the data will be pre-processed appropriately, let's build a regression model to answer the question: `What price can I expect of a given pumpkin package?`
+
+#### Train a linear regression model using the training set
+
+As you may have already figured out, the column *price* is the `outcome` variable while the *package* column is the `predictor` variable.
+
+To do this, we'll first split the data such that 80% goes into training and 20% into test set, then define a recipe that will encode the predictor column into a set of integers, then build a model specification. We won't prep and bake our recipe since we already know it will preprocess the data as expected.
+
+```{r lm_rec_spec}
+set.seed(2056)
+# Split the data into training and test sets
+pumpkins_split <- new_pumpkins %>%
+ initial_split(prop = 0.8)
+
+
+# Extract training and test data
+pumpkins_train <- training(pumpkins_split)
+pumpkins_test <- testing(pumpkins_split)
+
+
+
+# Create a recipe for preprocessing the data
+lm_pumpkins_recipe <- recipe(price ~ package, data = pumpkins_train) %>%
+ step_integer(all_predictors(), zero_based = TRUE)
+
+
+
+# Create a linear model specification
+lm_spec <- linear_reg() %>%
+ set_engine("lm") %>%
+ set_mode("regression")
+
+
+```
+
+Good job! Now that we have a recipe and a model specification, we need to find a way of bundling them together into an object that will first preprocess the data (prep+bake behind the scenes), fit the model on the preprocessed data and also allow for potential post-processing activities. How's that for your peace of mind!🤩
+
+In Tidymodels, this convenient object is called a [`workflow`](https://workflows.tidymodels.org/) and conveniently holds your modeling components! This is what we'd call *pipelines* in *Python*.
+
+So let's bundle everything up into a workflow!📦
+
+```{r lm_workflow}
+# Hold modelling components in a workflow
+lm_wf <- workflow() %>%
+ add_recipe(lm_pumpkins_recipe) %>%
+ add_model(lm_spec)
+
+# Print out the workflow
+lm_wf
+
+```
+
+👌 Into the bargain, a workflow can be fit/trained in much the same way a model can.
+
+```{r lm_wf_fit}
+# Train the model
+lm_wf_fit <- lm_wf %>%
+ fit(data = pumpkins_train)
+
+# Print the model coefficients learned
+lm_wf_fit
+
+```
+
+From the model output, we can see the coefficients learned during training. They represent the coefficients of the line of best fit that gives us the lowest overall error between the actual and predicted variable.
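+
+If you'd like the coefficients as a tidy data frame rather than printed output, here's a sketch (assuming a recent version of the workflows package) that pulls the underlying `lm` fit out of the trained workflow and tidies it:
+
+```{r tidy_coefficients}
+# Extract the underlying parsnip/lm fit from the workflow and tidy its coefficients
+lm_wf_fit %>% 
+  extract_fit_parsnip() %>% 
+  tidy()
+```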
+
+#### Evaluate model performance using the test set
+
+It's time to see how the model performed 📏! How do we do this?
+
+Now that we've trained the model, we can use it to make predictions for the test_set using `parsnip::predict()`. Then we can compare these predictions to the actual label values to evaluate how well (or not!) the model is working.
+
+Let's start with making predictions for the test set then bind the columns to the test set.
+
+```{r lm_pred}
+# Make predictions for the test set
+predictions <- lm_wf_fit %>%
+ predict(new_data = pumpkins_test)
+
+
+# Bind predictions to the test set
+lm_results <- pumpkins_test %>%
+ select(c(package, price)) %>%
+ bind_cols(predictions)
+
+
+# Print the first ten rows of the tibble
+lm_results %>%
+ slice_head(n = 10)
+```
+
+Yes, you have just trained a model and used it to make predictions!🔮 Is it any good? Let's evaluate the model's performance!
+
+In Tidymodels, we do this using `yardstick::metrics()`! For linear regression, let's focus on the following metrics:
+
+- `Root Mean Square Error (RMSE)`: The square root of the [MSE](https://en.wikipedia.org/wiki/Mean_squared_error). This yields an absolute metric in the same unit as the label (in this case, the price of a pumpkin). The smaller the value, the better the model (in a simplistic sense, it represents the average price by which the predictions are wrong!)
+
+- `Coefficient of Determination (usually known as R-squared or R2)`: A relative metric in which the higher the value, the better the fit of the model. In essence, this metric represents how much of the variance between predicted and actual label values the model is able to explain.
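+
+To make those definitions concrete, here's a quick sketch that computes both quantities by hand from `lm_results`; `yardstick::metrics()` below should report closely matching numbers.
+
+```{r metrics_by_hand}
+# Sketch: compute RMSE and R-squared by hand as a sanity check
+lm_results %>% 
+  summarise(rmse = sqrt(mean((price - .pred)^2)),
+            rsq = cor(price, .pred)^2)
+```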
+
+```{r lm_yardstick}
+# Evaluate performance of linear regression
+metrics(data = lm_results,
+ truth = price,
+ estimate = .pred)
+
+
+```
+
+There goes the model performance. Let's see if we can get a better indication by visualizing a scatter plot of the package and price then use the predictions made to overlay a line of best fit.
+
+This means we'll have to prep and bake the test set in order to encode the package column then bind this to the predictions made by our model.
+
+```{r lm_plot}
+# Encode package column
+package_encode <- lm_pumpkins_recipe %>%
+ prep() %>%
+ bake(new_data = pumpkins_test) %>%
+ select(package)
+
+
+# Bind encoded package column to the results
+lm_results <- lm_results %>%
+ bind_cols(package_encode %>%
+ rename(package_integer = package)) %>%
+ relocate(package_integer, .after = package)
+
+
+# Print new results data frame
+lm_results %>%
+ slice_head(n = 5)
+
+
+# Make a scatter plot
+lm_results %>%
+ ggplot(mapping = aes(x = package_integer, y = price)) +
+ geom_point(size = 1.6) +
+ # Overlay a line of best fit
+ geom_line(aes(y = .pred), color = "orange", size = 1.2) +
+ xlab("package")
+
+
+
+```
+
+Great! As you can see, the linear regression model does not generalize the relationship between a package and its corresponding price very well.
+
+🎃 Congratulations, you just created a model that can help predict the price of a few varieties of pumpkins. Your holiday pumpkin patch will be beautiful. But you can probably create a better model!
+
+## 5. Build a polynomial regression model
+
+![Infographic by Dasani Madipalli](../images/linear-polynomial.png){width="800"}
+
+Sometimes our data may not have a linear relationship, but we still want to predict an outcome. Polynomial regression can help us make predictions for more complex non-linear relationships.
+
+Take for instance the relationship between the package and price for our pumpkins data set. While sometimes there's a linear relationship between variables - the bigger the pumpkin in volume, the higher the price - sometimes these relationships can't be plotted as a plane or straight line.
+
+> ✅ Here are [some more examples](https://online.stat.psu.edu/stat501/lesson/9/9.8) of data that could use polynomial regression
+>
+> Take another look at the relationship between Variety to Price in the previous plot. Does this scatterplot seem like it should necessarily be analyzed by a straight line? Perhaps not. In this case, you can try polynomial regression.
+>
+> ✅ Polynomials are mathematical expressions that might consist of one or more variables and coefficients
+
+#### Train a polynomial regression model using the training set
+
+Polynomial regression creates a *curved line* to better fit nonlinear data.
+
+Let's see whether a polynomial model will perform better in making predictions. We'll follow a somewhat similar procedure as we did before:
+
+- Create a recipe that specifies the preprocessing steps that should be carried out on our data to get it ready for modelling i.e: encoding predictors and computing polynomials of degree *n*
+
+- Build a model specification
+
+- Bundle the recipe and model specification into a workflow
+
+- Create a model by fitting the workflow
+
+- Evaluate how well the model performs on the test data
+
+Let's get right into it!
+
+```{r polynomial_reg}
+# Specify a recipe
+poly_pumpkins_recipe <-
+ recipe(price ~ package, data = pumpkins_train) %>%
+ step_integer(all_predictors(), zero_based = TRUE) %>%
+ step_poly(all_predictors(), degree = 4)
+
+
+# Create a model specification
+poly_spec <- linear_reg() %>%
+ set_engine("lm") %>%
+ set_mode("regression")
+
+
+# Bundle recipe and model spec into a workflow
+poly_wf <- workflow() %>%
+ add_recipe(poly_pumpkins_recipe) %>%
+ add_model(poly_spec)
+
+
+# Create a model
+poly_wf_fit <- poly_wf %>%
+ fit(data = pumpkins_train)
+
+
+# Print learned model coefficients
+poly_wf_fit
+
+
+
+```
+
+#### Evaluate model performance
+
+👏👏You've built a polynomial model, now let's make predictions on the test set!
+
+```{r poly_predict}
+# Make price predictions on test data
+poly_results <- poly_wf_fit %>% predict(new_data = pumpkins_test) %>%
+ bind_cols(pumpkins_test %>% select(c(package, price))) %>%
+ relocate(.pred, .after = last_col())
+
+
+# Print the results
+poly_results %>%
+ slice_head(n = 10)
+```
+
+Woo-hoo, let's evaluate how the model performed on the test_set using `yardstick::metrics()`.
+
+```{r poly_eval}
+metrics(data = poly_results, truth = price, estimate = .pred)
+```
+
+🤩🤩 Much better performance.
+
+The `rmse` decreased from about 7 to about 3, an indication of a reduced error between the actual price and the predicted price. You can *loosely* interpret this as meaning that, on average, incorrect predictions are wrong by around \$3. The `rsq` increased from about 0.4 to 0.8.
+
+All these metrics indicate that the polynomial model performs way better than the linear model. Good job!
+
+Let's see if we can visualize this!
+
+```{r poly_viz}
+# Bind encoded package column to the results
+poly_results <- poly_results %>%
+ bind_cols(package_encode %>%
+ rename(package_integer = package)) %>%
+ relocate(package_integer, .after = package)
+
+
+# Print new results data frame
+poly_results %>%
+ slice_head(n = 5)
+
+
+# Make a scatter plot
+poly_results %>%
+ ggplot(mapping = aes(x = package_integer, y = price)) +
+ geom_point(size = 1.6) +
+ # Overlay a line of best fit
+ geom_line(aes(y = .pred), color = "midnightblue", size = 1.2) +
+ xlab("package")
+
+
+
+```
+
+You can see a curved line that fits your data better! 🤩
+
+You can make this even smoother by passing a polynomial formula to `geom_smooth` like this:
+
+```{r smooth curve}
+# Make a scatter plot
+poly_results %>%
+ ggplot(mapping = aes(x = package_integer, y = price)) +
+ geom_point(size = 1.6) +
+ # Overlay a line of best fit
+ geom_smooth(method = lm, formula = y ~ poly(x, degree = 4), color = "midnightblue", size = 1.2, se = FALSE) +
+ xlab("package")
+
+
+
+
+```
+
+Much more like a smooth curve!🤩
+
+Here's how you would make a new prediction:
+
+```{r predict}
+# Make a hypothetical data frame
+hypo_tibble <- tibble(package = "bushel baskets")
+
+# Make predictions using linear model
+lm_pred <- lm_wf_fit %>% predict(new_data = hypo_tibble)
+
+# Make predictions using polynomial model
+poly_pred <- poly_wf_fit %>% predict(new_data = hypo_tibble)
+
+# Return predictions in a list
+list("linear model prediction" = lm_pred,
+ "polynomial model prediction" = poly_pred)
+
+
+```
+
+The `polynomial model` prediction does make sense, given the scatter plots of `price` and `package`! And, if this is a better model than the previous one, looking at the same data, you need to budget for these more expensive pumpkins!
+
+🏆 Well done! You created two regression models in one lesson. In the final section on regression, you will learn about logistic regression to determine categories.
+
+## **🚀Challenge**
+
+Test several different variables in this notebook to see how correlation corresponds to model accuracy.
+
+## [**Post-lecture quiz**](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)
+
+## **Review & Self Study**
+
+In this lesson we learned about Linear Regression. There are other important types of Regression. Read about Stepwise, Ridge, Lasso and Elasticnet techniques. A good course to study to learn more is the [Stanford Statistical Learning course](https://online.stanford.edu/courses/sohs-ystatslearning-statistical-learning)
+
+If you want to learn more about how to use the amazing Tidymodels framework, please check out the following resources:
+
+- Tidymodels website: [Get started with Tidymodels](https://www.tidymodels.org/start/)
+
+- Max Kuhn and Julia Silge, [*Tidy Modeling with R*](https://www.tmwr.org/)*.*
+
+###### **THANK YOU TO:**
+
+[Allison Horst](https://twitter.com/allison_horst?lang=en) for creating the amazing illustrations that make R more welcoming and engaging. Find more illustrations at her [gallery](https://www.google.com/url?q=https://github.com/allisonhorst/stats-illustrations&sa=D&source=editors&ust=1626380772530000&usg=AOvVaw3zcfyCizFQZpkSLzxiiQEM).
diff --git a/2-Regression/3-Linear/translations/README.id.md b/2-Regression/3-Linear/translations/README.id.md
new file mode 100644
index 0000000000..ce2ee4098a
--- /dev/null
+++ b/2-Regression/3-Linear/translations/README.id.md
@@ -0,0 +1,335 @@
+# Bangun sebuah model regresi dengan Scikit-learn: regresi dua arah
+
+![Infografik regresi linear vs polinomial](../images/linear-polynomial.png)
+> Infografik oleh [Dasani Madipalli](https://twitter.com/dasani_decoded)
+## [Kuis pra-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/13/)
+### Pembukaan
+
+Selama ini kamu telah menjelajahi apa regresi itu dengan data contoh yang dikumpulkan dari *dataset* harga labu yang kita akan gunakan terus sepanjang pelajaran ini. Kamu juga telah memvisualisasikannya dengan Matplotlib.
+
+Sekarang kamu sudah siap terjun ke dalam regresi untuk ML. Dalam pelajaran ini, kamu akan belajar lebih tentang dua jenis regresi: _regresi linear sederhana_ dan _regresi polinomial_ serta sedikit matematika yang mendasari kedua teknik ini.
+
+> Sepanjang kurikulum ini, kami mengasumsi kamu punya pengetahuan matematika yang minim dan ingin tetap membuat pelajaran ini terjangkau bagi murid-murid dari bidang-bidang lain. Jadi perhatikan catatan, 🧮 info, diagram, dan alat-alat belajar lainnya untuk membantu pemahaman.
+
+### Prasyarat
+
+Kamu harusnya sudah terbiasa sekarang dengan struktur data labu yang kita sedang teliti. Datanya harusnya sudah dimuat dan dibersihkan dalam file _notebook.ipynb_ pelajaran ini. Dalam file ini, harga labu ditampilkan per bushel dalam *dataframe* yang bari. Pastikan kamu bisa menjalankan *notebook-notebook* ini di *kernels* di Visual Studio Code.
+
+### Persiapan
+
+Ingat, kamu sedang memuat data ini untuk menanyakan pertanyaan darinya.
+
+- Kapankah waktu terbaik untuk membeli labu?
+- Saya kira-kira harus bayar berapa untuk satu kotak labu mini?
+- Apa saya sebaiknya membelinya dalam keranjang-keranjang setengah bushel atau kardus-kardus 1 1/9 bushel?
+Ayo terjun lebih lagi ke dalam data ini.
+
+Dalam pelajaran sebelumnya, kamu membuat sebuah *dataframe* Pandas, mengisinya dengan sebagian *dataset* orisinal, dan menstandarisasi harganya per bushel. Tetapi, dengan begitu, kamu hanya dapat mengumpul sekitar 400 titik data dan itupun hanya untuk bulan-bulan musim gugur.
+
+Lihatlah data yang kita sudah muat dalam *notebook* yang terlampir pelajaran ini. Data telah di muat dan sebuah petak sebar inisial telah digambar untuk menunjukkan data per bulan. Mungkin kita bisa dapat lebih banyak detail tentang sifat datanya dengan membersih-bersihkannya lebih lagi.
+
+## Sebuah garis regresi linear
+
+Seperti yang kamu telah belajar dalam Pelajaran 1, tujuan sebuah latihan regresi linear adalah untuk dapat menggambar sebuah garis untuk:
+
+- **Menunjukkan hubungan antar-variabel**. Menunjukkan hubungan antara variabel-variabel.
+- **Membuat prediksi**. Membuat prediksi akurat tentang di mana sebuah titik data baru akan terletak berhubungan dengan garis tersebut.
+
+Dalam kasus **Regresi Kuadrat Terkecil (_Least-Squares Regression_)**, biasanya garis seperti ini digambar. Istilah 'kuadrat terkecil' berarti semua titik data yang mengitari garis regresi dikuadratkan dan dijumlahkan. Secara ideal, harusnya jumlah akhirnya sekecil mungkin, karena kita ingin kesalahan (error) terkecil, alias `kuadrat terkecil`.
+
+Kita melakukan itu sebab kita ingin memodelkan sebuah garis yang jarak kumulatifnya dari semua titik data itu sekecil mungkin. Kita juga mengkuadratkan setiap suku sebelum dijumlahkan karena kita fokus pada besarannya daripada arahnya.
+
+> **🧮 Tunjukkan matematikanya kepadaku**
+>
+> Garis ini, dipanggil _garis yang paling cocok_, dapat diekspresikan dalam [sebuah persamaan](https://en.wikipedia.org/wiki/Simple_linear_regression):
+>
+> ```
+> Y = a + bX
+> ```
+>
+> `X` adalah 'variabel penerang'. `Y` adalah 'variabel dependen'. Gradien garisnya adalah `b`, dan `a` adalah titik potong sumbu y yaitu nilai `Y` saat `X = 0`.
+>
+>![hitunglah gradiennya](../images/slope.png)
+>
+> Pertama, hitunglah gradien `b`. Infografik oleh [Jen Looper](https://twitter.com/jenlooper)
+>
+> Dalam kata lain, dan berhubungan pula dengan pertanyaan awal data labu kita "prediksikan harga satu bushel labu setiap bulan", `X` merujuk pada harganya, sedangkan `Y` akan merujuk pada bulan penjualan.
+>
+>![lengkapilah persamaan ini](../images/calculation.png)
+>
+> Hitunglah nilai Y. Kalau harganya $4, artinya pasti April! Infografik oleh [Jen Looper](https://twitter.com/jenlooper)
+>
+> Matematika yang mengkalkulasi garis ini harus mendemonstrasikan gradien garisnya yang juga tergantung pada titik potongnya pada sumbu y, alias apa `Y`-nya saat `X = 0`.
+>
+> Kamu bisa melihat metode menghitung nilai-nilai ini di situs internet [*Math is Fun* (Matematika Itu Menyenangkan)](https://www.mathsisfun.com/data/least-squares-regression.html). Kunjungi [kalkulator kuadrat terkecil ini](https://www.mathsisfun.com/data/least-squares-calculator.html) juga untuk melihat bagaimana nomor-nomor ini mengubah garisnya.
+
+## Korelasi
+
+Satu lagi yang harus dipahami adalah **Koefisien Korelasi** antara variabel X dan Y yang tersedia. Menggunakan sebuah petak sebar, kamu bisa memvisualisasi korelasi ini dengan cepat. Sebuah grafik dengan titik-titik data yang tersebar rapi seperti sebuah garis mempunyai korelasi yang tinggi. Namun, sebuah grafik dengan titik-titik data yang tersebar di mana-mana antara X dan Y mempunyai korelasi yang rendah.
+
+Sebuah model regresi linear yang bagus akan mempunyai Koefisien Korelasi yang tinggi (lebih dekat ke 1 daripada ke 0) menggunakan metode Regresi Kuadrat Terkecil dengan sebuah garis regresi.
+
+✅ Jalankan *notebook* yang terlampir dalam pelajaran ini dan lihatlah petak sebar City (Kota) ke Price (Harga). Apa data yang menghubungkan City ke Price untuk penjualan labu mempunyai korelasi yang tinggi atau rendah kelihatannya?
+
+
+## Siapkan datamu untuk regresi
+
+Sekarang dengan pemahamanmu mengenai matematika di balik latihan ini, buatlah sebuah model regresi untuk melihat apa kamu bisa memprediksi paket labu yang mana yang harganya paling baik. Seorang pembeli labu akan ingin mengetahui informasi ini untuk mengoptimasi pembelian labu mereka.
+
+Karena kamu akan menggunakan Scikit-learn, tidak usah mengerjakan ini dengan tangan (walaupun bisa sih!). Dalam blok memrosesan data utama *notebook*-mu untuk pelajaran ini, tambahlah sebuah *library* dari Scikit-learn untuk mengkonversi semua data *string* menjadi nomor secara otomatis:
+
+```python
+from sklearn.preprocessing import LabelEncoder
+
+new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)
+```
+
+Kalau kamu sekarang simak *dataframe* new_punkins, kamu akan lihat bahwa semua *string* sudah dijadikan nomor. Ini lebih susah untuk kita baca, tetapi jauh lebih mudah untuk Scikit-learn!
+Sekarang kamu bisa membuat lebih banyak keputusan berakal (tidak hanya tebak-tebak dari petak sebarnya) tentang data yang paling cocok untuk regresi.
+
+Coba cari sebuah korelasi bagus antara dua titik data yang berpotensi untuk membangun sebuah model prediksi yang baik. Ternyata, hanya ada korelasi yang lemah antara City dan Price:
+
+```python
+print(new_pumpkins['City'].corr(new_pumpkins['Price']))
+0.32363971816089226
+```
+
+Meskipun begitu, ada korelasi yang sedikit lebih baik antara Package (Paket) dan Price (Harga). Masuk akal juga kan? Biasanya, lebih besar kardusnya, lebih mahal harganya.
+
+```python
+print(new_pumpkins['Package'].corr(new_pumpkins['Price']))
+0.6061712937226021
+```
+
+Sebuah pertanyaan bagus untuk ditanyakan dari data ini adalah "Kira-kira harga sebuah paket labu berapa?"
+
+Mari membangun sebuah model regresi
+
+## Membangun sebuah model linear
+
+Sebelum membangun modelmu, rapikanlah datamu sekali lagi. Buanglah sebuah data nil (null) dan periksalah sekali lagi datanya kelihatannya seperti apa.
+
+```python
+new_pumpkins.dropna(inplace=True)
+new_pumpkins.info()
+```
+
+Lalu, buatlah sebuah *dataframe* baru dari set minimal ini dan *print*:
+
+```python
+new_columns = ['Package', 'Price']
+lin_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+lin_pumpkins
+```
+
+```output
+ Package Price
+70 0 13.636364
+71 0 16.363636
+72 0 16.363636
+73 0 15.454545
+74 0 13.636364
+... ... ...
+1738 2 30.000000
+1739 2 28.750000
+1740 2 25.750000
+1741 2 24.000000
+1742 2 24.000000
+415 rows × 2 columns
+```
+
+1. Sekarang kamu bisa menetapkan data koordinat X dan y-mu:
+
+ ```python
+ X = lin_pumpkins.values[:, :1]
+ y = lin_pumpkins.values[:, 1:2]
+ ```
+✅ Apa yang sedang terjadi di sini? Kamu sedang menggunakan [notasi perpotongan Python (*Python slice notation*)](https://stackoverflow.com/questions/509211/understanding-slice-notation/509295#509295) untuk membuat dua *array* untuk mengisi `X` dan `y`.
+
+1. Selanjutnya, mulailah rutin pembangunan model:
+
+ ```python
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+ from sklearn.model_selection import train_test_split
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ lin_reg = LinearRegression()
+ lin_reg.fit(X_train,y_train)
+
+ pred = lin_reg.predict(X_test)
+
+ accuracy_score = lin_reg.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ Karena korelasinya tidak begitu baik, model yang didapatkan tidak terlalu akurat.
+
+ ```output
+ Model Accuracy: 0.3315342327998987
+ ```
+
+2. Kamu bisa memvisualisasi garis yang digambarkan dalam proses ini:
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, pred, color='blue', linewidth=3)
+
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+
+ plt.show()
+ ```
+ ![Sebuah petak sebar yang menunjukkan hubungan antara paket dan harga](../images/linear.png)
+
+3. Ujilah modelnya dengan sebuah jenis labu hipotetis:
+
+ ```python
+ lin_reg.predict( np.array([ [2.75] ]) )
+ ```
+
+ Harga yang dihasilkan untuk jenis labu mitologis ini adalah:
+
+ ```output
+ array([[33.15655975]])
+ ```
+
+Nomor itu masuk akal jikalau logika garis regresinya benar.
+
+🎃 Selamat, kamu baru saja membuat sebuah model yang bisa membantu memprediksi harga beberapa jenis labu. Namun, kamu masih bisa membuatnya lebih baik!
+
+## Regresi polinomial
+
+Jenis lain regresi linear adalah regresi polinomial. Walaupun kadangkali ada hubungan linear antara variabel-variabel — lebih besar volume labunya, lebih tinggi harganya — kadangkali hubungan-hubungan ini tidak bisa digambarkan sebagai sebuah bidang atau garis lurus.
+
+✅ Ini ada [beberapa contoh data lain](https://online.stat.psu.edu/stat501/lesson/9/9.8) yang bisa menggunakan regresi polinomial
+
+Tengok kembali hubungan antara Variety (Jenis) dan Price (Harga) dalam grafik sebelumnya. Apa petak sebar ini terlihat seperti harus dianalisis dengan sebuah garis lurus? Mungkin tidak. Kali ini, kamu bisa mencoba regresi polinomial.
+
+✅ Polinomial adalah sebuah pernyataan matematika yang mempunyai satu atau lebih variabel dan koefisien disusun menjadi suku-suku.
+
+Regresi polinomial menghasilkan sebuah garis lengkung supaya lebih cocok dengan data non-linear.
+
+1. Mari kita membuat sebuah *dataframe* yang diisi sebuah segmen dari data orisinal labu:
+
+ ```python
+ new_columns = ['Variety', 'Package', 'City', 'Month', 'Price']
+ poly_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+ poly_pumpkins
+ ```
+
+Sebuah cara bagus untuk memvisualisasi korelasi-korelasi antara data dalam *dataframe-dataframe* adalah untuk menampilkannya dalam sebuah peta '*coolwarm*' (panas-dingin):
+
+2. Gunakan fungsi `Background_gradient()` dengan `coolwarm` sebagai argumennya:
+
+ ```python
+ corr = poly_pumpkins.corr()
+ corr.style.background_gradient(cmap='coolwarm')
+ ```
+ Kode ini membuat sebuah peta panas:
+ ![Sebuah peta panas yang menunjukkan korelasi data](../images/heatmap.png)
+
+Melihat peta ini, kamu bisa memvisualisasikan korelasi yang baik antara Package dan Price. Jadi kamu seharusnya bisa membuat sebuah model yang lebih baik daripada yang sebelumnya.
+
+### Buatlah sebuah *pipeline*
+
+Scikit-learn mempunyai sebuah API yang berguna untuk membangun model regresi polinomial — [API](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html?highlight=pipeline#sklearn.pipeline.make_pipeline) `make_pipeline`. Sebuah '*pipeline*' adalah sebuah rantai penaksir. Dalam kasus ini, *pipeline* ini mempunyai fitur-fitur polinomial, atau prediksi-prediksi yang membuat garis non-linear.
+
+1. Bangunlah kolom X dan y:
+
+ ```python
+ X=poly_pumpkins.iloc[:,3:4].values
+ y=poly_pumpkins.iloc[:,4:5].values
+ ```
+
+2. Buatlah *pipeline*-nya dengan fungsi `make_pipeline()`:
+
+ ```python
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.pipeline import make_pipeline
+
+ pipeline = make_pipeline(PolynomialFeatures(4), LinearRegression())
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ pipeline.fit(np.array(X_train), y_train)
+
+ y_pred=pipeline.predict(X_test)
+ ```
+
+### Buatlah sebuah barisan
+
+Di sini, kamu harus membuat sebuah *dataframe* baru dengan data yang _berurutan_ supaya *pipeline*-nya bisa membuat sebuah barisan.
+
+Tambahlah kode ini:
+
+ ```python
+ df = pd.DataFrame({'x': X_test[:,0], 'y': y_pred[:,0]})
+ df.sort_values(by='x',inplace = True)
+ points = pd.DataFrame(df).to_numpy()
+
+ plt.plot(points[:, 0], points[:, 1],color="blue", linewidth=3)
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+ plt.scatter(X,y, color="black")
+ plt.show()
+ ```
+
+Kamu membuat sebuah *dataframe* baru dengan fungsi `pd.DataFrame`. Lalu kamu mengurutkan isinya dengan fungsi `sort_values()`. Akhirnya kamu membuat sebuah bagan polinomial:
+
+![Sebuah bagan polinomial yang menunjukkan hubungan antara paket dan harga](../images/polynomial.png)
+
+Kamu bisa melihat garis lengkungnya yang lebih cocok terhadap datamu.
+
+Ayo periksa akurasi modelnya:
+
+ ```python
+ accuracy_score = pipeline.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ Nah!
+
+ ```output
+ Model Accuracy: 0.8537946517073784
+ ```
+
+Itu bagus! Coba memprediksi harga:
+
+### Buatlah sebuah prediksi
+
+Apa kita bisa memberi input dan dapat sebuah prediksi?
+
+Pakai fungsi `predict()` untuk membuat prediksi:
+
+ ```python
+ pipeline.predict( np.array([ [2.75] ]) )
+ ```
+ Kamu diberi prediksi ini:
+
+ ```output
+ array([[46.34509342]])
+ ```
+
+Itu sangat masuk akal jika dilihat dari bagan sebelumnya! Dan jika model ini lebih baik daripada model sebelumnya untuk data yang sama, kamu perlu menyiapkan anggaran untuk labu-labu yang lebih mahal ini!
+
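+Sebagai contoh penggunaan tambahan, berikut sebuah sketsa kecil (dengan nilai paket hipotetis, memakai `pipeline` yang sudah kamu latih di atas) untuk meminta prediksi beberapa ukuran paket sekaligus:
+
+```python
+# Sketsa: memprediksi beberapa ukuran paket hipotetis sekaligus
+import numpy as np
+
+beberapa_paket = np.array([[1.5], [2.75], [3.5]])
+print(pipeline.predict(beberapa_paket))
+```
+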
+🏆 Mantap sekali! Kamu membuat dua model regresi dalam satu pelajaran. Dalam bagian terakhir mengenai regresi, kamu akan belajar tentang regresi logistik untuk pengkategorisasian.
+
+---
+## 🚀 Tantangan
+
+Coba-cobalah variabel-variabel yang lain di *notebook* ini untuk melihat bagaimana korelasi berhubungan dengan akurasi model.
+
+## [Kuis pasca-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)
+
+## Review & Pembelajaran Mandiri
+
+Dalam pelajaran ini kita belajar tentang regresi linear. Ada banyak jenis regresi lain yang penting pula. Bacalah tentang teknik *Stepwise*, *Ridge*, *Lasso*, dan *Elasticnet*. [Kursus Pembelajaran Statistik Stanford](https://online.stanford.edu/courses/sohs-ystatslearning-statistical-learning) juga baik untuk belajar lebih lanjut.
+
+## Tugas
+
+[Buatlah sebuah model](../assignment.md)
diff --git a/2-Regression/3-Linear/translations/README.it.md b/2-Regression/3-Linear/translations/README.it.md
new file mode 100644
index 0000000000..81fd5a1167
--- /dev/null
+++ b/2-Regression/3-Linear/translations/README.it.md
@@ -0,0 +1,339 @@
+# Costruire un modello di regressione usando Scikit-learn: regressione in due modi
+
+![Infografica di regressione lineare e polinomiale](../images/linear-polynomial.png)
+> Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/13/)
+
+### Introduzione
+
+Finora si è esplorato cos'è la regressione con dati di esempio raccolti dall'insieme di dati relativo ai prezzi della zucca, che verrà usato in questa lezione. Lo si è anche visualizzato usando Matplotlib.
+
+Ora si è pronti per approfondire la regressione per machine learning. In questa lezione si imparerà di più su due tipi di regressione: _regressione lineare di base_ e _regressione polinomiale_, insieme ad alcuni dei calcoli alla base di queste tecniche.
+
+> In questo programma di studi, si assume una conoscenza minima della matematica, e si cerca di renderla accessibile agli studenti provenienti da altri campi, quindi si faccia attenzione a note, 🧮 didascalie, diagrammi e altri strumenti di apprendimento che aiutano la comprensione.
+
+### Prerequisito
+
+Si dovrebbe ormai avere familiarità con la struttura dei dati della zucca che si sta esaminando. Lo si può trovare precaricato e prepulito nel file _notebook.ipynb_ di questa lezione. Nel file, il prezzo della zucca viene visualizzato per bushel (staio) in un nuovo dataframe. Assicurasi di poter eseguire questi notebook nei kernel in Visual Studio Code.
+
+### Preparazione
+
+Come promemoria, si stanno caricando questi dati in modo da porre domande su di essi.
+
+- Qual è il momento migliore per comprare le zucche?
+- Che prezzo ci si può aspettare da una cassa di zucche in miniatura?
+- Si devono acquistare in cestini da mezzo bushel o in scatole da 1 1/9 bushel? Si continui a scavare in questi dati.
+
+Nella lezione precedente, è stato creato un dataframe Pandas e lo si è popolato con parte dell'insieme di dati originale, standardizzando il prezzo per bushel. In questo modo, tuttavia, si sono potuti raccogliere solo circa 400 punti dati e solo per i mesi autunnali.
+
+Si dia un'occhiata ai dati precaricati nel notebook di accompagnamento di questa lezione. I dati sono precaricati e viene tracciato un grafico a dispersione iniziale per mostrare i dati mensili. Forse si può ottenere qualche dettaglio in più sulla natura dei dati pulendoli ulteriormente.
+
+## Una linea di regressione lineare
+
+Come si è appreso nella lezione 1, l'obiettivo di un esercizio di regressione lineare è essere in grado di tracciare una linea per:
+
+- **Mostrare le relazioni tra variabili**.
+- **Fare previsioni**. Fare previsioni accurate su dove cadrebbe un nuovo punto dati in relazione a quella linea.
+
+È tipico della **Regressione dei Minimi Quadrati** disegnare questo tipo di linea. Il termine "minimi quadrati" significa che tutti i punti dati che circondano la linea di regressione sono elevati al quadrato e quindi sommati. Idealmente, quella somma finale è la più piccola possibile, perché si vuole un basso numero di errori, o `minimi quadrati`.
+
+Lo si fa perché si vuole modellare una linea che abbia la distanza cumulativa minima da tutti i punti dati. Si esegue anche il quadrato dei termini prima di aggiungerli poiché interessa la grandezza piuttosto che la direzione.
+
+> **🧮 Mostrami la matematica**
+>
+> Questa linea, chiamata _linea di miglior adattamento_ , può essere espressa da [un'equazione](https://en.wikipedia.org/wiki/Simple_linear_regression):
+>
+> ```
+> Y = a + bX
+> ```
+>
+> `X` è la "variabile esplicativa". `Y` è la "variabile dipendente". La pendenza della linea è `b` e `a` è l'intercetta di y, che si riferisce al valore di `Y` quando `X = 0`.
+>
+> ![calcolare la pendenza](../images/slope.png)
+>
+> Prima, calcolare la pendenza `b`. Infografica di [Jen Looper](https://twitter.com/jenlooper)
+>
+> In altre parole, facendo riferimento alla domanda originale per i dati sulle zucche: "prevedere il prezzo di una zucca per bushel per mese", `X` si riferisce al prezzo e `Y` si riferisce al mese di vendita.
+>
+> ![completare l'equazione](../images/calculation.png)
+>
+> Si calcola il valore di Y. Se si sta pagando circa $4, deve essere aprile! Infografica di [Jen Looper](https://twitter.com/jenlooper)
+>
+> La matematica che calcola la linea deve dimostrare la pendenza della linea, che dipende anche dall'intercetta, o dove `Y` si trova quando `X = 0`.
+>
+> Si può osservare il metodo di calcolo per questi valori sul sito web [Math is Fun](https://www.mathsisfun.com/data/least-squares-regression.html). Si visiti anche [questo calcolatore dei minimi quadrati](https://www.mathsisfun.com/data/least-squares-calculator.html) per vedere come i valori dei numeri influiscono sulla linea.
+
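+Per vedere questi calcoli in pratica, ecco un piccolo schizzo (con valori di esempio inventati, non i dati delle zucche) che usa `numpy.polyfit` per stimare la pendenza `b` e l'intercetta `a` della retta di miglior adattamento:
+
+```python
+# Schizzo: stima ai minimi quadrati di pendenza e intercetta con NumPy
+import numpy as np
+
+# Piccolo insieme di valori di esempio (ipotetici)
+X = np.array([1, 2, 3, 4, 5], dtype=float)
+Y = np.array([2.1, 4.0, 6.2, 7.9, 10.1])
+
+b, a = np.polyfit(X, Y, deg=1)  # grado 1 = retta: Y = a + bX
+print(f"pendenza b = {b:.2f}, intercetta a = {a:.2f}")
+```
+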
+## Correlazione
+
+Un altro termine da comprendere è il **Coefficiente di Correlazione** tra determinate variabili X e Y. Utilizzando un grafico a dispersione, è possibile visualizzare rapidamente questo coefficiente. Un grafico con punti dati sparsi in una linea ordinata ha un'alta correlazione, ma un grafico con punti dati sparsi ovunque tra X e Y ha una bassa correlazione.
+
+Un buon modello di regressione lineare sarà quello che ha un Coefficiente di Correlazione alto (più vicino a 1 rispetto a 0) utilizzando il Metodo di Regressione dei Minimi Quadrati con una linea di regressione.
+
+✅ Eseguire il notebook che accompagna questa lezione e guardare il grafico a dispersione City to Price. I dati che associano la città al prezzo per le vendite di zucca sembrano avere una correlazione alta o bassa, secondo la propria interpretazione visiva del grafico a dispersione?
+
+
+## Preparare i dati per la regressione
+
+Ora che si ha una comprensione della matematica alla base di questo esercizio, si crea un modello di regressione per vedere se si può prevedere quale pacchetto di zucche avrà i migliori prezzi per zucca. Qualcuno che acquista zucche per una festa con tema un campo di zucche potrebbe desiderare che queste informazioni siano in grado di ottimizzare i propri acquisti di pacchetti di zucca per il campo.
+
+Dal momento che si utilizzerà Scikit-learn, non c'è motivo di farlo a mano (anche se si potrebbe!). Nel blocco di elaborazione dati principale del notebook della lezione, aggiungere una libreria da Scikit-learn per convertire automaticamente tutti i dati di tipo stringa in numeri:
+
+```python
+from sklearn.preprocessing import LabelEncoder
+
+new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)
+```
+
+Se si guarda ora il dataframe new_pumpkins, si vede che tutte le stringhe ora sono numeriche. Questo rende più difficile la lettura per un umano ma molto più comprensibile per Scikit-learn!
+Ora si possono prendere decisioni più consapevoli (non solo basate sull'osservazione di un grafico a dispersione) sui dati più adatti alla regressione.
+
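+Per farsi un'idea di cosa fa `LabelEncoder`, ecco un piccolo schizzo su valori di esempio inventati (non i dati della lezione):
+
+```python
+# Schizzo: LabelEncoder converte le stringhe in interi
+from sklearn.preprocessing import LabelEncoder
+
+citta = ["BALTIMORE", "BOSTON", "BALTIMORE", "NEW YORK"]
+encoder = LabelEncoder()
+print(encoder.fit_transform(citta))  # ad es. [0 1 0 2]
+print(encoder.classes_)              # le categorie originali, in ordine alfabetico
+```
+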
+Si provi a trovare una buona correlazione tra due punti nei propri dati per costruire potenzialmente un buon modello predittivo. A quanto pare, c'è solo una debole correlazione tra la città e il prezzo:
+
+```python
+print(new_pumpkins['City'].corr(new_pumpkins['Price']))
+0.32363971816089226
+```
+
+Tuttavia, c'è una correlazione leggermente migliore tra il pacchetto e il suo prezzo. Ha senso, vero? Normalmente, più grande è la scatola dei prodotti, maggiore è il prezzo.
+
+```python
+print(new_pumpkins['Package'].corr(new_pumpkins['Price']))
+0.6061712937226021
+```
+
+Una buona domanda da porre a questi dati sarà: "Che prezzo posso aspettarmi da un determinato pacchetto di zucca?"
+
+Si costruisce questo modello di regressione
+
+## Costruire un modello lineare
+
+Prima di costruire il modello, si esegue un altro riordino dei dati. Si eliminano tutti i dati nulli e si controlla ancora una volta che aspetto hanno i dati.
+
+```python
+new_pumpkins.dropna(inplace=True)
+new_pumpkins.info()
+```
+
+Quindi, si crea un nuovo dataframe da questo set minimo e lo si stampa:
+
+```python
+new_columns = ['Package', 'Price']
+lin_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+lin_pumpkins
+```
+
+```output
+ Package Price
+70 0 13.636364
+71 0 16.363636
+72 0 16.363636
+73 0 15.454545
+74 0 13.636364
+... ... ...
+1738 2 30.000000
+1739 2 28.750000
+1740 2 25.750000
+1741 2 24.000000
+1742 2 24.000000
+415 rows × 2 columns
+```
+
+1. Ora si possono assegnare i dati delle coordinate X e y:
+
+ ```python
+ X = lin_pumpkins.values[:, :1]
+ y = lin_pumpkins.values[:, 1:2]
+ ```
+
+Cosa sta succedendo qui? Si sta usando [la notazione slice Python](https://stackoverflow.com/questions/509211/understanding-slice-notation/509295#509295) per creare array per popolare `X` e `y`.
+
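+Un piccolo schizzo (con un array inventato) per chiarire questa notazione:
+
+```python
+# Schizzo: le slice [:, :1] e [:, 1:2] mantengono la forma a colonna
+import numpy as np
+
+valori = np.array([[0, 13.6],
+                   [1, 16.3],
+                   [2, 30.0]])
+
+X = valori[:, :1]   # prima colonna, come matrice (3, 1)
+y = valori[:, 1:2]  # seconda colonna, come matrice (3, 1)
+print(X.shape, y.shape)  # (3, 1) (3, 1)
+```
+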
+2. Successivamente, si avviano le routine di creazione del modello di regressione:
+
+ ```python
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+ from sklearn.model_selection import train_test_split
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ lin_reg = LinearRegression()
+ lin_reg.fit(X_train,y_train)
+
+ pred = lin_reg.predict(X_test)
+
+ accuracy_score = lin_reg.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ Poiché la correlazione non è particolarmente buona, il modello prodotto non è molto accurato.
+
+ ```output
+ Model Accuracy: 0.3315342327998987
+ ```
+
+3. Si può visualizzare la linea tracciata nel processo:
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, pred, color='blue', linewidth=3)
+
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+
+ plt.show()
+ ```
+
+ ![Un grafico a dispersione che mostra il rapporto tra pacchetto e prezzo](../images/linear.png)
+
+4. Si testa il modello contro una varietà ipotetica:
+
+ ```python
+ lin_reg.predict( np.array([ [2.75] ]) )
+ ```
+
+ Il prezzo restituito per questa varietà mitologica è:
+
+ ```output
+ array([[33.15655975]])
+ ```
+
+Quel numero ha senso, se la logica della linea di regressione è vera.
+
+🎃 Congratulazioni, si è appena creato un modello che può aiutare a prevedere il prezzo di alcune varietà di zucche. La zucca per le festività sarà bellissima. Ma probabilmente si può creare un modello migliore!
+
+## Regressione polinomiale
+
+Un altro tipo di regressione lineare è la regressione polinomiale. Mentre a volte c'è una relazione lineare tra le variabili - più grande è il volume della zucca, più alto è il prezzo - a volte queste relazioni non possono essere tracciate come un piano o una linea retta.
+
+✅ Ecco [alcuni altri esempi](https://online.stat.psu.edu/stat501/lesson/9/9.8) di dati che potrebbero utilizzare la regressione polinomiale
+
+Si dia un'altra occhiata alla relazione tra Varietà e Prezzo nel tracciato precedente. Questo grafico a dispersione deve essere necessariamente analizzato da una linea retta? Forse no. In questo caso, si può provare la regressione polinomiale.
+
+✅ I polinomi sono espressioni matematiche che possono essere costituite da una o più variabili e coefficienti
+
+La regressione polinomiale crea una linea curva per adattare meglio i dati non lineari.
+
+1. Viene ricreato un dataframe popolato con un segmento dei dati della zucca originale:
+
+ ```python
+ new_columns = ['Variety', 'Package', 'City', 'Month', 'Price']
+ poly_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+ poly_pumpkins
+ ```
+
+Un buon modo per visualizzare le correlazioni tra i dati nei dataframe è visualizzarli in un grafico "coolwarm":
+
+2. Si usa il metodo `background_gradient()` con `coolwarm` come valore dell'argomento:
+
+ ```python
+ corr = poly_pumpkins.corr()
+ corr.style.background_gradient(cmap='coolwarm')
+ ```
+
+ Questo codice crea una mappa di calore:
+ ![Una mappa di calore che mostra la correlazione dei dati](../images/heatmap.png)
+
+Guardando questo grafico, si può visualizzare la buona correlazione tra Pacchetto e Prezzo. Quindi si dovrebbe essere in grado di creare un modello un po' migliore dell'ultimo.
+
+### Creare una pipeline
+
+Scikit-learn include un'API utile per la creazione di modelli di regressione polinomiale: l'[API](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html?highlight=pipeline#sklearn.pipeline.make_pipeline) `make_pipeline`. Viene creata una 'pipeline' che è una catena di stimatori. In questo caso, la pipeline include caratteristiche polinomiali o previsioni che formano un percorso non lineare.
+
+1. Si costruiscono le colonne X e y:
+
+ ```python
+ X=poly_pumpkins.iloc[:,3:4].values
+ y=poly_pumpkins.iloc[:,4:5].values
+ ```
+
+2. Si crea la pipeline chiamando il metodo `make_pipeline()` :
+
+ ```python
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.pipeline import make_pipeline
+
+ pipeline = make_pipeline(PolynomialFeatures(4), LinearRegression())
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ pipeline.fit(np.array(X_train), y_train)
+
+ y_pred=pipeline.predict(X_test)
+ ```
+
+### Creare una sequenza
+
+A questo punto, è necessario creare un nuovo dataframe con dati _ordinati_ in modo che la pipeline possa creare una sequenza.
+
+Si aggiunge il seguente codice:
+
+```python
+df = pd.DataFrame({'x': X_test[:,0], 'y': y_pred[:,0]})
+df.sort_values(by='x',inplace = True)
+points = pd.DataFrame(df).to_numpy()
+
+plt.plot(points[:, 0], points[:, 1],color="blue", linewidth=3)
+plt.xlabel('Package')
+plt.ylabel('Price')
+plt.scatter(X,y, color="black")
+plt.show()
+```
+
+Si è creato un nuovo dataframe chiamando `pd.DataFrame`. Quindi si sono ordinati i valori chiamando `sort_values()`. Alla fine si è creato un grafico polinomiale:
+
+![Un grafico polinomiale che mostra la relazione tra pacchetto e prezzo](../images/polynomial.png)
+
+Si può vedere una linea curva che si adatta meglio ai dati.
+
+Si verifica la precisione del modello:
+
+```python
+accuracy_score = pipeline.score(X_train,y_train)
+print('Model Accuracy: ', accuracy_score)
+```
+
+E voilà!
+
+```output
+Model Accuracy: 0.8537946517073784
+```
+
+Ecco, meglio! Si prova a prevedere un prezzo:
+
+### Fare una previsione
+
+È possibile inserire un nuovo valore e ottenere una previsione?
+
+Si chiami `predict()` per fare una previsione:
+
+```python
+pipeline.predict( np.array([ [2.75] ]) )
+```
+
+Viene data questa previsione:
+
+```output
+array([[46.34509342]])
+```
+
+Ha senso, visto il tracciato! E, se questo è un modello migliore del precedente sugli stessi dati, bisogna mettere in conto un budget per queste zucche più costose!
+
+Ben fatto! Sono stati creati due modelli di regressione in una lezione. Nella sezione finale sulla regressione, si imparerà a conoscere la regressione logistica per determinare le categorie.
+
+---
+
+## 🚀 Sfida
+
+Testare diverse variabili in questo notebook per vedere come la correlazione corrisponde all'accuratezza del modello.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)
+
+## Revisione e Auto Apprendimento
+
+In questa lezione si è appreso della regressione lineare. Esistono altri tipi importanti di regressione. Leggere le tecniche Stepwise, Ridge, Lasso ed Elasticnet. Un buon corso per saperne di più è il [corso Stanford Statistical Learning](https://online.stanford.edu/courses/sohs-ystatslearning-statistical-learning).
+
+## Compito
+
+[Costruire un modello](assignment.it.md)
diff --git a/2-Regression/3-Linear/translations/README.ja.md b/2-Regression/3-Linear/translations/README.ja.md
new file mode 100644
index 0000000000..2dbc0f3219
--- /dev/null
+++ b/2-Regression/3-Linear/translations/README.ja.md
@@ -0,0 +1,334 @@
+# Scikit-learnを用いた回帰モデルの構築: 回帰を行う2つの方法
+
+![線形回帰 vs 多項式回帰 のインフォグラフィック](../images/linear-polynomial.png)
+> [Dasani Madipalli](https://twitter.com/dasani_decoded) によるインフォグラフィック
+## [講義前のクイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/13/)
+### イントロダクション
+
+これまで、このレッスンで使用するカボチャの価格データセットから集めたサンプルデータを使って、回帰とは何かを探ってきました。また、Matplotlibを使って可視化を行いました。
+
+これで、MLにおける回帰をより深く理解する準備が整いました。このレッスンでは、2種類の回帰について詳しく説明します。基本的な線形回帰 (_basic linear regression_)と多項式回帰 (_polynomial regression_)の2種類の回帰について、その基礎となる数学を学びます。
+
+> このカリキュラムでは、最低限の数学の知識を前提とし、他の分野の学生にも理解できるようにしていますので、理解を助けるためのメモ、🧮吹き出し、図などの学習ツールをご覧ください。
+
+### 事前確認
+
+ここでは、パンプキンデータの構造について説明しています。このレッスンの_notebook.ipynb_ファイルには、事前に読み込まれ、整形されたデータが入っています。このファイルでは、カボチャの価格がブッシェル単位で新しいデータフレームに表示されています。 これらのノートブックを、Visual Studio Codeのカーネルで実行できることを確認してください。
+
+### 準備
+
+忘れてはならないのは、データを読み込んだら問いかけを行うことです。
+
+- カボチャを買うのに最適な時期はいつですか?
+- ミニカボチャ1ケースの価格はどのくらいでしょうか?
+- 半ブッシェルのバスケットで買うべきか、1 1/9ブッシェルの箱で買うべきか。
+
+データを掘り下げていきましょう。
+
+前回のレッスンでは、Pandasのデータフレームを作成し、元のデータセットの一部を入力して、ブッシェル単位の価格を標準化しました。しかし、この方法では、約400のデータポイントしか集めることができず、しかもそれは秋の期間のものでした。
+
+このレッスンに付属するノートブックで、あらかじめ読み込んでおいたデータを見てみましょう。データが事前に読み込まれ、月毎のデータが散布図として表示されています。データをもっと綺麗にすることで、データの性質をもう少し知ることができるかもしれません。
+
+## 線形回帰
+
+レッスン1で学んだように、線形回帰の演習では、以下のような線を描けるようになることが目標です。
+
+- **変数間の関係を示す。**
+- **予測を行う。** 新しいデータポイントが、その線のどこに位置するかを正確に予測することができる。
+
+このような線を描くことは、**最小二乗回帰 (Least-Squares Regression)** の典型的な例です。「最小二乗」という言葉は、回帰線を囲むすべてのデータポイントとの距離が二乗され、その後加算されることを意味しています。理想的には、最終的な合計ができるだけ小さくなるようにします。これはエラーの数、つまり「最小二乗」の値を小さくするためです。
+
+これは、すべてのデータポイントからの累積距離が最小となる直線をモデル化したいためです。また、方向ではなく大きさに注目しているので、足す前に項を二乗します。
+
+> **🧮 Show me the math**
+>
+> この線は、_line of best fit_ と呼ばれ、[方程式](https://en.wikipedia.org/wiki/Simple_linear_regression) で表すことができます。
+>
+> ```
+> Y = a + bX
+> ```
+>
+> `X`は「説明変数」です。`Y`は「目的変数」です。`a`は切片で`b`は直線の傾きを表します。`X=0`のとき、`Y`の値は切片`a`となります。
+>
+>![傾きの計算](../images/slope.png)
+>
+> はじめに、傾き`b`を計算してみます。[Jen Looper](https://twitter.com/jenlooper) によるインフォグラフィック。
+>
+> カボチャのデータに関する最初の質問である、「月毎のブッシェル単位でのカボチャの価格を予測してください」で言い換えてみると、`X`は価格を、`Y`は販売された月を表しています。
+>
+>![方程式の計算](../images/calculation.png)
+>
+> Yの値を計算してみましょう。$4前後払っているなら、4月に違いありません![Jen Looper](https://twitter.com/jenlooper) によるインフォグラフィック。
+>
+> 直線を計算する数学は、直線の傾きを示す必要がありますが、これは切片、つまり「X = 0」のときに「Y」がどこに位置するかにも依存します。
+>
+> これらの値の計算方法は、[Math is Fun](https://www.mathsisfun.com/data/least-squares-regression.html) というサイトで見ることができます。また、[this Least-squares calculator](https://www.mathsisfun.com/data/least-squares-calculator.html) では、値が線にどのような影響を与えるかを見ることができます。
+
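+参考までに、上の式を小さな架空のデータで実際に計算してみるスケッチを示します(レッスンのカボチャデータではなく、説明用の値です)。
+
+```python
+# スケッチ: 最小二乗法で傾き b と切片 a を計算する
+import numpy as np
+
+# 説明用の架空のデータ
+X = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+Y = np.array([2.1, 4.0, 6.2, 7.9, 10.1])
+
+x_mean, y_mean = X.mean(), Y.mean()
+b = np.sum((X - x_mean) * (Y - y_mean)) / np.sum((X - x_mean) ** 2)  # 傾き
+a = y_mean - b * x_mean                                              # 切片
+print(f"Y = {a:.2f} + {b:.2f} X")
+```
+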
+## 相関関係
+
+もう一つの理解すべき用語は、与えられたXとYの変数間の**相関係数 (Correlation Coefficient)** です。散布図を使えば、この係数をすぐに可視化することができます。データポイントがきれいな直線上に散らばっているプロットは、高い相関を持っていますが、データポイントがXとYの間のあらゆる場所に散らばっているプロットは、低い相関を持っています。
+
+良い線形回帰モデルとは、最小二乗法によって求めた回帰線が高い相関係数 (0よりも1に近い)を持つものです。
+
+✅ このレッスンのノートを開いて、「都市と価格」の散布図を見てみましょう。散布図の視覚的な解釈によると、カボチャの販売に関する「都市」と「価格」の関連データは、相関性が高いように見えますか、それとも低いように見えますか?
+
+## 回帰に用いるデータの準備
+
+この演習の背景にある数学を理解したので、回帰モデルを作成して、どのパッケージのカボチャの価格が最も高いかを予測できるかどうかを確認してください。休日のパンプキンパッチ用にパンプキンを購入する人は、パッチ用のパンプキンパッケージの購入を最適化するために、この情報を必要とするかもしれません。
+
+ここではScikit-learnを使用するので、手作業で行う必要はありません。レッスンノートのメインのデータ処理ブロックに、Scikit-learnのライブラリを追加して、すべての文字列データを自動的に数字に変換します。
+
+```python
+from sklearn.preprocessing import LabelEncoder
+
+new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)
+```
+
+new_pumpkinsデータフレームを見ると、すべての文字列が数値になっているのがわかります。これにより、人が読むのは難しくなりましたが、Scikit-learnにとってはとても分かりやすくなりました。
+これで、回帰に最も適したデータについて、(散布図を見ただけではなく)より高度な判断ができるようになりました。
+
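+なお、数値に変換した後でも `LabelEncoder` の `inverse_transform()` を使えば元の文字列に戻せます。以下は説明用の小さなスケッチです(レッスンの実データではなく架空の値を使っています)。
+
+```python
+# スケッチ: LabelEncoder で数値化した値を元の文字列に戻す
+from sklearn.preprocessing import LabelEncoder
+
+cities = ["BALTIMORE", "BOSTON", "BALTIMORE", "NEW YORK"]
+encoder = LabelEncoder()
+encoded = encoder.fit_transform(cities)
+
+print(encoded)                             # 例: [0 1 0 2]
+print(encoder.inverse_transform(encoded))  # 元の文字列に戻る
+```
+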
+良い予測モデルを構築するために、データの2点間に良い相関関係を見つけようとします。その結果、「都市」と「価格」の間には弱い相関関係しかないことがわかりました。
+
+```python
+print(new_pumpkins['City'].corr(new_pumpkins['Price']))
+0.32363971816089226
+```
+
+しかし、パッケージと価格の間にはもう少し強い相関関係があります。これは理にかなっていると思いますか?通常、箱が大きければ大きいほど、価格は高くなります。
+
+```python
+print(new_pumpkins['Package'].corr(new_pumpkins['Price']))
+0.6061712937226021
+```
+
+このデータに対する良い質問は、次のようになります。「あるカボチャのパッケージの価格はどのくらいになるか?」
+
+この回帰モデルを構築してみましょう!
+
+## 線形モデルの構築
+
+モデルを構築する前に、もう一度データの整理をしてみましょう。NULLデータを削除し、データがどのように見えるかをもう一度確認します。
+
+```python
+new_pumpkins.dropna(inplace=True)
+new_pumpkins.info()
+```
+
+そして、この最小セットから新しいデータフレームを作成し、それを出力します。
+
+```python
+new_columns = ['Package', 'Price']
+lin_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+lin_pumpkins
+```
+
+```output
+ Package Price
+70 0 13.636364
+71 0 16.363636
+72 0 16.363636
+73 0 15.454545
+74 0 13.636364
+... ... ...
+1738 2 30.000000
+1739 2 28.750000
+1740 2 25.750000
+1741 2 24.000000
+1742 2 24.000000
+415 rows × 2 columns
+```
+
+1. これで、XとYの座標データを割り当てることができます。
+
+ ```python
+ X = lin_pumpkins.values[:, :1]
+ y = lin_pumpkins.values[:, 1:2]
+ ```
+✅ ここでは何をしていますか? Pythonの[スライス記法](https://stackoverflow.com/questions/509211/understanding-slice-notation/509295#509295) を使って、`X`と`y`の配列を作成しています。
+
+2. 次に、回帰モデル構築のためのルーチンを開始します。
+
+ ```python
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+ from sklearn.model_selection import train_test_split
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ lin_reg = LinearRegression()
+ lin_reg.fit(X_train,y_train)
+
+ pred = lin_reg.predict(X_test)
+
+ accuracy_score = lin_reg.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ 相関関係があまり良くないので、生成されたモデルもあまり正確ではありません。
+
+ ```output
+ Model Accuracy: 0.3315342327998987
+ ```
+
+3. 今回の過程で描かれた線を可視化します。
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, pred, color='blue', linewidth=3)
+
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+
+ plt.show()
+ ```
+ ![パッケージと価格の関係を表す散布図](../images/linear.png)
+
+4. 架空の値に対してモデルをテストする。
+
+ ```python
+ lin_reg.predict( np.array([ [2.75] ]) )
+ ```
+
+ この架空の値に対して、以下の価格が返されます。
+
+ ```output
+ array([[33.15655975]])
+ ```
+
+回帰の線が正しく引かれていれば、その数字は理にかなっています。
+
+🎃 おめでとうございます!数種類のカボチャの価格を予測するモデルを作成しました。休日のパンプキンパッチは美しいものになるでしょう。でも、もっと良いモデルを作れるかもしれません。
+
+## 多項式回帰
+
+線形回帰のもう一つのタイプは、多項式回帰です。時には変数の間に直線的な関係 (カボチャの量が多いほど、価格は高くなる)があることもありますが、これらの関係は、平面や直線としてプロットできないこともあります。
+
+✅ 多項式回帰を使うことができる、[いくつかの例](https://online.stat.psu.edu/stat501/lesson/9/9.8) を示します。
+
+先ほどの散布図の「品種」と「価格」の関係をもう一度見てみましょう。この散布図は、必ずしも直線で分析しなければならないように見えますか?そうではないかもしれません。このような場合は、多項式回帰を試してみましょう。
+
+✅ 多項式とは、1つ以上の変数と係数で構成される数学的表現である。
+
+多項式回帰では、非線形データをよりよく適合させるために曲線を作成します。
+
+1. 元のカボチャのデータの一部を入力したデータフレームを作成してみましょう。
+
+ ```python
+ new_columns = ['Variety', 'Package', 'City', 'Month', 'Price']
+ poly_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+ poly_pumpkins
+ ```
+
+データフレーム内のデータ間の相関関係を視覚化するには、「coolwarm」チャートで表示するのが良いでしょう。
+
+2. `background_gradient()` メソッドの引数に `coolwarm` を指定して使用します。
+
+ ```python
+ corr = poly_pumpkins.corr()
+ corr.style.background_gradient(cmap='coolwarm')
+ ```
+
+ このコードはヒートマップを作成します。
+ ![データの相関関係を示すヒートマップ](../images/heatmap.png)
+
+このチャートを見ると、「パッケージ」と「価格」の間に正の相関関係があることが視覚化されています。つまり、前回のモデルよりも多少良いモデルを作ることができるはずです。
+
+### パイプラインの作成
+
+Scikit-learnには、多項式回帰モデルを構築するための便利なAPIである`make_pipeline` [API](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html?highlight=pipeline#sklearn.pipeline.make_pipeline) が用意されています。「パイプライン」は推定量の連鎖で作成されます。今回の場合、パイプラインには多項式の特徴量、非線形の経路を形成する予測値が含まれます。
+
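+「パイプラインは推定量の連鎖である」という点を確かめるための小さなスケッチです(この例だけで完結しており、レッスン本体のデータは使いません)。
+
+```python
+# スケッチ: make_pipeline が作る推定量の連鎖を確認する
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures
+
+pipeline = make_pipeline(PolynomialFeatures(4), LinearRegression())
+
+# ステップ名とオブジェクトの対応を表示する
+for name, step in pipeline.named_steps.items():
+    print(name, "->", step)
+# 例: polynomialfeatures -> PolynomialFeatures(degree=4)
+#     linearregression -> LinearRegression()
+```
+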
+1. X列とy列を作ります。
+
+ ```python
+ X=poly_pumpkins.iloc[:,3:4].values
+ y=poly_pumpkins.iloc[:,4:5].values
+ ```
+
+2. `make_pipeline()` メソッドを呼び出してパイプラインを作成します。
+
+ ```python
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.pipeline import make_pipeline
+
+ pipeline = make_pipeline(PolynomialFeatures(4), LinearRegression())
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ pipeline.fit(np.array(X_train), y_train)
+
+ y_pred=pipeline.predict(X_test)
+ ```
+
+### 系列の作成
+
+この時点で、パイプラインが系列を作成できるように、ソートされたデータで新しいデータフレームを作成する必要があります。
+
+以下のコードを追加します。
+
+ ```python
+ df = pd.DataFrame({'x': X_test[:,0], 'y': y_pred[:,0]})
+ df.sort_values(by='x',inplace = True)
+ points = pd.DataFrame(df).to_numpy()
+
+ plt.plot(points[:, 0], points[:, 1],color="blue", linewidth=3)
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+ plt.scatter(X,y, color="black")
+ plt.show()
+ ```
+
+`pd.DataFrame` を呼び出して新しいデータフレームを作成しました。次に`sort_values()` を呼び出して値をソートしました。最後に多項式のプロットを作成しました。
+
+![パッケージと価格の関係を示す多項式のプロット](../images/polynomial.png)
+
+よりデータにフィットした曲線を確認することができます。
+
+モデルの精度を確認してみましょう。
+
+ ```python
+ accuracy_score = pipeline.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ これで完成です!
+
+ ```output
+ Model Accuracy: 0.8537946517073784
+ ```
+
+いい感じです!価格を予測してみましょう。
+
+### 予測の実行
+
+新しい値を入力し、予測値を取得できますか?
+
+`predict()` メソッドを呼び出して、予測を行います。
+
+ ```python
+ pipeline.predict( np.array([ [2.75] ]) )
+ ```
+ 以下の予測結果が得られます。
+
+ ```output
+ array([[46.34509342]])
+ ```
+
+プロットを見てみると、納得できそうです!そして、同じデータを見て、これが前のモデルよりも良いモデルであれば、より高価なカボチャのために予算を組む必要があります。
+
+🏆 お疲れ様でした!1つのレッスンで2つの回帰モデルを作成しました。回帰に関する最後のセクションでは、カテゴリーを決定するためのロジスティック回帰について学びます。
+
+---
+## 🚀チャレンジ
+
+このノートブックでいくつかの異なる変数をテストし、相関関係がモデルの精度にどのように影響するかを確認してみてください。
+
+## [講義後クイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)
+
+## レビュー & 自主学習
+
+このレッスンでは、線形回帰について学びました。回帰には他にも重要な種類があります。Stepwise、Ridge、Lasso、Elasticnetなどのテクニックをご覧ください。より詳しく学ぶには、[Stanford Statistical Learning course](https://online.stanford.edu/courses/sohs-ystatslearning-statistical-learning) が良いでしょう。
+
+## 課題
+
+[モデル構築](./assignment.ja.md)
diff --git a/2-Regression/3-Linear/translations/README.zh-cn.md b/2-Regression/3-Linear/translations/README.zh-cn.md
new file mode 100644
index 0000000000..12b57103e7
--- /dev/null
+++ b/2-Regression/3-Linear/translations/README.zh-cn.md
@@ -0,0 +1,332 @@
+# 使用Scikit-learn构建回归模型:两种方式的回归
+
+![线性与多项式回归信息图](../images/linear-polynomial.png)
+> 作者[Dasani Madipalli](https://twitter.com/dasani_decoded)
+## [课前测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/13/)
+### 介绍
+
+到目前为止,你已经通过从我们将在本课程中使用的南瓜定价数据集收集的样本数据探索了什么是回归。你还使用Matplotlib对其进行了可视化。
+
+现在你已准备好深入研究ML的回归。 在本课中,你将详细了解两种类型的回归:_基本线性回归_和_多项式回归_,以及这些技术背后的一些数学知识。
+
+> 在整个课程中,我们假设数学知识最少,并试图让来自其他领域的学生也能接触到它,因此请使用笔记、🧮标注、图表和其他学习工具以帮助理解。
+
+### 前提
+
+你现在应该熟悉我们正在检查的南瓜数据的结构。你可以在本课的_notebook.ipynb_文件中找到它。 在这个文件中,南瓜的价格显示在一个新的dataframe 中。确保可以在Visual Studio Code代码的内核中运行这些notebooks。
+
+### 准备
+
+提醒一下,你正在加载此数据以提出问题。
+
+- 什么时候买南瓜最好?
+- 一箱微型南瓜的价格是多少?
+- 我应该买半蒲式耳还是1 1/9蒲式耳?
+
+让我们继续深入研究这些数据。
+
+在上一课中,你创建了一个Pandas dataframe并用原始数据集的一部分填充它,按蒲式耳标准化定价。但是,通过这样做,你只能收集大约400个数据点,而且只能收集秋季月份的数据。
+
+看看我们在本课随附的notebook中预加载的数据。数据已预加载,并绘制了初始散点图以显示月份数据。也许我们可以通过更多地清理数据来获得更多关于数据性质的细节。
+
+## 线性回归线
+
+正如你在第1课中学到的,线性回归练习的目标是能够绘制一条线以便:
+
+- **显示变量关系**。 显示变量之间的关系
+- **作出预测**。 准确预测新数据点与该线的关系。
+
+绘制这种类型的线是**最小二乘回归**的典型做法。术语“最小二乘法”意味着将回归线周围的所有数据点平方,然后相加。理想情况下,最终和尽可能小,因为我们希望错误数量较少,或“最小二乘法”。
+
+我们这样做是因为我们想要对一条与所有数据点的累积距离最小的线进行建模。我们还在添加它们之前对这些项进行平方,因为我们关心的是它的大小而不是它的方向。
+
+> **🧮 数学知识**
+>
+> 这条线称为_最佳拟合线_,可以用[一个等式](https://en.wikipedia.org/wiki/Simple_linear_regression)表示:
+>
+> ```
+> Y = a + bX
+> ```
+>
+> `X`是“解释变量”。`Y`是“因变量”。直线的斜率是`b`,`a`是y轴截距,指的是`X = 0`时`Y`的值。
+>
+>![计算斜率](../images/slope.png)
+>
+> 首先,计算斜率`b`。作者[Jen Looper](https://twitter.com/jenlooper)
+>
+> 换句话说,参考我们的南瓜数据的原始问题:“按月预测每蒲式耳南瓜的价格”,`X`指的是价格,`Y`指的是销售月份。
+>
+>![完成等式](../images/calculation.png)
+>
+> 计算Y的值。如果你支付大约4美元,那一定是四月!作者[Jen Looper](https://twitter.com/jenlooper)
+>
+> 计算直线的数学必须证明直线的斜率,这也取决于截距,或者当`X = 0`时`Y`所在的位置。
+>
+> 你可以在[Math is Fun](https://www.mathsisfun.com/data/least-squares-regression.html)网站上观察这些值的计算方法。另请访问[这个最小二乘计算器](https://www.mathsisfun.com/data/least-squares-calculator.html)以观察数字的值如何影响直线。
+
+## 相关性
+
+另一个需要理解的术语是给定X和Y变量之间的**相关系数**。使用散点图,你可以快速可视化该系数。数据点散布在一条直线上的图具有高相关性,但数据点散布在X和Y之间的图具有低相关性。
+
+一个好的线性回归模型,是使用最小二乘回归法拟合出回归线后,相关系数较高(更接近 1 而不是 0)的模型。
+
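+作为一个小示例(使用虚构的数据,并非本课的南瓜数据),你可以用 NumPy 的 `corrcoef` 快速计算两个变量之间的相关系数:
+
+```python
+# 小示例:用 np.corrcoef 计算相关系数
+import numpy as np
+
+x = np.array([1, 2, 3, 4, 5])
+y = np.array([2.0, 4.1, 6.0, 8.2, 9.9])   # 与 x 几乎成线性关系
+
+r = np.corrcoef(x, y)[0, 1]
+print(round(r, 3))  # 接近 1,说明相关性很高
+```
+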
+✅ 运行本课随附的notebook并查看City to Price散点图。根据你对散点图的视觉解释,将南瓜销售的城市与价格相关联的数据似乎具有高相关性或低相关性?
+
+## 为回归准备数据
+
+现在你已经了解了本练习背后的数学原理,可以创建一个回归模型,看看你是否可以预测哪个南瓜包装的南瓜价格最优惠。为节日购买南瓜的人可能希望此信息能够优化他们如何购买南瓜包装。
+
+由于你将使用Scikit-learn,因此没有理由手动执行此操作(尽管你可以!)。在课程notebook的主要数据处理块中,从Scikit-learn添加一个库以自动将所有字符串数据转换为数字:
+
+```python
+from sklearn.preprocessing import LabelEncoder
+
+new_pumpkins.iloc[:, 0:-1] = new_pumpkins.iloc[:, 0:-1].apply(LabelEncoder().fit_transform)
+```
+
+如果你现在查看new_pumpkins dataframe,你会看到所有字符串现在都是数字。这让你更难阅读,但对Scikit-learn来说更容易理解!
+
+现在,你可以对最适合回归的数据做出更有根据的决策(不仅仅是基于观察散点图)。
+
+尝试在数据的两点之间找到良好的相关性,以构建良好的预测模型。事实证明,城市和价格之间只有微弱的相关性:
+
+```python
+print(new_pumpkins['City'].corr(new_pumpkins['Price']))
+0.32363971816089226
+```
+
+然而,包装和它的价格之间有更好的相关性。这是有道理的,对吧?通常,农产品箱越大,价格越高。
+
+```python
+print(new_pumpkins['Package'].corr(new_pumpkins['Price']))
+0.6061712937226021
+```
+
+对这些数据提出的一个很好的问题是:“我可以期望给定的南瓜包装的价格是多少?”
+
+让我们建立这个回归模型
+
+## 建立线性模型
+
+在构建模型之前,再对数据进行一次整理。删除任何空数据并再次检查数据的样子。
+
+```python
+new_pumpkins.dropna(inplace=True)
+new_pumpkins.info()
+```
+
+然后,从这个最小集合创建一个新的dataframe并将其打印出来:
+
+```python
+new_columns = ['Package', 'Price']
+lin_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+lin_pumpkins
+```
+
+```output
+ Package Price
+70 0 13.636364
+71 0 16.363636
+72 0 16.363636
+73 0 15.454545
+74 0 13.636364
+... ... ...
+1738 2 30.000000
+1739 2 28.750000
+1740 2 25.750000
+1741 2 24.000000
+1742 2 24.000000
+415 rows × 2 columns
+```
+
+1. 现在你可以分配X和y坐标数据:
+
+ ```python
+ X = lin_pumpkins.values[:, :1]
+ y = lin_pumpkins.values[:, 1:2]
+ ```
+✅ 这里发生了什么?你正在使用[Python slice notation](https://stackoverflow.com/questions/509211/understanding-slice-notation/509295#509295)来创建数组来填充`X`和`y`。
+
+2. 接下来,开始回归模型构建例程:
+
+ ```python
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+ from sklearn.model_selection import train_test_split
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ lin_reg = LinearRegression()
+ lin_reg.fit(X_train,y_train)
+
+ pred = lin_reg.predict(X_test)
+
+ accuracy_score = lin_reg.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ 因为相关性不是特别好,所以生成的模型不是非常准确。
+
+ ```output
+ Model Accuracy: 0.3315342327998987
+ ```
+
+3. 你可以将过程中绘制的线条可视化:
+
+ ```python
+ plt.scatter(X_test, y_test, color='black')
+ plt.plot(X_test, pred, color='blue', linewidth=3)
+
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+
+ plt.show()
+ ```
+ ![散点图显示包装与价格的关系](../images/linear.png)
+
+4. 针对假设的品种测试模型:
+
+ ```python
+ lin_reg.predict( np.array([ [2.75] ]) )
+ ```
+
+ 这个神话般的品种的价格是:
+
+ ```output
+ array([[33.15655975]])
+ ```
+
+如果回归线的逻辑成立,这个数字是有意义的。
+
+🎃 恭喜你,你刚刚创建了一个模型,可以帮助预测几个南瓜品种的价格。你的节日南瓜地会很漂亮的。但是你可以创造一个更好的模型!
+
+## 多项式回归
+
+另一种线性回归是多项式回归。虽然有时变量之间存在线性关系——南瓜的体积越大,价格就越高——但有时这些关系不能绘制成平面或直线。
+
+✅ 这里有可以使用多项式回归数据的[更多示例](https://online.stat.psu.edu/stat501/lesson/9/9.8)
+
+再看一下上图中品种与价格之间的关系。这个散点图看起来是否应该用一条直线来分析?也许不是。在这种情况下,你可以尝试多项式回归。
+
+✅ 多项式是可能由一个或多个变量和系数组成的数学表达式
+
+多项式回归创建一条曲线以更好地拟合非线性数据。
+
+1. 让我们重新创建一个填充了原始南瓜数据片段的dataframe:
+ ```python
+ new_columns = ['Variety', 'Package', 'City', 'Month', 'Price']
+ poly_pumpkins = new_pumpkins.drop([c for c in new_pumpkins.columns if c not in new_columns], axis='columns')
+
+ poly_pumpkins
+ ```
+
+可视化dataframe中数据之间相关性的一种好方法是将其显示在“coolwarm”图表中:
+
+2. 使用`background_gradient()`方法和`coolwarm`作为其参数值:
+
+ ```python
+ corr = poly_pumpkins.corr()
+ corr.style.background_gradient(cmap='coolwarm')
+ ```
+ 这段代码创建了一个热图:
+ ![显示数据相关性的热图](../images/heatmap.png)
+
+查看此图表,你可以直观地看到Package和Price之间的良好相关性。所以你应该能够创建一个比上一个更好的模型。
+
+### 创建管道
+
+Scikit-learn包含一个用于构建多项式回归模型的有用API - `make_pipeline` [API](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html?highlight=pipeline#sklearn.pipeline.make_pipeline)。 创建了一个“管道”,它是一个估计器链。 在这种情况下,管道包括多项式特征或形成非线性路径的预测。
+
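+为了说明“管道就是一个估计器链”,下面是一个等价写法的小示例(仅作示意,使用虚构的数据;课程中仍然使用 `make_pipeline`):先手动用 `PolynomialFeatures` 变换特征,再把结果交给 `LinearRegression`:
+
+```python
+# 小示例:不用管道,手动把两个估计器串起来
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+
+# 虚构的一维特征和目标值
+X = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]])
+y = np.array([1.2, 4.1, 9.3, 15.8, 25.1])
+
+poly = PolynomialFeatures(4)          # 第一步:生成多项式特征
+X_poly = poly.fit_transform(X)
+
+model = LinearRegression()            # 第二步:在变换后的特征上做线性回归
+model.fit(X_poly, y)
+
+print(model.predict(poly.transform(np.array([[2.5]]))))
+```
+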
+1. 构建X和y列:
+
+ ```python
+ X=poly_pumpkins.iloc[:,3:4].values
+ y=poly_pumpkins.iloc[:,4:5].values
+ ```
+
+2. 通过调用`make_pipeline()`方法创建管道:
+
+ ```python
+ from sklearn.preprocessing import PolynomialFeatures
+ from sklearn.pipeline import make_pipeline
+
+ pipeline = make_pipeline(PolynomialFeatures(4), LinearRegression())
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ pipeline.fit(np.array(X_train), y_train)
+
+ y_pred=pipeline.predict(X_test)
+ ```
+
+### 创建序列
+
+此时,你需要使用_排序好的_数据创建一个新的dataframe ,以便管道可以创建序列。
+
+添加以下代码:
+
+ ```python
+ df = pd.DataFrame({'x': X_test[:,0], 'y': y_pred[:,0]})
+ df.sort_values(by='x',inplace = True)
+ points = pd.DataFrame(df).to_numpy()
+
+ plt.plot(points[:, 0], points[:, 1],color="blue", linewidth=3)
+ plt.xlabel('Package')
+ plt.ylabel('Price')
+ plt.scatter(X,y, color="black")
+ plt.show()
+ ```
+
+你通过调用`pd.DataFrame`创建了一个新的dataframe。然后通过调用`sort_values()`对值进行排序。最后你创建了一个多项式图:
+
+![显示包装与价格关系的多项式图](../images/polynomial.png)
+
+你可以看到更适合你的数据的曲线。
+
+让我们检查模型的准确性:
+
+ ```python
+ accuracy_score = pipeline.score(X_train,y_train)
+ print('Model Accuracy: ', accuracy_score)
+ ```
+
+ 瞧!
+
+ ```output
+ Model Accuracy: 0.8537946517073784
+ ```
+
+这样好多了!试着预测一个价格:
+
+### 做个预测
+
+我们可以输入一个新值并得到一个预测吗?
+
+调用`predict()`进行预测:
+
+ ```python
+ pipeline.predict( np.array([ [2.75] ]) )
+ ```
+ 你会得到这样的预测:
+
+ ```output
+ array([[46.34509342]])
+ ```
+
+参照图像,这确实有道理!而且,如果这是一个比前一个更好的模型,看同样的数据,你需要为这些更昂贵的南瓜做好预算!
+
+🏆 干得不错!你在一节课中创建了两个回归模型。在回归的最后一节中,你将了解逻辑回归以确定类别。
+
+---
+## 🚀挑战
+
+在此notebook中测试几个不同的变量,以查看相关性与模型准确性的对应关系。
+
+## [课后测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/14/)
+
+## 复习与自学
+
+在本课中,我们学习了线性回归。还有其他重要的回归类型。了解Stepwise、Ridge、Lasso和Elasticnet技术。学习更多信息的好课程是[斯坦福统计学习课程](https://online.stanford.edu/courses/sohs-ystatslearning-statistical-learning)
+
+## 任务
+
+[构建模型](../assignment.md)
diff --git a/2-Regression/3-Linear/translations/assignment.it.md b/2-Regression/3-Linear/translations/assignment.it.md
new file mode 100644
index 0000000000..e5aaaa77eb
--- /dev/null
+++ b/2-Regression/3-Linear/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Creare un Modello di Regressione
+
+## Istruzioni
+
+In questa lezione è stato mostrato come costruire un modello utilizzando sia la Regressione Lineare che Polinomiale. Usando questa conoscenza, trovare un insieme di dati o utilizzare uno degli insiemi integrati di Scikit-Learn per costruire un modello nuovo. Spiegare nel proprio notebook perché si è scelto una determinata tecnica e dimostrare la precisione del modello. Se non è accurato, spiegare perché.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ------------------------------------------------------------ | -------------------------- | ------------------------------- |
+| | presenta un notebook completo con una soluzione ben documentata | La soluzione è incompleta | La soluzione è difettosa o contiene bug |
diff --git a/2-Regression/3-Linear/translations/assignment.ja.md b/2-Regression/3-Linear/translations/assignment.ja.md
new file mode 100644
index 0000000000..d0f8a4c509
--- /dev/null
+++ b/2-Regression/3-Linear/translations/assignment.ja.md
@@ -0,0 +1,11 @@
+# 回帰モデルの作成
+
+## 課題の指示
+
+このレッスンでは、線形回帰と多項式回帰の両方を使ってモデルを構築する方法を紹介しました。この知識をもとに、自分でデータセットを探すか、Scikit-learnのビルトインセットの1つを使用して、新しいモデルを構築してください。手法を選んだ理由をノートブックに書き、モデルの精度を示してください。精度が十分でない場合は、その理由も説明してください。
+
+## ルーブリック
+
+| 指標 | 模範的 | 適切 | 要改善 |
+| -------- | ------------------------------------------------------------ | -------------------------- | ------------------------------- |
+| | ドキュメント化されたソリューションを含む完全なノートブックを提示する。 | 解決策が不完全である。 | 解決策に欠陥またはバグがある。 |
diff --git a/2-Regression/3-Linear/translations/assignment.zh-cn.md b/2-Regression/3-Linear/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..e9c476c361
--- /dev/null
+++ b/2-Regression/3-Linear/translations/assignment.zh-cn.md
@@ -0,0 +1,12 @@
+# 创建自己的回归模型
+
+## 说明
+
+在这节课中你学到了如何用线性回归和多项式回归建立一个模型。利用这些知识,找到一个你感兴趣的数据集或者使用 Scikit-learn 内置的数据集来建立一个全新的模型。在你的 notebook 中解释为什么用这种技术来对这个数据集进行建模,并展示你的模型的准确度。如果它没你想象中准确,请思考一下并解释一下原因。
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | ------------------------------------------------------------ | -------------------------- | ------------------------------- |
+| | 提交了一个完整的 notebook 工程文件,其中包含了解决方案,并且可读性良好 | 解决方案不完整 | 解决方案有缺陷或有错误 |
+
diff --git a/2-Regression/4-Logistic/README.md b/2-Regression/4-Logistic/README.md
index a4488c11e4..afc9c629f1 100644
--- a/2-Regression/4-Logistic/README.md
+++ b/2-Regression/4-Logistic/README.md
@@ -2,7 +2,7 @@
![Logistic vs. linear regression infographic](./images/logistic-linear.png)
> Infographic by [Dasani Madipalli](https://twitter.com/dasani_decoded)
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/15/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)
## Introduction
@@ -140,7 +140,7 @@ Now that we have an idea of the relationship between the binary categories of co
> **🧮 Show Me The Math**
>
-> Remember how linear regression often used ordinary least squares to arrive at a value? Logistic regression relies on the concept of 'maximum likelihood' using [sigmoid functions](https://wikipedia.org/wiki/Sigmoid_function). A 'Sigmoid Function' on a plot looks like an 'S' shape. It takes a value and maps it to somewhere between 0 and 1. Its curve is also called a 'logistic curve'. Its formula looks like thus:
+> Remember how linear regression often used ordinary least squares to arrive at a value? Logistic regression relies on the concept of 'maximum likelihood' using [sigmoid functions](https://wikipedia.org/wiki/Sigmoid_function). A 'Sigmoid Function' on a plot looks like an 'S' shape. It takes a value and maps it to somewhere between 0 and 1. Its curve is also called a 'logistic curve'. Its formula looks like this:
>
> ![logistic function](images/sigmoid.png)
>
@@ -206,7 +206,7 @@ While you can get a scoreboard report [terms](https://scikit-learn.org/stable/mo
> 🎓 A '[confusion matrix](https://wikipedia.org/wiki/Confusion_matrix)' (or 'error matrix') is a table that expresses your model's true vs. false positives and negatives, thus gauging the accuracy of predictions.
-1. To use a confusion metrics, call `confusin_matrix()`:
+1. To use a confusion matrix, call `confusion_matrix()`:
```python
from sklearn.metrics import confusion_matrix
@@ -220,26 +220,35 @@ While you can get a scoreboard report [terms](https://scikit-learn.org/stable/mo
[ 33, 0]])
```
-What's going on here? Let's say our model is asked to classify items between two binary categories, category 'pumpkin' and category 'not-a-pumpkin'.
+In Scikit-learn, the rows (axis 0) of a confusion matrix are the actual labels and the columns (axis 1) are the predicted labels.
-- If your model predicts something as a pumpkin and it belongs to category 'pumpkin' in reality we call it a true positive, shown by the top left number.
-- If your model predicts something as not a pumpkin and it belongs to category 'pumpkin' in reality we call it a false positive, shown by the top right number.
-- If your model predicts something as a pumpkin and it belongs to category 'not-a-pumpkin' in reality we call it a false negative, shown by the bottom left number.
-- If your model predicts something as not a pumpkin and it belongs to category 'not-a-pumpkin' in reality we call it a true negative, shown by the bottom right number.
+| | 0 | 1 |
+| :---: | :---: | :---: |
+| 0 | TN | FP |
+| 1 | FN | TP |
-![Confusion Matrix](images/confusion-matrix.png)
+What's going on here? Let's say our model is asked to classify pumpkins between two binary categories, category 'orange' and category 'not-orange'.
-> Infographic by [Jen Looper](https://twitter.com/jenlooper)
+- If your model predicts a pumpkin as not orange and it belongs to category 'not-orange' in reality we call it a true negative, shown by the top left number.
+- If your model predicts a pumpkin as orange and it belongs to category 'not-orange' in reality we call it a false positive, shown by the top right number.
+- If your model predicts a pumpkin as not orange and it belongs to category 'orange' in reality we call it a false negative, shown by the bottom left number.
+- If your model predicts a pumpkin as orange and it belongs to category 'orange' in reality we call it a true positive, shown by the bottom right number.
As you might have guessed it's preferable to have a larger number of true positives and true negatives and a lower number of false positives and false negatives, which implies that the model performs better.
-✅ Q: According to the confusion matrix, how did the model do? A: Not too bad; there are a good number of true positives but also several false negatives.
+How does the confusion matrix relate to precision and recall? Remember, the classification report printed above showed precision (0.83) and recall (0.98).
+
+Precision = tp / (tp + fp) = 162 / (162 + 33) = 0.8307692307692308
+
+Recall = tp / (tp + fn) = 162 / (162 + 4) = 0.9759036144578314
+
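+A minimal sketch for computing precision and recall directly from a confusion matrix (assuming `y_test` and `predictions` hold the true labels and the model's predictions from the cells above; your variable names may differ):
+
+```python
+from sklearn.metrics import confusion_matrix
+
+# Unpack the binary confusion matrix: rows = actual, columns = predicted
+tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
+
+print('Precision:', tp / (tp + fp))
+print('Recall:   ', tp / (tp + fn))
+```
+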
+✅ Q: According to the confusion matrix, how did the model do? A: Not too bad; there are a good number of true negatives but also several false negatives.
Let's revisit the terms we saw earlier with the help of the confusion matrix's mapping of TP/TN and FP/FN:
-🎓 Precision: TP/(TP + FN) The fraction of relevant instances among the retrieved instances (e.g. which labels were well-labeled)
+🎓 Precision: TP/(TP + FP) The fraction of relevant instances among the retrieved instances (e.g. which labels were well-labeled)
-🎓 Recall: TP/(TP + FP) The fraction of relevant instances that were retrieved, whether well-labeled or not
+🎓 Recall: TP/(TP + FN) The fraction of relevant instances that were retrieved, whether well-labeled or not
🎓 f1-score: (2 * precision * recall)/(precision + recall) A weighted average of the precision and recall, with best being 1 and worst being 0
@@ -252,6 +261,7 @@ Let's revisit the terms we saw earlier with the help of the confusion matrix's m
🎓 Weighted Avg: The calculation of the mean metrics for each label, taking label imbalance into account by weighting them by their support (the number of true instances for each label).
✅ Can you think which metric you should watch if you want your model to reduce the number of false negatives?
+
## Visualize the ROC curve of this model
This is not a bad model; its accuracy is in the 80% range so ideally you could use it to predict the color of a pumpkin given a set of variables.
@@ -284,8 +294,9 @@ In future lessons on classifications, you will learn how to iterate to improve y
---
## 🚀Challenge
-There's a lot more to unpack regarding logistic regression! But the best way to learn is to experiment. Find a dataset that lends itself to this type of analysis and build a model with it. What do you learn? tip: try [Kaggle](https://kaggle.com) for interesting datasets.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/16/)
+There's a lot more to unpack regarding logistic regression! But the best way to learn is to experiment. Find a dataset that lends itself to this type of analysis and build a model with it. What do you learn? Tip: try [Kaggle](https://www.kaggle.com/search?q=logistic+regression+datasets) for interesting datasets.
+
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/16/)
## Review & Self Study
diff --git a/2-Regression/4-Logistic/images/r_learners_sm.jpeg b/2-Regression/4-Logistic/images/r_learners_sm.jpeg
new file mode 100644
index 0000000000..ff8d2945dc
Binary files /dev/null and b/2-Regression/4-Logistic/images/r_learners_sm.jpeg differ
diff --git a/2-Regression/4-Logistic/solution/lesson_4-R.ipynb b/2-Regression/4-Logistic/solution/lesson_4-R.ipynb
new file mode 100644
index 0000000000..574cec0592
--- /dev/null
+++ b/2-Regression/4-Logistic/solution/lesson_4-R.ipynb
@@ -0,0 +1,751 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+ "colab": {
+ "name": "Untitled10.ipynb",
+ "provenance": [],
+ "collapsed_sections": []
+ },
+ "kernelspec": {
+ "name": "ir",
+ "display_name": "R"
+ },
+ "language_info": {
+ "name": "R"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Build a regression model: logistic regression\n",
+ " \n"
+ ],
+ "metadata": {
+ "id": "fVfEucLYkV9T"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Build a logistic regression model - Lesson 4\r\n",
+ "\r\n",
+        "Infographic by Dasani Madipalli\r\n",
+        ""
+ ],
+ "metadata": {
+ "id": "QizKKpzakfx2"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+        "#### **[Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)**\n",
+ "\n",
+ "#### Introduction\n",
+ "\n",
+ "In this final lesson on Regression, one of the basic *classic* ML techniques, we will take a look at Logistic Regression. You would use this technique to discover patterns to predict `binary` `categories`. Is this candy chocolate or not? Is this disease contagious or not? Will this customer choose this product or not?\n",
+ "\n",
+ "In this lesson, you will learn:\n",
+ "\n",
+ "- Techniques for logistic regression\n",
+ "\n",
+ "✅ Deepen your understanding of working with this type of regression in this [Learn module](https://docs.microsoft.com/learn/modules/train-evaluate-classification-models?WT.mc_id=academic-15963-cxa)\n",
+ "\n",
+ "#### **Prerequisite**\n",
+ "\n",
+ "Having worked with the pumpkin data, we are now familiar enough with it to realize that there's one binary category that we can work with: `Color`.\n",
+ "\n",
+ "Let's build a logistic regression model to predict that, given some variables, *what color a given pumpkin is likely to be* (orange 🎃 or white 👻).\n",
+ "\n",
+ "> Why are we talking about binary classification in a lesson grouping about regression? Only for linguistic convenience, as logistic regression is [really a classification method](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression), albeit a linear-based one. Learn about other ways to classify data in the next lesson group.\n",
+ "\n",
+ "For this lesson, we'll require the following packages:\n",
+ "\n",
+ "- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to makes data science faster, easier and more fun!\n",
+ "\n",
+ "- `tidymodels`: The [tidymodels](https://www.tidymodels.org/) framework is a [collection of packages](https://www.tidymodels.org/packages/) for modeling and machine learning.\n",
+ "\n",
+ "- `janitor`: The [janitor package](https://github.com/sfirke/janitor) provides simple little tools for examining and cleaning dirty data.\n",
+ "\n",
+ "- `ggbeeswarm`: The [ggbeeswarm package](https://github.com/eclarke/ggbeeswarm) provides methods to create beeswarm-style plots using ggplot2.\n",
+ "\n",
+ "You can have them installed as:\n",
+ "\n",
+ "`install.packages(c(\"tidyverse\", \"tidymodels\", \"janitor\", \"ggbeeswarm\"))`\n",
+ "\n",
+        "Alternatively, the script below checks whether you have the packages required to complete this module and installs them for you in case they are missing."
+ ],
+ "metadata": {
+ "id": "KPmut75XkmXY"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "suppressWarnings(if (!require(\"pacman\")) install.packages(\"pacman\"))\r\n",
+ "\r\n",
+ "pacman::p_load(tidyverse, tidymodels, janitor, ggbeeswarm)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "dnIGNNttkx_O"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+        "## **Define the question**\r\n",
+ "\r\n",
+ "For our purposes, we will express this as a binary: 'Orange' or 'Not Orange'. There is also a 'striped' category in our dataset but there are few instances of it, so we will not use it. It disappears once we remove null values from the dataset, anyway.\r\n",
+ "\r\n",
+ "> 🎃 Fun fact, we sometimes call white pumpkins 'ghost' pumpkins. They aren't very easy to carve, so they aren't as popular as the orange ones but they are cool looking!\r\n",
+ "\r\n",
+ "## **About logistic regression**\r\n",
+ "\r\n",
+ "Logistic regression differs from linear regression, which you learned about previously, in a few important ways.\r\n",
+ "\r\n",
+ "#### **Binary classification**\r\n",
+ "\r\n",
+        "Logistic regression does not offer the same features as linear regression. The former offers a prediction about a `binary category` (\"orange or not orange\") whereas the latter is capable of predicting `continuous values`, for example given the origin of a pumpkin and the time of harvest, *how much its price will rise*.\r\n",
+ "\r\n",
+        "\r\n",
+        "Infographic by Dasani Madipalli\r\n",
+        ""
+ ],
+ "metadata": {
+ "id": "ws-hP_SXk2O6"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### **Other classifications**\r\n",
+ "\r\n",
+ "There are other types of logistic regression, including multinomial and ordinal:\r\n",
+ "\r\n",
+ "- **Multinomial**, which involves having more than one category - \"Orange, White, and Striped\".\r\n",
+ "\r\n",
+ "- **Ordinal**, which involves ordered categories, useful if we wanted to order our outcomes logically, like our pumpkins that are ordered by a finite number of sizes (mini,sm,med,lg,xl,xxl).\r\n",
+ "\r\n",
+        "\r\n",
+        "Infographic by Dasani Madipalli\r\n",
+        ""
+ ],
+ "metadata": {
+ "id": "LkLN-ZgDlBEc"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**It's still linear**\n",
+ "\n",
+ "Even though this type of Regression is all about 'category predictions', it still works best when there is a clear linear relationship between the dependent variable (color) and the other independent variables (the rest of the dataset, like city name and size). It's good to get an idea of whether there is any linearity dividing these variables or not.\n",
+ "\n",
+ "#### **Variables DO NOT have to correlate**\n",
+ "\n",
+ "Remember how linear regression worked better with more correlated variables? Logistic regression is the opposite - the variables don't have to align. That works for this data which has somewhat weak correlations.\n",
+ "\n",
+ "#### **You need a lot of clean data**\n",
+ "\n",
+ "Logistic regression will give more accurate results if you use more data; our small dataset is not optimal for this task, so keep that in mind.\n",
+ "\n",
+ "✅ Think about the types of data that would lend themselves well to logistic regression\n"
+ ],
+ "metadata": {
+ "id": "D8_JoVZtlHUt"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Tidy the data\n",
+ "\n",
+ "Now, the fun begins! Let's start by importing the data, cleaning the data a bit, dropping rows containing missing values and selecting only some of the columns:"
+ ],
+ "metadata": {
+ "id": "LPj8Ib1AlIua"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Load the core tidyverse packages\r\n",
+ "library(tidyverse)\r\n",
+ "\r\n",
+ "# Import the data and clean column names\r\n",
+ "pumpkins <- read_csv(file = \"https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv\") %>% \r\n",
+ " clean_names()\r\n",
+ "\r\n",
+ "# Select desired columns\r\n",
+ "pumpkins_select <- pumpkins %>% \r\n",
+ " select(c(city_name, package, variety, origin, item_size, color)) \r\n",
+ "\r\n",
+ "# Drop rows containing missing values and encode color as factor (category)\r\n",
+ "pumpkins_select <- pumpkins_select %>% \r\n",
+ " drop_na() %>% \r\n",
+ " mutate(color = factor(color))\r\n",
+ "\r\n",
+ "# View the first few rows\r\n",
+ "pumpkins_select %>% \r\n",
+ " slice_head(n = 5)\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "Q8oKJ8PAlLM0"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Sometimes, we may want some little more information on our data. We can have a look at the `data`, `its structure` and the `data type` of its features by using the [*glimpse()*](https://pillar.r-lib.org/reference/glimpse.html) function as below:"
+ ],
+ "metadata": {
+ "id": "tKY5eN8alPNn"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "pumpkins_select %>% \r\n",
+ " glimpse()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "wDpatL1WlShu"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+        "Wow! It seems that all our columns are of type *character*, further suggesting that they are all categorical.\n",
+ "\n",
+ "Let's confirm that we will actually be doing a binary classification problem:"
+ ],
+ "metadata": {
+ "id": "QbdC2b0JlU2G"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Subset distinct observations in outcome column\r\n",
+ "pumpkins_select %>% \r\n",
+ " distinct(color)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "Gys-Q18rlZpE"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "🥳🥳 That went down well!\n",
+ "\n",
+ "## 2. Explore the data\n",
+ "\n",
+ "The goal of data exploration is to try to understand the `relationships` between its attributes; in particular, any apparent correlation between the *features* and the *label* your model will try to predict. One way of doing this is by using data visualization.\n",
+ "\n",
+        "Given the data types of our columns, we can `encode` them and be on our way to making some visualizations. This simply involves `translating` a column with `categorical values`, for example our columns of type *char*, into one or more `numeric columns` that take the place of the original - something we did in our [last lesson](https://github.com/microsoft/ML-For-Beginners/blob/main/2-Regression/3-Linear/solution/lesson_3-R.ipynb).\n",
+ "\n",
+        "Tidymodels provides yet another neat package: [recipes](https://recipes.tidymodels.org/) - a package for preprocessing data. We'll define a `recipe` that specifies that all predictor columns should be encoded into a set of integers, `prep` it to estimate the required quantities and statistics needed by any operations, and finally `bake` to apply the computations to new data.\n",
+ "\n",
+        "> Normally, recipes is used as a preprocessor for modelling where it defines what steps should be applied to a data set in order to get it ready for modelling. In that case it is **highly recommended** that you use a `workflow()` instead of manually estimating a recipe using prep and bake. We'll see all this in just a moment.\n",
+ ">\n",
+ "> However for now, we are using recipes + prep + bake to specify what steps should be applied to a data set in order to get it ready for data analysis and then extract the preprocessed data with the steps applied."
+ ],
+ "metadata": {
+ "id": "kn_20wSPldVH"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Preprocess and extract data to allow some data analysis\r\n",
+ "baked_pumpkins <- recipe(color ~ ., data = pumpkins_select) %>% \r\n",
+ " # Encode all columns to a set of integers\r\n",
+ " step_integer(all_predictors(), zero_based = T) %>% \r\n",
+ " prep() %>% \r\n",
+ " bake(new_data = NULL)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Display the first few rows of preprocessed data\r\n",
+ "baked_pumpkins %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "syaCgFQ_lijg"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now let's compare the feature distributions for each label value using box plots. We'll begin by formatting the data to a *long* format to make it somewhat easier to make multiple `facets`."
+ ],
+ "metadata": {
+ "id": "RlkOZ_C5lldq"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Pivot data to long format\r\n",
+ "baked_pumpkins_long <- baked_pumpkins %>% \r\n",
+ " pivot_longer(!color, names_to = \"features\", values_to = \"values\")\r\n",
+ "\r\n",
+ "\r\n",
+ "# Print out restructured data\r\n",
+ "baked_pumpkins_long %>% \r\n",
+ " slice_head(n = 10)\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "putq8DagltUQ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now, let's make some boxplots showing the distribution of the predictors with respect to the outcome color."
+ ],
+ "metadata": {
+ "id": "-RHm-12zlt-B"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "theme_set(theme_light())\r\n",
+ "#Make a box plot for each predictor feature\r\n",
+ "baked_pumpkins_long %>% \r\n",
+ " mutate(color = factor(color)) %>% \r\n",
+ " ggplot(mapping = aes(x = color, y = values, fill = features)) +\r\n",
+ " geom_boxplot() + \r\n",
+ " facet_wrap(~ features, scales = \"free\", ncol = 3) +\r\n",
+ " scale_color_viridis_d(option = \"cividis\", end = .8) +\r\n",
+ " theme(legend.position = \"none\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "3Py4i1p1l3hP"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Amazing🤩! For some of the features, there's a noticeable difference in the distribution for each color label. For instance, it seems the white pumpkins can be found in smaller packages and in some particular varieties of pumpkins. The *item_size* category also seems to make a difference in the color distribution. These features may help predict the color of a pumpkin.\n",
+ "\n",
+ "#### **Use a swarm plot**\n",
+ "\n",
+ "Color is a binary category (Orange or Not), it's called `categorical data`. There are other various ways of [visualizing categorical data](https://seaborn.pydata.org/tutorial/categorical.html?highlight=bar).\n",
+ "\n",
+ "Try a `swarm plot` to show the distribution of color with respect to the item_size.\n",
+ "\n",
+ "We'll use the [ggbeeswarm package](https://github.com/eclarke/ggbeeswarm) which provides methods to create beeswarm-style plots using ggplot2. Beeswarm plots are a way of plotting points that would ordinarily overlap so that they fall next to each other instead."
+ ],
+ "metadata": {
+ "id": "2LSj6_LCl68V"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Create beeswarm plots of color and item_size\r\n",
+ "baked_pumpkins %>% \r\n",
+ " mutate(color = factor(color)) %>% \r\n",
+ " ggplot(mapping = aes(x = color, y = item_size, color = color)) +\r\n",
+ " geom_quasirandom() +\r\n",
+ " scale_color_brewer(palette = \"Dark2\", direction = -1) +\r\n",
+ " theme(legend.position = \"none\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "hGKeRgUemMTb"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### **Violin plot**\n",
+ "\n",
+ "A 'violin' type plot is useful as you can easily visualize the way that data in the two categories is distributed. [`Violin plots`](https://en.wikipedia.org/wiki/Violin_plot) are similar to box plots, except that they also show the probability density of the data at different values. Violin plots don't work so well with smaller datasets as the distribution is displayed more 'smoothly'."
+ ],
+ "metadata": {
+ "id": "_9wdZJH5mOvN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Create a violin plot of color and item_size\r\n",
+ "baked_pumpkins %>%\r\n",
+ " mutate(color = factor(color)) %>% \r\n",
+ " ggplot(mapping = aes(x = color, y = item_size, fill = color)) +\r\n",
+ " geom_violin() +\r\n",
+ " geom_boxplot(color = \"black\", fill = \"white\", width = 0.02) +\r\n",
+ " scale_fill_brewer(palette = \"Dark2\", direction = -1) +\r\n",
+ " theme(legend.position = \"none\")"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "LFFFymujmTAZ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now that we have an idea of the relationship between the binary categories of color and the larger group of sizes, let's explore logistic regression to determine a given pumpkin's likely color.\r\n",
+ "\r\n",
+ "## 3. Build your logistic regression model\r\n",
+ "\r\n",
+ "
\r\n",
+ " \r\n",
+ " Infographic by Dasani Madipalli\r\n",
+ "\r\n",
+ "> **🧮 Show Me The Math**\r\n",
+ ">\r\n",
+ "> Remember how `linear regression` often used `ordinary least squares` to arrive at a value? `Logistic regression` relies on the concept of 'maximum likelihood' using [`sigmoid functions`](https://wikipedia.org/wiki/Sigmoid_function). A Sigmoid Function on a plot looks like an `S shape`. It takes a value and maps it to somewhere between 0 and 1. Its curve is also called a 'logistic curve'. Its formula looks like this:\r\n",
+ ">\r\n",
+ "> \r\n",
+ "
\r\n",
+ " \r\n",
+ "\r\n",
+ "\r\n",
+ "> where the sigmoid's midpoint finds itself at x's 0 point, L is the curve's maximum value, and k is the curve's steepness. If the outcome of the function is more than 0.5, the label in question will be given the class 1 of the binary choice. If not, it will be classified as 0.\r\n",
+ "\r\n",
+ "Let's begin by splitting the data into `training` and `test` sets. The training set is used to train a classifier so that it finds a statistical relationship between the features and the label value.\r\n",
+ "\r\n",
+ "It is best practice to hold out some of your data for **testing** in order to get a better estimate of how your models will perform on new data by comparing the predicted labels with the already known labels in the test set. [rsample](https://rsample.tidymodels.org/), a package in Tidymodels, provides infrastructure for efficient data splitting and resampling:"
+ ],
+ "metadata": {
+ "id": "RA_bnMS9mVo8"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Split data into 80% for training and 20% for testing\r\n",
+ "set.seed(2056)\r\n",
+ "pumpkins_split <- pumpkins_select %>% \r\n",
+ " initial_split(prop = 0.8)\r\n",
+ "\r\n",
+ "# Extract the data in each split\r\n",
+ "pumpkins_train <- training(pumpkins_split)\r\n",
+ "pumpkins_test <- testing(pumpkins_split)\r\n",
+ "\r\n",
+ "# Print out the first 5 rows of the training set\r\n",
+ "pumpkins_train %>% \r\n",
+ " slice_head(n = 5)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "PQdpEYYPmdGW"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "🙌 We are now ready to train a model by fitting the training features to the training label (color).\n",
+ "\n",
+ "We'll begin by creating a recipe that specifies the preprocessing steps that should be carried out on our data to get it ready for modelling i.e: encoding categorical variables into a set of integers.\n",
+ "\n",
+ "There are quite a number of ways to specify a logistic regression model in Tidymodels. See `?logistic_reg()` For now, we'll specify a logistic regression model via the default `stats::glm()` engine."
+ ],
+ "metadata": {
+ "id": "MX9LipSimhn0"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Create a recipe that specifies preprocessing steps for modelling\r\n",
+ "pumpkins_recipe <- recipe(color ~ ., data = pumpkins_train) %>% \r\n",
+ " step_integer(all_predictors(), zero_based = TRUE)\r\n",
+ "\r\n",
+ "\r\n",
+ "# Create a logistic model specification\r\n",
+ "log_reg <- logistic_reg() %>% \r\n",
+ " set_engine(\"glm\") %>% \r\n",
+ " set_mode(\"classification\")\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "0Eo5-SbSmm2-"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Now that we have a recipe and a model specification, we need to find a way of bundling them together into an object that will first preprocess the data (prep+bake behind the scenes), fit the model on the preprocessed data and also allow for potential post-processing activities.\n",
+ "\n",
+ "In Tidymodels, this convenient object is called a [`workflow`](https://workflows.tidymodels.org/) and conveniently holds your modeling components."
+ ],
+ "metadata": {
+ "id": "G599GKhXmqWf"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Bundle modelling components in a workflow\r\n",
+ "log_reg_wf <- workflow() %>% \r\n",
+ " add_recipe(pumpkins_recipe) %>% \r\n",
+ " add_model(log_reg)\r\n",
+ "\r\n",
+ "# Print out the workflow\r\n",
+ "log_reg_wf\r\n"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "cRoU0tpbmu1T"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "After a workflow has been *specified*, a model can be `trained` using the [`fit()`](https://tidymodels.github.io/parsnip/reference/fit.html) function. The workflow will estimate a recipe and preprocess the data before training, so we won't have to manually do that using prep and bake."
+ ],
+ "metadata": {
+ "id": "JnRXKmREnEpd"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Train the model\r\n",
+ "wf_fit <- log_reg_wf %>% \r\n",
+ " fit(data = pumpkins_train)\r\n",
+ "\r\n",
+ "# Print the trained workflow\r\n",
+ "wf_fit"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "ehFwfkjWnNCb"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The model print out shows the coefficients learned during training.\n",
+ "\n",
+ "Now we've trained the model using the training data, we can make predictions on the test data using [parsnip::predict()](https://parsnip.tidymodels.org/reference/predict.model_fit.html). Let's start by using the model to predict labels for our test set and the probabilities for each label. When the probability is more than 0.5, the predict class is `ORANGE` else `WHITE`."
+ ],
+ "metadata": {
+ "id": "w01dGNZjnOJQ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make predictions for color and corresponding probabilities\r\n",
+ "results <- pumpkins_test %>% select(color) %>% \r\n",
+ " bind_cols(wf_fit %>% \r\n",
+ " predict(new_data = pumpkins_test)) %>%\r\n",
+ " bind_cols(wf_fit %>%\r\n",
+ " predict(new_data = pumpkins_test, type = \"prob\"))\r\n",
+ "\r\n",
+ "# Compare predictions\r\n",
+ "results %>% \r\n",
+ " slice_head(n = 10)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "K8PNjPfTnak2"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Very nice! This provides some more insights into how logistic regression works.\n",
+ "\n",
+ "Comparing each prediction with its corresponding \"ground truth\" actual value isn't a very efficient way to determine how well the model is predicting. Fortunately, Tidymodels has a few more tricks up its sleeve: [`yardstick`](https://yardstick.tidymodels.org/) - a package used to measure the effectiveness of models using performance metrics.\n",
+ "\n",
+ "One performance metric associated with classification problems is the [`confusion matrix`](https://wikipedia.org/wiki/Confusion_matrix). A confusion matrix describes how well a classification model performs. A confusion matrix tabulates how many examples in each class were correctly classified by a model. In our case, it will show you how many orange pumpkins were classified as orange and how many white pumpkins were classified as white; the confusion matrix also shows you how many were classified into the **wrong** categories.\n",
+ "\n",
+ "The [**`conf_mat()`**](https://tidymodels.github.io/yardstick/reference/conf_mat.html) function from yardstick calculates this cross-tabulation of observed and predicted classes."
+ ],
+ "metadata": {
+ "id": "N3J-yW0wngKo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Confusion matrix for prediction results\r\n",
+ "conf_mat(data = results, truth = color, estimate = .pred_class)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "0RD77Dq1nl2j"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Let's interpret the confusion matrix. Our model is asked to classify pumpkins between two binary categories, category `orange` and category `not-orange`\n",
+ "\n",
+ "- If your model predicts a pumpkin as orange and it belongs to category 'orange' in reality we call it a `true positive`, shown by the top left number.\n",
+ "\n",
+ "- If your model predicts a pumpkin as not orange and it belongs to category 'orange' in reality we call it a `false negative`, shown by the bottom left number.\n",
+ "\n",
+ "- If your model predicts a pumpkin as orange and it belongs to category 'not-orange' in reality we call it a `false positive`, shown by the top right number.\n",
+ "\n",
+ "- If your model predicts a pumpkin as not orange and it belongs to category 'not-orange' in reality we call it a `true negative`, shown by the bottom right number.\n",
+ "\n",
+ "\n",
+ "| **Truth** |\n",
+ "|:-----:|\n",
+ "\n",
+ "\n",
+ "| | | |\n",
+ "|---------------|--------|-------|\n",
+ "| **Predicted** | ORANGE | WHITE |\n",
+ "| ORANGE | TP | FP |\n",
+ "| WHITE | FN | TN |"
+ ],
+ "metadata": {
+ "id": "H61sFwdOnoiO"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "As you might have guessed it's preferable to have a larger number of true positives and true negatives and a lower number of false positives and false negatives, which implies that the model performs better.\n",
+ "\n",
+ "The confusion matrix is helpful since it gives rise to other metrics that can help us better evaluate the performance of a classification model. Let's go through some of them:\n",
+ "\n",
+ "🎓 Precision: `TP/(TP + FP)` defined as the proportion of predicted positives that are actually positive. Also called [positive predictive value](https://en.wikipedia.org/wiki/Positive_predictive_value \"Positive predictive value\")\n",
+ "\n",
+ "🎓 Recall: `TP/(TP + FN)` defined as the proportion of positive results out of the number of samples which were actually positive. Also known as `sensitivity`.\n",
+ "\n",
+ "🎓 Specificity: `TN/(TN + FP)` defined as the proportion of negative results out of the number of samples which were actually negative.\n",
+ "\n",
+ "🎓 Accuracy: `TP + TN/(TP + TN + FP + FN)` The percentage of labels predicted accurately for a sample.\n",
+ "\n",
+ "🎓 F Measure: A weighted average of the precision and recall, with best being 1 and worst being 0.\n",
+ "\n",
+ "Let's calculate these metrics!"
+ ],
+ "metadata": {
+ "id": "Yc6QUie2oQUr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Combine metric functions and calculate them all at once\r\n",
+ "eval_metrics <- metric_set(ppv, recall, spec, f_meas, accuracy)\r\n",
+ "eval_metrics(data = results, truth = color, estimate = .pred_class)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "p6rXx_T3oVxX"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### **Visualize the ROC curve of this model**\n",
+ "\n",
+ "For a start, this is not a bad model; its precision, recall, F measure and accuracy are in the 80% range so ideally you could use it to predict the color of a pumpkin given a set of variables. It also seems that our model was not really able to identify the white pumpkins 🧐. Could you guess why? One reason could be because of the high prevalence of ORANGE pumpkins in our training set making our model more inclined to predict the majority class.\n",
+ "\n",
+ "Let's do one more visualization to see the so-called [`ROC score`](https://en.wikipedia.org/wiki/Receiver_operating_characteristic):"
+ ],
+ "metadata": {
+ "id": "JcenzZo1oaKR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Make a roc_curve\r\n",
+ "results %>% \r\n",
+ " roc_curve(color, .pred_ORANGE) %>% \r\n",
+ " autoplot()"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "BcmkHHHwogRB"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "ROC curves are often used to get a view of the output of a classifier in terms of its true vs. false positives. ROC curves typically feature `True Positive Rate`/Sensitivity on the Y axis, and `False Positive Rate`/1-Specificity on the X axis. Thus, the steepness of the curve and the space between the midpoint line and the curve matter: you want a curve that quickly heads up and over the line. In our case, there are false positives to start with, and then the line heads up and over properly.\n",
+ "\n",
+ "Finally, let's use `yardstick::roc_auc()` to calculate the actual Area Under the Curve. One way of interpreting AUC is as the probability that the model ranks a random positive example more highly than a random negative example."
+ ],
+ "metadata": {
+ "id": "P_an3vc1oqjI"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "source": [
+ "# Calculate area under curve\r\n",
+ "results %>% \r\n",
+ " roc_auc(color, .pred_ORANGE)"
+ ],
+ "outputs": [],
+ "metadata": {
+ "id": "SZyy5BT8ovew"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The result is around `0.67053`. Given that the AUC ranges from 0 to 1, you want a big score, since a model that is 100% correct in its predictions will have an AUC of 1; in this case, the model is *pretty good*.\r\n",
+ "\r\n",
+ "In future lessons on classifications, you will learn how to improve your model's scores (such as dealing with imbalanced data in this case).\r\n",
+ "\r\n",
+ "But for now, congratulations 🎉🎉🎉! You've completed these regression lessons!\r\n",
+ "\r\n",
+ "You R awesome!\r\n",
+ "\r\n",
+ "
\r\n",
+ " \r\n",
+ " Artwork by @allison_horst\r\n",
+ "\r\n",
+ "\r\n"
+ ],
+ "metadata": {
+ "id": "5jtVKLTVoy6u"
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/2-Regression/4-Logistic/solution/lesson_4.Rmd b/2-Regression/4-Logistic/solution/lesson_4.Rmd
new file mode 100644
index 0000000000..e974594d1d
--- /dev/null
+++ b/2-Regression/4-Logistic/solution/lesson_4.Rmd
@@ -0,0 +1,430 @@
+---
+title: 'Build a regression model: logistic regression'
+output:
+ html_document:
+ df_print: paged
+ theme: flatly
+ highlight: breezedark
+ toc: yes
+ toc_float: yes
+ code_download: yes
+---
+
+## Build a logistic regression model - Lesson 4
+
+![Infographic by Dasani Madipalli](../images/logistic-linear.png){width="600"}
+
+#### **[Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)**
+
+#### Introduction
+
+In this final lesson on Regression, one of the basic *classic* ML techniques, we will take a look at Logistic Regression. You would use this technique to discover patterns to predict `binary` `categories`. Is this candy chocolate or not? Is this disease contagious or not? Will this customer choose this product or not?
+
+In this lesson, you will learn:
+
+- Techniques for logistic regression
+
+✅ Deepen your understanding of working with this type of regression in this [Learn module](https://docs.microsoft.com/learn/modules/train-evaluate-classification-models?WT.mc_id=academic-15963-cxa)
+
+#### **Prerequisite**
+
+Having worked with the pumpkin data, we are now familiar enough with it to realize that there's one binary category that we can work with: `Color`.
+
+Let's build a logistic regression model to predict that, given some variables, *what color a given pumpkin is likely to be* (orange 🎃 or white 👻).
+
+> Why are we talking about binary classification in a lesson grouping about regression? Only for linguistic convenience, as logistic regression is [really a classification method](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression), albeit a linear-based one. Learn about other ways to classify data in the next lesson group.
+
+For this lesson, we'll require the following packages:
+
+- `tidyverse`: The [tidyverse](https://www.tidyverse.org/) is a [collection of R packages](https://www.tidyverse.org/packages) designed to make data science faster, easier and more fun!
+
+- `tidymodels`: The [tidymodels](https://www.tidymodels.org/) framework is a [collection of packages](https://www.tidymodels.org/packages/) for modeling and machine learning.
+
+- `janitor`: The [janitor package](https://github.com/sfirke/janitor) provides simple little tools for examining and cleaning dirty data.
+
+- `ggbeeswarm`: The [ggbeeswarm package](https://github.com/eclarke/ggbeeswarm) provides methods to create beeswarm-style plots using ggplot2.
+
+You can install them with:
+
+`install.packages(c("tidyverse", "tidymodels", "janitor", "ggbeeswarm"))`
+
+Alternatively, the script below checks whether you have the packages required to complete this module and installs them for you in case they are missing.
+
+```{r, message=F, warning=F}
+suppressWarnings(if (!require("pacman"))install.packages("pacman"))
+
+pacman::p_load(tidyverse, tidymodels, janitor, ggbeeswarm)
+```
+
+## **Define the question**
+
+For our purposes, we will express this as a binary: 'Orange' or 'Not Orange'. There is also a 'striped' category in our dataset but there are few instances of it, so we will not use it. It disappears once we remove null values from the dataset, anyway.
+
+> 🎃 Fun fact, we sometimes call white pumpkins 'ghost' pumpkins. They aren't very easy to carve, so they aren't as popular as the orange ones but they are cool looking!
+
+## **About logistic regression**
+
+Logistic regression differs from linear regression, which you learned about previously, in a few important ways.
+
+#### **Binary classification**
+
+Logistic regression does not offer the same features as linear regression. The former offers a prediction about a `binary category` ("orange or not orange") whereas the latter is capable of predicting `continuous values`, for example given the origin of a pumpkin and the time of harvest, *how much its price will rise*.
+
+![Infographic by Dasani Madipalli](../images/pumpkin-classifier.png){width="600"}
+
+#### **Other classifications**
+
+There are other types of logistic regression, including multinomial and ordinal:
+
+- **Multinomial**, which involves having more than one category - "Orange, White, and Striped".
+
+- **Ordinal**, which involves ordered categories, useful if we wanted to order our outcomes logically, like our pumpkins that are ordered by a finite number of sizes (mini,sm,med,lg,xl,xxl).
+
+![Infographic by Dasani Madipalli](../images/multinomial-ordinal.png){width="600"}
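+
+If you're curious how one of these variants would be declared in Tidymodels, here is a minimal, optional sketch of a multinomial specification. It is not used anywhere in this lesson (the binary `logistic_reg()` version appears in section 3); the `"nnet"` engine is parsnip's default for `multinom_reg()` and ships with most R installations.
+
+```{r multinomial_sketch}
+# Optional sketch: a multinomial model specification, shown only for comparison
+library(parsnip)
+
+multinom_spec <- multinom_reg() %>%
+  set_engine("nnet") %>%
+  set_mode("classification")
+
+multinom_spec
+```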
+
+#### **It's still linear**
+
+Even though this type of Regression is all about 'category predictions', it still works best when there is a clear linear relationship between the dependent variable (color) and the other independent variables (the rest of the dataset, like city name and size). It's good to get an idea of whether there is any linearity dividing these variables or not.
+
+#### **Variables DO NOT have to correlate**
+
+Remember how linear regression worked better with more correlated variables? Logistic regression is the opposite - the variables don't have to align. That works for this data which has somewhat weak correlations.
+
+#### **You need a lot of clean data**
+
+Logistic regression will give more accurate results if you use more data; our small dataset is not optimal for this task, so keep that in mind.
+
+✅ Think about the types of data that would lend themselves well to logistic regression
+
+## 1. Tidy the data
+
+Now, the fun begins! Let's start by importing the data, cleaning the data a bit, dropping rows containing missing values and selecting only some of the columns:
+
+```{r, tidyr, message=F, warning=F}
+# Load the core tidyverse packages
+library(tidyverse)
+
+# Import the data and clean column names
+pumpkins <- read_csv(file = "https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv") %>%
+ clean_names()
+
+# Select desired columns
+pumpkins_select <- pumpkins %>%
+ select(c(city_name, package, variety, origin, item_size, color))
+
+# Drop rows containing missing values and encode color as factor (category)
+pumpkins_select <- pumpkins_select %>%
+ drop_na() %>%
+ mutate(color = factor(color))
+
+# View the first few rows
+pumpkins_select %>%
+ slice_head(n = 5)
+
+```
+
+Sometimes, we may want a little more information about our data. We can have a look at the `data`, `its structure` and the `data type` of its features by using the [*glimpse()*](https://pillar.r-lib.org/reference/glimpse.html) function as below:
+
+```{r glimpse}
+pumpkins_select %>%
+ glimpse()
+```
+
+Wow! It seems that all our columns are of type *character*, suggesting that they are all categorical.
+
+Let's confirm that we will actually be doing a binary classification problem:
+
+```{r distinct color}
+# Subset distinct observations in outcome column
+pumpkins_select %>%
+ distinct(color)
+
+```
+
+🥳🥳 That went down well!
+
+## 2. Explore the data
+
+The goal of data exploration is to try to understand the `relationships` between its attributes; in particular, any apparent correlation between the *features* and the *label* your model will try to predict. One way of doing this is by using data visualization.
+
+Given the data types of our columns, we can `encode` them and be on our way to making some visualizations. This simply involves `translating` a column with `categorical values`, for example our columns of type *char*, into one or more `numeric columns` that take the place of the original - something we did in our [last lesson](https://github.com/microsoft/ML-For-Beginners/blob/main/2-Regression/3-Linear/solution/lesson_3-R.ipynb).
+
+Tidymodels provides yet another neat package: [recipes](https://recipes.tidymodels.org/) - a package for preprocessing data. We'll define a `recipe` that specifies that all predictor columns should be encoded into a set of integers, `prep` it to estimate the quantities and statistics required by any operations, and finally `bake` to apply the computations to new data.
+
+> Normally, recipes is used as a preprocessor for modelling, where it defines the steps that should be applied to a data set to get it ready for modelling. In that case it is **highly recommended** that you use a `workflow()` instead of manually estimating a recipe using prep and bake. We'll see all this in just a moment.
+>
+> However, for now, we are using recipes + prep + bake to specify the steps that should be applied to a data set to get it ready for data analysis, and then extract the preprocessed data with those steps applied.
+
+```{r recipe_prep_bake}
+# Preprocess and extract data to allow some data analysis
+baked_pumpkins <- recipe(color ~ ., data = pumpkins_select) %>%
+ # Encode all columns to a set of integers
+ step_integer(all_predictors(), zero_based = T) %>%
+ prep() %>%
+ bake(new_data = NULL)
+
+
+# Display the first few rows of preprocessed data
+baked_pumpkins %>%
+ slice_head(n = 5)
+
+```
+
+Now let's compare the feature distributions for each label value using box plots. We'll begin by formatting the data to a *long* format to make it somewhat easier to make multiple `facets`.
+
+```{r pivot}
+# Pivot data to long format
+baked_pumpkins_long <- baked_pumpkins %>%
+ pivot_longer(!color, names_to = "features", values_to = "values")
+
+
+# Print out restructured data
+baked_pumpkins_long %>%
+ slice_head(n = 10)
+
+```
+
+
+Now, let's make some boxplots showing the distribution of the predictors with respect to the outcome color!
+
+```{r boxplots}
+theme_set(theme_light())
+#Make a box plot for each predictor feature
+baked_pumpkins_long %>%
+ mutate(color = factor(color)) %>%
+ ggplot(mapping = aes(x = color, y = values, fill = features)) +
+ geom_boxplot() +
+ facet_wrap(~ features, scales = "free", ncol = 3) +
+ scale_color_viridis_d(option = "cividis", end = .8) +
+ theme(legend.position = "none")
+```
+
+Amazing🤩! For some of the features, there's a noticeable difference in the distribution for each color label. For instance, it seems the white pumpkins can be found in smaller packages and in some particular varieties of pumpkins. The *item_size* category also seems to make a difference in the color distribution. These features may help predict the color of a pumpkin.
+
+#### **Use a swarm plot**
+
+Color is a binary category (Orange or Not), so it's called `categorical data`. There are various other ways of [visualizing categorical data](https://seaborn.pydata.org/tutorial/categorical.html?highlight=bar).
+
+Try a `swarm plot` to show the distribution of color with respect to the item_size.
+
+We'll use the [ggbeeswarm package](https://github.com/eclarke/ggbeeswarm) which provides methods to create beeswarm-style plots using ggplot2. Beeswarm plots are a way of plotting points that would ordinarily overlap so that they fall next to each other instead.
+
+```{r bee_swarm plot}
+# Create beeswarm plots of color and item_size
+baked_pumpkins %>%
+ mutate(color = factor(color)) %>%
+ ggplot(mapping = aes(x = color, y = item_size, color = color)) +
+ geom_quasirandom() +
+ scale_color_brewer(palette = "Dark2", direction = -1) +
+ theme(legend.position = "none")
+```
+
+#### **Violin plot**
+
+A 'violin' type plot is useful as you can easily visualize the way that data in the two categories is distributed. [`Violin plots`](https://en.wikipedia.org/wiki/Violin_plot) are similar to box plots, except that they also show the probability density of the data at different values. Violin plots don't work so well with smaller datasets as the distribution is displayed more 'smoothly'.
+
+```{r violin_plot}
+# Create a violin plot of color and item_size
+baked_pumpkins %>%
+ mutate(color = factor(color)) %>%
+ ggplot(mapping = aes(x = color, y = item_size, fill = color)) +
+ geom_violin() +
+ geom_boxplot(color = "black", fill = "white", width = 0.02) +
+ scale_fill_brewer(palette = "Dark2", direction = -1) +
+ theme(legend.position = "none")
+
+```
+
+Now that we have an idea of the relationship between the binary categories of color and the larger group of sizes, let's explore logistic regression to determine a given pumpkin's likely color.
+
+## 3. Build your model
+
+> **🧮 Show Me The Math**
+>
+> Remember how `linear regression` often used `ordinary least squares` to arrive at a value? `Logistic regression` relies on the concept of 'maximum likelihood' using [`sigmoid functions`](https://wikipedia.org/wiki/Sigmoid_function). A Sigmoid Function on a plot looks like an `S shape`. It takes a value and maps it to somewhere between 0 and 1. Its curve is also called a 'logistic curve'. Its formula looks like this:
+>
+> ![](../images/sigmoid.png)
+>
+> where the sigmoid's midpoint finds itself at x's 0 point, L is the curve's maximum value, and k is the curve's steepness. If the outcome of the function is more than 0.5, the label in question will be given the class 1 of the binary choice. If not, it will be classified as 0.
+
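+As a quick illustration of that S shape, here is a small sketch (not part of the lesson's solution code) that plots the standard logistic function `1 / (1 + exp(-x))`, i.e. the special case with L = 1, k = 1 and the midpoint at x = 0:
+
+```{r sigmoid_sketch}
+# Plot the standard logistic (sigmoid) curve: every input is mapped into (0, 1)
+sigmoid <- function(x) 1 / (1 + exp(-x))
+
+sigmoid_data <- tibble(x = seq(-6, 6, length.out = 200), y = sigmoid(x))
+
+ggplot(sigmoid_data, aes(x = x, y = y)) +
+  geom_line() +
+  geom_hline(yintercept = 0.5, linetype = "dashed") +
+  labs(y = "sigmoid(x)", title = "Outputs above 0.5 map to class 1, the rest to class 0")
+```
+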
+Let's begin by splitting the data into `training` and `test` sets. The training set is used to train a classifier so that it finds a statistical relationship between the features and the label value.
+
+It is best practice to hold out some of your data for **testing** in order to get a better estimate of how your models will perform on new data by comparing the predicted labels with the already known labels in the test set. [rsample](https://rsample.tidymodels.org/), a package in Tidymodels, provides infrastructure for efficient data splitting and resampling:
+
+```{r split_data}
+# Split data into 80% for training and 20% for testing
+set.seed(2056)
+pumpkins_split <- pumpkins_select %>%
+ initial_split(prop = 0.8)
+
+# Extract the data in each split
+pumpkins_train <- training(pumpkins_split)
+pumpkins_test <- testing(pumpkins_split)
+
+# Print out the first 5 rows of the training set
+pumpkins_train %>%
+ slice_head(n = 5)
+
+
+```
+
+🙌 We are now ready to train a model by fitting the training features to the training label (color).
+
+We'll begin by creating a recipe that specifies the preprocessing steps that should be carried out on our data to get it ready for modelling, i.e. encoding categorical variables into a set of integers.
+
+There are quite a number of ways to specify a logistic regression model in Tidymodels. See `?logistic_reg()`. For now, we'll specify a logistic regression model via the default `stats::glm()` engine.
+
+```{r log_reg}
+# Create a recipe that specifies preprocessing steps for modelling
+pumpkins_recipe <- recipe(color ~ ., data = pumpkins_train) %>%
+ step_integer(all_predictors(), zero_based = TRUE)
+
+
+# Create a logistic model specification
+log_reg <- logistic_reg() %>%
+ set_engine("glm") %>%
+ set_mode("classification")
+
+
+```
+
+Now that we have a recipe and a model specification, we need to find a way of bundling them together into an object that will first preprocess the data (prep+bake behind the scenes), fit the model on the preprocessed data and also allow for potential post-processing activities.
+
+In Tidymodels, this object is called a [`workflow`](https://workflows.tidymodels.org/), and it conveniently holds your modeling components.
+
+```{r workflow}
+# Bundle modelling components in a workflow
+log_reg_wf <- workflow() %>%
+ add_recipe(pumpkins_recipe) %>%
+ add_model(log_reg)
+
+# Print out the workflow
+log_reg_wf
+
+
+```
+
+After a workflow has been *specified*, a model can be `trained` using the [`fit()`](https://tidymodels.github.io/parsnip/reference/fit.html) function. The workflow will estimate a recipe and preprocess the data before training, so we won't have to manually do that using prep and bake.
+
+```{r train}
+# Train the model
+wf_fit <- log_reg_wf %>%
+ fit(data = pumpkins_train)
+
+# Print the trained workflow
+wf_fit
+
+```
+
+The model print out shows the coefficients learned during training.
+
+Now that we've trained the model using the training data, we can make predictions on the test data using [parsnip::predict()](https://parsnip.tidymodels.org/reference/predict.model_fit.html). Let's start by using the model to predict labels for our test set and the probabilities for each label. When the probability is more than 0.5, the predicted class is `ORANGE`; otherwise it is `WHITE`.
+
+```{r test_pred}
+# Make predictions for color and corresponding probabilities
+results <- pumpkins_test %>% select(color) %>%
+ bind_cols(wf_fit %>%
+ predict(new_data = pumpkins_test)) %>%
+ bind_cols(wf_fit %>%
+ predict(new_data = pumpkins_test, type = "prob"))
+
+# Compare predictions
+results %>%
+ slice_head(n = 10)
+
+```
+
+Very nice! This provides some more insights into how logistic regression works.
+
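+As a small sanity check of the 0.5 threshold mentioned above (this sketch is not part of the original lesson), we can rebuild the class label from the `.pred_ORANGE` probability column and confirm it agrees with the `.pred_class` column parsnip produced:
+
+```{r threshold_sketch}
+# Recreate the predicted class from the probability column and compare with .pred_class
+results %>%
+  mutate(manual_class = if_else(.pred_ORANGE > 0.5, "ORANGE", "WHITE")) %>%
+  count(.pred_class, manual_class)
+```
+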
+Comparing each prediction with its corresponding "ground truth" actual value isn't a very efficient way to determine how well the model is predicting. Fortunately, Tidymodels has a few more tricks up its sleeve: [`yardstick`](https://yardstick.tidymodels.org/) - a package used to measure the effectiveness of models using performance metrics.
+
+One performance metric associated with classification problems is the [`confusion matrix`](https://wikipedia.org/wiki/Confusion_matrix). A confusion matrix describes how well a classification model performs by tabulating how many examples in each class were correctly classified by the model. In our case, it will show you how many orange pumpkins were classified as orange and how many white pumpkins were classified as white; it also shows you how many were classified into the **wrong** categories.
+
+The [**`conf_mat()`**](https://tidymodels.github.io/yardstick/reference/conf_mat.html) function from yardstick calculates this cross-tabulation of observed and predicted classes.
+
+```{r conf_mat}
+# Confusion matrix for prediction results
+conf_mat(data = results, truth = color, estimate = .pred_class)
+
+
+```
+
+Let's interpret the confusion matrix. Our model is asked to classify pumpkins between two binary categories, category `orange` and category `not-orange`.
+
+- If your model predicts a pumpkin as orange and it belongs to category 'orange' in reality we call it a `true positive`, shown by the top left number.
+
+- If your model predicts a pumpkin as not orange and it belongs to category 'orange' in reality we call it a `false negative`, shown by the bottom left number.
+
+- If your model predicts a pumpkin as orange and it belongs to category 'not-orange' in reality we call it a `false positive`, shown by the top right number.
+
+- If your model predicts a pumpkin as not orange and it belongs to category 'not-orange' in reality we call it a `true negative`, shown by the bottom right number.
+
+|                       | **Truth: ORANGE** | **Truth: WHITE** |
+|-----------------------|:-----------------:|:----------------:|
+| **Predicted: ORANGE** | TP                | FP               |
+| **Predicted: WHITE**  | FN                | TN               |
+
+As you might have guessed, it's preferable to have a larger number of true positives and true negatives and a lower number of false positives and false negatives, which implies that the model performs better.
+
+The confusion matrix is helpful since it gives rise to other metrics that can help us better evaluate the performance of a classification model. Let's go through some of them:
+
+🎓 Precision: `TP/(TP + FP)` defined as the proportion of predicted positives that are actually positive. Also called [positive predictive value](https://en.wikipedia.org/wiki/Positive_predictive_value "Positive predictive value")
+
+🎓 Recall: `TP/(TP + FN)` defined as the proportion of positive results out of the number of samples which were actually positive. Also known as `sensitivity`.
+
+🎓 Specificity: `TN/(TN + FP)` defined as the proportion of negative results out of the number of samples which were actually negative.
+
+🎓 Accuracy: `(TP + TN)/(TP + TN + FP + FN)` The percentage of labels predicted accurately for a sample.
+
+🎓 F Measure: A weighted average of the precision and recall, with best being 1 and worst being 0.
+
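+To make these formulas concrete before computing the real values with yardstick below, here is a tiny hand calculation using made-up counts (the numbers are hypothetical, not the model's):
+
+```{r metrics_by_hand}
+# Illustrative only: plug hypothetical confusion-matrix counts into the formulas above
+tp <- 150; fp <- 25; fn <- 10; tn <- 14
+
+precision   <- tp / (tp + fp)
+recall      <- tp / (tp + fn)
+specificity <- tn / (tn + fp)
+accuracy    <- (tp + tn) / (tp + tn + fp + fn)
+f_measure   <- 2 * precision * recall / (precision + recall)
+
+tibble(precision, recall, specificity, accuracy, f_measure)
+```
+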
+Let's calculate these metrics!
+
+```{r metric_set}
+# Combine metric functions and calculate them all at once
+eval_metrics <- metric_set(ppv, recall, spec, f_meas, accuracy)
+eval_metrics(data = results, truth = color, estimate = .pred_class)
+```
+
+#### **Visualize the ROC curve of this model**
+
+For a start, this is not a bad model; its precision, recall, F measure and accuracy are in the 80% range, so ideally you could use it to predict the color of a pumpkin given a set of variables. It also seems that our model was not really able to identify the white pumpkins 🧐. Could you guess why? One reason could be the high prevalence of ORANGE pumpkins in our training set, making our model more inclined to predict the majority class.
+
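+You can check that class balance quickly; this is a small extra sketch, assuming `pumpkins_train` from the split above is still in memory:
+
+```{r class_balance_check}
+# How many pumpkins of each color are in the training data?
+pumpkins_train %>%
+  count(color)
+```
+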
+Let's do one more visualization to see the so-called [`ROC score`](https://en.wikipedia.org/wiki/Receiver_operating_characteristic):
+
+```{r roc_curve}
+# Make a roc_curve
+results %>%
+ roc_curve(color, .pred_ORANGE) %>%
+ autoplot()
+
+```
+
+ROC curves are often used to get a view of the output of a classifier in terms of its true vs. false positives. ROC curves typically feature `True Positive Rate`/Sensitivity on the Y axis, and `False Positive Rate`/1-Specificity on the X axis. Thus, the steepness of the curve and the space between the midpoint line and the curve matter: you want a curve that quickly heads up and over the line. In our case, there are false positives to start with, and then the line heads up and over properly.
+
+Finally, let's use `yardstick::roc_auc()` to calculate the actual Area Under the Curve. One way of interpreting AUC is as the probability that the model ranks a random positive example more highly than a random negative example.
+
+```{r roc_aoc}
+# Calculate area under curve
+results %>%
+ roc_auc(color, .pred_ORANGE)
+
+```
+
+The result is around `0.67053`. Given that the AUC ranges from 0 to 1, you want a big score, since a model that is 100% correct in its predictions will have an AUC of 1; in this case, the model is *pretty good*.
+
+In future lessons on classifications, you will learn how to improve your model's scores (such as dealing with imbalanced data in this case).
+
+But for now, congratulations 🎉🎉🎉! You've completed these regression lessons!
+
+You R awesome!
+
+![Artwork by \@allison_horst](../images/r_learners_sm.jpeg)
+
+
diff --git a/2-Regression/4-Logistic/translations/README.id.md b/2-Regression/4-Logistic/translations/README.id.md
new file mode 100644
index 0000000000..553205d712
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/README.id.md
@@ -0,0 +1,302 @@
+# Regresi logistik untuk memprediksi kategori-kategori
+
+![Infografik regresi logistik vs. linear](../images/logistic-linear.png)
+> Infografik oleh [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [Kuis pra-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)
+
+## Pembukaan
+
+Dalam pelajaran regresi terakhir, salah satu teknik ML _klasik_ dan sederhana adalah regresi logistik. Teknik ini digunakan untuk mengemukakan pola-pola untuk memprediksi kategori binari. Apa ini sebuah permen coklat atau tidak? Apa penyakit ini menular atau tidak? Apa pelanggan ini akan memilih produk ini atau tidak?
+
+Dalam pelajaran ini, kamu akan belajar:
+
+- Sebuah *library* baru untuk pemvisualisasian data
+- Teknik-teknik untuk regresi logistik
+
+✅ Perdalamkan pemahamanmu dalam bekerja dengan regresi jenis ini dalam [modul pembelajaran ini](https://docs.microsoft.com/learn/modules/train-evaluate-classification-models?WT.mc_id=academic-15963-cxa)
+
+## Prasyarat
+
+Setelah bekerja dengan data labu, kita sekarang sudah terbiasa dengannya untuk menyadari bahwa adapula sebuah kategori binari yang kita dapat menggunakan: `Color` (warna).
+
+Mari membangun sebuah model regresi logistik untuk memprediksi _kemungkinannya labu ini warnanya apa_ berdasarkan beberapa variabel (oranye 🎃 atau putih 👻).
+
+> Mengapa kita berbicara tentang klasifikasi binary dalam seri pelajaran tentang regresi? Hanya untuk kemudahan linguistik, regresi logistik juga [sebenarnya sebuah metode klasifikasi](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression), namun satu yang berdasarkan garis linear. Pelajari lebih lanjut tentang cara-cara lain untuk mengklasifikasi data dalam seri pelajaran berikutnya.
+
+## Tentukan pertanyaannya
+
+Untuk keperluan kita, kita akan mengekspresikannya sebagai pilihan binari 'Orange' atau 'Not Orange' (oranye atau bukan oranye). Adapula kategori 'striped' (belang-belang) dalam dataset kita, tetapi tidak banyak titik datanya, jadi kita tidak akan menggunakannya. Lagipula, kategori itu hilang begitu kita buang nilai-nilai nil (null) dari datasetnya.
+
+> 🎃 Tahukah tidak? Kita kadangkali memanggil labu putih labu 'hantu'. Mereka tidak mudah diukir, jadi mereka tidak sepopuler yang oranye pada Halloween. Tetapi mereka keren juga ya!
+
+## Tentang regresi logistik
+
+Regresi logistik berbeda dari regresi linear, jenis regresi yang kamu pelajari sebelumnya, dalam beberapa askpek penting.
+
+### Klasifikasi binari
+
+Regresi logistik tidak mempunyai beberapa fitur regresi linear. Regresi logistik menyediakan sebuah prediksi tentang sebuah kategori binari (seperti "oranye atau bukan oranye"), sedangkan yang lainnya dapat memprediksi nilai-nilai kontinu. Contohnya, dengan mengetahui dari mana labu ini dan kapan dipanennya, regresi linear dapat memprediksi _berapa harganya akan naik_, namun regresi logistik tidak bisa.
+
+![Model klasifikasi labu](../images/pumpkin-classifier.png)
+> Infografik oleh [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+### Klasifikasi lain
+
+Ditambah itu, ada banyak jenis regresi logistik, termasuk jenis multinomial dan ordinal:
+
+- **Multinomial** memperlibatkan lebih dari satu kategori - "Oranye, Putih, dan Belang-belang".
+- **Ordinal** memperlibatkan kategori-kategori berurut. Biasanya berguna jika kita ingin mengurutkan hasil kita secara logikal, seperti labu-labu kita yang diurutkan berdasarkan sejumlah ukuran tertentu (mini,sm,med,lg,xl,xxl).
+
+![Regresi multinomial vs. ordinal](../images/multinomial-ordinal.png)
+> Infografik oleh [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+### Eh, masih linear ya?
+
+Walaupun jenis regresi ini semuanya tentang 'prediksi kategori', jenis ini masih paling efektif jika ada hubungan linear antara variabel dependen (warna) dan independen (sisa *dataset*-nya, seperti kota dan ukuran). Jadi baik juga untuk mencari tahu dahulu apa ada hubungan linear antara variabel-variabel ini.
+
+### Variabel-variabel TIDAK HARUS berkorelasi
+
+Ingat bagaimana regresi linear bekerja lebih baik dengan variabel berkorelasi? Regresi logistik itu kebalikannya: variabel-variabelnya tidak harus berjejer menjadi garis. Artinya, regresi ini bekerja untuk data ini yang korelasinya lumayan lemah.
+
+### Perlu banyak data rapi
+
+Regresi logistik akan memberi hasil lebih akurat jika kamu menggunakan data lebih banyak; *dataset* kecil kita tidak optimal untuk tugas ini, ingatlah itu.
+
+✅ Pikirkan tentang jenis-jenis data yang akan bekerja baik dengan regresi logistik
+
+## Latihan - rapikan data
+
+Pertama, rapikanlah datanya sedikit. Buanglah nilai-nilai nil (null) dan pilihlah beberapa kolom:
+
+1. Tambahlah kode di bawah ini:
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+
+ new_columns = ['Color','Origin','Item Size','Variety','City Name','Package']
+
+ new_pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+
+ new_pumpkins.dropna(inplace=True)
+
+ new_pumpkins = new_pumpkins.apply(LabelEncoder().fit_transform)
+ ```
+
+ Kamu selalu bisa mengintip kedalam *dataframe*-mu:
+
+ ```python
+ new_pumpkins.info
+ ```
+
+### Visualisasi - *grid* berdampingan (*side-by-side grid*)
+
+Sekarang kamu sudah memuat [*notebook* starter](./notebook.ipynb) dengan data labunya sekali lagi dan merapikannya untuk mempertahankan sebuah *dataset* dengan beberapa variabel, termasuk `Color`. Mari memvisualisasi *dataframe*-nya dengan *library* yang beda: [Seaborn](https://seaborn.pydata.org/index.html) yang dibangun di atas Matplotlib yang kita gunakan sebelumnya.
+
+Seaborn menyediakan beberapa cara keren untuk memvisualisasi datamu. Contohnya, kamu bisa membandingkan distribusi datanya untuk setiap titik data dalam sebuah *grid* berdampingan.
+
+1. Buatlah sebuah *grid* dengan meng-*instantiate* sebuah `PairGrid` menggunakan data labu kita `new_pumpkins` diikuti memanggil fungsi `map()`:
+
+ ```python
+ import seaborn as sns
+
+ g = sns.PairGrid(new_pumpkins)
+ g.map(sns.scatterplot)
+ ```
+
+ ![Sebuah visualisasi *grid* data](../images/grid.png)
+
+ Dengan mengobservasi datanya secara berdampingan, kamu bisa lihat bagaimana data warnanya berhubungan dengan kolom-kolom lainnya.
+
+ ✅ Dengan petak sebar ini, pendalaman menarik apa saja yang kamu bisa membayangkan?
+
+### Gunakan sebuah bagan kawanan (*swarm plot*)
+
+Karena warna adalah sebuah kategori binari (oranye atau bukan oranye), warna disebut 'data kategorikal' dan memerlukan 'sebuah [pendekatan khusus](https://seaborn.pydata.org/tutorial/categorical.html?highlight=bar) untuk memvisualisasi'. Ada beberapa cara lain untuk memvisualisasi hubungan antara kategori ini dengan variabel-variabel lainnya.
+
+Kamu bisa memvisualisasikan variabel-variabel secara berdampingan dengan bagan-bagan Seaborn.
+
+1. Cobalah sebuah bagan kawanan untuk menunjukkan distribusi nilai:
+
+ ```python
+ sns.swarmplot(x="Color", y="Item Size", data=new_pumpkins)
+ ```
+
+ ![Sekawanan data yang divisualisasi](../images/swarm.png)
+
+### Bagan biola
+
+Sebuah bagan 'biola' itu berguna sebab kamu bisa memvisualisasi bagaimana data dalam kedua kategori itu terdistribusi dengan mudah. Bagan biola tidak efektif dengan *dataset* yang lebih kecil sebab distribusinya ditampilkan sebagai lebih 'mulus'.
+
+1. Gunakan fungsi `catplot()` dengan parameter `x=Color` dan `kind="violin"`:
+
+ ```python
+ sns.catplot(x="Color", y="Item Size",
+ kind="violin", data=new_pumpkins)
+ ```
+
+ ![sebuah bagan biola](../images/violin.png)
+
+ ✅ Cobalah membuat bagan ini dan jenis-jenis bagan Seaborn lainnya dengan variabel-variabel lainnya.
+
+Sekarang kita sudah dapat bayangan hubungan antara kedua kategori binary warna dan ukuran. Ayo menjelajahi regresi logistik untuk memprediksi warna sebuah labu tertentu.
+
+> **🧮 Perlihatkanlah Matematikanya Kepada Saya**
+>
+> Ingat bagaimana regresi linear seringkali menggunakan metode kuadrat terkecil untuk tiba pada sebuah nilai? Regresi logistik tergantung pada konsep 'kemungkinan terbesar' menggunakan [fungsi sigmoid](https://wikipedia.org/wiki/Sigmoid_function). Sebuah 'fungsi Sigmoid' terlihat seperti huruf 'S' dalam sistem koordinat Kartesius. Fungsi ini mengambil sebuah nilai dan 'mencorongkannya' menjadi sebuah nomor antara 0 dan 1. Kurva ini juga dipanggil sebuah 'kurva logistik'. Formulanya seperti ini:
+>
+> ![Fungsi logistic](../images/sigmoid.png)
+>
+> Titik tengah sigmoidnya terletak di sumbu X. L adalah nilai maksimum kurvanya. k adalah terjalnya kurvanya. Jika hasil fungsinya lebih dari 0.5, nilai yang diberikan kepada fungsi tersebut akan diklasifikasikan sebagai '1'. Kalau tidak, nilai itu akan diklasifikasikan sebagai '0'.
+
+## Bangunlah modelmu
+
+Scikit-learn membuat membangun model klasifikasi binary sangat mudah.
+
+1. Pilihlah variabel-variabel yang kamu ingin gunakan dalam model klasifikasimu dan bagilah datanya menjadi set latihan dan set ujian dengan fungsi `train_test_split()`:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+
+ Selected_features = ['Origin','Item Size','Variety','City Name','Package']
+
+ X = new_pumpkins[Selected_features]
+ y = new_pumpkins['Color']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ ```
+
+2. Sekarang kamu bisa melatihkan modelmu dengan fungsi `fit()` dengan data latihanmu. *Print* hasilnya:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.linear_model import LogisticRegression
+
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+ predictions = model.predict(X_test)
+
+ print(classification_report(y_test, predictions))
+ print('Predicted labels: ', predictions)
+ print('Accuracy: ', accuracy_score(y_test, predictions))
+ ```
+
+ Lihatlah *scoreboard* modelmu. Tidak buruk, apalagi hanya dengan 1000 baris data:
+
+ ```output
+ precision recall f1-score support
+
+ 0 0.85 0.95 0.90 166
+ 1 0.38 0.15 0.22 33
+
+ accuracy 0.82 199
+ macro avg 0.62 0.55 0.56 199
+ weighted avg 0.77 0.82 0.78 199
+
+ Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+ 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 0 0 0 1 0 1 0 0 1 0 0 0 1 0]
+ ```
+
+## Pemahaman lebih baik via sebuah 'matriks kebingungan'
+
+Walaupun kamu bisa membuat sebuah *scoreboard* melaporkan [istilah-istilah](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html?highlight=classification_report#sklearn.metrics.classification_report) dengan mem-*print* yang di atas, kamu mungkin bisa memahami modelmu dengan lebih mudah dengan sebuah [matriks kebingungan](https://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix) untuk membantu kita lebih paham akan performa modelnya.
+
+> 🎓 Sebuah '[matriks kebingungan](https://wikipedia.org/wiki/Confusion_matrix)' (atau 'matriks kesalahan') adalah sebuah tabel yang mengekspresikan positif benar vs. positif palsu modelmu sehingga mengukur akurasi prediksi-prediksinya.
+
+1. Untuk menggunakan sebuah matriks kebingungan, gunakan fungsi `confusion_matrix()`:
+
+ ```python
+ from sklearn.metrics import confusion_matrix
+ confusion_matrix(y_test, predictions)
+ ```
+
+ Lihatlah matriks kebingungan modelmu:
+
+ ```output
+ array([[162, 4],
+ [ 33, 0]])
+ ```
+
+Apa yang sedang terjadi di sini? Mari kita asumsi dulu bahwa model kita ditanyakan untuk mengklasifikasi antara dua kategori binari: 'labu' dan 'bukan labu'.
+
+- Kalau modelmu memprediksi sesuatu sebagai sebuah labu dan memang benar sesuatu itu adalah sebuah labu, itu disebut positif benar yang diindikasi angka di pojok kiri atas.
+- Kalau modelmu memprediksi sesuatu sebagai bukan sebuah labu tetapi sesuatu itu sebenarnya sebuah labu, itu disebut positif palsu yang diindikasi angka di pojok kanan atas.
+- Kalau modelmu memprediksi sesuatu sebagai sebuah labu tetapi sebenarnya bukan sebuah labu, itu disebut negatif palsu yang diindikasi angka di pojok kiri bawah.
+- Kalau modelmu memprediksi sesuatu sebagai bukan sebuah labu dan memang benar sesuatu itu bukan sebuah labu, itu disebut negatif benar yang diindikasi angka di pojok kanan bawah.
+
+Sebagaimana kamu mungkin sudah pikirkan, lebih baik dapat banyak positif benar dan negatif benar dan sedikit positif palsu dan negatif palsu. Implikasinya adalah performa modelnya bagus.
+
+✅ Pertanyaan: Berdasarkan matriks kebingungan, modelnya baik tidak? Jawaban: Tidak buruk; ada banyak positif benar dan sedikit negatif palsu.
+
+Mari kita lihat kembali istilah-istilah yang kita lihat tadi dengan bantuan matriks kebingungan:
+
+> PB: Positif benar
+> PP: Positif palsu
+> NB: Negatif benar
+> NP: Negatif palsu
+
+🎓 Presisi: PB/(PB + PP) Rasio titik data relevan antara semua titik data (seperti data mana yang benar dilabelkannya)
+
+🎓 *Recall*: PB/(PB + NP) Rasio titik data relevan yang digunakan, maupun labelnya benar atau tidak.
+
+🎓 *f1-score*: (2 * Presisi * *Recall*)/(Presisi + *Recall*) Sebuah rata-rata tertimbang antara presisi dan *recall*. 1 itu baik dan 0 itu buruk.
+
+🎓 Dukungan: Jumlah kejadian per label
+
+🎓 Akurasi: (PB + NB)/(PB + PP + NB + NP) Persentase label yang diprediksi dengan benar untuk sebuah sampel.
+
+🎓 Rata-rata Makro: Hitungan rata-rata sederhana (non-tertimbang) metrik setiap label tanpa menghiraukan ketidakseimbangan label.
+
+🎓 Rata-rata Tertimbang: Hitungan rata-rata metrik setiap label dengan mempertimbangkan ketidakseimbangan label. Rata-ratanya tertimbang nilai Dukungan (jumlah kejadian dalam realita) setiap label.
+
+✅ Apa kamu bisa tebak metrik apa yang harus dipantau untuk mengurangi jumlah negatif palsu modelmu?
+
+## Visualisasikan kurva ROC model ini
+
+Ini bukanlah sebuah model buruk. Akurasinya sekitar 80%, jadi sebenarnya bisa digunakan untuk memprediksi warna sebuah labu berdasarkan beberapa variabel.
+
+Mari kita memvisualisasikan datanya sekali lagi untuk melihat nilai ROC ini:
+
+```python
+from sklearn.metrics import roc_curve, roc_auc_score
+
+y_scores = model.predict_proba(X_test)
+# calculate ROC curve
+fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
+sns.lineplot([0, 1], [0, 1])
+sns.lineplot(fpr, tpr)
+```
+Menggunakan Seaborn lagi, gambarlah [Receiving Operating Characteristic](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html?highlight=roc) (ROC) model ini. Kurva ROC seringkali digunakan untuk menunjukkan output sebuah pembuat klasifikasi berdasarkan jumlah positif benar dan positif palsunya. "Kurva ROC biasanya menetapkan persentase positif benar di sumbu Y dan positif palsunya di sumbu X" (diterjemahkan). Maka, terjalnya kurva ini dan ruang antara garis titik tengah dan kurvanya penting: kamu mau sebuah kurva yang naik ke atas garisnya secepat mungkin. Dalam kasus ini, ada positif palsu di awal, terus kurvanya naik di atas garisnya dengan benar:
+
+![ROC](../images/ROC.png)
+
+Akhirnya, gunakanlah [API `roc_auc_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html?highlight=roc_auc#sklearn.metrics.roc_auc_score) Scikit-learn untuk menghitung 'Area Di Bawah Kurva'-nya (ADBK) secara persis:
+
+```python
+auc = roc_auc_score(y_test,y_scores[:,1])
+print(auc)
+```
+Hasilnya adalah `0.6976998904709748`. Mengingat bahwa ADBK itu antara 0 dan 1, lebih besar ADBK-nya lebih baik sebab ADBK model yang 100% benar terus adalah 1; dalam kasus ini, modelnya _lumayan bagus_.
+
+Nanti dalam pelajaran lebih lanjut tentang klasifikasi, kamu akan belajar bagaimana mengulang untuk membuat nilai-nilai modelmu lebih baik. Tetapi sekian dulu. Selamat! Kamu selesai pelajaran-pelajaran regresi ini!
+
+---
+## 🚀 Tantangan
+
+Masih ada banyak tentang regresi logistik! Tetapi cara paling baik adalah untuk bereksperimen. Carilah sebuah *dataset* yang bisa diteliti seperti ini dan bangunlah sebuah model darinya. Apa yang kamu pelajari? Petunjuk: Coba [Kaggle](https://kaggle.com) untuk *dataset-dataset* menarik.
+
+## [Kuis pasca-ceramah](https://white-water-09ec41f0f.azurestaticapps.net/quiz/16/)
+
+## Review & Pembelajaran mandiri
+
+Bacalah beberapa halaman pertama [makalah ini dari Stanford](https://web.stanford.edu/~jurafsky/slp3/5.pdf) tentang beberapa penggunaan praktis regresi logistik. Pikirkan tentang tugas-tugas yang lebih baik untuk suatu jenis regresi atau jenis-jenis lainnya yang kita telah pelajari sampai kini. Apa yang akan bekerja paling baik?
+
+## Tugas
+
+[Coba lagi regresi ini](../assignment.md)
diff --git a/2-Regression/4-Logistic/translations/README.it.md b/2-Regression/4-Logistic/translations/README.it.md
new file mode 100644
index 0000000000..deb51eaa1e
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/README.it.md
@@ -0,0 +1,295 @@
+# Regressione logistica per prevedere le categorie
+
+![Infografica di regressione lineare e logistica](../images/logistic-linear.png)
+> Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)
+
+## Introduzione
+
+In questa lezione finale sulla Regressione, una delle tecniche _classiche_ di base di machine learning, si darà un'occhiata alla Regressione Logistica. Si dovrebbe utilizzare questa tecnica per scoprire modelli per prevedere le categorie binarie. Questa caramella è al cioccolato o no? Questa malattia è contagiosa o no? Questo cliente sceglierà questo prodotto o no?
+
+In questa lezione, si imparerà:
+
+- Una nuova libreria per la visualizzazione dei dati
+- Tecniche per la regressione logistica
+
+✅ Con questo [modulo di apprendimento](https://docs.microsoft.com/learn/modules/train-evaluate-classification-models?WT.mc_id=academic-15963-cxa) si potrà approfondire la comprensione del lavoro con questo tipo di regressione
+## Prerequisito
+
+Avendo lavorato con i dati della zucca, ora si ha abbastanza familiarità con essi per rendersi conto che esiste una categoria binaria con cui è possibile lavorare: `Color` (Colore).
+
+Si costruisce un modello di regressione logistica per prevedere, date alcune variabili, di _che colore sarà probabilmente una data zucca_ (arancione 🎃 o bianca 👻).
+
+> Perché si parla di classificazione binaria in un gruppo di lezioni sulla regressione? Solo per comodità linguistica, poiché la regressione logistica è in [realtà un metodo di classificazione](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression), anche se lineare. Si scopriranno altri modi per classificare i dati nel prossimo gruppo di lezioni.
+
+## Definire la domanda
+
+Allo scopo, verrà espressa come binaria: 'Arancio' o 'Non Arancio'. C'è anche una categoria "striped" (a strisce) nell'insieme di dati, ma ci sono pochi casi, quindi non verrà presa in considerazione. Comunque scompare una volta rimossi i valori null dall'insieme di dati.
+
+> 🎃 Fatto divertente: a volte le zucche bianche vengono chiamate zucche "fantasma". Non sono molto facili da intagliare, quindi non sono così popolari come quelle arancioni, ma hanno un bell'aspetto!
+
+## Informazioni sulla regressione logistica
+
+La regressione logistica differisce dalla regressione lineare, che si è appresa in precedenza, in alcuni importanti modi.
+
+### Classificazione Binaria
+
+La regressione logistica non offre le stesse caratteristiche della regressione lineare. La prima offre una previsione su una categoria binaria ("arancione o non arancione") mentre la seconda è in grado di prevedere valori continui, ad esempio data l'origine di una zucca e il momento del raccolto, di _quanto aumenterà il suo prezzo_.
+
+![Modello di classificazione della zucca](../images/pumpkin-classifier.png)
+> Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+### Altre classificazioni:
+
+Esistono altri tipi di regressione logistica, inclusi multinomiale e ordinale:
+
+- **Multinomiale**, che implica avere più di una categoria: "arancione, bianco e a strisce".
+- **Ordinale**, che coinvolge categorie ordinate, utile se si volessero ordinare i risultati in modo logico, come le zucche che sono ordinate per un numero finito di dimensioni (mini,sm,med,lg,xl,xxl).
+
+![Regressione multinomiale contro ordinale](../images/multinomial-ordinal.png)
+> Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+### È ancora lineare
+
+Anche se questo tipo di Regressione riguarda le "previsioni di categoria", funziona comunque al meglio quando esiste una chiara relazione lineare tra la variabile dipendente (colore) e le altre variabili indipendenti (il resto dell'insieme di dati, come il nome della città e le dimensioni). È bene avere un'idea se c'è qualche linearità che divide queste variabili o meno.
+
+### Le variabili NON devono essere correlate
+
+Si ricorda come la regressione lineare ha funzionato meglio con più variabili correlate? La regressione logistica è l'opposto: le variabili non devono essere allineate. Funziona per questi dati che hanno correlazioni alquanto deboli.
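+
+A titolo puramente illustrativo (questo sketch non fa parte della lezione originale): una volta preparato il dataframe codificato `new_pumpkins` nell'esercizio più avanti, si possono ispezionare rapidamente le correlazioni con pandas:
+
+```python
+# Sketch: matrice di correlazione tra le colonne codificate (si assume che new_pumpkins esista già)
+print(new_pumpkins.corr())
+
+# oppure solo le correlazioni con la colonna Color
+print(new_pumpkins.corr()['Color'].sort_values())
+```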
+
+### Servono molti dati puliti
+
+La regressione logistica fornirà risultati più accurati se si utilizzano più dati; quindi si tenga a mente che, essendo l'insieme di dati sulla zucca piccolo, non è ottimale per questo compito
+
+✅ Si pensi ai tipi di dati che si prestano bene alla regressione logistica
+
+## Esercizio: riordinare i dati
+
+Innanzitutto, si puliscono un po' i dati, eliminando i valori null e selezionando solo alcune delle colonne:
+
+1. Aggiungere il seguente codice:
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+
+ new_columns = ['Color','Origin','Item Size','Variety','City Name','Package']
+
+ new_pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+
+ new_pumpkins.dropna(inplace=True)
+
+ new_pumpkins = new_pumpkins.apply(LabelEncoder().fit_transform)
+ ```
+
+ Si può sempre dare un'occhiata al nuovo dataframe:
+
+ ```python
+ new_pumpkins.info
+ ```
+
+### Visualizzazione - griglia affiancata
+
+A questo punto si è caricato di nuovo il [notebook iniziale](../notebook.ipynb) con i dati della zucca e lo si è pulito in modo da preservare un insieme di dati contenente alcune variabili, incluso `Color`. Si visualizza il dataframe nel notebook utilizzando una libreria diversa: [Seaborn](https://seaborn.pydata.org/index.html), che è costruita su Matplotlib, usata in precedenza.
+
+Seaborn offre alcuni modi accurati per visualizzare i dati. Ad esempio, si possono confrontare le distribuzioni dei dati per ogni punto in una griglia affiancata.
+
+1. Si crea una griglia di questo tipo istanziando `PairGrid`, usando i dati della zucca `new_pumpkins`, poi chiamando `map()`:
+
+ ```python
+ import seaborn as sns
+
+ g = sns.PairGrid(new_pumpkins)
+ g.map(sns.scatterplot)
+ ```
+
+ ![Una griglia di dati visualizzati](../images/grid.png)
+
+ Osservando i dati fianco a fianco, si può vedere come i dati di Color si riferiscono alle altre colonne.
+
+ ✅ Data questa griglia del grafico a dispersione, quali sono alcune esplorazioni interessanti che si possono immaginare?
+
+### Usare un grafico a sciame
+
+Poiché Color è una categoria binaria (arancione o no), viene chiamata "dati categoriali" e richiede "un [approccio più specializzato](https://seaborn.pydata.org/tutorial/categorical.html?highlight=bar) alla visualizzazione". Esistono altri modi per visualizzare la relazione di questa categoria con altre variabili.
+
+È possibile visualizzare le variabili fianco a fianco con i grafici di Seaborn.
+
+1. Si provi un grafico a "sciame" per mostrare la distribuzione dei valori:
+
+ ```python
+ sns.swarmplot(x="Color", y="Item Size", data=new_pumpkins)
+ ```
+
+ ![Uno sciame di dati visualizzati](../images/swarm.png)
+
+### Grafico violino
+
+Un grafico di tipo "violino" è utile in quanto è possibile visualizzare facilmente il modo in cui sono distribuiti i dati nelle due categorie. I grafici di tipo violino non funzionano così bene con insieme di dati più piccoli poiché la distribuzione viene visualizzata in modo più "liscio".
+
+1. Chiamare `catplot()` passando i parametri `x=Color`, `kind="violin"`:
+
+ ```python
+ sns.catplot(x="Color", y="Item Size",
+ kind="violin", data=new_pumpkins)
+ ```
+
+ ![una tabella di un grafico di tipo violino](../images/violin.png)
+
+ ✅ Provare a creare questo grafico e altri grafici Seaborn, utilizzando altre variabili.
+
+Ora che si ha un'idea della relazione tra le categorie binarie di colore e il gruppo più ampio di dimensioni, si esplora la regressione logistica per determinare il probabile colore di una data zucca.
+
+> **🧮 Mostrami la matematica**
+>
+> Si ricorda come la regressione lineare usava spesso i minimi quadrati ordinari per arrivare a un valore? La regressione logistica si basa sul concetto di "massima verosimiglianza" utilizzando [le funzioni sigmoidi](https://wikipedia.org/wiki/Sigmoid_function). Una "Funzione Sigmoide" su un grafico ha l'aspetto di una forma a "S". Prende un valore e lo mappa da qualche parte tra 0 e 1. La sua curva è anche chiamata "curva logistica". La sua formula si presenta così:
+>
+> ![funzione logistica](../images/sigmoid.png)
+>
+> dove il punto medio del sigmoide si trova nel punto 0 di x, L è il valore massimo della curva e k è la pendenza della curva. Se l'esito della funzione è maggiore di 0,5, all'etichetta in questione verrà assegnata la classe '1' della scelta binaria. In caso contrario, sarà classificata come '0'.
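+
+A titolo illustrativo (questo sketch non fa parte della lezione originale), la funzione sigmoide e la soglia a 0,5 si possono provare così in Python:
+
+```python
+import numpy as np
+
+# Sketch: funzione sigmoide "standard" (L=1, k=1, punto medio in x=0)
+def sigmoid(x):
+    return 1 / (1 + np.exp(-x))
+
+for x in [-4, -1, 0, 1, 4]:
+    p = sigmoid(x)
+    print(x, round(p, 3), 'classe 1' if p > 0.5 else 'classe 0')
+```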
+
+## Costruire il modello
+
+Costruire un modello per trovare queste classificazioni binarie è sorprendentemente semplice in Scikit-learn.
+
+1. Si selezionano le variabili da utilizzare nel modello di classificazione e si dividono gli insiemi di training e test chiamando `train_test_split()`:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+
+ Selected_features = ['Origin','Item Size','Variety','City Name','Package']
+
+ X = new_pumpkins[Selected_features]
+ y = new_pumpkins['Color']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ ```
+
+1. Ora si può addestrare il modello, chiamando `fit()` con i dati di addestramento e stamparne il risultato:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.linear_model import LogisticRegression
+
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+ predictions = model.predict(X_test)
+
+ print(classification_report(y_test, predictions))
+ print('Predicted labels: ', predictions)
+ print('Accuracy: ', accuracy_score(y_test, predictions))
+ ```
+
+ Si dia un'occhiata al tabellone segnapunti del modello. Non è male, considerando che si hanno solo circa 1000 righe di dati:
+
+ ```output
+ precision recall f1-score support
+
+ 0 0.85 0.95 0.90 166
+ 1 0.38 0.15 0.22 33
+
+ accuracy 0.82 199
+ macro avg 0.62 0.55 0.56 199
+ weighted avg 0.77 0.82 0.78 199
+
+ Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+ 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 0 0 0 1 0 1 0 0 1 0 0 0 1 0]
+ ```
+
+## Migliore comprensione tramite una matrice di confusione
+
+Sebbene si possano ottenere [i termini](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html?highlight=classification_report#sklearn.metrics.classification_report) del rapporto dei punteggi stampando gli elementi di cui sopra, si potrebbe essere in grado di comprendere più facilmente il modello utilizzando una [matrice di confusione](https://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix) che aiuti a capire come lo stesso sta funzionando.
+
+> 🎓 Una '[matrice di confusione](https://it.wikipedia.org/wiki/Matrice_di_confusione)' (o 'matrice di errore') è una tabella che esprime i veri contro i falsi positivi e negativi del modello, misurando così l'accuratezza delle previsioni.
+
+1. Per utilizzare una matrice di confusione, si chiama `confusion_matrix()`:
+
+ ```python
+ from sklearn.metrics import confusion_matrix
+ confusion_matrix(y_test, predictions)
+ ```
+
+ Si dia un'occhiata alla matrice di confusione del modello:
+
+ ```output
+ array([[162, 4],
+ [ 33, 0]])
+ ```
+
+Cosa sta succedendo qui? Si supponga che al modello venga chiesto di classificare gli elementi tra due categorie binarie, la categoria "zucca" e la categoria "non una zucca".
+
+- Se il modello prevede qualcosa come una zucca e appartiene alla categoria 'zucca' in realtà lo si chiama un vero positivo, mostrato dal numero in alto a sinistra.
+- Se il modello prevede qualcosa come non una zucca e appartiene alla categoria 'zucca' in realtà si chiama falso negativo, mostrato dal numero in alto a destra.
+- Se il modello prevede qualcosa come una zucca e appartiene alla categoria 'non-una-zucca' in realtà si chiama falso positivo, mostrato dal numero in basso a sinistra.
+- Se il modello prevede qualcosa come non una zucca e appartiene alla categoria 'non-una-zucca' in realtà lo si chiama un vero negativo, mostrato dal numero in basso a destra.
+
+Come si sarà intuito, è preferibile avere un numero maggiore di veri positivi e veri negativi e un numero inferiore di falsi positivi e falsi negativi, il che implica che il modello funziona meglio.
+
+✅ Domanda: Secondo la matrice di confusione, come si è comportato il modello? Risposta: Non male; ci sono un buon numero di veri positivi ma anche diversi falsi negativi.
+
+I termini visti in precedenza vengono rivisitati con l'aiuto della mappatura della matrice di confusione di TP/TN e FP/FN:
+
+🎓 Precisione: TP/(TP + FP) La frazione di istanze rilevanti tra le istanze recuperate (ad es. quali etichette erano ben etichettate)
+
+🎓 Richiamo: TP/(TP + FN) La frazione di istanze rilevanti che sono state recuperate, ben etichettate o meno
+
+🎓 f1-score: (2 * precisione * richiamo)/(precisione + richiamo) Una media ponderata della precisione e del richiamo, dove il migliore è 1 e il peggiore è 0
+
+🎓 Supporto: il numero di occorrenze di ciascuna etichetta recuperata
+
+🎓 Accuratezza: (TP + TN)/(TP + TN + FP + FN) La percentuale di etichette prevista accuratamente per un campione.
+
+🎓 Macro Media: il calcolo delle metriche medie non ponderate per ciascuna etichetta, senza tener conto dello squilibrio dell'etichetta.
+
+🎓 Media ponderata: il calcolo delle metriche medie per ogni etichetta, tenendo conto dello squilibrio dell'etichetta pesandole in base al loro supporto (il numero di istanze vere per ciascuna etichetta).
+
+✅ Si riesce a pensare a quale metrica si dovrebbe guardare se si vuole che il modello riduca il numero di falsi negativi?
+
+## Visualizzare la curva ROC di questo modello
+
+Questo non è un cattivo modello; la sua precisione è nell'intervallo dell'80%, quindi idealmente si potrebbe usare per prevedere il colore di una zucca dato un insieme di variabili.
+
+Si rende un'altra visualizzazione per vedere il cosiddetto punteggio 'ROC':
+
+```python
+from sklearn.metrics import roc_curve, roc_auc_score
+
+y_scores = model.predict_proba(X_test)
+# calculate ROC curve
+fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
+sns.lineplot([0, 1], [0, 1])
+sns.lineplot(fpr, tpr)
+```
+Usando di nuovo Seaborn, si traccia la [Caratteristica Operativa del Ricevitore (Receiver Operating Characteristic)](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html?highlight=roc), o ROC, del modello. Le curve ROC vengono spesso utilizzate per ottenere una visualizzazione dell'output di un classificatore in termini di veri e falsi positivi. "Le curve ROC in genere presentano un tasso di veri positivi sull'asse Y e un tasso di falsi positivi sull'asse X". Pertanto, la ripidità della curva e lo spazio tra la linea del punto medio e la curva contano: si vuole una curva che salga rapidamente e superi la linea. In questo caso, ci sono alcuni falsi positivi all'inizio, ma poi la linea sale e supera correttamente la linea mediana:
+
+![ROC](../images/ROC.png)
+
+Infine, si usa l'[`API roc_auc_score`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html?highlight=roc_auc#sklearn.metrics.roc_auc_score) di Scikit-learn per calcolare l'effettiva "Area sotto la curva" (AUC):
+
+```python
+auc = roc_auc_score(y_test,y_scores[:,1])
+print(auc)
+```
+Il risultato è `0.6976998904709748`. Dato che l'AUC varia da 0 a 1, si desidera un punteggio elevato, poiché un modello corretto al 100% nelle sue previsioni avrà un AUC di 1; in questo caso, il modello è _abbastanza buono_.
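+
+Come verifica facoltativa (sketch illustrativo, non parte della lezione originale), un valore molto simile si dovrebbe ottenere integrando i vettori `fpr` e `tpr` calcolati sopra con `sklearn.metrics.auc`:
+
+```python
+from sklearn.metrics import auc as area_under_curve
+
+# Sketch: area sotto la curva ROC calcolata dai vettori fpr/tpr tracciati sopra
+print(area_under_curve(fpr, tpr))  # dovrebbe coincidere (circa) con roc_auc_score
+```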
+
+Nelle lezioni future sulle classificazioni si imparerà come eseguire l'iterazione per migliorare i punteggi del modello. Ma per ora, congratulazioni! Si sono completate queste lezioni di regressione!
+
+---
+## 🚀 Sfida
+
+C'è molto altro da svelare riguardo alla regressione logistica! Ma il modo migliore per imparare è sperimentare. Trovare un insieme di dati che si presti a questo tipo di analisi e costruire un modello con esso. Cosa si è appreso? Suggerimento: provare [Kaggle](https://kaggle.com) per ottenere insiemi di dati interessanti.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/16/)
+
+## Revisione e Auto Apprendimento
+
+Leggere le prime pagine di [questo articolo da Stanford](https://web.stanford.edu/~jurafsky/slp3/5.pdf) su alcuni usi pratici della regressione logistica. Si pensi alle attività più adatte per l'uno o l'altro tipo di attività di regressione studiate fino a questo punto. Cosa funzionerebbe meglio?
+
+## Compito
+
+[Ritentare questa regressione](assignment.it.md)
diff --git a/2-Regression/4-Logistic/translations/README.ja.md b/2-Regression/4-Logistic/translations/README.ja.md
new file mode 100644
index 0000000000..662a1eafba
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/README.ja.md
@@ -0,0 +1,310 @@
+# カテゴリ予測のためのロジスティック回帰
+
+![ロジスティク回帰 vs 線形回帰のインフォグラフィック](../images/logistic-linear.png)
+> [Dasani Madipalli](https://twitter.com/dasani_decoded) によるインフォグラフィック
+## [講義前のクイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)
+
+## イントロダクション
+
+回帰の最後のレッスンでは、古典的な機械学習手法の一つである、「ロジスティック回帰」を見ていきます。この手法は、2値のカテゴリを予測するためのパターンを発見するために使います。例えば、「このお菓子は、チョコレートかどうか?」、「この病気は伝染するかどうか?」、「この顧客は、この商品を選ぶかどうか?」などです。
+
+このレッスンでは以下の内容を扱います。
+
+- データを可視化するための新しいライブラリ
+- ロジスティック回帰について
+
+✅ この[モジュール](https://docs.microsoft.com/learn/modules/train-evaluate-classification-models?WT.mc_id=academic-15963-cxa) では、今回のタイプのような回帰について理解を深めることができます。
+
+## 前提条件
+
+カボチャのデータを触ったことで、データの扱いにかなり慣れてきました。その際にバイナリカテゴリが一つあることに気づきました。「`Color`」です。
+
+いくつかの変数が与えられたときに、あるカボチャがどのような色になる可能性が高いか (オレンジ🎃または白👻)を予測するロジスティック回帰モデルを構築してみましょう。
+
+> なぜ、回帰についてのレッスンで二値分類の話をしているのでしょうか?ロジスティック回帰は、線形ベースのものではありますが、[実際には分類法](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) であるため、言語的な便宜上です。次のレッスングループでは、データを分類する他の方法について学びます。
+
+## 質問の定義
+
+ここでは、「Orange」か「Not Orange」かの二値で表現しています。データセットには「striped」というカテゴリーもありますが、ほとんど例がないので、ここでは使いません。データセットからnull値を削除すると、このカテゴリーは消えてしまいます。
+
+> 🎃 面白いことに、白いカボチャを「お化けカボチャ」と呼ぶことがあります。彫るのが簡単ではないので、オレンジ色のカボチャほど人気はありませんが、見た目がクールですよね!
+
+## ロジスティック回帰について
+
+ロジスティック回帰は、前回学んだ線形回帰とは、いくつかの重要な点で異なります。
+
+### 2値分類
+
+ロジスティック回帰は、線形回帰とは異なる特徴を持っています。ロジスティック回帰は、二値のカテゴリー(「オレンジ色かオレンジ色でないか」)についての予測を行うのに対し、線形回帰は連続的な値を予測します。例えば、カボチャの産地と収穫時期が与えられれば、その価格がどれだけ上昇するかを予測することができます。
+
+![カボチャ分類モデル](../images/pumpkin-classifier.png)
+> [Dasani Madipalli](https://twitter.com/dasani_decoded) によるインフォグラフィック
+### その他の分類
+
+ロジスティック回帰には他にもMultinomialやOrdinalなどの種類があります。
+
+- **Multinomial**: これは2つ以上のカテゴリーを持つ場合です。 (オレンジ、白、ストライプ)
+- **Ordinal**: これは、順序付けられたカテゴリを含むもので、有限の数のサイズ(mini、sm、med、lg、xl、xxl)で並べられたカボチャのように、結果を論理的に並べたい場合に便利です。
+
+![Multinomial vs ordinal 回帰](../images/multinomial-ordinal.png)
+> [Dasani Madipalli](https://twitter.com/dasani_decoded) によるインフォグラフィック
+
+### 線形について
+
+このタイプの回帰は、「カテゴリーの予測」が目的ですが、従属変数(色)と他の独立変数(都市名やサイズなどのデータセットの残りの部分)の間に明確な線形関係がある場合に最も効果的です。これらの変数を分ける線形性があるかどうかを把握するのは良いことです。
+
+### 変数が相関している必要はない
+
+線形回帰は、相関性の高い変数ほどよく働くことを覚えていますか?ロジスティック回帰は、そうとは限りません。相関関係がやや弱いこのデータには有効ですね。
+
+### 大量のきれいなデータが必要です
+
+一般的にロジスティック回帰は、より多くのデータを使用すれば、より正確な結果が得られます。私たちの小さなデータセットは、このタスクには最適ではありませんので、その点に注意してください。
+
+✅ ロジスティック回帰に適したデータの種類を考えてみてください。
+
+## エクササイズ - データの整形
+
+まず、NULL値を削除したり、一部の列だけを選択したりして、データを少し綺麗にします。
+
+1. 以下のコードを追加:
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+
+ new_columns = ['Color','Origin','Item Size','Variety','City Name','Package']
+
+ new_pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+
+ new_pumpkins.dropna(inplace=True)
+
+ new_pumpkins = new_pumpkins.apply(LabelEncoder().fit_transform)
+ ```
+
+ 新しいデータフレームはいつでも確認することができます。
+
+ ```python
+ new_pumpkins.info
+ ```
+
+### 可視化 - グリッド状に並べる
+
+ここまでで、[スターターノートブック](../notebook.ipynb) にパンプキンデータを再度読み込み、`Color`を含むいくつかの変数を含むデータセットを保持するように整形しました。今回は別のライブラリ、[Seaborn](https://seaborn.pydata.org/index.html) を使って、ノートブック内のデータフレームを可視化してみましょう。このライブラリは、今まで使っていた`Matplotlib`をベースにしています。
+
+Seabornには、データを可視化するためのいくつかの優れた方法があります。例えば、各データの分布を横並びのグリッドで比較することができます。
+
+1. かぼちゃのデータ`new_pumpkins`を使って、`PairGrid`をインスタンス化し、`map()`メソッドを呼び出して、以下のようなグリッドを作成します。
+
+ ```python
+ import seaborn as sns
+
+ g = sns.PairGrid(new_pumpkins)
+ g.map(sns.scatterplot)
+ ```
+
+ ![グリッド状の可視化](../images/grid.png)
+
+ データを並べて観察することで、Colorのデータが他の列とどのように関連しているのかを知ることができます。
+
+ ✅ この散布図をもとに、どのような面白い試みが考えられるでしょうか?
+
+### swarm plot
+
+Colorは2つのカテゴリー(Orange or Not)であるため、「カテゴリカルデータ」と呼ばれ、「可視化にはより[専門的なアプローチ](https://seaborn.pydata.org/tutorial/categorical.html?highlight=bar) 」が必要となります。このカテゴリと他の変数との関係を可視化する方法は他にもあります。
+
+Seabornプロットでは、変数を並べて表示することができます。
+
+1. 値の分布を示す、'swarm' plotを試してみます。
+
+ ```python
+ sns.swarmplot(x="Color", y="Item Size", data=new_pumpkins)
+ ```
+
+ ![swarm plotによる可視化](../images/swarm.png)
+
+### Violin plot
+
+'violin' タイプのプロットは、2つのカテゴリーのデータがどのように分布しているかを簡単に視覚化できるので便利です。Violin plotは、分布がより「滑らか」に表示されるため、データセットが小さい場合はあまりうまくいきません。
+
+1. パラメータとして`x=Color`、`kind="violin"` をセットし、 `catplot()`メソッドを呼びます。
+
+ ```python
+ sns.catplot(x="Color", y="Item Size",
+ kind="violin", data=new_pumpkins)
+ ```
+
+ ![バイオリンタイプのチャート](../images/violin.png)
+
+ ✅ 他の変数を使って、このプロットや他のSeabornのプロットを作成してみてください。
+
+さて、`Color`の二値カテゴリと、より大きなサイズのグループとの関係がわかったところで、ロジスティック回帰を使って、あるカボチャの色について調べてみましょう。
+
+> **🧮 数学の確認**
+>
+> 線形回帰では、通常の最小二乗法を用いて値を求めることが多かったことを覚えていますか?ロジスティック回帰は、[シグモイド関数](https://wikipedia.org/wiki/Sigmoid_function) を使った「最尤」の概念に依存しています。シグモイド関数は、プロット上では「S」字のように見えます。その曲線は「ロジスティック曲線」とも呼ばれます。数式は次のようになります。
+>
+> ![ロジスティック関数](../images/sigmoid.png)
+>
+> ここで、シグモイドの中点はx=0の点、Lは曲線の最大値、kは曲線の急峻さを表します。この関数の結果が0.5以上であれば、そのラベルは二値選択のクラス「1」になります。そうでない場合は、「0」に分類されます。
+
+## モデルの構築
+
+これらの二値分類を行うためのモデルの構築は、Scikit-learnでは驚くほど簡単にできます。
+
+1. 分類モデルで使用したい変数を選択し、`train_test_split()`メソッドでトレーニングセットとテストセットを分割します。
+
+ ```python
+ from sklearn.model_selection import train_test_split
+
+ Selected_features = ['Origin','Item Size','Variety','City Name','Package']
+
+ X = new_pumpkins[Selected_features]
+ y = new_pumpkins['Color']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ ```
+
+2. これで、学習データを使って`fit()`メソッドを呼び出し、モデルを訓練し、その結果を出力することができます。
+
+ ```python
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.linear_model import LogisticRegression
+
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+ predictions = model.predict(X_test)
+
+ print(classification_report(y_test, predictions))
+ print('Predicted labels: ', predictions)
+ print('Accuracy: ', accuracy_score(y_test, predictions))
+ ```
+
+ モデルのスコアボードを見てみましょう。1000行程度のデータしかないことを考えると、悪くないと思います。
+
+ ```output
+ precision recall f1-score support
+
+ 0 0.85 0.95 0.90 166
+ 1 0.38 0.15 0.22 33
+
+ accuracy 0.82 199
+ macro avg 0.62 0.55 0.56 199
+ weighted avg 0.77 0.82 0.78 199
+
+ Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+ 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 0 0 0 1 0 1 0 0 1 0 0 0 1 0]
+ ```
+
+## 混同行列による理解度の向上
+
+
+上記の項目を出力することで[スコアボードレポート](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html?highlight=classification_report#sklearn.metrics.classification_report) を得ることができますが、[混同行列](https://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix) を使うことで、より簡単にモデルを理解することができるかもしれません。
+
+
+> 🎓 [混同行列](https://wikipedia.org/wiki/Confusion_matrix) とは、モデルの真陽性・真陰性と偽陽性・偽陰性を対比して表す表で、予測の正確さを測ることができます。
+
+1. `confusion_matrix()`メソッドを呼んで、混同行列を作成します。
+
+ ```python
+ from sklearn.metrics import confusion_matrix
+ confusion_matrix(y_test, predictions)
+ ```
+
+    作成したモデルの混同行列をみてみてください。
+
+ ```output
+ array([[162, 4],
+ [ 33, 0]])
+ ```
+
+Scikit-learnでは、混同行列の行 (axis=0)が実際のラベル、列 (axis=1)が予測ラベルとなります。
+
+| | 0 | 1 |
+| :---: | :---: | :---: |
+| 0 | TN | FP |
+| 1 | FN | TP |
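+
+補足のスケッチです(元のレッスンには含まれていません)。`confusion_matrix()` の結果を `ravel()` で展開すると、上の表の各セルを個別の変数として取り出せます:
+
+```python
+# スケッチ: 混同行列の各セル (tn, fp, fn, tp) を取り出す
+from sklearn.metrics import confusion_matrix
+
+tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
+print('TN:', tn, 'FP:', fp, 'FN:', fn, 'TP:', tp)
+```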
+
+ここで何が起こっているのか?例えば、カボチャを「オレンジ色」と「オレンジ色でない」という2つのカテゴリーに分類するように求められたとしましょう。
+
+- モデルではオレンジ色ではないと予測されたカボチャが、実際には「オレンジ色ではない」というカテゴリーに属していた場合、「true negative」と呼ばれ、左上の数字で示されます。
+- モデルではオレンジ色と予測されたカボチャが、実際には「オレンジ色ではない」カテゴリーに属していた場合、「false positive」と呼ばれ、右上の数字で示されます。
+- モデルがオレンジではないと予測したかぼちゃが、実際にはカテゴリー「オレンジ」に属していた場合、「false negative」と呼ばれ、左下の数字で示されます。
+- モデルがカボチャをオレンジ色と予測し、それが実際にカテゴリ「オレンジ」に属する場合、「true positive」と呼ばれ、右下の数字で示されます。
+
+お気づきの通り、true positiveとtrue negativeの数が多く、false positiveとfalse negativeの数が少ないことが好ましく、これはモデルの性能が高いことを意味します。
+
+混同行列は、precisionとrecallにどのように関係するのでしょうか?上記の分類レポートでは、precision(0.83)とrecall(0.98)が示されています。
+
+Precision = tp / (tp + fp) = 162 / (162 + 33) = 0.8307692307692308
+
+Recall = tp / (tp + fn) = 162 / (162 + 4) = 0.9759036144578314
+
+✅ Q: 混同行列によると、モデルの出来はどうでしたか? A: 悪くありません。true negativeがかなりの数ありますが、false negativeもいくつかあります。
+
+先ほどの用語を、混同行列のTP/TNとFP/FNのマッピングを参考にして再確認してみましょう。
+
+🎓 Precision: TP/(TP + FP) 探索されたインスタンスのうち、関連性のあるインスタンスの割合(どのラベルがよくラベル付けされていたかなど)。
+
+🎓 Recall: TP/(TP + FN) ラベリングされているかどうかに関わらず、探索された関連インスタンスの割合です。
+
+🎓 f1-score: (2 * precision * recall)/(precision + recall) precisionとrecallの加重平均で、最高が1、最低が0となる。
+
+🎓 Support: 取得した各ラベルの出現回数です。
+
+🎓 Accuracy: (TP + TN)/(TP + TN + FP + FN) サンプルに対して正確に予測されたラベルの割合です。
+
+🎓 Macro Avg: 各ラベルの非加重平均指標の計算で、ラベルの不均衡を考慮せずに算出される。
+
+🎓 Weighted Avg: 各ラベルのサポート数(各ラベルの真のインスタンス数)で重み付けすることにより、ラベルの不均衡を考慮して、各ラベルの平均指標を算出する。
+
+✅ 自分のモデルでfalse negativeの数を減らしたい場合、どの指標に注目すべきか考えられますか?
+
+## モデルのROC曲線を可視化する
+
+これは悪いモデルではありません。精度は80%の範囲で、理想的には、一連の変数が与えられたときにカボチャの色を予測するのに使うことができます。
+
+いわゆる「ROC」スコアを見るために、もう一つの可視化を行ってみましょう。
+
+```python
+from sklearn.metrics import roc_curve, roc_auc_score
+
+y_scores = model.predict_proba(X_test)
+# calculate ROC curve
+fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
+sns.lineplot([0, 1], [0, 1])
+sns.lineplot(fpr, tpr)
+```
+Seaborn を再度使用して、モデルの [受信者操作特性 (Receiver Operating Characteristic)](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html?highlight=roc)、つまりROC曲線をプロットします。ROC曲線は、分類器の出力を、true positiveとfalse positiveの観点から見るためによく使われます。ROC曲線は通常、true positive rateをY軸に、false positive rateをX軸にとっています。したがって、曲線の急峻さと、真ん中の対角線と曲線の間のスペースが重要で、すぐに立ち上がって対角線を超えていくような曲線が望ましいです。今回のケースでは、最初にfalse positiveが出ますが、その後、ラインはきちんと上に向かい、対角線を超えていきます。
+
+![ROC](../images/ROC.png)
+
+最後に、Scikit-learnの[`roc_auc_score` API](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html?highlight=roc_auc#sklearn.metrics.roc_auc_score) を使って、実際の「Area Under the Curve」(AUC)を計算します。
+
+```python
+auc = roc_auc_score(y_test,y_scores[:,1])
+print(auc)
+```
+結果は`0.6976998904709748`となりました。AUCの範囲が0から1であることを考えると、大きなスコアが欲しいところです。なぜなら、予測が100%正しいモデルはAUCが1になるからです。今回の場合、このモデルは「まあまあ良い」と言えます。
+
+今後の分類のレッスンでは、モデルのスコアを向上させるための反復処理の方法を学びます。ですがひとまず、おめでとうございます!これで回帰のレッスンはすべて完了です!
+
+---
+## 🚀チャレンジ
+
+ロジスティック回帰については、まだまだ解き明かすべきことがたくさんあります。しかし、学ぶための最良の方法は、実験することです。この種の分析に適したデータセットを見つけて、それを使ってモデルを構築してみましょう。ヒント:面白いデータセットを探すために[Kaggle](https://www.kaggle.com/search?q=logistic+regression+datasets) を試してみてください。
+
+## [講義後クイズ](https://white-water-09ec41f0f.azurestaticapps.net/quiz/16/)
+
+## レビュー & 自主学習
+
+ロジスティック回帰の実用的な使い方について、[Stanfordからのこの論文](https://web.stanford.edu/~jurafsky/slp3/5.pdf) の最初の数ページを読んでみてください。これまで学んできた回帰タスクのうち、どちらか一方のタイプに適したタスクについて考えてみてください。何が一番うまくいくでしょうか?
+
+## 課題
+
+[回帰に再挑戦する](./assignment.ja.md)
diff --git a/2-Regression/4-Logistic/translations/README.zh-cn.md b/2-Regression/4-Logistic/translations/README.zh-cn.md
new file mode 100644
index 0000000000..dc1bb5e10b
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/README.zh-cn.md
@@ -0,0 +1,293 @@
+# 逻辑回归预测分类
+
+![逻辑与线性回归信息图](../images/logistic-linear.png)
+> 作者[Dasani Madipalli](https://twitter.com/dasani_decoded)
+## [课前测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/15/)
+
+## 介绍
+
+在关于回归的最后一课中,我们将学习逻辑回归,这是经典的基本技术之一。你可以使用此技术来发现预测二元分类的模式。这是不是巧克力糖?这种病会传染吗?这个顾客会选择这个产品吗?
+
+在本课中,你将学习:
+
+- 用于数据可视化的新库
+- 逻辑回归技术
+
+✅ 在此[学习模块](https://docs.microsoft.com/learn/modules/train-evaluate-classification-models?WT.mc_id=academic-15963-cxa) 中加深你对使用此类回归的理解
+
+## 前提
+
+使用南瓜数据后,我们现在对它已经足够熟悉了,可以意识到我们可以使用一个二元类别:`Color`。
+
+让我们建立一个逻辑回归模型来预测,给定一些变量,_给定的南瓜可能是什么颜色_(橙色🎃或白色👻)。
+
+> 为什么我们在关于回归的课程分组中谈论二元分类?只是为了语言上的方便,因为逻辑回归[真的是一种分类方法](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression),尽管是基于线性的。我们将在下一课组中了解对数据进行分类的其他方法。
+
+## 定义问题
+
+出于我们的目的,我们将其表示为二元类别:"橙色"或"非橙色"。我们的数据集中还有一个"条纹"类别,但它的实例很少,所以我们不会使用它。无论如何,一旦我们从数据集中删除空值,它就会消失。
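+
+作为补充示意(不属于原课程内容,并假设上一课的 `pumpkins` 数据帧已经加载),可以用 `value_counts()` 看一下各个颜色出现的次数:
+
+```python
+# 示意:统计清理前 Color 列中各个类别出现的次数
+print(pumpkins['Color'].value_counts(dropna=False))
+```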
+
+> 🎃 有趣的是,我们有时称白南瓜为鬼南瓜。它们不是很容易雕刻,所以它们不像橙色的那么受欢迎,但它们看起来很酷!
+
+## 关于逻辑回归
+
+逻辑回归在一些重要方面与你之前了解的线性回归不同。
+
+### 二元分类
+
+逻辑回归不提供与线性回归相同的功能。前者提供关于二元类别(“橙色或非橙色”)的预测,而后者能够预测连续值,例如,给定南瓜的起源和收获时间,_其价格将上涨多少_。
+
+![南瓜分类模型](../images/pumpkin-classifier.png)
+> 作者[Dasani Madipalli](https://twitter.com/dasani_decoded)
+### 其他分类
+
+还有其他类型的逻辑回归,包括多项和有序:
+
+- **多项**,涉及多个类别 - “橙色、白色和条纹”。
+- **有序**,涉及有序类别,如果我们想对我们的结果进行逻辑排序非常有用,例如我们的南瓜按有限数量的大小(mini、sm、med、lg、xl、xxl)排序。
+
+![多项式与有序回归](../images/multinomial-ordinal.png)
+> 作者[Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+### 仍然是线性的
+
+尽管这种类型的回归都是关于“类别预测”的,但当因变量(颜色)和其他自变量(数据集的其余部分,如城市名称和大小)之间存在明显的线性关系时,它仍然效果最好。最好了解一下这些变量是否存在线性划分。
+
+### 变量不必相关
+
+还记得线性回归如何更好地处理更多相关变量吗?逻辑回归是相反的——变量不必对齐。这适用于相关性较弱的数据。
+
+### 你需要大量干净的数据
+
+如果使用更多数据,逻辑回归将给出更准确的结果;我们的小数据集对于这项任务不是最佳的,请记住这一点。
+
+✅ 考虑适合逻辑回归的数据类型
+
+## 练习 - 整理数据
+
+首先,稍微清理一下数据,删除空值并仅选择其中一些列:
+
+1. 添加以下代码:
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+
+ new_columns = ['Color','Origin','Item Size','Variety','City Name','Package']
+
+ new_pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)
+
+ new_pumpkins.dropna(inplace=True)
+
+ new_pumpkins = new_pumpkins.apply(LabelEncoder().fit_transform)
+ ```
+
+ 你可以随时查看新的数据帧:
+
+ ```python
+ new_pumpkins.info
+ ```
+
+### 可视化 - 并列网格
+
+到现在为止,你已经再次使用南瓜数据加载了[starter notebook](../notebook.ipynb)并对其进行了清理,以保留包含一些变量(包括`Color`)的数据集。让我们使用不同的库来可视化notebook中的数据帧:[Seaborn](https://seaborn.pydata.org/index.html),它是基于我们之前使用的Matplotlib构建的。
+
+Seaborn提供了一些巧妙的方法来可视化你的数据。例如,你可以比较并列网格中每个点的数据分布。
+
+1. 通过实例化一个`PairGrid`,使用我们的南瓜数据`new_pumpkins`,然后调用`map()`来创建这样一个网格:
+
+ ```python
+ import seaborn as sns
+
+ g = sns.PairGrid(new_pumpkins)
+ g.map(sns.scatterplot)
+ ```
+
+ ![可视化数据网格](../images/grid.png)
+
+ 通过并列观察数据,你可以看到颜色数据与其他列的关系。
+
+ ✅ 鉴于此散点图网格,你可以设想哪些有趣的探索?
+
+### 使用分类散点图
+
+由于颜色是一个二元类别(橙色或非橙色),它被称为“分类数据”,需要一种更[专业的方法](https://seaborn.pydata.org/tutorial/categorical.html?highlight=bar)来可视化。还有其他方法可以可视化此类别与其他变量的关系。
+
+你可以使用Seaborn图并列可视化变量。
+
+1. 尝试使用“分类散点”图来显示值的分布:
+
+ ```python
+ sns.swarmplot(x="Color", y="Item Size", data=new_pumpkins)
+ ```
+
+ ![分类散点图可视化数据](../images/swarm.png)
+
+### 小提琴图
+
+“小提琴”类型的图很有用,因为你可以轻松地可视化两个类别中数据的分布方式。小提琴图不适用于较小的数据集,因为分布显示得更“平滑”。
+
+1. 作为参数`x=Color`、`kind="violin"`并调用`catplot()`:
+
+ ```python
+ sns.catplot(x="Color", y="Item Size",
+ kind="violin", data=new_pumpkins)
+ ```
+
+ ![小提琴图](../images/violin.png)
+
+ ✅ 尝试使用其他变量创建此图和其他Seaborn图。
+
+现在我们已经了解了颜色的二元类别与更大的尺寸组之间的关系,让我们探索逻辑回归来确定给定南瓜的可能颜色。
+
+> **🧮 数学知识**
+>
+> 还记得线性回归如何经常使用普通最小二乘法来得出一个值吗?逻辑回归依赖于使用[sigmoid 函数](https://wikipedia.org/wiki/Sigmoid_function) 的“最大似然”概念。绘图上的“Sigmoid 函数”看起来像“S”形。它接受一个值并将其映射到0和1之间的某个位置。它的曲线也称为“逻辑曲线”。它的公式如下所示:
+>
+> ![逻辑函数](../images/sigmoid.png)
+>
+> 其中sigmoid的中点位于x的0点,L是曲线的最大值,k是曲线的陡度。如果函数的结果大于0.5,则所讨论的标签将被赋予二进制选择的类“1”。否则,它将被分类为“0”。
+
+## 建立你的模型
+
+在Scikit-learn中构建模型来查找这些二元分类非常简单。
+
+1. 选择要在分类模型中使用的变量,并调用`train_test_split()`拆分训练集和测试集:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+
+ Selected_features = ['Origin','Item Size','Variety','City Name','Package']
+
+ X = new_pumpkins[Selected_features]
+ y = new_pumpkins['Color']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+
+ ```
+
+2. 现在你可以训练你的模型,用你的训练数据调用`fit()`,并打印出它的结果:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.linear_model import LogisticRegression
+
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+ predictions = model.predict(X_test)
+
+ print(classification_report(y_test, predictions))
+ print('Predicted labels: ', predictions)
+ print('Accuracy: ', accuracy_score(y_test, predictions))
+ ```
+
+ 看看你的模型的记分板。考虑到你只有大约1000行数据,这还不错:
+
+ ```output
+ precision recall f1-score support
+
+ 0 0.85 0.95 0.90 166
+ 1 0.38 0.15 0.22 33
+
+ accuracy 0.82 199
+ macro avg 0.62 0.55 0.56 199
+ weighted avg 0.77 0.82 0.78 199
+
+ Predicted labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+ 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
+ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ 0 0 0 1 0 1 0 0 1 0 0 0 1 0]
+ ```
+
+## 通过混淆矩阵更好地理解
+
+虽然你可以通过打印上面的项目来获得记分板报告的各项[条目](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html?highlight=classification_report#sklearn.metrics.classification_report),但借助[混淆矩阵](https://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix)可能更容易理解你的模型,它可以帮助我们了解模型的性能。
+
+> 🎓 “[混淆矩阵](https://wikipedia.org/wiki/Confusion_matrix)”(或“误差矩阵”)是一个表格,用于表示模型的真假阳性和假阴性,从而衡量预测的准确性。
+
+1. 要使用混淆矩阵,请调用 `confusion_matrix()`:
+
+ ```python
+ from sklearn.metrics import confusion_matrix
+ confusion_matrix(y_test, predictions)
+ ```
+
+ 看看你的模型的混淆矩阵:
+
+ ```output
+ array([[162, 4],
+ [ 33, 0]])
+ ```
+
+这里发生了什么?假设我们的模型被要求对两个二元类别之间的项目进行分类,即类别“南瓜”和类别“非南瓜”。
+
+- 如果你的模型将某物预测为南瓜并且它实际上属于“南瓜”类别,我们将其称为真阳性,由左上角的数字显示。
+- 如果你的模型预测某物不是南瓜,并且它实际上属于"南瓜"类别,我们将其称为假阴性,如右上角的数字所示。
+- 如果你的模型将某物预测为南瓜并且它实际上属于"非南瓜"类别,我们将其称为假阳性,由左下角的数字显示。
+- 如果你的模型预测某物不是南瓜,并且它实际上属于“非南瓜”类别,我们将其称为真阴性,如右下角的数字所示。
+
+正如你可能已经猜到的那样,最好有更多的真阳性和真阴性以及较少的假阳性和假阴性,这意味着模型性能更好。
+
+✅ Q:根据混淆矩阵,模型怎么样? A:还不错;有很多真阳性,但也有一些假阴性。
+
+让我们借助混淆矩阵对TP/TN和FP/FN的映射,重新审视一下我们之前看到的术语:
+
+🎓 准确率:TP/(TP + FP) 检索实例中相关实例的分数(例如,哪些标签标记得很好)
+
+🎓 召回率: TP/(TP + FN) 检索到的相关实例的比例,无论是否标记良好
+
+🎓 F1分数: (2 * 准确率 * 召回率)/(准确率 + 召回率) 准确率和召回率的加权平均值,最好为1,最差为0
+
+🎓 Support:检索到的每个标签的出现次数
+
+🎓 准确度:(TP + TN)/(TP + TN + FP + FN) 为样本准确预测的标签百分比。
+
+🎓 宏平均值: 计算每个标签的未加权平均指标,不考虑标签不平衡。
+
+🎓 加权平均值:计算每个标签的平均指标,通过按支持度(每个标签的真实实例数)加权来考虑标签不平衡。
+
+✅ 如果你想让你的模型减少假阴性的数量,你能想出应该关注哪个指标吗?
+## 可视化该模型的ROC曲线
+
+这不是一个糟糕的模型;它的准确率在80%范围内,因此理想情况下,你可以使用它来预测给定一组变量的南瓜颜色。
+
+让我们再做一个可视化来查看所谓的“ROC”分数
+
+```python
+from sklearn.metrics import roc_curve, roc_auc_score
+
+y_scores = model.predict_proba(X_test)
+# calculate ROC curve
+fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
+sns.lineplot([0, 1], [0, 1])
+sns.lineplot(fpr, tpr)
+```
+再次使用Seaborn,绘制模型的[接收者操作特征](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html?highlight=roc)或ROC曲线。ROC曲线通常用于根据分类器的真假阳性来了解分类器的输出。"ROC曲线通常具有Y轴上的真阳性率和X轴上的假阳性率。" 因此,曲线的陡度以及中点线与曲线之间的空间很重要:你需要一条快速向上并越过中线的曲线。在我们的例子中,一开始有一些误报,然后这条线正确地向上并越过了中线:
+
+![ROC](../images/ROC.png)
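+
+顺便补充一个小示意(不属于原课程内容):`predict_proba` 返回每个样本属于各个类别的概率,上面的 `y_scores[:,1]` 取的是第二列,也就是类别 1 的概率:
+
+```python
+# 示意:查看前几行的预测概率,每行两列相加应为 1
+print(y_scores[:3])
+print(y_scores[:3].sum(axis=1))
+```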
+
+最后,使用Scikit-learn的[`roc_auc_score` API](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html?highlight=roc_auc#sklearn.metrics.roc_auc_score)来计算实际“曲线下面积”(AUC):
+
+```python
+auc = roc_auc_score(y_test,y_scores[:,1])
+print(auc)
+```
+结果是`0.6976998904709748`。 鉴于AUC的范围从0到1,你需要一个高分,因为预测100%正确的模型的AUC为1;在这种情况下,模型_相当不错_。
+
+在以后的分类课程中,你将学习如何迭代以提高模型的分数。但是现在,恭喜!你已经完成了这些回归课程!
+
+---
+## 🚀挑战
+
+关于逻辑回归,还有很多东西需要解开!但最好的学习方法是实验。找到适合此类分析的数据集并用它构建模型。你学到了什么?小贴士:尝试[Kaggle](https://kaggle.com)获取有趣的数据集。
+
+## [课后测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/16/)
+
+## 复习与自学
+
+阅读[斯坦福大学的这篇论文](https://web.stanford.edu/~jurafsky/slp3/5.pdf)的前几页关于逻辑回归的一些实际应用。想想那些更适合于我们目前所研究的一种或另一种类型的回归任务的任务。什么最有效?
+
+## 任务
+
+[重试此回归](../assignment.md)
diff --git a/2-Regression/4-Logistic/translations/assignment.it.md b/2-Regression/4-Logistic/translations/assignment.it.md
new file mode 100644
index 0000000000..7b9b201619
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/assignment.it.md
@@ -0,0 +1,10 @@
+# Riprovare un po' di Regressione
+
+## Istruzioni
+
+Nella lezione è stato usato un sottoinsieme dei dati della zucca. Ora si torna ai dati originali e si prova a usarli tutti, puliti e standardizzati, per costruire un modello di regressione logistica.
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ----------------------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------------------------------- |
+| | Un notebook viene presentato con un modello ben spiegato con buone prestazioni | Un notebook viene presentato con un modello dalle prestazioni minime | Un notebook viene presentato con un modello con scarse o nessuna prestazione |
diff --git a/2-Regression/4-Logistic/translations/assignment.ja.md b/2-Regression/4-Logistic/translations/assignment.ja.md
new file mode 100644
index 0000000000..6c838173bb
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/assignment.ja.md
@@ -0,0 +1,11 @@
+# 回帰に再挑戦する
+
+## 課題の指示
+
+レッスンでは、カボチャのデータのサブセットを使用しました。今度は、元のデータに戻って、ロジスティック回帰モデルを構築するために、整形して標準化したデータをすべて使ってみましょう。
+
+## ルーブリック
+
+| 指標 | 模範的 | 適切 | 要改善 |
+| -------- | ----------------------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------------------------------- |
+| | 説明がわかりやすく、性能の良いモデルが含まれたノートブック| 最小限の性能しか発揮できないモデルが含まれたノートブック | 性能の劣るモデルや、何もないモデルが含まれたノートブック |
diff --git a/2-Regression/4-Logistic/translations/assignment.zh-cn.md b/2-Regression/4-Logistic/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..8dc55af3d3
--- /dev/null
+++ b/2-Regression/4-Logistic/translations/assignment.zh-cn.md
@@ -0,0 +1,11 @@
+# 再探回归模型
+
+## 说明
+
+在这节课中,你使用了 pumpkin 数据集的子集。现在,让我们回到原始数据,并尝试使用所有数据。经过了数据清理和标准化,建立一个逻辑回归模型。
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | ----------------------------------------------------------------------- | ------------------------------------------------------------ | ----------------------------------------------------------- |
+| | 用notebook呈现了一个解释性和性能良好的模型 | 用notebook呈现了一个性能一般的模型 | 用notebook呈现了一个性能差的模型或根本没有模型 |
diff --git a/2-Regression/translations/README.es.md b/2-Regression/translations/README.es.md
new file mode 100644
index 0000000000..280790fc7f
--- /dev/null
+++ b/2-Regression/translations/README.es.md
@@ -0,0 +1,33 @@
+# Modelos de regresión para el machine learning
+## Tema regional: Modelos de regresión para los precios de las calabazas en América del Norte 🎃
+
+En América del Norte, las calabazas se tallan a menudo con caras aterradoras para Halloween. ¡Descubramos más sobre estas fascinantes verduras!
+
+![jack-o-lanterns](../images/jack-o-lanterns.jpg)
+> Foto de Beth Teutschmann en Unsplash
+
+## Lo que vas a aprender
+
+Las lecciones de esta sección cubren los tipos de regresión en el contexto de machine learning. Los modelos de regresión pueden ayudar a determinar la _relación_ entre variables. Este tipo de modelos puede predecir valores como la longitud, la temperatura o la edad, descubriendo así relaciones entre variables a medida que analiza puntos de datos.
+
+En esta serie de lecciones, descubrirá la diferencia entre la regresión lineal y la logística, y cuándo debe usar una u otra.
+
+En este grupo de lecciones, se preparará para comenzar las tareas de machine learning, incluida la configuración de Visual Studio Code para manejar los cuadernos, el entorno común para los científicos de datos. Descubrirá Scikit-learn, una librería para machine learning, y creará sus primeros modelos, centrándose en los modelos de Regresión en este capítulo.
+
+> Existen herramientas útiles _low-code_ que pueden ayudarlo a aprender a trabajar con modelos de regresión. Pruebe [Azure ML para esta tarea](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+### Lecciones
+
+1. [Herramientas del oficio](1-Tools/README.md)
+2. [Gestión de datos](2-Data/README.md)
+3. [Regresión lineal y polinomial](3-Linear/README.md)
+4. [Regresión logística](4-Logistic/README.md)
+
+---
+### Créditos
+
+"ML con regresión" fue escrito con ♥️ por [Jen Looper](https://twitter.com/jenlooper)
+
+♥️ Los contribuyentes del cuestionario incluyen: [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan) y [Ornella Altunyan](https://twitter.com/ornelladotcom)
+
+El _dataset_ de calabaza es sugerido por [este proyecto en Kaggle](https://www.kaggle.com/usda/a-year-of-pumpkin-prices) y sus datos provienen de [Specialty Crops Terminal Markets Standard Reports](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) distribuido por el Departamento de Agricultura de los Estados Unidos. Hemos agregado algunos puntos relacionados con el color, basados en la variedad, para normalizar la distribución. Estos datos son de dominio público.
diff --git a/2-Regression/translations/README.fr.md b/2-Regression/translations/README.fr.md
new file mode 100644
index 0000000000..1b252f3f58
--- /dev/null
+++ b/2-Regression/translations/README.fr.md
@@ -0,0 +1,33 @@
+# Modèles de régression pour le machine learning
+## Sujet régional : Modèles de régression des prix des citrouilles en Amérique du Nord 🎃
+
+En Amérique du Nord, les citrouilles sont souvent sculptées en visages effrayants pour Halloween. Découvrons-en plus sur ces légumes fascinants!
+
+![jack-o-lanterns](../images/jack-o-lanterns.jpg)
+> Photo de Beth Teutschmann sur Unsplash
+
+## Ce que vous apprendrez
+
+Les leçons de cette section couvrent les types de régression dans le contexte du machine learning. Les modèles de régression peuvent aider à déterminer la _relation_ entre les variables. Ce type de modèle peut prédire des valeurs telles que la longueur, la température ou l'âge, découvrant ainsi les relations entre les variables lors de l'analyse des points de données.
+
+Dans cette série de leçons, vous découvrirez la différence entre la régression linéaire et la régression logistique, et quand vous devriez utiliser l'une ou l'autre.
+
+Dans ce groupe de leçons, vous serez préparé afin de commencer les tâches de machine learning, y compris la configuration de Visual Studio Code pour gérer les blocs-notes, l'environnement commun pour les scientifiques des données. Vous découvrirez Scikit-learn, une bibliothèque pour le machine learning, et vous construirez vos premiers modèles, en vous concentrant sur les modèles de régression dans ce chapitre.
+
+> Il existe des outils low-code utiles qui peuvent vous aider à apprendre à travailler avec des modèles de régression. Essayez [Azure ML pour cette tâche](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+### Cours
+
+1. [Outils du métier](1-Tools/translations/README.fr.md)
+2. [Gestion des données](2-Data/translations/README.fr.md)
+3. [Régression linéaire et polynomiale](3-Linear/translations/README.fr.md)
+4. [Régression logistique](4-Logistic/translations/README.fr.md)
+
+---
+### Crédits
+
+"ML avec régression" a été écrit avec ♥️ par [Jen Looper](https://twitter.com/jenlooper)
+
+♥️ Les contributeurs du quiz incluent : [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan) et [Ornella Altunyan](https://twitter.com/ornelladotcom)
+
+L'ensemble de données sur la citrouille est suggéré par [ce projet sur Kaggle](https://www.kaggle.com/usda/a-year-of-pumpkin-prices) et ses données proviennent des [Rapports standard des marchés terminaux des cultures spécialisées](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) distribué par le département américain de l'Agriculture. Nous avons ajouté quelques points autour de la couleur en fonction de la variété pour normaliser la distribution. Ces données sont dans le domaine public.
diff --git a/2-Regression/translations/README.id.md b/2-Regression/translations/README.id.md
new file mode 100644
index 0000000000..da1fa19399
--- /dev/null
+++ b/2-Regression/translations/README.id.md
@@ -0,0 +1,33 @@
+# Model regresi untuk *machine learning*
+## Topik regional: Model regresi untuk harga labu di Amerika Utara 🎃
+
+Di Amerika Utara, labu seringkali diukir menjadi muka-muka seram untuk Halloween. Mari mencari tahu lebih banyak tentang sayur menarik ini!
+
+![jack-o-lantern](../images/jack-o-lanterns.jpg)
+> Foto oleh Beth Teutschmann di Unsplash
+
+## Apa yang kamu akan pelajari
+
+Pelajaran-pelajaran dalam seksi ini mencakupi jenis-jenis regresi dalam konteks *machine learning*. Model regresi dapat membantu menentukan _hubungan_ antara variabel-variabel. Model jenis ini dapat memprediksi nilai-nilai seperti panjang, temperatur, atau usia, sehingga mengemukakan hubungan-hubungan antara variabel dengan menganalisis titik-titik data.
+
+Dalam seri pelajaran ini, kamu akan menemukan perbedaan antara regresi linear dan logistik, dan kapan untuk menggunakan satu atau yang lainnya.
+
+Selain itu, kamu akan disiapkan untuk mulai mengerjakan tugas *machine learning*, termasuk mengkonfigurasi Visual Studio Code untuk mengelola *notebook*, lingkungan wajar untuk *data scientist*. Kamu akan menemukan Scikit-learn, sebuah *library* untuk *machine learning*, dan kamu akan membangun model pertamamu dengan memfokus pada model regresi dalam bab ini.
+
+> Ada alat-alat *low-code* yang dapat membantumu belajar tentang bekerja dengan model regresi. Cobalah [Azure ML untuk tugas ini](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa).
+
+### Pelajaran
+
+1. [Alat-alat seorang *data scientist*](1-Tools/translations/README.id.md)
+2. [Mengelola data](2-Data/translations/README.id.md)
+3. [Regresi linear dan polinomial](3-Linear/translations/README.id.md)
+4. [Regresi logistik](4-Logistic/translations/README.id.md)
+
+---
+### Kredit
+
+"ML with regression" (ML dengan regresi) ditulis dari ♥️ oleh [Jen Looper](https://twitter.com/jenlooper)
+
+♥️ Kontributor kuis termasuk: [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan) dan [Ornella Altunyan](https://twitter.com/ornelladotcom)
+
+*Dataset* labu disarankan [proyek ini di Kaggle](https://www.kaggle.com/usda/a-year-of-pumpkin-prices) dan datanya disumberkan dari [Specialty Crops Terminal Markets Standard Reports (Laporan Standar Pasar Terminal Tanaman Khusus)](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) yang didistribusikan Departemen Agrikultur Amerika Serikat. Kami telah menambahkan beberapa poin tentang warna berdasarkan jenis labu untuk menormalisasi distribusi data. Data ini terbuka untuk umum (*public domain*).
diff --git a/2-Regression/translations/README.it.md b/2-Regression/translations/README.it.md
new file mode 100644
index 0000000000..c6e957f9e2
--- /dev/null
+++ b/2-Regression/translations/README.it.md
@@ -0,0 +1,34 @@
+# Modelli di regressione per machine learning
+
+## Argomento regionale: modelli di Regressione per i prezzi della zucca in Nord America 🎃
+
+In Nord America, le zucche sono spesso intagliate in facce spaventose per Halloween. Si scoprirà di più su queste affascinanti verdure!
+
+![jack-o-lantern](../images/jack-o-lanterns.jpg)
+> Foto di Beth Teutschmann su Unsplash
+
+## Cosa si imparerà
+
+Le lezioni in questa sezione riguardano i tipi di regressione nel contesto di machine learning. I modelli di regressione possono aiutare a determinare la _relazione_ tra le variabili. Questo tipo di modello può prevedere valori come lunghezza, temperatura o età, scoprendo così le relazioni tra le variabili mentre analizza i punti dati.
+
+In questa serie di lezioni si scoprirà la differenza tra regressione lineare e regressione logistica e quando si dovrebbe usare l'una o l'altra.
+
+In questo gruppo di lezioni si imposterà una configurazione per iniziare le attività di machine learning, inclusa la configurazione di Visual Studio Code per gestire i notebook, l'ambiente comune per i data scientist. Si scoprirà Scikit-learn, una libreria per machine learning, e si creeranno i primi modelli, concentrandosi in questo capitolo sui modelli di Regressione.
+
+> Esistono utili strumenti a basso codice che possono aiutare a imparare a lavorare con i modelli di regressione. Si provi [Azure Machine Learning per questa attività](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+### Lezioni
+
+1. [Gli Attrezzi Necessari](../1-Tools/translations/README.it.md)
+2. [Gestione dati](../2-Data/translations/README.it.md)
+3. [Regressione lineare e polinomiale](../3-Linear/translations/README.it.md)
+4. [Regressione logistica](../4-Logistic/translations/README.it.md)
+
+---
+### Crediti
+
+"ML con regressione" scritto con ♥️ da [Jen Looper](https://twitter.com/jenlooper)
+
+♥️ I collaboratori del quiz includono: [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan) e [Ornella Altunyan](https://twitter.com/ornelladotcom)
+
+L'insieme di dati relativi alla zucca è suggerito da [questo progetto su Kaggle](https://www.kaggle.com/usda/a-year-of-pumpkin-prices) e i suoi dati provengono dai [Rapporti Standard sui Mercati Terminali delle Colture Speciali](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice) distribuiti dal Dipartimento dell'Agricoltura degli Stati Uniti. Sono stati aggiunti alcuni punti intorno al colore in base alla varietà per normalizzare la distribuzione. Questi dati sono di pubblico dominio.
diff --git a/2-Regression/translations/README.ja.md b/2-Regression/translations/README.ja.md
new file mode 100644
index 0000000000..50d8294b45
--- /dev/null
+++ b/2-Regression/translations/README.ja.md
@@ -0,0 +1,32 @@
+# 機械学習のための回帰モデル
+## トピック: 北米のカボチャ価格に関する回帰モデル 🎃
+
+北米では、ハロウィンのためにカボチャはよく怖い顔に彫られています。そんな魅力的な野菜についてもっと知りましょう!
+
+![jack-o-lanterns](../images/jack-o-lanterns.jpg)
+> Beth TeutschmannによってUnsplashに投稿された写真
+
+## 今回学ぶこと
+この章のレッスンでは、機械学習の文脈における回帰の種類について説明します。回帰モデルは変数間の"関係"を決定するのに役立ちます。このタイプのモデルは、長さ、温度、年齢などの値を予測し、データポイントの分析をすることで変数間の関係性を明らかにします。
+
+今回のレッスンでは、線形回帰とロジスティック回帰の違いやどのように使い分けるかを説明します。
+
+データサイエンティストの共通開発環境であるノートブックを管理するためのVisual Studio Codeの構成や機械学習のタスクを開始するための準備を行います。また、機械学習用のライブラリであるScikit-learnを利用し最初のモデルを構築します。この章では回帰モデルに焦点を当てます。
+
+> 回帰モデルを学習するのに役立つローコードツールがあります。ぜひ[Azure ML for this task](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)を使ってみてください。
+
+### レッスン
+
+1. [商売道具](../1-Tools/translations/README.ja.md)
+2. [データ管理](../2-Data/translations/README.ja.md)
+3. [線形回帰と多項式回帰](../3-Linear/translations/README.ja.md)
+4. [ロジスティック回帰](../4-Logistic/translations/README.ja.md)
+
+---
+### クレジット
+
+"機械学習と回帰"は、[Jen Looper](https://twitter.com/jenlooper)によって制作されました。
+
+クイズの貢献者: [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan)と[Ornella Altunyan](https://twitter.com/ornelladotcom)
+
+pumpkin datasetは、[こちらのKaggleプロジェクト](https://www.kaggle.com/usda/a-year-of-pumpkin-prices)で提案されています。このデータは、アメリカ合衆国農務省が配布している[Specialty Crops Terminal Markets Standard Reports](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice)が元になっています。私たちは、分布を正規化するために多様性を元に色についていくつか追加を行っています。このデータはパブリックドメインです。
diff --git a/2-Regression/translations/README.ru.md b/2-Regression/translations/README.ru.md
new file mode 100644
index 0000000000..074f0858ac
--- /dev/null
+++ b/2-Regression/translations/README.ru.md
@@ -0,0 +1,33 @@
+# Модели регрессии для машинного обучения
+## Региональная тема: модели регрессии для цен на тыкву в Северной Америке 🎃
+
+В Северной Америке на Хэллоуин из тыкв часто вырезают страшные лица. Давайте узнаем больше об этих восхитительных овощах!
+
+![jack-o-lanterns](../images/jack-o-lanterns.jpg)
+> Фото Бет Тойчманн на Unsplash
+
+## Что вы узнаете
+
+Уроки в этом разделе охватывают типы регрессии в контексте машинного обучения. Модели регрессии могут помочь определить отношения между переменными. Этот тип модели может предсказывать такие значения, как длина, температура или возраст, тем самым выявляя взаимосвязи между переменными при анализе точек данных.
+
+В этой серии уроков вы обнаружите разницу между линейной регрессией и логистической регрессией, а также когда вам следует использовать ту или иную регрессию.
+
+В этой группе уроков вы подготовитесь к выполнению задач машинного обучения, включая настройку Visual Studio Code для управления записными книжками, общей средой для специалистов по данным. Вы откроете для себя Scikit-learn, библиотеку для машинного обучения, и создадите свои первые модели, уделяя особое внимание моделям регрессии в этой главе.
+
+> Существуют полезные инструменты с небольшим количеством кода, которые могут помочь вам узнать о работе с моделями регрессии. Попробуйте [Azure ML для этой задачи](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+### Уроки
+
+1. [Необходимые инструменты](1-Tools/README.md)
+2. [Управление данными](2-Data/README.md)
+3. [Линейная и полиномиальная регрессия](3-Linear/README.md)
+4. [Логистическая регрессия](4-Logistic/README.md)
+
+---
+### Благодарности
+
+«ML с регрессией» был написан с помощью ♥ ️[Джен Лупер](https://twitter.com/jenlooper)
+
+♥ ️ Среди участников викторины: [Мухаммад Сакиб Хан Инан](https://twitter.com/Sakibinan) и [Орнелла Алтунян](https://twitter.com/ornelladotcom)
+
+Набор данных по тыкве предложен в [этом проекте на Kaggle](https://www.kaggle.com/usda/a-year-of-pumpkin-prices), а его данные взяты из [Стандартных отчетов по рынкам специальных культур на терминалах](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice), распространяемых Министерством сельского хозяйства США. Мы добавили несколько точек, связанных с цветом, на основе сорта, чтобы нормализовать распределение. Эти данные находятся в открытом доступе.
diff --git a/2-Regression/translations/README.zh-cn.md b/2-Regression/translations/README.zh-cn.md
new file mode 100644
index 0000000000..f7c511e620
--- /dev/null
+++ b/2-Regression/translations/README.zh-cn.md
@@ -0,0 +1,34 @@
+# 机器学习中的回归模型
+## 本节主题: 北美南瓜价格的回归模型 🎃
+
+在北美,南瓜经常在万圣节被刻上吓人的鬼脸。让我们来深入研究一下这种奇妙的蔬菜
+
+![jack-o-lantern](../images/jack-o-lanterns.jpg)
+> 照片由 Beth Teutschmann 拍摄于 Unsplash
+
+## 你会学到什么
+
+这节的课程包括机器学习领域中的多种回归模型。回归模型可以明确多种变量间的_关系_。这种模型可以用来预测类似长度、温度和年龄之类的值, 通过分析数据点来揭示变量之间的关系。
+
+在本节的一系列课程中,你会学到线性回归和逻辑回归之间的区别,并且你将知道对于特定问题如何在这两种模型中进行选择
+
+在这组课程中,你会为开始机器学习任务做好准备,包括设置 VS Code 来管理 notebook,这是数据科学家常用的环境。你会开始上手 Scikit-learn(一个机器学习库),并且你会以回归模型为主构建起你的第一批机器学习模型。
+
+> 这里有一些代码难度较低但很有用的工具可以帮助你学习使用回归模型。 试一下 [Azure ML for this task](https://docs.microsoft.com/learn/modules/create-regression-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+
+### 课程
+
+1. [交易的工具](../1-Tools/translations/README.zh-cn.md)
+2. [管理数据](../2-Data/translations/README.zh-cn.md)
+3. [线性和多项式回归](../3-Linear/translations/README.zh-cn.md)
+4. [逻辑回归](../4-Logistic/translations/README.zh-cn.md)
+
+---
+### 致谢
+
+"机器学习中的回归" 由[Jen Looper](https://twitter.com/jenlooper)♥️ 撰写
+
+♥️ 测试的贡献者: [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan) 和 [Ornella Altunyan](https://twitter.com/ornelladotcom)
+
+南瓜数据集来自 [this project on Kaggle](https://www.kaggle.com/usda/a-year-of-pumpkin-prices) 的建议,其数据源自美国农业部发布的 [Specialty Crops Terminal Markets Standard Reports](https://www.marketnews.usda.gov/mnp/fv-report-config-step1?type=termPrice)。我们根据品种添加了一些与颜色相关的数据点,以规范分布。这些数据属于公共领域。
diff --git a/3-Web-App/1-Web-App/README.md b/3-Web-App/1-Web-App/README.md
index f848d80786..c983e679a9 100644
--- a/3-Web-App/1-Web-App/README.md
+++ b/3-Web-App/1-Web-App/README.md
@@ -1,6 +1,6 @@
# Build a Web App to use a ML Model
-In this lesson, you will train an ML model on a data set that's out of this world: _UFO sightings over the past century_, sourced from [NUFORC's database](https://www.nuforc.org).
+In this lesson, you will train an ML model on a data set that's out of this world: _UFO sightings over the past century_, sourced from NUFORC's database.
You will learn:
@@ -11,7 +11,7 @@ We will continue our use of notebooks to clean data and train our model, but you
To do this, you need to build a web app using Flask.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/17/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/17/)
## Building an app
@@ -19,15 +19,15 @@ There are several ways to build web apps to consume machine learning models. You
### Considerations
-There are many questions you need to ask:
+There are many questions you need to ask:
- **Is it a web app or a mobile app?** If you are building a mobile app or need to use the model in an IoT context, you could use [TensorFlow Lite](https://www.tensorflow.org/lite/) and use the model in an Android or iOS app.
-- **Where will the model reside**? In the cloud or locally?
-- **Offline support**. Does the app have to work offline?
+- **Where will the model reside?** In the cloud or locally?
+- **Offline support.** Does the app have to work offline?
- **What technology was used to train the model?** The chosen technology may influence the tooling you need to use.
- - **Using Tensor flow**. If you are training a model using TensorFlow, for example, that ecosystem provides the ability to convert a TensorFlow model for use in a web app by using [TensorFlow.js](https://www.tensorflow.org/js/).
- - **Using PyTorch**. If you are building a model using a library such as [PyTorch](https://pytorch.org/), you have the option to export it in [ONNX](https://onnx.ai/) (Open Neural Network Exchange) format for use in JavaScript web apps that can use the [Onnx Runtime](https://www.onnxruntime.ai/). This option will be explored in a future lesson for a Scikit-learn-trained model.
- - **Using Lobe.ai or Azure Custom vision**. If you are using an ML SaaS (Software as a Service) system such as [Lobe.ai](https://lobe.ai/) or [Azure Custom Vision](https://azure.microsoft.com/services/cognitive-services/custom-vision-service/?WT.mc_id=academic-15963-cxa) to train a model, this type of software provides ways to export the model for many platforms, including building a bespoke API to be queried in the cloud by your online application.
+ - **Using Tensor flow.** If you are training a model using TensorFlow, for example, that ecosystem provides the ability to convert a TensorFlow model for use in a web app by using [TensorFlow.js](https://www.tensorflow.org/js/).
+ - **Using PyTorch.** If you are building a model using a library such as [PyTorch](https://pytorch.org/), you have the option to export it in [ONNX](https://onnx.ai/) (Open Neural Network Exchange) format for use in JavaScript web apps that can use the [Onnx Runtime](https://www.onnxruntime.ai/). This option will be explored in a future lesson for a Scikit-learn-trained model.
+ - **Using Lobe.ai or Azure Custom Vision.** If you are using an ML SaaS (Software as a Service) system such as [Lobe.ai](https://lobe.ai/) or [Azure Custom Vision](https://azure.microsoft.com/services/cognitive-services/custom-vision-service/?WT.mc_id=academic-15963-cxa) to train a model, this type of software provides ways to export the model for many platforms, including building a bespoke API to be queried in the cloud by your online application.
You also have the opportunity to build an entire Flask web app that would be able to train the model itself in a web browser. This can also be done using TensorFlow.js in a JavaScript context.
@@ -37,7 +37,7 @@ For our purposes, since we have been working with Python-based notebooks, let's
For this task, you need two tools: Flask and Pickle, both of which run on Python.
-✅ What's [Flask](https://palletsprojects.com/p/flask/)? Defined as a 'micro-framework' by its creators, Flask provides the basic features of web frameworks using Python and a templating engine to build web pages. Take a look at [this Learn module](https://docs.microsoft.com/learn/modules/python-flask-build-ai-web-app?WT.mc_id=academic-15963-cxa) to practice building with Flask.
+✅ What's [Flask](https://palletsprojects.com/p/flask/)? Defined as a 'micro-framework' by its creators, Flask provides the basic features of web frameworks using Python and a templating engine to build web pages. Take a look at [this Learn module](https://docs.microsoft.com/learn/modules/python-flask-build-ai-web-app?WT.mc_id=academic-15963-cxa) to practice building with Flask.
✅ What's [Pickle](https://docs.python.org/3/library/pickle.html)? Pickle 🥒 is a Python module that serializes and de-serializes a Python object structure. When you 'pickle' a model, you serialize or flatten its structure for use on the web. Be careful: pickle is not intrinsically secure, so be careful if prompted to 'un-pickle' a file. A pickled file has the suffix `.pkl`.
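For instance, here's a minimal round-trip sketch (illustrative only: the toy data and the `toy-model.pkl` file name are placeholders, not part of this lesson, whose own pickling exercise comes later):

```python
import pickle
from sklearn.linear_model import LogisticRegression

# A tiny throwaway model, just to have something to serialize
model = LogisticRegression().fit([[0, 0], [1, 1]], [0, 1])

# 'Pickling' flattens the Python object into bytes on disk
with open('toy-model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Un-pickling restores the object; only do this with files you trust
with open('toy-model.pkl', 'rb') as f:
    loaded = pickle.load(f)

print(loaded.predict([[1, 0]]))
```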
@@ -45,12 +45,12 @@ For this task, you need two tools: Flask and Pickle, both of which run on Python
In this lesson you'll use data from 80,000 UFO sightings, gathered by [NUFORC](https://nuforc.org) (The National UFO Reporting Center). This data has some interesting descriptions of UFO sightings, for example:
-- **Long example description**. "A man emerges from a beam of light that shines on a grassy field at night and he runs towards the Texas Instruments parking lot".
-- **Short example description**. "the lights chased us".
+- **Long example description.** "A man emerges from a beam of light that shines on a grassy field at night and he runs towards the Texas Instruments parking lot".
+- **Short example description.** "the lights chased us".
-The [ufos.csv](./data/ufos.csv) spreadsheet includes columns about the `city`, `state` and `country` where the sighting occurred, the object's `shape` and its `latitude` and `longitude`.
+The [ufos.csv](./data/ufos.csv) spreadsheet includes columns about the `city`, `state` and `country` where the sighting occurred, the object's `shape` and its `latitude` and `longitude`.
-In the blank [notebook](notebook.ipynb) included in this lesson:
+In the blank [notebook](notebook.ipynb) included in this lesson:
1. import `pandas`, `matplotlib`, and `numpy` as you did in previous lessons and import the ufos spreadsheet. You can take a look at a sample data set:
@@ -58,7 +58,7 @@ In the blank [notebook](notebook.ipynb) included in this lesson:
import pandas as pd
import numpy as np
- ufos = pd.read_csv('../data/ufos.csv')
+ ufos = pd.read_csv('./data/ufos.csv')
ufos.head()
```
@@ -82,7 +82,7 @@ In the blank [notebook](notebook.ipynb) included in this lesson:
1. Import Scikit-learn's `LabelEncoder` library to convert the text values for countries to a number:
- ✅ LabelEncoder encodes data alphabetically
+ ✅ LabelEncoder encodes data alphabetically
```python
from sklearn.preprocessing import LabelEncoder
@@ -96,16 +96,16 @@ In the blank [notebook](notebook.ipynb) included in this lesson:
```output
Seconds Country Latitude Longitude
- 2 20.0 3 53.200000 -2.916667
- 3 20.0 4 28.978333 -96.645833
- 14 30.0 4 35.823889 -80.253611
- 23 60.0 4 45.582778 -122.352222
- 24 3.0 3 51.783333 -0.783333
+ 2 20.0 3 53.200000 -2.916667
+ 3 20.0 4 28.978333 -96.645833
+ 14 30.0 4 35.823889 -80.253611
+ 23 60.0 4 45.582778 -122.352222
+ 24 3.0 3 51.783333 -0.783333
```
## Exercise - build your model
-Now you can get ready to train a model by diving the data into the training and testing group.
+Now you can get ready to train a model by dividing the data into training and testing groups.
1. Select the three features you want to train on as your X vector, and the y vector will be the `Country`. You want to be able to input `Seconds`, `Latitude` and `Longitude` and get a country id to return.
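A sketch of what that selection and split can look like (column names as used in this lesson; it assumes the `ufos` dataframe from the previous steps):

```python
from sklearn.model_selection import train_test_split

# The three input features and the label column used in this lesson
Selected_features = ['Seconds', 'Latitude', 'Longitude']

X = ufos[Selected_features]
y = ufos['Country']

# Hold back 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
```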
@@ -123,7 +123,7 @@ Now you can get ready to train a model by diving the data into the training and
1. Train your model using logistic regression:
```python
- from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
@@ -159,20 +159,20 @@ Now you can build a Flask app to call your model and return similar results, but
1. Start by creating a folder called **web-app** next to the _notebook.ipynb_ file where your _ufo-model.pkl_ file resides.
-1. In that folder create three more folders: **static**, with a folder **css** inside it, and **templates`**. You should now have the following files and directories:
+1. In that folder create three more folders: **static**, with a folder **css** inside it, and **templates**. You should now have the following files and directories:
```output
web-app/
static/
css/
- templates/
+ templates/
notebook.ipynb
- ufo-model.pk1
- ```
+ ufo-model.pkl
+ ```
- ✅ Refer to the solution folder for a view of the finished app
+ ✅ Refer to the solution folder for a view of the finished app
-1. The first file to create in _web-app_ folder is **requirements.txt** file. Like _package.json_ in a JavaScript app, this file lists dependencies required by the app. In **requirements.txt** add the lines:
+1. The first file to create in the _web-app_ folder is the **requirements.txt** file. Like _package.json_ in a JavaScript app, this file lists the dependencies required by the app. In **requirements.txt**, add the lines:
```text
scikit-learn
@@ -183,23 +183,23 @@ Now you can build a Flask app to call your model and return similar results, but
1. Now, run this file by navigating to _web-app_:
- ```bash
- cd web-app
- ```
+ ```bash
+ cd web-app
+ ```
-1. In your terminal type `pip install`, to install the libraries listed in _reuirements.txt_:
+1. In your terminal, type `pip install` to install the libraries listed in _requirements.txt_:
- ```bash
- pip install -r requirements.txt
- ```
+ ```bash
+ pip install -r requirements.txt
+ ```
1. Now, you're ready to create three more files to finish the app:
- 1. Create **app.py** in the root
+ 1. Create **app.py** in the root.
2. Create **index.html** in _templates_ directory.
3. Create **styles.css** in _static/css_ directory.
-1. Build out the _styles.css__ file with a few styles:
+1. Build out the _styles.css_ file with a few styles:
```css
body {
@@ -238,33 +238,33 @@ Now you can build a Flask app to call your model and return similar results, but
```html
   <!-- index.html form markup: minor whitespace and formatting adjustments around the question text, the form that posts to the /predict route, and the {{ prediction_text }} placeholder -->
```
@@ -309,7 +309,7 @@ Now you can build a Flask app to call your model and return similar results, but
app.run(debug=True)
```
- > 💡 Tip: when you add [`debug=True`](https://www.askpython.com/python-modules/flask/flask-debug-mode) while running the web app using Flask, any changes you make to your application will be reflected immediately without the need to restart the server. Beware! Don't enable this mode in a production app.
+ > 💡 Tip: when you add [`debug=True`](https://www.askpython.com/python-modules/flask/flask-debug-mode) while running the web app using Flask, any changes you make to your application will be reflected immediately without the need to restart the server. Beware! Don't enable this mode in a production app.
If you run `python app.py` or `python3 app.py` - your web server starts up, locally, and you can fill out a short form to get an answer to your burning question about where UFOs have been sighted!
@@ -324,24 +324,22 @@ On the `/predict` route, several things happen when the form is posted:
1. The form variables are gathered and converted to a numpy array. They are then sent to the model and a prediction is returned.
2. The Countries that we want displayed are re-rendered as readable text from their predicted country code, and that value is sent back to index.html to be rendered in the template.
-Using a model this way, with Flask and a pickled model, is relatively straightforward. The hardest thing is to understand what shape the data is that must be sent to the model to get a prediction. That all depends on how the model was trained. This one has three data points to be input in order to get a prediction.
+Using a model this way, with Flask and a pickled model, is relatively straightforward. The hardest thing is to understand what shape the data is that must be sent to the model to get a prediction. That all depends on how the model was trained. This one has three data points to be input in order to get a prediction.
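For example, you could sanity-check the expected input shape before wiring up the form (a sketch; it assumes the _ufo-model.pkl_ file saved earlier in this lesson and a reasonably recent Scikit-learn):

```python
import pickle
import numpy as np

# Load the model trained earlier in this lesson
model = pickle.load(open("ufo-model.pkl", "rb"))

# n_features_in_ reports how many values the model expects per sample
print(model.n_features_in_)  # 3 -> Seconds, Latitude, Longitude

# A prediction therefore needs a 2D array with one row of three values
print(model.predict(np.array([[50, 44, -12]])))
```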
In a professional setting, you can see how good communication is necessary between the folks who train the model and those who consume it in a web or mobile app. In our case, it's only one person, you!
---
-## 🚀 Challenge:
+## 🚀 Challenge:
Instead of working in a notebook and importing the model to the Flask app, you could train the model right within the Flask app! Try converting your Python code in the notebook, perhaps after your data is cleaned, to train the model from within the app on a route called `train`. What are the pros and cons of pursuing this method?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/18/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/18/)
## Review & Self Study
There are many ways to build a web app to consume ML models. Make a list of the ways you could use JavaScript or Python to build a web app to leverage machine learning. Consider architecture: should the model stay in the app or live in the cloud? If the latter, how would you access it? Draw out an architectural model for an applied ML web solution.
-## Assignment
+## Assignment
[Try a different model](assignment.md)
-
-
diff --git a/3-Web-App/1-Web-App/translations/README.it.md b/3-Web-App/1-Web-App/translations/README.it.md
new file mode 100644
index 0000000000..f31eb2d4b5
--- /dev/null
+++ b/3-Web-App/1-Web-App/translations/README.it.md
@@ -0,0 +1,347 @@
+# Creare un'app web per utilizzare un modello ML
+
+In questa lezione, si addestrerà un modello ML su un insieme di dati fuori dal mondo: _avvistamenti di UFO nel secolo scorso_, provenienti dal [database di NUFORC](https://www.nuforc.org).
+
+Si imparerà:
+
+- Come serializzare/deserializzare un modello addestrato
+- Come usare quel modello in un'app Flask
+
+Si continuerà a utilizzare il notebook per pulire i dati e addestrare il modello, ma si può fare un ulteriore passo avanti nel processo esplorando l'utilizzo del modello direttamente in un'app web.
+
+Per fare ciò, è necessario creare un'app Web utilizzando Flask.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/17/)
+
+## Costruire un'app
+
+Esistono diversi modi per creare app Web per utilizzare modelli di machine learning. L'architettura web può influenzare il modo in cui il modello viene addestrato. Si immagini di lavorare in un'azienda nella quale il gruppo di data science ha addestrato un modello che va utilizzato in un'app.
+
+### Considerazioni
+
+Ci sono molte domande da porsi:
+
+- **È un'app web o un'app su dispositivo mobile?** Se si sta creando un'app su dispositivo mobile o si deve usare il modello in un contesto IoT, ci si può avvalere [di TensorFlow Lite](https://www.tensorflow.org/lite/) e usare il modello in un'app Android o iOS.
+- **Dove risiederà il modello?** È utilizzato in cloud o in locale?
+- **Supporto offline**. L'app deve funzionare offline?
+- **Quale tecnologia è stata utilizzata per addestrare il modello?** La tecnologia scelta può influenzare gli strumenti che è necessario utilizzare.
+ - **Utilizzare TensorFlow**. Se si sta addestrando un modello utilizzando TensorFlow, ad esempio, tale ecosistema offre la possibilità di convertire un modello TensorFlow per l'utilizzo in un'app Web utilizzando [TensorFlow.js](https://www.tensorflow.org/js/).
+ - **Utilizzare PyTorch**. Se si sta costruendo un modello utilizzando una libreria come [PyTorch](https://pytorch.org/), si ha la possibilità di esportarlo in formato [ONNX](https://onnx.ai/) (Open Neural Network Exchange) per l'utilizzo in app Web JavaScript che possono utilizzare il [motore di esecuzione Onnx](https://www.onnxruntime.ai/). Questa opzione verrà esplorata in una lezione futura per un modello addestrato da Scikit-learn.
+ - **Utilizzare Lobe.ai o Azure Custom Vision**. Se si sta usando un sistema ML SaaS (Software as a Service) come [Lobe.ai](https://lobe.ai/) o [Azure Custom Vision](https://azure.microsoft.com/services/cognitive-services/custom-vision-service/?WT.mc_id=academic-15963-cxa) per addestrare un modello, questo tipo di software fornisce modi per esportare il modello per molte piattaforme, inclusa la creazione di un'API su misura da interrogare nel cloud dalla propria applicazione online.
+
+Si ha anche l'opportunità di creare un'intera app Web Flask in grado di addestrare il modello stesso in un browser Web. Questo può essere fatto anche usando TensorFlow.js in un contesto JavaScript.
+
+Per questo scopo, poiché si è lavorato con i notebook basati su Python, verranno esplorati i passaggi necessari per esportare un modello addestrato da tale notebook in un formato leggibile da un'app Web creata in Python.
+
+## Strumenti
+
+Per questa attività sono necessari due strumenti: Flask e Pickle, entrambi eseguiti su Python.
+
+✅ Cos'è [Flask](https://palletsprojects.com/p/flask/)? Definito come un "micro-framework" dai suoi creatori, Flask fornisce le funzionalità di base dei framework web utilizzando Python e un motore di template per creare pagine web. Si dia un'occhiata a [questo modulo di apprendimento](https://docs.microsoft.com/learn/modules/python-flask-build-ai-web-app?WT.mc_id=academic-15963-cxa) per esercitarsi a sviluppare con Flask.
+
+✅ Cos'è [Pickle](https://docs.python.org/3/library/pickle.html)? Pickle 🥒 è un modulo Python che serializza e de-serializza la struttura di un oggetto Python. Quando si utilizza pickle in un modello, si serializza o si appiattisce la sua struttura per l'uso sul web. Cautela: pickle non è intrinsecamente sicuro, quindi si faccia attenzione se viene chiesto di de-serializzare un file. Un file creato con pickle ha il suffisso `.pkl`.
+
+## Esercizio: pulire i dati
+
+In questa lezione verranno utilizzati i dati di 80.000 avvistamenti UFO, raccolti dal Centro Nazionale per gli Avvistamenti di UFO [NUFORC](https://nuforc.org) (The National UFO Reporting Center). Questi dati hanno alcune descrizioni interessanti di avvistamenti UFO, ad esempio:
+
+- **Descrizione di esempio lunga**. "Un uomo emerge da un raggio di luce che di notte brilla su un campo erboso e corre verso il parcheggio della Texas Instruments".
+- **Descrizione di esempio breve**. "le luci ci hanno inseguito".
+
+Il foglio di calcolo [ufos.csv](../data/ufos.csv) include colonne su città (`city`), stato (`state`) e nazione (`country`) in cui è avvenuto l'avvistamento, la forma (`shape`) dell'oggetto e la sua latitudine (`latitude`) e longitudine (`longitude`).
+
+Nel [notebook](../notebook.ipynb) vuoto incluso in questa lezione:
+
+1. importare `pandas`, `matplotlib` e `numpy` come fatto nelle lezioni precedenti e importare il foglio di calcolo ufos.csv. Si può dare un'occhiata a un insieme di dati campione:
+
+ ```python
+ import pandas as pd
+ import numpy as np
+
+ ufos = pd.read_csv('../data/ufos.csv')
+ ufos.head()
+ ```
+
+1. Convertire i dati ufos in un piccolo dataframe con nuove intestazioni. Controllare i valori univoci nel campo `Country`.
+
+ ```python
+ ufos = pd.DataFrame({'Seconds': ufos['duration (seconds)'], 'Country': ufos['country'],'Latitude': ufos['latitude'],'Longitude': ufos['longitude']})
+
+ ufos.Country.unique()
+ ```
+
+1. Ora si può ridurre la quantità di dati da gestire eliminando qualsiasi valore nullo e importando solo avvistamenti tra 1-60 secondi:
+
+ ```python
+ ufos.dropna(inplace=True)
+
+ ufos = ufos[(ufos['Seconds'] >= 1) & (ufos['Seconds'] <= 60)]
+
+ ufos.info()
+ ```
+
+1. Importare la libreria `LabelEncoder` di Scikit-learn per convertire i valori di testo per le nazioni in un numero:
+
+ ✅ LabelEncoder codifica i dati in ordine alfabetico
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+
+ ufos['Country'] = LabelEncoder().fit_transform(ufos['Country'])
+
+ ufos.head()
+ ```
+
+ I dati dovrebbero assomigliare a questo:
+
+ ```output
+ Seconds Country Latitude Longitude
+ 2 20.0 3 53.200000 -2.916667
+ 3 20.0 4 28.978333 -96.645833
+ 14 30.0 4 35.823889 -80.253611
+ 23 60.0 4 45.582778 -122.352222
+ 24 3.0 3 51.783333 -0.783333
+ ```
+
+## Esercizio: costruire il proprio modello
+
+Ora ci si può preparare per addestrare un modello portando i dati nei gruppi di addestramento e test.
+
+1. Selezionare le tre caratteristiche su cui lo si vuole allenare come vettore X, mentre il vettore y sarà `Country`. Si deve essere in grado di inserire secondi (`Seconds`), latitudine (`Latitude`) e longitudine (`Longitude`) e ottenere un ID nazione da restituire.
+
+ ```python
+ from sklearn.model_selection import train_test_split
+
+ Selected_features = ['Seconds','Latitude','Longitude']
+
+ X = ufos[Selected_features]
+ y = ufos['Country']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ ```
+
+1. Addestrare il modello usando la regressione logistica:
+
+ ```python
+ from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.linear_model import LogisticRegression
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+ predictions = model.predict(X_test)
+
+ print(classification_report(y_test, predictions))
+ print('Predicted labels: ', predictions)
+ print('Accuracy: ', accuracy_score(y_test, predictions))
+ ```
+
+La precisione non è male **(circa il 95%)**: non sorprende che `Country` e `Latitude/Longitude` siano correlati.
+
+Il modello creato non è molto rivoluzionario in quanto si dovrebbe essere in grado di dedurre una nazione (`Country`) dalla sua latitudine e longitudine (`Latitude` e `Longitude`), ma è un buon esercizio provare ad allenare dai dati grezzi che sono stati puliti ed esportati, e quindi utilizzare questo modello in una app web.
+
+## Esercizio: usare pickle con il modello
+
+Ora è il momento di utilizzare _pickle_ con il modello! Lo si può fare in poche righe di codice. Una volta che è stato _serializzato con pickle_, caricare il modello e testarlo rispetto a un array di dati di esempio contenente valori per secondi, latitudine e longitudine.
+
+```python
+import pickle
+model_filename = 'ufo-model.pkl'
+pickle.dump(model, open(model_filename,'wb'))
+
+model = pickle.load(open('ufo-model.pkl','rb'))
+print(model.predict([[50,44,-12]]))
+```
+
+Il modello restituisce **"3"**, che è il codice nazione per il Regno Unito. Fantastico! 👽
+
+## Esercizio: creare un'app Flask
+
+Ora si può creare un'app Flask per chiamare il modello e restituire risultati simili, ma in un modo visivamente più gradevole.
+
+1. Iniziare creando una cartella chiamata **web-app** a livello del file _notebook.ipynb_ dove risiede il file _ufo-model.pkl_.
+
+1. In quella cartella creare altre tre cartelle: **static**, con una cartella **css** al suo interno e **templates**. Ora si dovrebbero avere i seguenti file e directory:
+
+ ```output
+ web-app/
+ static/
+ css/
+ templates/
+ notebook.ipynb
+ ufo-model.pkl
+ ```
+
+ ✅ Fare riferimento alla cartella della soluzione per una visualizzazione dell'app finita.
+
+1. Il primo file da creare nella cartella _web-app_ è il file **requirements.txt**. Come _package.json_ in un'app JavaScript, questo file elenca le dipendenze richieste dall'app. In **requirements.txt** aggiungere le righe:
+
+ ```text
+ scikit-learn
+ pandas
+ numpy
+ flask
+ ```
+
+1. Ora, eseguire questo file portandosi su _web-app_:
+
+ ```bash
+ cd web-app
+ ```
+
+1. Aprire una finestra di terminale dove risiede requirements.txt e digitare `pip install` per installare le librerie elencate in _requirements.txt_:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+1. Ora si è pronti per creare altri tre file per completare l'app:
+
+ 1. Creare **app.py** nella directory radice.
+ 2. Creare **index.html** nella directory _templates_.
+   3. Creare **styles.css** nella directory _static/css_.
+
+1. Inserire nel file _styles.css_ alcuni stili:
+
+ ```css
+ body {
+ width: 100%;
+ height: 100%;
+ font-family: 'Helvetica';
+ background: black;
+ color: #fff;
+ text-align: center;
+ letter-spacing: 1.4px;
+ font-size: 30px;
+ }
+
+ input {
+ min-width: 150px;
+ }
+
+ .grid {
+ width: 300px;
+ border: 1px solid #2d2d2d;
+ display: grid;
+ justify-content: center;
+ margin: 20px auto;
+ }
+
+ .box {
+ color: #fff;
+ background: #2d2d2d;
+ padding: 12px;
+ display: inline-block;
+ }
+ ```
+
+1. Quindi, creare il file _index.html_:
+
+   ```html
+   <!-- index.html: pagina con il titolo "🛸 UFO Appearance Prediction! 👽", il testo della domanda, un form che invia i valori alla rotta /predict e il segnaposto {{ prediction_text }} -->
+ ```
+
+ Dare un'occhiata al template di questo file. Notare la sintassi con le parentesi graffe attorno alle variabili che verranno fornite dall'app, come il testo di previsione: `{{}}`. C'è anche un modulo che invia una previsione alla rotta `/predict`.
+
+ Infine, si è pronti per creare il file python che guida il consumo del modello e la visualizzazione delle previsioni:
+
+1. In `app.py` aggiungere:
+
+ ```python
+ import numpy as np
+ from flask import Flask, request, render_template
+ import pickle
+
+ app = Flask(__name__)
+
+ model = pickle.load(open("../ufo-model.pkl", "rb"))
+
+
+ @app.route("/")
+ def home():
+ return render_template("index.html")
+
+
+ @app.route("/predict", methods=["POST"])
+ def predict():
+
+ int_features = [int(x) for x in request.form.values()]
+ final_features = [np.array(int_features)]
+ prediction = model.predict(final_features)
+
+ output = prediction[0]
+
+ countries = ["Australia", "Canada", "Germany", "UK", "US"]
+
+ return render_template(
+ "index.html", prediction_text="Likely country: {}".format(countries[output])
+ )
+
+
+ if __name__ == "__main__":
+ app.run(debug=True)
+ ```
+
+ > 💡 Suggerimento: quando si aggiunge [`debug=True`](https://www.askpython.com/python-modules/flask/flask-debug-mode) durante l'esecuzione dell'app web utilizzando Flask, qualsiasi modifica apportata all'applicazione verrà recepita immediatamente senza la necessità di riavviare il server. Attenzione! Non abilitare questa modalità in un'app di produzione.
+
+Se si esegue `python app.py` o `python3 app.py` , il server web si avvia, localmente, e si può compilare un breve modulo per ottenere una risposta alla domanda scottante su dove sono stati avvistati gli UFO!
+
+Prima di farlo, dare un'occhiata alle parti di `app.py`:
+
+1. Innanzitutto, le dipendenze vengono caricate e l'app si avvia.
+1. Poi il modello viene importato.
+1. Infine index.html viene visualizzato sulla rotta home.
+
+Sulla rotta `/predict`, accadono diverse cose quando il modulo viene inviato:
+
+1. Le variabili del modulo vengono raccolte e convertite in un array numpy. Vengono quindi inviate al modello e viene restituita una previsione.
+2. Le nazioni che si vogliono visualizzare vengono nuovamente esposte come testo leggibile ricavato dal loro codice paese previsto e tale valore viene inviato a index.html per essere visualizzato nel template della pagina web.
+
+Usare un modello in questo modo, con Flask e un modello serializzato, è relativamente semplice. La cosa più difficile è capire che forma hanno i dati che devono essere inviati al modello per ottenere una previsione. Tutto dipende da come è stato addestrato il modello. Questo ha tre punti dati da inserire per ottenere una previsione.
+
+In un ambiente professionale, si può vedere quanto sia necessaria una buona comunicazione tra le persone che addestrano il modello e coloro che lo consumano in un'app web o su dispositivo mobile. In questo caso, si ricoprono entrambi i ruoli!
+
+---
+
+## 🚀 Sfida
+
+Invece di lavorare su un notebook e importare il modello nell'app Flask, si può addestrare il modello direttamente nell'app Flask! Provare a convertire il codice Python nel notebook, magari dopo che i dati sono stati puliti, per addestrare il modello dall'interno dell'app su un percorso chiamato `/train`. Quali sono i pro e i contro nel seguire questo metodo?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/18/)
+
+## Revisione e Auto Apprendimento
+
+Esistono molti modi per creare un'app web per utilizzare i modelli ML. Elencare i modi in cui si potrebbe utilizzare JavaScript o Python per creare un'app web che sfrutti machine learning. Considerare l'architettura: il modello dovrebbe rimanere nell'app o risiedere nel cloud? In quest'ultimo caso, come accedervi? Disegnare un modello architetturale per una soluzione web ML applicata.
+
+## Compito
+
+[Provare un modello diverso](assignment.it.md)
+
+
diff --git a/3-Web-App/1-Web-App/translations/README.ja.md b/3-Web-App/1-Web-App/translations/README.ja.md
new file mode 100644
index 0000000000..b23050dd62
--- /dev/null
+++ b/3-Web-App/1-Web-App/translations/README.ja.md
@@ -0,0 +1,345 @@
+# 機械学習モデルを使うためのWebアプリを構築する
+
+この講義では、この世界のものではないデータセットを使って機械学習モデルを学習させます。NUFORCのデータベースに登録されている「過去100年のUFO目撃情報」です。
+
+あなたが学ぶ内容は以下の通りです。
+
+- 学習したモデルを「塩漬け」にする方法
+- モデルをFlaskアプリで使う方法
+
+引き続きノートブックを使ってデータのクリーニングやモデルの学習を行いますが、さらに一歩進んでモデルを「野生で」、つまりWebアプリで使うのを検討することも可能です。
+
+そのためには、Flaskを使ってWebアプリを構築する必要があります。
+
+## [講義前の小テスト](https://white-water-09ec41f0f.azurestaticapps.net/quiz/17?loc=ja)
+
+## アプリの構築
+
+機械学習モデルを使うためのWebアプリを構築する方法はいくつかあります。Webアーキテクチャはモデルの学習方法に影響を与える可能性があります。データサイエンスグループが学習したモデルをアプリで使用する、という業務があなたに任されている状況をイメージしてください。
+
+### 検討事項
+
+あなたがすべき質問はたくさんあります。
+
+- **Webアプリですか?それともモバイルアプリですか?** モバイルアプリを構築している場合や、IoTの環境でモデルを使う必要がある場合は、[TensorFlow Lite](https://www.tensorflow.org/lite/) を使用して、AndroidまたはiOSアプリでモデルを使うことができます。
+- **モデルはどこに保存しますか?** クラウドでしょうか?それともローカルでしょうか?
+- **オフラインでのサポート。** アプリはオフラインで動作する必要がありますか?
+- **モデルの学習にはどのような技術が使われていますか?** 選択された技術は使用しなければいけないツールに影響を与える可能性があります。
+ - **Tensor flow を使っている。** 例えば TensorFlow を使ってモデルを学習している場合、 [TensorFlow.js](https://www.tensorflow.org/js/) を使って、Webアプリで使用できるように TensorFlow モデルを変換する機能をそのエコシステムは提供しています。
+ - **PyTorchを使っている。** [PyTorch](https://pytorch.org/) などのライブラリを使用してモデルを構築している場合、[ONNX](https://onnx.ai/) (Open Neural Network Exchange) 形式で出力して、JavaScript のWebアプリで [Onnx Runtime](https://www.onnxruntime.ai/) を使用するという選択肢があります。この選択肢は、Scikit-learn で学習したモデルを使う今後の講義で調べます。
+ - **Lobe.ai または Azure Custom Vision を使っている。** [Lobe.ai](https://lobe.ai/) や [Azure Custom Vision](https://azure.microsoft.com/services/cognitive-services/custom-vision-service/?WT.mc_id=academic-15963-cxa) のような機械学習SaaS (Software as a Service) システムを使用してモデルを学習している場合、この種のソフトウェアは多くのプラットフォーム向けにモデルを出力する方法を用意していて、これにはクラウド上のオンラインアプリケーションからリクエストされるような専用APIを構築することも含まれます。
+
+また、ウェブブラウザ上でモデルを学習することができるFlaskのWebアプリを構築することもできます。JavaScript の場合でも TensorFlow.js を使うことで実現できます。
+
+私たちの場合はPythonベースのノートブックを今まで使用してきたので、学習したモデルをそのようなノートブックからPythonで構築されたWebアプリで読める形式に出力するために必要な手順を探ってみましょう。
+
+## ツール
+
+ここでの作業には2つのツールが必要です。FlaskとPickleで、どちらもPython上で動作します。
+
+✅ [Flask](https://palletsprojects.com/p/flask/) とは?制作者によって「マイクロフレームワーク」と定義されているFlaskは、Pythonを使ったWebフレームワークの基本機能と、Webページを構築するためのテンプレートエンジンを提供しています。Flaskでの構築を練習するために [この学習モジュール](https://docs.microsoft.com/learn/modules/python-flask-build-ai-web-app?WT.mc_id=academic-15963-cxa) を見てみてください。
+
+✅ [Pickle](https://docs.python.org/3/library/pickle.html) とは?Pickle 🥒 は、Pythonのオブジェクト構造をシリアライズ・デシリアライズするPythonモジュールです。モデルを「塩漬け」にすると、Webで使用するためにその構造をシリアライズしたり平坦化したりします。pickleは本質的に安全ではないので、ファイルの 'un-pickle' を促された際は注意してください。塩漬けされたファイルの末尾は `.pkl` となります。
+
+## 演習 - データをクリーニングする
+
+この講義では、[NUFORC](https://nuforc.org) (The National UFO Reporting Center) が集めた8万件のUFO目撃情報のデータを使います。このデータには、UFOの目撃情報に関する興味深い記述があります。例えば以下のようなものです。
+
+- **長い記述の例。** 「夜の草原を照らす光線から男が現れ、Texas Instruments の駐車場に向かって走った」
+- **短い記述の例。** 「私たちを光が追いかけてきた」
+
+[ufos.csv](../data/ufos.csv) のスプレッドシートには、目撃された場所の都市 (`city`)、州 (`state`)、国 (`country`)、物体の形状 (`shape`)、緯度 (`latitude`)、経度 (`longitude`) などの列が含まれています。
+
+この講義に含んでいる空の [ノートブック](../notebook.ipynb) で、以下の手順に従ってください。
+
+1. 前回の講義で行ったように `pandas`、`matplotlib`、`numpy` をインポートし、UFOのスプレッドシートをインポートしてください。サンプルのデータセットを見ることができます。
+
+ ```python
+ import pandas as pd
+ import numpy as np
+
+ ufos = pd.read_csv('./data/ufos.csv')
+ ufos.head()
+ ```
+
+1. UFOのデータを新しいタイトルで小さいデータフレームに変換してください。また、`Country` 属性の一意な値を確認してください。
+
+ ```python
+ ufos = pd.DataFrame({'Seconds': ufos['duration (seconds)'], 'Country': ufos['country'],'Latitude': ufos['latitude'],'Longitude': ufos['longitude']})
+
+ ufos.Country.unique()
+ ```
+
+1. ここで、null値をすべて削除し、1~60秒の目撃情報のみを読み込むことで処理すべきデータ量を減らすことができます。
+
+ ```python
+ ufos.dropna(inplace=True)
+
+ ufos = ufos[(ufos['Seconds'] >= 1) & (ufos['Seconds'] <= 60)]
+
+ ufos.info()
+ ```
+
+1. Scikit-learn の `LabelEncoder` ライブラリをインポートして、国の文字列値を数値に変換してください。
+
+ ✅ LabelEncoder はデータをアルファベット順にエンコードします。
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+
+ ufos['Country'] = LabelEncoder().fit_transform(ufos['Country'])
+
+ ufos.head()
+ ```
+
+ データは以下のようになります。
+
+ ```output
+ Seconds Country Latitude Longitude
+ 2 20.0 3 53.200000 -2.916667
+ 3 20.0 4 28.978333 -96.645833
+ 14 30.0 4 35.823889 -80.253611
+ 23 60.0 4 45.582778 -122.352222
+ 24 3.0 3 51.783333 -0.783333
+ ```
+
+## 演習 - モデルを構築する
+
+これでデータを訓練グループとテストグループに分けてモデルを学習する準備ができました。
+
+1. Xベクトルとして学習したい3つの特徴を選択し、Yベクトルには `Country` を指定します。`Seconds`、`Latitude`、`Longitude` を入力して国のIDを取得することにします。
+
+ ```python
+ from sklearn.model_selection import train_test_split
+
+ Selected_features = ['Seconds','Latitude','Longitude']
+
+ X = ufos[Selected_features]
+ y = ufos['Country']
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
+ ```
+
+1. ロジスティック回帰を使ってモデルを学習してください。
+
+ ```python
+ from sklearn.metrics import accuracy_score, classification_report
+ from sklearn.linear_model import LogisticRegression
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+ predictions = model.predict(X_test)
+
+ print(classification_report(y_test, predictions))
+ print('Predicted labels: ', predictions)
+ print('Accuracy: ', accuracy_score(y_test, predictions))
+ ```
+
+国 (`Country`) と緯度・経度 (`Latitude/Longitude`) は相関しているので当然ですが、精度は悪くないです。**(約95%)**
+
+緯度 (`Latitude`) と経度 (`Longitude`) から国 (`Country`) を推測することができるので、作成したモデルは画期的なものではありませんが、クリーニングして出力した生のデータから学習を行い、このモデルをWebアプリで使用してみる良い練習にはなります。
+
+## 演習 - モデルを「塩漬け」にする
+
+さて、いよいよモデルを「塩漬け」にしてみましょう!これは数行のコードで実行できます。「塩漬け」にした後は、そのモデルを読み込んで、秒・緯度・経度を含むサンプルデータの配列でテストしてください。
+
+```python
+import pickle
+model_filename = 'ufo-model.pkl'
+pickle.dump(model, open(model_filename,'wb'))
+
+model = pickle.load(open('ufo-model.pkl','rb'))
+print(model.predict([[50,44,-12]]))
+```
+
+モデルはイギリスの国番号である **「3」** を返します。すばらしい!👽
+
+## 演習 - Flaskアプリを構築する
+
+これでFlaskアプリを構築してモデルを呼び出すことができるようになり、これは同じような結果を返しますが、視覚的によりわかりやすい方法です。
+
+1. まず、_ufo-model.pkl_ ファイルと _notebook.ipynb_ ファイルが存在する場所に **web-app** というフォルダを作成してください。
+
+1. そのフォルダの中に、さらに3つのフォルダを作成してください。**css** というフォルダを含む **static** と、**templates** です。以下のようなファイルとディレクトリになっているはずです。
+
+ ```output
+ web-app/
+ static/
+ css/
+ templates/
+ notebook.ipynb
+ ufo-model.pkl
+ ```
+
+ ✅ 完成したアプリを見るには、solution フォルダを参照してください。
+
+1. _web-app_ フォルダの中に作成する最初のファイルは **requirements.txt** です。JavaScript アプリにおける _package.json_ と同様に、このファイルはアプリに必要な依存関係をリストにしたものです。**requirements.txt** に以下の行を追加してください。
+
+ ```text
+ scikit-learn
+ pandas
+ numpy
+ flask
+ ```
+
+1. 次に、_web-app_ に移動して、このファイルを実行します。
+
+ ```bash
+ cd web-app
+ ```
+
+1. _requirements.txt_ に記載されているライブラリをインストールするために、ターミナルで `pip install` と入力してください。
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+1. アプリを完成させるために、さらに3つのファイルを作成する準備が整いました。
+
+ 1. ルートに **app.py** を作成してください。
+ 2. _templates_ ディレクトリに **index.html** を作成してください。
+ 3. _static/css_ ディレクトリに **styles.css** を作成してください。
+
+1. 以下のスタイルで _styles.css_ ファイルを構築してください。
+
+ ```css
+ body {
+ width: 100%;
+ height: 100%;
+ font-family: 'Helvetica';
+ background: black;
+ color: #fff;
+ text-align: center;
+ letter-spacing: 1.4px;
+ font-size: 30px;
+ }
+
+ input {
+ min-width: 150px;
+ }
+
+ .grid {
+ width: 300px;
+ border: 1px solid #2d2d2d;
+ display: grid;
+ justify-content: center;
+ margin: 20px auto;
+ }
+
+ .box {
+ color: #fff;
+ background: #2d2d2d;
+ padding: 12px;
+ display: inline-block;
+ }
+ ```
+
+1. 次に _index.html_ を構築してください。
+
+   ```html
+   <!-- index.html: タイトル「🛸 UFO Appearance Prediction! 👽」、質問文、/predict ルートへ送信するフォーム、{{ prediction_text }} プレースホルダーを含むページ -->
+ ```
+
+ 看看这个文件中的模板。请注意应用程序将提供的变量周围的“mustache”语法,例如预测文本:`{{}}`。还有一个表单可以将预测发布到`/predict`路由。
+
+ 最后,你已准备好构建使用模型和显示预测的python 文件:
+
+9. 在`app.py`中添加:
+
+ ```python
+ import numpy as np
+ from flask import Flask, request, render_template
+ import pickle
+
+ app = Flask(__name__)
+
+ model = pickle.load(open("../ufo-model.pkl", "rb"))
+
+
+ @app.route("/")
+ def home():
+ return render_template("index.html")
+
+
+ @app.route("/predict", methods=["POST"])
+ def predict():
+
+ int_features = [int(x) for x in request.form.values()]
+ final_features = [np.array(int_features)]
+ prediction = model.predict(final_features)
+
+ output = prediction[0]
+
+ countries = ["Australia", "Canada", "Germany", "UK", "US"]
+
+ return render_template(
+ "index.html", prediction_text="Likely country: {}".format(countries[output])
+ )
+
+
+ if __name__ == "__main__":
+ app.run(debug=True)
+ ```
+
+ > 💡 提示:当你在使用Flask运行Web应用程序时添加 [`debug=True`](https://www.askpython.com/python-modules/flask/flask-debug-mode)时你对应用程序所做的任何更改将立即反映,无需重新启动服务器。注意!不要在生产应用程序中启用此模式
+
+如果你运行`python app.py`或`python3 app.py` - 你的网络服务器在本地启动,你可以填写一个简短的表格来回答你关于在哪里看到UFO的问题!
+
+在此之前,先看一下`app.py`的实现:
+
+1. 首先,加载依赖项并启动应用程序。
+2. 然后,导入模型。
+3. 然后,在home路由上渲染index.html。
+
+在`/predict`路由上,当表单被发布时会发生几件事情:
+
+1. 收集表单变量并转换为numpy数组。然后将它们发送到模型并返回预测。
+2. 我们希望显示的国家/地区根据其预测的国家/地区代码重新呈现为可读文本,并将该值发送回index.html以在模板中呈现。
+
+以这种方式使用模型,包括Flask和pickled模型,是相对简单的。最困难的是要理解数据是什么形状的,这些数据必须发送到模型中才能得到预测。这完全取决于模型是如何训练的。有三个数据要输入,以便得到一个预测。
+
+在一个专业的环境中,你可以看到训练模型的人和在Web或移动应用程序中使用模型的人之间的良好沟通是多么的必要。在我们的情况下,只有一个人,你!
+
+---
+
+## 🚀 挑战:
+
+你可以在Flask应用程序中训练模型,而不是在notebook上工作并将模型导入Flask应用程序!尝试在notebook中转换Python代码,可能是在清除数据之后,从应用程序中的一个名为`train`的路径训练模型。采用这种方法的利弊是什么?
+
+## [课后测](https://white-water-09ec41f0f.azurestaticapps.net/quiz/18/)
+
+## 复习与自学
+
+有很多方法可以构建一个Web应用程序来使用ML模型。列出可以使用JavaScript或Python构建Web应用程序以利用机器学习的方法。考虑架构:模型应该留在应用程序中还是存在于云中?如果是后者,你将如何访问它?为应用的ML Web解决方案绘制架构模型。
+
+## 任务
+
+[尝试不同的模型](../assignment.md)
+
+
diff --git a/3-Web-App/1-Web-App/translations/assignment.it.md b/3-Web-App/1-Web-App/translations/assignment.it.md
new file mode 100644
index 0000000000..7bc7ffd948
--- /dev/null
+++ b/3-Web-App/1-Web-App/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Provare un modello diverso
+
+## Istruzioni
+
+Ora che si è creata un'app web utilizzando un modello di Regressione addestrato, usare uno dei modelli da una lezione precedente sulla Regressione per rifare questa app web. Si può mantenere lo stile o progettarla in modo diverso per riflettere i dati della zucca. Fare attenzione a modificare gli input in modo che riflettano il metodo di addestramento del proprio modello.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------------------------- | --------------------------------------------------------- | --------------------------------------------------------- | -------------------------------------- |
+| | L'app web funziona come previsto e viene distribuita nel cloud | L'app web contiene difetti o mostra risultati imprevisti | L'app web non funziona correttamente |
diff --git a/3-Web-App/1-Web-App/translations/assignment.ja.md b/3-Web-App/1-Web-App/translations/assignment.ja.md
new file mode 100644
index 0000000000..2151a7c243
--- /dev/null
+++ b/3-Web-App/1-Web-App/translations/assignment.ja.md
@@ -0,0 +1,11 @@
+# 違うモデルを試す
+
+## 指示
+
+訓練された回帰モデルを使用して1つのWebアプリを構築したので、前回の回帰の講義で使用したモデルの1つを使用して、このWebアプリを再実行してください。スタイルをそのままにしても、かぼちゃのデータを反映するために別のデザインにしても構いません。モデルの学習方法に合わせて入力を変更するように注意してください。
+
+## 評価基準
+
+| 指標 | 模範的 | 適切 | 要改善 |
+| ---- | ----------------------------------------------------------- | ----------------------------------------------------------------- | ------------------------------- |
+| | Webアプリが期待通りに動作し、クラウド上にデプロイされている | Webアプリに欠陥が含まれているか、期待していない結果を表示している | Webアプリが正しく機能していない |
diff --git a/3-Web-App/1-Web-App/translations/assignment.zh-cn.md b/3-Web-App/1-Web-App/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..016dfa5228
--- /dev/null
+++ b/3-Web-App/1-Web-App/translations/assignment.zh-cn.md
@@ -0,0 +1,12 @@
+# 尝试不同的模型
+
+## 说明
+
+现在,你已经使用一个训练好的回归模型构建了一个 web 应用程序。请从之前的回归课程中选用一个模型,重新构建这个 web 应用程序。你可以保持原有的风格,也可以采用不同的设计来展示 pumpkin 数据。注意修改输入,以反映你的模型的训练方式。
+
+
+## 评判标准
+
+| 标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------------------------- | --------------------------------------------------------- | --------------------------------------------------------- | -------------------------------------- |
+| | web 应用程序按预期运行,并部署到云端 | web 应用程序存在缺陷或展示意外的结果 | web 应用程序无法正常运行 |
diff --git a/3-Web-App/translations/README.it.md b/3-Web-App/translations/README.it.md
new file mode 100644
index 0000000000..d376b8ec7c
--- /dev/null
+++ b/3-Web-App/translations/README.it.md
@@ -0,0 +1,22 @@
+# Creare un'app web per utilizzare il modello ML
+
+In questa sezione del programma di studi, verrà presentato un argomento ML applicato: come salvare il modello di Scikit-learn come file che può essere utilizzato per fare previsioni all'interno di un'applicazione web. Una volta salvato il modello, si imparerà come utilizzarlo in un'app web sviluppata con Flask. Per prima cosa si creerà un modello utilizzando alcuni dati che riguardano gli avvistamenti di UFO! Quindi, si creerà un'app web che consentirà di inserire un numero di secondi con un valore di latitudine e longitudine per prevedere quale paese ha riferito di aver visto un UFO.
+
+![Parcheggio UFO](../images/ufo.jpg)
+
+Foto di Michael Herren su Unsplash
+
+
+## Lezioni
+
+1. [Costruire un'app web](../1-Web-App/translations/README.it.md)
+
+## Crediti
+
+"Costruire un'app web" è stato scritto con ♥️ da [Jen Looper](https://twitter.com/jenlooper).
+
+♥️ I quiz sono stati scritti da Rohan Raj.
+
+L'insieme di dati proviene da [Kaggle](https://www.kaggle.com/NUFORC/ufo-sightings).
+
+L'architettura dell'app web è stata suggerita in parte da [questo articolo](https://towardsdatascience.com/how-to-easily-deploy-machine-learning-models-using-flask-b95af8fe34d4) e da [questo](https://github.com/abhinavsagar/machine-learning-deployment) repository di Abhinav Sagar.
\ No newline at end of file
diff --git a/3-Web-App/translations/README.ru.md b/3-Web-App/translations/README.ru.md
new file mode 100644
index 0000000000..c252074677
--- /dev/null
+++ b/3-Web-App/translations/README.ru.md
@@ -0,0 +1,22 @@
+# Создайте веб-приложение для использования вашей модели машинного обучения
+
+В этом разделе учебной программы вы познакомитесь с прикладной темой машинного обучения: как сохранить модель Scikit-learn в виде файла, который можно использовать для прогнозирования в веб-приложении. После сохранения модели вы узнаете, как использовать ее в веб-приложении, созданном во Flask. Сначала вы создадите модель, используя некоторые данные о наблюдениях НЛО! Затем вы создадите веб-приложение, которое позволит вам ввести количество секунд с широтой и долготой, чтобы предсказать, какая страна сообщила о видении НЛО.
+
+![Парковка НЛО](../images/ufo.jpg)
+
+Фото Майкла Херрена на Unsplash
+
+
+## Уроки
+
+1. [Создайте веб-приложение](../1-Web-App/README.md)
+
+## Благодарности
+
+«Создайте веб-приложение» написано с ♥️ [Джен Лупер](https://twitter.com/jenlooper).
+
+♥️ Тесты были написаны Роханом Раджем.
+
+Набор данных взят из [Kaggle](https://www.kaggle.com/NUFORC/ufo-sightings).
+
+Архитектура веб-приложения была частично предложена в [этой статье](https://towardsdatascience.com/how-to-easily-deploy-machine-learning-models-using-flask-b95af8fe34d4) и [этом репозитории](https://github.com/abhinavsagar/machine-learning-deployment) Абхинава Сагара.
\ No newline at end of file
diff --git a/4-Classification/1-Introduction/README.md b/4-Classification/1-Introduction/README.md
index 38516901af..b6dde02f7a 100644
--- a/4-Classification/1-Introduction/README.md
+++ b/4-Classification/1-Introduction/README.md
@@ -19,7 +19,7 @@ Remember:
Classification uses various algorithms to determine other ways of determining a data point's label or class. Let's work with this cuisine data to see whether, by observing a group of ingredients, we can determine its cuisine of origin.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/19/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/19/)
### Introduction
@@ -163,7 +163,7 @@ Now you can dig deeper into the data and learn what are the typical ingredients
def create_ingredient_df(df):
ingredient_df = df.T.drop(['cuisine','Unnamed: 0']).sum(axis=1).to_frame('value')
ingredient_df = ingredient_df[(ingredient_df.T != 0).any()]
- ingredient_df = ingredient_df.sort_values(by='value', ascending=False
+ ingredient_df = ingredient_df.sort_values(by='value', ascending=False,
inplace=False)
return ingredient_df
```
@@ -264,12 +264,18 @@ Now that you have cleaned the data, use [SMOTE](https://imbalanced-learn.org/dev
The data is nice and clean, balanced, and very delicious!
+1. The last step is to save your balanced data, including labels and features, into a new dataframe that can be exported into a file:
+
+ ```python
+ transformed_df = pd.concat([transformed_label_df,transformed_feature_df],axis=1, join='outer')
+ ```
+
1. You can take one more look at the data using `transformed_df.head()` and `transformed_df.info()`. Save a copy of this data for use in future lessons:
```python
transformed_df.head()
transformed_df.info()
- transformed_df.to_csv("../data/cleaned_cuisine.csv")
+ transformed_df.to_csv("../data/cleaned_cuisines.csv")
```
This fresh CSV can now be found in the root data folder.
@@ -280,7 +286,7 @@ Now that you have cleaned the data, use [SMOTE](https://imbalanced-learn.org/dev
This curriculum contains several interesting datasets. Dig through the `data` folders and see if any contain datasets that would be appropriate for binary or multi-class classification? What questions would you ask of this dataset?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/20/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)
## Review & Self Study
diff --git a/4-Classification/1-Introduction/solution/notebook.ipynb b/4-Classification/1-Introduction/solution/notebook.ipynb
index c5b8c6299c..5abb9693de 100644
--- a/4-Classification/1-Introduction/solution/notebook.ipynb
+++ b/4-Classification/1-Introduction/solution/notebook.ipynb
@@ -622,7 +622,7 @@
"metadata": {},
"outputs": [],
"source": [
- "transformed_df.to_csv(\"../../data/cleaned_cuisine.csv\")"
+ "transformed_df.to_csv(\"../../data/cleaned_cuisines.csv\")"
]
},
{
diff --git a/4-Classification/1-Introduction/translations/README.it.md b/4-Classification/1-Introduction/translations/README.it.md
new file mode 100644
index 0000000000..76c8b0a279
--- /dev/null
+++ b/4-Classification/1-Introduction/translations/README.it.md
@@ -0,0 +1,297 @@
+# Introduzione alla classificazione
+
+In queste quattro lezioni si esplorerà un focus fondamentale del machine learning classico: _la classificazione_. Verrà analizzato l'utilizzo di vari algoritmi di classificazione con un insieme di dati su tutte le brillanti cucine dell'Asia e dell'India. Si spera siate affamati!
+
+![solo un pizzico!](../images/pinch.png)
+
+> In queste lezioni si celebrano le cucine panasiatiche! Immagine di [Jen Looper](https://twitter.com/jenlooper)
+
+La classificazione è una forma di [apprendimento supervisionato](https://it.wikipedia.org/wiki/Apprendimento_supervisionato) che ha molto in comune con le tecniche di regressione. Se machine learning riguarda la previsione di valori o nomi di cose utilizzando insiemi di dati, la classificazione generalmente rientra in due gruppi: _classificazione binaria_ e _classificazione multiclasse_.
+
+[![Introduzione alla classificazione](https://img.youtube.com/vi/eg8DJYwdMyg/0.jpg)](https://youtu.be/eg8DJYwdMyg "Introduzione alla classificazione")
+
+> 🎥 Fare clic sull'immagine sopra per un video: John Guttag del MIT introduce la classificazione
+
+Ricordare:
+
+- La **regressione lineare** ha aiutato a prevedere le relazioni tra le variabili e a fare previsioni accurate su dove un nuovo punto dati si sarebbe posizionato in relazione a quella linea. Quindi, si potrebbe prevedere _quale prezzo avrebbe una zucca a settembre rispetto a dicembre_, ad esempio.
+- La **regressione logistica** ha aiutato a scoprire le "categorie binarie": a questo prezzo, _questa zucca è arancione o non arancione_?
+
+La classificazione utilizza vari algoritmi per determinare altri modi per definire l'etichetta o la classe di un punto dati. Si lavorerà con questi dati di cucina per vedere se, osservando un gruppo di ingredienti, è possibile determinarne la cucina di origine.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/19/)
+
+### Introduzione
+
+La classificazione è una delle attività fondamentali del ricercatore di machine learning e data scientist. Dalla classificazione basica di un valore binario ("questa email è spam o no?"), alla complessa classificazione e segmentazione di immagini utilizzando la visione artificiale, è sempre utile essere in grado di ordinare i dati in classi e porre domande su di essi.
+
+Per definire il processo in modo più scientifico, il metodo di classificazione crea un modello predittivo che consente di mappare la relazione tra le variabili di input e le variabili di output.
+
+![classificazione binaria vs. multiclasse](../images/binary-multiclass.png)
+
+> Problemi binari e multiclasse per la gestione di algoritmi di classificazione. Infografica di [Jen Looper](https://twitter.com/jenlooper)
+
+Prima di iniziare il processo di pulizia dei dati, visualizzazione e preparazione per le attività di machine learning, si apprenderà qualcosa circa i vari modi in cui machine learning può essere sfruttato per classificare i dati.
+
+Derivata dalla [statistica](https://it.wikipedia.org/wiki/Classificazione_statistica), la classificazione che utilizza machine learning classico utilizza caratteristiche come l'`essere fumatore`, il `peso` e l'`età` per determinare _la probabilità di sviluppare la malattia X._ Essendo una tecnica di apprendimento supervisionata simile agli esercizi di regressione eseguiti in precedenza, i dati vengono etichettati e gli algoritmi ML utilizzano tali etichette per classificare e prevedere le classi (o "caratteristiche") di un insieme di dati e assegnarle a un gruppo o risultato.
+
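+Per fissare l'idea, un piccolo esempio illustrativo (i dati giocattolo qui sotto sono inventati e non fanno parte della lezione):
+
+```python
+from sklearn.tree import DecisionTreeClassifier
+
+# Dati giocattolo: [fumatore (0/1), peso in kg, età]; etichetta 1 = sviluppa la malattia X
+X = [[1, 90, 55], [0, 70, 30], [1, 85, 60], [0, 60, 25]]
+y = [1, 0, 1, 0]
+
+# Un classificatore supervisionato impara dalle etichette e prevede la classe di un nuovo esempio
+clf = DecisionTreeClassifier().fit(X, y)
+print(clf.predict([[1, 80, 50]]))
+```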
+✅ Si prenda un momento per immaginare un insieme di dati sulle cucine. A cosa potrebbe rispondere un modello multiclasse? A cosa potrebbe rispondere un modello binario? E se si volesse determinare se una determinata cucina è propensa a utilizzare il fieno greco? E se si volesse vedere se, regalando una busta della spesa piena di anice stellato, carciofi, cavolfiori e rafano, si possa creare un piatto tipico indiano?
+
+[![Cesti misteriosi pazzeschi](https://img.youtube.com/vi/GuTeDbaNoEU/0.jpg)](https://youtu.be/GuTeDbaNoEU " Cestini misteriosi pazzeschi")
+
+> 🎥 Fare clic sull'immagine sopra per un video. L'intera premessa dello spettacolo 'Chopped' è il 'cesto misterioso' dove gli chef devono preparare un piatto con una scelta casuale di ingredienti. Sicuramente un modello ML avrebbe aiutato!
+
+## Ciao 'classificatore'
+
+La domanda che si vuole porre a questo insieme di dati sulla cucina è in realtà una **domanda multiclasse**, poiché ci sono diverse potenziali cucine nazionali con cui lavorare. Dato un lotto di ingredienti, in quale di queste molte classi si identificheranno i dati?
+
+Scikit-learn offre diversi algoritmi da utilizzare per classificare i dati, a seconda del tipo di problema che si desidera risolvere. Nelle prossime due lezioni si impareranno a conoscere molti di questi algoritmi.
+
+## Esercizio: pulire e bilanciare i dati
+
+Il primo compito, prima di iniziare questo progetto, sarà pulire e **bilanciare** i dati per ottenere risultati migliori. Si inizia con il file vuoto _notebook.ipynb_ nella radice di questa cartella.
+
+La prima cosa da installare è [imblearn](https://imbalanced-learn.org/stable/). Si tratta di un pacchetto Scikit-learn che consentirà di bilanciare meglio i dati (si imparerà di più su questa attività tra un minuto).
+
+1. Per installare `imblearn`, eseguire `pip install`, in questo modo:
+
+ ```python
+ pip install imblearn
+ ```
+
+1. Importare i pacchetti necessari per caricare i dati e visualizzarli, importare anche `SMOTE` da `imblearn`.
+
+ ```python
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import numpy as np
+ from imblearn.over_sampling import SMOTE
+ ```
+
+ Ora si è pronti per la successiva importazione dei dati.
+
+1. Il prossimo compito sarà quello di importare i dati:
+
+ ```python
+ df = pd.read_csv('../data/cuisines.csv')
+ ```
+
+   Usando `read_csv()` si leggerà il contenuto del file csv _cuisines.csv_ e lo posizionerà nella variabile `df`.
+
+1. Controllare la forma dei dati:
+
+ ```python
+ df.head()
+ ```
+
+ Le prime cinque righe hanno questo aspetto:
+
+ ```output
+ | | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+ | --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+ | 0 | 65 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 66 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 67 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 68 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 69 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+ ```
+
+1. Si possono ottenere informazioni su questi dati chiamando `info()`:
+
+ ```python
+ df.info()
+ ```
+
+ Il risultato assomiglia a:
+
+ ```output
+    <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 2448 entries, 0 to 2447
+ Columns: 385 entries, Unnamed: 0 to zucchini
+ dtypes: int64(384), object(1)
+ memory usage: 7.2+ MB
+ ```
+
+## Esercizio - conoscere le cucine
+
+Ora il lavoro inizia a diventare più interessante. Si scoprirà la distribuzione dei dati per cucina.
+
+1. Tracciare i dati come barre chiamando `barh()`:
+
+ ```python
+ df.cuisine.value_counts().plot.barh()
+ ```
+
+ ![distribuzione dati cuisine](../images/cuisine-dist.png)
+
+ Esiste un numero finito di cucine, ma la distribuzione dei dati non è uniforme. Si può sistemare! Prima di farlo, occorre esplorare un po' di più.
+
+1. Si deve scoprire quanti dati sono disponibili per cucina e stamparli:
+
+ ```python
+ thai_df = df[(df.cuisine == "thai")]
+ japanese_df = df[(df.cuisine == "japanese")]
+ chinese_df = df[(df.cuisine == "chinese")]
+ indian_df = df[(df.cuisine == "indian")]
+ korean_df = df[(df.cuisine == "korean")]
+
+ print(f'thai df: {thai_df.shape}')
+ print(f'japanese df: {japanese_df.shape}')
+ print(f'chinese df: {chinese_df.shape}')
+ print(f'indian df: {indian_df.shape}')
+ print(f'korean df: {korean_df.shape}')
+ ```
+
+ il risultato si presenta così:
+
+ ```output
+ thai df: (289, 385)
+ japanese df: (320, 385)
+ chinese df: (442, 385)
+ indian df: (598, 385)
+ korean df: (799, 385)
+ ```
+
+## Alla scoperta degli ingredienti
+
+Ora si possono approfondire i dati e scoprire quali sono gli ingredienti tipici per cucina. Si dovrebbero ripulire i dati ricorrenti che creano confusione tra le cucine, quindi si affronterà questo problema.
+
+1. Creare una funzione `create_ingredient_df()` in Python per creare un dataframe di ingredienti. Questa funzione inizierà eliminando una colonna non utile e ordinando gli ingredienti in base al loro conteggio:
+
+ ```python
+ def create_ingredient_df(df):
+ ingredient_df = df.T.drop(['cuisine','Unnamed: 0']).sum(axis=1).to_frame('value')
+ ingredient_df = ingredient_df[(ingredient_df.T != 0).any()]
+        ingredient_df = ingredient_df.sort_values(by='value', ascending=False,
+ inplace=False)
+ return ingredient_df
+ ```
+
+ Ora si può usare questa funzione per farsi un'idea dei primi dieci ingredienti più popolari per cucina.
+
+1. Chiamare `create_ingredient_df()` e tracciare il grafico chiamando `barh()`:
+
+ ```python
+ thai_ingredient_df = create_ingredient_df(thai_df)
+ thai_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![thai](../images/thai.png)
+
+1. Fare lo stesso per i dati giapponesi:
+
+ ```python
+ japanese_ingredient_df = create_ingredient_df(japanese_df)
+ japanese_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Giapponese](../images/japanese.png)
+
+1. Ora per gli ingredienti cinesi:
+
+ ```python
+ chinese_ingredient_df = create_ingredient_df(chinese_df)
+ chinese_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![cinese](../images/chinese.png)
+
+1. Tracciare gli ingredienti indiani:
+
+ ```python
+ indian_ingredient_df = create_ingredient_df(indian_df)
+ indian_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![indiano](../images/indian.png)
+
+1. Infine, tracciare gli ingredienti coreani:
+
+ ```python
+ korean_ingredient_df = create_ingredient_df(korean_df)
+ korean_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Coreano](../images/korean.png)
+
+1. Ora, eliminare gli ingredienti più comuni che creano confusione tra le diverse cucine, chiamando `drop()`:
+
+ Tutti amano il riso, l'aglio e lo zenzero!
+
+ ```python
+ feature_df= df.drop(['cuisine','Unnamed: 0','rice','garlic','ginger'], axis=1)
+ labels_df = df.cuisine #.unique()
+ feature_df.head()
+ ```
+
+## Bilanciare l'insieme di dati
+
+Ora che i dati sono puliti, si usa [SMOTE](https://imbalanced-learn.org/dev/references/generated/imblearn.over_sampling.SMOTE.html) - "Tecnica di sovracampionamento della minoranza sintetica" - per bilanciarli.
+
+1. Chiamare `fit_resample()`, questa strategia genera nuovi campioni per interpolazione.
+
+ ```python
+ oversample = SMOTE()
+ transformed_feature_df, transformed_label_df = oversample.fit_resample(feature_df, labels_df)
+ ```
+
+ Bilanciando i dati, si otterranno risultati migliori quando si classificano. Si pensi a una classificazione binaria. Se la maggior parte dei dati è una classe, un modello ML prevederà quella classe più frequentemente, solo perché ci sono più dati per essa. Il bilanciamento dei dati prende tutti i dati distorti e aiuta a rimuovere questo squilibrio.
+
+1. Ora si può controllare il numero di etichette per ingrediente:
+
+ ```python
+ print(f'new label count: {transformed_label_df.value_counts()}')
+ print(f'old label count: {df.cuisine.value_counts()}')
+ ```
+
+ il risultato si presenta così:
+
+ ```output
+ new label count: korean 799
+ chinese 799
+ indian 799
+ japanese 799
+ thai 799
+ Name: cuisine, dtype: int64
+ old label count: korean 799
+ indian 598
+ chinese 442
+ japanese 320
+ thai 289
+ Name: cuisine, dtype: int64
+ ```
+
+ I dati sono belli e puliti, equilibrati e molto deliziosi!
+
+1. L'ultimo passaggio consiste nel salvare i dati bilanciati, incluse etichette e caratteristiche, in un nuovo dataframe che può essere esportato in un file:
+
+ ```python
+ transformed_df = pd.concat([transformed_label_df,transformed_feature_df],axis=1, join='outer')
+ ```
+
+1. Si può dare un'altra occhiata ai dati usando `transformed_df.head()` e `transformed_df.info()`. Salvare una copia di questi dati per utilizzarli nelle lezioni future:
+
+ ```python
+ transformed_df.head()
+ transformed_df.info()
+    transformed_df.to_csv("../data/cleaned_cuisines.csv")
+ ```
+
+ Questo nuovo CSV può ora essere trovato nella cartella data in radice.
+
+---
+
+## 🚀 Sfida
+
+Questo programma di studi contiene diversi insiemi di dati interessanti. Esaminare le cartelle `data` e vedere se contiene insiemi di dati che sarebbero appropriati per la classificazione binaria o multiclasse. Quali domande si farebbero a questo insieme di dati?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)
+
+## Revisione e Auto Apprendimento
+
+Esplorare l'API di SMOTE. Per quali casi d'uso è meglio usarla? Quali problemi risolve?
+
+## Compito
+
+[Esplorare i metodi di classificazione](assignment.it.md)
diff --git a/4-Classification/1-Introduction/translations/README.tr.md b/4-Classification/1-Introduction/translations/README.tr.md
new file mode 100644
index 0000000000..e1b32ec90b
--- /dev/null
+++ b/4-Classification/1-Introduction/translations/README.tr.md
@@ -0,0 +1,298 @@
+# Sınıflandırmaya giriş
+
+Bu dört derste klasik makine öğreniminin temel bir odağı olan _sınıflandırma_ konusunu keşfedeceksiniz. Asya ve Hindistan'ın nefis mutfağının tamamı üzerine hazırlanmış bir veri setiyle çeşitli sınıflandırma algoritmalarını kullanmanın üzerinden geçeceğiz. Umarız açsınızdır!
+
+![sadece bir tutam!](../images/pinch.png)
+
+> Bu derslerde Pan-Asya mutfağını kutlayın! Fotoğraf [Jen Looper](https://twitter.com/jenlooper) tarafından çekilmiştir.
+
+Sınıflandırma, regresyon yöntemleriyle birçok ortak özelliği olan bir [gözetimli öğrenme](https://wikipedia.org/wiki/Supervised_learning) biçimidir. Eğer makine öğrenimi tamamen veri setleri kullanarak değerleri veya nesnelere verilecek isimleri öngörmekse, sınıflandırma genellikle iki gruba ayrılır: _ikili sınıflandırma_ ve _çok sınıflı sınıflandırma_.
+
+[![Sınıflandırmaya giriş](https://img.youtube.com/vi/eg8DJYwdMyg/0.jpg)](https://youtu.be/eg8DJYwdMyg "Introduction to classification")
+
+> :movie_camera: Video için yukarıdaki fotoğrafa tıklayın: MIT's John Guttag introduces classification (MIT'den John Guttag sınıflandırmayı tanıtıyor)
+
+Hatırlayın:
+
+- **Doğrusal regresyon** değişkenler arasındaki ilişkileri öngörmenize ve o doğruya ilişkili olarak yeni bir veri noktasının nereye düşeceğine dair doğru öngörülerde bulunmanıza yardımcı oluyordu. Yani, _bir balkabağının fiyatının aralık ayına göre eylül ayında ne kadar olabileceğini_ öngörebilirsiniz örneğin.
+- **Lojistik regresyon** "ikili kategoriler"i keşfetmenizi sağlamıştı: bu fiyat noktasında, _bu balkabağı turuncu mudur, turuncu-değil midir?_
+
+Sınıflandırma, bir veri noktasının etiketini veya sınıfını belirlemek için farklı yollar belirlemek üzere çeşitli algoritmalar kullanır. Bir grup malzemeyi gözlemleyerek kökeninin hangi mutfak olduğunu belirleyip belirleyemeyeceğimizi görmek için bu mutfak verisiyle çalışalım.
+
+## [Ders öncesi kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/19/?loc=tr)
+
+### Giriş
+
+Sınıflandırma, makine öğrenimi araştırmacısının ve veri bilimcisinin temel işlerinden biridir. İkili bir değerin temel sınıflandırmasından ("Bu e-posta gereksiz (spam) midir yoksa değil midir?") bilgisayarla görüden yararlanarak karmaşık görüntü sınıflandırma ve bölütlemeye kadar, veriyi sınıf sınıf sıralayabilmek ve soru sorabilmek daima faydalıdır.
+
+Süreci daha bilimsel bir yolla ifade etmek gerekirse, sınıflandırma yönteminiz, girdi bilinmeyenlerinin arasındaki ilişkiyi çıktı bilinmeyenlerine eşlemenizi sağlayan öngörücü bir model oluşturur.
+
+![ikili ve çok sınıflı sınıflandırma karşılaştırması](../images/binary-multiclass.png)
+
+> Sınıflandırma algoritmalarının başa çıkması gereken ikili ve çok sınıflı problemler. Bilgilendirme grafiği [Jen Looper](https://twitter.com/jenlooper) tarafından hazırlanmıştır.
+
+Verimizi temizleme, görselleştirme ve makine öğrenimi görevleri için hazırlama süreçlerine başlamadan önce, veriyi sınıflandırmak için makine öğreniminin leveraj edilebileceği çeşitli yolları biraz öğrenelim.
+
+[İstatistikten](https://wikipedia.org/wiki/Statistical_classification) türetilmiş olarak, klasik makine öğrenimi kullanarak sınıflandırma, _X hastalığının gelişmesi ihtimalini_ belirlemek için `smoker`, `weight`, ve `age` gibi öznitelikler kullanır. Daha önce yaptığınız regresyon alıştırmalarına benzeyen bir gözetimli öğrenme yöntemi olarak, veriniz etiketlenir ve makine öğrenimi algoritmaları o etiketleri, sınıflandırmak ve veri setinin sınıflarını (veya 'özniteliklerini') öngörmek ve onları bir gruba veya bir sonuca atamak için kullanır.
+
+:white_check_mark: Mutfaklarla ilgili bir veri setini biraz düşünün. Çok sınıflı bir model neyi cevaplayabilir? İkili bir model neyi cevaplayabilir? Farz edelim ki verilen bir mutfağın çemen kullanmasının muhtemel olup olmadığını belirlemek istiyorsunuz. Farzedelim ki yıldız anason, enginar, karnabahar ve bayır turpu ile dolu bir alışveriş poşetinden tipik bir Hint yemeği yapıp yapamayacağınızı görmek istiyorsunuz.
+
+[![Çılgın gizem sepetleri](https://img.youtube.com/vi/GuTeDbaNoEU/0.jpg)](https://youtu.be/GuTeDbaNoEU "Crazy mystery baskets")
+
+> :movie_camera: Video için yukarıdaki fotoğrafa tıklayın. Aşçıların rastgele malzeme seçeneklerinden yemek yaptığı 'Chopped' programının tüm olayı 'gizem sepetleri'dir. Kuşkusuz, bir makine öğrenimi modeli onlara yardımcı olurdu!
+
+## Merhaba 'sınıflandırıcı'
+
+Bu mutfak veri setiyle ilgili sormak istediğimiz soru aslında bir **çok sınıflı soru**dur çünkü elimizde farklı potansiyel ulusal mutfaklar var. Verilen bir grup malzeme için, veri bu sınıflardan hangisine uyacak?
+
+Scikit-learn, veriyi sınıflandırmak için kullanmak üzere, çözmek istediğiniz problem çeşidine bağlı olarak, çeşitli farklı algoritmalar sunar. Önümüzdeki iki derste, bu algoritmalardan birkaçını öğreneceksiniz.
+
+## Alıştırma - verinizi temizleyip dengeleyin
+
+Bu projeye başlamadan önce elinizdeki ilk görev, daha iyi sonuçlar almak için, verinizi temizlemek ve **dengelemek**. Üst klasördeki boş _notebook.ipynb_ dosyasıyla başlayın.
+
+Kurmanız gereken ilk şey [imblearn](https://imbalanced-learn.org/stable/). Bu, veriyi daha iyi dengelemenizi sağlayacak bir Scikit-learn paketidir. (Bu görev hakkında birazdan daha fazla bilgi göreceksiniz.)
+
+1. `imblearn` kurun, `pip install` çalıştırın, şu şekilde:
+
+ ```python
+ pip install imblearn
+ ```
+
+1. Verinizi almak ve görselleştirmek için ihtiyaç duyacağınız paketleri alın (import edin), ayrıca `imblearn` paketinden `SMOTE` alın.
+
+ ```python
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import numpy as np
+ from imblearn.over_sampling import SMOTE
+ ```
+
+ Şimdi okumak için hazırsınız, sonra veriyi alın.
+
+1. Sonraki görev veriyi almak olacak:
+
+ ```python
+ df = pd.read_csv('../data/cuisines.csv')
+ ```
+
+    `read_csv()` kullanmak _cuisines.csv_ csv dosyasının içeriğini okuyacak ve `df` değişkenine yerleştirecek.
+
+1. Verinin şeklini kontrol edin:
+
+ ```python
+ df.head()
+ ```
+
+ İlk beş satır şöyle görünüyor:
+
+ ```output
+ | | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+ | --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+ | 0 | 65 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 66 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 67 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 68 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 69 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+ ```
+
+1. `info()` fonksiyonunu çağırarak bu veri hakkında bilgi edinin:
+
+ ```python
+ df.info()
+ ```
+
+ Çıktınız şuna benzer:
+
+ ```output
+    <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 2448 entries, 0 to 2447
+ Columns: 385 entries, Unnamed: 0 to zucchini
+ dtypes: int64(384), object(1)
+ memory usage: 7.2+ MB
+ ```
+
+## Alıştırma - mutfaklar hakkında bilgi edinmek
+
+Şimdi, işimiz daha da ilginçleşmeye başlıyor. Mutfak mutfak verinin dağılımını keşfedelim
+
+1. `barh()` fonksiyonunu çağırarak veriyi sütunlarla çizdirin:
+
+ ```python
+ df.cuisine.value_counts().plot.barh()
+ ```
+
+ ![mutfak veri dağılımı](../images/cuisine-dist.png)
+
+ Sonlu sayıda mutfak var, ancak verinin dağılımı düzensiz. Bunu düzeltebilirsiniz! Bunu yapmadan önce, biraz daha keşfedelim.
+
+1. Her mutfak için ne kadar verinin mevcut olduğunu bulun ve yazdırın:
+
+ ```python
+ thai_df = df[(df.cuisine == "thai")]
+ japanese_df = df[(df.cuisine == "japanese")]
+ chinese_df = df[(df.cuisine == "chinese")]
+ indian_df = df[(df.cuisine == "indian")]
+ korean_df = df[(df.cuisine == "korean")]
+
+ print(f'thai df: {thai_df.shape}')
+ print(f'japanese df: {japanese_df.shape}')
+ print(f'chinese df: {chinese_df.shape}')
+ print(f'indian df: {indian_df.shape}')
+ print(f'korean df: {korean_df.shape}')
+ ```
+
+ çıktı şöyle görünür:
+
+ ```output
+ thai df: (289, 385)
+ japanese df: (320, 385)
+ chinese df: (442, 385)
+ indian df: (598, 385)
+ korean df: (799, 385)
+ ```
+
+## Malzemeleri keşfetme
+
+Şimdi veriyi daha derinlemesine inceleyebilirsiniz ve her mutfak için tipik malzemelerin neler olduğunu öğrenebilirsiniz. Mutfaklar arasında karışıklık yaratan tekrar eden veriyi temizlemelisiniz, dolayısıyla şimdi bu problemle ilgili bilgi edinelim.
+
+1. Python'da, malzeme veri iskeleti yaratmak için `create_ingredient_df()` diye bir fonksiyon oluşturun. Bu fonksiyon, yardımcı olmayan bir sütunu temizleyerek ve sayılarına göre malzemeleri sıralayarak başlar:
+
+ ```python
+ def create_ingredient_df(df):
+ ingredient_df = df.T.drop(['cuisine','Unnamed: 0']).sum(axis=1).to_frame('value')
+ ingredient_df = ingredient_df[(ingredient_df.T != 0).any()]
+        ingredient_df = ingredient_df.sort_values(by='value', ascending=False,
+ inplace=False)
+ return ingredient_df
+ ```
+
+    Şimdi bu fonksiyonu, her mutfağın en yaygın ilk on malzemesi hakkında fikir edinmek için kullanabilirsiniz.
+
+1. `create_ingredient_df()` fonksiyonunu çağırın ve `barh()` fonksiyonunu çağırarak çizdirin:
+
+ ```python
+ thai_ingredient_df = create_ingredient_df(thai_df)
+ thai_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Tayland](../images/thai.png)
+
+1. Japon verisi için de aynısını yapın:
+
+ ```python
+ japanese_ingredient_df = create_ingredient_df(japanese_df)
+ japanese_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Japon](../images/japanese.png)
+
+1. Şimdi Çin malzemeleri için yapın:
+
+ ```python
+ chinese_ingredient_df = create_ingredient_df(chinese_df)
+ chinese_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Çin](../images/chinese.png)
+
+1. Hint malzemelerini çizdirin:
+
+ ```python
+ indian_ingredient_df = create_ingredient_df(indian_df)
+ indian_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Hint](../images/indian.png)
+
+1. Son olarak, Kore malzemelerini çizdirin:
+
+ ```python
+ korean_ingredient_df = create_ingredient_df(korean_df)
+ korean_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![Kore](../images/korean.png)
+
+1. Şimdi, `drop()` fonksiyonunu çağırarak, farklı mutfaklar arasında karışıklığa sebep olan en çok ortaklık taşıyan malzemeleri temizleyelim:
+
+ Herkes pirinci, sarımsağı ve zencefili seviyor!
+
+ ```python
+ feature_df= df.drop(['cuisine','Unnamed: 0','rice','garlic','ginger'], axis=1)
+ labels_df = df.cuisine #.unique()
+ feature_df.head()
+ ```
+
+## Veri setini dengeleyin
+
+Veriyi temizlediniz, şimdi [SMOTE](https://imbalanced-learn.org/dev/references/generated/imblearn.over_sampling.SMOTE.html) - "Synthetic Minority Over-sampling Technique" ("Sentetik Azınlık Aşırı-Örnekleme/Örneklem-Artırma Tekniği") kullanarak dengeleyelim.
+
+1. `fit_resample()` fonksiyonunu çağırın, bu strateji ara değerlemeyle yeni örnekler üretir.
+
+ ```python
+ oversample = SMOTE()
+ transformed_feature_df, transformed_label_df = oversample.fit_resample(feature_df, labels_df)
+ ```
+
+ Verinizi dengeleyerek, sınıflandırırken daha iyi sonuçlar alabileceksiniz. Bir ikili sınıflandırma düşünün. Eğer verimizin çoğu tek bir sınıfsa, bir makine öğrenimi modeli, sırf onun için daha fazla veri olduğundan o sınıfı daha sık tahmin edecektir. Veriyi dengelemek herhangi eğri veriyi alır ve bu dengesizliğin ortadan kaldırılmasına yardımcı olur.
+
+1. Şimdi, her bir malzeme için etiket sayısını kontrol edebilirsiniz:
+
+ ```python
+ print(f'new label count: {transformed_label_df.value_counts()}')
+ print(f'old label count: {df.cuisine.value_counts()}')
+ ```
+
+ Çıktınız şöyle görünür:
+
+ ```output
+ new label count: korean 799
+ chinese 799
+ indian 799
+ japanese 799
+ thai 799
+ Name: cuisine, dtype: int64
+ old label count: korean 799
+ indian 598
+ chinese 442
+ japanese 320
+ thai 289
+ Name: cuisine, dtype: int64
+ ```
+
+ Veri şimdi tertemiz, dengeli ve çok lezzetli!
+
+1. Son adım, dengelenmiş verinizi, etiket ve özniteliklerle beraber, yeni bir dosyaya gönderilebilecek yeni bir veri iskeletine kaydetmek:
+
+ ```python
+ transformed_df = pd.concat([transformed_label_df,transformed_feature_df],axis=1, join='outer')
+ ```
+
+1. `transformed_df.head()` ve `transformed_df.info()` fonksiyonlarını kullanarak verinize bir kez daha göz atabilirsiniz. Gelecek derslerde kullanabilmek için bu verinin bir kopyasını kaydedin:
+
+ ```python
+ transformed_df.head()
+ transformed_df.info()
+    transformed_df.to_csv("../data/cleaned_cuisines.csv")
+ ```
+
+ Bu yeni CSV şimdi kök data (veri) klasöründe görülebilir.
+
+---
+
+## :rocket: Meydan okuma
+
+Bu öğretim programı farklı ilgi çekici veri setleri içermekte. `data` klasörlerini inceleyin ve ikili veya çok sınıflı sınıflandırma için uygun olabilecek veri setleri bulunduran var mı, bakın. Bu veri seti için hangi soruları sorabilirdiniz?
+
+## [Ders sonrası kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/?loc=tr)
+
+## Gözden Geçirme & Kendi Kendine Çalışma
+
+SMOTE'nin API'ını keşfedin. En iyi hangi durumlar için kullanılıyor? Hangi problemleri çözüyor?
+
+## Ödev
+
+[Sınıflandırma yöntemlerini keşfedin](assignment.tr.md)
diff --git a/4-Classification/1-Introduction/translations/README.zh-cn.md b/4-Classification/1-Introduction/translations/README.zh-cn.md
new file mode 100644
index 0000000000..978f4a0674
--- /dev/null
+++ b/4-Classification/1-Introduction/translations/README.zh-cn.md
@@ -0,0 +1,291 @@
+# 对分类方法的介绍
+
+在这四节课程中,你将会学习机器学习中一个基本的重点 - _分类_. 我们会在关于亚洲和印度的神奇的美食的数据集上尝试使用多种分类算法。希望你有点饿了。
+
+![只是一撮!](../images/pinch.png)
+
+>在学习的课程中赞叹泛亚地区的美食吧! 图片由 [Jen Looper](https://twitter.com/jenlooper)提供
+
+分类算法是[监督学习](https://wikipedia.org/wiki/Supervised_learning) 的一种。它与回归算法在很多方面都有相同之处。如果机器学习所有的目标都是使用数据集来预测数值或物品的名字,那么分类算法通常可以分为两类 _二元分类_ 和 _多元分类_。
+
+[![对分类算法的介绍](https://img.youtube.com/vi/eg8DJYwdMyg/0.jpg)](https://youtu.be/eg8DJYwdMyg "对分类算法的介绍")
+
+> 🎥 点击上方图片观看视频:MIT 的 John Guttag 对分类算法的介绍
+
+请记住:
+
+- **线性回归** 帮助你预测变量之间的关系并对一个新的数据点会落在哪条线上做出精确的预测。因此,你可以预测 _南瓜在九月的价格和十月的价格_。
+- **逻辑回归** 帮助你发现“二元范畴”:即在当前这个价格, _这个南瓜是不是橙色_?
+
+分类方法采用多种算法来确定其他可以用来确定一个数据点的标签或类别的方法。让我们来研究一下这个数据集,看看我们能否通过观察菜肴的原料来确定它的源头。
+
+## [课程前的小问题](https://white-water-09ec41f0f.azurestaticapps.net/quiz/19/)
+
+分类是机器学习研究者和数据科学家使用的一种基本方法。从基本的二元分类(这是不是一份垃圾邮件?)到复杂的图片分类和使用计算机视觉的分割技术,它都是将数据分类并提出相关问题的有效工具。
+
+![二元分类 vs 多元分类](../images/binary-multiclass.png)
+
+> 需要分类算法解决的二元分类和多元分类问题的对比. 信息图由[Jen Looper](https://twitter.com/jenlooper)提供
+
+在开始清洗数据、数据可视化和调整数据以适应机器学习的任务前,让我们来了解一下多种可用来数据分类的机器学习方法。
+
+派生自[统计数学](https://wikipedia.org/wiki/Statistical_classification),分类算法使用经典的机器学习的一些特征,比如通过'吸烟者'、'体重'和'年龄'来推断 _罹患某种疾病的可能性_。作为一个与你刚刚实践过的回归算法很相似的监督学习算法,你的数据是被标记过的并且算法通过采集这些标签来进行分类和预测并进行输出。
+
+✅ 花一点时间来想象一下一个关于菜肴的数据集。一个多元分类的模型应该能回答什么问题?一个二元分类的模型又应该能回答什么?如果你想确定一个给定的菜肴是否会用到葫芦巴(一种植物,种子用来调味)该怎么做?如果你想知道给你一个装满了八角茴香、花椰菜和辣根的购物袋你能否做出一道代表性的印度菜又该怎么做?
+
+[![Crazy mystery baskets](https://img.youtube.com/vi/GuTeDbaNoEU/0.jpg)](https://youtu.be/GuTeDbaNoEU "疯狂的神秘篮子")
+
+> 🎥 点击图像观看视频。整个'Chopped'节目的前提都是建立在神秘的篮子上,在这个节目中厨师必须利用随机给定的食材做菜。可见一个机器学习模型能起到不小的作用
+
+## 初见-分类器
+
+我们关于这个菜肴数据集想要提出的问题其实是一个 **多元问题**,因为我们有很多潜在的具有代表性的菜肴。给定一系列食材数据,数据能够符合这些类别中的哪一类?
+
+Scikit-learn项目提供多种对数据进行分类的算法,你需要根据问题的具体类型来进行选择。在下两节课程中你会学到这些算法中的几个。
+
+## 练习 - 清洗并平衡你的数据
+
+在你开始进行这个项目前的第一个上手的任务就是清洗和 **平衡**你的数据来得到更好的结果。从当前目录的根目录中的 _notebook.ipynb_ 开始。
+
+第一个需要安装的东西是 [imblearn](https://imbalanced-learn.org/stable/)这是一个Scikit-learn项目中的一个包,它可以让你更好的平衡数据 (关于这个任务你很快你就会学到更多)。
+
+1. 安装 `imblearn`, 运行命令 `pip install`:
+
+ ```python
+ pip install imblearn
+ ```
+
+1. 为了导入和可视化数据你需要导入下面的这些包, 你还需要从`imblearn`导入`SMOTE`
+
+ ```python
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import numpy as np
+ from imblearn.over_sampling import SMOTE
+ ```
+
+ 现在你已经准备好导入数据了。
+
+1. 下一项任务是导入数据:
+
+ ```python
+ df = pd.read_csv('../data/cuisines.csv')
+ ```
+
+    使用函数 `read_csv()` 会读取 csv 文件 _cuisines.csv_ 的内容,并将其放入变量 `df` 中。
+
+1. 检查数据的形状是否正确:
+
+ ```python
+ df.head()
+ ```
+
+ 前五行输出应该是这样的:
+
+ ```output
+ | | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+ | --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+ | 0 | 65 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 66 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 67 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 68 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 69 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+ ```
+
+1. 调用函数 `info()` 可以获得有关这个数据集的信息:
+
+ ```python
+ df.info()
+ ```
+
+    输出应该是这样的:
+
+ ```output
+    <class 'pandas.core.frame.DataFrame'>
+ RangeIndex: 2448 entries, 0 to 2447
+ Columns: 385 entries, Unnamed: 0 to zucchini
+ dtypes: int64(384), object(1)
+ memory usage: 7.2+ MB
+ ```
+
+## 练习 - 了解这些菜肴
+
+现在任务变得更有趣了。让我们按菜肴来看看数据的分布情况。
+
+1. 调用函数 `barh()`可以绘制出数据的条形图:
+
+ ```python
+ df.cuisine.value_counts().plot.barh()
+ ```
+
+ ![菜肴数据分配](../images/cuisine-dist.png)
+
+ 这里有有限的一些菜肴,但是数据的分配是不平均的。但是你可以修正这一现象!在这样做之前再稍微探索一下。
+
+1. 找出对于每个菜肴有多少数据是有效的并将其打印出来:
+
+ ```python
+ thai_df = df[(df.cuisine == "thai")]
+ japanese_df = df[(df.cuisine == "japanese")]
+ chinese_df = df[(df.cuisine == "chinese")]
+ indian_df = df[(df.cuisine == "indian")]
+ korean_df = df[(df.cuisine == "korean")]
+
+ print(f'thai df: {thai_df.shape}')
+ print(f'japanese df: {japanese_df.shape}')
+ print(f'chinese df: {chinese_df.shape}')
+ print(f'indian df: {indian_df.shape}')
+ print(f'korean df: {korean_df.shape}')
+ ```
+
+ 输出应该是这样的 :
+
+ ```output
+ thai df: (289, 385)
+ japanese df: (320, 385)
+ chinese df: (442, 385)
+ indian df: (598, 385)
+ korean df: (799, 385)
+ ```
+
+## 探索有关食材的内容
+
+现在你可以在数据中探索的更深一点并了解每道菜肴的代表性食材。你需要将反复出现的、容易造成混淆的数据清理出去,那么让我们来学习解决这个问题。
+
+1. 在Python中创建一个函数 `create_ingredient_df()` 来创建一个食材的数据帧。这个函数会先去掉数据中无用的列,并按出现次数对食材进行排序:
+
+ ```python
+ def create_ingredient_df(df):
+ ingredient_df = df.T.drop(['cuisine','Unnamed: 0']).sum(axis=1).to_frame('value')
+ ingredient_df = ingredient_df[(ingredient_df.T != 0).any()]
+ ingredient_df = ingredient_df.sort_values(by='value', ascending=False,
+ inplace=False)
+ return ingredient_df
+ ```
+
+    现在你可以使用这个函数来大致了解每种菜肴最常用的10种食材。
+
+1. 调用函数 `create_ingredient_df()` 然后通过函数`barh()`来绘制图像:
+
+ ```python
+ thai_ingredient_df = create_ingredient_df(thai_df)
+ thai_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![thai](../images/thai.png)
+
+1. 对日本的数据进行相同的操作:
+
+ ```python
+ japanese_ingredient_df = create_ingredient_df(japanese_df)
+ japanese_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![日本](../images/japanese.png)
+
+1. 现在处理中国的数据:
+
+ ```python
+ chinese_ingredient_df = create_ingredient_df(chinese_df)
+ chinese_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![中国](../images/chinese.png)
+
+1. 绘制印度食材的数据:
+
+ ```python
+ indian_ingredient_df = create_ingredient_df(indian_df)
+ indian_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![印度](../images/indian.png)
+
+1. 最后,绘制韩国的食材的数据:
+
+ ```python
+ korean_ingredient_df = create_ingredient_df(korean_df)
+ korean_ingredient_df.head(10).plot.barh()
+ ```
+
+ ![韩国](../images/korean.png)
+
+1. 现在,去除在不同的菜肴间最普遍的容易造成混乱的食材,调用函数 `drop()`:
+
+ 大家都喜欢米饭、大蒜和生姜
+
+ ```python
+ feature_df= df.drop(['cuisine','Unnamed: 0','rice','garlic','ginger'], axis=1)
+ labels_df = df.cuisine #.unique()
+ feature_df.head()
+ ```
+
+## 平衡数据集
+
+现在你已经清理过数据集了, 使用 [SMOTE](https://imbalanced-learn.org/dev/references/generated/imblearn.over_sampling.SMOTE.html) - "Synthetic Minority Over-sampling Technique" - 来平衡数据集。
+
+1. 调用函数 `fit_resample()`, 此方法通过插入数据来生成新的样本
+
+ ```python
+ oversample = SMOTE()
+ transformed_feature_df, transformed_label_df = oversample.fit_resample(feature_df, labels_df)
+ ```
+
+ 通过对数据集的平衡,当你对数据进行分类时能够得到更好的结果。现在考虑一个二元分类的问题,如果你的数据集中的大部分数据都属于其中一个类别,那么机器学习的模型就会因为在那个类别的数据更多而判断那个类别更为常见。平衡数据能够去除不公平的数据点。
+
+1. 现在你可以查看每个食材的标签数量:
+
+ ```python
+ print(f'new label count: {transformed_label_df.value_counts()}')
+ print(f'old label count: {df.cuisine.value_counts()}')
+ ```
+
+ 输出应该是这样的 :
+
+ ```output
+ new label count: korean 799
+ chinese 799
+ indian 799
+ japanese 799
+ thai 799
+ Name: cuisine, dtype: int64
+ old label count: korean 799
+ indian 598
+ chinese 442
+ japanese 320
+ thai 289
+ Name: cuisine, dtype: int64
+ ```
+
+ 现在这个数据集不仅干净、平衡而且还很“美味” !
+
+1. 最后一步是保存你处理过后的平衡的数据(包括标签和特征),将其保存为一个可以被输出到文件中的数据帧。
+
+ ```python
+ transformed_df = pd.concat([transformed_label_df,transformed_feature_df],axis=1, join='outer')
+ ```
+
+1. 你可以通过调用函数 `transformed_df.head()` 和 `transformed_df.info()`再检查一下你的数据。 接下来要将数据保存以供在未来的课程中使用:
+
+ ```python
+ transformed_df.head()
+ transformed_df.info()
+ transformed_df.to_csv("../data/cleaned_cuisines.csv")
+ ```
+
+ 这个全新的CSV文件可以在数据根目录中被找到。
+
+---
+
+## 🚀小练习
+
+本项目的全部课程含有很多有趣的数据集。 探索一下 `data`文件夹,看看这里面有没有适合二元分类、多元分类算法的数据集,再想一下你对这些数据集有没有什么想问的问题。
+
+## [课后练习](https://white-water-09ec41f0f.azurestaticapps.net/quiz/20/)
+
+## 回顾 & 自学
+
+探索一下 SMOTE的API文档。思考一下它最适合于什么样的情况、它能够解决什么样的问题。
+
+## 课后作业
+
+[探索一下分类方法](../assignment.md)
diff --git a/4-Classification/1-Introduction/translations/assignment.it.md b/4-Classification/1-Introduction/translations/assignment.it.md
new file mode 100644
index 0000000000..1283401791
--- /dev/null
+++ b/4-Classification/1-Introduction/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Esplorare i metodi di classificazione
+
+## Istruzioni
+
+Nella [documentazione](https://scikit-learn.org/stable/supervised_learning.html) di Scikit-learn si troverà un ampio elenco di modi per classificare i dati. Fare una piccola caccia al tesoro in questi documenti: l'obiettivo è cercare metodi di classificazione e abbinare un insieme di dati in questo programma di studi, una domanda che si può porre e una tecnica di classificazione. Creare un foglio di calcolo o una tabella in un file .doc e spiegare come funzionerebbe l'insieme di dati con l'algoritmo di classificazione.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| | viene presentato un documento che riporta una panoramica di 5 algoritmi insieme a una tecnica di classificazione. La panoramica è ben spiegata e dettagliata. | viene presentato un documento che riporta una panoramica di 3 algoritmi insieme a una tecnica di classificazione. La panoramica è ben spiegata e dettagliata. | viene presentato un documento che riporta una panoramica di meno di tre algoritmi insieme a una tecnica di classificazione e la panoramica non è né ben spiegata né dettagliata. |
diff --git a/4-Classification/1-Introduction/translations/assignment.tr.md b/4-Classification/1-Introduction/translations/assignment.tr.md
new file mode 100644
index 0000000000..99dfe5c26d
--- /dev/null
+++ b/4-Classification/1-Introduction/translations/assignment.tr.md
@@ -0,0 +1,11 @@
+# Sınıflandırma yöntemlerini keşfedin
+
+## Yönergeler
+
+[Scikit-learn dokümentasyonunda](https://scikit-learn.org/stable/supervised_learning.html) veriyi sınıflandırma yöntemlerini içeren büyük bir liste göreceksiniz. Bu dokümanlar arasında ufak bir çöpçü avı yapın: Hedefiniz, sınıflandırma yöntemleri aramak ve bu eğitim programındaki bir veri seti, sorabileceğiniz bir soru ve bir sınıflandırma yöntemi eşleştirmek. Bir .doc dosyasında elektronik çizelge veya tablo hazırlayın ve veri setinin sınıflandırma algoritmasıyla nasıl çalışacağını açıklayın.
+
+## Rubrik
+
+| Ölçüt | Örnek Alınacak Nitelikte | Yeterli | Geliştirme Gerekli |
+| -------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| | Bir sınıflandırma yönteminin yanısıra 5 algoritmayı inceleyen bir doküman sunulmuş. İnceleme iyi açıklanmış ve detaylı. | Bir sınıflandırma yönteminin yanısıra 5 algoritmayı inceleyen bir doküman sunulmuş. İnceleme iyi açıklanmış ve detaylı. | Bir sınıflandırma yönteminin yanısıra 3'ten az algoritmayı inceleyen bir doküman sunulmuş ve inceleme iyi açıklanmış veya detaylı değil. |
diff --git a/4-Classification/2-Classifiers-1/README.md b/4-Classification/2-Classifiers-1/README.md
index 15800922a6..68877e6ace 100644
--- a/4-Classification/2-Classifiers-1/README.md
+++ b/4-Classification/2-Classifiers-1/README.md
@@ -4,7 +4,7 @@ In this lesson, you will use the dataset you saved from the last lesson full of
You will use this dataset with a variety of classifiers to _predict a given national cuisine based on a group of ingredients_. While doing so, you'll learn more about some of the ways that algorithms can be leveraged for classification tasks.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/21/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/21/)
# Preparation
Assuming you completed [Lesson 1](../1-Introduction/README.md), make sure that a _cleaned_cuisines.csv_ file exists in the root `/data` folder for these four lessons.
@@ -15,21 +15,20 @@ Assuming you completed [Lesson 1](../1-Introduction/README.md), make sure that a
```python
import pandas as pd
- cuisines_df = pd.read_csv("../../data/cleaned_cuisine.csv")
+ cuisines_df = pd.read_csv("../../data/cleaned_cuisines.csv")
cuisines_df.head()
```
The data looks like this:
- ```output
- | | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
- | --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
- | 0 | 0 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 1 | 1 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 2 | 2 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 3 | 3 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 4 | 4 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
- ```
+| | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+| --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+| 0 | 0 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 1 | 1 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 2 | 2 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 3 | 3 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 4 | 4 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+
1. Now, import several more libraries:
@@ -68,13 +67,13 @@ Assuming you completed [Lesson 1](../1-Introduction/README.md), make sure that a
Your features look like this:
- | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | artemisia | artichoke | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini | |
- | -----: | -------: | ----: | ---------: | ----: | -----------: | ------: | -------: | --------: | --------: | ---: | ------: | ----------: | ---------: | ----------------------: | ---: | ---: | ---: | ----: | -----: | -------: | --- |
- | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+| | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | artemisia | artichoke | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+| ---: | -----: | -------: | ----: | ---------: | ----: | -----------: | ------: | -------: | --------: | --------: | ---: | ------: | ----------: | ---------: | ----------------------: | ---: | ---: | ---: | ----: | -----: | -------: |
+| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
Now you are ready to train your model!
@@ -200,13 +199,13 @@ Since you are using the multiclass case, you need to choose what _scheme_ to use
The result is printed - Indian cuisine is its best guess, with good probability:
- | | 0 | | | | | | | | | | | | | | | | | | | | |
- | -------: | -------: | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
- | indian | 0.715851 | | | | | | | | | | | | | | | | | | | | |
- | chinese | 0.229475 | | | | | | | | | | | | | | | | | | | | |
- | japanese | 0.029763 | | | | | | | | | | | | | | | | | | | | |
- | korean | 0.017277 | | | | | | | | | | | | | | | | | | | | |
- | thai | 0.007634 | | | | | | | | | | | | | | | | | | | | |
+ | | 0 |
+ | -------: | -------: |
+ | indian | 0.715851 |
+ | chinese | 0.229475 |
+ | japanese | 0.029763 |
+ | korean | 0.017277 |
+ | thai | 0.007634 |
✅ Can you explain why the model is pretty sure this is an Indian cuisine?
@@ -217,22 +216,23 @@ Since you are using the multiclass case, you need to choose what _scheme_ to use
print(classification_report(y_test,y_pred))
```
- | precision | recall | f1-score | support | | | | | | | | | | | | | | | | | | |
- | ------------ | ------ | -------- | ------- | ---- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
- | chinese | 0.73 | 0.71 | 0.72 | 229 | | | | | | | | | | | | | | | | | |
- | indian | 0.91 | 0.93 | 0.92 | 254 | | | | | | | | | | | | | | | | | |
- | japanese | 0.70 | 0.75 | 0.72 | 220 | | | | | | | | | | | | | | | | | |
- | korean | 0.86 | 0.76 | 0.81 | 242 | | | | | | | | | | | | | | | | | |
- | thai | 0.79 | 0.85 | 0.82 | 254 | | | | | | | | | | | | | | | | | |
- | accuracy | 0.80 | 1199 | | | | | | | | | | | | | | | | | | | |
- | macro avg | 0.80 | 0.80 | 0.80 | 1199 | | | | | | | | | | | | | | | | | |
- | weighted avg | 0.80 | 0.80 | 0.80 | 1199 | | | | | | | | | | | | | | | | | |
+ | | precision | recall | f1-score | support |
+ | ------------ | --------- | ------ | -------- | ------- |
+ | chinese | 0.73 | 0.71 | 0.72 | 229 |
+ | indian | 0.91 | 0.93 | 0.92 | 254 |
+ | japanese | 0.70 | 0.75 | 0.72 | 220 |
+ | korean | 0.86 | 0.76 | 0.81 | 242 |
+ | thai | 0.79 | 0.85 | 0.82 | 254 |
+ | accuracy | 0.80 | 1199 | | |
+ | macro avg | 0.80 | 0.80 | 0.80 | 1199 |
+ | weighted avg | 0.80 | 0.80 | 0.80 | 1199 |
## 🚀Challenge
In this lesson, you used your cleaned data to build a machine learning model that can predict a national cuisine based on a series of ingredients. Take some time to read through the many options Scikit-learn provides to classify data. Dig deeper into the concept of 'solver' to understand what goes on behind the scenes.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/22/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/22/)
+
## Review & Self Study
Dig a little more into the math behind logistic regression in [this lesson](https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf)
diff --git a/4-Classification/2-Classifiers-1/solution/notebook.ipynb b/4-Classification/2-Classifiers-1/solution/notebook.ipynb
index a819dbe5b7..770ac85c7e 100644
--- a/4-Classification/2-Classifiers-1/solution/notebook.ipynb
+++ b/4-Classification/2-Classifiers-1/solution/notebook.ipynb
@@ -47,7 +47,7 @@
],
"source": [
"import pandas as pd\n",
- "cuisines_df = pd.read_csv(\"../../data/cleaned_cuisine.csv\")\n",
+ "cuisines_df = pd.read_csv(\"../../data/cleaned_cuisines.csv\")\n",
"cuisines_df.head()"
]
},
diff --git a/4-Classification/2-Classifiers-1/translations/README.it.md b/4-Classification/2-Classifiers-1/translations/README.it.md
new file mode 100644
index 0000000000..5611870b4e
--- /dev/null
+++ b/4-Classification/2-Classifiers-1/translations/README.it.md
@@ -0,0 +1,241 @@
+# Classificatori di cucina 1
+
+In questa lezione, si utilizzerà l'insieme di dati salvati dall'ultima lezione, pieno di dati equilibrati e puliti relativi alle cucine.
+
+Si utilizzerà questo insieme di dati con una varietà di classificatori per _prevedere una determinata cucina nazionale in base a un gruppo di ingredienti_. Mentre si fa questo, si imparerà di più su alcuni dei modi in cui gli algoritmi possono essere sfruttati per le attività di classificazione.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/21/)
+# Preparazione
+
+Supponendo che la [Lezione 1](../../1-Introduction/README.md) sia stata completata, assicurarsi che esista un file _cleaned_cuisines.csv_ nella cartella in radice `/data` per queste quattro lezioni.
+
+## Esercizio - prevedere una cucina nazionale
+
+1. Lavorando con il _notebook.ipynb_ di questa lezione nella cartella radice, importare quel file insieme alla libreria Pandas:
+
+ ```python
+ import pandas as pd
+    cuisines_df = pd.read_csv("../../data/cleaned_cuisines.csv")
+ cuisines_df.head()
+ ```
+
+ I dati si presentano così:
+
+ ```output
+ | | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+ | --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+ | 0 | 0 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 1 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 2 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 3 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 4 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+ ```
+
+1. Ora importare molte altre librerie:
+
+ ```python
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
+ from sklearn.svm import SVC
+ import numpy as np
+ ```
+
+1. Dividere le coordinate X e y in due dataframe per l'addestramento. `cuisine` può essere il dataframe delle etichette:
+
+ ```python
+ cuisines_label_df = cuisines_df['cuisine']
+ cuisines_label_df.head()
+ ```
+
+ Apparirà così
+
+ ```output
+ 0 indian
+ 1 indian
+ 2 indian
+ 3 indian
+ 4 indian
+ Name: cuisine, dtype: object
+ ```
+
+1. Scartare la colonna `Unnamed: 0` e la colonna `cuisine` , chiamando `drop()`. Salvare il resto dei dati come caratteristiche addestrabili:
+
+ ```python
+ cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
+ cuisines_feature_df.head()
+ ```
+
+ Le caratteristiche sono così:
+
+    |      | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | artemisia | artichoke | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+    | ---: | -----: | -------: | ----: | ---------: | ----: | -----------: | ------: | -------: | --------: | --------: | ---: | ------: | ----------: | ---------: | ----------------------: | ---: | ---: | ---: | ----: | -----: | -------: |
+ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+
+Ora si è pronti per addestrare il modello!
+
+## Scegliere il classificatore
+
+Ora che i dati sono puliti e pronti per l'addestramento, si deve decidere quale algoritmo utilizzare per il lavoro.
+
+Scikit-learn raggruppa la classificazione in Supervised Learning, e in quella categoria si troveranno molti modi per classificare. [La varietà](https://scikit-learn.org/stable/supervised_learning.html) è piuttosto sconcertante a prima vista. I seguenti metodi includono tutti tecniche di classificazione:
+
+- Modelli Lineari
+- Macchine a Vettori di Supporto
+- Discesa stocastica del gradiente
+- Nearest Neighbors
+- Processi Gaussiani
+- Alberi di Decisione
+- Apprendimento ensemble (classificatore di voto)
+- Algoritmi multiclasse e multioutput (classificazione multiclasse e multietichetta, classificazione multiclasse-multioutput)
+
+> Si possono anche usare [le reti neurali per classificare i dati](https://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification), ma questo esula dall'ambito di questa lezione.
+
+### Con quale classificatore andare?
+
+Quale classificatore si dovrebbe scegliere? Spesso, scorrerne diversi e cercare un buon risultato è un modo per testare. Scikit-learn offre un [confronto fianco a fianco](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html) su un insieme di dati creato, confrontando KNeighbors, SVC in due modi, GaussianProcessClassifier, DecisionTreeClassifier, RandomForestClassifier, MLPClassifier, AdaBoostClassifier, GaussianNB e QuadraticDiscriminantAnalysis, mostrando i risultati visualizzati:
+
+![confronto di classificatori](../images/comparison.png)
+> Grafici generati sulla documentazione di Scikit-learn
+
+> AutoML risolve questo problema in modo ordinato eseguendo questi confronti nel cloud, consentendo di scegliere l'algoritmo migliore per i propri dati. Si può provare [qui](https://docs.microsoft.com/learn/modules/automate-model-selection-with-azure-automl/?WT.mc_id=academic-15963-cxa)
+
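+A puro titolo indicativo, ecco un piccolo schizzo che prova rapidamente alcuni di questi classificatori con la validazione incrociata, supponendo che `cuisines_feature_df` e `cuisines_label_df` siano già stati creati come nell'esercizio qui sopra:
+
+```python
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import cross_val_score
+
+# alcuni classificatori candidati; max_iter è aumentato solo per favorire la convergenza
+candidati = {
+    "Logistic Regression": LogisticRegression(max_iter=1000),
+    "SVC": SVC(),
+    "Decision Tree": DecisionTreeClassifier(),
+    "Random Forest": RandomForestClassifier(),
+}
+
+for nome, clf in candidati.items():
+    # accuratezza media su 3 suddivisioni di validazione incrociata
+    punteggi = cross_val_score(clf, cuisines_feature_df, cuisines_label_df, cv=3)
+    print(f"{nome}: {punteggi.mean():.3f}")
+```
+
+Un confronto del genere dà solo un'indicazione di massima; un approccio più ragionato alla scelta del classificatore viene descritto di seguito.
+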
+### Un approccio migliore
+
+Un modo migliore che indovinare a caso, tuttavia, è seguire le idee su questo [ML Cheat sheet](https://docs.microsoft.com/azure/machine-learning/algorithm-cheat-sheet?WT.mc_id=academic-15963-cxa) scaricabile. Qui si scopre che, per questo problema multiclasse, si dispone di alcune scelte:
+
+![cheatsheet per problemi multiclasse](../images/cheatsheet.png)
+> Una sezione dell'Algorithm Cheat Sheet di Microsoft, che descrive in dettaglio le opzioni di classificazione multiclasse
+
+✅ Scaricare questo cheat sheet, stamparlo e appenderlo alla parete!
+
+### Motivazione
+
+Si prova a ragionare attraverso diversi approcci dati i vincoli presenti:
+
+- **Le reti neurali sono troppo pesanti**. Dato l'insieme di dati pulito, ma minimo, e il fatto che si sta eseguendo l'addestramento localmente tramite notebook, le reti neurali sono troppo pesanti per questo compito.
+- **Nessun classificatore a due classi**. Non si usa un classificatore a due classi, quindi questo esclude uno contro tutti.
+- L'**albero decisionale o la regressione logistica potrebbero funzionare**. Potrebbe funzionare un albero decisionale o una regressione logistica per dati multiclasse.
+- **Gli alberi decisionali potenziati multiclasse risolvono un problema diverso**. L'albero decisionale potenziato multiclasse è più adatto per attività non parametriche, ad esempio attività progettate per costruire classifiche, quindi non è utile in questo caso.
+
+### Utilizzo di Scikit-learn
+
+Si userà Scikit-learn per analizzare i dati. Tuttavia, ci sono molti modi per utilizzare la regressione logistica in Scikit-learn. Dare un'occhiata ai [parametri da passare](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regressio#sklearn.linear_model.LogisticRegression).
+
+Essenzialmente ci sono due parametri importanti, `multi_class` e `solver`, che occorre specificare quando si chiede a Scikit-learn di eseguire una regressione logistica. Il valore di `multi_class` determina un certo comportamento. Il valore di `solver` indica quale algoritmo utilizzare. Non tutti i risolutori possono essere associati a tutti i valori di `multi_class`.
+
+Secondo la documentazione, nel caso multiclasse, l'algoritmo di addestramento:
+
+- **Utilizza lo schema one-vs-rest (OvR)** - uno contro tutti, se l'opzione `multi_class` è impostata su `ovr`
+- **Utilizza la perdita di entropia incrociata**, se l'opzione `multi_class` è impostata su `multinomial`. (Attualmente l'opzione multinomiale è supportata solo dai solutori 'lbfgs', 'sag', 'saga' e 'newton-cg'.)
+
+> 🎓 Lo 'schema' qui può essere 'ovr' (one-vs-rest) - uno contro tutti - o 'multinomiale'. Poiché la regressione logistica è realmente progettata per supportare la classificazione binaria, questi schemi consentono di gestire meglio le attività di classificazione multiclasse. [fonte](https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/)
+
+> 🎓 Il 'solver' è definito come "l'algoritmo da utilizzare nel problema di ottimizzazione". [fonte](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regressio#sklearn.linear_model.LogisticRegression).
+
+Scikit-learn offre questa tabella per spiegare come i risolutori gestiscono le diverse sfide presentate da diversi tipi di strutture dati:
+
+![risolutori](../images/solvers.png)
+
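+Per fissare le idee, un piccolo esempio indicativo delle due combinazioni di `multi_class` e `solver` descritte sopra:
+
+```python
+from sklearn.linear_model import LogisticRegression
+
+# schema one-vs-rest: un classificatore binario per ogni cucina
+lr_ovr = LogisticRegression(multi_class='ovr', solver='liblinear')
+
+# schema multinomiale: richiede un solutore compatibile, ad esempio 'lbfgs'
+# (max_iter è aumentato solo per favorire la convergenza)
+lr_multinomial = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
+```
+
+Negli esercizi che seguono si userà la prima combinazione.
+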
+## Esercizio: dividere i dati
+
+Ci si può concentrare sulla regressione logistica per la prima prova di addestramento poiché di recente si è appreso di quest'ultima in una lezione precedente.
+Dividere i dati in gruppi di addestramento e test chiamando `train_test_split()`:
+
+```python
+X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)
+```
+
+## Esercizio: applicare la regressione logistica
+
+Poiché si sta utilizzando il caso multiclasse, si deve scegliere quale _schema_ utilizzare e quale _solutore_ impostare. Usare LogisticRegression con un'impostazione multiclasse e il solutore **liblinear** da addestrare.
+
+1. Creare una regressione logistica con multi_class impostato su `ovr` e il risolutore impostato su `liblinear`:
+
+ ```python
+ lr = LogisticRegression(multi_class='ovr',solver='liblinear')
+ model = lr.fit(X_train, np.ravel(y_train))
+
+ accuracy = model.score(X_test, y_test)
+ print ("Accuracy is {}".format(accuracy))
+ ```
+
+ ✅ Provare un risolutore diverso come `lbfgs`, che è spesso impostato come predefinito
+
+ > Nota, usare la funzione [`ravel`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.ravel.html) di Pandas per appiattire i dati quando necessario.
+
+ La precisione è buona oltre l'**80%**!
+
+1. Si può vedere questo modello in azione testando una riga di dati (#50):
+
+ ```python
+ print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')
+ print(f'cuisine: {y_test.iloc[50]}')
+ ```
+
+ Il risultato viene stampato:
+
+ ```output
+ ingredients: Index(['cilantro', 'onion', 'pea', 'potato', 'tomato', 'vegetable_oil'], dtype='object')
+ cuisine: indian
+ ```
+
+ ✅ Provare un numero di riga diverso e controllare i risultati
+
+1. Scavando più a fondo, si può verificare l'accuratezza di questa previsione:
+
+ ```python
+ test= X_test.iloc[50].values.reshape(-1, 1).T
+ proba = model.predict_proba(test)
+ classes = model.classes_
+ resultdf = pd.DataFrame(data=proba, columns=classes)
+
+ topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
+ topPrediction.head()
+ ```
+
+ Il risultato è stampato: la cucina indiana è la sua ipotesi migliore, con buone probabilità:
+
+    |          |        0 |
+    | -------: | -------: |
+    |   indian | 0.715851 |
+    |  chinese | 0.229475 |
+    | japanese | 0.029763 |
+    |   korean | 0.017277 |
+    |     thai | 0.007634 |
+
+ ✅ Si è in grado di spiegare perché il modello è abbastanza sicuro che questa sia una cucina indiana?
+
+1. Ottenere maggiori dettagli stampando un rapporto di classificazione, come fatto nelle lezioni di regressione:
+
+ ```python
+ y_pred = model.predict(X_test)
+ print(classification_report(y_test,y_pred))
+ ```
+
+    |              | precision | recall | f1-score | support |
+    | ------------ | --------- | ------ | -------- | ------- |
+    | chinese      | 0.73      | 0.71   | 0.72     | 229     |
+    | indian       | 0.91      | 0.93   | 0.92     | 254     |
+    | japanese     | 0.70      | 0.75   | 0.72     | 220     |
+    | korean       | 0.86      | 0.76   | 0.81     | 242     |
+    | thai         | 0.79      | 0.85   | 0.82     | 254     |
+    | accuracy     | 0.80      | 1199   |          |         |
+    | macro avg    | 0.80      | 0.80   | 0.80     | 1199    |
+    | weighted avg | 0.80      | 0.80   | 0.80     | 1199    |
+
+## 🚀 Sfida
+
+In questa lezione, sono stati utilizzati dati puliti per creare un modello di apprendimento automatico in grado di prevedere una cucina nazionale basata su una serie di ingredienti. Si prenda del tempo per leggere le numerose opzioni fornite da Scikit-learn per classificare i dati. Approfondire il concetto di "risolutore" per capire cosa succede dietro le quinte.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/22/)
+## Revisione e Auto Apprendimento
+
+Approfondire un po' la matematica alla base della regressione logistica in [questa lezione](https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf)
+## Compito
+
+[Studiare i risolutori](assignment.it.md)
diff --git a/4-Classification/2-Classifiers-1/translations/README.tr.md b/4-Classification/2-Classifiers-1/translations/README.tr.md
new file mode 100644
index 0000000000..30f36b133c
--- /dev/null
+++ b/4-Classification/2-Classifiers-1/translations/README.tr.md
@@ -0,0 +1,241 @@
+# Mutfak sınıflandırıcıları 1
+
+Bu derste, mutfaklarla ilgili dengeli ve temiz veriyle dolu, geçen dersten kaydettiğiniz veri setini kullanacaksınız.
+
+Bu veri setini çeşitli sınıflandırıcılarla _bir grup malzemeyi baz alarak verilen bir ulusal mutfağı öngörmek_ için kullanacaksınız. Bunu yaparken, sınıflandırma görevleri için algoritmaların leveraj edilebileceği yollardan bazıları hakkında daha fazla bilgi edineceksiniz.
+
+## [Ders öncesi kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/21/?loc=tr)
+# Hazırlık
+
+[Birinci dersi](../../1-Introduction/README.md) tamamladığınızı varsayıyoruz, dolayısıyla bu dört ders için _cleaned_cuisines.csv_ dosyasının kök `/data` klasöründe var olduğundan emin olun.
+
+## Alıştırma - ulusal bir mutfağı öngörün
+
+1. Bu dersin _notebook.ipynb_ dosyasında çalışarak, Pandas kütüphanesiyle beraber o dosyayı da alın:
+
+ ```python
+ import pandas as pd
+ cuisines_df = pd.read_csv("../data/cleaned_cuisines.csv")
+ cuisines_df.head()
+ ```
+
+ Veri şöyle görünüyor:
+
+| | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+| --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+| 0 | 0 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 1 | 1 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 2 | 2 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 3 | 3 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 4 | 4 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+
+
+1. Şimdi, birkaç kütüphane daha alın:
+
+ ```python
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
+ from sklearn.svm import SVC
+ import numpy as np
+ ```
+
+1. X ve y koordinatlarını eğitme için iki veri iskeletine bölün. `cuisine` etiket veri iskeleti olabilir:
+
+ ```python
+ cuisines_label_df = cuisines_df['cuisine']
+ cuisines_label_df.head()
+ ```
+
+ Şöyle görünecek:
+
+ ```output
+ 0 indian
+ 1 indian
+ 2 indian
+ 3 indian
+ 4 indian
+ Name: cuisine, dtype: object
+ ```
+
+1. `Unnamed: 0` ve `cuisine` sütunlarını, `drop()` fonksiyonunu çağırarak temizleyin. Kalan veriyi eğitilebilir öznitelikler olarak kaydedin:
+
+ ```python
+ cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
+ cuisines_feature_df.head()
+ ```
+
+ Öznitelikleriniz şöyle görünüyor:
+
+|      | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | artemisia | artichoke | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+| ---: | -----: | -------: | ----: | ---------: | ----: | -----------: | ------: | -------: | --------: | --------: | ---: | ------: | ----------: | ---------: | ----------------------: | ---: | ---: | ---: | ----: | -----: | -------: |
+| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+
+Şimdi modelinizi eğitmek için hazırsınız!
+
+## Sınıflandırıcınızı seçme
+
+Veriniz temiz ve eğitme için hazır, şimdi bu iş için hangi algoritmanın kullanılması gerektiğine karar vermelisiniz.
+
+Scikit-learn, sınıflandırmayı gözetimli öğrenme altında grupluyor. Bu kategoride sınıflandırma için birçok yöntem görebilirsiniz. [Çeşitlilik](https://scikit-learn.org/stable/supervised_learning.html) ilk bakışta oldukça şaşırtıcı. Aşağıdaki yöntemlerin hepsi sınıflandırma yöntemlerini içermektedir:
+
+- Doğrusal Modeller
+- Destek Vektör Makineleri
+- Stokastik Gradyan İnişi
+- En Yakın Komşu
+- Gauss Süreçleri
+- Karar Ağaçları
+- Topluluk Metotları (Oylama Sınıflandırıcısı)
+- Çok sınıflı ve çok çıktılı algoritmalar (çok sınıflı ve çok etiketli sınıflandırma, çok sınıflı-çok çıktılı sınıflandırma)
+
+> [Verileri sınıflandırmak için sinir ağlarını](https://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification) da kullanabilirsiniz, ancak bu, bu dersin kapsamı dışındadır.
+
+### Hangi sınıflandırıcıyı kullanmalı?
+
+Şimdi, hangi sınıflandırıcıyı seçmelisiniz? Genellikle, birçoğunu gözden geçirmek ve iyi bir sonuç aramak deneme yollarından biridir. Scikit-learn, oluşturulmuş bir veri seti üzerinde KNeighbors, iki yolla SVC, GaussianProcessClassifier, DecisionTreeClassifier, RandomForestClassifier, MLPClassifier, AdaBoostClassifier, GaussianNB ve QuadraticDiscriminantAnalysis karşılaştırmaları yapan ve sonuçları görsel olarak gösteren bir [yan yana karşılaştırma](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html) sunar:
+
+![sınıflandırıcıların karşılaştırılması](../images/comparison.png)
+> Grafikler Scikit-learn dokümantasyonlarında oluşturulmuştur.
+
+> AutoML, bu karşılaştırmaları bulutta çalıştırarak bu problemi muntazam bir şekilde çözer ve veriniz için en iyi algoritmayı seçmenizi sağlar. [Buradan](https://docs.microsoft.com/learn/modules/automate-model-selection-with-azure-automl/?WT.mc_id=academic-15963-cxa) deneyin.
+
+### Daha iyi bir yaklaşım
+
+Böyle tahminlerle çözmekten daha iyi bir yol ise, indirilebilir [ML Kopya kağıdı](https://docs.microsoft.com/azure/machine-learning/algorithm-cheat-sheet?WT.mc_id=academic-15963-cxa) içindeki fikirlere bakmaktır. Burada, bizim çok sınıflı problemimiz için bazı seçenekler olduğunu görüyoruz:
+
+![çok sınıflı problemler için kopya kağıdı](../images/cheatsheet.png)
+> Microsoft'un Algoritma Kopya Kağıdı'ndan, çok sınıflı sınıflandırma seçeneklerini detaylandıran bir bölüm
+
+:white_check_mark: Bu kopya kağıdını indirin, yazdırın ve duvarınıza asın!
+
+### Akıl yürütme
+
+Elimizdeki kısıtlamalarla farklı yaklaşımlar üzerine akıl yürütelim:
+
+- **Sinir ağları çok ağır**. Temiz ama minimal veri setimizi ve eğitimi not defterleriyle yerel makinelerde çalıştırdığımızı göz önünde bulundurursak, sinir ağları bu görev için çok ağır oluyor.
+- **İki sınıflı sınıflandırıcısı yok**. İki sınıflı sınıflandırıcı kullanmıyoruz, dolayısıyla bire karşı hepsi (one-vs-all) yöntemi eleniyor.
+- **Karar ağacı veya lojistik regresyon işe yarayabilirdi**. Bir karar ağacı veya çok sınıflı veri için lojistik regresyon işe yarayabilir.
+- **Çok Sınıf Artırmalı Karar Ağaçları farklı bir problemi çözüyor**. Çok sınıflı artırmalı karar ağaçları, parametrik olmayan görevler için, örneğin sıralama (ranking) oluşturmak üzere tasarlanan görevler için en uygundur. Dolayısıyla bizim için kullanışlı değil.
+
+### Scikit-learn kullanımı
+
+Verimizi analiz etmek için Scikit-learn kullanacağız. Ancak, Scikit-learn içerisinde lojistik regresyonu kullanmanın birçok yolu var. [Geçirilecek parametrelere](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regressio#sklearn.linear_model.LogisticRegression) göz atın.
+
+Aslında, Scikit-learn'den lojistik regresyon yapmasını beklediğimizde belirtmemiz gereken `multi_class` ve `solver` diye iki önemli parametre var. `multi_class` değeri belli bir davranış uygular. Çözücünün değeri, hangi algoritmanın kullanılacağını gösterir. Her çözücü her `multi_class` değeriyle eşleştirilemez.
+
+Dokümanlara göre, çok sınıflı durumda eğitme algoritması:
+
+- Eğer `multi_class` seçeneği `ovr` olarak ayarlanmışsa, **bire karşı diğerleri (one-vs-rest, OvR) şemasını kullanır**
+- Eğer `multi_class` seçeneği `multinomial` olarak ayarlanmışsa, **çapraz düzensizlik yitimini/kaybını kullanır**. (Güncel olarak `multinomial` seçeneği yalnızca ‘lbfgs’, ‘sag’, ‘saga’ ve ‘newton-cg’ çözücüleriyle destekleniyor.)
+
+> :mortar_board: Buradaki 'şema' ya 'ovr' (one-vs-rest, yani bire karşı diğerleri) ya da 'multinomial' olabilir. Lojistik regresyon aslında ikili sınıflandırmayı desteklemek için tasarlandığından, bu şemalar onun çok sınıflı sınıflandırma görevlerini daha iyi ele alabilmesini sağlıyor. [kaynak](https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/)
+
+> :mortar_board: 'Çözücü', "eniyileştirme probleminde kullanılacak algoritma" olarak tanımlanır. [kaynak](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regressio#sklearn.linear_model.LogisticRegression)
+
+Scikit-learn, çözücülerin, farklı tür veri yapıları tarafından sunulan farklı meydan okumaları nasıl ele aldığını açıklamak için bu tabloyu sunar:
+
+![çözücüler](../images/solvers.png)
+
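+Bu eşleştirmenin kodda nasıl görünebileceğine dair küçük bir taslak aşağıda (yalnızca fikir vermek için; burada sadece sınıflandırıcı nesneleri oluşturuluyor, eğitme aşağıdaki alıştırmada yapılacak):
+
+```python
+from sklearn.linear_model import LogisticRegression
+
+# 'ovr' şeması liblinear çözücüsüyle eşleştirilebilir (bu derste kullanılacak kombinasyon)
+lr_ovr = LogisticRegression(multi_class='ovr', solver='liblinear')
+
+# 'multinomial' şeması yalnızca lbfgs, sag, saga ve newton-cg çözücüleriyle desteklenir
+lr_multinomial = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
+```
+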
+## Alıştırma - veriyi bölün
+
+Lojistik regresyonu önceki derste öğrendiğinizden, ilk eğitme denememiz için lojistik regresyona odaklanabiliriz.
+`train_test_split()` fonksiyonunu çağırarak verilerinizi eğitme ve sınama gruplarına bölün:
+
+```python
+X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)
+```
+
+## Alıştırma - lojistik regresyon uygulayın
+
+Çok sınıflı durumu kullandığınız için, hangi _şemayı_ kullanacağınızı ve hangi _çözücüyü_ ayarlayacağınızı seçmeniz gerekiyor. Eğitme için, çok sınıflı bir ayarda LogisticRegression'ı **liblinear** çözücüsüyle kullanın.
+
+1. multi_class'ı `ovr` ve solver'ı `liblinear` olarak ayarlayarak bir lojistik regresyon oluşturun:
+
+ ```python
+ lr = LogisticRegression(multi_class='ovr',solver='liblinear')
+ model = lr.fit(X_train, np.ravel(y_train))
+
+ accuracy = model.score(X_test, y_test)
+ print ("Accuracy is {}".format(accuracy))
+ ```
+
+    :white_check_mark: Genelde varsayılan olarak ayarlanan `lbfgs` gibi farklı bir çözücü deneyin (bu alıştırma listesinin hemen altında küçük bir taslak bulabilirsiniz).
+
+ > Not olarak, gerektiğinde verinizi düzleştirmek için Pandas [`ravel`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.ravel.html) fonksiyonunu kullanın.
+
+ Doğruluk **%80** üzerinde iyidir!
+
+1. Bir satır veriyi (#50) sınayarak bu modeli eylem halinde görebilirsiniz:
+
+ ```python
+ print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')
+ print(f'cuisine: {y_test.iloc[50]}')
+ ```
+
+ Sonuç bastırılır:
+
+ ```output
+ ingredients: Index(['cilantro', 'onion', 'pea', 'potato', 'tomato', 'vegetable_oil'], dtype='object')
+ cuisine: indian
+ ```
+
+ :white_check_mark: Farklı bir satır sayısı deneyin ve sonuçları kontrol edin
+
+1. Daha derinlemesine inceleyerek, bu öngörünün doğruluğunu kontrol edebilirsiniz:
+
+ ```python
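+    # tek satırı (1, özellik sayısı) şeklinde 2 boyutlu bir diziye dönüştürüyoruz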
+ test= X_test.iloc[50].values.reshape(-1, 1).T
+ proba = model.predict_proba(test)
+ classes = model.classes_
+ resultdf = pd.DataFrame(data=proba, columns=classes)
+
+ topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
+ topPrediction.head()
+ ```
+
+ Sonuç bastırılır - Hint mutfağı iyi olasılıkla en iyi öngörü:
+
+ | | 0 |
+ | -------: | -------: |
+ | indian | 0.715851 |
+ | chinese | 0.229475 |
+ | japanese | 0.029763 |
+ | korean | 0.017277 |
+ | thai | 0.007634 |
+
+    :white_check_mark: Modelin, bunun bir Hint mutfağı olduğundan nasıl emin olduğunu açıklayabilir misiniz?
+
+1. Regresyon derslerinde yaptığınız gibi, bir sınıflandırma raporu bastırarak daha fazla detay elde edin:
+
+ ```python
+ y_pred = model.predict(X_test)
+ print(classification_report(y_test,y_pred))
+ ```
+
+ | | precision | recall | f1-score | support |
+ | ------------ | --------- | ------ | -------- | ------- |
+ | chinese | 0.73 | 0.71 | 0.72 | 229 |
+ | indian | 0.91 | 0.93 | 0.92 | 254 |
+ | japanese | 0.70 | 0.75 | 0.72 | 220 |
+ | korean | 0.86 | 0.76 | 0.81 | 242 |
+ | thai | 0.79 | 0.85 | 0.82 | 254 |
+    | accuracy     |           |        | 0.80     | 1199    |
+ | macro avg | 0.80 | 0.80 | 0.80 | 1199 |
+ | weighted avg | 0.80 | 0.80 | 0.80 | 1199 |
+
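+Yukarıdaki çözücü önerisine dönersek, farklı bir çözücü denemek aşağıdaki gibi görünebilir; bu yalnızca küçük bir taslaktır ve yukarıdaki alıştırmada oluşturulan `X_train`, `X_test`, `y_train` ve `y_test` değişkenlerinin hazır olduğunu varsayar:
+
+```python
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+
+# Varsayılan lbfgs çözücüsüyle aynı modeli eğitin; lbfgs'in yakınsaması için max_iter artırıldı
+lr_lbfgs = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)
+model_lbfgs = lr_lbfgs.fit(X_train, np.ravel(y_train))
+print("lbfgs accuracy is {}".format(model_lbfgs.score(X_test, y_test)))
+```
+
+Sonucun liblinear ile elde ettiğiniz doğruluğa yakın olmasını bekleyebilirsiniz; küçük farklar çözücülerin eniyileştirme yaklaşımlarından kaynaklanır.
+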
+## :rocket: Meydan Okuma
+
+Bu derste, bir grup malzemeyi baz alarak bir ulusal mutfağı öngörebilen bir makine öğrenimi modeli oluşturmak için temiz verinizi kullandınız. Scikit-learn'ün veri sınıflandırmak için sağladığı birçok yöntemi okumak için biraz vakit ayırın. Arka tarafta neler olduğunu anlamak için 'çözücü' kavramını derinlemesine inceleyin.
+
+## [Ders sonrası kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/22/?loc=tr)
+
+## Gözden geçirme & kendi kendine çalışma
+
+[Bu destede](https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf) lojistik regresyonun arkasındaki matematiği derinlemesine inceleyin.
+## Ödev
+
+[Çözücüleri çalışın](assignment.tr.md)
diff --git a/4-Classification/2-Classifiers-1/translations/README.zh-cn.md b/4-Classification/2-Classifiers-1/translations/README.zh-cn.md
new file mode 100644
index 0000000000..4795333387
--- /dev/null
+++ b/4-Classification/2-Classifiers-1/translations/README.zh-cn.md
@@ -0,0 +1,242 @@
+# 菜品分类器1
+
+本节课程将使用你在上一个课程中所保存的全部经过均衡和清洗的菜品数据。
+
+你将使用此数据集和各种分类器,_根据一组配料预测这是哪一国家的美食_。在此过程中,你将学到更多用来权衡分类任务算法的方法
+
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/21/)
+# 准备工作
+
+假如你已经完成了[课程1](../../1-Introduction/translations/README.zh-cn.md), 确保在根目录的`/data`文件夹中有 _cleaned_cuisines.csv_ 这份文件来进行接下来的四节课程。
+
+## 练习 - 预测某国的菜品
+
+1. 在本节课的 _notebook.ipynb_ 文件中,导入Pandas,并读取相应的数据文件:
+
+ ```python
+ import pandas as pd
+    cuisines_df = pd.read_csv("../../data/cleaned_cuisines.csv")
+ cuisines_df.head()
+ ```
+
+ 数据如下所示:
+
+ ```output
+ | | Unnamed: 0 | cuisine | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+ | --- | ---------- | ------- | ------ | -------- | ----- | ---------- | ----- | ------------ | ------- | -------- | --- | ------- | ----------- | ---------- | ----------------------- | ---- | ---- | --- | ----- | ------ | -------- |
+ | 0 | 0 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 1 | indian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 2 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 3 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 4 | indian | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+ ```
+
+1. 现在,再多导入一些库:
+
+ ```python
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
+ from sklearn.svm import SVC
+ import numpy as np
+ ```
+
+1. 接下来需要将数据分为训练模型所需的X(译者注:代表特征数据)和y(译者注:代表标签数据)两个dataframe。首先可将`cuisine`列的数据单独保存为一个dataframe作为标签(label)。
+
+ ```python
+ cuisines_label_df = cuisines_df['cuisine']
+ cuisines_label_df.head()
+ ```
+
+ 输出如下:
+
+ ```output
+ 0 indian
+ 1 indian
+ 2 indian
+ 3 indian
+ 4 indian
+ Name: cuisine, dtype: object
+ ```
+
+1. 调用`drop()`方法将 `Unnamed: 0`和 `cuisine`列删除,并将余下的数据作为可以用于训练的特征(feature)数据:
+
+ ```python
+ cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
+ cuisines_feature_df.head()
+ ```
+
+ 你的特征集看上去将会是这样:
+
+ | | almond | angelica | anise | anise_seed | apple | apple_brandy | apricot | armagnac | artemisia | artichoke | ... | whiskey | white_bread | white_wine | whole_grain_wheat_flour | wine | wood | yam | yeast | yogurt | zucchini |
+ | ---: | -----: | -------: | ----: | ---------: | ----: | -----------: | ------: | -------: | --------: | --------: | ---: | ------: | ----------: | ---------: | ----------------------: | ---: | ---: | ---: | ----: | -----: | -------- |
+ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+ | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
+
+现在,你已经准备好可以开始训练你的模型了!
+
+## 选择你的分类器
+
+你的数据已经清洗干净并已经准备好可以进行训练了,现在需要决定你想要使用的算法来完成这项任务。
+
+Scikit-learn将分类任务归在了监督学习类别中,在这个类别中你可以找到很多可以用来分类的方法。乍一看上去,有点[琳琅满目](https://scikit-learn.org/stable/supervised_learning.html)。以下这些算法都可以用于分类:
+
+- 线性模型(Linear Models)
+- 支持向量机(Support Vector Machines)
+- 随机梯度下降(Stochastic Gradient Descent)
+- 最近邻(Nearest Neighbors)
+- 高斯过程(Gaussian Processes)
+- 决策树(Decision Trees)
+- 集成方法(投票分类器)(Ensemble methods(voting classifier))
+- 多类别多输出算法(多类别多标签分类,多类别多输出分类)(Multiclass and multioutput algorithms (multiclass and multilabel classification, multiclass-multioutput classification))
+
+> 你也可以使用[神经网络来分类数据](https://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification), 但这对于本课程来说有点超纲了。
+
+### 如何选择分类器?
+
+那么,你应该如何从中选择分类器呢?一般来说,可以选择多个分类器并对比他们的运行结果。Scikit-learn提供了各种算法(包括KNeighbors、 SVC two ways、 GaussianProcessClassifier、 DecisionTreeClassifier、 RandomForestClassifier、 MLPClassifier、 AdaBoostClassifier、 GaussianNB以及QuadraticDiscriminantAnalysis)的[对比](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html),并且将结果进行了可视化的展示:
+
+![各分类器比较](../images/comparison.png)
+> 图表来源于Scikit-learn的官方文档
+
+> AutoML通过在云端运行这些算法并进行对比,非常巧妙地解决了算法选择的问题,能帮助你根据数据集的特点来选择最佳的算法。试试点击[这里](https://docs.microsoft.com/learn/modules/automate-model-selection-with-azure-automl/?WT.mc_id=academic-15963-cxa)了解更多。
+
+### 另外一种效果更佳的分类器选择方法
+
+比起无脑地猜测,你可以下载这份[机器学习小抄(cheatsheet)](https://docs.microsoft.com/azure/machine-learning/algorithm-cheat-sheet?WT.mc_id=academic-15963-cxa)。这里面将各算法进行了比较,能更有效地帮助我们选择算法。根据这份小抄,我们可以找到要完成本课程中涉及的多类型的分类任务,可以有以下这些选择:
+
+![多类型问题作弊表](../images/cheatsheet.png)
+> 微软算法小抄中部分关于多类型分类任务可选算法
+
+✅ 下载这份小抄,并打印出来,挂在你的墙上吧!
+
+### 选择的流程
+
+让我们根据所有限制条件依次对各种算法的可行性进行判断:
+
+- **神经网络(Neural Network)太过复杂了**。我们的数据很清晰但数据量比较小,此外我们是通过notebook在本地进行训练的,神经网络对于这个任务来说过于复杂了。
+- **二分类法(two-class classifier)是不可行的**。我们不能使用二分类法,所以这就排除了一对多(one-vs-all)算法。
+- **可以选择决策树以及逻辑回归算法**。决策树应该是可行的,此外也可以使用逻辑回归来处理多类型数据。
+- **多类型增强决策树是用于解决其他问题的**. 多类型增强决策树最适合的是非参数化的任务,即任务目标是建立一个排序,这对我们当前的任务并没有作用。
+
+### 使用Scikit-learn
+
+我们将会使用Scikit-learn来对我们的数据进行分析。然而在Scikit-learn中使用逻辑回归也有很多方法。可以先了解一下逻辑回归算法需要[传递的参数](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regressio#sklearn.linear_model.LogisticRegression)。
+
+当我们需要Scikit-learn进行逻辑回归运算时,`multi_class` 以及 `solver`是最重要的两个参数,因此我们需要特别说明一下。 `multi_class` 是分类方式选择参数,而`solver`是优化算法选择参数。值得注意的是,并不是所有的solver都可以与`multi_class`参数进行匹配。
+
+根据官方文档,在多类型分类问题中:
+
+- 当`multi_class`被设置为`ovr`时,将使用 **“一对其余”(OvR)策略(scheme)**。
+- 当`multi_class`被设置为`multinomial`时,则使用的是**交叉熵损失(cross entropy loss)** 作为损失函数。(注意,目前`multinomial`只支持‘lbfgs’, ‘sag’, ‘saga’以及‘newton-cg’等solver作为损失函数的优化方法)
+
+> 🎓 在本课程的任务中“scheme”可以是“ovr(one-vs-rest)”也可以是“multinomial”。因为逻辑回归本来是设计来用于进行二分类任务的,这两个scheme参数的选择都可以使得逻辑回归很好的完成多类型分类任务。[来源](https://machinelearningmastery.com/one-vs-rest-and-one-vs-one-for-multi-class-classification/)
+
+> 🎓 “solver”被定义为是"用于解决优化问题的算法"。[来源](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regressio#sklearn.linear_model.LogisticRegression).
+
+Scikit-learn提供了以下这个表格来解释各种solver是如何应对不同的数据结构所带来的不同挑战的:
+
+![solvers](../images/solvers.png)
+
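+下面是一个简单的示意(仅作演示:这里只构建分类器对象,训练将在下面的练习中进行),展示这两个参数如何搭配使用:
+
+```python
+from sklearn.linear_model import LogisticRegression
+
+# 'ovr' 方案可以与 liblinear solver 搭配(本课后面会用到这一组合)
+lr_ovr = LogisticRegression(multi_class='ovr', solver='liblinear')
+
+# 'multinomial' 方案目前只支持 lbfgs、sag、saga 和 newton-cg 等 solver
+lr_multinomial = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
+```
+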
+## 练习 - 分割数据
+
+因为你刚刚在上一节课中学习了逻辑回归,我们这里就通过逻辑回归算法,来演练一下如何进行你的第一个机器学习模型的训练。首先,调用`train_test_split()`方法把你的数据分割成训练集和测试集:
+
+
+```python
+X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)
+```
+
+## 练习 - 调用逻辑回归算法
+
+接下来,你需要决定选用什么 _scheme_ 以及 _solver_ 来进行我们这个多类型分类的案例。在这里我们使用LogisticRegression方法,并设置相应的multi_class参数,同时将solver设置为**liblinear**来进行模型训练。
+
+1. 创建一个逻辑回归模型,并将multi_class设置为`ovr`,同时将solver设置为 `liblinear`:
+
+ ```python
+ lr = LogisticRegression(multi_class='ovr',solver='liblinear')
+ model = lr.fit(X_train, np.ravel(y_train))
+
+ accuracy = model.score(X_test, y_test)
+ print ("Accuracy is {}".format(accuracy))
+ ```
+
+    ✅ 也可以试试其他solver,比如`lbfgs`(这也是默认参数);在本练习列表之后有一个简单的对比示例。
+
+ > 注意, 使用Pandas的[`ravel`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.ravel.html) 方法可以在需要的时候将你的数据进行降维
+
+ 运算之后,可以看到准确率高达了**80%**!
+
+1. 你也可以通过查看某一行数据(比如第50行)来观测到模型运行的情况:
+
+ ```python
+ print(f'ingredients: {X_test.iloc[50][X_test.iloc[50]!=0].keys()}')
+ print(f'cuisine: {y_test.iloc[50]}')
+ ```
+
+ 运行后的输出如下:
+
+ ```output
+ ingredients: Index(['cilantro', 'onion', 'pea', 'potato', 'tomato', 'vegetable_oil'], dtype='object')
+ cuisine: indian
+ ```
+
+ ✅ 试试不同的行索引来检查一下计算的结果吧
+
+1. 我们可以再进行一步深入的研究,检查一下本轮预测结果的准确率:
+
+ ```python
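+    # 将这一行数据转换为形状为 (1, 特征数) 的二维数组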
+ test= X_test.iloc[50].values.reshape(-1, 1).T
+ proba = model.predict_proba(test)
+ classes = model.classes_
+ resultdf = pd.DataFrame(data=proba, columns=classes)
+
+ topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
+ topPrediction.head()
+ ```
+
+ 运行后的输出如下———可以发现这是一道印度菜的可能性最大,是最合理的猜测:
+
+ | | 0 |
+ | -------: | -------: |
+ | indian | 0.715851 |
+ | chinese | 0.229475 |
+ | japanese | 0.029763 |
+ | korean | 0.017277 |
+ | thai | 0.007634 |
+
+ ✅ 你能解释下为什么模型会如此确定这是一道印度菜么?
+
+1. 和你在之前的回归的课程中所做的一样,我们也可以通过输出分类的报告得到关于模型的更多的细节:
+
+ ```python
+ y_pred = model.predict(X_test)
+ print(classification_report(y_test,y_pred))
+ ```
+
+    |              | precision | recall | f1-score | support |
+ | ------------ | ------ | -------- | ------- | ---- |
+ | chinese | 0.73 | 0.71 | 0.72 | 229 |
+ | indian | 0.91 | 0.93 | 0.92 | 254 |
+ | japanese | 0.70 | 0.75 | 0.72 | 220 |
+ | korean | 0.86 | 0.76 | 0.81 | 242 |
+ | thai | 0.79 | 0.85 | 0.82 | 254 |
+    | accuracy     |        |      | 0.80     | 1199    |
+ | macro avg | 0.80 | 0.80 | 0.80 | 1199 |
+ | weighted avg | 0.80 | 0.80 | 0.80 | 1199 |
+
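+回到上面关于尝试其他 solver 的提示,下面是一个简单的示例(假设上面练习中创建的 `X_train`、`X_test`、`y_train`、`y_test` 变量仍然可用):
+
+```python
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+
+# 改用默认的 lbfgs solver 训练同样的模型;增大 max_iter 以便 lbfgs 收敛
+lr_lbfgs = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)
+model_lbfgs = lr_lbfgs.fit(X_train, np.ravel(y_train))
+print("lbfgs accuracy is {}".format(model_lbfgs.score(X_test, y_test)))
+```
+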
+## 挑战
+
+在本课程中,你使用了清洗后的数据建立了一个机器学习的模型,这个模型能够根据输入的一系列的配料来预测菜品来自于哪个国家。请再花点时间阅读一下Scikit-learn所提供的关于可以用来分类数据的其他方法的资料。此外,你也可以深入研究一下“solver”的概念并尝试一下理解其背后的原理。
+
+## [课后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/22/)
+## 回顾与自学
+
+[这个课程](https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf)将对逻辑回归背后的数学原理进行更加深入的讲解
+
+## 作业
+
+[学习solver](assignment.md)
diff --git a/4-Classification/2-Classifiers-1/translations/assignment.it.md b/4-Classification/2-Classifiers-1/translations/assignment.it.md
new file mode 100644
index 0000000000..80d1c5e16d
--- /dev/null
+++ b/4-Classification/2-Classifiers-1/translations/assignment.it.md
@@ -0,0 +1,10 @@
+# Studiare i risolutori
+## Istruzioni
+
+In questa lezione si è imparato a conoscere i vari risolutori che associano algoritmi a un processo di machine learning per creare un modello accurato. Esaminare i risolutori elencati nella lezione e sceglierne due. Con parole proprie, confrontare questi due risolutori. Che tipo di problema affrontano? Come funzionano con varie strutture di dati? Perché se ne dovrebbe sceglierne uno piuttosto che un altro?
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ---------------------------------------------------------------------------------------------- | ------------------------------------------------ | ---------------------------- |
+| | Viene presentato un file .doc con due paragrafi, uno su ciascun risolutore, confrontandoli attentamente. | Un file .doc viene presentato con un solo paragrafo | Il compito è incompleto |
diff --git a/4-Classification/2-Classifiers-1/translations/assignment.tr.md b/4-Classification/2-Classifiers-1/translations/assignment.tr.md
new file mode 100644
index 0000000000..10d4c64f3f
--- /dev/null
+++ b/4-Classification/2-Classifiers-1/translations/assignment.tr.md
@@ -0,0 +1,9 @@
+# Çözücüleri çalışın
+## Yönergeler
+
+Bu derste, doğru bir model yaratmak için algoritmaları bir makine öğrenimi süreciyle eşleştiren çeşitli çözücüleri öğrendiniz. Derste sıralanan çözücüleri inceleyin ve iki tanesini seçin. Kendi cümlelerinizle, bu iki çözücünün benzerliklerini ve farklılıklarını bulup yazın. Ne tür problemleri ele alıyorlar? Çeşitli veri yapılarıyla nasıl çalışıyorlar? Birini diğerine neden tercih ederdiniz?
+## Rubrik
+
+| Ölçüt | Örnek Alınacak Nitelikte | Yeterli | Geliştirme Gerekli |
+| -------- | -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------ | ---------------------------- |
+| | Her biri bir çözücü üzerine yazılmış, onları dikkatle karşılaştıran ve iki paragraf içeren bir .doc dosyası sunulmuş | Bir paragraf içeren bir .doc dosyası sunulmuş | Görev tamamlanmamış |
diff --git a/4-Classification/3-Classifiers-2/README.md b/4-Classification/3-Classifiers-2/README.md
index dd25926e12..291b3a7515 100644
--- a/4-Classification/3-Classifiers-2/README.md
+++ b/4-Classification/3-Classifiers-2/README.md
@@ -2,11 +2,11 @@
In this second classification lesson, you will explore more ways to classify numeric data. You will also learn about the ramifications for choosing one classifier over the other.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/23/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/23/)
### Prerequisite
-We assume that you have completed the previous lessons and have a cleaned dataset in your `data` folder called _cleaned_cuisine.csv_ in the root of this 4-lesson folder.
+We assume that you have completed the previous lessons and have a cleaned dataset in your `data` folder called _cleaned_cuisines.csv_ in the root of this 4-lesson folder.
### Preparation
@@ -224,7 +224,7 @@ This method of Machine Learning "combines the predictions of several base estima
Each of these techniques has a large number of parameters that you can tweak. Research each one's default parameters and think about what tweaking these parameters would mean for the model's quality.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/24/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/24/)
## Review & Self Study
diff --git a/4-Classification/3-Classifiers-2/notebook.ipynb b/4-Classification/3-Classifiers-2/notebook.ipynb
index f4dec474dd..4659a7b623 100644
--- a/4-Classification/3-Classifiers-2/notebook.ipynb
+++ b/4-Classification/3-Classifiers-2/notebook.ipynb
@@ -47,7 +47,7 @@
],
"source": [
"import pandas as pd\n",
- "cuisines_df = pd.read_csv(\"../data/cleaned_cuisine.csv\")\n",
+ "cuisines_df = pd.read_csv(\"../data/cleaned_cuisines.csv\")\n",
"cuisines_df.head()"
]
},
diff --git a/4-Classification/3-Classifiers-2/solution/notebook.ipynb b/4-Classification/3-Classifiers-2/solution/notebook.ipynb
index d953c603d4..a089b21fa6 100644
--- a/4-Classification/3-Classifiers-2/solution/notebook.ipynb
+++ b/4-Classification/3-Classifiers-2/solution/notebook.ipynb
@@ -47,7 +47,7 @@
],
"source": [
"import pandas as pd\n",
- "cuisines_df = pd.read_csv(\"../../data/cleaned_cuisine.csv\")\n",
+ "cuisines_df = pd.read_csv(\"../../data/cleaned_cuisines.csv\")\n",
"cuisines_df.head()"
]
},
diff --git a/4-Classification/3-Classifiers-2/translations/README.it.md b/4-Classification/3-Classifiers-2/translations/README.it.md
new file mode 100644
index 0000000000..3d011db879
--- /dev/null
+++ b/4-Classification/3-Classifiers-2/translations/README.it.md
@@ -0,0 +1,235 @@
+# Classificatori di cucina 2
+
+In questa seconda lezione sulla classificazione, si esploreranno più modi per classificare i dati numerici. Si impareranno anche le implicazioni della scelta di un classificatore rispetto a un altro.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/23/)
+
+### Prerequisito
+
+Si parte dal presupposto che siano state completate le lezioni precedenti e si disponga di un insieme di dati pulito nella cartella `data` chiamato _cleaned_cuisines.csv_ nella radice di questa cartella di 4 lezioni.
+
+### Preparazione
+
+Il file _notebook.ipynb_ è stato caricato con l'insieme di dati pulito ed è stato diviso in dataframe di dati X e y, pronti per il processo di creazione del modello.
+
+## Una mappa di classificazione
+
+In precedenza, si sono apprese le varie opzioni a disposizione durante la classificazione dei dati utilizzando il cheat sheet di Microsoft. Scikit-learn offre un cheat sheet simile, ma più granulare che può aiutare ulteriormente a restringere i propri stimatori (un altro termine per i classificatori):
+
+![Mappa ML da Scikit-learn](../images/map.png)
+> Suggerimento: [visitare questa mappa online](https://scikit-learn.org/stable/tutorial/machine_learning_map/) e fare clic lungo il percorso per leggere la documentazione.
+
+### Il piano
+
+Questa mappa è molto utile una volta che si ha una chiara comprensione dei propri dati, poiché si può "camminare" lungo i suoi percorsi verso una decisione:
+
+- Ci sono >50 campioni
+- Si vuole pronosticare una categoria
+- I dati sono etichettati
+- Ci sono meno di 100K campioni
+- ✨ Si può scegliere un SVC lineare
+- Se non funziona, visto che ci sono dati numerici
+ - Si può provare un ✨ KNeighbors Classifier
+ - Se non funziona, si prova ✨ SVC e ✨ Classificatori di ensemble
+
+Questo è un percorso molto utile da seguire.
+
+## Esercizio: dividere i dati
+
+Seguendo questo percorso, si dovrebbe iniziare importando alcune librerie da utilizzare.
+
+1. Importare le librerie necessarie:
+
+ ```python
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.svm import SVC
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
+ import numpy as np
+ ```
+
+1. Dividere i dati per allenamento e test:
+
+ ```python
+ X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)
+ ```
+
+## Classificatore lineare SVC
+
+Il clustering Support-Vector (SVC) è figlio della famiglia di tecniche ML Support-Vector (ulteriori informazioni su queste di seguito). In questo metodo, si può scegliere un "kernel" per decidere come raggruppare le etichette. Il parametro 'C' si riferisce alla 'regolarizzazione' che regola l'influenza dei parametri. Il kernel può essere uno dei [tanti](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC); qui si imposta su 'lineare' per assicurarsi di sfruttare l'SVC lineare. Il valore predefinito di probabilità è 'false'; qui si imposta su 'true' per raccogliere stime di probabilità. Si imposta lo stato casuale su "0" per mescolare i dati per ottenere le probabilità.
+
+### Esercizio: applicare una SVC lineare
+
+Iniziare creando un array di classificatori. Si aggiungerà progressivamente a questo array durante il test.
+
+1. Iniziare con un SVC lineare:
+
+ ```python
+ C = 10
+ # Create different classifiers.
+ classifiers = {
+ 'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0)
+ }
+ ```
+
+2. Addestrare il modello utilizzando Linear SVC e stampare un rapporto:
+
+ ```python
+ n_classifiers = len(classifiers)
+
+ for index, (name, classifier) in enumerate(classifiers.items()):
+ classifier.fit(X_train, np.ravel(y_train))
+
+ y_pred = classifier.predict(X_test)
+ accuracy = accuracy_score(y_test, y_pred)
+ print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
+ print(classification_report(y_test,y_pred))
+ ```
+
+ Il risultato è abbastanza buono:
+
+ ```output
+ Accuracy (train) for Linear SVC: 78.6%
+ precision recall f1-score support
+
+ chinese 0.71 0.67 0.69 242
+ indian 0.88 0.86 0.87 234
+ japanese 0.79 0.74 0.76 254
+ korean 0.85 0.81 0.83 242
+ thai 0.71 0.86 0.78 227
+
+ accuracy 0.79 1199
+ macro avg 0.79 0.79 0.79 1199
+ weighted avg 0.79 0.79 0.79 1199
+ ```
+
+## Classificatore K-Neighbors
+
+K-Neighbors fa parte della famiglia dei metodi ML "neighbors" (vicini), che possono essere utilizzati sia per l'apprendimento supervisionato che non supervisionato. In questo metodo, viene creato un numero predefinito di punti e i dati vengono raccolti attorno a questi punti in modo tale da poter prevedere etichette generalizzate per i dati.
+
+### Esercizio: applicare il classificatore K-Neighbors
+
+Il classificatore precedente era buono e funzionava bene con i dati, ma forse si può ottenere una maggiore precisione. Provare un classificatore K-Neighbors.
+
+1. Aggiungere una riga all'array classificatore (aggiungere una virgola dopo l'elemento Linear SVC):
+
+ ```python
+ 'KNN classifier': KNeighborsClassifier(C),
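+    # nota: il primo argomento di KNeighborsClassifier è n_neighbors; qui si riusa la variabile C (= 10) come numero di vicini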
+ ```
+
+ Il risultato è un po' peggio:
+
+ ```output
+ Accuracy (train) for KNN classifier: 73.8%
+ precision recall f1-score support
+
+ chinese 0.64 0.67 0.66 242
+ indian 0.86 0.78 0.82 234
+ japanese 0.66 0.83 0.74 254
+ korean 0.94 0.58 0.72 242
+ thai 0.71 0.82 0.76 227
+
+ accuracy 0.74 1199
+ macro avg 0.76 0.74 0.74 1199
+ weighted avg 0.76 0.74 0.74 1199
+ ```
+
+ ✅ Scoprire [K-Neighbors](https://scikit-learn.org/stable/modules/neighbors.html#neighbors)
+
+## Classificatore Support Vector
+
+I classificatori Support-Vector fanno parte della famiglia di metodi ML [Support-Vector Machine](https://it.wikipedia.org/wiki/Macchine_a_vettori_di_supporto) utilizzati per le attività di classificazione e regressione. Le SVM "mappano esempi di addestramento in punti nello spazio" per massimizzare la distanza tra due categorie. I dati successivi vengono mappati in questo spazio in modo da poter prevedere la loro categoria.
+
+### Esercizio: applicare un classificatore di vettori di supporto
+
+Si prova a ottenere una precisione leggermente migliore con un classificatore di vettori di supporto.
+
+1. Aggiungere una virgola dopo l'elemento K-Neighbors, quindi aggiungere questa riga:
+
+ ```python
+ 'SVC': SVC(),
+ ```
+
+ Il risultato è abbastanza buono!
+
+ ```output
+ Accuracy (train) for SVC: 83.2%
+ precision recall f1-score support
+
+ chinese 0.79 0.74 0.76 242
+ indian 0.88 0.90 0.89 234
+ japanese 0.87 0.81 0.84 254
+ korean 0.91 0.82 0.86 242
+ thai 0.74 0.90 0.81 227
+
+ accuracy 0.83 1199
+ macro avg 0.84 0.83 0.83 1199
+ weighted avg 0.84 0.83 0.83 1199
+ ```
+
+ ✅ Scoprire i vettori di [supporto](https://scikit-learn.org/stable/modules/svm.html#svm)
+
+## Classificatori ensemble
+
+Si segue il percorso fino alla fine, anche se il test precedente è stato abbastanza buono. Si provano un po' di classificatori di ensemble, nello specifico Random Forest e AdaBoost:
+
+```python
+'RFST': RandomForestClassifier(n_estimators=100),
+ 'ADA': AdaBoostClassifier(n_estimators=100)
+```
+
+Il risultato è molto buono, soprattutto per Random Forest:
+
+```output
+Accuracy (train) for RFST: 84.5%
+ precision recall f1-score support
+
+ chinese 0.80 0.77 0.78 242
+ indian 0.89 0.92 0.90 234
+ japanese 0.86 0.84 0.85 254
+ korean 0.88 0.83 0.85 242
+ thai 0.80 0.87 0.83 227
+
+ accuracy 0.84 1199
+ macro avg 0.85 0.85 0.84 1199
+weighted avg 0.85 0.84 0.84 1199
+
+Accuracy (train) for ADA: 72.4%
+ precision recall f1-score support
+
+ chinese 0.64 0.49 0.56 242
+ indian 0.91 0.83 0.87 234
+ japanese 0.68 0.69 0.69 254
+ korean 0.73 0.79 0.76 242
+ thai 0.67 0.83 0.74 227
+
+ accuracy 0.72 1199
+ macro avg 0.73 0.73 0.72 1199
+weighted avg 0.73 0.72 0.72 1199
+```
+
+✅ Ulteriori informazioni sui [classificatori di ensemble](https://scikit-learn.org/stable/modules/ensemble.html)
+
+Questo metodo di Machine Learning "combina le previsioni di diversi stimatori di base" per migliorare la qualità del modello. In questo esempio, si è utilizzato Random Trees e AdaBoost.
+
+- [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forest), un metodo di calcolo della media, costruisce una "foresta" di "alberi decisionali" infusi di casualità per evitare il sovradattamento. Il parametro n_estimators è impostato sul numero di alberi.
+
+- [AdaBoost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html) adatta un classificatore a un insieme di dati e quindi adatta le copie di quel classificatore allo stesso insieme di dati. Si concentra sui pesi degli elementi classificati in modo errato e regola l'adattamento per il successivo classificatore da correggere.
+
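+Come semplice abbozzo (si presuppone che le variabili `X_train`, `X_test`, `y_train` e `y_test` create sopra siano ancora disponibili), si può osservare l'effetto del parametro n_estimators riaddestrando una Random Forest con un numero diverso di alberi:
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+import numpy as np
+
+# stesso classificatore, ma con più alberi: n_estimators controlla la dimensione della "foresta"
+rfst_300 = RandomForestClassifier(n_estimators=300)
+rfst_300.fit(X_train, np.ravel(y_train))
+y_pred_300 = rfst_300.predict(X_test)
+print("Accuracy (train) for RFST con 300 alberi: %0.1f%% " % (accuracy_score(y_test, y_pred_300) * 100))
+```
+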
+---
+
+## 🚀 Sfida
+
+Ognuna di queste tecniche ha un gran numero di parametri che si possono modificare. Ricercare i parametri predefiniti di ciascuno e pensare a cosa significherebbe modificare questi parametri per la qualità del modello.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/24/)
+
+## Revisione e Auto Apprendimento
+
+C'è molto gergo in queste lezioni, quindi si prenda un minuto per rivedere [questo elenco](https://docs.microsoft.com/dotnet/machine-learning/resources/glossary?WT.mc_id=academic-15963-cxa) di terminologia utile!
+
+## Compito
+
+[Giocare coi parametri](assignment.it.md)
diff --git a/4-Classification/3-Classifiers-2/translations/README.tr.md b/4-Classification/3-Classifiers-2/translations/README.tr.md
new file mode 100644
index 0000000000..56c1e11e9f
--- /dev/null
+++ b/4-Classification/3-Classifiers-2/translations/README.tr.md
@@ -0,0 +1,235 @@
+# Mutfak sınıflandırıcıları 2
+
+Bu ikinci sınıflandırma dersinde, sayısal veriyi sınıflandırmak için daha fazla yöntem öğreneceksiniz. Ayrıca, bir sınıflandırıcıyı diğerlerine tercih etmenin sonuçlarını da öğreneceksiniz.
+
+## [Ders öncesi kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/23/?loc=tr)
+
+### Ön koşul
+
+Önceki dersleri tamamladığınızı ve bu 4-ders klasörünün kökündeki `data` klasörünüzde _cleaned_cuisines.csv_ adlı temizlenmiş bir veri setinin bulunduğunu varsayıyoruz.
+
+### Hazırlık
+
+Temizlenmiş veri setiyle _notebook.ipynb_ dosyanızı yükledik ve model oluşturma sürecine hazır olması için X ve y veri iskeletlerine böldük.
+
+## Bir sınıflandırma haritası
+
+Daha önce, Microsoft'un kopya kağıdını kullanarak veri sınıflandırmanın çeşitli yollarını öğrendiniz. Scikit-learn de buna benzer, öngörücülerinizi (sınıflandırıcı) sınırlandırmanıza ilaveten yardım edecek bir kopya kağıdı sunar.
+
+![Scikit-learn'den Makine Öğrenimi Haritası](../images/map.png)
+> Tavsiye: [Bu haritayı çevrim içi ziyaret edin](https://scikit-learn.org/stable/tutorial/machine_learning_map/) ve rotayı seyrederken dokümantasyonu okumak için tıklayın.
+
+### Plan
+
+Verinizi iyice kavradığınızda bu harita çok faydalı olacaktır, çünkü karara ulaşırken rotalarında 'yürüyebilirsiniz':
+
+- >50 adet örneğimiz var
+- Bir kategori öngörmek istiyoruz
+- Etiketlenmiş veri var
+- 100 binden az örneğimiz var
+- :sparkles: Bir Linear SVC (Doğrusal Destek Vektör Sınıflandırma) seçebiliriz
+- Eğer bu işe yaramazsa, verimiz sayısal olduğundan
+ - :sparkles: Bir KNeighbors (K Komşu) Sınıflandırıcı deneyebiliriz
+ - Eğer bu işe yaramazsa, :sparkles: SVC (Destek Vektör Sınıflandırma) ve :sparkles: Ensemble (Topluluk) Sınıflandırıcılarını deneyin
+
+Bu çok faydalı bir yol.
+
+## Alıştırma - veriyi bölün
+
+Bu yolu takip ederek, kullanmak için bazı kütüphaneleri alarak başlamalıyız.
+
+1. Gerekli kütüphaneleri alın:
+
+ ```python
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.svm import SVC
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
+ import numpy as np
+ ```
+
+1. Eğitme ve sınama verinizi bölün:
+
+ ```python
+ X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)
+ ```
+
+## Linear SVC Sınıflandırıcısı
+
+Destek Vektör kümeleme (SVC), makine öğrenimi yöntemlerinden Destek Vektör Makinelerinin (Aşağıda bunun hakkında daha fazla bilgi edineceksiniz.) alt dallarından biridir. Bu yöntemde, etiketleri nasıl kümeleyeceğinize karar vermek için bir 'kernel' seçebilirsiniz. 'C' parametresi 'düzenlileştirme'yi ifade eder ve parametrelerin etkilerini düzenler. Kernel (çekirdek) [birçoğundan](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC) biri olabilir; burada, doğrusal SVC leveraj ettiğimizden emin olmak için, 'linear' olarak ayarlıyoruz. Olasılık varsayılan olarak 'false' olarak ayarlıdır; burada, olasılık öngörülerini toplamak için, 'true' olarak ayarlıyoruz. Rastgele durumu (random state), olasılıkları elde etmek için veriyi karıştırmak (shuffle) üzere, '0' olarak ayarlıyoruz.
+
+### Alıştırma - doğrusal SVC uygulayın
+
+Sınıflandırıcılardan oluşan bir dizi oluşturarak başlayın. Sınadıkça bu diziye ekleme yapacağız.
+
+1. Linear SVC ile başlayın:
+
+ ```python
+ C = 10
+ # Create different classifiers.
+ classifiers = {
+ 'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0)
+ }
+ ```
+
+2. Linear SVC kullanarak modelinizi eğitin ve raporu bastırın:
+
+ ```python
+ n_classifiers = len(classifiers)
+
+ for index, (name, classifier) in enumerate(classifiers.items()):
+ classifier.fit(X_train, np.ravel(y_train))
+
+ y_pred = classifier.predict(X_test)
+ accuracy = accuracy_score(y_test, y_pred)
+ print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
+ print(classification_report(y_test,y_pred))
+ ```
+
+ Sonuç oldukça iyi:
+
+ ```output
+ Accuracy (train) for Linear SVC: 78.6%
+ precision recall f1-score support
+
+ chinese 0.71 0.67 0.69 242
+ indian 0.88 0.86 0.87 234
+ japanese 0.79 0.74 0.76 254
+ korean 0.85 0.81 0.83 242
+ thai 0.71 0.86 0.78 227
+
+ accuracy 0.79 1199
+ macro avg 0.79 0.79 0.79 1199
+ weighted avg 0.79 0.79 0.79 1199
+ ```
+
+## K-Komşu sınıflandırıcısı
+
+K-Komşu, makine öğrenimi yöntemlerinden "neighbors" (komşular) ailesinin bir parçasıdır ve gözetimli ve gözetimsiz öğrenmenin ikisinde de kullanılabilir. Bu yöntemde, önceden tanımlanmış sayıda nokta üretilir ve veri bu noktalar etrafında, genelleştirilmiş etiketlerin veriler için öngörülebileceği şekilde toplanır.
+
+### Alıştırma - K-Komşu sınıflandırıcısını uygulayın
+
+Önceki sınıflandırıcı iyiydi ve veriyle iyi çalıştı, ancak belki daha iyi bir doğruluk elde edebiliriz. K-Komşu sınıflandırıcısını deneyin.
+
+1. Sınıflandırıcı dizinize bir satır ekleyin (Linear SVC ögesinden sonra bir virgül ekleyin):
+
+ ```python
+ 'KNN classifier': KNeighborsClassifier(C),
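+    # not: KNeighborsClassifier'ın ilk parametresi n_neighbors'tır; burada yukarıda tanımlanan C (= 10) komşu sayısı olarak kullanılıyor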
+ ```
+
+ Sonuç biraz daha kötü:
+
+ ```output
+ Accuracy (train) for KNN classifier: 73.8%
+ precision recall f1-score support
+
+ chinese 0.64 0.67 0.66 242
+ indian 0.86 0.78 0.82 234
+ japanese 0.66 0.83 0.74 254
+ korean 0.94 0.58 0.72 242
+ thai 0.71 0.82 0.76 227
+
+ accuracy 0.74 1199
+ macro avg 0.76 0.74 0.74 1199
+ weighted avg 0.76 0.74 0.74 1199
+ ```
+
+ :white_check_mark: [K-Komşu](https://scikit-learn.org/stable/modules/neighbors.html#neighbors) hakkında bilgi edinin
+
+## Destek Vektör Sınıflandırıcısı
+
+Destek Vektör sınıflandırıcıları, makine öğrenimi yöntemlerinden [Destek Vektörü Makineleri](https://wikipedia.org/wiki/Support-vector_machine) ailesinin bir parçasıdır ve sınıflandırma ve regresyon görevlerinde kullanılır. SVM'ler (Destek Vektör Makineleri), iki kategori arasındaki uzaklığı en yükseğe getirmek için eğitme örneklerini boşluktaki noktalara eşler. Sonraki veri, kategorisinin öngörülebilmesi için bu boşluğa eşlenir.
+
+### Alıştırma - bir Destek Vektör Sınıflandırıcısı uygulayın
+
+Bir Destek Vektör Sınıflandırıcısı ile daha iyi bir doğruluk elde etmeye çalışalım.
+
+1. K-Neighbors ögesinden sonra bir virgül ekleyin, sonra bu satırı ekleyin:
+
+ ```python
+ 'SVC': SVC(),
+ ```
+
+ Sonuç oldukça iyi!
+
+ ```output
+ Accuracy (train) for SVC: 83.2%
+ precision recall f1-score support
+
+ chinese 0.79 0.74 0.76 242
+ indian 0.88 0.90 0.89 234
+ japanese 0.87 0.81 0.84 254
+ korean 0.91 0.82 0.86 242
+ thai 0.74 0.90 0.81 227
+
+ accuracy 0.83 1199
+ macro avg 0.84 0.83 0.83 1199
+ weighted avg 0.84 0.83 0.83 1199
+ ```
+
+ :white_check_mark: [Destek Vektörleri](https://scikit-learn.org/stable/modules/svm.html#svm) hakkında bilgi edinin
+
+## Topluluk Sınıflandırıcıları
+
+Önceki sınamanın oldukça iyi olmasına rağmen rotayı sonuna kadar takip edelim. Bazı Topluluk Sınıflandırıcılarını deneyelim, özellikle Random Forest ve AdaBoost'u:
+
+```python
+'RFST': RandomForestClassifier(n_estimators=100),
+ 'ADA': AdaBoostClassifier(n_estimators=100)
+```
+
+Sonuç çok iyi, özellikle Random Forest sonuçları:
+
+```output
+Accuracy (train) for RFST: 84.5%
+ precision recall f1-score support
+
+ chinese 0.80 0.77 0.78 242
+ indian 0.89 0.92 0.90 234
+ japanese 0.86 0.84 0.85 254
+ korean 0.88 0.83 0.85 242
+ thai 0.80 0.87 0.83 227
+
+ accuracy 0.84 1199
+ macro avg 0.85 0.85 0.84 1199
+weighted avg 0.85 0.84 0.84 1199
+
+Accuracy (train) for ADA: 72.4%
+ precision recall f1-score support
+
+ chinese 0.64 0.49 0.56 242
+ indian 0.91 0.83 0.87 234
+ japanese 0.68 0.69 0.69 254
+ korean 0.73 0.79 0.76 242
+ thai 0.67 0.83 0.74 227
+
+ accuracy 0.72 1199
+ macro avg 0.73 0.73 0.72 1199
+weighted avg 0.73 0.72 0.72 1199
+```
+
+:white_check_mark: [Topluluk Sınıflandırıcıları](https://scikit-learn.org/stable/modules/ensemble.html) hakkında bilgi edinin
+
+Makine Öğreniminin bu yöntemi, modelin kalitesini artırmak için, "birçok temel öngörücünün öngörülerini birleştirir." Bizim örneğimizde, Random Trees ve AdaBoost kullandık.
+
+- [Random Forest](https://scikit-learn.org/stable/modules/ensemble.html#forest) bir ortalama alma yöntemidir, aşırı öğrenmeden kaçınmak için rastgelelikle doldurulmuş 'karar ağaçları'ndan oluşan bir 'orman' oluşturur. n_estimators parametresi, ağaç sayısı olarak ayarlanmaktadır.
+
+- [AdaBoost](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html), bir sınıflandırıcıyı bir veri setine uydurur ve sonra o sınıflandırıcının kopyalarını aynı veri setine uydurur. Yanlış sınıflandırılmış ögelerin ağırlıklarına odaklanır ve bir sonraki sınıflandırıcının düzeltmesi için uydurma/oturtmayı ayarlar.
+
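+Küçük bir taslak olarak (yukarıda oluşturulan `X_train`, `X_test`, `y_train` ve `y_test` değişkenlerinin hâlâ mevcut olduğu varsayılmıştır), n_estimators parametresinin etkisini farklı sayıda ağaçla bir Random Forest'ı yeniden eğiterek gözlemleyebilirsiniz:
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+import numpy as np
+
+# Aynı sınıflandırıcı, ancak daha fazla ağaçla: n_estimators 'orman'daki ağaç sayısını belirler
+rfst_300 = RandomForestClassifier(n_estimators=300)
+rfst_300.fit(X_train, np.ravel(y_train))
+y_pred_300 = rfst_300.predict(X_test)
+print("Accuracy (train) for RFST (300 ağaç): %0.1f%% " % (accuracy_score(y_test, y_pred_300) * 100))
+```
+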
+---
+
+## :rocket: Meydan okuma
+
+Bu yöntemlerden her biri değiştirebileceğiniz bir sürü parametre içeriyor. Her birinin varsayılan parametrelerini araştırın ve bu parametreleri değiştirmenin modelin kalitesi için ne anlama gelebileceği hakkında düşünün.
+
+## [Ders sonrası kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/24/?loc=tr)
+
+## Gözden Geçirme & Kendi Kendine Çalışma
+
+Bu derslerde çok fazla jargon var, bu yüzden yararlı terminoloji içeren [bu listeyi](https://docs.microsoft.com/dotnet/machine-learning/resources/glossary?WT.mc_id=academic-15963-cxa) incelemek için bir dakika ayırın.
+
+## Ödev
+
+[Parametre oyunu](assignment.tr.md)
\ No newline at end of file
diff --git a/4-Classification/3-Classifiers-2/translations/assignment.it.md b/4-Classification/3-Classifiers-2/translations/assignment.it.md
new file mode 100644
index 0000000000..472cdb114d
--- /dev/null
+++ b/4-Classification/3-Classifiers-2/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Giocare coi parametri
+
+## Istruzioni
+
+Ci sono molti parametri impostati in modalità predefinita quando si lavora con questi classificatori. Intellisense in VS Code può aiutare a scavare in loro. Adottare una delle tecniche di classificazione ML in questa lezione e riaddestrare i modelli modificando i vari valori dei parametri. Costruire un notebook spiegando perché alcune modifiche aiutano la qualità del modello mentre altre la degradano. La risposta sia dettagliata.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ----------------------------- |
+| | Un notebook viene presentato con un classificatore completamente costruito e i suoi parametri ottimizzati e le modifiche spiegate nelle caselle di testo | Un quaderno è presentato parzialmente o spiegato male | Un notebook contiene errori o è difettoso |
diff --git a/4-Classification/3-Classifiers-2/translations/assignment.tr.md b/4-Classification/3-Classifiers-2/translations/assignment.tr.md
new file mode 100644
index 0000000000..fbc7409279
--- /dev/null
+++ b/4-Classification/3-Classifiers-2/translations/assignment.tr.md
@@ -0,0 +1,11 @@
+# Parametre Oyunu
+
+## Yönergeler
+
+Bu sınıflandırıcılarla çalışırken varsayılan olarak ayarlanmış birçok parametre var. VS Code'daki Intellisense, onları derinlemesine incelemenize yardımcı olabilir. Bu dersteki Makine Öğrenimi Sınıflandırma Yöntemlerinden birini seçin ve çeşitli parametre değerlerini değiştirerek modelleri yeniden eğitin. Neden bazı değişikliklerin modelin kalitesini artırdığını ve bazılarının azalttığını açıklayan bir not defteri yapın. Cevabınız açıklayıcı olmalı.
+
+## Rubrik
+
+| Ölçüt | Örnek Alınacak Nitelikte | Yeterli | Geliştirme Gerekli |
+| -------- | ---------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------------------------------- |
+| | Bir sınıflandırıcının tamamen oluşturulduğu ve parametrelerinin değiştirilip yazı kutularında açıklandığı bir not defteri sunulmuş | Not defteri kısmen sunulmuş veya az açıklanmış | Not defteri hatalı veya kusurlu |
\ No newline at end of file
diff --git a/4-Classification/4-Applied/README.md b/4-Classification/4-Applied/README.md
index 0e20f6c37f..cad728f34c 100644
--- a/4-Classification/4-Applied/README.md
+++ b/4-Classification/4-Applied/README.md
@@ -8,7 +8,7 @@ One of the most useful practical uses of machine learning is building recommenda
> 🎥 Click the image above for a video: Andrew Ng introduces recommendation system design
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/25/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/25/)
In this lesson you will learn:
@@ -31,7 +31,7 @@ First, train a classification model using the cleaned cuisines dataset we used.
1. Start by importing useful libraries:
```python
- pip install skl2onnx
+ !pip install skl2onnx
import pandas as pd
```
@@ -40,7 +40,7 @@ First, train a classification model using the cleaned cuisines dataset we used.
1. Then, work with your data in the same way you did in previous lessons, by reading a CSV file using `read_csv()`:
```python
- data = pd.read_csv('../data/cleaned_cuisine.csv')
+ data = pd.read_csv('../data/cleaned_cuisines.csv')
data.head()
```
@@ -312,7 +312,7 @@ In this code, there are several things happening:
## Test your application
-Open a terminal session in Visual Studio Code in the folder where your index.html file resides. Ensure that you have `[http-server](https://www.npmjs.com/package/http-server)` installed globally, and type `http-server` at the prompt. A localhost should open and you can view your web app. Check what cuisine is recommended based on various ingredients:
+Open a terminal session in Visual Studio Code in the folder where your index.html file resides. Ensure that you have [http-server](https://www.npmjs.com/package/http-server) installed globally, and type `http-server` at the prompt. A localhost should open and you can view your web app. Check what cuisine is recommended based on various ingredients:
![ingredient web app](images/web-app.png)
@@ -321,7 +321,7 @@ Congratulations, you have created a 'recommendation' web app with a few fields.
Your web app is very minimal, so continue to build it out using ingredients and their indexes from the [ingredient_indexes](../data/ingredient_indexes.csv) data. What flavor combinations work to create a given national dish?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/26/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/26/)
## Review & Self Study
diff --git a/4-Classification/4-Applied/solution/notebook.ipynb b/4-Classification/4-Applied/solution/notebook.ipynb
index b96820489a..b388d2ca87 100644
--- a/4-Classification/4-Applied/solution/notebook.ipynb
+++ b/4-Classification/4-Applied/solution/notebook.ipynb
@@ -64,7 +64,7 @@
}
],
"source": [
- "pip install skl2onnx"
+ "!pip install skl2onnx"
]
},
{
@@ -115,7 +115,7 @@
}
],
"source": [
- "data = pd.read_csv('../../data/cleaned_cuisine.csv')\n",
+ "data = pd.read_csv('../../data/cleaned_cuisines.csv')\n",
"data.head()"
]
},
diff --git a/4-Classification/4-Applied/translations/README.it.md b/4-Classification/4-Applied/translations/README.it.md
new file mode 100644
index 0000000000..72dc6c3800
--- /dev/null
+++ b/4-Classification/4-Applied/translations/README.it.md
@@ -0,0 +1,336 @@
+# Costruire un'App Web per Consigliare una Cucina
+
+In questa lezione si creerà un modello di classificazione utilizzando alcune delle tecniche apprese nelle lezioni precedenti e con il delizioso insieme di dati sulla cucina utilizzato in questa serie. Inoltre, si creerà una piccola app web per utilizzare un modello salvato, sfruttando il runtime web di Onnx.
+
+Uno degli usi pratici più utili dell'apprendimento automatico è la creazione di sistemi di raccomandazione e oggi si può fare il primo passo in quella direzione!
+
+[![Introduzione ai Sistemi di Raccomandazione](https://img.youtube.com/vi/giIXNoiqO_U/0.jpg)](https://youtu.be/giIXNoiqO_U "Introduzione ai Sistemi di Raccomandazione")
+
+> 🎥 Fare clic sull'immagine sopra per un video: Andrew Ng introduce la progettazione di un sistema di raccomandazione
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/25/)
+
+In questa lezione, si imparerà:
+
+- Come costruire un modello e salvarlo come modello Onnx
+- Come usare Netron per ispezionare il modello
+- Come utilizzare il modello in un'app web per l'inferenza
+
+## Costruire il modello
+
+La creazione di sistemi ML applicati è una parte importante dell'utilizzo di queste tecnologie per i sistemi aziendali. Si possono utilizzare i modelli all'interno delle proprie applicazioni web (e quindi utilizzarli in un contesto offline se necessario) utilizzando Onnx.
+
+In una [lezione precedente](../../../3-Web-App/1-Web-App/translations/README.it.md) si è costruito un modello di regressione sugli avvistamenti di UFO, è stato serializzato e lo si è utilizzato in un'app Flask. Sebbene questa architettura sia molto utile da conoscere, è un'app Python completa e i requisiti potrebbero includere l'uso di un'applicazione JavaScript.
+
+In questa lezione si può creare un sistema di inferenza di base utilizzando JavaScript. Prima, tuttavia, è necessario addestrare un modello e convertirlo per l'utilizzo con Onnx.
+
+## Esercizio - modello di classificazione di addestramento
+
+Innanzitutto, addestrare un modello di classificazione utilizzando l'insieme di dati pulito delle cucine precedentemente usato.
+
+1. Iniziare importando librerie utili:
+
+ ```python
+ !pip install skl2onnx
+ import pandas as pd
+ ```
+
+ Serve '[skl2onnx](https://onnx.ai/sklearn-onnx/)' per poter convertire il modello di Scikit-learn in formato Onnx.
+
+1. Quindi si lavora con i dati nello stesso modo delle lezioni precedenti, leggendo un file CSV usando `read_csv()`:
+
+ ```python
+    data = pd.read_csv('../data/cleaned_cuisines.csv')
+ data.head()
+ ```
+
+1. Rimuovere le prime due colonne non necessarie e salvare i dati rimanenti come "X":
+
+ ```python
+ X = data.iloc[:,2:]
+ X.head()
+ ```
+
+1. Salvare le etichette come "y":
+
+ ```python
+ y = data[['cuisine']]
+ y.head()
+
+ ```
+
+### Iniziare la routine di addestramento
+
+Verrà usato il classificatore 'SVC', che ha una buona accuratezza.
+
+1. Importare le librerie appropriate da Scikit-learn:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVC
+ from sklearn.model_selection import cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report
+ ```
+
+1. Separare gli insiemi di allenamento e test:
+
+ ```python
+ X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
+ ```
+
+1. Costruire un modello di classificazione SVC come fatto nella lezione precedente:
+
+ ```python
+ model = SVC(kernel='linear', C=10, probability=True,random_state=0)
+ model.fit(X_train,y_train.values.ravel())
+ ```
+
+1. Ora provare il modello, chiamando `predict()`:
+
+ ```python
+ y_pred = model.predict(X_test)
+ ```
+
+1. Stampare un rapporto di classificazione per verificare la qualità del modello:
+
+ ```python
+ print(classification_report(y_test,y_pred))
+ ```
+
+ Come visto prima, la precisione è buona:
+
+ ```output
+ precision recall f1-score support
+
+ chinese 0.72 0.69 0.70 257
+ indian 0.91 0.87 0.89 243
+ japanese 0.79 0.77 0.78 239
+ korean 0.83 0.79 0.81 236
+ thai 0.72 0.84 0.78 224
+
+ accuracy 0.79 1199
+ macro avg 0.79 0.79 0.79 1199
+ weighted avg 0.79 0.79 0.79 1199
+ ```
+
+### Convertire il modello in Onnx
+
+Assicurarsi di eseguire la conversione con il numero tensore corretto. Questo insieme di dati ha 380 ingredienti elencati, quindi è necessario annotare quel numero in `FloatTensorType`:
+
+1. Convertire usando un numero tensore di 380.
+
+ ```python
+ from skl2onnx import convert_sklearn
+ from skl2onnx.common.data_types import FloatTensorType
+
+ initial_type = [('float_input', FloatTensorType([None, 380]))]
+ options = {id(model): {'nocl': True, 'zipmap': False}}
+ ```
+
+1. Creare l'onx e salvarlo come file **model.onnx**:
+
+ ```python
+ onx = convert_sklearn(model, initial_types=initial_type, options=options)
+ with open("./model.onnx", "wb") as f:
+ f.write(onx.SerializeToString())
+ ```
+
+    > Nota, si possono passare le [opzioni](https://onnx.ai/sklearn-onnx/parameterized.html) nello script di conversione. In questo caso, si è passato 'nocl' come True e 'zipmap' come False. Poiché questo è un modello di classificazione, si ha la possibilità di rimuovere ZipMap che produce un elenco di dizionari (non necessario). `nocl` si riferisce alle informazioni sulla classe incluse nel modello. Ridurre le dimensioni del modello impostando `nocl` su 'True'.
+
+L'esecuzione dell'intero notebook ora creerà un modello Onnx e lo salverà in questa cartella.
+
+## Visualizzare il modello
+
+I modelli Onnx non sono molto visualizzabili in Visual Studio code, ma c'è un ottimo software gratuito che molti ricercatori usano per visualizzare il modello per assicurarsi che sia costruito correttamente. Scaricare [Netron](https://github.com/lutzroeder/Netron) e aprire il file model.onnx. Si può vedere il modello semplice visualizzato, con i suoi 380 input e classificatore elencati:
+
+![Vista Netron ](../images/netron.png)
+
+Netron è uno strumento utile per visualizzare i modelli.
+
+Ora si è pronti per utilizzare questo modello accurato in un'app web. Si costruisce un'app che tornerà utile quando si guarda nel frigorifero e si prova a capire quale combinazione di ingredienti avanzati si può usare per cucinare una determinata tipologia di cucina, come determinato dal modello.
+
+## Creare un'applicazione web di raccomandazione
+
+Si può utilizzare il modello direttamente in un'app web. Questa architettura consente anche di eseguirlo localmente e anche offline se necessario. Iniziare creando un file `index.html` nella stessa cartella in cui si è salvato il file `model.onnx`.
+
+1. In questo file _index.html_, aggiungere il seguente codice markup:
+
+    ```html
+    <!DOCTYPE html>
+    <html>
+        <head>
+            <title>Cuisine Matcher</title>
+        </head>
+        <body>
+            ...
+        </body>
+    </html>
+    ```
+
+1. Ora, lavorando all'interno del tag `body` , aggiungere un piccolo markup per mostrare un elenco di caselle di controllo che riflettono alcuni ingredienti:
+
+    ```html
+    <h1>Check your refrigerator. What can you create?</h1>
+    <!-- un elenco di caselle di controllo (<input type="checkbox">), una per ingrediente,
+         con value = indice dell'ingrediente (vedi ingredient_indexes.csv) -->
+    ```
+
+ Notare che a ogni casella di controllo viene assegnato un valore. Questo riflette l'indice in cui si trova l'ingrediente in base all'insieme di dati. Apple, ad esempio, in questo elenco alfabetico, occupa la quinta colonna, quindi il suo valore è "4" poiché si inizia a contare da 0. Si può consultare il [foglio di calcolo degli ingredienti](../../data/ingredient_indexes.csv) per scoprire l'indice di un determinato ingrediente.
+
+    Continuando il lavoro nel file index.html, aggiungere un blocco di script in cui viene chiamato il modello dopo la chiusura del tag `</div>` finale.
+
+1. Innanzitutto, importare il [runtime Onnx](https://www.onnxruntime.ai/):
+
+ ```html
+
+ ```
+
+ > Onnx Runtime viene utilizzato per consentire l'esecuzione dei modelli Onnx su un'ampia gamma di piattaforme hardware, comprese le ottimizzazioni e un'API da utilizzare.
+
+1. Una volta che il Runtime è a posto, lo si può chiamare:
+
+ ```javascript
+
+ ```
+
+In questo codice, accadono diverse cose:
+
+1. Si è creato un array di 380 possibili valori (1 o 0) da impostare e inviare al modello per l'inferenza, a seconda che una casella di controllo dell'ingrediente sia selezionata.
+2. Si è creata una serie di caselle di controllo e un modo per determinare se sono state selezionate in una funzione `init` chiamata all'avvio dell'applicazione. Quando una casella di controllo è selezionata, l'array `ingredients` viene modificato per riflettere l'ingrediente scelto.
+3. Si è creata una funzione `testCheckboxes` che controlla se una casella di controllo è stata selezionata.
+4. Si utilizza quella funzione quando si preme il pulsante e, se una casella di controllo è selezionata, si avvia l'inferenza.
+5. La routine di inferenza include:
+ 1. Impostazione di un caricamento asincrono del modello
+ 2. Creazione di una struttura tensoriale da inviare al modello
+ 3. Creazione di "feed" che riflettano l'input `float_input` creato durante l'addestramento del modello (si può usare Netron per verificare quel nome)
+ 4. Invio di questi "feed" al modello e attesa di una risposta
+
+## Verificare l'applicazione
+
+Aprire una sessione terminale in Visual Studio Code nella cartella in cui risiede il file index.html. Assicurarsi di avere [http-server](https://www.npmjs.com/package/http-server) installato globalmente e digitare `http-server` al prompt. Dovrebbe aprirsi nel browser un localhost e si può visualizzare l'app web. Controllare quale cucina è consigliata in base ai vari ingredienti:
+
+![app web degli ingredienti](../images/web-app.png)
+
+Congratulazioni, si è creata un'app web di "raccomandazione" con pochi campi. Si prenda del tempo per costruire questo sistema!
+## 🚀 Sfida
+
+L'app web è molto minimale, quindi continuare a costruirla usando gli ingredienti e i loro indici dai dati [ingredient_indexes](../../data/ingredient_indexes.csv). Quali combinazioni di sapori funzionano per creare un determinato piatto nazionale?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/26/)
+
+## Revisione e Auto Apprendimento
+
+Sebbene questa lezione abbia appena toccato l'utilità di creare un sistema di raccomandazione per gli ingredienti alimentari, quest'area delle applicazioni ML è molto ricca di esempi. Leggere di più su come sono costruiti questi sistemi:
+
+- https://www.sciencedirect.com/topics/computer-science/recommendation-engine
+- https://www.technologyreview.com/2014/08/25/171547/the-ultimate-challenge-for-recommendation-engines/
+- https://www.technologyreview.com/2015/03/23/168831/everything-is-a-recommendation/
+
+## Compito
+
+[Creare un nuovo sistema di raccomandazione](assignment.it.md)
diff --git a/4-Classification/4-Applied/translations/README.tr.md b/4-Classification/4-Applied/translations/README.tr.md
new file mode 100644
index 0000000000..d3418e9ab8
--- /dev/null
+++ b/4-Classification/4-Applied/translations/README.tr.md
@@ -0,0 +1,336 @@
+# Mutfak Önerici Bir Web Uygulaması Oluşturun
+
+Bu derste, önceki derslerde öğrendiğiniz bazı yöntemleri kullanarak, bu seri boyunca kullanılan leziz mutfak veri setiyle bir sınıflandırma modeli oluşturacaksınız. Ayrıca, kaydettiğiniz modeli kullanmak üzere, Onnx'un web çalışma zamanından yararlanan küçük bir web uygulaması oluşturacaksınız.
+
+Makine öğreniminin en faydalı pratik kullanımlarından biri, önerici/tavsiyeci sistemler oluşturmaktır ve bu yöndeki ilk adımınızı bugün atabilirsiniz!
+
+[![Önerici Sistemler Tanıtımı](https://img.youtube.com/vi/giIXNoiqO_U/0.jpg)](https://youtu.be/giIXNoiqO_U "Recommendation Systems Introduction")
+
+> :movie_camera: Video için yukarıdaki fotoğrafa tıklayın: Andrew Ng introduces recommendation system design (Andrew Ng önerici sistem tasarımını tanıtıyor)
+
+## [Ders öncesi kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/25/?loc=tr)
+
+Bu derste şunları öğreneceksiniz:
+
+- Bir model nasıl oluşturulur ve Onnx modeli olarak kaydedilir
+- Modeli denetlemek için Netron nasıl kullanılır
+- Modeliniz çıkarım için bir web uygulamasında nasıl kullanılabilir
+
+## Modelinizi oluşturun
+
+Uygulamalı Makine Öğrenimi sistemleri oluşturmak, bu teknolojilerden kendi iş sistemleriniz için yararlanmanızın önemli bir parçasıdır. Onnx kullanarak modelleri kendi web uygulamalarınız içerisinde kullanabilirsiniz (Böylece gerektiğinde çevrim dışı bir içerikte kullanabilirsiniz.).
+
+[Önceki bir derste](../../../3-Web-App/1-Web-App/README.md) UFO gözlemleriyle ilgili bir Regresyon modeli oluşturmuş, "pickle" kullanmış ve bir Flask uygulamasında kullanmıştınız. Bu mimariyi bilmek çok faydalıdır, ancak bu tam yığın Python uygulamasıdır ve bir JavaScript uygulaması kullanımı gerekebilir.
+
+Bu derste, çıkarım için temel JavaScript tabanlı bir sistem oluşturabilirsiniz. Ancak öncelikle, bir model eğitmeniz ve Onnx ile kullanım için dönüştürmeniz gerekmektedir.
+
+## Alıştırma - sınıflandırma modelini eğitin
+
+Öncelikle, kullandığımız temiz mutfak veri setini kullanarak bir sınıflandırma modeli eğitin.
+
+1. Faydalı kütüphaneler almakla başlayın:
+
+ ```python
+ !pip install skl2onnx
+ import pandas as pd
+ ```
+
+ Scikit-learn modelinizi Onnx biçimine dönüştürmeyi sağlamak için '[skl2onnx](https://onnx.ai/sklearn-onnx/)'a ihtiyacınız var.
+
+1. Sonra, önceki derslerde yaptığınız şekilde, `read_csv()` kullanarak bir CSV dosyasını okuyarak veriniz üzerinde çalışın:
+
+ ```python
+ data = pd.read_csv('../data/cleaned_cuisines.csv')
+ data.head()
+ ```
+
+1. İlk iki gereksiz sütunu kaldırın ve geriye kalan veriyi 'X' olarak kaydedin:
+
+ ```python
+ X = data.iloc[:,2:]
+ X.head()
+ ```
+
+1. Etiketleri 'y' olarak kaydedin:
+
+ ```python
+ y = data[['cuisine']]
+ y.head()
+
+ ```
+
+### Eğitme rutinine başlayın
+
+İyi doğruluğu olan 'SVC' kütüphanesini kullanacağız.
+
+1. Scikit-learn'den uygun kütüphaneleri alın:
+
+ ```python
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVC
+ from sklearn.model_selection import cross_val_score
+ from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report
+ ```
+
+1. Eğitme ve sınama kümelerini ayırın:
+
+ ```python
+ X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
+ ```
+
+1. Önceki derste yaptığınız gibi bir SVC Sınıflandırma modeli oluşturun:
+
+ ```python
+ model = SVC(kernel='linear', C=10, probability=True,random_state=0)
+ model.fit(X_train,y_train.values.ravel())
+ ```
+
+1. Şimdi, `predict()` fonksiyonunu çağırarak modelinizi sınayın:
+
+ ```python
+ y_pred = model.predict(X_test)
+ ```
+
+1. Modelin kalitesini kontrol etmek için bir sınıflandırma raporu bastırın:
+
+ ```python
+ print(classification_report(y_test,y_pred))
+ ```
+
+ Daha önce de gördüğümüz gibi, doğruluk iyi:
+
+ ```output
+ precision recall f1-score support
+
+ chinese 0.72 0.69 0.70 257
+ indian 0.91 0.87 0.89 243
+ japanese 0.79 0.77 0.78 239
+ korean 0.83 0.79 0.81 236
+ thai 0.72 0.84 0.78 224
+
+ accuracy 0.79 1199
+ macro avg 0.79 0.79 0.79 1199
+ weighted avg 0.79 0.79 0.79 1199
+ ```
+
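+Yukarıda içe aktarılan `cross_val_score` fonksiyonuyla, isterseniz doğruluğu çapraz doğrulama ile de kontrol edebilirsiniz; aşağıdaki parça yalnızca küçük bir taslaktır:
+
+```python
+# Taslak: 5 katlı çapraz doğrulama ile doğruluk kontrolü
+scores = cross_val_score(model, X, y.values.ravel(), cv=5)
+print(scores.mean())
+```
+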
+### Modelinizi Onnx'a dönüştürün
+
+Dönüştürmeyi uygun Tensor sayısıyla yaptığınıza emin olun. Bu veri seti listelenmiş 380 malzeme içeriyor, dolayısıyla bu sayıyı `FloatTensorType` içinde belirtmeniz gerekiyor:
+
+1. 380 tensor sayısını kullanarak dönüştürün.
+
+ ```python
+ from skl2onnx import convert_sklearn
+ from skl2onnx.common.data_types import FloatTensorType
+
+ initial_type = [('float_input', FloatTensorType([None, 380]))]
+ options = {id(model): {'nocl': True, 'zipmap': False}}
+ ```
+
+1. onx'u oluşturun ve **model.onnx** diye bir dosya olarak kaydedin:
+
+ ```python
+ onx = convert_sklearn(model, initial_types=initial_type, options=options)
+ with open("./model.onnx", "wb") as f:
+ f.write(onx.SerializeToString())
+ ```
+
+ > Not olarak, dönüştürme senaryonuzda [seçenekler](https://onnx.ai/sklearn-onnx/parameterized.html) geçirebilirsiniz. Biz bu durumda, 'nocl' parametresini True ve 'zipmap' parametresini 'False' olarak geçirdik. Bu bir sınıflandırma modeli olduğundan, bir sözlük listesi üreten (gerekli değil) ZipMap'i kaldırma seçeneğiniz var. `nocl`, modelde sınıf bilgisinin barındırılmasını ifade eder. `nocl` parametresini 'True' olarak ayarlayarak modelinizin boyutunu küçültün.
+
+Tüm not defterini çalıştırmak şimdi bir Onnx modeli oluşturacak ve bu klasöre kaydedecek.
+
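+İsteğe bağlı hızlı bir kontrol olarak, kaydedilen model.onnx dosyasını `onnxruntime` paketiyle yükleyip scikit-learn modelinin tahminleriyle karşılaştırabilirsiniz. Aşağıdaki parça yalnızca bir taslaktır ve not defterinde `onnxruntime` paketinin kurulu olduğunu varsayar:
+
+```python
+# Taslak: `pip install onnxruntime` kurulu olduğu varsayılır
+import numpy as np
+import onnxruntime as rt
+
+sess = rt.InferenceSession("./model.onnx")
+print(sess.get_inputs()[0].name)  # 'float_input'
+
+# sınama kümesinden birkaç satırla deneme çıkarımı yapın
+sample = X_test.iloc[:5].to_numpy().astype(np.float32)
+print(sess.run(None, {"float_input": sample})[0])  # 'nocl' True olduğu için sınıf indeksleri dönebilir
+print(model.predict(X_test.iloc[:5]))              # scikit-learn modeli mutfak adlarını döndürür
+```
+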
+## Modelinizi inceleyin
+
+Onnx modelleri Visual Studio code'da pek görünür değiller ama birçok araştırmacının modelin doğru oluştuğundan emin olmak üzere modeli görselleştirmek için kullandığı çok iyi bir yazılım var. [Netron](https://github.com/lutzroeder/Netron)'u indirin ve model.onnx dosyanızı açın. 380 girdisi ve sınıflandırıcısıyla basit modelinizin görselleştirildiğini görebilirsiniz:
+
+![Netron görseli](../images/netron.png)
+
+Netron, modellerinizi incelemek için faydalı bir araçtır.
+
+Şimdi, bu düzenli modeli web uygulamanızda kullanmak için hazırsınız. Buzdolabınıza baktığınızda ve verilen bir mutfak için artık malzemelerin hangi birleşimini kullanabileceğinizi bulmayı denediğinizde kullanışlı olacak bir uygulama oluşturalım. Bu birleşim modeliniz tarafından belirlenecek.
+
+## Önerici bir web uygulaması oluşturun
+
+Modelinizi doğrudan bir web uygulamasında kullanabilirsiniz. Bu mimari, modelinizi yerelde ve hatta gerektiğinde çevrim dışı çalıştırabilmenizi de sağlar. `model.onnx` dosyanızı kaydettiğiniz klasörde `index.html` dosyasını oluşturarak başlayın.
+
+1. Bu _index.html_ dosyasında aşağıdaki işaretlemeyi ekleyin:
+
+ ```html
+
+
+
+ Cuisine Matcher
+
+
+ ...
+
+
+ ```
+
+1. Şimdi, `body` etiketleri içinde çalışarak, bazı malzemeleri ifade eden bir onay kutusu listesi göstermek için küçük bir işaretleme ekleyin:
+
+ ```html
+
+ Check your refrigerator. What can you create?
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ```
+
+ Her bir onay kutusuna bir değer verildiğine dikkat edin. Bu, veri setine göre malzemenin bulunduğu indexi ifade eder. Örneğin bu alfabetik listede elma beşinci sütundadır, dolayısıyla onun değeri '4'tür çünkü saymaya 0'dan başlıyoruz. Verilen malzemenin indexini görmek için [malzemeler tablosuna](../../data/ingredient_indexes.csv) başvurabilirsiniz.
+
+ index.html dosyasındaki işinize devam ederek, son `` kapamasından sonra modelinizin çağrılacağı bir script bloğu ekleyin.
+
+1. Öncelikle, [Onnx Runtime](https://www.onnxruntime.ai/) alın:
+
+ ```html
+
+ ```
+
+ > Onnx Runtime, Onnx modelinizin, eniyileştirmeler ve kullanmak için bir API da dahil olmak üzere, geniş bir donanım platform yelpazesinde çalışmasını sağlamak için kullanılır.
+
+1. Runtime uygun hale geldiğinde, onu çağırabilirsiniz:
+
+ ```javascript
+
+ ```
+
+Bu kodda birçok şey gerçekleşiyor:
+
+1. Ayarlanması ve çıkarım için modele gönderilmesi için, bir malzeme onay kutusunun işaretli olup olmadığına bağlı 380 muhtemel değerden (ya 1 ya da 0) oluşan bir dizi oluşturdunuz.
+2. Onay kutularından oluşan bir dizi ve uygulama başladığında çağrılan bir `init` fonksiyonunda işaretli olup olmadıklarını belirleme yolu oluşturdunuz. Eğer onay kutusu işaretliyse, `ingredients` dizisi, seçilen malzemeyi ifade etmek üzere değiştirilir.
+3. Herhangi bir onay kutusunun işaretli olup olmadığını kontrol eden bir `testCheckboxes` fonksiyonu oluşturdunuz.
+4. Düğmeye basıldığında o fonksiyonu kullanıyor ve eğer herhangi bir onay kutusu işaretlenmişse çıkarıma başlıyorsunuz.
+5. Çıkarım rutini şunları içerir:
+ 1. Modelin eşzamansız bir şekilde yüklenmesini ayarlama
+ 2. Modele göndermek için bir Tensor yapısı oluşturma
+ 3. Modelinizi eğitirken oluşturduğunuz `float_input` (Bu adı doğrulamak için Netron kullanabilirsiniz.) girdisini ifade eden 'feeds' oluşturma
+ 4. Bu 'feeds'i modele gönderme ve yanıt için bekleme
+
+## Uygulamanızı test edin
+
+index.html dosyanızın olduğu klasördeyken Visual Studio Code'da bir terminal açın. Global kapsamda [http-server](https://www.npmjs.com/package/http-server) indirilmiş olduğundan emin olun ve istemde `http-server` yazın. Bir yerel ana makine açılmalı ve web uygulamanızı görebilirsiniz. Çeşitli malzemeleri baz alarak hangi mutfağın önerildiğine bakın:
+
+![malzeme web uygulaması](../images/web-app.png)
+
+Tebrikler, birkaç değişkenle bir 'önerici' web uygulaması oluşturdunuz! Bu sistemi oluşturmak için biraz zaman ayırın!
+## :rocket: Meydan okuma
+
+Web uygulamanız çok minimal, bu yüzden [ingredient_indexes](../../data/ingredient_indexes.csv) verisinden malzemeleri ve indexlerini kullanarak web uygulamanızı oluşturmaya devam edin. Verilen bir ulusal yemeği yapmak için hangi tat birleşimleri işe yarıyor?
+
+## [Ders sonrası kısa sınavı](https://white-water-09ec41f0f.azurestaticapps.net/quiz/26/?loc=tr)
+
+## Gözden Geçirme & Kendi Kendine Çalışma
+
+Bu dersin sadece yemek malzemeleri için bir öneri sistemi oluşturmanın olanaklarına değinmesiyle beraber, makine öğrenimi uygulamalarının bu alanı örnekler açısından çok zengin. Bu sistemlerin nasıl oluşturulduğu hakkında biraz daha okuyun:
+
+- https://www.sciencedirect.com/topics/computer-science/recommendation-engine
+- https://www.technologyreview.com/2014/08/25/171547/the-ultimate-challenge-for-recommendation-engines/
+- https://www.technologyreview.com/2015/03/23/168831/everything-is-a-recommendation/
+
+## Ödev
+
+[Yeni bir önerici oluşturun](assignment.tr.md)
diff --git a/4-Classification/4-Applied/translations/assignment.it.md b/4-Classification/4-Applied/translations/assignment.it.md
new file mode 100644
index 0000000000..cc926c7278
--- /dev/null
+++ b/4-Classification/4-Applied/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Creare un sistema di raccomandazione
+
+## Istruzioni
+
+Dati gli esercizi di questa lezione, ora si conosce come creare un'app Web basata su JavaScript utilizzando Onnx Runtime e un modello Onnx convertito. Sperimentare con la creazione di un nuovo sistema di raccomandazione utilizzando i dati di queste lezioni o provenienti da altre parti (citare le fonti, per favore). Si potrebbe creare un sistema di raccomandazione di animali domestici in base a vari attributi della personalità o un sistema di raccomandazione di genere musicale basato sull'umore di una persona. Dare sfogo alla creatività!
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ---------------------------------------------------------------------- | ------------------------------------- | --------------------------------- |
+| | Vengono presentati un'app Web e un notebook, entrambi ben documentati e funzionanti | Uno di quei due è mancante o difettoso | Entrambi sono mancanti o difettosi |
diff --git a/4-Classification/4-Applied/translations/assignment.tr.md b/4-Classification/4-Applied/translations/assignment.tr.md
new file mode 100644
index 0000000000..f561bf48a0
--- /dev/null
+++ b/4-Classification/4-Applied/translations/assignment.tr.md
@@ -0,0 +1,11 @@
+# Bir önerici oluşturun
+
+## Yönergeler
+
+Bu dersteki alıştırmalar göz önünde bulundurulursa, Onnx Runtime ve dönüştürülmüş bir Onnx modeli kullanarak JavaScript tabanlı web uygulamasının nasıl oluşturulacağını artık biliyorsunuz. Bu derslerdeki verileri veya başka bir yerden kaynaklandırılmış verileri (Lütfen kaynakça verin.) kullanarak yeni bir önerici oluşturma deneyimi kazanın. Verilen çeşitli kişilik özellikleriyle bir evcil hayvan önericisi veya kişinin ruh haline göre bir müzik türü önericisi oluşturabilirsiniz. Yaratıcı olun!
+
+## Rubrik
+
+| Ölçüt | Örnek Alınacak Nitelikte | Yeterli | Geliştirme Gerekli |
+| -------- | ---------------------------------------------------------------------- | ------------------------------------- | --------------------------------- |
+| | İyi belgelenen ve çalışan bir web uygulaması ve not defteri sunulmuş | İkisinden biri eksik veya kusurlu | İkisi ya eksik ya da kusurlu |
\ No newline at end of file
diff --git a/4-Classification/README.md b/4-Classification/README.md
index f6133aa13e..73d83beb3e 100644
--- a/4-Classification/README.md
+++ b/4-Classification/README.md
@@ -8,7 +8,7 @@ In Asia and India, food traditions are extremely diverse, and very delicious! Le
## What you will learn
-In this section, you will build on the skills you learned in the first part of this curriculum all about regressionn to learn about other classifiers you can use that will help you learn about your data.
+In this section, you will build on the skills you learned in the first part of this curriculum all about regression to learn about other classifiers you can use that will help you learn about your data.
> There are useful low-code tools that can help you learn about working with classification models. Try [Azure ML for this task](https://docs.microsoft.com/learn/modules/create-classification-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
diff --git a/4-Classification/data/cleaned_cuisine.csv b/4-Classification/data/cleaned_cuisines.csv
similarity index 100%
rename from 4-Classification/data/cleaned_cuisine.csv
rename to 4-Classification/data/cleaned_cuisines.csv
diff --git a/4-Classification/translations/README.it.md b/4-Classification/translations/README.it.md
new file mode 100644
index 0000000000..fbaa4720d5
--- /dev/null
+++ b/4-Classification/translations/README.it.md
@@ -0,0 +1,26 @@
+# Iniziare con la classificazione
+
+## Argomento regionale: Deliziose Cucine Asiatiche e Indiane 🍜
+
+In Asia e in India, le tradizioni alimentari sono estremamente diverse e molto deliziose! Si darà un'occhiata ai dati sulle cucine regionali per cercare di capirne gli ingredienti.
+
+![Venditore di cibo tailandese](../images/thai-food.jpg)
+> Foto di Lisheng Chang su Unsplash
+
+## Cosa si imparerà
+
+In questa sezione si approfondiranno le abilità sulla regressione apprese nella prima parte di questo programma di studi per conoscere altri classificatori da poter utilizzare e che aiuteranno a conoscere i propri dati.
+
+> Esistono utili strumenti a basso codice che possono aiutare a imparare a lavorare con i modelli di classificazione. Si provi [Azure ML per questa attività](https://docs.microsoft.com/learn/modules/create-classification-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+## Lezioni
+
+1. [Introduzione alla classificazione](../1-Introduction/translations/README.it.md)
+2. [Più classificatori](../2-Classifiers-1/translations/README.it.md)
+3. [Ancora altri classificatori](../3-Classifiers-2/translations/README.it.md)
+4. [Machine Learning applicato: sviluppare un'app web](../4-Applied/translations/README.it.md)
+## Crediti
+
+"Iniziare con la classificazione" è stato scritto con ♥️ da [Cassie Breviu](https://www.twitter.com/cassieview) e [Jen Looper](https://www.twitter.com/jenlooper)
+
+L'insieme di dati sulle deliziose cucine proviene da [Kaggle](https://www.kaggle.com/hoandan/asian-and-indian-cuisines)
diff --git a/4-Classification/translations/README.ru.md b/4-Classification/translations/README.ru.md
new file mode 100644
index 0000000000..e9359d59a7
--- /dev/null
+++ b/4-Classification/translations/README.ru.md
@@ -0,0 +1,25 @@
+# Начало работы с классификацией
+## Региональная тема: Вкусные блюда азиатской и индийской кухни 🍜
+
+В Азии и Индии традиции кухни чрезвычайно разнообразны и очень вкусны! Давайте посмотрим на данные о региональных кухнях, чтобы попытаться понять их состав.
+
+![Продавец тайской еды](../images/thai-food.jpg)
+> Фото Лишенг Чанг на Unsplash
+
+## Что вы узнаете
+
+В этом разделе вы воспользуетесь навыками, полученными в первой части учебной программы, посвященными регрессии, и узнаете о других классификаторах, которые вы можете использовать и которые помогут вам изучить свои данные.
+
+> Существуют полезные инструменты с небольшим количеством кода, которые могут помочь вам узнать о работе с моделями классификации. Попробуйте [Azure ML для этой задачи](https://docs.microsoft.com/learn/modules/create-classification-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+## Уроки
+
+1. [Введение в классификацию](../1-Introduction/README.md)
+2. [Другие классификаторы](../2-Classifiers-1/README.md)
+3. [Еще классификаторы](../3-Classifiers-2/README.md)
+4. [Прикладное машинное обучение: создание веб-приложения](../4-Applied/README.md)
+## Благодарности
+
+«Начало работы с классификацией» было написано с ♥️ [Кэсси Бревиу](https://www.twitter.com/cassieview) и [Джен Лупер](https://www.twitter.com/jenlooper)
+
+Набор данных о вкусных блюдах взят из [Kaggle](https://www.kaggle.com/hoandan/asian-and-indian-cuisines)
\ No newline at end of file
diff --git a/4-Classification/translations/README.tr.md b/4-Classification/translations/README.tr.md
new file mode 100644
index 0000000000..9514dd0add
--- /dev/null
+++ b/4-Classification/translations/README.tr.md
@@ -0,0 +1,25 @@
+# Sınıflandırmaya başlarken
+## Bölgesel konu: Leziz Asya ve Hint Mutfağı :ramen:
+
+Asya ve Hindistan'da yemek gelenekleri fazlaca çeşitlilik gösterir ve çok lezzetlidir! Malzemelerini anlamaya çalışmak için bölgesel mutfaklar hakkındaki veriye bakalım.
+
+![Taylandlı yemek satıcısı](../images/thai-food.jpg)
+> Fotoğraf Lisheng Chang tarafından çekilmiştir ve Unsplash'tadır.
+
+## Öğrenecekleriniz
+
+Bu bölümde, bu eğitim programının tamamen regresyon üzerine olan ilk bölümünde öğrendiğiniz becerilere dayanıp onların üstüne beceriler ekleyeceksiniz ve veriniz hakkında bilgi sahibi olmanızı sağlayacak diğer sınıflandırıcıları öğreneceksiniz.
+
+> Sınıflandırma modelleriyle çalışmayı öğrenmenizi sağlayacak faydalı düşük kodlu araçlar vardır. [Bu görev için Azure ML](https://docs.microsoft.com/learn/modules/create-classification-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)'i deneyin.
+
+## Dersler
+
+1. [Sınıflandırmaya giriş](../1-Introduction/translations/README.tr.md)
+2. [Daha fazla sınıflandırıcı](../2-Classifiers-1/translations/README.tr.md)
+3. [Hatta daha fazla sınıflandırıcı](../3-Classifiers-2/translations/README.tr.md)
+4. [Uygulamalı Makine Öğrenimi: bir web uygulaması oluşturun](../4-Applied/translations/README.tr.md)
+## Katkıda bulunanlar
+
+"Sınıflandırmaya başlarken" [Cassie Breviu](https://www.twitter.com/cassieview) ve [Jen Looper](https://www.twitter.com/jenlooper) tarafından :hearts: ile yazılmıştır.
+
+Leziz mutfak veri seti [Kaggle](https://www.kaggle.com/hoandan/asian-and-indian-cuisines)'dan alınmıştır.
diff --git a/5-Clustering/1-Visualize/README.md b/5-Clustering/1-Visualize/README.md
index 12ac7c7b11..c5ef453f52 100644
--- a/5-Clustering/1-Visualize/README.md
+++ b/5-Clustering/1-Visualize/README.md
@@ -5,7 +5,7 @@ Clustering is a type of [Unsupervised Learning](https://wikipedia.org/wiki/Unsup
[![No One Like You by PSquare](https://img.youtube.com/vi/ty2advRiWJM/0.jpg)](https://youtu.be/ty2advRiWJM "No One Like You by PSquare")
> 🎥 Click the image above for a video. While you're studying machine learning with clustering, enjoy some Nigerian Dance Hall tracks - this is a highly rated song from 2014 by PSquare.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/27/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/27/)
### Introduction
[Clustering](https://link.springer.com/referenceworkentry/10.1007%2F978-0-387-30164-8_124) is very useful for data exploration. Let's see if it can help discover trends and patterns in the way Nigerian audiences consume music.
@@ -104,7 +104,7 @@ Clustering as a technique is greatly aided by proper visualization, so let's get
1. Import the `Seaborn` package for good data visualization.
```python
- pip install seaborn
+ !pip install seaborn
```
1. Append the song data from _nigerian-songs.csv_. Load up a dataframe with some data about the songs. Get ready to explore this data by importing the libraries and dumping out the data:
@@ -317,7 +317,7 @@ In general, for clustering, you can use scatterplots to show clusters of data, s
In preparation for the next lesson, make a chart about the various clustering algorithms you might discover and use in a production environment. What kinds of problems is the clustering trying to address?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/28/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/28/)
## Review & Self Study
diff --git a/5-Clustering/1-Visualize/solution/notebook.ipynb b/5-Clustering/1-Visualize/solution/notebook.ipynb
index 1542549cec..44c15495e8 100644
--- a/5-Clustering/1-Visualize/solution/notebook.ipynb
+++ b/5-Clustering/1-Visualize/solution/notebook.ipynb
@@ -61,7 +61,7 @@
}
],
"source": [
- "pip install seaborn"
+ "!pip install seaborn"
]
},
{
diff --git a/5-Clustering/1-Visualize/translations/README.it.md b/5-Clustering/1-Visualize/translations/README.it.md
new file mode 100644
index 0000000000..0d8510b009
--- /dev/null
+++ b/5-Clustering/1-Visualize/translations/README.it.md
@@ -0,0 +1,332 @@
+# Introduzione al clustering
+
+Il clustering è un tipo di [apprendimento non supervisionato](https://wikipedia.org/wiki/Unsupervised_learning) che presuppone che un insieme di dati non sia etichettato o che i suoi input non siano abbinati a output predefiniti. Utilizza vari algoritmi per ordinare i dati non etichettati e fornire raggruppamenti in base ai modelli che individua nei dati.
+
+[![No One Like You di PSquare](https://img.youtube.com/vi/ty2advRiWJM/0.jpg)](https://youtu.be/ty2advRiWJM "No One Like You di PSquare")
+
+> 🎥 Fare clic sull'immagine sopra per un video. Mentre si studia machine learning con il clustering, si potranno gradire brani della Nigerian Dance Hall: questa è una canzone molto apprezzata del 2014 di PSquare.
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/27/)
+
+### Introduzione
+
+[Il clustering](https://link.springer.com/referenceworkentry/10.1007%2F978-0-387-30164-8_124) è molto utile per l'esplorazione dei dati. Si vedrà se può aiutare a scoprire tendenze e modelli nel modo in cui il pubblico nigeriano consuma la musica.
+
+✅ Ci si prenda un minuto per pensare agli usi del clustering. Nella vita reale, il clustering si verifica ogni volta che si ha una pila di biancheria e si devono sistemare i vestiti dei propri familiari 🧦👕👖🩲. Nella scienza dei dati, il clustering si verifica quando si tenta di analizzare le preferenze di un utente o di determinare le caratteristiche di qualsiasi insieme di dati senza etichetta. Il clustering, in un certo senso, aiuta a dare un senso al caos, come un cassetto dei calzini.
+
+[![Introduzione a ML](https://img.youtube.com/vi/esmzYhuFnds/0.jpg)](https://youtu.be/esmzYhuFnds "Introduzione al Clustering")
+
+> 🎥 Fare clic sull'immagine sopra per un video: John Guttag del MIT introduce il clustering
+
+In un ambiente professionale, il clustering può essere utilizzato per determinare cose come la segmentazione del mercato, determinare quali fasce d'età acquistano quali articoli, ad esempio. Un altro uso sarebbe il rilevamento di anomalie, forse per rilevare le frodi da un insieme di dati delle transazioni con carta di credito. Oppure si potrebbe usare il clustering per determinare i tumori in una serie di scansioni mediche.
+
+✅ Si pensi un minuto a come si potrebbe aver incontrato il clustering 'nel mondo reale', in un ambiente bancario, e-commerce o aziendale.
+
+> 🎓 È interessante notare che l'analisi dei cluster ha avuto origine nei campi dell'antropologia e della psicologia negli anni '30. Si riusce a immaginare come potrebbe essere stato utilizzato?
+
+In alternativa, lo si può utilizzare per raggruppare i risultati di ricerca, ad esempio tramite link per acquisti, immagini o recensioni. Il clustering è utile quando si dispone di un insieme di dati di grandi dimensioni che si desidera ridurre e sul quale si desidera eseguire un'analisi più granulare, quindi la tecnica può essere utilizzata per conoscere i dati prima che vengano costruiti altri modelli.
+
+✅ Una volta che i dati sono organizzati in cluster, viene assegnato un ID cluster e questa tecnica può essere utile quando si preserva la privacy di un insieme di dati; si può invece fare riferimento a un punto dati tramite il suo ID cluster, piuttosto che dati identificabili più rivelatori. Si riesce a pensare ad altri motivi per cui fare riferimento a un ID cluster piuttosto che ad altri elementi del cluster per identificarlo?
+
+In questo [modulo di apprendimento](https://docs.microsoft.com/learn/modules/train-evaluate-cluster-models?WT.mc_id=academic-15963-cxa) si approfondirà la propria comprensione delle tecniche di clustering
+
+## Iniziare con il clustering
+
+[Scikit-learn offre una vasta gamma](https://scikit-learn.org/stable/modules/clustering.html) di metodi per eseguire il clustering. Il tipo scelto dipenderà dal caso d'uso. Secondo la documentazione, ogni metodo ha diversi vantaggi. Ecco una tabella semplificata dei metodi supportati da Scikit-learn e dei loro casi d'uso appropriati:
+
+| Nome del metodo | Caso d'uso |
+| :------------------------------------------------------ | :-------------------------------------------------------------------------- |
+| K-MEANS | uso generale, induttivo |
+| Affinity propagation (Propagazione dell'affinità) | molti, cluster irregolari, induttivo |
+| Mean-shift (Spostamento medio) | molti, cluster irregolari, induttivo |
+| Spectral clustering (Raggruppamento spettrale) | pochi, cluster uniformi, trasduttivo |
+| Ward hierarchical clustering (Cluster gerarchico) | molti, cluster vincolati, trasduttivi |
+| Agglomerative clustering (Raggruppamento agglomerativo) | molte, vincolate, distanze non euclidee, trasduttive |
+| DBSCAN | geometria non piatta, cluster irregolari, trasduttivo |
+| OPTICS | geometria non piatta, cluster irregolari con densità variabile, trasduttivo |
+| Gaussian mixtures (miscele gaussiane) | geometria piana, induttiva |
+| BIRCH | insiemi di dati di grandi dimensioni con valori anomali, induttivo |
+
+> 🎓 Il modo in cui si creano i cluster ha molto a che fare con il modo in cui si raccolgono punti dati in gruppi. Si esamina un po' di vocabolario:
+>
+> 🎓 ['trasduttivo' vs. 'induttivo'](https://wikipedia.org/wiki/Transduction_(machine_learning))
+>
+> L'inferenza trasduttiva è derivata da casi di addestramento osservati che mappano casi di test specifici. L'inferenza induttiva è derivata da casi di addestramento che mappano regole generali che vengono poi applicate ai casi di test.
+>
+> Un esempio: si immagini di avere un insieme di dati che è solo parzialmente etichettato. Alcune cose sono "dischi", alcune "cd" e altre sono vuote. Il compito è fornire etichette per gli spazi vuoti. Se si scegliesse un approccio induttivo, si addestrerebbe un modello alla ricerca di "dischi" e "cd" e si applicherebbero quelle etichette ai dati non etichettati. Questo approccio avrà problemi a classificare cose che sono in realtà "cassette". Un approccio trasduttivo, d'altra parte, gestisce questi dati sconosciuti in modo più efficace poiché funziona raggruppando elementi simili e quindi applica un'etichetta a un gruppo. In questo caso, i cluster potrebbero riflettere "cose musicali rotonde" e "cose musicali quadrate".
+>
+> 🎓 [Geometria 'non piatta' (non-flat) vs. 'piatta' (flat)](https://datascience.stackexchange.com/questions/52260/terminology-flat-geometry-in-the-context-of-clustering)
+>
+> Derivato dalla terminologia matematica, la geometria non piatta rispetto a quella piatta si riferisce alla misura delle distanze tra i punti mediante metodi geometrici "piatti" ([euclidei](https://wikipedia.org/wiki/Euclidean_geometry)) o "non piatti" (non euclidei).
+>
+> "Piatto" in questo contesto si riferisce alla geometria euclidea (parti della quale vengono insegnate come geometria "piana") e non piatto si riferisce alla geometria non euclidea. Cosa ha a che fare la geometria con machine learning? Bene, come due campi che sono radicati nella matematica, ci deve essere un modo comune per misurare le distanze tra i punti nei cluster, e questo può essere fatto in modo "piatto" o "non piatto", a seconda della natura dei dati . [Le distanze euclidee](https://wikipedia.org/wiki/Euclidean_distance) sono misurate come la lunghezza di un segmento di linea tra due punti. [Le distanze non euclidee](https://wikipedia.org/wiki/Non-Euclidean_geometry) sono misurate lungo una curva. Se i dati, visualizzati, sembrano non esistere su un piano, si potrebbe dover utilizzare un algoritmo specializzato per gestirli.
+>
+> ![Infografica con geometria piatta e non piatta](../images/flat-nonflat.png)
+> Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+>
+> 🎓 ['Distanze'](https://web.stanford.edu/class/cs345a/slides/12-clustering.pdf)
+>
+> I cluster sono definiti dalla loro matrice di distanza, ad esempio le distanze tra i punti. Questa distanza può essere misurata in alcuni modi. I cluster euclidei sono definiti dalla media dei valori dei punti e contengono un 'centroide' o baricentro. Le distanze sono quindi misurate dalla distanza da quel baricentro. Le distanze non euclidee si riferiscono a "clustroidi", il punto più vicino ad altri punti. I clustroidi a loro volta possono essere definiti in vari modi.
+>
+> 🎓 ['Vincolato'](https://wikipedia.org/wiki/Constrained_clustering)
+>
+> [Constrained Clustering](https://web.cs.ucdavis.edu/~davidson/Publications/ICDMTutorial.pdf) introduce l'apprendimento 'semi-supervisionato' in questo metodo non supervisionato. Le relazioni tra i punti sono contrassegnate come "non è possibile collegare" o "è necessario collegare", quindi alcune regole sono imposte sull'insieme di dati.
+>
+> Un esempio: se un algoritmo viene applicato su un batch di dati non etichettati o semi-etichettati, i cluster che produce potrebbero essere di scarsa qualità. Nell'esempio sopra, i cluster potrebbero raggruppare "cose musicali rotonde" e "cose musicali quadrate" e "cose triangolari" e "biscotti". Se vengono dati dei vincoli, o delle regole da seguire ("l'oggetto deve essere di plastica", "l'oggetto deve essere in grado di produrre musica"), questo può aiutare a "vincolare" l'algoritmo a fare scelte migliori.
+>
+> 🎓 'Densità'
+>
+> I dati "rumorosi" sono considerati "densi". Le distanze tra i punti in ciascuno dei suoi cluster possono rivelarsi, all'esame, più o meno dense, o "affollate" e quindi questi dati devono essere analizzati con il metodo di clustering appropriato. [Questo articolo](https://www.kdnuggets.com/2020/02/understanding-density-based-clustering.html) dimostra la differenza tra l'utilizzo del clustering K-Means rispetto agli algoritmi HDBSCAN per esplorare un insieme di dati rumoroso con densità di cluster non uniforme.
+
+## Algoritmi di clustering
+
+Esistono oltre 100 algoritmi di clustering e il loro utilizzo dipende dalla natura dei dati a portata di mano. Si discutono alcuni dei principali:
+
+- **Raggruppamento gerarchico**. Se un oggetto viene classificato in base alla sua vicinanza a un oggetto vicino, piuttosto che a uno più lontano, i cluster vengono formati in base alla distanza dei loro membri da e verso altri oggetti. Il clustering agglomerativo di Scikit-learn è gerarchico.
+
+ ![Infografica sul clustering gerarchico](../images/hierarchical.png)
+ > Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+- **Raggruppamento centroide**. Questo popolare algoritmo richiede la scelta di 'k', o il numero di cluster da formare, dopodiché l'algoritmo determina il punto centrale di un cluster e raccoglie i dati attorno a quel punto. [Il clustering K-means](https://wikipedia.org/wiki/K-means_clustering) è una versione popolare del clustering centroide. Il centro è determinato dalla media più vicina, da qui il nome. La distanza al quadrato dal cluster è ridotta al minimo.
+
+ ![Infografica sul clustering del centroide](../images/centroid.png)
+ > Infografica di [Dasani Madipalli](https://twitter.com/dasani_decoded)
+
+- **Clustering basato sulla distribuzione**. Basato sulla modellazione statistica, il clustering basato sulla distribuzione è incentrato sulla determinazione della probabilità che un punto dati appartenga a un cluster e sull'assegnazione di conseguenza. I metodi di miscelazione gaussiana appartengono a questo tipo.
+
+- **Clustering basato sulla densità**. I punti dati vengono assegnati ai cluster in base alla loro densità o al loro raggruppamento l'uno intorno all'altro. I punti dati lontani dal gruppo sono considerati valori anomali o rumore. DBSCAN, Mean-shift e OPTICS appartengono a questo tipo di clustering.
+
+- **Clustering basato su griglia**. Per gli insiemi di dati multidimensionali, viene creata una griglia e i dati vengono divisi tra le celle della griglia, creando così dei cluster.
+
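+A puro titolo illustrativo, lo schizzo seguente (con dati fittizi e parametri ipotetici) mostra come alcune delle famiglie di algoritmi appena descritte si istanziano in Scikit-learn:
+
+```python
+# Schizzo illustrativo con dati fittizi: tre famiglie di algoritmi descritte sopra
+import numpy as np
+from sklearn.cluster import KMeans, DBSCAN
+from sklearn.mixture import GaussianMixture
+
+X = np.random.rand(200, 2)  # dati casuali, solo per l'esempio
+
+kmeans_labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)         # basato su centroidi
+dbscan_labels = DBSCAN(eps=0.1, min_samples=5).fit_predict(X)               # basato sulla densità
+gmm_labels = GaussianMixture(n_components=3, random_state=0).fit_predict(X) # basato sulla distribuzione
+print(kmeans_labels[:10], dbscan_labels[:10], gmm_labels[:10])
+```
+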
+## Esercizio: raggruppare i dati
+
+Il clustering come tecnica è notevolmente aiutato da una corretta visualizzazione, quindi si inizia visualizzando i dati musicali. Questo esercizio aiuterà a decidere quale dei metodi di clustering si dovranno utilizzare in modo più efficace per la natura di questi dati.
+
+1. Aprire il file _notebook.ipynb_ in questa cartella.
+
+1. Importare il pacchetto `Seaborn` per una buona visualizzazione dei dati.
+
+ ```python
+ !pip install seaborn
+ ```
+
+1. Aggiungere i dati dei brani da _nigerian-songs.csv_. Caricare un dataframe con alcuni dati sulle canzoni. Prepararsi a esplorare questi dati importando le librerie e scaricando i dati:
+
+ ```python
+ import matplotlib.pyplot as plt
+ import pandas as pd
+
+ df = pd.read_csv("../data/nigerian-songs.csv")
+ df.head()
+ ```
+
+ Controllare le prime righe di dati:
+
+ | | name | album | artist | artist_top_genre | release_date | length | popularity | danceability | acousticness | energy | instrumentalness | liveness | loudness | speechiness | tempo | time_signature |
+ | --- | ------------------------ | ---------------------------- | ------------------- | ---------------- | ------------ | ------ | ---------- | ------------ | ------------ | ------ | ---------------- | -------- | -------- | ----------- | ------- | -------------- |
+ | 0 | Sparky | Mandy & The Jungle | Cruel Santino | alternative r&b | 2019 | 144000 | 48 | 0.666 | 0.851 | 0.42 | 0.534 | 0.11 | -6.699 | 0.0829 | 133.015 | 5 |
+ | 1 | shuga rush | EVERYTHING YOU HEARD IS TRUE | Odunsi (The Engine) | afropop | 2020 | 89488 | 30 | 0.71 | 0.0822 | 0.683 | 0.000169 | 0.101 | -5.64 | 0.36 | 129.993 | 3 |
+ | 2 | LITT! | LITT! | AYLØ | indie r&b | 2018 | 207758 | 40 | 0.836 | 0.272 | 0.564 | 0.000537 | 0.11 | -7.127 | 0.0424 | 130.005 | 4 |
+ | 3 | Confident / Feeling Cool | Enjoy Your Life | Lady Donli | nigerian pop | 2019 | 175135 | 14 | 0.894 | 0.798 | 0.611 | 0.000187 | 0.0964 | -4.961 | 0.113 | 111.087 | 4 |
+ | 4 | wanted you | rare. | Odunsi (The Engine) | afropop | 2018 | 152049 | 25 | 0.702 | 0.116 | 0.833 | 0.91 | 0.348 | -6.044 | 0.0447 | 105.115 | 4 |
+
+1. Ottenere alcune informazioni sul dataframe, chiamando `info()`:
+
+ ```python
+ df.info()
+ ```
+
+ Il risultato appare così:
+
+ ```output
+
+ RangeIndex: 530 entries, 0 to 529
+ Data columns (total 16 columns):
+ # Column Non-Null Count Dtype
+ --- ------ -------------- -----
+ 0 name 530 non-null object
+ 1 album 530 non-null object
+ 2 artist 530 non-null object
+ 3 artist_top_genre 530 non-null object
+ 4 release_date 530 non-null int64
+ 5 length 530 non-null int64
+ 6 popularity 530 non-null int64
+ 7 danceability 530 non-null float64
+ 8 acousticness 530 non-null float64
+ 9 energy 530 non-null float64
+ 10 instrumentalness 530 non-null float64
+ 11 liveness 530 non-null float64
+ 12 loudness 530 non-null float64
+ 13 speechiness 530 non-null float64
+ 14 tempo 530 non-null float64
+ 15 time_signature 530 non-null int64
+ dtypes: float64(8), int64(4), object(4)
+ memory usage: 66.4+ KB
+ ```
+
+1. Ricontrollare i valori null, chiamando `isnull()` e verificando che la somma sia 0:
+
+ ```python
+ df.isnull().sum()
+ ```
+
+ Si presenta bene!
+
+ ```output
+ name 0
+ album 0
+ artist 0
+ artist_top_genre 0
+ release_date 0
+ length 0
+ popularity 0
+ danceability 0
+ acousticness 0
+ energy 0
+ instrumentalness 0
+ liveness 0
+ loudness 0
+ speechiness 0
+ tempo 0
+ time_signature 0
+ dtype: int64
+ ```
+
+1. Descrivere i dati:
+
+ ```python
+ df.describe()
+ ```
+
+ | | release_date | length | popularity | danceability | acousticness | energy | instrumentalness | liveness | loudness | speechiness | tempo | time_signature |
+ | ----- | ------------ | ----------- | ---------- | ------------ | ------------ | -------- | ---------------- | -------- | --------- | ----------- | ---------- | -------------- |
+ | count | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 |
+ | mean | 2015.390566 | 222298.1698 | 17.507547 | 0.741619 | 0.265412 | 0.760623 | 0.016305 | 0.147308 | -4.953011 | 0.130748 | 116.487864 | 3.986792 |
+ | std | 3.131688 | 39696.82226 | 18.992212 | 0.117522 | 0.208342 | 0.148533 | 0.090321 | 0.123588 | 2.464186 | 0.092939 | 23.518601 | 0.333701 |
+ | min | 1998 | 89488 | 0 | 0.255 | 0.000665 | 0.111 | 0 | 0.0283 | -19.362 | 0.0278 | 61.695 | 3 |
+ | 25% | 2014 | 199305 | 0 | 0.681 | 0.089525 | 0.669 | 0 | 0.07565 | -6.29875 | 0.0591 | 102.96125 | 4 |
+ | 50% | 2016 | 218509 | 13 | 0.761 | 0.2205 | 0.7845 | 0.000004 | 0.1035 | -4.5585 | 0.09795 | 112.7145 | 4 |
+ | 75% | 2017 | 242098.5 | 31 | 0.8295 | 0.403 | 0.87575 | 0.000234 | 0.164 | -3.331 | 0.177 | 125.03925 | 4 |
+ | max | 2020 | 511738 | 73 | 0.966 | 0.954 | 0.995 | 0.91 | 0.811 | 0.582 | 0.514 | 206.007 | 5 |
+
+> 🤔 Se si sta lavorando con il clustering, un metodo non supervisionato che non richiede dati etichettati, perché si stanno mostrando questi dati con etichette? Nella fase di esplorazione dei dati, sono utili, ma non sono necessari per il funzionamento degli algoritmi di clustering. Si potrebbero anche rimuovere le intestazioni delle colonne e fare riferimento ai dati per numero di colonna.
+
+Dare un'occhiata ai valori generali dei dati. Si nota che la popolarità può essere "0", che mostra i brani che non hanno una classifica. Quelli verranno rimossi a breve.
+
+1. Usare un grafico a barre per scoprire i generi più popolari:
+
+ ```python
+ import seaborn as sns
+
+ top = df['artist_top_genre'].value_counts()
+ plt.figure(figsize=(10,7))
+ sns.barplot(x=top[:5].index,y=top[:5].values)
+ plt.xticks(rotation=45)
+ plt.title('Top genres',color = 'blue')
+ ```
+
+ ![I più popolari](../images/popular.png)
+
+✅ Se si desidera vedere più valori superiori, modificare il valore di top `[:5]` con un valore più grande o rimuoverlo per vederli tutti.
+
+Nota, quando il genere principale è descritto come "Missing", ciò significa che Spotify non lo ha classificato, quindi va rimosso.
+
+1. Eliminare i dati mancanti escludendoli via filtro
+
+ ```python
+ df = df[df['artist_top_genre'] != 'Missing']
+ top = df['artist_top_genre'].value_counts()
+ plt.figure(figsize=(10,7))
+ sns.barplot(x=top.index,y=top.values)
+ plt.xticks(rotation=45)
+ plt.title('Top genres',color = 'blue')
+ ```
+
+ Ora ricontrollare i generi:
+
+ ![I più popolari](../images/all-genres.png)
+
+1. Di gran lunga, i primi tre generi dominano questo insieme di dati. Si pone l'attenzione su `afro dancehall`, `afropop` e `nigerian pop`, filtrando inoltre l'insieme di dati per rimuovere qualsiasi cosa con un valore di popolarità 0 (il che significa che non è stato classificato con una popolarità nell'insieme di dati e può essere considerato rumore per gli scopi attuali):
+
+ ```python
+ df = df[(df['artist_top_genre'] == 'afro dancehall') | (df['artist_top_genre'] == 'afropop') | (df['artist_top_genre'] == 'nigerian pop')]
+ df = df[(df['popularity'] > 0)]
+ top = df['artist_top_genre'].value_counts()
+ plt.figure(figsize=(10,7))
+ sns.barplot(x=top.index,y=top.values)
+ plt.xticks(rotation=45)
+ plt.title('Top genres',color = 'blue')
+ ```
+
+1. Fare un test rapido per vedere se i dati sono correlati in modo particolarmente forte:
+
+ ```python
+ corrmat = df.corr()
+ f, ax = plt.subplots(figsize=(12, 9))
+ sns.heatmap(corrmat, vmax=.8, square=True)
+ ```
+
+ ![correlazioni](../images/correlation.png)
+
+ L'unica forte correlazione è tra `energy` e `loudness` (volume), il che non è troppo sorprendente, dato che la musica ad alto volume di solito è piuttosto energica. Altrimenti, le correlazioni sono relativamente deboli. Sarà interessante vedere cosa può fare un algoritmo di clustering di questi dati.
+
+ > 🎓 Notare che la correlazione non implica la causalità! Ci sono prove di correlazione ma nessuna prova di causalità. Un [sito web divertente](https://tylervigen.com/spurious-correlations) ha alcune immagini che enfatizzano questo punto.
+
+C'è qualche convergenza in questo insieme di dati intorno alla popolarità e alla ballabilità percepite di una canzone? Una FacetGrid mostra che ci sono cerchi concentrici che si allineano, indipendentemente dal genere. Potrebbe essere che i gusti nigeriani convergano ad un certo livello di ballabilità per questo genere?
+
+✅ Provare diversi punti dati (energy, loudness, speechiness) e più o diversi generi musicali. Cosa si può scoprire? Dare un'occhiata alla tabella con `df.describe()` per vedere la diffusione generale dei punti dati.
+
+### Esercizio - distribuzione dei dati
+
+Questi tre generi sono significativamente differenti nella percezione della loro ballabilità, in base alla loro popolarità?
+
+1. Esaminare la distribuzione dei dati sui tre principali generi per la popolarità e la ballabilità lungo un dato asse x e y.
+
+ ```python
+ sns.set_theme(style="ticks")
+
+ g = sns.jointplot(
+ data=df,
+ x="popularity", y="danceability", hue="artist_top_genre",
+ kind="kde",
+ )
+ ```
+
+ Si possono scoprire cerchi concentrici attorno a un punto di convergenza generale, che mostra la distribuzione dei punti.
+
+ > 🎓 Si noti che questo esempio utilizza un grafico KDE (Kernel Density Estimate) che rappresenta i dati utilizzando una curva di densità di probabilità continua. Questo consente di interpretare i dati quando si lavora con più distribuzioni.
+
+ In generale, i tre generi si allineano liberamente in termini di popolarità e ballabilità. Determinare i cluster in questi dati vagamente allineati sarà una sfida:
+
+ ![distribuzione](../images/distribution.png)
+
+1. Creare un grafico a dispersione:
+
+ ```python
+ sns.FacetGrid(df, hue="artist_top_genre", size=5) \
+ .map(plt.scatter, "popularity", "danceability") \
+ .add_legend()
+ ```
+
+ Un grafico a dispersione degli stessi assi mostra un modello di convergenza simile
+
+ ![Facetgrid](../images/facetgrid.png)
+
+In generale, per il clustering è possibile utilizzare i grafici a dispersione per mostrare i cluster di dati, quindi è molto utile padroneggiare questo tipo di visualizzazione. Nella prossima lezione, si prenderanno questi dati filtrati e si utilizzerà il clustering k-means per scoprire gruppi in questi dati che si sovrappongono in modi interessanti.
+
+---
+
+## 🚀 Sfida
+
+In preparazione per la lezione successiva, creare un grafico sui vari algoritmi di clustering che si potrebbero scoprire e utilizzare in un ambiente di produzione. Che tipo di problemi sta cercando di affrontare il clustering?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/28/)
+
+## Revisione e Auto Apprendimento
+
+Prima di applicare gli algoritmi di clustering, come si è appreso, è una buona idea comprendere la natura del proprio insieme di dati. Leggere di più su questo argomento [qui](https://www.kdnuggets.com/2019/10/right-clustering-algorithm.html)
+
+[Questo utile articolo](https://www.freecodecamp.org/news/8-clustering-algorithms-in-machine-learning-that-all-data-scientists-should-know/) illustra i diversi modi in cui si comportano i vari algoritmi di clustering, date diverse forme di dati.
+
+## Compito
+
+[Ricercare altre visualizzazioni per il clustering](assignment.it.md)
diff --git a/5-Clustering/1-Visualize/translations/README.zh-cn.md b/5-Clustering/1-Visualize/translations/README.zh-cn.md
new file mode 100644
index 0000000000..e3161473b2
--- /dev/null
+++ b/5-Clustering/1-Visualize/translations/README.zh-cn.md
@@ -0,0 +1,336 @@
+# 介绍聚类
+
+聚类是一种无监督学习,它假定数据集未标记或其输入与预定义的输出不匹配。它使用各种算法对未标记的数据进行排序,并根据它在数据中识别的模式提供分组。
+[![No One Like You by PSquare](https://img.youtube.com/vi/ty2advRiWJM/0.jpg)](https://youtu.be/ty2advRiWJM "No One Like You by PSquare")
+
+> 🎥 点击上面的图片观看视频。当您通过聚类学习机器学习时,请欣赏一些尼日利亚舞厅曲目 - 这是 PSquare 在 2014 年发行的一首广受好评的歌曲。
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/27/)
+### 介绍
+
+[聚类](https://link.springer.com/referenceworkentry/10.1007%2F978-0-387-30164-8_124)对于数据探索非常有用。让我们看看它是否有助于发现尼日利亚观众消费音乐的趋势和模式。
+
+✅ 花一点时间思考聚类的用途。在现实生活中,每当你面对一堆待洗衣物、需要把家人的衣服分拣开时,就是在进行聚类🧦👕👖🩲。在数据科学中,当尝试分析用户的偏好或确定任何未标记数据集的特征时,就会用到聚类。在某种程度上,聚类有助于理解杂乱的状态,就像整理一个袜子抽屉。
+
+[![Introduction to ML](https://img.youtube.com/vi/esmzYhuFnds/0.jpg)](https://youtu.be/esmzYhuFnds "Introduction to Clustering")
+
+> 🎥单击上图观看视频:麻省理工学院的 John Guttag 介绍聚类
+
+在专业环境中,聚类可用于确定诸如市场细分之类的事情,例如确定哪些年龄组购买哪些商品。另一个用途是异常检测,可能是从信用卡交易数据集中检测欺诈。或者您可以使用聚类来确定一批医学扫描中的肿瘤。
+
+✅ 想一想您是如何在银行、电子商务或商业环境中“意外”遇到聚类的。
+
+> 🎓有趣的是,聚类分析起源于 1930 年代的人类学和心理学领域。你能想象它是如何被使用的吗?
+
+或者,您可以使用它对搜索结果进行分组 - 例如,通过购物链接、图片或评论。当您有一个大型数据集想要减少并且想要对其执行更细粒度的分析时,聚类非常有用,因此该技术可用于在构建其他模型之前了解数据。
+
+✅ 一旦你的数据被组织成聚类,你就可以为它分配一个聚类 ID。这个技术在保护数据集隐私时很有用:你可以通过聚类 ID 来引用数据点,而不是使用更容易暴露身份的数据。你还能想到哪些使用聚类 ID、而不是聚类中其他元素来识别数据的理由吗?
+
+在此[学习模块中](https://docs.microsoft.com/learn/modules/train-evaluate-cluster-models?WT.mc_id=academic-15963-cxa)加深您对聚类技术的理解
+
+## 聚类入门
+
+[Scikit-learn ](https://scikit-learn.org/stable/modules/clustering.html)提供了大量的方法来执行聚类。您选择的类型将取决于您的用例。根据文档,每种方法都有不同的好处。以下是 Scikit-learn 支持的方法及其适当用例的简化表:
+
+| 方法名称 | 用例 |
+| ---------------------------- | -------------------------------------------------- |
+| K-Means | 通用目的,归纳的 |
+| Affinity propagation | 许多,不均匀的聚类,归纳的 |
+| Mean-shift | 许多,不均匀的聚类,归纳的 |
+| Spectral clustering | 少数,甚至聚类,转导的 |
+| Ward hierarchical clustering | 许多,受约束的聚类,转导的 |
+| Agglomerative clustering | 许多,受约束的,非欧几里得距离,转导的 |
+| DBSCAN | 非平面几何,不均匀聚类,转导的 |
+| OPTICS | 不平坦的几何形状,具有可变密度的不均匀聚类,转导的 |
+| Gaussian mixtures | 平面几何,归纳的 |
+| BIRCH | 具有异常值的大型数据集,归纳的 |
+
+> 🎓我们如何创建聚类与我们如何将数据点收集到组中有很大关系。让我们分析一些词汇:
+>
+> 🎓 [“转导”与“归纳”](https://wikipedia.org/wiki/Transduction_(machine_learning))
+>
+> 转导推理源自观察到的映射到特定测试用例的训练用例。归纳推理源自映射到一般规则的训练案例,然后才应用于测试案例。
+>
+> 示例:假设您有一个仅部分标记的数据集。有些东西是“记录”,有些是“CD”,有些是空白的。您的工作是为空白提供标签。如果您选择归纳方法,您将训练一个寻找“记录”和“CD”的模型,并将这些标签应用于未标记的数据。这种方法将难以对实际上是“盒式磁带”的东西进行分类。另一方面,转导方法可以更有效地处理这些未知数据,因为它可以将相似的项目组合在一起,然后将标签应用于一个组。在这种情况下,聚类可能反映“圆形音乐事物”和“方形音乐事物”。
+>
+> 🎓 [“非平面”与“平面”几何](https://datascience.stackexchange.com/questions/52260/terminology-flat-geometry-in-the-context-of-clustering)
+>
+> 源自数学术语,非平面与平面几何是指通过“平面”([欧几里德](https://wikipedia.org/wiki/Euclidean_geometry))或“非平面”(非欧几里得)几何方法测量点之间的距离。
+>
+> 在此上下文中,“平面”是指欧几里得几何(其中一部分被教导为“平面”几何),而非平面是指非欧几里得几何。几何与机器学习有什么关系?好吧,作为植根于数学的两个领域,必须有一种通用的方法来测量聚类中点之间的距离,并且可以以“平坦”(flat)或“非平坦”(non-flat)的方式完成,具体取决于数据的性质. [欧几里得距离](https://wikipedia.org/wiki/Euclidean_distance)测量为两点之间线段的长度。[非欧距离](https://wikipedia.org/wiki/Non-Euclidean_geometry)是沿曲线测量的。如果您的可视化数据似乎不存在于平面上,您可能需要使用专门的算法来处理它。
+>
+> ![Flat vs Nonflat Geometry Infographic](../images/flat-nonflat.png)
+> [Dasani Madipalli ](https://twitter.com/dasani_decoded)作图
+>
+> 🎓 ['距离'](https://web.stanford.edu/class/cs345a/slides/12-clustering.pdf)
+>
+> 聚类由它们的距离矩阵定义,例如点之间的距离。这个距离可以通过几种方式来测量。欧几里得聚类由点值的平均值定义,并包含“质心”或中心点。因此,距离是通过到该质心的距离来测量的。非欧式距离指的是“聚类中心”,即离其他点最近的点。聚类中心又可以用各种方式定义。
+>
+> 🎓 ['约束'](https://wikipedia.org/wiki/Constrained_clustering)
+>
+> [约束聚类](https://web.cs.ucdavis.edu/~davidson/Publications/ICDMTutorial.pdf)将“半监督”学习引入到这种无监督方法中。点之间的关系被标记为“无法链接”或“必须链接”,因此对数据集强加了一些规则。
+>
+> 一个例子:如果一个算法在一批未标记或半标记的数据上不受约束,它产生的聚类质量可能很差。在上面的示例中,聚类可能将“圆形音乐事物”和“方形音乐事物”以及“三角形事物”和“饼干”分组。如果给出一些约束或要遵循的规则(“物品必须由塑料制成”、“物品需要能够产生音乐”),这可以帮助“约束”算法做出更好的选择。
+>
+> 🎓 '密度'
+>
+> “嘈杂”的数据被认为是“密集的”。在检查时,每个聚类中的点之间的距离可能或多或少地密集或“拥挤”,因此需要使用适当的聚类方法分析这些数据。[本文](https://www.kdnuggets.com/2020/02/understanding-density-based-clustering.html)演示了使用 K-Means 聚类与 HDBSCAN 算法探索具有不均匀聚类密度的嘈杂数据集之间的区别。
+
+## 聚类算法
+
+有超过 100 种聚类算法,它们的使用取决于手头数据的性质。让我们讨论一些主要的:
+
+- **层次聚类**。如果一个对象是根据其与附近对象的接近程度而不是较远对象来分类的,则聚类是根据其成员与其他对象之间的距离来形成的。Scikit-learn 的凝聚聚类是分层的。
+
+ ![Hierarchical clustering Infographic](../images/hierarchical.png)
+
+ > [Dasani Madipalli ](https://twitter.com/dasani_decoded)作图
+
+- **质心聚类**。这种流行的算法需要选择“k”或要形成的聚类数量,然后算法确定聚类的中心点并围绕该点收集数据。[K-means 聚类](https://wikipedia.org/wiki/K-means_clustering)是质心聚类的流行版本。中心由最近的平均值确定,因此叫做质心。与聚类的平方距离被最小化。
+
+ ![Centroid clustering Infographic](../images/centroid.png)
+
+ > [Dasani Madipalli](https://twitter.com/dasani_decoded)作图
+
+- **基于分布的聚类**。基于统计建模,基于分布的聚类中心确定一个数据点属于一个聚类的概率,并相应地分配它。高斯混合方法属于这种类型。
+
+- **基于密度的聚类**。数据点根据它们的密度或它们彼此的分组分配给聚类。远离该组的数据点被视为异常值或噪声。DBSCAN、Mean-shift 和 OPTICS 属于此类聚类。
+
+- **基于网格的聚类**。对于多维数据集,创建一个网格并将数据划分到网格的单元格中,从而创建聚类。
+
+
+
+## 练习 - 对你的数据进行聚类
+
+适当的可视化对聚类作为一种技术有很大帮助,所以让我们从可视化我们的音乐数据开始。这个练习将帮助我们决定我们应该最有效地使用哪种聚类方法来处理这些数据的性质。
+
+1. 打开此文件夹中的*notebook.ipynb*文件。
+
+1. 导入`Seaborn`包以获得良好的数据可视化。
+
+ ```python
+ !pip install seaborn
+ ```
+
+1. 附加来自*nigerian-songs.csv*的歌曲数据。加载包含有关歌曲的一些数据的数据帧。准备好通过导入库和转储数据来探索这些数据:
+
+ ```python
+ import matplotlib.pyplot as plt
+ import pandas as pd
+
+ df = pd.read_csv("../data/nigerian-songs.csv")
+ df.head()
+ ```
+
+ 检查前几行数据:
+
+ | | name | album | artist | artist_top_genre | release_date | length | popularity | danceability | acousticness | energy | instrumentalness | liveness | loudness | speechiness | tempo | time_signature |
+ | --- | ------------------------ | ---------------------------- | ------------------- | ---------------- | ------------ | ------ | ---------- | ------------ | ------------ | ------ | ---------------- | -------- | -------- | ----------- | ------- | -------------- |
+ | 0 | Sparky | Mandy & The Jungle | Cruel Santino | alternative r&b | 2019 | 144000 | 48 | 0.666 | 0.851 | 0.42 | 0.534 | 0.11 | -6.699 | 0.0829 | 133.015 | 5 |
+ | 1 | shuga rush | EVERYTHING YOU HEARD IS TRUE | Odunsi (The Engine) | afropop | 2020 | 89488 | 30 | 0.71 | 0.0822 | 0.683 | 0.000169 | 0.101 | -5.64 | 0.36 | 129.993 | 3 |
+ | 2 | LITT! | LITT! | AYLØ | indie r&b | 2018 | 207758 | 40 | 0.836 | 0.272 | 0.564 | 0.000537 | 0.11 | -7.127 | 0.0424 | 130.005 | 4 |
+ | 3 | Confident / Feeling Cool | Enjoy Your Life | Lady Donli | nigerian pop | 2019 | 175135 | 14 | 0.894 | 0.798 | 0.611 | 0.000187 | 0.0964 | -4.961 | 0.113 | 111.087 | 4 |
+ | 4 | wanted you | rare. | Odunsi (The Engine) | afropop | 2018 | 152049 | 25 | 0.702 | 0.116 | 0.833 | 0.91 | 0.348 | -6.044 | 0.0447 | 105.115 | 4 |
+
+1. 获取有关数据帧的一些信息,调用`info()`:
+
+ ```python
+ df.info()
+ ```
+
+ 输出看起来像这样:
+
+ ```output
+
+ RangeIndex: 530 entries, 0 to 529
+ Data columns (total 16 columns):
+ # Column Non-Null Count Dtype
+ --- ------ -------------- -----
+ 0 name 530 non-null object
+ 1 album 530 non-null object
+ 2 artist 530 non-null object
+ 3 artist_top_genre 530 non-null object
+ 4 release_date 530 non-null int64
+ 5 length 530 non-null int64
+ 6 popularity 530 non-null int64
+ 7 danceability 530 non-null float64
+ 8 acousticness 530 non-null float64
+ 9 energy 530 non-null float64
+ 10 instrumentalness 530 non-null float64
+ 11 liveness 530 non-null float64
+ 12 loudness 530 non-null float64
+ 13 speechiness 530 non-null float64
+ 14 tempo 530 non-null float64
+ 15 time_signature 530 non-null int64
+ dtypes: float64(8), int64(4), object(4)
+ memory usage: 66.4+ KB
+ ```
+
+1. 通过调用`isnull()`和验证总和为 0 来仔细检查空值:
+
+ ```python
+ df.isnull().sum()
+ ```
+
+ 看起来不错:
+
+ ```output
+ name 0
+ album 0
+ artist 0
+ artist_top_genre 0
+ release_date 0
+ length 0
+ popularity 0
+ danceability 0
+ acousticness 0
+ energy 0
+ instrumentalness 0
+ liveness 0
+ loudness 0
+ speechiness 0
+ tempo 0
+ time_signature 0
+ dtype: int64
+ ```
+
+1. 描述数据:
+
+ ```python
+ df.describe()
+ ```
+
+ | | release_date | length | popularity | danceability | acousticness | energy | instrumentalness | liveness | loudness | speechiness | tempo | time_signature |
+ | ----- | ------------ | ----------- | ---------- | ------------ | ------------ | -------- | ---------------- | -------- | --------- | ----------- | ---------- | -------------- |
+ | count | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 530 |
+ | mean | 2015.390566 | 222298.1698 | 17.507547 | 0.741619 | 0.265412 | 0.760623 | 0.016305 | 0.147308 | -4.953011 | 0.130748 | 116.487864 | 3.986792 |
+ | std | 3.131688 | 39696.82226 | 18.992212 | 0.117522 | 0.208342 | 0.148533 | 0.090321 | 0.123588 | 2.464186 | 0.092939 | 23.518601 | 0.333701 |
+ | min | 1998 | 89488 | 0 | 0.255 | 0.000665 | 0.111 | 0 | 0.0283 | -19.362 | 0.0278 | 61.695 | 3 |
+ | 25% | 2014 | 199305 | 0 | 0.681 | 0.089525 | 0.669 | 0 | 0.07565 | -6.29875 | 0.0591 | 102.96125 | 4 |
+ | 50% | 2016 | 218509 | 13 | 0.761 | 0.2205 | 0.7845 | 0.000004 | 0.1035 | -4.5585 | 0.09795 | 112.7145 | 4 |
+ | 75% | 2017 | 242098.5 | 31 | 0.8295 | 0.403 | 0.87575 | 0.000234 | 0.164 | -3.331 | 0.177 | 125.03925 | 4 |
+ | max | 2020 | 511738 | 73 | 0.966 | 0.954 | 0.995 | 0.91 | 0.811 | 0.582 | 0.514 | 206.007 | 5 |
+
+> 🤔如果我们正在使用聚类,一种不需要标记数据的无监督方法,为什么我们用标签显示这些数据?在数据探索阶段,它们派上用场,但它们不是聚类算法工作所必需的。您也可以删除列标题并按列号引用数据。
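+
+例如,下面这个小示例(仅作演示,假设 `df` 是上面加载的数据帧)展示了如何按列号而不是列名来引用数据:
+
+```python
+# 仅作演示:不使用列名,按列号 6(即 popularity 列)引用数据
+df.iloc[:, 6].head()
+```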
+
+查看这些数据的总体取值。请注意,流行度(popularity)可能为“0”,表示这首歌曲没有排名。我们稍后会删除这些数据。
+
+1. 使用条形图找出最受欢迎的类型:
+
+ ```python
+ import seaborn as sns
+
+ top = df['artist_top_genre'].value_counts()
+ plt.figure(figsize=(10,7))
+ sns.barplot(x=top[:5].index,y=top[:5].values)
+ plt.xticks(rotation=45)
+ plt.title('Top genres',color = 'blue')
+ ```
+
+ ![most popular](../images/popular.png)
+
+✅ 如果您想查看更多靠前的值,请将 `[:5]` 更改为更大的值,或将其删除以查看全部。
+
+请注意,当顶级流派被标为“Missing”(缺失)时,这意味着 Spotify 没有对其进行分类,所以让我们把它剔除掉。
+
+1. 通过过滤的方式剔除这些缺失数据:
+
+ ```python
+ df = df[df['artist_top_genre'] != 'Missing']
+ top = df['artist_top_genre'].value_counts()
+ plt.figure(figsize=(10,7))
+ sns.barplot(x=top.index,y=top.values)
+ plt.xticks(rotation=45)
+ plt.title('Top genres',color = 'blue')
+ ```
+
+ 现在重新检查genres:
+
+ ![most popular](../images/all-genres.png)
+
+1. 到目前为止,前三大流派主导了这个数据集。让我们专注于 `afro dancehall`、`afropop` 和 `nigerian pop`,并进一步过滤数据集,删除流行度为 0 的所有数据(流行度为 0 意味着它在数据集中没有流行度排名,就我们的目的而言可以被视为噪音):
+
+ ```python
+ df = df[(df['artist_top_genre'] == 'afro dancehall') | (df['artist_top_genre'] == 'afropop') | (df['artist_top_genre'] == 'nigerian pop')]
+ df = df[(df['popularity'] > 0)]
+ top = df['artist_top_genre'].value_counts()
+ plt.figure(figsize=(10,7))
+ sns.barplot(x=top.index,y=top.values)
+ plt.xticks(rotation=45)
+ plt.title('Top genres',color = 'blue')
+ ```
+
+1. 做一个快速测试,看看数据是否以任何特别强的方式相关:
+
+ ```python
+ corrmat = df.corr()
+ f, ax = plt.subplots(figsize=(12, 9))
+ sns.heatmap(corrmat, vmax=.8, square=True)
+ ```
+
+ ![correlations](../images/correlation.png)
+
+   > 唯一的强相关性存在于 `energy` 和 `loudness` 之间,这并不奇怪,因为嘈杂的音乐通常非常有活力。除此之外,相关性都比较弱。看看聚类算法能如何处理这些数据会很有趣。
+ >
+ > > 🎓请注意,相关性并不意味着因果关系!我们有相关性的证据,但没有因果关系的证据。一个[有趣的网站](https://tylervigen.com/spurious-correlations)有一些强调这一点的视觉效果。
+
+这个数据集是否围绕歌曲的流行度和可舞性有任何收敛?FacetGrid 显示无论流派如何,都有同心圆排列。对于这种类型,尼日利亚人的口味是否会在某种程度的可舞性上趋于一致?
+
+✅尝试不同的数据点(能量、响度、语音)和更多或不同的音乐类型。你能发现什么?查看`df.describe()`表格以了解数据点的一般分布。
+
+### 练习 - 数据分布
+
+这三种流派是否因其受欢迎程度而对其可舞性的看法有显着差异?
+
+1. 检查前三大流派的数据在给定的 x 轴和 y 轴上关于流行度和可舞性的分布。
+
+ ```python
+ sns.set_theme(style="ticks")
+
+ g = sns.jointplot(
+ data=df,
+ x="popularity", y="danceability", hue="artist_top_genre",
+ kind="kde",
+ )
+ ```
+
+ 您可以发现围绕一般收敛点的同心圆,显示点的分布。
+
+ > 🎓请注意,此示例使用 KDE(核密度估计)图,该图使用连续概率密度曲线表示数据。这允许我们在处理多个分布时解释数据。
+
+ 总的来说,这三种流派在流行度和可舞性方面松散地对齐。在这种松散对齐的数据中确定聚类将是一个挑战:
+
+ ![distribution](../images/distribution.png)
+
+1. 创建散点图:
+
+ ```python
+ sns.FacetGrid(df, hue="artist_top_genre", size=5) \
+ .map(plt.scatter, "popularity", "danceability") \
+ .add_legend()
+ ```
+
+   使用相同坐标轴的散点图显示了类似的收敛模式。
+
+ ![Facetgrid](../images/facetgrid.png)
+
+一般来说,对于聚类,你可以使用散点图来展示数据的聚类,所以掌握这种类型的可视化是非常有用的。在下一课中,我们将使用过滤后的数据并使用 k-means 聚类来发现这些数据中以有趣方式重叠的组。
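+
+作为补充,下面是一个简短的可视化草图(假设 `df` 是上面过滤后的数据帧,并已 `import seaborn as sns`),用 `pairplot` 一次查看多对特征之间的两两散点图:
+
+```python
+# 按流派着色,一次绘制多对特征的两两散点图(仅作演示)
+sns.pairplot(
+    df[['popularity', 'danceability', 'energy', 'acousticness', 'artist_top_genre']],
+    hue='artist_top_genre', height=2.5
+)
+```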
+
+---
+
+## 🚀挑战
+
+为下一课做准备,制作一张图表,说明您可能会在生产环境中发现和使用的各种聚类算法。
+
+聚类试图解决什么样的问题?
+
+## [课后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/28/)
+
+## 复习与自学
+
+正如我们所了解的,在应用聚类算法之前,了解数据集的性质是一个好主意。在[此处](https://www.kdnuggets.com/2019/10/right-clustering-algorithm.html)阅读有关此主题的更多信息。
+
+[这篇有用的文章](https://www.freecodecamp.org/news/8-clustering-algorithms-in-machine-learning-that-all-data-scientists-should-know/)将引导您了解在数据形状不同的情况下,各种聚类算法会有哪些不同的行为方式。
+
+## 作业
+
+[研究用于聚类的其他可视化](./assignment.zh-cn.md)
diff --git a/5-Clustering/1-Visualize/translations/assignment.it.md b/5-Clustering/1-Visualize/translations/assignment.it.md
new file mode 100644
index 0000000000..dad3d7081d
--- /dev/null
+++ b/5-Clustering/1-Visualize/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Ricercare altre visualizzazioni per il clustering
+
+## Istruzioni
+
+In questa lezione, si è lavorato con alcune tecniche di visualizzazione per capire come tracciare i propri dati in preparazione per il clustering. I grafici a dispersione, in particolare, sono utili per trovare gruppi di oggetti. Ricercare modi diversi e librerie diverse per creare grafici a dispersione e documentare il proprio lavoro in un notebook. Si possono utilizzare i dati di questa lezione, di altre lezioni o dei dati che si sono procurati in autonomia (per favore citare la fonte, comunque, nel proprio notebook). Tracciare alcuni dati usando i grafici a dispersione e spiegare cosa si scopre.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | -------------------------------------------------------------- | ---------------------------------------------------------------------------------------- | ----------------------------------- |
+| | Viene presentato un notebook con cinque grafici a dispersione ben documentati | Un notebook viene presentato con meno di cinque grafici a dispersione ed è meno ben documentato | Viene presentato un notebook incompleto |
diff --git a/5-Clustering/1-Visualize/translations/assignment.zh-cn.md b/5-Clustering/1-Visualize/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..48f5ea24a4
--- /dev/null
+++ b/5-Clustering/1-Visualize/translations/assignment.zh-cn.md
@@ -0,0 +1,13 @@
+# 研究用于聚类的其他可视化
+
+## 说明
+
+在本节课中,您使用了一些可视化技术来掌握绘制数据图,为聚类数据做准备。散点图在寻找一组对象时尤其有用。研究不同的方法和不同的库来创建散点图,并在notebook上记录你的工作。你可以使用这节课的数据,其他课的数据,或者你自己的数据(但是,请把它的来源记在你的notebook上)。用散点图绘制一些数据,并解释你的发现。
+
+## 评判规则
+
+
+| 评判标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | -------------------------------- | ----------------------------------------------- | -------------------- |
+| | notebook上有五个详细文档的散点图 | notebook上的散点图少于5个,而且文档写得不太详细 | 一个不完整的notebook |
+
diff --git a/5-Clustering/2-K-Means/README.md b/5-Clustering/2-K-Means/README.md
index 153932e631..3f4c5fca6b 100644
--- a/5-Clustering/2-K-Means/README.md
+++ b/5-Clustering/2-K-Means/README.md
@@ -4,7 +4,7 @@
> 🎥 Click the image above for a video: Andrew Ng explains clustering
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/29/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/29/)
In this lesson, you will learn how to create clusters using Scikit-learn and the Nigerian music dataset you imported earlier. We will cover the basics of K-Means for Clustering. Keep in mind that, as you learned in the earlier lesson, there are many ways to work with clusters and the method you use depends on your data. We will try K-Means as it's the most common clustering technique. Let's get started!
@@ -224,7 +224,7 @@ Previously, you surmised that, because you have targeted 3 song genres, you shou
## Variance
-Variance is defined as "the average of the squared differences from the Mean."[source](https://www.mathsisfun.com/data/standard-deviation.html) In the context of this clustering problem, it refers to data that the numbers of our dataset tend to diverge a bit too much from the mean.
+Variance is defined as "the average of the squared differences from the Mean" [source](https://www.mathsisfun.com/data/standard-deviation.html). In the context of this clustering problem, it refers to the tendency of the numbers in our dataset to diverge a bit too much from the mean.
✅ This is a great moment to think about all the ways you could correct this issue. Tweak the data a bit more? Use different columns? Use a different algorithm? Hint: Try [scaling your data](https://www.mygreatlearning.com/blog/learning-data-science-with-k-means-clustering/) to normalize it and test other columns.
@@ -238,11 +238,11 @@ Spend some time with this notebook, tweaking parameters. Can you improve the acc
Hint: Try to scale your data. There's commented code in the notebook that adds standard scaling to make the data columns resemble each other more closely in terms of range. You'll find that while the silhouette score goes down, the 'kink' in the elbow graph smooths out. This is because leaving the data unscaled allows data with less variance to carry more weight. Read a bit more on this problem [here](https://stats.stackexchange.com/questions/21222/are-mean-normalization-and-feature-scaling-needed-for-k-means-clustering/21226#21226).
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/30/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/30/)
## Review & Self Study
-Take a look at Stanford's K-Means Simulator [here](https://stanford.edu/class/engr108/visualizations/kmeans/kmeans.html). You can use this tool to visualize sample data points and determine its centroids. With fresh data, click 'update' to see how long it takes to find convergence. You can edit the data's randomness, numbers of clusters and numbers of centroids. Does this help you get an idea of how the data can be grouped?
+Take a look at a K-Means Simulator [such as this one](https://user.ceng.metu.edu.tr/~akifakkus/courses/ceng574/k-means/). You can use this tool to visualize sample data points and determine its centroids. You can edit the data's randomness, numbers of clusters and numbers of centroids. Does this help you get an idea of how the data can be grouped?
Also, take a look at [this handout on k-means](https://stanford.edu/~cpiech/cs221/handouts/kmeans.html) from Stanford.
diff --git a/5-Clustering/2-K-Means/translations/README.it.md b/5-Clustering/2-K-Means/translations/README.it.md
new file mode 100644
index 0000000000..9c94d19f57
--- /dev/null
+++ b/5-Clustering/2-K-Means/translations/README.it.md
@@ -0,0 +1,251 @@
+# Clustering K-Means
+
+[![Andrew Ng spiega Clustering](https://img.youtube.com/vi/hDmNF9JG3lo/0.jpg)](https://youtu.be/hDmNF9JG3lo " Andrew Ng spiega Clustering")
+
+> 🎥 Fare clic sull'immagine sopra per un video: Andrew Ng spiega il clustering
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/29/)
+
+In questa lezione si imparerà come creare cluster utilizzando Scikit-learn e l'insieme di dati di musica nigeriana importato in precedenza. Si tratteranno le basi di K-Means per Clustering. Si tenga presente che, come appreso nella lezione precedente, ci sono molti modi per lavorare con i cluster e il metodo usato dipende dai propri dati. Si proverà K-Means poiché è la tecnica di clustering più comune. Si inizia!
+
+Termini che si impareranno a conoscere:
+
+- Silhouette scoring (punteggio silhouette)
+- Elbow method (metodo del gomito)
+- Inerzia
+- Varianza
+
+## Introduzione
+
+[K-Means Clustering](https://wikipedia.org/wiki/K-means_clustering) è un metodo derivato dal campo dell'elaborazione del segnale. Viene utilizzato per dividere e partizionare gruppi di dati in "k" cluster utilizzando una serie di osservazioni. Ogni osservazione serve ad assegnare un dato punto dati alla "media" più vicina, ovvero al punto centrale di un cluster.
+
+I cluster possono essere visualizzati come [diagrammi di Voronoi](https://wikipedia.org/wiki/Voronoi_diagram), che includono un punto (o 'seme') e la sua regione corrispondente.
+
+![diagramma di voronoi](../images/voronoi.png)
+
+> Infografica di [Jen Looper](https://twitter.com/jenlooper)
+
+Il processo di clustering K-Means [viene eseguito in tre fasi](https://scikit-learn.org/stable/modules/clustering.html#k-means):
+
+1. L'algoritmo seleziona il numero k di punti centrali campionando dall'insieme di dati. Dopo questo, esegue un ciclo:
+ 1. Assegna ogni campione al centroide più vicino.
+ 2. Crea nuovi centroidi prendendo il valore medio di tutti i campioni assegnati ai centroidi precedenti.
+ 3. Quindi, calcola la differenza tra il nuovo e il vecchio centroide e ripete finché i centroidi non sono stabilizzati.
+
+Uno svantaggio dell'utilizzo di K-Means include il fatto che sarà necessario stabilire 'k', ovvero il numero di centroidi. Fortunatamente il "metodo del gomito" aiuta a stimare un buon valore iniziale per "k". Si proverà in un minuto.
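+
+Per rendere più concreti i tre passaggi descritti sopra, ecco uno schizzo minimale e puramente illustrativo (non è il codice della lezione) che li implementa con NumPy, ipotizzando che `X` sia un array numerico di forma (n campioni, n caratteristiche):
+
+```python
+import numpy as np
+
+def kmeans_sketch(X, k, n_iter=100, seed=0):
+    rng = np.random.default_rng(seed)
+    # 1. Seleziona k centroidi iniziali campionando dall'insieme di dati
+    centroids = X[rng.choice(len(X), size=k, replace=False)]
+    for _ in range(n_iter):
+        # 2. Assegna ogni campione al centroide più vicino
+        distances = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
+        labels = distances.argmin(axis=1)
+        # 3. Ricalcola i centroidi come media dei campioni assegnati
+        #    (schizzo semplificato: si assume che nessun cluster resti vuoto)
+        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
+        if np.allclose(new_centroids, centroids):
+            break  # i centroidi si sono stabilizzati
+        centroids = new_centroids
+    return labels, centroids
+```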
+
+## Prerequisito
+
+Si lavorerà nel file _notebook.ipynb_ di questa lezione che include l'importazione dei dati e la pulizia preliminare fatta nell'ultima lezione.
+
+## Esercizio - preparazione
+
+Iniziare dando un'altra occhiata ai dati delle canzoni.
+
+1. Creare un diagramma a scatola e baffi (boxplot), chiamando `boxplot()` per ogni colonna:
+
+ ```python
+ plt.figure(figsize=(20,20), dpi=200)
+
+ plt.subplot(4,3,1)
+ sns.boxplot(x = 'popularity', data = df)
+
+ plt.subplot(4,3,2)
+ sns.boxplot(x = 'acousticness', data = df)
+
+ plt.subplot(4,3,3)
+ sns.boxplot(x = 'energy', data = df)
+
+ plt.subplot(4,3,4)
+ sns.boxplot(x = 'instrumentalness', data = df)
+
+ plt.subplot(4,3,5)
+ sns.boxplot(x = 'liveness', data = df)
+
+ plt.subplot(4,3,6)
+ sns.boxplot(x = 'loudness', data = df)
+
+ plt.subplot(4,3,7)
+ sns.boxplot(x = 'speechiness', data = df)
+
+ plt.subplot(4,3,8)
+ sns.boxplot(x = 'tempo', data = df)
+
+ plt.subplot(4,3,9)
+ sns.boxplot(x = 'time_signature', data = df)
+
+ plt.subplot(4,3,10)
+ sns.boxplot(x = 'danceability', data = df)
+
+ plt.subplot(4,3,11)
+ sns.boxplot(x = 'length', data = df)
+
+ plt.subplot(4,3,12)
+ sns.boxplot(x = 'release_date', data = df)
+ ```
+
+ Questi dati sono un po' rumorosi: osservando ogni colonna come un boxplot, si possono vedere i valori anomali.
+
+ ![situazioni anomale](../images/boxplots.png)
+
+Si potrebbe esaminare l'insieme di dati e rimuovere questi valori anomali, ma ciò renderebbe i dati piuttosto minimi.
+
+1. Per ora, si scelgono quali colonne utilizzare per questo esercizio di clustering. Scegliere quelle con intervalli simili e codifica la colonna `artist_top_genre` come dati numerici:
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+ le = LabelEncoder()
+
+ X = df.loc[:, ('artist_top_genre','popularity','danceability','acousticness','loudness','energy')]
+
+ y = df['artist_top_genre']
+
+ X['artist_top_genre'] = le.fit_transform(X['artist_top_genre'])
+
+ y = le.transform(y)
+ ```
+
+1. Ora si deve scegliere quanti cluster scegliere come obiettivo. E' noto che ci sono 3 generi di canzoni ricavati dall'insieme di dati, quindi si prova 3:
+
+ ```python
+ from sklearn.cluster import KMeans
+
+ nclusters = 3
+ seed = 0
+
+ km = KMeans(n_clusters=nclusters, random_state=seed)
+ km.fit(X)
+
+ # Predict the cluster for each data point
+
+ y_cluster_kmeans = km.predict(X)
+ y_cluster_kmeans
+ ```
+
+Viene visualizzato un array con i cluster previsti (0, 1 o 2) per ogni riga del dataframe di dati.
+
+1. Usare questo array per calcolare un "punteggio silhouette":
+
+ ```python
+ from sklearn import metrics
+ score = metrics.silhouette_score(X, y_cluster_kmeans)
+ score
+ ```
+
+## Punteggio Silhouette
+
+Si vuole ottenere un punteggio silhouette più vicino a 1. Questo punteggio varia da -1 a 1 e, se il punteggio è 1, il cluster è denso e ben separato dagli altri cluster. Un valore vicino a 0 rappresenta cluster sovrapposti con campioni molto vicini al limite di decisione dei clusters vicini [fonte](https://dzone.com/articles/kmeans-silhouette-score-explained-with-python-exam).
+
+Il punteggio è **.53**, quindi proprio nel mezzo. Ciò indica che i dati non sono particolarmente adatti a questo tipo di clustering, ma si prosegue.
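+
+Come esperimento facoltativo (riutilizzando la `X` numerica definita sopra), uno schizzo per confrontare il punteggio silhouette con diversi valori di k potrebbe assomigliare a questo:
+
+```python
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+# Confronta il punteggio silhouette per k da 2 a 6
+for k in range(2, 7):
+    km = KMeans(n_clusters=k, random_state=0)
+    labels = km.fit_predict(X)
+    print(f"k={k}: silhouette={silhouette_score(X, labels):.3f}")
+```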
+
+### Esercizio: costruire il proprio modello
+
+1. Importare `KMeans` e avviare il processo di clustering.
+
+ ```python
+ from sklearn.cluster import KMeans
+ wcss = []
+
+ for i in range(1, 11):
+ kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
+ kmeans.fit(X)
+ wcss.append(kmeans.inertia_)
+
+ ```
+
+ Ci sono alcune parti qui che meritano una spiegazione.
+
+ > 🎓 range: queste sono le iterazioni del processo di clustering
+
+ > 🎓 random_state: "Determina la generazione di numeri casuali per l'inizializzazione del centroide."[fonte](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)
+
+ > 🎓 WCSS: "somma dei quadrati all'interno del cluster" misura la distanza media al quadrato di tutti i punti all'interno di un cluster rispetto al cluster centroid [fonte](https://medium.com/@ODSC/unsupervised-learning-evaluating-clusters-bd47eed175ce).
+
+   > 🎓 Inerzia: gli algoritmi K-Means tentano di scegliere i centroidi per ridurre al minimo l'inerzia, "una misura di quanto siano coerenti i cluster" [fonte](https://scikit-learn.org/stable/modules/clustering.html). Il valore viene aggiunto alla variabile wcss ad ogni iterazione.
+
+   > 🎓 k-means++: in [Scikit-learn](https://scikit-learn.org/stable/modules/clustering.html#k-means) puoi utilizzare l'ottimizzazione 'k-means++', che "inizializza i centroidi in modo che siano (generalmente) distanti l'uno dall'altro, portando probabilmente a risultati migliori rispetto all'inizializzazione casuale".
+
+### Metodo del gomito
+
+In precedenza, si era supposto che, poiché sono stati presi di mira 3 generi di canzoni, si dovrebbero scegliere 3 cluster. E' questo il caso?
+
+1. Usare il "metodo del gomito" per assicurarsene.
+
+ ```python
+ plt.figure(figsize=(10,5))
+ sns.lineplot(range(1, 11), wcss,marker='o',color='red')
+ plt.title('Elbow')
+ plt.xlabel('Number of clusters')
+ plt.ylabel('WCSS')
+ plt.show()
+ ```
+
+ Usare la variabile `wcss` creata nel passaggio precedente per creare un grafico che mostra dove si trova la "piegatura" nel gomito, che indica il numero ottimale di cluster. Forse **sono** 3!
+
+ ![Metodo del gomito](../images/elbow.png)
+
+## Esercizio - visualizzare i cluster
+
+1. Riprovare il processo, questa volta impostando tre cluster e visualizzare i cluster come grafico a dispersione:
+
+ ```python
+ from sklearn.cluster import KMeans
+ kmeans = KMeans(n_clusters = 3)
+ kmeans.fit(X)
+ labels = kmeans.predict(X)
+ plt.scatter(df['popularity'],df['danceability'],c = labels)
+ plt.xlabel('popularity')
+ plt.ylabel('danceability')
+ plt.show()
+ ```
+
+1. Verificare la precisione del modello:
+
+ ```python
+ labels = kmeans.labels_
+
+ correct_labels = sum(y == labels)
+
+ print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
+
+ print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))
+ ```
+
+   La precisione di questo modello non è molto buona e la forma dei cluster fornisce un indizio sul perché.
+
+ ![cluster](../images/clusters.png)
+
+ Questi dati sono troppo sbilanciati, troppo poco correlati e c'è troppa varianza tra i valori della colonna per raggruppare bene. In effetti, i cluster che si formano sono probabilmente fortemente influenzati o distorti dalle tre categorie di genere definite sopra. È stato un processo di apprendimento!
+
+ Nella documentazione di Scikit-learn, si può vedere che un modello come questo, con cluster non molto ben delimitati, ha un problema di "varianza":
+
+ ![modelli problematici](../images/problems.png)
+ > Infografica da Scikit-learn
+
+## Varianza
+
+La varianza è definita come "la media delle differenze al quadrato dalla media" [fonte](https://www.mathsisfun.com/data/standard-deviation.html). Nel contesto di questo problema di clustering, si fa riferimento alla tendenza dei numeri del nostro insieme di dati a divergere un po' troppo dalla media.
+
+✅ Questo è un ottimo momento per pensare a tutti i modi in cui si potrebbe correggere questo problema. Modificare un po' di più i dati? Utilizzare colonne diverse? Utilizzare un algoritmo diverso? Suggerimento: provare a [ridimensionare i dati](https://www.mygreatlearning.com/blog/learning-data-science-with-k-means-clustering/) per normalizzarli e testare altre colonne.
+
+> Provare questo "[calcolatore della varianza](https://www.calculatorsoup.com/calculators/statistics/variance-calculator.php)" per capire un po’ di più il concetto.
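+
+A titolo di esempio (ipotizzando di riutilizzare la `X` numerica definita sopra), uno schizzo di ridimensionamento con `StandardScaler` potrebbe essere:
+
+```python
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+# Porta ogni colonna a media ~0 e varianza ~1
+scaler = StandardScaler()
+X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
+print(X_scaled.var().round(2))
+```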
+
+---
+
+## 🚀 Sfida
+
+Trascorrere un po' di tempo con questo notebook, modificando i parametri. È possibile migliorare l'accuratezza del modello pulendo maggiormente i dati (rimuovendo gli outlier, ad esempio)? È possibile utilizzare i pesi per dare più peso a determinati campioni di dati. Cos'altro si può fare per creare cluster migliori?
+
+Suggerimento: provare a ridimensionare i dati. C'è un codice commentato nel notebook che aggiunge il ridimensionamento standard per rendere le colonne di dati più simili tra loro in termini di intervallo. Si scoprirà che mentre il punteggio della silhouette diminuisce, il "kink" nel grafico del gomito si attenua. Questo perché lasciare i dati non scalati consente ai dati con meno varianza di avere più peso. Leggere un po' di più su questo problema [qui](https://stats.stackexchange.com/questions/21222/are-mean-normalization-and-feature-scaling-needed-for-k-means-clustering/21226#21226).
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/30/)
+
+## Revisione e Auto Apprendimento
+
+Dare un'occhiata a un simulatore di K-Means [tipo questo](https://user.ceng.metu.edu.tr/~akifakkus/courses/ceng574/k-means/). È possibile utilizzare questo strumento per visualizzare i punti dati di esempio e determinarne i centroidi. Questo aiuta a farsi un'idea di come i dati possono essere raggruppati?
+
+Inoltre, dare un'occhiata a [questa dispensa sui k-means](https://stanford.edu/~cpiech/cs221/handouts/kmeans.html) di Stanford.
+
+## Compito
+
+[Provare diversi metodi di clustering](assignment.it.md)
diff --git a/5-Clustering/2-K-Means/translations/README.zh-cn.md b/5-Clustering/2-K-Means/translations/README.zh-cn.md
new file mode 100644
index 0000000000..a5348eacb3
--- /dev/null
+++ b/5-Clustering/2-K-Means/translations/README.zh-cn.md
@@ -0,0 +1,253 @@
+# K-Means 聚类
+
+[![Andrew Ng explains Clustering](https://img.youtube.com/vi/hDmNF9JG3lo/0.jpg)](https://youtu.be/hDmNF9JG3lo "Andrew Ng explains Clustering")
+
+> 🎥 单击上图观看视频:Andrew Ng 解释聚类
+
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/29/)
+
+在本课中,您将学习如何使用 Scikit-learn 和您之前导入的尼日利亚音乐数据集创建聚类。我们将介绍 K-Means 聚类 的基础知识。请记住,正如您在上一课中学到的,使用聚类的方法有很多种,您使用的方法取决于您的数据。我们将尝试 K-Means,因为它是最常见的聚类技术。让我们开始吧!
+
+您将了解的术语:
+
+- 轮廓打分
+- 手肘方法
+- 惯性
+- 方差
+
+## 介绍
+
+[K-Means Clustering](https://wikipedia.org/wiki/K-means_clustering)是一种源自信号处理领域的方法。它基于一系列观察值,把一组数据划分并分配到“k”个聚类中。每个观察值都会把给定的数据点归到与其最近的“均值”(即聚类的中心点)所在的组里。
+
+聚类可以可视化为[Voronoi 图](https://wikipedia.org/wiki/Voronoi_diagram),其中包括一个点(或“种子”)及其相应的区域。
+
+![voronoi diagram](../images/voronoi.png)
+
+> [Jen Looper](https://twitter.com/jenlooper)作图
+
+K-Means 聚类过程[分三步执行](https://scikit-learn.org/stable/modules/clustering.html#k-means):
+
+1. 该算法通过从数据集中采样来选择 k 个中心点。在此之后,它循环:
+ 1. 它将每个样本分配到最近的质心。
+ 2. 它通过取分配给先前质心的所有样本的平均值来创建新质心。
+ 3. 然后,它计算新旧质心之间的差异并重复直到质心稳定。
+
+使用 K-Means 的一个缺点是您需要确定“k”,即质心的数量。幸运的是,“肘部法则”有助于估计“k”的一个良好起始值。稍后您就会试到它。
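+
+为了让上面的三个步骤更直观,下面给出一个极简的示意性实现(仅作说明,并非本课使用的代码;假设 `X` 是形状为 (样本数, 特征数) 的数值数组):
+
+```python
+import numpy as np
+
+def kmeans_sketch(X, k, n_iter=100, seed=0):
+    rng = np.random.default_rng(seed)
+    # 1. 从数据集中采样,选出 k 个初始质心
+    centroids = X[rng.choice(len(X), size=k, replace=False)]
+    for _ in range(n_iter):
+        # 2. 将每个样本分配给最近的质心
+        distances = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
+        labels = distances.argmin(axis=1)
+        # 3. 用分配到各质心的样本均值计算新质心(简化草图:假设没有空聚类)
+        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
+        if np.allclose(new_centroids, centroids):
+            break  # 质心已稳定
+        centroids = new_centroids
+    return labels, centroids
+```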
+
+## 前置条件
+
+您将使用本课的*notebook.ipynb*文件,其中包含您在上一课中所做的数据导入和初步清理。
+
+## 练习 - 准备
+
+首先再看看歌曲数据。
+
+1. 为每一列调用 `boxplot()`,创建一个箱线图:
+
+ ```python
+ plt.figure(figsize=(20,20), dpi=200)
+
+ plt.subplot(4,3,1)
+ sns.boxplot(x = 'popularity', data = df)
+
+ plt.subplot(4,3,2)
+ sns.boxplot(x = 'acousticness', data = df)
+
+ plt.subplot(4,3,3)
+ sns.boxplot(x = 'energy', data = df)
+
+ plt.subplot(4,3,4)
+ sns.boxplot(x = 'instrumentalness', data = df)
+
+ plt.subplot(4,3,5)
+ sns.boxplot(x = 'liveness', data = df)
+
+ plt.subplot(4,3,6)
+ sns.boxplot(x = 'loudness', data = df)
+
+ plt.subplot(4,3,7)
+ sns.boxplot(x = 'speechiness', data = df)
+
+ plt.subplot(4,3,8)
+ sns.boxplot(x = 'tempo', data = df)
+
+ plt.subplot(4,3,9)
+ sns.boxplot(x = 'time_signature', data = df)
+
+ plt.subplot(4,3,10)
+ sns.boxplot(x = 'danceability', data = df)
+
+ plt.subplot(4,3,11)
+ sns.boxplot(x = 'length', data = df)
+
+ plt.subplot(4,3,12)
+ sns.boxplot(x = 'release_date', data = df)
+ ```
+
+ 这个数据有点嘈杂:通过观察每一列作为箱线图,你可以看到异常值。
+
+ ![outliers](../images/boxplots.png)
+
+您可以浏览数据集并删除这些异常值,但这会使数据非常少。
+
+1. 现在,选择您将用于聚类练习的列。选择具有相似范围的那些并将`artist_top_genre`列编码为数字类型的数据:
+
+ ```python
+ from sklearn.preprocessing import LabelEncoder
+ le = LabelEncoder()
+
+ X = df.loc[:, ('artist_top_genre','popularity','danceability','acousticness','loudness','energy')]
+
+ y = df['artist_top_genre']
+
+ X['artist_top_genre'] = le.fit_transform(X['artist_top_genre'])
+
+ y = le.transform(y)
+ ```
+
+1. 现在您需要选择要定位的聚类数量。您知道我们从数据集中挖掘出 3 种歌曲流派,所以让我们尝试 3 种:
+
+ ```python
+ from sklearn.cluster import KMeans
+
+ nclusters = 3
+ seed = 0
+
+ km = KMeans(n_clusters=nclusters, random_state=seed)
+ km.fit(X)
+
+ # Predict the cluster for each data point
+
+ y_cluster_kmeans = km.predict(X)
+ y_cluster_kmeans
+ ```
+
+您会看到打印出的数组,其中包含数据帧每一行的预测聚类(0、1 或 2)。
+
+1. 使用此数组计算“轮廓分数”:
+
+ ```python
+ from sklearn import metrics
+ score = metrics.silhouette_score(X, y_cluster_kmeans)
+ score
+ ```
+
+## 轮廓分数
+
+寻找接近 1 的轮廓分数。该分数从 -1 到 1 不等,如果分数为 1,则该聚类密集且与其他聚类分离良好。接近 0 的值表示重叠聚类,样本非常接近相邻聚类的决策边界。[来源](https://dzone.com/articles/kmeans-silhouette-score-explained-with-python-exam)。
+
+我们的分数是**0.53**,所以正好在中间。这表明我们的数据不是特别适合这种类型的聚类,但让我们继续。
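+
+作为可选的小实验(假设复用上面定义的数值特征 `X`),可以用类似下面的草图比较不同 k 值下的轮廓分数:
+
+```python
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+
+# 比较 k 取 2 到 6 时的轮廓分数
+for k in range(2, 7):
+    km = KMeans(n_clusters=k, random_state=0)
+    labels = km.fit_predict(X)
+    print(f"k={k}: silhouette={silhouette_score(X, labels):.3f}")
+```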
+
+### 练习 - 建立模型
+
+1. 导入`KMeans`并启动聚类过程。
+
+ ```python
+ from sklearn.cluster import KMeans
+ wcss = []
+
+ for i in range(1, 11):
+ kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
+ kmeans.fit(X)
+ wcss.append(kmeans.inertia_)
+
+ ```
+
+ 这里有几个部分需要解释。
+
+ > 🎓 range:这些是聚类过程的迭代
+
+ > 🎓random_state:“确定质心初始化的随机数生成。” [来源](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)
+
+ > 🎓WCSS:“聚类内平方和”测量聚类内所有点到聚类质心的平方平均距离。[来源](https://medium.com/@ODSC/unsupervised-learning-evaluating-clusters-bd47eed175ce)。
+
+ > 🎓Inertia:K-Means 算法尝试选择质心以最小化“惯性”,“惯性是衡量内部相干程度的一种方法”。[来源](https://scikit-learn.org/stable/modules/clustering.html)。该值在每次迭代时附加到 wcss 变量。
+
+   > 🎓k-means++:在 [Scikit-learn](https://scikit-learn.org/stable/modules/clustering.html#k-means) 中,您可以使用“k-means++”优化,它“将质心初始化为(通常)彼此远离,从而可能得到比随机初始化更好的结果”。
+
+### 手肘方法
+
+之前,您推测,因为您的目标是 3 个歌曲流派,所以应该选择 3 个聚类。但真的是这样吗?
+
+1. 使用手肘方法来确认。
+
+ ```python
+ plt.figure(figsize=(10,5))
+ sns.lineplot(range(1, 11), wcss,marker='o',color='red')
+ plt.title('Elbow')
+ plt.xlabel('Number of clusters')
+ plt.ylabel('WCSS')
+ plt.show()
+ ```
+
+   使用您在上一步中构建的 `wcss` 变量创建一个图表,显示肘部“弯曲”的位置,这表示最佳聚类数。也许**是** 3!
+
+ ![elbow method](../images/elbow.png)
+
+## 练习 - 显示聚类
+
+1. 再次尝试该过程,这次设置三个聚类,并将聚类显示为散点图:
+
+ ```python
+ from sklearn.cluster import KMeans
+ kmeans = KMeans(n_clusters = 3)
+ kmeans.fit(X)
+ labels = kmeans.predict(X)
+ plt.scatter(df['popularity'],df['danceability'],c = labels)
+ plt.xlabel('popularity')
+ plt.ylabel('danceability')
+ plt.show()
+ ```
+
+1. 检查模型的准确性:
+
+ ```python
+ labels = kmeans.labels_
+
+ correct_labels = sum(y == labels)
+
+ print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
+
+ print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))
+ ```
+
+ 这个模型的准确性不是很好,聚类的形状给了你一个提示。
+
+ ![clusters](../images/clusters.png)
+
+ 这些数据太不平衡,相关性太低,列值之间的差异太大,无法很好地聚类。事实上,形成的聚类可能受到我们上面定义的三个类型类别的严重影响或扭曲。那是一个学习的过程!
+
+ 在 Scikit-learn 的文档中,你可以看到像这样的模型,聚类划分不是很好,有一个“方差”问题:
+
+ ![problem models](../images/problems.png)
+
+ > 图来自 Scikit-learn
+
+## 方差
+
+方差被定义为“来自均值的平方差的平均值”[来源](https://www.mathsisfun.com/data/standard-deviation.html)。在这个聚类问题的上下文中,它指的是数据集中的数值往往偏离均值过多。
+
+✅ 这是思考各种纠正此问题方法的好时机。稍微调整一下数据?使用不同的列?使用不同的算法?提示:尝试[缩放数据](https://www.mygreatlearning.com/blog/learning-data-science-with-k-means-clustering/)以对其进行标准化,并测试其他列。
+
+> 试试这个“[方差计算器](https://www.calculatorsoup.com/calculators/statistics/variance-calculator.php)”来进一步理解这个概念。
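+
+举例来说(假设复用上面定义的数值特征 `X`),使用 `StandardScaler` 进行缩放的草图大致如下:
+
+```python
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+# 将每一列缩放到均值约为 0、方差约为 1
+scaler = StandardScaler()
+X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
+print(X_scaled.var().round(2))
+```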
+
+---
+
+## 🚀挑战
+
+花一些时间在这个笔记本上,调整参数。您能否通过更多地清理数据(例如,去除异常值)来提高模型的准确性?您可以使用权重为给定的数据样本赋予更多权重。你还能做些什么来创建更好的聚类?
+
+提示:尝试缩放您的数据。笔记本中有被注释掉的代码,可以添加标准缩放,使各数据列在取值范围上更加相似。您会发现,虽然轮廓分数会下降,但肘部图中的“拐点”会变得平滑。这是因为不缩放数据会让方差较小的数据承载更多的权重。关于这个问题,请在[这里](https://stats.stackexchange.com/questions/21222/are-mean-normalization-and-feature-scaling-needed-for-k-means-clustering/21226#21226)阅读更多信息。
+
+## [课后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/30/)
+
+## 复习与自学
+
+看看[像这样](https://user.ceng.metu.edu.tr/~akifakkus/courses/ceng574/k-means/)的 K-Means 模拟器。您可以使用此工具来可视化样本数据点并确定其质心。您可以编辑数据的随机性、聚类数和质心数。这是否有助于您了解如何对数据进行分组?
+
+另外,看看斯坦福大学的[k-means 讲义](https://stanford.edu/~cpiech/cs221/handouts/kmeans.html)。
+
+## 作业
+
+[尝试不同的聚类方法](./assignment.zh-cn.md)
+
diff --git a/5-Clustering/2-K-Means/translations/assignment.it.md b/5-Clustering/2-K-Means/translations/assignment.it.md
new file mode 100644
index 0000000000..59fc79de1d
--- /dev/null
+++ b/5-Clustering/2-K-Means/translations/assignment.it.md
@@ -0,0 +1,10 @@
+# Provare diversi metodi di clustering
+
+## Istruzioni
+
+In questa lezione si è imparato a conoscere il clustering K-Means. A volte K-Means non è appropriato per i propri dati. Creare un notebook usando i dati da queste lezioni o da qualche altra parte (accreditare la fonte) e mostrare un metodo di clustering diverso NON usando K-Means. Che cosa si è imparato?
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | --------------------------------------------------------------- | -------------------------------------------------------------------- | ---------------------------- |
+| | Viene presentato un notebook con un modello di clustering ben documentato | Un notebook è presentato senza una buona documentazione e/o incompleto | E' stato inviato un lavoro incompleto |
diff --git a/5-Clustering/2-K-Means/translations/assignment.zh-cn.md b/5-Clustering/2-K-Means/translations/assignment.zh-cn.md
new file mode 100644
index 0000000000..c21058d3c3
--- /dev/null
+++ b/5-Clustering/2-K-Means/translations/assignment.zh-cn.md
@@ -0,0 +1,12 @@
+# 尝试不同的聚类方法
+
+
+## 说明
+
+在本课中,您学习了 K-Means 聚类。有时 K-Means 不适合您的数据。使用来自这些课程或其他地方的数据(归功于您的来源)创建notebook,并展示不使用 K-Means 的不同聚类方法。你学到了什么?
+## 评判规则
+
+| 评判标准 | 优秀 | 中规中矩 | 仍需努力 |
+| -------- | --------------------------------------------------------------- | -------------------------------------------------------------------- | ---------------------------- |
+| | 一个具有良好文档记录的聚类模型的notebook | 一个没有详细文档或不完整的notebook| 提交了一个不完整的工作 |
+
diff --git a/5-Clustering/translations/README.it.md b/5-Clustering/translations/README.it.md
new file mode 100644
index 0000000000..4a056e7217
--- /dev/null
+++ b/5-Clustering/translations/README.it.md
@@ -0,0 +1,29 @@
+# Modelli di clustering per machine learning
+
+Il clustering è un'attività di machine learning che cerca di trovare oggetti che si assomigliano per raggrupparli in gruppi chiamati cluster. Ciò che differenzia il clustering da altri approcci in machine learning è che le cose accadono automaticamente, infatti, è giusto dire che è l'opposto dell'apprendimento supervisionato.
+
+## Tema regionale: modelli di clustering per il gusto musicale di un pubblico nigeriano 🎧
+
+Il pubblico eterogeneo della Nigeria ha gusti musicali diversi. Usando i dati recuperati da Spotify (ispirato da [questo articolo](https://towardsdatascience.com/country-wise-visual-analysis-of-music-taste-using-spotify-api-seaborn-in-python-77f5b749b421)), si dà un'occhiata a un po' di musica popolare in Nigeria. Questo insieme di dati include dati sul punteggio di "danzabilità", acustica, volume, "speechness" (un numero compreso tra zero e uno che indica la probabilità che un particolare file audio sia parlato - n.d.t.), popolarità ed energia di varie canzoni. Sarà interessante scoprire modelli in questi dati!
+
+![Un giradischi](../images/turntable.jpg)
+
+Foto di Marcela Laskoski su Unsplash
+
+In questa serie di lezioni si scopriranno nuovi modi per analizzare i dati utilizzando tecniche di clustering. Il clustering è particolarmente utile quando l'insieme di dati non ha etichette. Se ha etichette, le tecniche di classificazione come quelle apprese nelle lezioni precedenti potrebbero essere più utili. Ma nei casi in cui si sta cercando di raggruppare dati senza etichetta, il clustering è un ottimo modo per scoprire i modelli.
+
+> Esistono utili strumenti a basso codice che possono aiutare a imparare a lavorare con i modelli di clustering. Si provi [Azure ML per questa attività](https://docs.microsoft.com/learn/modules/create-clustering-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+
+## Lezioni
+
+
+1. [Introduzione al clustering](../1-Visualize/translations/README.it.md)
+2. [K-Means clustering](../2-K-Means/translations/README.it.md)
+
+## Crediti
+
+Queste lezioni sono state scritte con 🎶 da [Jen Looper](https://www.twitter.com/jenlooper) con utili recensioni di [Rishit Dagli](https://rishit_dagli) e [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan).
+
+L'insieme di dati [Nigerian Songs](https://www.kaggle.com/sootersaalu/nigerian-songs-spotify) è stato prelevato da Kaggle, a sua volta recuperato da Spotify.
+
+Esempi utili di K-Means che hanno aiutato nella creazione di questa lezione includono questa [esplorazione dell'iride](https://www.kaggle.com/bburns/iris-exploration-pca-k-means-and-gmm-clustering), questo [notebook introduttivo](https://www.kaggle.com/prashant111/k-means-clustering-with-python) e questo [ipotetico esempio di ONG](https://www.kaggle.com/ankandash/pca-k-means-clustering-hierarchical-clustering).
\ No newline at end of file
diff --git a/5-Clustering/translations/README.ru.md b/5-Clustering/translations/README.ru.md
new file mode 100644
index 0000000000..eb0241b477
--- /dev/null
+++ b/5-Clustering/translations/README.ru.md
@@ -0,0 +1,26 @@
+# Модели кластеризации для машинного обучения
+
+Кластеризация - это задача машинного обучения, в которой ищутся объекты, похожие друг на друга, чтобы сгруппировать их в группы, называемые кластерами. Кластеризацию от других подходов в машинном обучении отличает то, что все происходит автоматически; справедливо сказать, что это противоположность обучению с учителем (supervised learning).
+
+## Региональная тема: модели кластеризации для музыкальных вкусов нигерийской публики 🎧
+
+У разнообразной публики Нигерии самые разные музыкальные вкусы. Используя данные, извлеченные из Spotify (на основе [этой статьи](https://towardsdatascience.com/country-wise-visual-analysis-of-music-taste-using-spotify-api-seaborn-in-python-77f5b749b421)), давайте посмотрим на музыку, популярную в Нигерии. Этот набор данных включает оценки различных песен по показателям "танцевальность", "акустичность", "громкость", "речевость", "популярность" и "энергия". Будет интересно обнаружить закономерности в этих данных!
+
+![Поворотный стол](../images/turntable.jpg)
+
+Фото Марсела Ласкоски на Unsplash
+
+В этой серии уроков вы откроете для себя новые способы анализа данных с помощью методов кластеризации. Кластеризация особенно полезна, когда в наборе данных отсутствуют метки. Если на нем есть ярлыки, тогда могут быть более полезными методы классификации, подобные тем, которые вы изучили на предыдущих уроках. Но в случаях, когда вы хотите сгруппировать немаркированные данные, кластеризация - отличный способ обнаружить закономерности.
+
+> Существуют полезные инструменты с небольшим количеством кода, которые могут помочь вам узнать о работе с моделями кластеризации. Попробуйте [Azure ML для этой задачи](https://docs.microsoft.com/learn/modules/create-clustering-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+## Уроки
+
+1. [Введение в кластеризацию](../1-Visualize/README.md)
+2. [Кластеризация K-Means](../2-K-Means/README.md)
+## Благодарности
+
+Эти уроки были написаны с помощью 🎶 [Джен Лупер](https://www.twitter.com/jenlooper) с полезными отзывами [Ришит Дагли](https://rishit_dagli) и [Мухаммад Сакиб Хан Инан](https://twitter.com/Sakibinan).
+
+Набор данных [Нигерийские песни](https://www.kaggle.com/sootersaalu/nigerian-songs-spotify) был взят с Kaggle, куда он, в свою очередь, попал из Spotify.
+
+Полезные примеры K-Means, которые помогли в создании этого урока, включают [исследование набора данных ирисов](https://www.kaggle.com/bburns/iris-exploration-pca-k-means-and-gmm-clustering), [вводный блокнот](https://www.kaggle.com/prashant111/k-means-clustering-with-python) и [пример гипотетической НПО](https://www.kaggle.com/ankandash/pca-k-means-clustering-hierarchical-clustering).
\ No newline at end of file
diff --git a/5-Clustering/translations/README.zh-cn.md b/5-Clustering/translations/README.zh-cn.md
new file mode 100644
index 0000000000..7f05082b5d
--- /dev/null
+++ b/5-Clustering/translations/README.zh-cn.md
@@ -0,0 +1,27 @@
+# 机器学习中的聚类模型
+
+聚类(clustering)是一项机器学习任务,用于寻找类似对象并将他们分成不同的组(这些组称做“聚类”(cluster))。聚类与其它机器学习方法的不同之处在于聚类是自动进行的。事实上,我们可以说它是监督学习的对立面。
+
+## 本节主题: 尼日利亚观众音乐品味的聚类模型🎧
+
+尼日利亚多样化的观众有着多样化的音乐品味。使用从Spotify上抓取的数据(受到[本文](https://towardsdatascience.com/country-wise-visual-analysis-of-music-taste-using-spotify-api-seaborn-in-python-77f5b749b421)的启发),让我们看看尼日利亚流行的一些音乐。这个数据集包括关于各种歌曲的舞蹈性、声学、响度、言语、流行度和活力的分数。从这些数据中发现一些模式(pattern)会是很有趣的事情!
+
+![A turntable](../images/turntable.jpg)
+
+Marcela Laskoski在Unsplash上的照片
+
+在本系列课程中,您将发现使用聚类技术分析数据的新方法。当数据集缺少标签的时候,聚类特别有用。如果它有标签,那么分类技术(比如您在前面的课程中所学的那些)可能会更有用。但是如果要对未标记的数据进行分组,聚类是发现模式的好方法。
+
+> 这里有一些有用的低代码工具可以帮助您了解如何使用聚类模型。尝试 [Azure ML for this task](https://docs.microsoft.com/learn/modules/create-clustering-model-azure-machine-learning-designer/?WT.mc_id=academic-15963-cxa)
+## 课程安排
+
+1. [介绍聚类](../1-Visualize/translations/README.zh-cn.md)
+2. [K-Means聚类](../2-K-Means/translations/README.zh-cn.md)
+## 致谢
+
+这些课程由 [Jen Looper](https://www.twitter.com/jenlooper) 用🎶撰写,并由 [Rishit Dagli](https://rishit_dagli) 和 [Muhammad Sakib Khan Inan](https://twitter.com/Sakibinan) 进行了有益的评审。
+
+[尼日利亚歌曲数据集](https://www.kaggle.com/sootersaalu/nigerian-songs-spotify) 来自Kaggle抓取的Spotify数据。
+
+一些帮助创造了这节课程的 K-Means 例子包括:[鸢尾花数据集探索(iris exploration)](https://www.kaggle.com/bburns/iris-exploration-pca-k-means-and-gmm-clustering),[介绍性的笔记(introductory notebook)](https://www.kaggle.com/prashant111/k-means-clustering-with-python),和 [假设非政府组织的例子(hypothetical NGO example)](https://www.kaggle.com/ankandash/pca-k-means-clustering-hierarchical-clustering)。
+
diff --git a/6-NLP/1-Introduction-to-NLP/README.md b/6-NLP/1-Introduction-to-NLP/README.md
index 0d47a1d701..ef7444cb58 100644
--- a/6-NLP/1-Introduction-to-NLP/README.md
+++ b/6-NLP/1-Introduction-to-NLP/README.md
@@ -2,7 +2,7 @@
This lesson covers a brief history and important concepts of *natural language processing*, a subfield of *computational linguistics*.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/31/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/31/)
## Introduction
@@ -17,7 +17,7 @@ You will learn about:
## Computational linguistics
-Computational linguistics is an area of research and development over many decades that studies how computers can work with, and even understand, translate, and communicate with languages. natural language processing (NLP) is a related field focused on how computers can process 'natural', or human, languages.
+Computational linguistics is an area of research and development over many decades that studies how computers can work with, and even understand, translate, and communicate with languages. Natural language processing (NLP) is a related field focused on how computers can process 'natural', or human, languages.
### Example - phone dictation
@@ -69,7 +69,7 @@ The idea for this came from a party game called *The Imitation Game* where an in
### Developing Eliza
-In the 1960's an MIT scientist called *Joseph Weizenbaum* developed [*Eliza*](https:/wikipedia.org/wiki/ELIZA), a computer 'therapist' that would ask the human questions and give the appearance of understanding their answers. However, while Eliza could parse a sentence and identify certain grammatical constructs and keywords so as to give a reasonable answer, it could not be said to *understand* the sentence. If Eliza was presented with a sentence following the format "**I am** sad" it might rearrange and substitute words in the sentence to form the response "How long have **you been** sad".
+In the 1960's an MIT scientist called *Joseph Weizenbaum* developed [*Eliza*](https://wikipedia.org/wiki/ELIZA), a computer 'therapist' that would ask the human questions and give the appearance of understanding their answers. However, while Eliza could parse a sentence and identify certain grammatical constructs and keywords so as to give a reasonable answer, it could not be said to *understand* the sentence. If Eliza was presented with a sentence following the format "**I am** sad" it might rearrange and substitute words in the sentence to form the response "How long have **you been** sad".
This gave the impression that Eliza understood the statement and was asking a follow-on question, whereas in reality, it was changing the tense and adding some words. If Eliza could not identify a keyword that it had a response for, it would instead give a random response that could be applicable to many different statements. Eliza could be easily tricked, for instance if a user wrote "**You are** a bicycle" it might respond with "How long have **I been** a bicycle?", instead of a more reasoned response.
@@ -81,7 +81,7 @@ This gave the impression that Eliza understood the statement and was asking a fo
## Exercise - coding a basic conversational bot
-A conversational bot, like Eliza, is a program that elicits user input and seems to understand and respond intelligently. Unlike Eliza, our bot will not have several rules giving it the appearance of having an intelligent conversation. Instead, out bot will have one ability only, to keep the conversation going with random responses that might work in almost any trivial conversation.
+A conversational bot, like Eliza, is a program that elicits user input and seems to understand and respond intelligently. Unlike Eliza, our bot will not have several rules giving it the appearance of having an intelligent conversation. Instead, our bot will have one ability only, to keep the conversation going with random responses that might work in almost any trivial conversation.
### The plan
@@ -149,7 +149,7 @@ Choose one of the "stop and consider" elements above and either try to implement
In the next lesson, you'll learn about a number of other approaches to parsing natural language and machine learning.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/32/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/32/)
## Review & Self Study
diff --git a/6-NLP/1-Introduction-to-NLP/translations/README.it.md b/6-NLP/1-Introduction-to-NLP/translations/README.it.md
new file mode 100644
index 0000000000..67c0406e36
--- /dev/null
+++ b/6-NLP/1-Introduction-to-NLP/translations/README.it.md
@@ -0,0 +1,165 @@
+# Introduzione all'elaborazione del linguaggio naturale
+
+Questa lezione copre una breve storia e concetti importanti dell' *elaborazione del linguaggio naturale*, un sottocampo della *linguistica computazionale*.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/31/)
+
+## Introduzione
+
+NLP, come è comunemente conosciuto, è una delle aree più note in cui machine learning è stato applicato e utilizzato nei software di produzione.
+
+✅ Si riesce a pensare a un software che si usa tutti i giorni che probabilmente ha NLP incorporato? Che dire dei programmi di elaborazione testi o le app mobili che si usano regolarmente?
+
+Si imparerà a conoscere:
+
+- **L'idea delle lingue**. Come si sono sviluppate le lingue e quali sono state le principali aree di studio.
+- **Definizione e concetti**. Si impareranno anche definizioni e concetti su come i computer elaborano il testo, inclusa l'analisi, la grammatica e l'identificazione di nomi e verbi. Ci sono alcune attività di codifica in questa lezione e vengono introdotti diversi concetti importanti che si imparerà a codificare più avanti nelle lezioni successive.
+
+## Linguistica computazionale
+
+La linguistica computazionale è un'area di ricerca e sviluppo che da molti decenni studia come i computer possono lavorare e persino capire, tradurre e comunicare con le lingue. L'elaborazione del linguaggio naturale (NLP) è un campo correlato incentrato su come i computer possono elaborare le lingue "naturali" o umane.
+
+### Esempio: dettatura telefonica
+
+Se si è mai dettato al telefono invece di digitare o posto una domanda a un assistente virtuale, il proprio discorso è stato convertito in formato testuale e quindi elaborato o *analizzato* dalla lingua con la quale si è parlato. Le parole chiave rilevate sono state quindi elaborate in un formato che il telefono o l'assistente possono comprendere e utilizzare.
+
+![comprensione](../images/comprehension.png)
+> La vera comprensione linguistica è difficile! Immagine di [Jen Looper](https://twitter.com/jenlooper)
+
+### Come è resa possibile questa tecnologia?
+
+Questo è possibile perché qualcuno ha scritto un programma per computer per farlo. Alcuni decenni fa, alcuni scrittori di fantascienza prevedevano che le persone avrebbero parlato principalmente con i loro computer e che i computer avrebbero sempre capito esattamente cosa intendevano. Purtroppo, si è rivelato essere un problema più difficile di quanto molti immaginavano, e sebbene sia un problema molto meglio compreso oggi, ci sono sfide significative nel raggiungere un'elaborazione del linguaggio naturale "perfetta" quando si tratta di comprendere il significato di una frase. Questo è un problema particolarmente difficile quando si tratta di comprendere l'umore o rilevare emozioni come il sarcasmo in una frase.
+
+A questo punto, si potrebbero ricordare le lezioni scolastiche in cui l'insegnante ha coperto le parti della grammatica in una frase. In alcuni paesi, agli studenti viene insegnata la grammatica e la linguistica come materie dedicate, ma in molti questi argomenti sono inclusi nell'apprendimento di una lingua: o la prima lingua nella scuola primaria (imparare a leggere e scrivere) e forse una seconda lingua in post-primario o liceo. Non occorre preoccuparsi se non si è esperti nel distinguere i nomi dai verbi o gli avverbi dagli aggettivi!
+
+Se si fa fatica a comprendere la differenza tra il *presente semplice* e il *presente progressivo*, non si è soli. Questa è una cosa impegnativa per molte persone, anche madrelingua di una lingua. La buona notizia è che i computer sono davvero bravi ad applicare regole formali e si imparerà a scrivere codice in grado di *analizzare* una frase così come un essere umano. La sfida più grande che si esaminerà in seguito è capire il *significato* e il *sentimento* di una frase.
+
+## Prerequisiti
+
+Per questa lezione, il prerequisito principale è essere in grado di leggere e comprendere la lingua di questa lezione. Non ci sono problemi di matematica o equazioni da risolvere. Sebbene l'autore originale abbia scritto questa lezione in inglese, è anche tradotta in altre lingue, quindi si potrebbe leggere una traduzione. Ci sono esempi in cui vengono utilizzati un numero di lingue diverse (per confrontare le diverse regole grammaticali di lingue diverse). Questi *non* sono tradotti, ma il testo esplicativo lo è, quindi il significato dovrebbe essere chiaro.
+
+Per le attività di codifica, si utilizzerà Python e gli esempi utilizzano Python 3.8.
+
+In questa sezione servirà e si utilizzerà:
+
+- **Comprensione del linguaggio Python 3**. Questa lezione utilizza input, loop, lettura di file, array.
+- **Visual Studio Code + estensione**. Si userà Visual Studio Code e la sua estensione Python. Si può anche usare un IDE Python a propria scelta.
+- **TextBlob**. [TextBlob](https://github.com/sloria/TextBlob) è una libreria di elaborazione del testo semplificata per Python. Seguire le istruzioni sul sito TextBlob per installarlo sul proprio sistema (installare anche i corpora, come mostrato di seguito):
+
+ ```bash
+ pip install -U textblob
+ python -m textblob.download_corpora
+ ```
+
+> 💡 Suggerimento: si può eseguire Python direttamente negli ambienti VS Code. Controllare la [documentazione](https://code.visualstudio.com/docs/languages/python?WT.mc_id=academic-15963-cxa) per ulteriori informazioni.
+
+## Parlare con le macchine
+
+La storia del tentativo di far capire ai computer il linguaggio umano risale a decenni fa e uno dei primi scienziati a considerare l'elaborazione del linguaggio naturale è stato *Alan Turing*.
+
+### Il Test di Turing.
+
+Quando Turing stava facendo ricerche sull'*intelligenza artificiale* negli anni '50, considerò se un test di conversazione potesse essere somministrato a un essere umano e a un computer (tramite corrispondenza digitata) in cui l'essere umano nella conversazione non era sicuro se stesse conversando con un altro umano o un computer.
+
+Se, dopo una certa durata di conversazione, l'essere umano non è riuscito a determinare se le risposte provenivano da un computer o meno, allora si potrebbe dire che il computer *sta pensando*?
+
+### L'ispirazione - 'il gioco dell'imitazione'
+
+L'idea è nata da un gioco di società chiamato *The Imitation Game* in cui un interrogatore è da solo in una stanza e ha il compito di determinare quale delle due persone (in un'altra stanza) sono rispettivamente maschio e femmina. L'interrogatore può inviare note e deve cercare di pensare a domande in cui le risposte scritte rivelano il sesso della persona misteriosa. Ovviamente, i giocatori nell'altra stanza stanno cercando di ingannare l'interrogatore rispondendo alle domande in modo tale da fuorviare o confondere l'interrogatore, dando anche l'impressione di rispondere onestamente.
+
+### Lo sviluppo di Eliza
+
+Negli anni '60 uno scienziato del MIT chiamato *Joseph Weizenbaum* sviluppò [*Eliza*](https://wikipedia.org/wiki/ELIZA), un "terapista" informatico che poneva domande a un umano e dava l'impressione di comprendere le loro risposte. Tuttavia, mentre Eliza poteva analizzare una frase e identificare alcuni costrutti grammaticali e parole chiave in modo da dare una risposta ragionevole, non si poteva dire *che capisse* la frase. Se a Eliza viene presentata una frase che segue il formato "**Sono** _triste_", potrebbe riorganizzare e sostituire le parole nella frase per formare la risposta "Da quanto tempo **sei** _triste_".
+
+Questo dava l'impressione che Eliza avesse capito la frase e stesse facendo una domanda successiva, mentre in realtà stava cambiando il tempo e aggiungendo alcune parole. Se Eliza non fosse stata in grado di identificare una parola chiave per la quale aveva una risposta, avrebbe dato invece una risposta casuale che potrebbe essere applicabile a molte frasi diverse. Eliza avrebbe potuto essere facilmente ingannata, ad esempio se un utente avesse scritto "**Sei** una _bicicletta_" avrebbe potuto rispondere con "Da quanto tempo **sono** una _bicicletta_?", invece di una risposta più ragionata.
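+
+Per dare un'idea, uno schizzo puramente ipotetico (non il codice originale di ELIZA) di una regola di sostituzione di questo tipo potrebbe essere:
+
+```python
+import re
+
+def eliza_like(sentence):
+    # Regola ipotetica: "I am X" -> "How long have you been X?"
+    match = re.match(r"i am (.+)", sentence.strip().rstrip("."), re.IGNORECASE)
+    if match:
+        return f"How long have you been {match.group(1)}?"
+    # Nessuna parola chiave riconosciuta: risposta generica
+    return "Why do you say that?"
+
+print(eliza_like("I am sad"))           # How long have you been sad?
+print(eliza_like("You are a bicycle"))  # Why do you say that?
+```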
+
+[![Chiacchierare con Eliza](https://img.youtube.com/vi/RMK9AphfLco/0.jpg)](https://youtu.be/RMK9AphfLco "Chiacchierare con Eliza")
+
+> 🎥 Fare clic sull'immagine sopra per un video sul programma ELIZA originale
+
+> Nota: si può leggere la descrizione originale di [Eliza](https://cacm.acm.org/magazines/1966/1/13317-elizaa-computer-program-for-the-study-of-natural-language-communication-between-man-and-machine/abstract) pubblicata nel 1966 se si dispone di un account ACM. In alternativa, leggere di Eliza su [wikipedia](https://it.wikipedia.org/wiki/ELIZA_(chatterbot))
+
+## Esercizio: codificare un bot conversazionale di base
+
+Un bot conversazionale, come Eliza, è un programma che sollecita l'input dell'utente e sembra capire e rispondere in modo intelligente. A differenza di Eliza, questo bot non avrà diverse regole che gli danno l'impressione di avere una conversazione intelligente. Invece, il bot avrà una sola capacità, per mantenere viva la conversazione con risposte casuali che potrebbero funzionare in quasi tutte le conversazioni banali.
+
+### Il piano
+
+I passaggi durante la creazione di un bot conversazionale:
+
+1. Stampare le istruzioni che consigliano all'utente come interagire con il bot
+2. Iniziare un ciclo
+ 1. Accettare l'input dell'utente
+ 2. Se l'utente ha chiesto di uscire, allora si esce
+ 3. Elaborare l'input dell'utente e determinare la risposta (in questo caso, la risposta è una scelta casuale da un elenco di possibili risposte generiche)
+ 4. Stampare la risposta
+3. Riprendere il ciclo dal passaggio 2
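+
+Uno schizzo minimale e indicativo (non identico alla soluzione collegata più avanti) di come i passaggi del piano potrebbero tradursi in Python:
+
+```python
+import random
+
+# Elenco ipotetico di risposte generiche (la lezione ne definisce uno più completo qui sotto)
+random_responses = ["I see. Do go on.",
+                    "Why do you say that?",
+                    "Let's change the subject."]
+
+# 1. Stampa le istruzioni per l'utente
+print("Hello, I am Marvin, the simple robot.")
+print("You can end this conversation at any time by typing 'bye'")
+print("How are you today?")
+
+# 2. Ciclo principale
+while True:
+    user_input = input("> ")             # 2.1 accetta l'input dell'utente
+    if user_input.lower() == "bye":      # 2.2 se l'utente chiede di uscire, esci
+        break
+    print(random.choice(random_responses))  # 2.3-2.4 scegli e stampa una risposta casuale
+
+print("It was nice talking to you, goodbye!")
+```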
+
+### Costruire il bot
+
+Si crea il bot. Si inizia definendo alcune frasi.
+
+1. Creare questo bot in Python con le seguenti risposte casuali:
+
+ ```python
+ random_responses = ["That is quite interesting, please tell me more.",
+ "I see. Do go on.",
+ "Why do you say that?",
+ "Funny weather we've been having, isn't it?",
+ "Let's change the subject.",
+ "Did you catch the game last night?"]
+ ```
+
+ Ecco un esempio di output come guida (l'input dell'utente è sulle righe che iniziano con `>`):
+
+ ```output
+ Hello, I am Marvin, the simple robot.
+ You can end this conversation at any time by typing 'bye'
+ After typing each answer, press 'enter'
+ How are you today?
+ > I am good thanks
+ That is quite interesting, please tell me more.
+ > today I went for a walk
+ Did you catch the game last night?
+ > I did, but my team lost
+ Funny weather we've been having, isn't it?
+ > yes but I hope next week is better
+ Let's change the subject.
+ > ok, lets talk about music
+ Why do you say that?
+ > because I like music!
+ Why do you say that?
+ > bye
+ It was nice talking to you, goodbye!
+ ```
+
+ Una possibile soluzione al compito è [qui](../solution/bot.py)
+
+ ✅ Fermarsi e riflettere
+
+ 1. Si ritiene che le risposte casuali "ingannerebbero" qualcuno facendogli pensare che il bot le abbia effettivamente capite?
+ 2. Di quali caratteristiche avrebbe bisogno il bot per essere più efficace?
+ 3. Se un bot potesse davvero "capire" il significato di una frase, avrebbe bisogno di "ricordare" anche il significato delle frasi precedenti in una conversazione?
+
+---
+
+## 🚀 Sfida
+
+Scegliere uno degli elementi "fermarsi e riflettere" qui sopra e provare a implementarli nel codice o scrivere una soluzione su carta usando pseudocodice.
+
+Nella prossima lezione si impareranno una serie di altri approcci all'analisi del linguaggio naturale e dell'machine learning.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/32/)
+
+## Revisione e Auto Apprendimento
+
+Dare un'occhiata ai riferimenti di seguito come ulteriori opportunità di lettura.
+
+### Bibliografia
+
+1. Schubert, Lenhart, "Computational Linguistics", *The Stanford Encyclopedia of Philosophy* (Edizione primavera 2020), Edward N. Zalta (a cura di), URL = .
+2. Università di Princeton "About WordNet". [WordNet](https://wordnet.princeton.edu/). Princeton University 2010.
+
+## Compito
+
+[Cercare un bot](assignment.it.md)
diff --git a/6-NLP/1-Introduction-to-NLP/translations/README.zh-cn.md b/6-NLP/1-Introduction-to-NLP/translations/README.zh-cn.md
new file mode 100644
index 0000000000..3826ccfd12
--- /dev/null
+++ b/6-NLP/1-Introduction-to-NLP/translations/README.zh-cn.md
@@ -0,0 +1,162 @@
+# 自然语言处理介绍
+这节课讲解了 *自然语言处理* 的简要历史和重要概念,*自然语言处理*是计算语言学的一个子领域。
+
+## [课前测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/31/)
+
+## 介绍
+众所周知,自然语言处理 (Natural Language Processing, NLP) 是机器学习在生产软件中应用最广泛的领域之一。
+
+✅ 你能想到哪些你日常生活中使用的软件可能嵌入了自然语言处理技术呢?或者,你经常使用的文字处理程序或移动应用程序中是否嵌入了自然语言处理技术呢?
+
+你将会学习到:
+
+- **什么是「语言」**。语言的发展历程,以及相关研究的主要领域。
+- **定义和概念**。你还将了解关于计算机文本处理的概念。包括解析 (parsing)、语法 (grammar) 以及识别名词与动词。这节课中有一些编程任务;还有一些重要概念将在以后的课程中被引入,届时你也会练习通过编程实现其它概念。
+
+## 计算语言学
+
+计算语言学 (Computational Linguistics) 是一个经过几十年研究和发展的领域,它研究如何让计算机能使用、理解、翻译语言并使用语言交流。自然语言处理 (NLP) 是计算语言学中一个专注于计算机如何处理「自然的」(或者说,人类的)语言的相关领域。
+
+### 举例:电话听写
+
+如果你曾经在手机上使用语音输入替代键盘输入,或者使用过虚拟语音助手,那么你的语音将被转录(或者叫*解析*)为文本形式后进行处理。被检测到的关键字最后将被处理成手机或语音助手可以理解并可以依此做出行为的格式。
+
+![comprehension](../images/comprehension.png)
+> 真正意义上的语言理解很难!图源:[Jen Looper](https://twitter.com/jenlooper)
+
+### 这项技术是如何实现的?
+
+我们之所以能够完成这样的任务,是因为有人编写了一个计算机程序来实现它。几十年前,一些科幻作家预测,在未来,人类将主要通过语音与他们的电脑对话,而电脑总是能准确地理解人类的意思。可惜的是,事实证明这个问题比许多人想象的更难解决。虽然如今人们对这个问题的理解已经深入得多,但要在理解句子含义时实现“完美”的自然语言处理,仍然存在重大挑战;其中,理解幽默或检测讽刺等情绪对计算机来说尤其困难。
+
+现在,你可能会想起课堂上老师讲解的语法。在某些国家/地区,语法和语言学知识是学生的专题课内容。但在另一些国家/地区,不管是从小学习的第一语言(学习阅读和写作),还是之后学习的第二语言中,语法及语言学知识都是作为语言的一部分教学的。所以,如果你不能很好地区分名词与动词或者区分副词与形容词,请不要担心!
+
+你还为难以区分*一般现在时*与*现在进行时*而烦恼吗?没关系的,即使是对以这门语言为母语的人在内的大多数人来说,区分它们都很有挑战性。但是,计算机非常善于应用标准的规则,你将学会编写可以像人一样"解析"句子的代码。稍后你将面对的更大挑战是理解句子的*语义*和*情绪*。
+
+## 前提
+
+本节教程的主要先决条件是能够阅读和理解本节教程的语言。本节中没有数学问题或方程需要解决。虽然原作者用英文写了这教程,但它也被翻译成其他语言,所以你可能在阅读翻译内容。这节课的示例中涉及到很多语言种类(以比较不同语言的不同语法规则)。这些是*未*翻译的,但对它们的解释是翻译过的,所以你应该能理解它在讲什么。
+
+编程任务中,你将会使用 Python 语言,示例使用的是 Python 3.8 版本。
+
+在本节中你将需要并使用如下技能:
+
+- **Python 3**。你需要能够理解并使用 Python 3. 本课将会使用输入、循环、文件读取、数组功能。
+- **Visual Studio Code + 扩展**. 我们将使用 Visual Studio Code 及其 Python 扩展。你也可以使用你喜欢的 Python IDE。
+- **TextBlob**. [TextBlob](https://github.com/sloria/TextBlob)是一个精简的 Python 文本处理库。请按照 TextBlob 网站上的说明,在您的系统上安装它(也需要安装语料库,安装代码如下所示):
+
+ ```bash
+ pip install -U textblob
+ python -m textblob.download_corpora
+ ```
+
+> 💡 提示:你可以在 VS Code 环境中直接运行 Python。 点击[docs](https://code.visualstudio.com/docs/languages/python?WT.mc_id=academic-15963-cxa)查看更多信息。
+
+## 与机器对话
+
+让计算机理解人类语言的尝试最早可以追溯到几十年前。*Alan Turing* 是最早研究自然语言处理问题的科学家之一。
+
+### 图灵测试
+
+当图灵在 1950 年代研究*人工智能*时,他想出了这个思维实验:让人类和计算机通过打字的方式来交谈,其中人类并不知道对方是人类还是计算机。
+
+如果经过一定时间的交谈,人类无法确定对方是否是计算机,那么是否可以认为计算机正在“思考”?
+
+### 灵感 - “模仿游戏”
+
+这个想法来自一个名为 *模仿游戏* 的派对游戏,其中一名审讯者独自一人在一个房间里,负责确定在另一个房间里的两人的性别(男性或女性)。审讯者可以传递笔记,并且需要想出能够揭示神秘人性别的问题。当然,另一个房间的玩家也可以通过回答问题的方式来欺骗审讯者,例如用看似真诚的方式误导或迷惑审讯者。
+
+### Eliza 的研发
+
+在 1960 年代的麻省理工学院,一位名叫 *Joseph Weizenbaum* 的科学家开发了[*Eliza*](https://wikipedia.org/wiki/ELIZA)。Eliza 是一位计算机“治疗师”,它可以向人类提出问题并让人类觉得它能理解人类的回答。然而,虽然 Eliza 可以解析句子并识别某些语法结构和关键字以给出合理的答案,但不能说它*理解*了句子。如果 Eliza 看到的句子格式为“**I am** sad”(**我很** 难过),它可能会重新排列并替换句子中的单词,回答 “How long have **you been** sad”(**你已经** 难过 多久了)。
+
+看起来像是 Eliza 理解了这句话,还在询问关于这句话的问题,而实际上,它只是在改变时态和添加词语。如果 Eliza 没有在回答中发现它知道如何响应的词汇,它会给出一个随机响应,该响应可以适用于许多不同的语句。 Eliza 很容易被欺骗,例如,如果用户写了 "**You are** a bicycle"(**你是** 个 自行车),它可能会回复 "How long have **I been** a bicycle?"(**我已经是** 一个 自行车 多久了?),而不是更合理的回答。
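+
+下面是一个极简的示意性代码片段(并非 Eliza 的真实实现,其中的规则纯属举例),用来演示这种模式匹配并换词的思路:
+
+```python
+import re
+
+def eliza_style_reply(text):
+    # 一条极简规则:把 "I am X" 改写成提问,模仿 Eliza 式的换词与时态变化
+    match = re.match(r"I am (.*)", text.strip(), re.IGNORECASE)
+    if match:
+        return f"How long have you been {match.group(1)}?"
+    # 没有匹配到任何规则时,退回到一个通用回答
+    return "Please tell me more."
+
+print(eliza_style_reply("I am sad"))           # How long have you been sad?
+print(eliza_style_reply("You are a bicycle"))  # Please tell me more.
+```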
+
+[![跟 Eliza 聊天](https://img.youtube.com/vi/RMK9AphfLco/0.jpg)](https://youtu.be/RMK9AphfLco "跟 Eliza 聊天")
+
+> 🎥 点击上方的图片查看关于 Eliza 原型的视频
+
+> 旁注:如果你拥有 ACM 账户,你可以阅读 1966 年发表的 [Eliza](https://cacm.acm.org/magazines/1966/1/13317-elizaa-computer-program-for-the-study-of-natural-language-communication-between-man-and-machine/abstract)的原始介绍。或者,在[维基百科](https://wikipedia.org/wiki/ELIZA)上阅读有关 Eliza 的信息。
+
+## 练习 - 编程实现一个基础的对话机器人
+
+像 Eliza 一样的对话机器人是一个看起来可以智能地理解和响应用户输入的程序。与 Eliza 不同的是,我们的机器人不会用规则让它看起来像是在进行智能对话。我们的对话机器人将只有一种能力:它只会通过基本上可以糊弄所有普通对话的句子来随机回答,使得谈话能够继续进行。
+
+### 计划
+
+搭建聊天机器人的步骤
+
+1. 打印用户与机器人交互的使用说明
+2. 开启循环
+ 1. 获取用户输入
+ 2. 如果用户要求退出,就退出
+ 3. 处理用户输入并选择一个回答(在这个例子中,从回答列表中随机选择一个回答)
+ 4. 打印回答
+3. 重复步骤2
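+
+在动手之前,下面先给出一个与上述步骤对应的极简示意实现(仅供参考,只是一种可能的写法,并不是课程提供的 solution/bot.py;其中的回答列表只是节选,完整列表见下一节):
+
+```python
+import random
+
+# 示例回答列表(节选),完整列表见下一节
+random_responses = ["That is quite interesting, please tell me more.",
+                    "I see. Do go on.",
+                    "Why do you say that?"]
+
+print("Hello, I am Marvin, the simple robot.")
+print("You can end this conversation at any time by typing 'bye'")
+print("After typing each answer, press 'enter'")
+print("How are you today?")
+
+while True:
+    user_input = input("> ")        # 获取用户输入
+    if user_input.lower() == "bye":  # 用户要求退出,就退出循环
+        break
+    print(random.choice(random_responses))  # 从回答列表中随机选择一个回答并打印
+
+print("It was nice talking to you, goodbye!")
+```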
+
+### 构建聊天机器人
+
+接下来让我们建一个聊天机器人。我们将从定义一些短语开始。
+
+1. 使用以下随机的回复(`random_responses`)在 Python 中自己创建此机器人:
+
+ ```python
+ random_responses = ["That is quite interesting, please tell me more.",
+ "I see. Do go on.",
+ "Why do you say that?",
+ "Funny weather we've been having, isn't it?",
+ "Let's change the subject.",
+ "Did you catch the game last night?"]
+ ```
+
+ 程序运行看起来应该是这样:(用户输入位于以 `>` 开头的行上)
+
+ ```output
+ Hello, I am Marvin, the simple robot.
+ You can end this conversation at any time by typing 'bye'
+ After typing each answer, press 'enter'
+ How are you today?
+ > I am good thanks
+ That is quite interesting, please tell me more.
+ > today I went for a walk
+ Did you catch the game last night?
+ > I did, but my team lost
+ Funny weather we've been having, isn't it?
+ > yes but I hope next week is better
+ Let's change the subject.
+ > ok, lets talk about music
+ Why do you say that?
+ > because I like music!
+ Why do you say that?
+ > bye
+ It was nice talking to you, goodbye!
+ ```
+
+ 示例程序在[这里](../solution/bot.py)。这只是一种可能的解决方案。
+
+ ✅ 停下来,思考一下
+
+ 1. 你认为这些随机响应能够“欺骗”人类,使人类认为机器人实际上理解了他们的意思吗?
+ 2. 机器人需要哪些功能才能更有效的回应?
+ 3. 如果机器人真的可以“理解”一个句子的意思,它是否也需要“记住”前面句子的意思?
+
+---
+## 🚀挑战
+
+在上面的「停下来,思考一下」板块中选择一个问题,尝试编程实现它,或使用伪代码在纸上编写解决方案。
+
+在下一课中,您将了解解析自然语言和机器学习的许多其他方法。
+
+## [课后测验](https://white-water-09ec41f0f.azurestaticapps.net/quiz/32/)
+
+## 复习与自学
+
+看看下面的参考资料作为进一步的参考阅读。
+
+### 参考
+
+1. Schubert, Lenhart, "Computational Linguistics", *The Stanford Encyclopedia of Philosophy* (Spring 2020 Edition), Edward N. Zalta (ed.), URL = <https://plato.stanford.edu/archives/spr2020/entries/computational-linguistics/>.
+2. Princeton University "About WordNet." [WordNet](https://wordnet.princeton.edu/). Princeton University. 2010.
+
+## 任务
+
+[查找一个机器人](../assignment.md)
diff --git a/6-NLP/1-Introduction-to-NLP/translations/assignment.it.md b/6-NLP/1-Introduction-to-NLP/translations/assignment.it.md
new file mode 100644
index 0000000000..02150752bd
--- /dev/null
+++ b/6-NLP/1-Introduction-to-NLP/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Cercare un bot
+
+## Istruzioni
+
+I bot sono ovunque. Il compito: trovarne uno e adottarlo! È possibile trovarli sui siti web, nelle applicazioni bancarie e al telefono, ad esempio quando si chiamano società di servizi finanziari per consigli o informazioni sull'account. Analizzare il bot e vedere se si riesce a confonderlo. Se si riesce a confondere il bot, perché si pensa sia successo? Scrivere un breve articolo sulla propria esperienza.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ------------------------------------------------------------------------------------------------------------- | -------------------------------------------- | --------------------- |
+| | Viene scritto un documento a pagina intera, che spiega la presunta architettura del bot e delinea l'esperienza con esso | Un documento è incompleto o non ben concepito | Nessun documento inviato |
diff --git a/6-NLP/2-Tasks/README.md b/6-NLP/2-Tasks/README.md
index d816b7fe3b..829df67baa 100644
--- a/6-NLP/2-Tasks/README.md
+++ b/6-NLP/2-Tasks/README.md
@@ -2,7 +2,7 @@
For most *natural language processing* tasks, the text to be processed, must be broken down, examined, and the results stored or cross referenced with rules and data sets. These tasks, allows the programmer to derive the _meaning_ or _intent_ or only the _frequency_ of terms and words in a text.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/33/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/33/)
Let's discover common techniques used in processing text. Combined with machine learning, these techniques help you to analyse large amounts of text efficiently. Before applying ML to these tasks, however, let's understand the problems encountered by an NLP specialist.
@@ -203,7 +203,7 @@ Implement the bot in the prior knowledge check and test it on a friend. Can it t
Take a task in the prior knowledge check and try to implement it. Test the bot on a friend. Can it trick them? Can you make your bot more 'believable?'
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/34/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/34/)
## Review & Self Study
diff --git a/6-NLP/2-Tasks/translations/README.it.md b/6-NLP/2-Tasks/translations/README.it.md
new file mode 100644
index 0000000000..d553eea3bf
--- /dev/null
+++ b/6-NLP/2-Tasks/translations/README.it.md
@@ -0,0 +1,214 @@
+# Compiti e tecniche comuni di elaborazione del linguaggio naturale
+
+Per la maggior parte delle attività di *elaborazione del linguaggio naturale*, il testo da elaborare deve essere suddiviso, esaminato e i risultati archiviati o incrociati con regole e insiemi di dati. Queste attività consentono al programmatore di derivare il _significato_ o l'_intento_ o solo la _frequenza_ di termini e parole in un testo.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/33/)
+
+Si esaminano le comuni tecniche utilizzate nell'elaborazione del testo. Combinate con machine learning, queste tecniche aiutano ad analizzare grandi quantità di testo in modo efficiente. Prima di applicare machine learning a queste attività, tuttavia, occorre cercare di comprendere i problemi incontrati da uno specialista in NLP.
+
+## Compiti comuni per NLP
+
+Esistono diversi modi per analizzare un testo su cui si sta lavorando. Ci sono attività che si possono eseguire e attraverso le quali si è in grado di valutare la comprensione del testo e trarre conclusioni. Di solito si eseguono queste attività in sequenza.
+
+### Tokenizzazione
+
+Probabilmente la prima cosa che la maggior parte degli algoritmi di NLP deve fare è dividere il testo in token o parole. Anche se questo sembra semplice, dover tenere conto della punteggiatura e dei delimitatori di parole e frasi di lingue diverse può renderlo complicato. Potrebbe essere necessario utilizzare vari metodi per determinare le demarcazioni.
+
+![Tokenizzazione](../images/tokenization.png)
+> Tokenizzazione di una frase da **Orgoglio e Pregiudizio**. Infografica di [Jen Looper](https://twitter.com/jenlooper)
+
+### Embedding
+
+I [word embeddings](https://it.wikipedia.org/wiki/Word_embedding) sono un modo per convertire numericamente i dati di testo. Gli embedding vengono eseguiti in modo tale che le parole con un significato simile o le parole usate insieme vengano raggruppate insieme.
+
+![word embeddings](../images/embedding.png)
+> "I have the highest respect for your nerves, they are my old friends." - Incorporazioni di parole per una frase in **Orgoglio e Pregiudizio**. Infografica di [Jen Looper](https://twitter.com/jenlooper)
+
+✅ Provare [questo interessante strumento](https://projector.tensorflow.org/) per sperimentare i word embedding. Facendo clic su una parola vengono visualizzati gruppi di parole simili: gruppi di "toy" con "disney", "lego", "playstation" e "console".
+
+### Analisi e codifica di parti del discorso
+
+Ogni parola che è stata tokenizzata può essere etichettata come parte del discorso: un sostantivo, un verbo o un aggettivo. La frase `the quick red fox jumped over the lazy brown dog` potrebbe essere etichettata come fox = sostantivo, jumped = verbo.
+
+![elaborazione](../images/parse.png)
+
+> Analisi di una frase da **Orgoglio e Pregiudizio**. Infografica di [Jen Looper](https://twitter.com/jenlooper)
+
+L'analisi consiste nel riconoscere quali parole sono correlate tra loro in una frase - per esempio `the quick red fox jumped` è una sequenza aggettivo-sostantivo-verbo che è separata dalla sequenza `lazy brown dog` .
+
+### Frequenze di parole e frasi
+
+Una procedura utile quando si analizza un corpo di testo di grandi dimensioni è creare un dizionario di ogni parola o frase di interesse e della frequenza con cui compare. La frase `the quick red fox jumped over the lazy brown dog` ha una frequenza di 2 per la parola `the`.
+
+Si esamina un testo di esempio in cui si conta la frequenza delle parole. La poesia di Rudyard Kipling The Winners contiene i seguenti versi:
+
+```output
+What the moral? Who rides may read.
+When the night is thick and the tracks are blind
+A friend at a pinch is a friend, indeed,
+But a fool to wait for the laggard behind.
+Down to Gehenna or up to the Throne,
+He travels the fastest who travels alone.
+```
+
+Poiché le frequenze delle frasi possono essere, a seconda di quanto richiesto, sensibili o insensibili alle maiuscole, la frase `a friend` ha una frequenza di 2, `the` ha una frequenza di 6 e `travels` di 2.
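+
+A titolo puramente indicativo, ecco un piccolo schizzo in Python (non presente nella lezione originale) che conta le frequenze delle parole nei versi qui sopra, ignorando maiuscole e punteggiatura:
+
+```python
+import re
+from collections import Counter
+
+versi = """What the moral? Who rides may read.
+When the night is thick and the tracks are blind
+A friend at a pinch is a friend, indeed,
+But a fool to wait for the laggard behind.
+Down to Gehenna or up to the Throne,
+He travels the fastest who travels alone."""
+
+# estrae le parole in minuscolo, scartando la punteggiatura
+parole = re.findall(r"[a-z']+", versi.lower())
+frequenze = Counter(parole)
+
+print(frequenze["the"])      # 6
+print(frequenze["travels"])  # 2
+```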
+
+### N-grammi
+
+Un testo può essere suddiviso in sequenze di parole di una lunghezza prestabilita, una parola singola (unigramma), due parole (bigrammi), tre parole (trigrammi) o un numero qualsiasi di parole (n-grammi).
+
+Ad esempio, `the quick red fox jumped over the lazy brown dog` con n-grammi di lunghezza 2 produce i seguenti n-grammi:
+
+1. the quick
+2. quick red
+3. red fox
+4. fox jumped
+5. jumped over
+6. over the
+7. the lazy
+8. lazy brown
+9. brown dog
+
+Potrebbe essere più facile visualizzarlo come una finestra scorrevole sulla frase. Ecco il caso degli n-grammi di 3 parole; l'n-gramma è in grassetto in ogni frase:
+
+1. **the quick red** fox jumped over the lazy brown dog
+2. the **quick red fox** jumped over the lazy brown dog
+3. the quick **red fox jumped** over the lazy brown dog
+4. the quick red **fox jumped over** the lazy brown dog
+5. the quick red fox **jumped over the** lazy brown dog
+6. the quick red fox jumped **over the lazy** brown dog
+7. the quick red fox jumped over **the lazy brown** dog
+8. the quick red fox jumped over the **lazy brown dog**
+
+![finestra scorrevole n-grammi](../images/n-grams.gif)
+
+> Valore N-gram di 3: Infografica di [Jen Looper](https://twitter.com/jenlooper)
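+
+A puro titolo illustrativo, un possibile schizzo in Python per generare n-grammi di parole con una tokenizzazione ingenua sugli spazi:
+
+```python
+def n_grammi(testo, n):
+    # restituisce la lista degli n-grammi di parole consecutive
+    parole = testo.split()
+    return [" ".join(parole[i:i + n]) for i in range(len(parole) - n + 1)]
+
+frase = "the quick red fox jumped over the lazy brown dog"
+print(n_grammi(frase, 2))  # i 9 bigrammi elencati sopra
+print(n_grammi(frase, 3))  # gli 8 trigrammi evidenziati sopra
+```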
+
+### Estrazione frase nominale
+
+Nella maggior parte delle frasi, c'è un sostantivo che è il soggetto o l'oggetto della frase. In inglese, è spesso identificabile con "a" o "an" o "the" che lo precede. Identificare il soggetto o l'oggetto di una frase "estraendo la frase nominale" è un compito comune in NLP quando si cerca di capire il significato di una frase.
+
+✅ Nella frase "I cannot fix on the hour, or the spot, or the look or the words, which laid the foundation. It is too long ago. I was in the middle before I knew that I had begun.", si possono identificare i nomi nelle frasi?
+
+Nella frase `the quick red fox jumped over the lazy brown dog` ci sono 2 frasi nominali: **quick red fox** e **lazy brown dog**.
+
+### Analisi del sentiment
+
+Una frase o un testo può essere analizzato per il sentimento, ovvero per quanto sia *positivo* o *negativo*. Il sentimento si misura in *polarità* e in *oggettività/soggettività*. La polarità è misurata da -1,0 a 1,0 (da negativo a positivo), mentre l'oggettività/soggettività va da 0,0 a 1,0 (dal più oggettivo al più soggettivo).
+
+✅ In seguito si imparerà che ci sono diversi modi per determinare il sentimento usando machine learning ma un modo è avere un elenco di parole e frasi che sono classificate come positive o negative da un esperto umano e applicare quel modello al testo per calcolare un punteggio di polarità. Si riesce a vedere come funzionerebbe in alcune circostanze e meno bene in altre?
+
+### Inflessione
+
+L'inflessione consente di prendere una parola e ottenere il singolare o il plurale della parola.
+
+### Lemmatizzazione
+
+Un *lemma* è la radice o la forma base di un insieme di parole: ad esempio *volò*, *vola* e *volando* hanno come lemma il verbo *volare*.
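+
+Dato che questa lezione si appoggia a `TextBlob`, ecco un piccolo esempio indicativo (su parole inglesi, assumendo che TextBlob e i relativi corpora siano già installati) di inflessione e lemmatizzazione con la classe `Word`:
+
+```python
+from textblob import Word
+
+# inflessione: passare dal singolare al plurale e viceversa
+print(Word("cat").pluralize())       # cats
+print(Word("cities").singularize())  # city
+
+# lemmatizzazione: riportare una forma flessa al suo lemma ("v" indica un verbo)
+print(Word("flew").lemmatize("v"))    # fly
+print(Word("flying").lemmatize("v"))  # fly
+```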
+
+Ci sono anche utili database a disposizione del ricercatore NLP, in particolare:
+
+### WordNet
+
+[WordNet](https://wordnet.princeton.edu/) è un database di parole, sinonimi, contrari e molti altri dettagli per ogni parola in molte lingue diverse. È incredibilmente utile quando si cerca di costruire traduzioni, correttori ortografici o strumenti linguistici di qualsiasi tipo.
+
+## Librerie NLP
+
+Fortunatamente, non è necessario creare tutte queste tecniche da soli, poiché sono disponibili eccellenti librerie Python che le rendono molto più accessibili agli sviluppatori che non sono specializzati nell'elaborazione del linguaggio naturale o in machine learning. Le prossime lezioni includono altri esempi di queste, ma qui si impareranno alcuni esempi utili che aiuteranno con il prossimo compito.
+
+### Esercizio: utilizzo della libreria `TextBlob`
+
+Si usa una libreria chiamata TextBlob in quanto contiene API utili per affrontare questi tipi di attività. TextBlob "sta sulle spalle dei giganti [NLTK](https://nltk.org) e [pattern](https://github.com/clips/pattern), e si sposa bene con entrambi". Ha una notevole quantità di ML incorporato nella sua API.
+
+> Nota: per TextBlob è disponibile un'utile [guida rapida](https://textblob.readthedocs.io/en/dev/quickstart.html#quickstart), consigliata per sviluppatori Python esperti
+
+Quando si tenta di identificare *le frasi nominali*, TextBlob offre diverse opzioni di estrattori per trovarle.
+
+1. Dare un'occhiata a `ConllExtractor`.
+
+ ```python
+ from textblob import TextBlob
+ from textblob.np_extractors import ConllExtractor
+    # importa e crea un estrattore Conll da usare successivamente
+ extractor = ConllExtractor()
+
+ # quando serve un estrattore di frasi nominali:
+ user_input = input("> ")
+ user_input_blob = TextBlob(user_input, np_extractor=extractor) # notare specificato estrattore non predefinito
+ np = user_input_blob.noun_phrases
+ ```
+
+ > Cosa sta succedendo qui? [ConllExtractor](https://textblob.readthedocs.io/en/dev/api_reference.html?highlight=Conll#textblob.en.np_extractors.ConllExtractor) è "Un estrattore di frasi nominali che utilizza l'analisi dei blocchi addestrata con il corpus di formazione ConLL-2000". ConLL-2000 si riferisce alla Conferenza del 2000 sull'apprendimento computazionale del linguaggio naturale. Ogni anno la conferenza ha ospitato un workshop per affrontare uno spinoso problema della NPL, e nel 2000 è stato lo spezzettamento dei sostantivi. Un modello è stato addestrato sul Wall Street Journal, con "sezioni 15-18 come dati di addestramento (211727 token) e sezione 20 come dati di test (47377 token)". Si possono guardare le procedure utilizzate [qui](https://www.clips.uantwerpen.be/conll2000/chunking/) e i [risultati](https://ifarm.nl/erikt/research/np-chunking.html).
+
+### Sfida: migliorare il bot con NLP
+
+Nella lezione precedente si è creato un bot di domande e risposte molto semplice. Ora si renderà Marvin un po' più comprensivo analizzando l'input per il sentimento e stampando una risposta che corrisponda al sentimento. Si dovrà anche identificare una frase nominale `noun_phrase` e chiedere informazioni su di essa.
+
+I passaggi durante la creazione di un bot conversazionale:
+
+1. Stampare le istruzioni che consigliano all'utente come interagire con il bot
+2. Avviare il ciclo
+ 1. Accettare l'input dell'utente
+ 2. Se l'utente ha chiesto di uscire, allora si esce
+ 3. Elaborare l'input dell'utente e determinare la risposta di sentimento appropriata
+   4. Se viene rilevata una frase nominale nell'input, pluralizzarla e chiedere ulteriori input su quell'argomento
+ 5. Stampare la risposta
+3. Riprendere il ciclo dal passo 2
+
+Ecco il frammento di codice per determinare il sentimento usando TextBlob. Si noti che ci sono solo quattro *gradienti* di risposta al sentimento (se ne potrebbero avere di più se lo si desidera):
+
+```python
+if user_input_blob.polarity <= -0.5:
+ response = "Oh dear, that sounds bad. " # Oh caro, è terribile
+elif user_input_blob.polarity <= 0:
+ response = "Hmm, that's not great. " # Mmm, non è eccezionale
+elif user_input_blob.polarity <= 0.5:
+ response = "Well, that sounds positive. " # Bene, questo è positivo
+elif user_input_blob.polarity <= 1:
+ response = "Wow, that sounds great. " # Wow, sembra eccezionale
+```
+
+Ecco un risultato di esempio a scopo di guida (l'input utente è sulle righe che iniziano per >):
+
+```output
+Hello, I am Marvin, the friendly robot.
+You can end this conversation at any time by typing 'bye'
+After typing each answer, press 'enter'
+How are you today?
+> I am ok
+Well, that sounds positive. Can you tell me more?
+> I went for a walk and saw a lovely cat
+Well, that sounds positive. Can you tell me more about lovely cats?
+> cats are the best. But I also have a cool dog
+Wow, that sounds great. Can you tell me more about cool dogs?
+> I have an old hounddog but he is sick
+Hmm, that's not great. Can you tell me more about old hounddogs?
+> bye
+It was nice talking to you, goodbye!
+```
+
+Una possibile soluzione al compito è [qui](../solution/bot.py)
+
+Verifica delle conoscenze
+
+1. Si ritiene che le risposte comprensive "ingannerebbero" qualcuno facendogli pensare che il bot le abbia effettivamente capite?
+2. Identificare la frase nominale rende il bot più 'credibile'?
+3. Perché estrarre una "frase nominale" da una frase sarebbe una cosa utile da fare?
+
+---
+
+Implementare il bot nel controllo delle conoscenze precedenti e testarlo su un amico. Può ingannarlo? Si può rendere il bot più 'credibile?'
+
+## 🚀 Sfida
+
+Prendere un'attività dalla verifica delle conoscenze qui sopra e provare a implementarla. Provare il bot su un amico. Può ingannarlo? Si può rendere il bot più 'credibile?'
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/34/)
+
+## Revisione e Auto Apprendimento
+
+Nelle prossime lezioni si imparerà di più sull'analisi del sentiment. Fare ricerche su questa interessante tecnica in articoli come questi su [KDNuggets](https://www.kdnuggets.com/tag/nlp)
+
+## Compito
+
+[Fare rispondere un bot](assignment.it.md)
diff --git a/6-NLP/2-Tasks/translations/assignment.it.md b/6-NLP/2-Tasks/translations/assignment.it.md
new file mode 100644
index 0000000000..fddf44bc0d
--- /dev/null
+++ b/6-NLP/2-Tasks/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Fare rispondere un bot
+
+## Istruzioni
+
+Nelle ultime lezioni, si è programmato un bot di base con cui chattare. Questo bot fornisce risposte casuali finché non si dice ciao ("bye"). Si possono rendere le risposte un po' meno casuali e attivare le risposte se si dicono cose specifiche, come "perché" o "come"? Si pensi a come machine learning potrebbe rendere questo tipo di lavoro meno manuale mentre si estende il bot. Si possono utilizzare le librerie NLTK o TextBlob per semplificare le attività.
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | --------------------------------------------- | ------------------------------------------------ | ----------------------- |
+| | Viene presentato e documentato un nuovo file bot.py | Viene presentato un nuovo file bot ma contiene bug | Non viene presentato un file |
diff --git a/6-NLP/3-Translation-Sentiment/README.md b/6-NLP/3-Translation-Sentiment/README.md
index bcd6cdd1ae..dd6bd1bffc 100644
--- a/6-NLP/3-Translation-Sentiment/README.md
+++ b/6-NLP/3-Translation-Sentiment/README.md
@@ -2,7 +2,7 @@
In the previous lessons you learned how to build a basic bot using `TextBlob`, a library that embeds ML behind-the-scenes to perform basic NLP tasks such as noun phrase extraction. Another important challenge in computational linguistics is accurate _translation_ of a sentence from one spoken or written language to another.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/35/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/35/)
Translation is a very hard problem compounded by the fact that there are thousands of languages and each can have very different grammar rules. One approach is to convert the formal grammar rules for one language, such as English, into a non-language dependent structure, and then translate it by converting back to another language. This approach means that you would take the following steps:
@@ -143,7 +143,7 @@ Your task is to determine, using sentiment polarity, if *Pride and Prejudice* ha
1. If the polarity is 1 or -1 store the sentence in an array or list of positive or negative messages
5. At the end, print out all the positive sentences and negative sentences (separately) and the number of each.
-Here is a sample [solution](solutions/notebook.ipynb).
+Here is a sample [solution](solution/notebook.ipynb).
✅ Knowledge Check
@@ -176,7 +176,7 @@ Here is a sample [solution](solutions/notebook.ipynb).
Can you make Marvin even better by extracting other features from the user input?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/36/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/36/)
## Review & Self Study
diff --git a/6-NLP/3-Translation-Sentiment/translations/README.it.md b/6-NLP/3-Translation-Sentiment/translations/README.it.md
new file mode 100644
index 0000000000..d1116a9e08
--- /dev/null
+++ b/6-NLP/3-Translation-Sentiment/translations/README.it.md
@@ -0,0 +1,187 @@
+# Traduzione e analisi del sentiment con ML
+
+Nelle lezioni precedenti si è imparato come creare un bot di base utilizzando `TextBlob`, una libreria che incorpora machine learning dietro le quinte per eseguire attività di base di NLP come l'estrazione di frasi nominali. Un'altra sfida importante nella linguistica computazionale è _la traduzione_ accurata di una frase da una lingua parlata o scritta a un'altra.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/35/)
+
+La traduzione è un problema molto difficile, aggravato dal fatto che ci sono migliaia di lingue e ognuna può avere regole grammaticali molto diverse. Un approccio consiste nel convertire le regole grammaticali formali per una lingua, come l'inglese, in una struttura non dipendente dalla lingua e quindi tradurla convertendola in un'altra lingua. Questo approccio significa che si dovrebbero eseguire i seguenti passaggi:
+
+1. **Identificazione**. Identificare o taggare le parole nella lingua di input in sostantivi, verbi, ecc.
+2. **Creare la traduzione**. Produrre una traduzione diretta di ogni parola nel formato della lingua di destinazione.
+
+### Frase di esempio, dall'inglese all'irlandese
+
+In inglese, la frase _I feel happy_ (sono felice) è composta da tre parole nell'ordine:
+
+- **soggetto** (I)
+- **verbo** (feel)
+- **aggettivo** (happy)
+
+Tuttavia, nella lingua irlandese, la stessa frase ha una struttura grammaticale molto diversa - emozioni come "*felice*" o "*triste*" sono espresse come se fossero *su se stessi*.
+
+La frase inglese `I feel happy` in irlandese sarebbe `Tá athas orm`. Una traduzione *letterale* sarebbe `Happy is upon me` (felicità su di me).
+
+Un oratore irlandese che traduce in inglese direbbe `I feel happy`, non `Happy is upon me`, perché capirebbe il significato della frase, anche se le parole e la struttura della frase sono diverse.
+
+L'ordine formale della frase in irlandese è:
+
+- **verbo** (Tá o is)
+- **aggettivo** (athas, o happy)
+- **soggetto** (orm, o upon me)
+
+## Traduzione
+
+Un programma di traduzione ingenuo potrebbe tradurre solo parole, ignorando la struttura della frase.
+
+✅ Se si è imparato una seconda (o terza o più) lingua da adulto, si potrebbe aver iniziato pensando nella propria lingua madre, traducendo un concetto parola per parola nella propria testa nella seconda lingua, e poi pronunciando la traduzione. Questo è simile a quello che stanno facendo i programmi per computer di traduzione ingenui. È importante superare questa fase per raggiungere la fluidità!
+
+La traduzione ingenua porta a cattive (e talvolta esilaranti) traduzioni errate: `I feel happy` si traduce letteralmente in `Mise bhraitheann athas` in irlandese. Ciò significa (letteralmente) `me feel happy` e non è una frase irlandese valida. Anche se l'inglese e l'irlandese sono lingue parlate su due isole vicine, sono lingue molto diverse con strutture grammaticali diverse.
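+
+Per rendere l'idea, uno schizzo giocattolo di traduttore parola per parola (il mini-dizionario è puramente illustrativo) che riproduce esattamente questo tipo di errore:
+
+```python
+# mini-dizionario inglese -> irlandese, volutamente ingenuo e incompleto
+dizionario = {"i": "mise", "feel": "bhraitheann", "happy": "athas"}
+
+def traduzione_ingenua(frase):
+    # traduce ogni parola da sola, ignorando la struttura grammaticale della lingua di destinazione
+    return " ".join(dizionario.get(parola, parola) for parola in frase.lower().split())
+
+print(traduzione_ingenua("I feel happy"))
+# -> "mise bhraitheann athas": parola per parola, ma non è una frase irlandese valida
+```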
+
+> È possibile guardare alcuni video sulle tradizioni linguistiche irlandesi come [questo](https://www.youtube.com/watch?v=mRIaLSdRMMs)
+
+### Approcci di machine learning
+
+Finora, si è imparato a conoscere l'approccio delle regole formali all'elaborazione del linguaggio naturale. Un altro approccio consiste nell'ignorare il significato delle parole e _utilizzare invece machine learning per rilevare i modelli_. Questo può funzionare nella traduzione se si ha molto testo (un *corpus*) o testi (*corpora*) sia nella lingua di origine che in quella di destinazione.
+
+Si prenda ad esempio il caso di *Pride and Prejudice* (*Orgoglio e pregiudizio*), un noto romanzo inglese scritto da Jane Austen nel 1813. Se si consulta il libro in inglese e una traduzione umana del libro in *francese*, si potrebbero rilevare frasi in uno che sono tradotte *idiomaticamente* nell'altro. Lo si farà tra un minuto.
+
+Ad esempio, quando una frase inglese come `I have no money` (non ho denaro) viene tradotta letteralmente in francese, potrebbe diventare `Je n'ai pas de monnaie`. "Monnaie" è un complicato "falso affine" francese, poiché "money" e "monnaie" non sono sinonimi. Una traduzione migliore che un essere umano potrebbe fare sarebbe `Je n'ai pas d'argent`, perché trasmette meglio il significato che non si hanno soldi (piuttosto che "moneta spicciola" che è il significato di "monnaie").
+
+![monnaie](../images/monnaie.png)
+
+> Immagine di [Jen Looper](https://twitter.com/jenlooper)
+
+Se un modello ML ha abbastanza traduzioni umane su cui costruire un modello, può migliorare l'accuratezza delle traduzioni identificando modelli comuni in testi che sono stati precedentemente tradotti da umani esperti parlanti di entrambe le lingue.
+
+### Esercizio - traduzione
+
+Si può usare `TextBlob` per tradurre le frasi. Provare la famosa prima riga di **Orgoglio e Pregiudizio**:
+
+```python
+from textblob import TextBlob
+
+blob = TextBlob(
+ "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife!"
+)
+print(blob.translate(to="fr"))
+
+```
+
+`TextBlob` fa un buon lavoro con la traduzione: "C'est une vérité universellement reconnue, qu'un homme célibataire en possession d'une bonne fortune doit avoir besoin d'une femme!".
+
+Si può sostenere che la traduzione di TextBlob è molto più esatta, infatti, della traduzione francese del 1932 del libro di V. Leconte e Ch. Pressoir:
+
+"C'est une vérité universelle qu'un celibataire pourvu d'une belle fortune doit avoir envie de se marier, et, si peu que l'on sache de son sentiment à cet egard, lorsqu'il arrive dans une nouvelle residence, cette idée est si bien fixée dans l'esprit de ses voisins qu'ils le considèrent sur-le-champ comme la propriété légitime de l'une ou l'autre de leurs filles."
+
+In questo caso, la traduzione informata da ML fa un lavoro migliore del traduttore umano che mette inutilmente parole nella bocca dell'autore originale per "chiarezza".
+
+> Cosa sta succedendo qui? e perché TextBlob è così bravo a tradurre? Ebbene, dietro le quinte, utilizza Google translate, una sofisticata intelligenza artificiale in grado di analizzare milioni di frasi per prevedere le migliori stringhe per il compito da svolgere. Non c'è niente di manuale in corso qui e serve una connessione Internet per usare `blob.translate`.
+
+✅ Provare altre frasi. Qual è migliore, ML o traduzione umana? In quali casi?
+
+## Analisi del sentiment
+
+Un'altra area in cui l'apprendimento automatico può funzionare molto bene è l'analisi del sentiment. Un approccio non ML al sentiment consiste nell'identificare parole e frasi che sono "positive" e "negative". Quindi, dato un nuovo pezzo di testo, calcolare il valore totale delle parole positive, negative e neutre per identificare il sentimento generale.
+
+Questo approccio è facilmente ingannabile come si potrebbe aver visto nel compito di Marvin: la frase `Great, that was a wonderful waste of time, I'm glad we are lost on this dark road` (Grande, è stata una meravigliosa perdita di tempo, sono contento che ci siamo persi su questa strada oscura) è una frase sarcastica e negativa, ma il semplice algoritmo rileva 'great' (grande), 'wonderful' (meraviglioso), 'glad' (contento) come positivo e 'waste' (spreco), 'lost' (perso) e 'dark' (oscuro) come negativo. Il sentimento generale è influenzato da queste parole contrastanti.
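+
+Giusto per illustrare l'idea, uno schizzo minimale di questo approccio basato sul conteggio (le liste di parole sono volutamente ridotte e solo dimostrative), applicato proprio alla frase sarcastica qui sopra:
+
+```python
+parole_positive = {"great", "wonderful", "glad"}
+parole_negative = {"waste", "lost", "dark"}
+
+frase = "Great, that was a wonderful waste of time, I'm glad we are lost on this dark road"
+parole = [p.strip(",.!").lower() for p in frase.split()]
+
+# punteggio = parole positive trovate - parole negative trovate
+punteggio = sum(p in parole_positive for p in parole) - sum(p in parole_negative for p in parole)
+print(punteggio)  # 0: positivo e negativo si bilanciano, ma la frase è chiaramente sarcastica e negativa
+```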
+
+✅ Si rifletta un momento su come si trasmette il sarcasmo come oratori umani. L'inflessione del tono gioca un ruolo importante. Provare a dire la frase "Beh, quel film è stato fantastico" in modi diversi per scoprire come la propria voce trasmette significato.
+
+### Approcci ML
+
+L'approccio ML sarebbe quello di raccogliere manualmente corpi di testo negativi e positivi: tweet, recensioni di film o qualsiasi cosa in cui l'essere umano abbia assegnato un punteggio *e* un parere scritto. Quindi le tecniche di NLP possono essere applicate alle opinioni e ai punteggi, in modo che emergano modelli (ad esempio, le recensioni positive di film tendono ad avere la frase "degno di un Oscar" più delle recensioni di film negative, o le recensioni positive di ristoranti dicono "gourmet" molto più di "disgustoso").
+
+> ⚖️ **Esempio**: se si è lavorato nell'ufficio di un politico e c'era qualche nuova legge in discussione, gli elettori potrebbero scrivere all'ufficio con e-mail a sostegno o e-mail contro la nuova legge specifica. Si supponga che si abbia il compito di leggere le e-mail e ordinarle in 2 pile, *pro* e *contro*. Se ci fossero molte e-mail, si potrebbe essere sopraffatti dal tentativo di leggerle tutte. Non sarebbe bello se un bot potesse leggerle tutte, capirle e dire a quale pila apparteneva ogni email?
+>
+> Un modo per raggiungere questo obiettivo è utilizzare machine learning. Si addestrerebbe il modello con una parte delle email *contro* e una parte delle email *per* . Il modello tenderebbe ad associare frasi e parole con il lato contro o il lato per, *ma non capirebbe alcun contenuto*, solo che è più probabile che alcune parole e modelli in una email appaiano in un *contro* o in un *pro*. Si potrebbe fare una prova con alcune e-mail non usate per addestrare il modello e vedere se si arriva alla stessa conclusione tratta da un umano. Quindi, una volta soddisfatti dell'accuratezza del modello, si potrebbero elaborare le email future senza doverle leggere tutte.
+
+✅ Questo processo ricorda processi usati nelle lezioni precedenti?
+
+## Esercizio - frasi sentimentali
+
+Il sentimento viene misurato con una *polarità* da -1 a 1, il che significa che -1 è il sentimento più negativo e 1 è il più positivo. Il sentimento viene anche misurato con un punteggio 0 - 1 per oggettività (0) e soggettività (1).
+
+Si dia un'altra occhiata a *Orgoglio e pregiudizio* di Jane Austen. Il testo è disponibile qui su [Project Gutenberg](https://www.gutenberg.org/files/1342/1342-h/1342-h.htm). L'esempio seguente mostra un breve programma che analizza il sentimento della prima e dell'ultima frase del libro e ne mostra la polarità del sentimento e il punteggio di soggettività/oggettività.
+
+Si dovrebbe utilizzare la libreria `TextBlob` (descritta sopra) per determinare il `sentiment` (non si deve scrivere il proprio calcolatore del sentimento) nella seguente attività.
+
+```python
+from textblob import TextBlob
+
+quote1 = """It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."""
+
+quote2 = """Darcy, as well as Elizabeth, really loved them; and they were both ever sensible of the warmest gratitude towards the persons who, by bringing her into Derbyshire, had been the means of uniting them."""
+
+sentiment1 = TextBlob(quote1).sentiment
+sentiment2 = TextBlob(quote2).sentiment
+
+print(quote1 + " has a sentiment of " + str(sentiment1))
+print(quote2 + " has a sentiment of " + str(sentiment2))
+```
+
+Si dovrebbe ottenere il seguente risultato:
+
+```output
+It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. has a sentiment of Sentiment(polarity=0.20952380952380953, subjectivity=0.27142857142857146)
+
+Darcy, as well as Elizabeth, really loved them; and they were
+ both ever sensible of the warmest gratitude towards the persons
+ who, by bringing her into Derbyshire, had been the means of
+ uniting them. has a sentiment of Sentiment(polarity=0.7, subjectivity=0.8)
+```
+
+## Sfida: controllare la polarità del sentimento
+
+Il compito è determinare, usando la polarità del sentiment, se *Orgoglio e Pregiudizio* ha più frasi assolutamente positive di quelle assolutamente negative. Per questa attività, si può presumere che un punteggio di polarità di 1 o -1 sia rispettivamente assolutamente positivo o negativo.
+
+**Procedura:**
+
+1. Scaricare una [copia di Orgoglio e pregiudizio](https://www.gutenberg.org/files/1342/1342-h/1342-h.htm) dal Progetto Gutenberg come file .txt. Rimuovere i metadati all'inizio e alla fine del file, lasciando solo il testo originale
+2. Aprire il file in Python ed estrarre il contenuto come una stringa
+3. Creare un TextBlob usando la stringa del libro
+4. Analizzare ogni frase del libro in un ciclo
+ 1. Se la polarità è 1 o -1, memorizzare la frase in un array o in un elenco di messaggi positivi o negativi
+5. Alla fine, stampare tutte le frasi positive e negative (separatamente) e il numero di ciascuna.
+
+Ecco una [soluzione](../solution/notebook.ipynb) di esempio.
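+
+In alternativa, giusto per dare un'idea della struttura del ciclo, uno schizzo minimale (che non è la soluzione ufficiale) potrebbe assomigliare a questo, assumendo di aver salvato il testo ripulito del romanzo in un ipotetico file `orgoglio_e_pregiudizio.txt`:
+
+```python
+from textblob import TextBlob
+
+# il file deve contenere solo il testo del romanzo, senza i metadati di Project Gutenberg
+with open("orgoglio_e_pregiudizio.txt", encoding="utf-8") as f:
+    testo = f.read()
+
+blob = TextBlob(testo)
+positive, negative = [], []
+
+for frase in blob.sentences:
+    polarita = frase.sentiment.polarity
+    if polarita == 1:
+        positive.append(str(frase))
+    elif polarita == -1:
+        negative.append(str(frase))
+
+print(f"Frasi assolutamente positive: {len(positive)}")
+print(f"Frasi assolutamente negative: {len(negative)}")
+```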
+
+✅ Verifica delle conoscenze
+
+1. Il sentimento si basa sulle parole usate nella frase, ma il codice *comprende* le parole?
+2. Si ritiene che la polarità del sentimento sia accurata o, in altre parole, si è *d'accordo* con i punteggi?
+ 1. In particolare, si è d'accordo o in disaccordo con l'assoluta polarità **positiva** delle seguenti frasi?
+ * What an excellent father you have, girls! (Che padre eccellente avete, ragazze!) said she, when the door was shut. (disse lei, non appena si chiuse la porta).
+ * “Your examination of Mr. Darcy is over, I presume,” said Miss Bingley; “and pray what is the result?” ("Il vostro esame di Mr. Darcy è finito, presumo", disse Miss Bingley; "e vi prego qual è il risultato?") “I am perfectly convinced by it that Mr. Darcy has no defect. (Sono perfettamente convinta che il signor Darcy non abbia difetti).
+ * How wonderfully these sort of things occur! (Come accadono meravigliosamente questo genere di cose!).
+ * I have the greatest dislike in the world to that sort of thing. (Ho la più grande antipatia del mondo per quel genere di cose).
+ * Charlotte is an excellent manager, I dare say (Charlotte è un'eccellente manager, oserei dire).
+ * “This is delightful indeed! (“Questo è davvero delizioso!)
+ * I am so happy! (Che gioia!)
+ * Your idea of the ponies is delightful. (La vostra idea dei pony è deliziosa).
+ 2. Le successive 3 frasi sono state valutate con un sentimento assolutamente positivo, ma a una lettura attenta, non sono frasi positive. Perché l'analisi del sentiment ha pensato che fossero frasi positive?
+ * Happy shall I be, when his stay at Netherfield is over!” “I wish I could say anything to comfort you,” replied Elizabeth; “but it is wholly out of my power. (Come sarò felice, quando il suo soggiorno a Netherfield sarà finito!» "Vorrei poter dire qualcosa per consolarti", rispose Elizabeth; “ma proprio non ci riesco).
+ * If I could but see you as happy! (Se solo potessi vederti felice!)
+      * Our distress, my dear Lizzy, is very great. (La nostra angoscia, mia cara Lizzy, è davvero grande).
+   3. Si è d'accordo o in disaccordo con la polarità **negativa** assoluta delle seguenti frasi?
+ - Everybody is disgusted with his pride. (Tutti sono disgustati dal suo orgoglio).
+ - “I should like to know how he behaves among strangers.” “You shall hear then—but prepare yourself for something very dreadful. ("Vorrei sapere come si comporta in mezzo agli estranei." "Allora sentirete, ma preparatevi a qualcosa di terribile).
+ - The pause was to Elizabeth’s feelings dreadful. (La pausa fu terribile per i sentimenti di Elizabeth).
+ - It would be dreadful! (Sarebbe terribile!)
+
+✅ Qualsiasi appassionato di Jane Austen capirebbe che usa spesso i suoi libri per criticare gli aspetti più ridicoli della società inglese Regency. Elizabeth Bennett, la protagonista di *Orgoglio e pregiudizio,* è un'attenta osservatrice sociale (come l'autrice) e il suo linguaggio è spesso pesantemente sfumato. Anche Mr. Darcy (l'interesse amoroso della storia) nota l'uso giocoso e canzonatorio del linguaggio di Elizabeth: "I have had the pleasure of your acquaintance long enough to know that you find great enjoyment in occasionally professing opinions which in fact are not your own." ("Ho il piacere di conoscervi da abbastanza tempo per sapere quanto vi divertiate a esprimere di tanto in tanto delle opinioni che in realtà non vi appartengono")
+
+---
+
+## 🚀 Sfida
+
+Si può rendere Marvin ancora migliore estraendo altre funzionalità dall'input dell'utente?
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/36/)
+
+## Revisione e Auto Apprendimento
+
+Esistono molti modi per estrarre il sentiment dal testo. Si pensi alle applicazioni aziendali che potrebbero utilizzare questa tecnica. Pensare a cosa potrebbe andare storto. Ulteriori informazioni sui sistemi sofisticati pronti per l'azienda che analizzano il sentiment come l'[analisi del testo di Azure](https://docs.microsoft.com/azure/cognitive-services/Text-Analytics/how-tos/text-analytics-how-to-sentiment-analysis?tabs=version-3-1?WT.mc_id=academic-15963-cxa). Provare alcune delle frasi di Orgoglio e Pregiudizio sopra e vedere se può rilevare sfumature.
+
+## Compito
+
+[Licenza poetica](assignment.it.md)
diff --git a/6-NLP/3-Translation-Sentiment/translations/assignment.it.md b/6-NLP/3-Translation-Sentiment/translations/assignment.it.md
new file mode 100644
index 0000000000..1b786ba267
--- /dev/null
+++ b/6-NLP/3-Translation-Sentiment/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Licenza poetica
+
+## Istruzioni
+
+In [questo notebook](https://www.kaggle.com/jenlooper/emily-dickinson-word-frequency) si possono trovare oltre 500 poesie di Emily Dickinson precedentemente analizzate per il sentiment utilizzando l'analisi del testo di Azure. Analizzare questo insieme di dati utilizzando le tecniche descritte nella lezione. Il sentimento suggerito di una poesia corrisponde alla decisione più sofisticata del servizio Azure? Perché sì o perché no, secondo il proprio parere? C’è qualcosa che sorprende?
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | -------------------------------------------------------------------------- | ------------------------------------------------------- | ------------------------ |
+| | Un notebook viene presentato con una solida analisi del risultato da un campione di un autore | Il notebook è incompleto o non esegue l'analisi | Nessun notebook presentato |
diff --git a/6-NLP/4-Hotel-Reviews-1/README.md b/6-NLP/4-Hotel-Reviews-1/README.md
index a4b5a556f7..dd6a573a9e 100644
--- a/6-NLP/4-Hotel-Reviews-1/README.md
+++ b/6-NLP/4-Hotel-Reviews-1/README.md
@@ -6,7 +6,7 @@ In this section you will use the techniques in the previous lessons to do some e
- how to calculate some new data based on the existing columns
- how to save the resulting dataset for use in the final challenge
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/37/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/37/)
### Introduction
@@ -393,7 +393,7 @@ Now that you have explored the dataset, in the next lesson you will filter the d
This lesson demonstrates, as we saw in previous lessons, how critically important it is to understand your data and its foibles before performing operations on it. Text-based data, in particular, bears careful scrutiny. Dig through various text-heavy datasets and see if you can discover areas that could introduce bias or skewed sentiment into a model.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/38/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/38/)
## Review & Self Study
diff --git a/6-NLP/4-Hotel-Reviews-1/translations/README.it.md b/6-NLP/4-Hotel-Reviews-1/translations/README.it.md
new file mode 100644
index 0000000000..cbbca6109c
--- /dev/null
+++ b/6-NLP/4-Hotel-Reviews-1/translations/README.it.md
@@ -0,0 +1,412 @@
+# Analisi del sentiment con le recensioni degli hotel - elaborazione dei dati
+
+In questa sezione si utilizzeranno le tecniche delle lezioni precedenti per eseguire alcune analisi esplorative dei dati di un grande insieme di dati. Una volta compresa bene l'utilità delle varie colonne, si imparerà:
+
+- come rimuovere le colonne non necessarie
+- come calcolare alcuni nuovi dati in base alle colonne esistenti
+- come salvare l'insieme di dati risultante per l'uso nella sfida finale
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/37/)
+
+### Introduzione
+
+Finora si è appreso come i dati di testo siano abbastanza diversi dai tipi di dati numerici. Se è un testo scritto o parlato da un essere umano, può essere analizzato per trovare schemi e frequenze, sentiment e significati. Questa lezione entra in un vero insieme di dati con una vera sfida: **[515K dati di recensioni di hotel in Europa](https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe)** e include una [licenza CC0: Public Domain](https://creativecommons.org/publicdomain/zero/1.0/). È stato ricavato da Booking.com da fonti pubbliche. Il creatore dell'insieme di dati è stato Jiashen Liu.
+
+### Preparazione
+
+Ecco l'occorrente:
+
+* La possibilità di eseguire notebook .ipynb utilizzando Python 3
+* pandas
+* NLTK, [che si dovrebbe installare localmente](https://www.nltk.org/install.html)
+* L'insieme di dati disponibile su Kaggle [515K Hotel Reviews Data in Europe](https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe). Sono circa 230 MB decompressi. Scaricarlo nella cartella radice `/data` associata a queste lezioni di NLP.
+
+## Analisi esplorativa dei dati
+
+Questa sfida presuppone che si stia creando un bot di raccomandazione di hotel utilizzando l'analisi del sentiment e i punteggi delle recensioni degli ospiti. L'insieme di dati da utilizzare include recensioni di 1493 hotel diversi in 6 città.
+
+Utilizzando Python, un insieme di dati di recensioni di hotel e l'analisi del sentiment di NLTK si potrebbe scoprire:
+
+* quali sono le parole e le frasi più usate nelle recensioni?
+* i *tag* ufficiali che descrivono un hotel sono correlati con i punteggi delle recensioni? (ad es. le recensioni più negative per un particolare hotel provengono da *famiglia con bambini piccoli* piuttosto che da *viaggiatore singolo*, forse a indicare che l'hotel è più adatto ai *viaggiatori singoli*?)
+* i punteggi del sentiment NLTK "concordano" con il punteggio numerico del recensore dell'hotel?
+
+#### Insieme di dati
+
+Per esplorare l'insieme di dati scaricato e salvato localmente, aprire il file in un editor tipo VS Code o anche Excel.
+
+Le intestazioni nell'insieme di dati sono le seguenti:
+
+*Hotel_Address, Additional_Number_of_Scoring, Review_Date, Average_Score, Hotel_Name, Reviewer_Nationality, Negative_Review, Review_Total_Negative_Word_Counts, Total_Number_of_Reviews, Positive_Review, Review_Total_Positive_Word_Counts, Total_Number_of_Reviews_Reviewer_Has_Given, Reviewer_Score, Tags, days_since_review, lat, lng*
+
+Qui sono raggruppati in un modo che potrebbe essere più facile da esaminare:
+
+##### Colonne Hotel
+
+* `Hotel_Name` (nome hotel), `Hotel_Address` (indirizzo hotel), `lat` (latitudine), `lng` (longitudine)
+ * Usando *lat* e *lng* si può tracciare una mappa con Python che mostra le posizioni degli hotel (forse codificate a colori per recensioni negative e positive)
+ * Hotel_Address non è ovviamente utile e probabilmente verrà sostituito con una nazione per semplificare l'ordinamento e la ricerca
+
+**Colonne di meta-recensione dell'hotel**
+
+* `Average_Score` (Punteggio medio)
+ * Secondo il creatore dell'insieme di dati, questa colonna è il *punteggio medio dell'hotel, calcolato in base all'ultimo commento dell'ultimo anno*. Questo sembra un modo insolito per calcolare il punteggio, ma sono i dati recuperati, quindi per ora si potrebbero prendere come valore nominale.
+
+ ✅ Sulla base delle altre colonne di questi dati, si riesce a pensare a un altro modo per calcolare il punteggio medio?
+
+* `Total_Number_of_Reviews` (Numero totale di recensioni)
+ * Il numero totale di recensioni ricevute da questo hotel - non è chiaro (senza scrivere del codice) se si riferisce alle recensioni nell'insieme di dati.
+* `Additional_Number_of_Scoring` (Numero aggiuntivo di punteggio)
+ * Ciò significa che è stato assegnato un punteggio di recensione ma nessuna recensione positiva o negativa è stata scritta dal recensore
+
+**Colonne di recensione**
+
+- `Reviewer_Score` (Punteggio recensore)
+ - Questo è un valore numerico con al massimo 1 cifra decimale tra i valori minimo e massimo 2,5 e 10
+ - Non è spiegato perché 2,5 sia il punteggio più basso possibile
+- `Negative_Review` (Recensione Negativa)
+  - Se un recensore non ha scritto nulla, questo campo avrà "**No Negative**" (Nessun negativo)
+ - Si tenga presente che un recensore può scrivere una recensione positiva nella colonna delle recensioni negative (ad es. "non c'è niente di negativo in questo hotel")
+- `Review_Total_Negative_Word_Counts` (Conteggio parole negative totali per revisione)
+ - Conteggi di parole negative più alti indicano un punteggio più basso (senza controllare il sentiment)
+- `Positive_Review` (Recensioni positive)
+  - Se un recensore non ha scritto nulla, questo campo avrà "**No Positive**" (Nessun positivo)
+ - Si tenga presente che un recensore può scrivere una recensione negativa nella colonna delle recensioni positive (ad es. "non c'è niente di buono in questo hotel")
+- `Review_Total_Positive_Word_Counts` (Conteggio parole positive totali per revisione)
+ - Conteggi di parole positive più alti indicano un punteggio più alto (senza controllare il sentiment)
+- `Review_Date` e `days_since_review` (Data revisione e giorni trascorsi dalla revisione)
+ - Una misura di freschezza od obsolescenza potrebbe essere applicata a una recensione (le recensioni più vecchie potrebbero non essere accurate quanto quelle più recenti perché la gestione dell'hotel è cambiata, o sono stati effettuati lavori di ristrutturazione, o è stata aggiunta una piscina, ecc.)
+- `Tag`
+ - Questi sono brevi descrittori che un recensore può selezionare per descrivere il tipo di ospite in cui rientra (ad es. da soli o in famiglia), il tipo di camera che aveva, la durata del soggiorno e come è stata inviata la recensione.
+ - Sfortunatamente, l'uso di questi tag è problematico, controllare la sezione sottostante che discute la loro utilità
+
+**Colonne dei recensori**
+
+- `Total_Number_of_Reviews_Reviewer_Has_Given` (Numero totale di revisioni per recensore)
+ - Questo potrebbe essere un fattore in un modello di raccomandazione, ad esempio, se si potesse determinare che i recensori più prolifici con centinaia di recensioni avevano maggiori probabilità di essere negativi piuttosto che positivi. Tuttavia, il recensore di una particolare recensione non è identificato con un codice univoco e quindi non può essere collegato a un insieme di recensioni. Ci sono 30 recensori con 100 o più recensioni, ma è difficile vedere come questo possa aiutare il modello di raccomandazione.
+- `Reviewer_Nationality` (Nazionalità recensore)
+  - Alcune persone potrebbero pensare che alcune nazionalità abbiano maggiore propensione a dare una recensione positiva o negativa a causa di un'inclinazione nazionalista. Fare attenzione a non incorporare tali punti di vista aneddotici nei propri modelli. Questi sono stereotipi nazionalisti (e talvolta razziali) e ogni recensore era un individuo che ha scritto una recensione in base alla propria esperienza. Potrebbe essere stata filtrata attraverso molte lenti come i loro precedenti soggiorni in hotel, la distanza percorsa e la loro indole. Pensare che la loro nazionalità sia stata la ragione per un punteggio di recensione è difficile da giustificare.
+
+##### Esempi
+
+| Average Score | Total Number Reviews | Reviewer Score | Negative Review | Positive Review | Tags |
+| -------------- | ---------------------- | ---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | ----------------------------------------------------------------------------------------- |
+| 7.8 | 1945 | 2.5 | This is currently not a hotel but a construction site I was terroized from early morning and all day with unacceptable building noise while resting after a long trip and working in the room People were working all day i e with jackhammers in the adjacent rooms I asked for a room change but no silent room was available To make thinks worse I was overcharged I checked out in the evening since I had to leave very early flight and received an appropiate bill A day later the hotel made another charge without my concent in excess of booked price It s a terrible place Don t punish yourself by booking here | Nothing Terrible place Stay away | Business trip Couple Standard Double Room Stayed 2 nights |
+
+Come si può vedere, questo ospite non ha trascorso un soggiorno felice in questo hotel. L'hotel ha un buon punteggio medio di 7,8 e 1945 recensioni, ma questo recensore ha dato 2,5 e ha scritto 115 parole su quanto sia stato negativo il suo soggiorno. Se non è stato scritto nulla nella colonna Positive_Review, si potrebbe supporre che non ci sia stato nulla di positivo, ma purtroppo sono state scritte 7 parole di avvertimento. Se si contassero solo le parole invece del significato o del sentiment delle parole, si potrebbe avere una visione distorta dell'intento dei recensori. Stranamente, il punteggio di 2,5 è fonte di confusione, perché se il soggiorno in hotel è stato così negativo, perché dargli dei punti? Indagando da vicino l'insieme di dati, si vedrà che il punteggio più basso possibile è 2,5, non 0. Il punteggio più alto possibile è 10.
+
+##### Tag
+
+Come accennato in precedenza, a prima vista, l'idea di utilizzare i `tag` per classificare i dati ha senso. Sfortunatamente questi tag non sono standardizzati, il che significa che in un determinato hotel le opzioni potrebbero essere *Camera singola* , *Camera a due letti* e *Camera doppia*, ma nell'hotel successivo potrebbero essere *Camera singola deluxe*, *Camera matrimoniale classica* e *Camera king executive*. Potrebbero essere le stesse cose, ma ci sono così tante varianti che la scelta diventa:
+
+1. Tentare di modificare tutti i termini in un unico standard, il che è molto difficile, perché non è chiaro quale sarebbe il percorso di conversione in ciascun caso (ad es. *Camera singola classica* mappata in *camera singola*, ma *camera Queen Superior con cortile giardino o vista città* è molto più difficile da mappare)
+
+1. Si può adottare un approccio NLP e misurare la frequenza di determinati termini come *Solo*, *Business Traveller* (Viaggiatore d'affari) o *Family with young kids* (Famiglia con bambini piccoli) quando si applicano a ciascun hotel e tenerne conto nella raccomandazione
+
+I tag sono in genere (ma non sempre) un singolo campo contenente un elenco di 5 o 6 valori separati da virgole, corrispondenti a *Type of trip* (Tipo di viaggio), *Type of guests* (Tipo di ospiti), *Type of room* (Tipo di camera), *Number of nights* (Numero di notti) e *Type of device* (Tipo di dispositivo con il quale è stata inviata la recensione). Tuttavia, poiché alcuni recensori non compilano ogni campo (potrebbero lasciarne uno vuoto), i valori non sono sempre nello stesso ordine.
+
+Ad esempio, si prenda *Type of group* (Tipo di gruppo). Ci sono 1025 possibilità uniche in questo campo nella colonna `Tags`, e purtroppo solo alcune di esse fanno riferimento a un gruppo (alcune sono il tipo di stanza ecc.). Se si filtrano solo quelli che menzionano la famiglia, saranno ricavati molti risultati relativi al tipo di *Family room*. Se si include il termine *with* (con), cioè se si contano i valori per *Family with*, i risultati sono migliori, con oltre 80.000 dei 515.000 risultati che contengono la frase "Family with young children" (Famiglia con figli piccoli) o "Family with older children" (Famiglia con figli grandi).
+
+Ciò significa che la colonna dei tag non è completamente inutile allo scopo, ma richiederà del lavoro per renderla utile.
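+
+A titolo puramente indicativo, un piccolo schizzo con pandas di questo secondo approccio, che conta quante righe della colonna `Tags` contengono la frase "Family with" (si assume di aver scaricato il CSV come descritto sopra):
+
+```python
+import pandas as pd
+
+df = pd.read_csv('../../data/Hotel_Reviews.csv')
+
+# recensioni i cui tag contengono la frase "Family with"
+family_mask = df["Tags"].str.contains("Family with", na=False)
+print(family_mask.sum())
+
+# le 10 stringhe di tag più frequenti, per farsi un'idea della varietà dei valori
+print(df["Tags"].value_counts().head(10))
+```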
+
+##### Punteggio medio dell'hotel
+
+Ci sono una serie di stranezze o discrepanze nell'insieme di dati che non si riesce a spiegare, ma che sono illustrate qui in modo che se ne sia consapevoli quando si costruiscono i propri modelli. Se si riesce a capirci qualcosa, renderlo noto nella sezione discussione!
+
+L'insieme di dati ha le seguenti colonne relative al punteggio medio e al numero di recensioni:
+
+1. Hotel_Name (Nome Hotel)
+2. Additional_Number_of_Scoring (Numero aggiuntivo di punteggio)
+3. Average_Score (Punteggio medio)
+4. Total_Number_of_Reviews (Numero totale di recensioni)
+5. Reviewer_Score (Punteggio recensore)
+
+L'hotel con il maggior numero di recensioni in questo insieme di dati è *il Britannia International Hotel Canary Wharf* con 4789 recensioni su 515.000. Ma se si guarda al valore `Total_Number_of_Reviews` per questo hotel, è 9086. Si potrebbe supporre che ci siano molti più punteggi senza recensioni, quindi forse si dovrebbe aggiungere il valore della colonna `Additional_Number_of_Scoring` . Quel valore è 2682 e aggiungendolo a 4789 si ottiene 7.471 che è ancora 1615 in meno del valore di `Total_Number_of_Reviews`.
+
+Se si prende la colonna `Average_Score`, si potrebbe supporre che sia la media delle recensioni nell'insieme di dati, ma la descrizione di Kaggle è "*Punteggio medio dell’hotel, calcolato in base all’ultimo commento nell’ultimo anno*". Non sembra così utile, ma si può calcolare la media in base ai punteggi delle recensioni nell'insieme di dati. Utilizzando lo stesso hotel come esempio, il punteggio medio dell'hotel è 7,1 ma il punteggio calcolato (punteggio medio del recensore nell'insieme di dati) è 6,8. Questo è vicino, ma non lo stesso valore, e si può solo supporre che i punteggi dati nelle recensioni `Additional_Number_of_Scoring` hanno aumentato la media a 7,1. Sfortunatamente, senza alcun modo per testare o dimostrare tale affermazione, è difficile utilizzare o fidarsi di `Average_Score`, `Additional_Number_of_Scoring` e `Total_Number_of_Reviews` quando si basano su o fanno riferimento a dati che non sono presenti.
+
+Per complicare ulteriormente le cose, l'hotel con il secondo numero più alto di recensioni ha un punteggio medio calcolato di 8,12 e l'insieme di dati `Average_Score` è 8,1. Questo punteggio corretto è una coincidenza o il primo hotel è una discrepanza?
+
+Nell'ipotesi che questi hotel siano valori anomali e che la maggior parte dei valori coincida (mentre alcuni, per qualche motivo, no), si scriverà un breve programma per esplorare i valori nell'insieme di dati e determinare l'uso corretto (o il mancato uso) di questi valori.
+
+> 🚨 Una nota di cautela
+>
+> Quando si lavora con questo insieme di dati, si scriverà un codice che calcola qualcosa dal testo senza dover leggere o analizzare il testo da soli. Questa è l'essenza di NLP, interpretare il significato o il sentiment senza che lo faccia un essere umano. Tuttavia, è possibile che si leggano alcune delle recensioni negative. Non è raccomandabile farlo. Alcune di esse sono recensioni negative sciocche o irrilevanti, come "Il tempo non era eccezionale", qualcosa al di fuori del controllo dell'hotel, o di chiunque, in effetti. Ma c'è anche un lato oscuro in alcune recensioni. A volte le recensioni negative sono razziste, sessiste o antietà. Questo è un peccato, ma è prevedibile in un insieme di dati recuperato da un sito web pubblico. Alcuni recensori lasciano recensioni che si potrebbe trovare sgradevoli, scomode o sconvolgenti. Meglio lasciare che il codice misuri il sentiment piuttosto che leggerle da soli e arrabbiarsi. Detto questo, è una minoranza che scrive queste cose, ma esistono lo stesso.
+
+## Esercizio - Esplorazione dei dati
+
+### Caricare i dati
+
+Per ora l'esame visivo dei dati è sufficiente, adesso si scriverà del codice e si otterranno alcune risposte! Questa sezione utilizza la libreria pandas. Il primo compito è assicurarsi di poter caricare e leggere i dati CSV. La libreria pandas ha un veloce caricatore CSV e il risultato viene inserito in un dataframe, come nelle lezioni precedenti. Il CSV che si sta caricando ha oltre mezzo milione di righe, ma solo 17 colonne. Pandas offre molti modi potenti per interagire con un dataframe, inclusa la possibilità di eseguire operazioni su ogni riga.
+
+Da qui in poi in questa lezione, ci saranno frammenti di codice e alcune spiegazioni del codice e alcune discussioni su cosa significano i risultati. Usare il _notebook.ipynb_ incluso per il proprio codice.
+
+Si inizia con il caricamento del file di dati da utilizzare:
+
+```python
+# Carica il CSV con le recensioni degli hotel
+import pandas as pd
+import time
+# importa time per determinare orario di inizio e fine caricamento per poterne calcolare la durata
+print("Loading data file now, this could take a while depending on file size")
+start = time.time()
+# df è un 'DataFrame' - assicurarsi di aver scaricato il file nella cartella data
+df = pd.read_csv('../../data/Hotel_Reviews.csv')
+end = time.time()
+print("Loading took " + str(round(end - start, 2)) + " seconds")
+```
+
+Ora che i dati sono stati caricati, si possono eseguire alcune operazioni su di essi. Tenere questo codice nella parte superiore del programma per la parte successiva.
+
+## Esplorare i dati
+
+In questo caso, i dati sono già *puliti*: ciò significa che sono pronti per essere elaborati e non contengono caratteri in altre lingue che potrebbero mettere in difficoltà algoritmi che si aspettano solo caratteri inglesi.
+
+✅ Potrebbe essere necessario lavorare con dati che richiedono un'elaborazione iniziale per formattarli prima di applicare le tecniche di NLP, ma non questa volta. Se si dovesse, come si gestirebbero i caratteri non inglesi?
+
+Si prenda un momento per assicurarsi che, una volta caricati, i dati possano essere esplorati con il codice. È molto facile volersi concentrare sulle colonne `Negative_Review` e `Positive_Review`. Sono piene di testo naturale da elaborare con gli algoritmi di NLP. Ma non è finita qui! Prima di passare a NLP e sentiment, si dovrebbe seguire il codice qui sotto per accertare se i valori forniti nell'insieme di dati corrispondono ai valori calcolati con pandas.
+
+## Operazioni con dataframe
+
+Il primo compito di questa lezione è verificare se le seguenti asserzioni sono corrette scrivendo del codice che esamini il dataframe (senza modificarlo).
+
+> Come per molte attività di programmazione, ci sono diversi modi per completarla, ma un buon consiglio è farlo nel modo più semplice e facile possibile, soprattutto perché sarà più facile da capire quando si riesaminerà questo codice in futuro. Con i dataframe, esiste un'API completa che spesso offre un modo efficiente per fare ciò che serve.
+
+Trattare le seguenti domande come attività di codifica e provare a rispondere senza guardare la soluzione.
+
+1. Stampare la *forma* del dataframe appena caricato (la forma è il numero di righe e colonne)
+2. Calcolare il conteggio della frequenza per le nazionalità dei recensori:
+ 1. Quanti valori distinti ci sono per la colonna `Reviewer_Nationality` e quali sono?
+ 2. Quale nazionalità del recensore è la più comune nell'insieme di dati (stampare nazione e numero di recensioni)?
+ 3. Quali sono le prossime 10 nazionalità più frequenti e la loro frequenza?
+3. Qual è stato l'hotel più recensito per ciascuna delle 10 nazionalità più recensite?
+4. Quante recensioni ci sono per hotel (conteggio della frequenza dell'hotel) nell'insieme di dati?
+5. Sebbene sia presente una colonna `Average_Score` per ogni hotel nell'insieme di dati, si può anche calcolare un punteggio medio (ottenendo la media di tutti i punteggi dei recensori nell'insieme di dati per ogni hotel). Aggiungere una nuova colonna al dataframe con l'intestazione della colonna `Calc_Average_Score` che contiene quella media calcolata.
+6. Ci sono hotel che hanno lo stesso (arrotondato a 1 decimale) `Average_Score` e `Calc_Average_Score`?
+ 1. Provare a scrivere una funzione Python che accetta una serie (riga) come argomento e confronta i valori, stampando un messaggio quando i valori non sono uguali. Quindi usare il metodo `.apply()` per elaborare ogni riga con la funzione.
+7. Calcolare e stampare quante righe contengono valori di "No Negative" nella colonna `Negative_Review`
+8. Calcolare e stampare quante righe contengono valori di "No Positive" nella colonna `Positive_Review`
+9. Calcolare e stampare quante righe contengono valori di "No Positive" nella colonna `Positive_Review` **e** valori di "No Negative" nella colonna `Negative_Review`
+
+### Risposte
+
+1. Stampare la *forma* del dataframe appena caricato (la forma è il numero di righe e colonne)
+
+ ```python
+ print("The shape of the data (rows, cols) is " + str(df.shape))
+ > The shape of the data (rows, cols) is (515738, 17)
+ ```
+
+2. Calcolare il conteggio della frequenza per le nazionalità dei recensori:
+
+ 1. Quanti valori distinti ci sono per la colonna `Reviewer_Nationality` e quali sono?
+   2. Quale nazionalità del recensore è la più comune nell'insieme di dati (stampare nazione e numero di recensioni)?
+
+ ```python
+ # value_counts() crea un oggetto Series con indice e valori, in questo caso la nazione
+ # e la frequenza con la quale si manifestano nella nazionalità del recensore
+ nationality_freq = df["Reviewer_Nationality"].value_counts()
+ print("There are " + str(nationality_freq.size) + " different nationalities")
+ # stampa la prima e ultima riga della Series. Modificare in nationality_freq.to_string() per stampare tutti i dati
+ print(nationality_freq)
+
+ There are 227 different nationalities
+ United Kingdom 245246
+ United States of America 35437
+ Australia 21686
+ Ireland 14827
+ United Arab Emirates 10235
+ ...
+ Comoros 1
+ Palau 1
+ Northern Mariana Islands 1
+ Cape Verde 1
+ Guinea 1
+ Name: Reviewer_Nationality, Length: 227, dtype: int64
+ ```
+
+ 3. Quali sono le prossime 10 nazionalità più frequenti e la loro frequenza?
+
+ ```python
+ print("The highest frequency reviewer nationality is " + str(nationality_freq.index[0]).strip() + " with " + str(nationality_freq[0]) + " reviews.")
+ # Notare che c'è uno spazio davanti ai valori, strip() lo rimuove per la stampa
+    # Quali sono le 10 nazionalità più comuni e la loro frequenza?
+ print("The next 10 highest frequency reviewer nationalities are:")
+ print(nationality_freq[1:11].to_string())
+
+ The highest frequency reviewer nationality is United Kingdom with 245246 reviews.
+ The next 10 highest frequency reviewer nationalities are:
+ United States of America 35437
+ Australia 21686
+ Ireland 14827
+ United Arab Emirates 10235
+ Saudi Arabia 8951
+ Netherlands 8772
+ Switzerland 8678
+ Germany 7941
+ Canada 7894
+ France 7296
+ ```
+
+3. Qual è stato l'hotel più recensito per ciascuna delle 10 nazionalità più recensite?
+
+ ```python
+ # Qual è stato l'hotel più recensito per ciascuna delle 10 nazionalità più recensite
+ # In genere con pandas si cerca di evitare un ciclo esplicito, ma si vuole mostrare come si crea un
+ # nuovo dataframe usando criteri (non fare questo con un grande volume di dati in quanto potrebbe essere molto lento)
+ for nat in nationality_freq[:10].index:
+ # Per prima cosa estrarre tutte le righe che corrispondono al criterio in un nuovo dataframe
+ nat_df = df[df["Reviewer_Nationality"] == nat]
+ # Ora ottenere la frequenza per l'hotel
+ freq = nat_df["Hotel_Name"].value_counts()
+ print("The most reviewed hotel for " + str(nat).strip() + " was " + str(freq.index[0]) + " with " + str(freq[0]) + " reviews.")
+
+ The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.
+ The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.
+ The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.
+ The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.
+ The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.
+ The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.
+ The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.
+ The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.
+ The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.
+ The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.
+ ```
+
+4. Quante recensioni ci sono per hotel (conteggio della frequenza dell'hotel) nell'insieme di dati?
+
+ ```python
+ # Per prima cosa creare un nuovo dataframe in base a quello vecchio, togliendo le colonne non necessarie
+ hotel_freq_df = df.drop(["Hotel_Address", "Additional_Number_of_Scoring", "Review_Date", "Average_Score", "Reviewer_Nationality", "Negative_Review", "Review_Total_Negative_Word_Counts", "Positive_Review", "Review_Total_Positive_Word_Counts", "Total_Number_of_Reviews_Reviewer_Has_Given", "Reviewer_Score", "Tags", "days_since_review", "lat", "lng"], axis = 1)
+
+    # Raggruppare le righe per Hotel_Name, conteggiarle e inserire il risultato in una nuova colonna Total_Reviews_Found
+ hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')
+
+ # Eliminare tutte le righe duplicate
+ hotel_freq_df = hotel_freq_df.drop_duplicates(subset = ["Hotel_Name"])
+ display(hotel_freq_df)
+ ```
+
+ | Hotel_Name (Nome Hotel) | Total_Number_of_Reviews (Numero totale di recensioni) | Total_Reviews_Found |
+ | :----------------------------------------: | :---------------------------------------------------: | :-----------------: |
+ | Britannia International Hotel Canary Wharf | 9086 | 4789 |
+   | Park Plaza Westminster Bridge London        |                         12158                          |        4169         |
+ | Copthorne Tara Hotel London Kensington | 7105 | 3578 |
+ | ... | ... | ... |
+ | Mercure Paris Porte d'Orléans | 110 | 10 |
+ | Hotel Wagner | 135 | 10 |
+ | Hotel Gallitzinberg | 173 | 8 |
+
+
+   Si potrebbe notare che il *conteggio nell'insieme di dati* non corrisponde al valore in `Total_Number_of_Reviews`. Non è chiaro se questo valore rappresenti il numero totale di recensioni che l'hotel ha ricevuto (ma non tutte recuperate nell'insieme di dati) o se sia frutto di qualche altro calcolo. Proprio a causa di questa ambiguità, `Total_Number_of_Reviews` non viene utilizzato nel modello.
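+
+   Un controllo rapido (solo un abbozzo, assumendo `hotel_freq_df` creato come sopra) per vedere per quanti hotel i due valori coincidono:
+
+   ```python
+   # Abbozzo: per quanti hotel il valore dichiarato coincide con il conteggio trovato?
+   matching = (hotel_freq_df.Total_Number_of_Reviews == hotel_freq_df.Total_Reviews_Found).sum()
+   print(str(matching) + " hotel su " + str(len(hotel_freq_df)) + " hanno i due valori uguali")
+   ```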
+
+5. Sebbene sia presente una colonna `Average_Score` per ogni hotel nell'insieme di dati, si può anche calcolare un punteggio medio (ottenendo la media di tutti i punteggi dei recensori nell'insieme di dati per ogni hotel). Aggiungere una nuova colonna al dataframe con l'intestazione della colonna `Calc_Average_Score` che contiene quella media calcolata. Stampare le colonne `Hotel_Name`, `Average_Score` e `Calc_Average_Score`.
+
+ ```python
+ # definisce una funzione che ottiene una riga ed esegue alcuni calcoli su di essa
+ def get_difference_review_avg(row):
+ return row["Average_Score"] - row["Calc_Average_Score"]
+
+ # 'mean' è la definizione matematica per 'average'
+ df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)
+
+ # Aggiunge una nuova colonna con la differenza tra le due medie di punteggio
+ df["Average_Score_Difference"] = df.apply(get_difference_review_avg, axis = 1)
+
+ # Crea un df senza tutti i duplicati di Hotel_Name (quindi una sola riga per hotel)
+ review_scores_df = df.drop_duplicates(subset = ["Hotel_Name"])
+
+    # Ordina il dataframe per trovare la differenza più bassa e più alta per il punteggio medio
+ review_scores_df = review_scores_df.sort_values(by=["Average_Score_Difference"])
+
+ display(review_scores_df[["Average_Score_Difference", "Average_Score", "Calc_Average_Score", "Hotel_Name"]])
+ ```
+
+ Ci si potrebbe anche chiedere del valore `Average_Score` e perché a volte è diverso dal punteggio medio calcolato. Poiché non è possibile sapere perché alcuni valori corrispondano, ma altri hanno una differenza, in questo caso è più sicuro utilizzare i punteggi delle recensioni a disposizione per calcolare autonomamente la media. Detto questo, le differenze sono solitamente molto piccole, ecco gli hotel con la maggiore deviazione dalla media dell'insieme di dati e dalla media calcolata:
+
+ | Average_Score_Difference | Average_Score | Calc_Average_Score | Hotel_Name (Nome Hotel) |
+ | :----------------------: | :-----------: | :----------------: | -------------------------------------------: |
+ | -0,8 | 7,7 | 8,5 | Best Western Hotel Astoria |
+   | -0,7                     | 8,8           | 9,5                | Hotel Stendhal Place Vend me Paris MGallery   |
+   | -0,7                     | 7,5           | 8,2                | Mercure Paris Porte d'Orléans                 |
+ | -0,7 | 7,9 | 8,6 | Renaissance Paris Vendome Hotel |
+ | -0,5 | 7,0 | 7,5 | Hotel Royal Elys es |
+ | ... | ... | ... | ... |
+   | 0,7                      | 7,5           | 6,8                | Mercure Paris Op ra Faubourg Montmartre       |
+   | 0,8                      | 7,1           | 6,3                | Holiday Inn Paris Montparnasse Pasteur        |
+   | 0,9                      | 6,8           | 5,9                | Villa Eugenia                                 |
+   | 0,9                      | 8,6           | 7,7                | Marquis Faubourg St Honor Relais Ch teaux     |
+ | 1,3 | 7,2 | 5,9 | Kube Hotel Ice Bar |
+
+   Poiché un solo hotel ha una differenza di punteggio maggiore di 1, probabilmente si può ignorare la differenza e utilizzare il punteggio medio calcolato.
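+
+   Per verificarlo rapidamente (solo un abbozzo, assumendo `review_scores_df` creato come sopra):
+
+   ```python
+   # Abbozzo: quanti hotel hanno una differenza assoluta di punteggio maggiore di 1?
+   print((review_scores_df.Average_Score_Difference.abs() > 1).sum())
+   ```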
+
+6. Calcolare e stampare quante righe contengono valori di "No Negative" nella colonna `Negative_Review`
+
+7. Calcolare e stampare quante righe contengono valori di "No Positive" nella colonna `Positive_Review`
+
+8. Calcolare e stampare quante righe contengono valori di "No Positive" nella colonna `Positive_Review` **e** valori di "No Negative" nella colonna `Negative_Review`
+
+ ```python
+    # con funzioni lambda:
+ start = time.time()
+ no_negative_reviews = df.apply(lambda x: True if x['Negative_Review'] == "No Negative" else False , axis=1)
+ print("Number of No Negative reviews: " + str(len(no_negative_reviews[no_negative_reviews == True].index)))
+
+ no_positive_reviews = df.apply(lambda x: True if x['Positive_Review'] == "No Positive" else False , axis=1)
+ print("Number of No Positive reviews: " + str(len(no_positive_reviews[no_positive_reviews == True].index)))
+
+ both_no_reviews = df.apply(lambda x: True if x['Negative_Review'] == "No Negative" and x['Positive_Review'] == "No Positive" else False , axis=1)
+ print("Number of both No Negative and No Positive reviews: " + str(len(both_no_reviews[both_no_reviews == True].index)))
+ end = time.time()
+ print("Lambdas took " + str(round(end - start, 2)) + " seconds")
+
+ Number of No Negative reviews: 127890
+ Number of No Positive reviews: 35946
+ Number of both No Negative and No Positive reviews: 127
+ Lambdas took 9.64 seconds
+ ```
+
+## Un'altra strada
+
+Un altro modo per contare gli elementi senza lambda è utilizzare `sum` per contare le righe:
+
+```python
+# senza funzioni lambda (usando un misto di notazioni per mostrare che si possono usare entrambi)
+start = time.time()
+no_negative_reviews = sum(df.Negative_Review == "No Negative")
+print("Number of No Negative reviews: " + str(no_negative_reviews))
+
+no_positive_reviews = sum(df["Positive_Review"] == "No Positive")
+print("Number of No Positive reviews: " + str(no_positive_reviews))
+
+both_no_reviews = sum((df.Negative_Review == "No Negative") & (df.Positive_Review == "No Positive"))
+print("Number of both No Negative and No Positive reviews: " + str(both_no_reviews))
+
+end = time.time()
+print("Sum took " + str(round(end - start, 2)) + " seconds")
+
+Number of No Negative reviews: 127890
+Number of No Positive reviews: 35946
+Number of both No Negative and No Positive reviews: 127
+Sum took 0.19 seconds
+```
+
+Si potrebbe aver notato che ci sono 127 righe che hanno sia il valore "No Negative" sia il valore "No Positive" rispettivamente nelle colonne `Negative_Review` e `Positive_Review`. Ciò significa che il recensore ha assegnato all'hotel un punteggio numerico, ma non ha voluto scrivere né una recensione positiva né una negativa. Fortunatamente si tratta di una piccola quantità di righe (127 su 515738, ovvero lo 0,02%), quindi probabilmente non distorcerà il modello o i risultati in una direzione particolare; tuttavia non ci si aspetterebbe che un insieme di dati di recensioni contenga righe senza alcuna recensione, quindi vale la pena esplorare i dati per scoprire righe come questa.
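+
+Per dare un'occhiata a queste righe si può usare un filtro booleano come il seguente (solo un abbozzo):
+
+```python
+# Abbozzo: mostra alcune delle righe senza recensione negativa né positiva
+both = df[(df.Negative_Review == "No Negative") & (df.Positive_Review == "No Positive")]
+print(both[["Hotel_Name", "Reviewer_Score"]].head())
+```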
+
+Ora che si è esplorato l'insieme di dati, nella prossima lezione si filtreranno i dati e si aggiungerà un'analisi del sentiment.
+
+---
+
+## 🚀 Sfida
+
+Questa lezione dimostra, come visto nelle lezioni precedenti, quanto sia di fondamentale importanza comprendere i dati e le loro debolezze prima di eseguire operazioni su di essi. I dati basati su testo, in particolare, richiedono un esame attento. Esaminare vari insiemi di dati contenenti principalmente testo e vedere se si riesce a individuare aree che potrebbero introdurre pregiudizi o distorsioni di sentiment in un modello.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/38/)
+
+## Revisione e Auto Apprendimento
+
+Seguire [questo percorso di apprendimento su NLP](https://docs.microsoft.com/learn/paths/explore-natural-language-processing/?WT.mc_id=academic-15963-cxa) per scoprire gli strumenti da provare durante la creazione di modelli vocali e di testo.
+
+## Compito
+
+[NLTK](assignment.it.md)
diff --git a/6-NLP/4-Hotel-Reviews-1/translations/assignment.it.md b/6-NLP/4-Hotel-Reviews-1/translations/assignment.it.md
new file mode 100644
index 0000000000..5fd90e11e9
--- /dev/null
+++ b/6-NLP/4-Hotel-Reviews-1/translations/assignment.it.md
@@ -0,0 +1,5 @@
+# NLTK
+
+## Instructions
+
+NLTK is a well-known library for use in computational linguistics and NLP. Take this opportunity to read through the '[NLTK book](https://www.nltk.org/book/)' and try out its exercises. In this ungraded assignment, you will get to know this library more deeply.
diff --git a/6-NLP/5-Hotel-Reviews-2/README.md b/6-NLP/5-Hotel-Reviews-2/README.md
index 7d8a4d031a..c764b8e1ce 100644
--- a/6-NLP/5-Hotel-Reviews-2/README.md
+++ b/6-NLP/5-Hotel-Reviews-2/README.md
@@ -1,7 +1,7 @@
# Sentiment analysis with hotel reviews
 Now that you have explored the dataset in detail, it's time to filter the columns and then use NLP techniques on the dataset to gain new insights about the hotels.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/39/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/39/)
### Filtering & Sentiment Analysis Operations
@@ -347,20 +347,20 @@ print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv(r"../data/Hotel_Reviews_NLP.csv", index = False)
```
-You should run the entire code for [the analysis notebook](solution/notebook-sentiment-analysis.ipynb) (after you've run [your filtering notebook](solution/notebook-filtering.ipynb) to generate the Hotel_Reviews_Filtered.csv file).
+You should run the entire code for [the analysis notebook](solution/3-notebook.ipynb) (after you've run [your filtering notebook](solution/1-notebook.ipynb) to generate the Hotel_Reviews_Filtered.csv file).
To review, the steps are:
-1. Original dataset file **Hotel_Reviews.csv** is explored in the previous lesson with [the explorer notebook](../4-Hotel-Reviews-1/solution/notebook-explorer.ipynb)
-2. Hotel_Reviews.csv is filtered by [the filtering notebook](solution/notebook-filtering.ipynb) resulting in **Hotel_Reviews_Filtered.csv**
-3. Hotel_Reviews_Filtered.csv is processed by [the sentiment analysis notebook](solution/notebook-sentiment-analysis.ipynb) resulting in **Hotel_Reviews_NLP.csv**
+1. Original dataset file **Hotel_Reviews.csv** is explored in the previous lesson with [the explorer notebook](../4-Hotel-Reviews-1/solution/notebook.ipynb)
+2. Hotel_Reviews.csv is filtered by [the filtering notebook](solution/1-notebook.ipynb) resulting in **Hotel_Reviews_Filtered.csv**
+3. Hotel_Reviews_Filtered.csv is processed by [the sentiment analysis notebook](solution/3-notebook.ipynb) resulting in **Hotel_Reviews_NLP.csv**
4. Use Hotel_Reviews_NLP.csv in the NLP Challenge below
### Conclusion
When you started, you had a dataset with columns and data but not all of it could be verified or used. You've explored the data, filtered out what you don't need, converted tags into something useful, calculated your own averages, added some sentiment columns and hopefully, learned some interesting things about processing natural text.
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/40/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/40/)
## Challenge
diff --git a/6-NLP/5-Hotel-Reviews-2/translations/README.it.md b/6-NLP/5-Hotel-Reviews-2/translations/README.it.md
new file mode 100644
index 0000000000..46e92c7b64
--- /dev/null
+++ b/6-NLP/5-Hotel-Reviews-2/translations/README.it.md
@@ -0,0 +1,376 @@
+# Analisi del sentiment con recensioni di hotel
+
+Ora che si è esplorato in dettaglio l'insieme di dati, è il momento di filtrare le colonne e quindi utilizzare le tecniche NLP sull'insieme di dati per ottenere nuove informazioni sugli hotel.
+
+## [Quiz Pre-Lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/39/)
+
+### Operazioni di Filtraggio e Analisi del Sentiment
+
+Come probabilmente si è notato, l'insieme di dati presenta alcuni problemi. Alcune colonne sono piene di informazioni inutili, altre sembrano errate. Anche se fossero corrette, non è chiaro come siano state calcolate e le risposte non possono essere verificate in modo indipendente con i propri calcoli.
+
+## Esercizio: un po' più di elaborazione dei dati
+
+Occorre pulire un po' di più i dati. Si aggiungono colonne che saranno utili in seguito, si modificano i valori in altre colonne e si eliminano completamente determinate colonne.
+
+1. Elaborazione iniziale colonne
+
+ 1. Scartare `lat` e `lng`
+
+   2. Sostituire i valori di `Hotel_Address` con i seguenti valori (se l'indirizzo contiene il nome della città e della nazione, sostituirlo solo con la città e la nazione).
+
+ Queste sono le uniche città e nazioni nell'insieme di dati:
+
+ Amsterdam, Netherlands
+
+ Barcelona, Spain
+
+ London, United Kingdom
+
+ Milan, Italy
+
+ Paris, France
+
+ Vienna, Austria
+
+ ```python
+ def replace_address(row):
+ if "Netherlands" in row["Hotel_Address"]:
+ return "Amsterdam, Netherlands"
+ elif "Barcelona" in row["Hotel_Address"]:
+ return "Barcelona, Spain"
+ elif "United Kingdom" in row["Hotel_Address"]:
+ return "London, United Kingdom"
+ elif "Milan" in row["Hotel_Address"]:
+ return "Milan, Italy"
+ elif "France" in row["Hotel_Address"]:
+ return "Paris, France"
+ elif "Vienna" in row["Hotel_Address"]:
+ return "Vienna, Austria"
+
+ # Sostituisce tutti gli indirizzi con una forma ridotta più utile
+ df["Hotel_Address"] = df.apply(replace_address, axis = 1)
+   # La somma di value_counts() dovrebbe corrispondere al numero totale di recensioni
+ print(df["Hotel_Address"].value_counts())
+ ```
+
+ Ora si possono interrogare i dati a livello di nazione:
+
+ ```python
+ display(df.groupby("Hotel_Address").agg({"Hotel_Name": "nunique"}))
+ ```
+
+ | Hotel_Address | Hotel_Name (Nome Hotel) |
+ | :--------------------- | :---------------------: |
+   | Amsterdam, Netherlands |           105           |
+   | Barcelona, Spain       |           211           |
+   | London, United Kingdom |           400           |
+   | Milan, Italy           |           162           |
+   | Paris, France          |           458           |
+   | Vienna, Austria        |           158           |
+
+2. Elaborazione colonne di meta-recensione dell'hotel
+
+   1. Eliminare `Additional_Number_of_Scoring`
+
+   1. Sostituire `Total_Number_of_Reviews` con il numero totale di recensioni per quell'hotel che sono effettivamente nell'insieme di dati
+
+   1. Sostituire `Average_Score` con il punteggio calcolato via codice
+
+   ```python
+   # Elimina `Additional_Number_of_Scoring`
+   df.drop(["Additional_Number_of_Scoring"], axis = 1, inplace=True)
+   # Sostituisce `Total_Number_of_Reviews` e `Average_Score` con i propri valori calcolati
+   df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')
+   df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)
+   ```
+
+3. Elaborazione delle colonne di recensione
+
+ 1. Eliminare `Review_Total_Negative_Word_Counts`, `Review_Total_Positive_Word_Counts`, `Review_Date` e `days_since_review`
+
+ 2. Mantenere `Reviewer_Score`, `Negative_Review` e `Positive_Review` così come sono
+
+ 3. Conservare i `Tags` per ora
+
+ - Si faranno alcune operazioni di filtraggio aggiuntive sui tag nella prossima sezione, successivamente i tag verranno eliminati
+
+4. Elaborazione delle colonne del recensore
+
+   1. Scartare `Total_Number_of_Reviews_Reviewer_Has_Given`
+
+   2. Mantenere `Reviewer_Nationality`
+
+### Colonne tag
+
+La colonna `Tag` è problematica in quanto si tratta di un elenco (in formato testo) memorizzato nella colonna. Purtroppo l'ordine e il numero delle sottosezioni in questa colonna non sono sempre gli stessi. È difficile per un essere umano identificare le frasi corrette a cui interessarsi, perché ci sono 515.000 righe e 1427 hotel e ognuno ha opzioni leggermente diverse che un recensore potrebbe scegliere. È qui che NLP brilla: si può scansionare il testo, trovare le frasi più comuni e contarle.
+
+Purtroppo non interessano parole singole, ma frasi composte da più parole (es. *Business trip*, viaggio di lavoro). L'esecuzione di un algoritmo di distribuzione della frequenza a più parole su così tanti dati (6762646 parole) potrebbe richiedere una quantità straordinaria di tempo, ma, senza guardare i dati, sembrerebbe una spesa necessaria. È qui che l'analisi esplorativa dei dati diventa utile: poiché si è visto un esempio di tag come `["Business trip", "Solo traveler", "Single Room", "Stayed 5 nights", "Submitted from a mobile device"]`, ci si può chiedere se sia possibile ridurre notevolmente l'elaborazione da fare. Fortunatamente lo è, ma prima occorre seguire alcuni passaggi per accertare i tag di interesse.
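+
+Per farsi un'idea dell'ordine di grandezza, un conteggio approssimativo delle parole nella colonna `Tags` si può abbozzare così (si assume il dataframe `df` già caricato come sopra):
+
+```python
+# Abbozzo: numero complessivo (approssimativo) di parole nella colonna Tags
+print(df.Tags.str.split().str.len().sum())
+```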
+
+### Filtraggio tag
+
+Ricordare che l'obiettivo dell'insieme di dati è aggiungere il sentiment e le colonne che aiuteranno a scegliere l'hotel migliore (per se stessi o magari per un cliente che chiede di creare un bot di raccomandazione di hotel). Occorre chiedersi se i tag sono utili o meno nell'insieme di dati finale. Ecco un'interpretazione (se l'insieme di dati servisse per altri scopi, la scelta dei tag da mantenere o escludere potrebbe essere diversa):
+
+1. Il tipo di viaggio è rilevante e dovrebbe rimanere
+2. Il tipo di gruppo di ospiti è importante e dovrebbe rimanere
+3. Il tipo di camera, suite o monolocale in cui ha soggiornato l'ospite è irrilevante (tutti gli hotel hanno praticamente le stesse stanze)
+4. Il dispositivo su cui è stata inviata la recensione è irrilevante
+5. Il numero di notti in cui il recensore ha soggiornato *potrebbe* essere rilevante se si attribuisce a soggiorni più lunghi un gradimento maggiore per l'hotel, ma è una forzatura e probabilmente irrilevante
+
+In sintesi, si **mantengono 2 tipi di tag e si rimuove il resto**.
+
+Innanzitutto, non si vogliono contare i tag finché non sono in un formato migliore, quindi ciò significa rimuovere le parentesi quadre e le virgolette. Si può fare in diversi modi, ma serve il più veloce in quanto potrebbe richiedere molto tempo per elaborare molti dati. Fortunatamente, pandas ha un modo semplice per eseguire ciascuno di questi passaggi.
+
+```Python
+# Rimuove le parentesi quadre di apertura e chiusura
+df.Tags = df.Tags.str.strip("[']")
+# rimuove anche tutte le virgolette
+df.Tags = df.Tags.str.replace(" ', '", ",", regex = False)
+```
+
+Ogni tag diventa qualcosa come: `Business trip, Solo traveler, Single Room, Stayed 5 nights, Submitted from a mobile device`.
+
+Successivamente si manifesta un problema. Alcune recensioni, o righe, hanno 5 colonne, altre 3, altre 6. Questo è il risultato di come è stato creato l'insieme di dati ed è difficile da risolvere. Si vuole ottenere un conteggio della frequenza di ogni frase, ma le frasi sono in ordine diverso in ogni recensione, quindi il conteggio potrebbe risultare errato e un hotel potrebbe non vedersi assegnato un tag che meritava.
+
+Si utilizzerà invece l'ordine diverso a proprio vantaggio, perché ogni tag è composto da più parole ma è anche separato da una virgola! Il modo più semplice per farlo è creare 6 colonne temporanee con ogni tag inserito nella colonna corrispondente al suo ordine nel tag. Quindi si uniscono le 6 colonne in una grande colonna e si esegue il metodo `value_counts()` sulla colonna risultante (un abbozzo di codice è riportato dopo la tabella). Stampandolo, si vedrà che c'erano 2428 tag univoci. Ecco un piccolo esempio:
+
+| Tag | Count |
+| ------------------------------ | ------ |
+| Leisure trip | 417778 |
+| Submitted from a mobile device | 307640 |
+| Couple | 252294 |
+| Stayed 1 night | 193645 |
+| Stayed 2 nights | 133937 |
+| Solo traveler | 108545 |
+| Stayed 3 nights | 95821 |
+| Business trip | 82939 |
+| Group | 65392 |
+| Family with young children | 61015 |
+| Stayed 4 nights | 47817 |
+| Double Room | 35207 |
+| Standard Double Room | 32248 |
+| Superior Double Room | 31393 |
+| Family with older children | 26349 |
+| Deluxe Double Room | 24823 |
+| Double or Twin Room | 22393 |
+| Stayed 5 nights | 20845 |
+| Standard Double or Twin Room | 17483 |
+| Classic Double Room | 16989 |
+| Superior Double or Twin Room | 13570 |
+| 2 rooms | 12393 |
+
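+Un possibile abbozzo di questo approccio (ipotizzando che `df.Tags` sia già stato ripulito da parentesi e virgolette come mostrato sopra; il codice completo della lezione si trova nel notebook della soluzione citato più avanti):
+
+```python
+# Abbozzo: divide ogni elenco di tag in un massimo di 6 colonne temporanee,
+# le impila in un'unica Series e conta la frequenza di ogni frase
+tag_cols = df.Tags.str.split(",", n=5, expand=True)
+tag_counts = tag_cols.stack().str.strip().value_counts()
+print("Tag univoci trovati: " + str(tag_counts.size))
+print(tag_counts.head(20))
+```
+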
+Alcuni dei tag comuni come `Submitted from a mobile device` non sono di alcuna utilità, quindi potrebbe essere una cosa intelligente rimuoverli prima di contare l'occorrenza della frase, ma è un'operazione così veloce che si possono lasciare e ignorare.
+
+### Rimozione della durata dai tag di soggiorno
+
+La rimozione dei tag relativi alla durata del soggiorno è il passaggio 1: riduce leggermente il numero totale di tag da considerare (un abbozzo di filtro è riportato dopo la tabella). Notare che non si rimuovono dall'insieme di dati; si sceglie semplicemente di non considerarli più come valori da contare/mantenere nell'insieme di dati delle recensioni.
+
+| Length of stay | Count |
+| ---------------- | ------ |
+| Stayed 1 night | 193645 |
+| Stayed 2 nights | 133937 |
+| Stayed 3 nights | 95821 |
+| Stayed 4 nights | 47817 |
+| Stayed 5 nights | 20845 |
+| Stayed 6 nights | 9776 |
+| Stayed 7 nights | 7399 |
+| Stayed 8 nights | 2502 |
+| Stayed 9 nights | 1293 |
+| ... | ... |
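+
+Un filtro di questo tipo si può abbozzare partendo dal conteggio dei tag (qui si assume la Series `tag_counts` dell'abbozzo precedente):
+
+```python
+# Abbozzo: esclude dal conteggio i tag sulla durata del soggiorno (iniziano con "Stayed")
+useful_tags = tag_counts[~tag_counts.index.str.startswith("Stayed")]
+print(useful_tags.head(10))
+```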
+
+C'è una grande varietà di camere, suite, monolocali, appartamenti e così via. Significano tutti più o meno la stessa cosa e non sono rilevanti allo scopo, quindi si rimuovono dalla considerazione.
+
+| Type of room | Count |
+| ----------------------------- | ----- |
+| Double Room | 35207 |
+| Standard Double Room | 32248 |
+| Superior Double Room | 31393 |
+| Deluxe Double Room | 24823 |
+| Double or Twin Room | 22393 |
+| Standard Double or Twin Room | 17483 |
+| Classic Double Room | 16989 |
+| Superior Double or Twin Room | 13570 |
+
+Infine, e questo è delizioso (perché non ha richiesto molta elaborazione), rimarranno i seguenti tag *utili*:
+
+| Tag | Count |
+| --------------------------------------------- | ------ |
+| Leisure trip | 417778 |
+| Couple | 252294 |
+| Solo traveler | 108545 |
+| Business trip | 82939 |
+| Group (combined with Travellers with friends) | 67535 |
+| Family with young children | 61015 |
+| Family with older children | 26349 |
+| With a pet | 1405 |
+
+Si potrebbe obiettare che `Travellers with friends` (Viaggiatori con amici) è più o meno lo stesso di `Group` (Gruppo), e sarebbe giusto combinare i due come fatto sopra. Il codice per identificare i tag corretti è [il notebook Tags](../solution/1-notebook.ipynb).
+
+Il passaggio finale consiste nel creare nuove colonne per ciascuno di questi tag. Quindi, per ogni riga di recensione, se la colonna `Tag` corrisponde a una delle nuove colonne, aggiungere 1, in caso contrario aggiungere 0. Il risultato finale sarà un conteggio di quanti recensori hanno scelto questo hotel (in aggregato) per, ad esempio, affari o piacere, o per portare un animale domestico: un'informazione utile quando si consiglia un hotel.
+
+```python
+# Elabora Tags in nuove colonne
+# Il file Hotel_Reviews_Tags.py identifica i tag più importanti
+# Leisure trip, Couple, Solo traveler, Business trip, Group combinato con Travelers with friends,
+# Family with young children, Family with older children, With a pet
+df["Leisure_trip"] = df.Tags.apply(lambda tag: 1 if "Leisure trip" in tag else 0)
+df["Couple"] = df.Tags.apply(lambda tag: 1 if "Couple" in tag else 0)
+df["Solo_traveler"] = df.Tags.apply(lambda tag: 1 if "Solo traveler" in tag else 0)
+df["Business_trip"] = df.Tags.apply(lambda tag: 1 if "Business trip" in tag else 0)
+df["Group"] = df.Tags.apply(lambda tag: 1 if "Group" in tag or "Travelers with friends" in tag else 0)
+df["Family_with_young_children"] = df.Tags.apply(lambda tag: 1 if "Family with young children" in tag else 0)
+df["Family_with_older_children"] = df.Tags.apply(lambda tag: 1 if "Family with older children" in tag else 0)
+df["With_a_pet"] = df.Tags.apply(lambda tag: 1 if "With a pet" in tag else 0)
+
+```
+
+### Salvare il file
+
+Infine, salvare l'insieme di dati così com'è ora con un nuovo nome.
+
+```python
+df.drop(["Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "days_since_review", "Total_Number_of_Reviews_Reviewer_Has_Given"], axis = 1, inplace=True)
+
+# Salvataggio del nuovo file dati con le colonne calcolate
+print("Saving results to Hotel_Reviews_Filtered.csv")
+df.to_csv(r'../data/Hotel_Reviews_Filtered.csv', index = False)
+```
+
+## Operazioni di Analisi del Sentiment
+
+In questa sezione finale, si applicherà l'analisi del sentiment alle colonne di recensione e si salveranno i risultati in un insieme di dati.
+
+## Esercizio: caricare e salvare i dati filtrati
+
+Tenere presente che ora si sta caricando l'insieme di dati filtrato che è stato salvato nella sezione precedente, **non** quello originale.
+
+```python
+import time
+import pandas as pd
+import nltk as nltk
+from nltk.corpus import stopwords
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
+nltk.download('vader_lexicon')
+
+# Carica le recensioni di hotel filtrate dal CSV
+df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')
+
+# Il proprio codice andrà aggiunto qui
+
+
+# Infine ricordarsi di salvare le recensioni di hotel con i nuovi dati NLP aggiunti
+print("Saving results to Hotel_Reviews_NLP.csv")
+df.to_csv(r'../data/Hotel_Reviews_NLP.csv', index = False)
+```
+
+### Rimozione delle stop word
+
+Se si dovesse eseguire l'analisi del sentiment sulle colonne delle recensioni negative e positive, potrebbe volerci molto tempo. Testato su un potente laptop di prova con CPU veloce, ci sono voluti 12 - 14 minuti a seconda della libreria di sentiment utilizzata. È un tempo (relativamente) lungo, quindi vale la pena indagare se può essere accelerato.
+
+Il primo passo è rimuovere le stop word, o parole inglesi comuni che non cambiano il sentiment di una frase. Rimuovendole, l'analisi del sentiment dovrebbe essere eseguita più velocemente, ma non essere meno accurata (poiché le stop word non influiscono sul sentiment, ma rallentano l'analisi).
+
+La recensione negativa più lunga è stata di 395 parole, ma dopo aver rimosso le stop word, è di 195 parole.
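+
+Un controllo di questo tipo si può abbozzare così (si assume `df` caricato dal CSV filtrato, come nel blocco di codice più avanti):
+
+```python
+# Abbozzo: lunghezza in parole della recensione negativa più lunga
+print(df.Negative_Review.str.split().str.len().max())
+```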
+
+Anche la rimozione delle stop word è un'operazione rapida, poiché la rimozione di esse da 2 colonne di recensione su 515.000 righe ha richiesto 3,3 secondi sul dispositivo di test. Potrebbe volerci un po' più o meno tempo a seconda della velocità della CPU del proprio dispositivo, della RAM, del fatto che si abbia o meno un SSD e alcuni altri fattori. La relativa brevità dell'operazione significa che se migliora il tempo di analisi del sentiment, allora vale la pena farlo.
+
+```python
+from nltk.corpus import stopwords
+
+# Carica le recensioni di hotel da CSV
+df = pd.read_csv("../../data/Hotel_Reviews_Filtered.csv")
+
+# Rimuove le stop word - potrebbe essere lento quando c'è molto testo!
+# Ryan Han (ryanxjhan su Kaggle) ha un gran post riguardo al misurare le prestazioni di diversi approcci per la rimozione delle stop word
+# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # si usa l'approccio raccomandato da Ryan
+start = time.time()
+cache = set(stopwords.words("english"))
+def remove_stopwords(review):
+ text = " ".join([word for word in review.split() if word not in cache])
+ return text
+
+# Rimuove le stop word da entrambe le colonne
+df.Negative_Review = df.Negative_Review.apply(remove_stopwords)
+df.Positive_Review = df.Positive_Review.apply(remove_stopwords)
+```
+
+### Esecuzione dell'analisi del sentiment
+
+Ora si dovrebbe calcolare l'analisi del sentiment per le colonne di recensioni negative e positive e memorizzare il risultato in 2 nuove colonne. Il test del sentiment consisterà nel confrontarlo con il punteggio del recensore per la stessa recensione. Ad esempio, se l'analizzatore ritiene che la recensione negativa abbia un sentiment pari a 1 (estremamente positivo) e che anche la recensione positiva abbia un sentiment pari a 1, ma il recensore ha assegnato all'hotel il punteggio più basso possibile, allora o il testo della recensione non corrisponde al punteggio, oppure l'analizzatore del sentiment non è stato in grado di riconoscerlo correttamente. Ci si dovrebbe aspettare che alcuni punteggi di sentiment siano completamente sbagliati, e spesso ciò sarà spiegabile: ad esempio la recensione potrebbe essere estremamente sarcastica, come "Certo che mi è piaciuto dormire in una stanza senza riscaldamento", e l'analizzatore del sentiment potrebbe ritenerla positiva, anche se un lettore umano avrebbe rilevato il sarcasmo.
+
+NLTK fornisce diversi analizzatori di sentiment con cui fare pratica; li si può sostituire l'uno con l'altro e vedere se il sentiment risulta più o meno accurato. Qui viene utilizzata l'analisi del sentiment di VADER.
+
+> Hutto, CJ & Gilbert, EE (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Ottava Conferenza Internazionale su Weblog e Social Media (ICWSM-14). Ann Arbor, MI, giugno 2014.
+
+```python
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
+
+# Crea l'analizzatore di sentiment vader (ce ne sono altri in NLTK che si possono provare)
+vader_sentiment = SentimentIntensityAnalyzer()
+# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+
+# Ci sono tre possibilità di input per un recensore:
+# Potrebbe essere "No Negative", nel qual caso ritorna 0
+# Potrebbe essere "No Positive", nel qual caso ritorna 0
+# Potrebbe essere una recensione, nel qual caso calcola il sentiment
+def calc_sentiment(review):
+ if review == "No Negative" or review == "No Positive":
+ return 0
+ return vader_sentiment.polarity_scores(review)["compound"]
+```
+
+Più avanti nel programma, quando si è pronti per calcolare il sentiment, lo si può applicare a ciascuna recensione come segue:
+
+```python
+# Aggiunge una colonna di sentiment negativa e positiva
+print("Calculating sentiment columns for both positive and negative reviews")
+start = time.time()
+df["Negative_Sentiment"] = df.Negative_Review.apply(calc_sentiment)
+df["Positive_Sentiment"] = df.Positive_Review.apply(calc_sentiment)
+end = time.time()
+print("Calculating sentiment took " + str(round(end - start, 2)) + " seconds")
+```
+
+Questo richiede circa 120 secondi sul computer utilizzato, ma varierà per ciascun computer. Se si vogliono stampare i risultati e vedere se il sentiment corrisponde alla recensione:
+
+```python
+df = df.sort_values(by=["Negative_Sentiment"], ascending=True)
+print(df[["Negative_Review", "Negative_Sentiment"]])
+df = df.sort_values(by=["Positive_Sentiment"], ascending=True)
+print(df[["Positive_Review", "Positive_Sentiment"]])
+```
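+
+Un'altra verifica rapida, solo come abbozzo, è osservare la correlazione tra i punteggi di sentiment calcolati e il punteggio del recensore:
+
+```python
+# Abbozzo: correlazione tra sentiment calcolato e punteggio del recensore
+print(df[["Reviewer_Score", "Negative_Sentiment", "Positive_Sentiment"]].corr())
+```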
+
+L'ultima cosa da fare con il file prima di utilizzarlo nella sfida è salvarlo! Si dovrebbe anche considerare di riordinare tutte le nuove colonne in modo che sia facile lavorarci (per un essere umano è un cambiamento estetico).
+
+```python
+# Riordina le colonne (è una questione estetica, ma facilita l'esplorazione successiva dei dati)
+df = df.reindex(["Hotel_Name", "Hotel_Address", "Total_Number_of_Reviews", "Average_Score", "Reviewer_Score", "Negative_Sentiment", "Positive_Sentiment", "Reviewer_Nationality", "Leisure_trip", "Couple", "Solo_traveler", "Business_trip", "Group", "Family_with_young_children", "Family_with_older_children", "With_a_pet", "Negative_Review", "Positive_Review"], axis=1)
+
+print("Saving results to Hotel_Reviews_NLP.csv")
+df.to_csv(r"../data/Hotel_Reviews_NLP.csv", index = False)
+```
+
+Si dovrebbe eseguire l'intero codice per [il notebook di analisi](../solution/3-notebook.ipynb) (dopo aver eseguito [il notebook di filtraggio](../solution/1-notebook.ipynb) per generare il file Hotel_Reviews_Filtered.csv).
+
+Per riepilogare, i passaggi sono:
+
+1. Il file dell'insieme di dati originale **Hotel_Reviews.csv** è stato esplorato nella lezione precedente con [il notebook explorer](../../4-Hotel-Reviews-1/solution/notebook.ipynb)
+2. Hotel_Reviews.csv viene filtrato [dal notebook di filtraggio](../solution/1-notebook.ipynb) risultante in **Hotel_Reviews_Filtered.csv**
+3. Hotel_Reviews_Filtered.csv viene elaborato dal [notebook di analisi del sentiment](../solution/3-notebook.ipynb) risultante in **Hotel_Reviews_NLP.csv**
+4. Usare Hotel_Reviews_NLP.csv nella Sfida NLP di seguito
+
+### Conclusione
+
+Quando si è iniziato, si disponeva di un insieme di dati con colonne e dati, ma non tutto poteva essere verificato o utilizzato. Si sono esplorati i dati, si è filtrato ciò che non serviva, si sono convertiti i tag in qualcosa di utile, si sono calcolate le proprie medie, si sono aggiunte alcune colonne di sentiment e, si spera, si è imparato qualcosa di interessante sull'elaborazione del testo naturale.
+
+## [Quiz post-lezione](https://white-water-09ec41f0f.azurestaticapps.net/quiz/40/)
+
+## Sfida
+
+Ora che si è analizzato il proprio insieme di dati per il sentiment, vedere se si possono usare le strategie apprese in questo programma di studi (clustering, forse?) per individuare schemi ricorrenti legati al sentiment.
+
+## Revisione e Auto Apprendimento
+
+Seguire [questo modulo di apprendimento](https://docs.microsoft.com/en-us/learn/modules/classify-user-feedback-with-the-text-analytics-api/?WT.mc_id=academic-15963-cxa) per saperne di più e utilizzare diversi strumenti per esplorare il sentiment nel testo.
+
+## Compito
+
+[Provare un insieme di dati diverso](assignment.it.md)
diff --git a/6-NLP/5-Hotel-Reviews-2/translations/assignment.it.md b/6-NLP/5-Hotel-Reviews-2/translations/assignment.it.md
new file mode 100644
index 0000000000..dae727b78f
--- /dev/null
+++ b/6-NLP/5-Hotel-Reviews-2/translations/assignment.it.md
@@ -0,0 +1,11 @@
+# Provare un insieme di dati diverso
+
+## Istruzioni
+
+Ora che si è imparato a usare NLTK per assegnare un sentiment al testo, provare un insieme di dati diverso. Probabilmente sarà necessaria qualche elaborazione dei dati, quindi creare un notebook e documentare il proprio processo di pensiero. Cosa si è scoperto?
+
+## Rubrica
+
+| Criteri | Ottimo | Adeguato | Necessita miglioramento |
+| -------- | ----------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | ---------------------- |
+| | Vengono presentati un notebook completo e un insieme di dati con celle ben documentate che spiegano come viene assegnato il sentiment | Il notebook manca di buone spiegazioni | Il notebook è difettoso |
diff --git a/6-NLP/translations/README.it.md b/6-NLP/translations/README.it.md
new file mode 100644
index 0000000000..13eb3226eb
--- /dev/null
+++ b/6-NLP/translations/README.it.md
@@ -0,0 +1,24 @@
+# Iniziare con l'elaborazione del linguaggio naturale
+
+L'elaborazione del linguaggio naturale, NLP, è un sottocampo dell'intelligenza artificiale. L'intero campo è volto ad aiutare le macchine a comprendere ed elaborare il linguaggio umano. Questo può quindi essere utilizzato per eseguire attività come il controllo ortografico o la traduzione automatica.
+
+## Argomento regionale: lingue e letterature europee e hotel romantici d'Europa ❤️
+
+In questa sezione del programma di studi, verrà presentato uno degli usi più diffusi di machine learning: l'elaborazione del linguaggio naturale (NLP). Derivata dalla linguistica computazionale, questa categoria di intelligenza artificiale fa da ponte tra esseri umani e macchine tramite la comunicazione vocale o testuale.
+
+In queste lezioni si impareranno le basi di NLP costruendo piccoli bot conversazionali per imparare come machine learning aiuti a rendere queste conversazioni sempre più "intelligenti". Si viaggerà indietro nel tempo, chiacchierando con Elizabeth Bennett e Mr. Darcy dal romanzo classico di Jane Austen, **Orgoglio e pregiudizio**, pubblicato nel 1813. Quindi, si approfondiranno le proprie conoscenze imparando l'analisi del sentiment tramite le recensioni di hotel in Europa.
+
+![Libro e tè di Orgoglio e Pregiudizio](../images/p&p.jpg)
+> Foto di Elaine Howlin su Unsplash
+
+## Lezioni
+
+1. [Introduzione all'elaborazione del linguaggio naturale](../1-Introduction-to-NLP/translations/README.it.md)
+2. [Compiti e tecniche comuni di NLP](../2-Tasks/translations/README.it.md)
+3. [Traduzione e analisi del sentiment con machine learning](../3-Translation-Sentiment/translations/README.it.md)
+4. [Preparazione dei dati](../4-Hotel-Reviews-1/translations/README.it.md)
+5. [NLTK per l'analisi del sentiment](../5-Hotel-Reviews-2/translations/README.it.md)
+
+## Crediti
+
+Queste lezioni sull'elaborazione del linguaggio naturale sono state scritte con ☕ da [Stephen Howell](https://twitter.com/Howell_MSFT)
diff --git a/6-NLP/translations/README.ru.md b/6-NLP/translations/README.ru.md
new file mode 100644
index 0000000000..6743a968a3
--- /dev/null
+++ b/6-NLP/translations/README.ru.md
@@ -0,0 +1,24 @@
+# Начало работы с обработкой естественного языка
+
+Обработка естественного языка, NLP, - это область искусственного интеллекта. Вся эта область направлена на то, чтобы помочь машинам понимать и обрабатывать человеческий язык. Затем это можно использовать для выполнения таких задач, как проверка орфографии или машинный перевод.
+
+## Региональная тема: европейские языки и литература и романтические отели Европы ❤️
+
+В этом разделе учебной программы вы познакомитесь с одним из наиболее распространенных способов использования машинного обучения: обработкой естественного языка (NLP). Эта категория искусственного интеллекта, выведенная из компьютерной лингвистики, является мостом между людьми и машинами посредством голосовой или текстовой коммуникации.
+
+На этих уроках мы изучим основы NLP, создав небольших диалоговых ботов, чтобы узнать, как машинное обучение помогает сделать эти разговоры все более и более «умными». Вы отправитесь в прошлое, болтая с Элизабет Беннетт и мистером Дарси из классического романа Джейн Остин **Гордость и предубеждение**, опубликованного в 1813 году. Затем вы расширите свои знания, узнав об анализе настроений из отзывов об отелях в Европе.
+
+![Книга о гордости и предубеждениях и чай](../images/p&p.jpg)
+> Фото Элейн Хоулин на Unsplash
+
+## Уроки
+
+1. [Введение в обработку естественного языка](../1-Introduction-to-NLP/README.md)
+2. [Общие задачи и техники NLP](../2-Tasks/README.md)
+3. [Перевод и анализ тональности с помощью машинного обучения](../3-Translation-Sentiment/README.md)
+4. [Подготовка данных](../4-Hotel-Reviews-1/README.md)
+5. [NLTK для анализа настроений](../5-Hotel-Reviews-2/README.md)
+
+## Благодарности
+
+Эти уроки обработки естественного языка были написаны с помощью ☕ [Стивен Хауэлл](https://twitter.com/Howell_MSFT)
\ No newline at end of file
diff --git a/6-NLP/translations/README.zh-cn.md b/6-NLP/translations/README.zh-cn.md
new file mode 100644
index 0000000000..db08bd0853
--- /dev/null
+++ b/6-NLP/translations/README.zh-cn.md
@@ -0,0 +1,24 @@
+# 自然语言处理入门
+
+自然语言处理 (NLP) 是人工智能的一个子领域,主要研究如何让机器理解和处理人类语言,并用它来执行拼写检查或机器翻译等任务。
+
+## 本节主题:欧洲语言文学和欧洲浪漫酒店 ❤️
+
+在这部分课程中,您将了解机器学习最广泛的用途之一:自然语言处理 (NLP)。源自计算语言学,这一类人工智能会通过语音或文本与人类交流,建立连接人与机器的桥梁。
+
+课程中,我们将通过构建小型对话机器人来学习 NLP 的基础知识,以了解机器学习是如何使这个机器人越来越“智能”。您将穿越回 1813 年,与简·奥斯汀的经典小说 **傲慢与偏见** 中的 Elizabeth Bennett 和 Mr. Darcy 聊天(该小说于 1813 年出版)。然后,您将通过欧洲的酒店评论来进一步学习情感分析。
+
+![傲慢与偏见之书,和茶](../images/p&p.jpg)
+> 由 Elaine Howlin 拍摄, 来自 Unsplash
+
+## 课程
+
+1. [自然语言处理简介](../1-Introduction-to-NLP/README.md)
+2. [NLP 常见任务与技巧](../2-Tasks/README.md)
+3. [机器学习翻译和情感分析](../3-Translation-Sentiment/README.md)
+4. [准备数据](../4-Hotel-Reviews-1/README.md)
+5. [用于情感分析的工具:NLTK](../5-Hotel-Reviews-2/README.md)
+
+## 作者
+
+这些自然语言处理课程由 [Stephen Howell](https://twitter.com/Howell_MSFT) 用 ☕ 编写
\ No newline at end of file
diff --git a/7-TimeSeries/1-Introduction/README.md b/7-TimeSeries/1-Introduction/README.md
index ae2a69e1ca..1d75763f0b 100644
--- a/7-TimeSeries/1-Introduction/README.md
+++ b/7-TimeSeries/1-Introduction/README.md
@@ -10,7 +10,7 @@ In this lesson and the following one, you will learn a bit about time series for
> 🎥 Click the image above for a video about time series forecasting
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/41/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/41/)
It's a useful and interesting field with real value to business, given its direct application to problems of pricing, inventory, and supply chain issues. While deep learning techniques have started to be used to gain more insights to better predict future performance, time series forecasting remains a field greatly informed by classic ML techniques.
@@ -174,7 +174,7 @@ In the next lesson, you will create an ARIMA model to create some forecasts.
Make a list of all the industries and areas of inquiry you can think of that would benefit from time series forecasting. Can you think of an application of these techniques in the arts? In Econometrics? Ecology? Retail? Industry? Finance? Where else?
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/42/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/42/)
## Review & Self Study
diff --git a/7-TimeSeries/2-ARIMA/README.md b/7-TimeSeries/2-ARIMA/README.md
index d54a781be2..19da562216 100644
--- a/7-TimeSeries/2-ARIMA/README.md
+++ b/7-TimeSeries/2-ARIMA/README.md
@@ -6,7 +6,7 @@ In the previous lesson, you learned a bit about time series forecasting and load
> 🎥 Click the image above for a video: A brief introduction to ARIMA models. The example is done in R, but the concepts are universal.
-## [Pre-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/43/)
+## [Pre-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/43/)
## Introduction
@@ -50,13 +50,13 @@ Open the _/working_ folder in this lesson and find the _notebook.ipynb_ file.
import pandas as pd
import datetime as dt
import math
-
+
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
from common.utils import load_data, mape
from IPython.display import Image
-
+
%matplotlib inline
pd.options.display.float_format = '{:,.2f}'.format
np.set_printoptions(precision=2)
@@ -83,16 +83,16 @@ Open the _/working_ folder in this lesson and find the _notebook.ipynb_ file.
### Create training and testing datasets
-Now your data is loaded, so you can separate it into train and test sets. You'll train your model on the train set. As usual, after the model has finished training, you'll evaluate its accuracy using the test set. You need to ensure that the test set covers a later period in time from the training set to ensure that the model does not gain information from future time periods.
+Now your data is loaded, so you can separate it into train and test sets. You'll train your model on the train set. As usual, after the model has finished training, you'll evaluate its accuracy using the test set. You need to ensure that the test set covers a later period in time from the training set to ensure that the model does not gain information from future time periods.
1. Allocate a two-month period from September 1 to October 31, 2014 to the training set. The test set will include the two-month period of November 1 to December 31, 2014:
```python
train_start_dt = '2014-11-01 00:00:00'
- test_start_dt = '2014-12-30 00:00:00'
+ test_start_dt = '2014-12-30 00:00:00'
```
- Since this data reflects the daily consumption of energy, there is a strong seasonal pattern, but the consumption is most similar to the consumption in more recent days.
+ Since this data reflects the daily consumption of energy, there is a strong seasonal pattern, but the consumption is most similar to the consumption in more recent days.
1. Visualize the differences:
@@ -120,11 +120,11 @@ Now, you need to prepare the data for training by performing filtering and scali
```python
train = energy.copy()[(energy.index >= train_start_dt) & (energy.index < test_start_dt)][['load']]
test = energy.copy()[energy.index >= test_start_dt][['load']]
-
+
print('Training data shape: ', train.shape)
print('Test data shape: ', test.shape)
```
-
+
You can see the shape of the data:
```output
@@ -189,17 +189,17 @@ Now you need to follow several steps
print('Forecasting horizon:', HORIZON, 'hours')
```
- Selecting the best values for an ARIMA model's parameters can be challenging as it's somewhat subjective and time intensive. You might consider using an `auto_arima()` function from the [`pyramid` library](https://alkaline-ml.com/pmdarima/0.9.0/modules/generated/pyramid.arima.auto_arima.html),
+ Selecting the best values for an ARIMA model's parameters can be challenging as it's somewhat subjective and time intensive. You might consider using an `auto_arima()` function from the [`pyramid` library](https://alkaline-ml.com/pmdarima/0.9.0/modules/generated/pyramid.arima.auto_arima.html),
1. For now try some manual selections to find a good model.
```python
order = (4, 1, 0)
seasonal_order = (1, 1, 0, 24)
-
+
model = SARIMAX(endog=train, order=order, seasonal_order=seasonal_order)
results = model.fit()
-
+
print(results.summary())
```
@@ -223,10 +223,10 @@ Walk-forward validation is the gold standard of time series model evaluation and
```python
test_shifted = test.copy()
-
- for t in range(1, HORIZON):
+
+ for t in range(1, HORIZON+1):
test_shifted['load+'+str(t)] = test_shifted['load'].shift(-t, freq='H')
-
+
test_shifted = test_shifted.dropna(how='any')
test_shifted.head(5)
```
@@ -246,18 +246,18 @@ Walk-forward validation is the gold standard of time series model evaluation and
```python
%%time
training_window = 720 # dedicate 30 days (720 hours) for training
-
+
train_ts = train['load']
test_ts = test_shifted
-
+
history = [x for x in train_ts]
history = history[(-training_window):]
-
+
predictions = list()
-
+
order = (2, 1, 0)
seasonal_order = (1, 1, 0, 24)
-
+
for t in range(test_ts.shape[0]):
model = SARIMAX(endog=history, order=order, seasonal_order=seasonal_order)
model_fit = model.fit()
@@ -276,10 +276,10 @@ Walk-forward validation is the gold standard of time series model evaluation and
```output
2014-12-30 00:00:00
1 : predicted = [0.32 0.29 0.28] expected = [0.32945389435989236, 0.2900626678603402, 0.2739480752014323]
-
+
2014-12-30 01:00:00
2 : predicted = [0.3 0.29 0.3 ] expected = [0.2900626678603402, 0.2739480752014323, 0.26812891674127126]
-
+
2014-12-30 02:00:00
3 : predicted = [0.27 0.28 0.32] expected = [0.2739480752014323, 0.26812891674127126, 0.3025962399283795]
```
@@ -295,7 +295,7 @@ Walk-forward validation is the gold standard of time series model evaluation and
eval_df.head()
```
- ```output
+ Output
    |     | timestamp  |          | h   | prediction | actual   |
| --- | ---------- | --------- | --- | ---------- | -------- |
| 0 | 2014-12-30 | 00:00:00 | t+1 | 3,008.74 | 3,023.00 |
@@ -303,7 +303,7 @@ Walk-forward validation is the gold standard of time series model evaluation and
| 2 | 2014-12-30 | 02:00:00 | t+1 | 2,900.17 | 2,899.00 |
| 3 | 2014-12-30 | 03:00:00 | t+1 | 2,917.69 | 2,886.00 |
| 4 | 2014-12-30 | 04:00:00 | t+1 | 2,946.99 | 2,963.00 |
- ```
+
    Observe the hourly prediction compared to the actual load. How accurate is it?
@@ -311,10 +311,10 @@ Walk-forward validation is the gold standard of time series model evaluation and
Check the accuracy of your model by testing its mean absolute percentage error (MAPE) over all the predictions.
-> **🧮 Show me the math**
+> **🧮 Show me the math**
>
> ![MAPE](images/mape.png)
->
+>
> [MAPE](https://www.linkedin.com/pulse/what-mape-mad-msd-time-series-allameh-statistics/) is used to show prediction accuracy as a ratio defined by the above formula. The difference between actual<sub>t</sub> and predicted<sub>t</sub> is divided by actual<sub>t</sub>. "The absolute value in this calculation is summed for every forecasted point in time and divided by the number of fitted points n." [wikipedia](https://wikipedia.org/wiki/Mean_absolute_percentage_error)
1. Express the equation in code:
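    One way this could be written (a sketch; the notebook's exact code may differ), assuming `eval_df` from the previous step with its `prediction` and `actual` columns:

    ```python
    import numpy as np

    def mape(predictions, actuals):
        """Mean absolute percentage error, returned as a fraction."""
        predictions, actuals = np.array(predictions), np.array(actuals)
        return np.mean(np.abs(predictions - actuals) / actuals)

    print('MAPE: {:.2f}%'.format(mape(eval_df['prediction'], eval_df['actual']) * 100))
    ```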
@@ -351,13 +351,13 @@ Check the accuracy of your model by testing its mean absolute percentage error (
if(HORIZON == 1):
## Plotting single step forecast
eval_df.plot(x='timestamp', y=['actual', 'prediction'], style=['r', 'b'], figsize=(15, 8))
-
+
else:
## Plotting multi step forecast
plot_df = eval_df[(eval_df.h=='t+1')][['timestamp', 'actual']]
for t in range(1, HORIZON+1):
plot_df['t+'+str(t)] = eval_df[(eval_df.h=='t+'+str(t))]['prediction'].values
-
+
fig = plt.figure(figsize=(15, 8))
ax = plt.plot(plot_df['timestamp'], plot_df['actual'], color='red', linewidth=4.0)
ax = fig.add_subplot(111)
@@ -365,9 +365,9 @@ Check the accuracy of your model by testing its mean absolute percentage error (
x = plot_df['timestamp'][(t-1):]
y = plot_df['t+'+str(t)][0:len(x)]
ax.plot(x, y, color='blue', linewidth=4*math.pow(.9,t), alpha=math.pow(0.8,t))
-
+
ax.legend(loc='best')
-
+
plt.xlabel('timestamp', fontsize=12)
plt.ylabel('load', fontsize=12)
plt.show()
@@ -383,12 +383,12 @@ Check the accuracy of your model by testing its mean absolute percentage error (
Dig into the ways to test the accuracy of a Time Series model. We touch on MAPE in this lesson, but are there other methods you could use? Research them and annotate them. A helpful document can be found [here](https://otexts.com/fpp2/accuracy.html).
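Two commonly used alternatives, mean absolute error (MAE) and root mean squared error (RMSE), are sketched below with scikit-learn (an illustration that assumes the `eval_df` DataFrame built earlier in the lesson):

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# MAE reports errors in the original units (load); RMSE penalizes large errors more.
mae = mean_absolute_error(eval_df['actual'], eval_df['prediction'])
rmse = np.sqrt(mean_squared_error(eval_df['actual'], eval_df['prediction']))
print('MAE: ', mae)
print('RMSE:', rmse)
```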
-## [Post-lecture quiz](https://jolly-sea-0a877260f.azurestaticapps.net/quiz/44/)
+## [Post-lecture quiz](https://white-water-09ec41f0f.azurestaticapps.net/quiz/44/)
## Review & Self Study
This lesson touches on only the basics of Time Series Forecasting with ARIMA. Take some time to deepen your knowledge by digging into [this repository](https://microsoft.github.io/forecasting/) and its various model types to learn other ways to build Time Series models.
-## Assignment
+## Assignment
[A new ARIMA model](assignment.md)
diff --git a/7-TimeSeries/2-ARIMA/solution/notebook.ipynb b/7-TimeSeries/2-ARIMA/solution/notebook.ipynb
index 62ebccaafe..1a42ab1555 100644
--- a/7-TimeSeries/2-ARIMA/solution/notebook.ipynb
+++ b/7-TimeSeries/2-ARIMA/solution/notebook.ipynb
@@ -1,6 +1,7 @@
{
"cells": [
{
+ "cell_type": "markdown",
"source": [
"# Time series forecasting with ARIMA\n",
"\n",
@@ -14,13 +15,26 @@
"\n",
"1Tao Hong, Pierre Pinson, Shu Fan, Hamidreza Zareipour, Alberto Troccoli and Rob J. Hyndman, \"Probabilistic energy forecasting: Global Energy Forecasting Competition 2014 and beyond\", International Journal of Forecasting, vol.32, no.3, pp 896-913, July-September, 2016."
],
+ "metadata": {}
+ },
+ {
"cell_type": "markdown",
+ "source": [
+ "## Install Dependencies\n",
+    "Get started by installing the required dependencies. The following library versions are known to work with this solution:\n",
+ "\n",
+ "* `statsmodels == 0.12.2`\n",
+ "* `matplotlib == 3.4.2`\n",
+ "* `scikit-learn == 0.24.2`\n"
+ ],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "source": [
+ "!pip install statsmodels"
+ ],
"outputs": [
{
"output_type": "stream",
@@ -40,15 +54,11 @@
]
}
],
- "source": [
- "pip install statsmodels"
- ]
+ "metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
- "metadata": {},
- "outputs": [],
"source": [
"import os\n",
"import warnings\n",
@@ -68,12 +78,17 @@
"pd.options.display.float_format = '{:,.2f}'.format\n",
"np.set_printoptions(precision=2)\n",
"warnings.filterwarnings(\"ignore\") # specify to ignore warning messages\n"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {},
+ "source": [
+ "energy = load_data('./data')[['load']]\n",
+ "energy.head(10)"
+ ],
"outputs": [
{
"output_type": "execute_result",
@@ -91,116 +106,188 @@
"2012-01-01 08:00:00 2,916.00\n",
"2012-01-01 09:00:00 3,105.00"
],
- "text/html": "