masakhane-io · jaderabbit · Jul 21, 2020 · Jul 21, 2020 · Jul 21, 2020 · Jul 21, 2020
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,5 @@
+opustools-pkg
+pandas
+p_tqdm
+rapidfuzz
+joeynmt
diff --git a/starter_notebook.ipynb b/starter_notebook.ipynb
@@ -47,7 +47,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -62,7 +62,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "collapsed": true,
+    "id": "gA75Fs9ys8Y9",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Install python packages\n",
+    "! pip install opustools-pkg\n",
+    "! pip install pandas\n",
+    "! pip install p_tqdm\n",
+    "! pip install rapidfuzz\n",
+    "! pip install joeynmt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -91,7 +111,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -105,22 +125,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "collapsed": true,
-    "id": "gA75Fs9ys8Y9"
-   },
-   "outputs": [],
-   "source": [
-    "# Install opus-tools\n",
-    "! pip install opustools-pkg"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -138,7 +143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -162,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -185,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -240,7 +245,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -264,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -275,15 +280,11 @@
    "source": [
     "# Use rapidfuzz to remove \"almost duplicate\" sentences in the\n",
     "# test and training sets.\n",
-    "! pip install fuzzywuzzy\n",
-    "! pip install python-Levenshtein\n",
     "import time\n",
-    "from fuzzywuzzy import process\n",
+    "from rapidfuzz import process\n",
     "import numpy as np\n",
     "from os import cpu_count\n",
     "from functools import partial\n",
-    "from multiprocessing import Pool\n",
-    "\n",
     "\n",
     "# reset the index of the training set after previous filtering\n",
     "df_pp.reset_index(drop=False, inplace=True)\n",
@@ -309,14 +310,9 @@
    },
    "outputs": [],
    "source": [
-    "start_time = time.time()\n",
     "### iterating over pandas dataframe rows is not recommended, let's use multiprocessing to apply the function\n",
-    "\n",
-    "with Pool(cpu_count()-1) as pool:\n",
-    "    scores = pool.map(partial(fuzzfilter, candidates=list(en_test_sents), pad=5), df_pp['source_sentence'])\n",
-    "hours, rem = divmod(time.time() - start_time, 3600)\n",
-    "minutes, seconds = divmod(rem, 60)\n",
-    "print(\"done in {}h:{}min:{}seconds\".format(hours, minutes, seconds))\n",
+    "from p_tqdm import p_map\n",
+    "scores = p_map(partial(fuzzfilter, candidates=list(en_test_sents), pad=5), df_pp['source_sentence'], num_cpus=cpu_count())\n",
     "\n",
     "# Filter out \"almost overlapping samples\"\n",
     "df_pp = df_pp.assign(scores=scores)\n",
@@ -325,7 +321,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -390,7 +386,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -422,7 +418,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -480,7 +476,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -520,7 +516,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -640,7 +636,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -656,7 +652,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -671,7 +667,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -686,7 +682,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": null,
    "metadata": {
     "colab": {},
     "colab_type": "code",
@@ -723,9 +719,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.8"
+   "version": "3.7.5-final"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}