Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/cleanup fuzzyreplace #123

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
opustools-pkg
pandas
p_tqdm
rapidfuzz
joeynmt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jaderabbit and @juliakreutzer is this the right way to install joeynmt?

like pip install joeynmt

I remember that the last time I tried it, it gave me headaches and I was forced to install it from GitHub...

using pip install git+https://github.com/joeynmt/joeynmt.git

Also, it is missing the version...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@espoirMur I added it to PyPI a few weeks ago, so this should be fine now - I still need to update the folder locations further down so that we don't need to install joeynmt by cloning it.

Yes, I should pin versions for all the requirements - thanks for the push!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is on PyPI, that is cool...

90 changes: 43 additions & 47 deletions starter_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -62,7 +62,27 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"collapsed": true,
"id": "gA75Fs9ys8Y9",
"tags": []
},
"outputs": [],
"source": [
"# Install python packages\n",
"! pip install opustools-pkg\n",
"! pip install pandas\n",
"! pip install p_tqdm\n",
"! pip install rapidfuzz\n",
"! pip install joeynmt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -91,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -105,22 +125,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"collapsed": true,
"id": "gA75Fs9ys8Y9"
},
"outputs": [],
"source": [
"# Install opus-tools\n",
"! pip install opustools-pkg"
]
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -138,7 +143,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -162,7 +167,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -185,7 +190,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -240,7 +245,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -264,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -275,15 +280,11 @@
"source": [
"# Install fuzzy wuzzy to remove \"almost duplicate\" sentences in the\n",
"# test and training sets.\n",
"! pip install fuzzywuzzy\n",
"! pip install python-Levenshtein\n",
"import time\n",
"from fuzzywuzzy import process\n",
"from rapidfuzz import process\n",
"import numpy as np\n",
"from os import cpu_count\n",
"from functools import partial\n",
"from multiprocessing import Pool\n",
"\n",
"\n",
"# reset the index of the training set after previous filtering\n",
"df_pp.reset_index(drop=False, inplace=True)\n",
Expand All @@ -309,14 +310,9 @@
},
"outputs": [],
"source": [
"start_time = time.time()\n",
"### iterating over pandas dataframe rows is not recomended, let use multi processing to apply the function\n",
"\n",
"with Pool(cpu_count()-1) as pool:\n",
" scores = pool.map(partial(fuzzfilter, candidates=list(en_test_sents), pad=5), df_pp['source_sentence'])\n",
"hours, rem = divmod(time.time() - start_time, 3600)\n",
"minutes, seconds = divmod(rem, 60)\n",
"print(\"done in {}h:{}min:{}seconds\".format(hours, minutes, seconds))\n",
"from p_tqdm import p_map\n",
"scores = p_map(partial(fuzzfilter, candidates=list(en_test_sents), pad=5), df_pp['source_sentence'], num_cpus=cpu_count())\n",
"\n",
"# Filter out \"almost overlapping samples\"\n",
"df_pp = df_pp.assign(scores=scores)\n",
Expand All @@ -325,7 +321,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -390,7 +386,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -422,7 +418,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -480,7 +476,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -520,7 +516,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -640,7 +636,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -656,7 +652,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -671,7 +667,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand All @@ -686,7 +682,7 @@
},
{
"cell_type": "code",
"execution_count": 0,
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
Expand Down Expand Up @@ -723,9 +719,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.8"
"version": "3.7.5-final"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
}