Skip to content

Commit

Permalink
Improve fuzzymatch recall, add name_A and name_B so names show when fuzzymatching
Browse files Browse the repository at this point in the history
  • Loading branch information
araistrick committed Oct 7, 2024
1 parent e7d7a18 commit fb81968
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 14 deletions.
24 changes: 22 additions & 2 deletions tests/integration/integration_test_parse_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,21 +254,39 @@ def find_run(base_path: str, run: str) -> Path:
raise FileNotFoundError(f"Could not find match for {run=} in {base_path=}")


def fuzzy_merge(dfA, dfB, keyA, keyB, threshold=80, limit=1):
def fuzzy_merge(dfA, dfB, keyA, keyB, threshold=1):

from rapidfuzz import fuzz, process

matches_A = []
matches_B = []


def preproc(x):
    """Normalize a scene name or path so it can be fuzzy-compared.

    Steps:
      1. Drop trailing '/' characters, then keep only the last
         '/'-separated component. Without the strip, an input such as
         "runs/MyScene/" would reduce to the empty string.
      2. Insert '_' before every uppercase letter that is followed by a
         lowercase letter, except at the very start of the string
         (e.g. "MyScene" -> "My_Scene").
      3. Lowercase the result.

    Args:
        x: the name or '/'-separated path as a string.

    Returns:
        The normalized, lowercase name.
    """
    x = x.rstrip('/').split('/')[-1]
    # Zero-width match: not at string start, and immediately before "Xy".
    x = re.sub(r'(?<!^)(?=[A-Z][a-z])', '_', x)
    return x.lower()

b_names_list = dfB[keyB].apply(preproc)
print(list(b_names_list))

for i, rowA in dfA.iterrows():

match = process.extractOne(rowA[keyA], dfB[keyB], scorer=fuzz.ratio, score_cutoff=threshold)
match = process.extractOne(
preproc(rowA[keyA]),
b_names_list,
scorer=fuzz.ratio,
score_cutoff=threshold
)

if match:
matched_rowB = dfB.loc[match[2]].to_dict()
#print(f"Matched {rowA[keyA].split('/')[-1]} with {matched_rowB[keyB].split('/')[-1]} with score {match[1]:.2f}")
else:
matched_rowB = {col: pd.NA for col in dfB.columns}
matched_rowB[keyB] = "No Matching Scene"
print(f"No match found for {rowA[keyA].split('/')[-1]}")

matches_A.append(rowA.to_dict())
matches_B.append(matched_rowB)
Expand Down Expand Up @@ -309,6 +327,8 @@ def main():

if args.nearest:
main_df = fuzzy_merge(lhs, rhs, keyA="name", keyB="name", threshold=80)
main_df["name_A"] = main_df["name"]
main_df["name_B"] = main_df["name"]
else:
main_df = lhs.merge(rhs, on="name", suffixes=("_A", "_B"), how="outer")

Expand Down
24 changes: 12 additions & 12 deletions tests/integration/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,14 @@ <h1>{{ heading }}</h1>
<tr>
<td>
<div class="image-container">
<p class="overlay-text">{{scene['name']}}</p>
<p class="overlay-text">{{scene['name_A']}}</p>
<img loading="lazy" src="{{ scene['img_path_A'] }}" >
<p class="stats">{{scene['stats_A']}}</p>
</div>
</td>
<td>
<div class="image-container">
<p class="overlay-text">{{scene['name']}}</p>
<p class="overlay-text">{{scene['name_B']}}</p>
<img loading="lazy" src="{{ scene['img_path_B'] }}" >
<p class="stats">{{scene['stats_B']}}</p>
</div>
Expand All @@ -141,14 +141,14 @@ <h1>{{ heading }}</h1>
<tr>
<td>
<div class="image-container">
<p class="overlay-text">{{scene['name']}}</p>
<p class="overlay-text">{{scene['name_A']}}</p>
<img loading="lazy" src="{{ scene['img_path_A'] }}" >
</div>
<p class="stats">{{scene['stats_A']}}</p>
</td>
<td>
<div class="image-container">
<p class="overlay-text">{{scene['name']}}</p>
<p class="overlay-text">{{scene['name_B']}}</p>
<img loading="lazy" src="{{ scene['img_path_B'] }}" >
</div>
<p class="stats">{{scene['stats_B']}}</p>
Expand All @@ -170,14 +170,14 @@ <h1>{{ heading }}</h1>
<tr>
<td>
<div class="image-container">
<p class="overlay-text">{{object['name']}}</p>
<p class="overlay-text">{{object['name_A']}}</p>
<img loading="lazy" class="asset-img" src="{{ object['img_path_A'] }}" >
<p class="stats">{{object['stats_A']}}</p>
</div>
</td>
<td>
<div class="image-container">
<p class="overlay-text">{{object['name']}}</p>
<p class="overlay-text">{{object['name_B']}}</p>
<img loading="lazy" class="asset-img" src="{{ object['img_path_B'] }}" >
<p class="stats">{{object['stats_B']}}</p>
</div>
Expand All @@ -200,14 +200,14 @@ <h1>{{ heading }}</h1>
<tr>
<td>
<div class="image-container">
<p class="overlay-text">{{object['name']}}</p>
<p class="overlay-text">{{object['name_A']}}</p>
<img loading="lazy" class="asset-img" src="{{ object['img_path_A'] }}" >
<p class="stats">{{object['stats_A']}}</p>
</div>
</td>
<td>
<div class="image-container">
<p class="overlay-text">{{object['name']}}</p>
<p class="overlay-text">{{object['name_B']}}</p>
<img loading="lazy" class="asset-img" src="{{ object['img_path_B'] }}" >
<p class="stats">{{object['stats_B']}}</p>
</div>
Expand All @@ -229,14 +229,14 @@ <h1>{{ heading }}</h1>
<tr>
<td>
<div class="image-container">
<p class="overlay-text">{{material['name']}}</p>
<p class="overlay-text">{{material['name_A']}}</p>
<img loading="lazy" class="asset-img" src="{{ material['img_path_A'] }}" >
<p class="stats">{{material['stats_A']}}</p>
</div>
</td>
<td>
<div class="image-container">
<p class="overlay-text">{{material['name']}}</p>
<p class="overlay-text">{{material['name_B']}}</p>
<img loading="lazy" class="asset-img" src="{{ material['img_path_B'] }}" >
<p class="stats">{{material['stats_B']}}</p>
</div>
Expand All @@ -258,14 +258,14 @@ <h1>{{ heading }}</h1>
<tr>
<td>
<div class="image-container">
<p class="overlay-text">{{material['name']}}</p>
<p class="overlay-text">{{material['name_A']}}</p>
<img loading="lazy" class="asset-img" src="{{ material['img_path_A'] }}" >
<p class="stats">{{material['stats_A']}}</p>
</div>
</td>
<td>
<div class="image-container">
<p class="overlay-text">{{material['name']}}</p>
<p class="overlay-text">{{material['name_B']}}</p>
<img loading="lazy" class="asset-img" src="{{ material['img_path_B'] }}" >
<p class="stats">{{material['stats_B']}}</p>
</div>
Expand Down

0 comments on commit fb81968

Please sign in to comment.