Add TestGenEval benchmark #5534

Open · wants to merge 31 commits into main

Commits (31)
7a4729c  initial TestGenEval code (Nov 29, 2024)
c6206f5  Initial pass for TestGenEval (Nov 29, 2024)
280baa2  Licensing (Nov 29, 2024)
75fba59  Readability metrics (Nov 29, 2024)
f7f2531  Fixing testing dependencies (Dec 4, 2024)
30197e6  Add option for starting point (Dec 4, 2024)
791b7f9  Cleaning to not OOM (Dec 5, 2024)
bd66d09  Merge pull request #1 from All-Hands-AI/main (kjain14, Dec 6, 2024)
585dba9  TestGenEval MVP (Dec 11, 2024)
3af6025  + mutation testing (Dec 11, 2024)
b19f735  Merge pull request #2 from All-Hands-AI/main (kjain14, Dec 11, 2024)
7c81deb  Update README (Dec 11, 2024)
2cd64bc  Merge branch 'main' of https://github.com/kjain14/OpenHands (Dec 11, 2024)
b685c67  reset (Dec 12, 2024)
fb9bc87  testgeneval deps (Dec 12, 2024)
77a153e  Final update, now working on all projects (Dec 16, 2024)
3401bd6  Update TestGenEval README with comprehensive information (openhands-agent, Dec 25, 2024)
b47da9e  Merge branch 'main' of github.com:All-Hands-AI/OpenHands into kjain14… (neubig, Dec 25, 2024)
90422e5  Update lock file (neubig, Dec 25, 2024)
31b6967  Any and all pass (Jan 8, 2025)
1ded123  Reset to normal time (Jan 8, 2025)
efb525a  Refine postprocessing (Jan 9, 2025)
219a134  Refine prompt (Jan 10, 2025)
3f0f13d  Update prompt (Jan 10, 2025)
d1e8409  Update filtering (Jan 17, 2025)
8848e60  Only top level filtering (Jan 17, 2025)
3355bae  Merge branch 'main' of github.com:kjain14/OpenHands into kjain14-main (neubig, Jan 20, 2025)
9f9a65c  More updates (Jan 20, 2025)
f781bc8  Fix prompting (Jan 28, 2025)
c7d575b  Removing duplicate script (Jan 28, 2025)
64abd4a  Ablation outputs (Jan 30, 2025)
evaluation/benchmarks/testgeneval/CodeBLEU/Evaluator.py (120 additions, 0 deletions)
@@ -0,0 +1,120 @@
# Adapted from https://github.com/EngineeringSoftware/teco/blob/main/src/CodeBLEU/Evaluator.py
import os
from pathlib import Path
from typing import List

import numpy as np
from CodeBLEU import bleu, dataflow_match, syntax_match, weighted_ngram_match
from tree_sitter import Language


class Evaluator:
    """
    Python interface for using CodeBLEU, based on calc_code_bleu.py.
    """

    def __init__(
        self,
        lang: str,
        alpha: float = 0.25,
        beta: float = 0.25,
        gamma: float = 0.25,
        theta: float = 0.25,
    ):
        self.lang = lang
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.theta = theta

        # Load keywords and tree-sitter parser
        this_dir: Path = Path(os.path.dirname(os.path.realpath(__file__)))
        self.keywords = [
            x.strip()
            for x in open(
                this_dir / 'keywords' / f'{self.lang}.txt', 'r', encoding='utf-8'
            ).readlines()
        ]
        self.parser_language = Language(this_dir / 'parser' / 'my-languages.so', lang)

    @staticmethod
    def make_weights(reference_tokens, key_word_list):
        return {
            token: 1 if token in key_word_list else 0.2 for token in reference_tokens
        }

    def corpus_code_bleu(
        self, refs_toks: List[List[List[str]]], hyps_toks: List[List[str]]
    ) -> float:
        """
        Calculates CodeBLEU for the given references and hypotheses (should be tokenized).
        :param refs_toks: the references, num_item * num_ref * num_tok.
        :param hyps_toks: the hypotheses, num_item * num_tok.
        :return: corpus-level CodeBLEU score;
            NOTE: not to be confused with averaged sentence-level CodeBLEU score.
        """
        assert len(refs_toks) == len(hyps_toks)

        # Group tokens (for syntax match & dataflow match)
        refs = [
            [' '.join(ref_toks) for ref_toks in reference] for reference in refs_toks
        ]
        hyps = [' '.join(hyp_toks) for hyp_toks in hyps_toks]

        # Accumulate working scores and weights
        cum_weighted_score = 0
        cum_weight = 0

        # Calculate ngram match (BLEU)
        ngram_match_score = bleu.corpus_bleu(refs_toks, hyps_toks)
        cum_weighted_score += self.alpha * ngram_match_score
        cum_weight += self.alpha

        # Calculate weighted ngram match
        refs_toks_with_weights = [
            [
                [reference_tokens, self.make_weights(reference_tokens, self.keywords)]
                for reference_tokens in reference
            ]
            for reference in refs_toks
        ]
        weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(
            refs_toks_with_weights, hyps_toks
        )
        cum_weighted_score += self.beta * weighted_ngram_match_score
        cum_weight += self.beta

        # Calculate syntax match
        try:
            syntax_match_score = syntax_match.corpus_syntax_match(
                refs, hyps, self.lang, parser_language=self.parser_language
            )
        except ZeroDivisionError:
            # Syntax match not working, ignore this part
            syntax_match_score = np.nan
        else:
            cum_weighted_score += self.gamma * syntax_match_score
            cum_weight += self.gamma

        # Calculate dataflow match
        dataflow_match_score = dataflow_match.corpus_dataflow_match(
            refs, hyps, self.lang, parser_language=self.parser_language
        )
        if dataflow_match_score is not np.nan:
            cum_weighted_score += self.theta * dataflow_match_score
            cum_weight += self.theta
        # else, ignore this part

        return cum_weighted_score / cum_weight

    def sentence_code_bleu(
        self, refs_toks: List[List[str]], hyp_toks: List[str]
    ) -> float:
        """
        Calculates CodeBLEU for the given references and hypothesis (should be tokenized).
        :param refs_toks: the references, num_ref * num_tok.
        :param hyp_toks: the hypothesis, num_tok.
        :return: sentence-level CodeBLEU score.
        """
        return self.corpus_code_bleu([refs_toks], [hyp_toks])
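For context (not part of the diff), here is a minimal usage sketch of the Evaluator above. It assumes the package layout this file relies on (keyword lists under keywords/ and the compiled tree-sitter grammar at parser/my-languages.so next to the module) and that the import path CodeBLEU.Evaluator resolves; the toy strings are hypothetical. The class combines four component scores (BLEU, keyword-weighted n-gram match, syntax match, dataflow match) as a weighted average over alpha/beta/gamma/theta, renormalizing over whichever components succeed.

# Hypothetical usage sketch; assumes the package layout described above.
from CodeBLEU.Evaluator import Evaluator

evaluator = Evaluator(lang='python')  # equal component weights of 0.25

# Inputs must be pre-tokenized; sentence_code_bleu takes a list of
# reference token lists plus a single hypothesis token list.
reference = 'def add ( a , b ) : return a + b'.split()
hypothesis = 'def add ( x , y ) : return x + y'.split()

score = evaluator.sentence_code_bleu([reference], hypothesis)
print(f'CodeBLEU: {score:.4f}')

Note that sentence_code_bleu simply wraps its inputs as a one-item corpus and delegates to corpus_code_bleu, so the two methods agree on a single example.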