Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #34 from halfak/frwiki
Browse files Browse the repository at this point in the history
Adds models and feature_lists for fr, fa and tr wiki. We have been using this in the Hackathon.
  • Loading branch information
ToAruShiroiNeko committed May 25, 2015
2 parents 9b2a9aa + c29632a commit ec8862a
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 62 deletions.
70 changes: 45 additions & 25 deletions ores/feature_lists/enwiki.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,51 @@
from revscoring.features import (
diff, page, parent_revision, previous_user_revision, revision, user
)
from revscoring.features import (diff, page, parent_revision,
previous_user_revision, revision, user)
from revscoring.features.modifiers import log

from . import generic

damaging = generic.damaging + [
log(max(diff.added_badwords_ratio + 1,1)),
log(max(diff.added_misspellings_ratio + 1,1)),
log(max(diff.badwords_added + 1,1)),
log(max(diff.badwords_removed + 1,1)),
log(max(diff.misspellings_added + 1,1)),
log(max(diff.misspellings_removed + 1,1)),
log(max(diff.proportion_of_badwords_added + 1,1)),
log(max(diff.proportion_of_badwords_removed + 1,1)),
log(max(diff.proportion_of_misspellings_added + 1,1)),
log(max(diff.proportion_of_misspellings_removed + 1,1)),
log(max(diff.removed_badwords_ratio + 1,1)),
log(max(diff.removed_misspellings_ratio + 1,1)),
log(max(parent_revision.badwords + 1,1)),
log(max(parent_revision.misspellings + 1,1)),
log(max(parent_revision.proportion_of_badwords + 1,1)),
log(max(parent_revision.proportion_of_misspellings + 1,1)),
log(max(revision.badwords + 1,1)),
log(max(revision.misspellings + 1,1)),
log(max(revision.proportion_of_badwords + 1,1)),
log(max(revision.proportion_of_misspellings + 1,1)),
log(revision.infonoise + 1)
# Features fed to the enwiki "damaging" model.
# NOTE(review): entry order is kept exactly as before -- presumably trained
# models depend on the position of each feature; confirm before reordering.
damaging = [
    # --- features derived from the diff -------------------------------
    log(diff.added_symbolic_chars_ratio + 1),
    log(diff.chars_added + 1),
    log(diff.chars_removed + 1),
    diff.longest_repeated_char_added,
    diff.longest_token_added,
    log(diff.markup_chars_added + 1),
    log(diff.markup_chars_removed + 1),
    log(diff.numeric_chars_added + 1),
    log(diff.numeric_chars_removed + 1),
    diff.proportion_of_chars_added,
    diff.proportion_of_chars_removed,
    diff.proportion_of_markup_chars_added,
    diff.proportion_of_numeric_chars_added,
    diff.proportion_of_symbolic_chars_added,
    diff.proportion_of_uppercase_chars_added,
    log(diff.segments_added + 1),
    log(diff.segments_removed + 1),
    log(diff.symbolic_chars_added + 1),
    log(diff.symbolic_chars_removed + 1),
    log(diff.uppercase_chars_added + 1),
    log(diff.uppercase_chars_removed + 1),
    log(diff.words_added + 1),
    log(diff.words_removed + 1),
    diff.bytes_changed + 1,  # deliberately not log-wrapped in the original
    diff.bytes_changed_ratio,

    # --- page, parent revision and user features ----------------------
    page.is_content_namespace,
    parent_revision.was_same_user,
    log(parent_revision.words + 1),
    log(user.age + 1),
    user.is_anon,
    user.is_bot,

    # --- badword and misspelling features of the diff -----------------
    log(diff.added_badwords_ratio + 1),
    log(diff.added_misspellings_ratio + 1),
    log(diff.badwords_added + 1),
    log(diff.badwords_removed + 1),
    log(diff.misspellings_added + 1),
    log(diff.misspellings_removed + 1),
    diff.proportion_of_badwords_added,
    diff.proportion_of_badwords_removed,
    diff.proportion_of_misspellings_added,
    diff.proportion_of_misspellings_removed,
]

good_faith = generic.good_faith + [
Expand Down
44 changes: 44 additions & 0 deletions ores/feature_lists/fawiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from revscoring.features import (diff, page, parent_revision,
previous_user_revision, revision, user)
from revscoring.features.modifiers import log

from . import generic

# Features fed to the fawiki "damaging" model.
# NOTE(review): entry order is kept exactly as before -- presumably trained
# models depend on the position of each feature; confirm before reordering.
damaging = [
    # --- features derived from the diff -------------------------------
    log(diff.added_symbolic_chars_ratio + 1),
    log(diff.chars_added + 1),
    log(diff.chars_removed + 1),
    diff.longest_repeated_char_added,
    diff.longest_token_added,
    log(diff.markup_chars_added + 1),
    log(diff.markup_chars_removed + 1),
    log(diff.numeric_chars_added + 1),
    log(diff.numeric_chars_removed + 1),
    diff.proportion_of_chars_added,
    diff.proportion_of_chars_removed,
    diff.proportion_of_markup_chars_added,
    diff.proportion_of_numeric_chars_added,
    diff.proportion_of_symbolic_chars_added,
    diff.proportion_of_uppercase_chars_added,
    log(diff.segments_added + 1),
    log(diff.segments_removed + 1),
    log(diff.symbolic_chars_added + 1),
    log(diff.symbolic_chars_removed + 1),
    log(diff.uppercase_chars_added + 1),
    log(diff.uppercase_chars_removed + 1),
    log(diff.words_added + 1),
    log(diff.words_removed + 1),
    diff.bytes_changed + 1,  # deliberately not log-wrapped in the original
    diff.bytes_changed_ratio,

    # --- page, parent revision and user features ----------------------
    page.is_content_namespace,
    parent_revision.was_same_user,
    log(parent_revision.words + 1),
    log(user.age + 1),
    user.is_anon,
    user.is_bot,

    # --- misspelling features of the diff (no badword entries here) ---
    log(diff.added_misspellings_ratio + 1),
    log(diff.misspellings_added + 1),
    log(diff.misspellings_removed + 1),
    diff.proportion_of_misspellings_added,
    diff.proportion_of_misspellings_removed,
]
61 changes: 61 additions & 0 deletions ores/feature_lists/frwiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from revscoring.features import (diff, page, parent_revision,
previous_user_revision, revision, user)
from revscoring.features.modifiers import log

from . import generic

# Features fed to the frwiki "damaging" model.
# NOTE(review): entry order is kept exactly as before -- presumably trained
# models depend on the position of each feature; confirm before reordering.
damaging = [
    # --- features derived from the diff -------------------------------
    log(diff.added_symbolic_chars_ratio + 1),
    log(diff.chars_added + 1),
    log(diff.chars_removed + 1),
    diff.longest_repeated_char_added,
    diff.longest_token_added,
    log(diff.markup_chars_added + 1),
    log(diff.markup_chars_removed + 1),
    log(diff.numeric_chars_added + 1),
    log(diff.numeric_chars_removed + 1),
    diff.proportion_of_chars_added,
    diff.proportion_of_chars_removed,
    diff.proportion_of_markup_chars_added,
    diff.proportion_of_numeric_chars_added,
    diff.proportion_of_symbolic_chars_added,
    diff.proportion_of_uppercase_chars_added,
    log(diff.segments_added + 1),
    log(diff.segments_removed + 1),
    log(diff.symbolic_chars_added + 1),
    log(diff.symbolic_chars_removed + 1),
    log(diff.uppercase_chars_added + 1),
    log(diff.uppercase_chars_removed + 1),
    log(diff.words_added + 1),
    log(diff.words_removed + 1),
    diff.bytes_changed + 1,  # deliberately not log-wrapped in the original
    diff.bytes_changed_ratio,

    # --- page, parent revision and user features ----------------------
    page.is_content_namespace,
    parent_revision.was_same_user,
    log(parent_revision.words + 1),
    log(user.age + 1),
    user.is_anon,
    user.is_bot,

    # --- badword and misspelling features of the diff -----------------
    log(diff.added_badwords_ratio + 1),
    log(diff.added_misspellings_ratio + 1),
    log(diff.badwords_added + 1),
    log(diff.badwords_removed + 1),
    log(diff.misspellings_added + 1),
    log(diff.misspellings_removed + 1),
    diff.proportion_of_badwords_added,
    diff.proportion_of_badwords_removed,
    diff.proportion_of_misspellings_added,
    diff.proportion_of_misspellings_removed,
]

# Features for the frwiki "good_faith" model: everything in
# generic.good_faith followed by the badword features listed below.
good_faith = generic.good_faith + [
    # badwords touched by the diff
    log(diff.added_badwords_ratio + 1),
    log(diff.badwords_added + 1),
    log(diff.badwords_removed + 1),
    diff.proportion_of_badwords_added,
    diff.proportion_of_badwords_removed,
    diff.removed_badwords_ratio,
    # badwords in the parent revision
    log(parent_revision.badwords + 1),
    parent_revision.proportion_of_badwords,
    # badwords in the revision being scored
    log(revision.badwords + 1),
]
71 changes: 45 additions & 26 deletions ores/feature_lists/ptwiki.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,51 @@
from revscoring.features import (
diff, page, parent_revision, previous_user_revision, revision, user
)
from revscoring.features import (diff, page, parent_revision,
previous_user_revision, revision, user)
from revscoring.features.modifiers import log
from . import generic

damaging = generic.damaging + [
log(max(diff.added_badwords_ratio + 1,1)),
log(max(diff.added_misspellings_ratio + 1,1)),
log(max(diff.badwords_added + 1,1)),
log(max(diff.badwords_removed + 1,1)),
log(max(diff.misspellings_added + 1,1)),
log(max(diff.misspellings_removed + 1,1)),
log(max(diff.proportion_of_badwords_added + 1,1)),
log(max(diff.proportion_of_badwords_removed + 1,1)),
log(max(diff.proportion_of_misspellings_added + 1,1)),
log(max(diff.proportion_of_misspellings_removed + 1,1)),
log(max(diff.removed_badwords_ratio + 1,1)),
log(max(diff.removed_misspellings_ratio + 1,1)),
log(max(parent_revision.badwords + 1,1)),
log(max(parent_revision.misspellings + 1,1)),
log(max(parent_revision.proportion_of_badwords + 1,1)),
log(max(parent_revision.proportion_of_misspellings + 1,1)),
log(max(revision.badwords + 1,1)),
log(max(revision.misspellings + 1,1)),
log(max(revision.proportion_of_badwords + 1,1)),
log(max(revision.proportion_of_misspellings + 1,1)),
log(revision.infonoise + 1)
from . import generic

# Features fed to the ptwiki "damaging" model.
# NOTE(review): entry order is kept exactly as before -- presumably trained
# models depend on the position of each feature; confirm before reordering.
damaging = [
    # --- features derived from the diff -------------------------------
    log(diff.added_symbolic_chars_ratio + 1),
    log(diff.chars_added + 1),
    log(diff.chars_removed + 1),
    diff.longest_repeated_char_added,
    diff.longest_token_added,
    log(diff.markup_chars_added + 1),
    log(diff.markup_chars_removed + 1),
    log(diff.numeric_chars_added + 1),
    log(diff.numeric_chars_removed + 1),
    diff.proportion_of_chars_added,
    diff.proportion_of_chars_removed,
    diff.proportion_of_markup_chars_added,
    diff.proportion_of_numeric_chars_added,
    diff.proportion_of_symbolic_chars_added,
    diff.proportion_of_uppercase_chars_added,
    log(diff.segments_added + 1),
    log(diff.segments_removed + 1),
    log(diff.symbolic_chars_added + 1),
    log(diff.symbolic_chars_removed + 1),
    log(diff.uppercase_chars_added + 1),
    log(diff.uppercase_chars_removed + 1),
    log(diff.words_added + 1),
    log(diff.words_removed + 1),
    diff.bytes_changed + 1,  # deliberately not log-wrapped in the original
    diff.bytes_changed_ratio,

    # --- page, parent revision and user features ----------------------
    page.is_content_namespace,
    parent_revision.was_same_user,
    log(parent_revision.words + 1),
    log(user.age + 1),
    user.is_anon,
    user.is_bot,

    # --- badword and misspelling features of the diff -----------------
    log(diff.added_badwords_ratio + 1),
    log(diff.added_misspellings_ratio + 1),
    log(diff.badwords_added + 1),
    log(diff.badwords_removed + 1),
    log(diff.misspellings_added + 1),
    log(diff.misspellings_removed + 1),
    diff.proportion_of_badwords_added,
    diff.proportion_of_badwords_removed,
    diff.proportion_of_misspellings_added,
    diff.proportion_of_misspellings_removed,
]

good_faith = generic.good_faith + [
Expand Down
48 changes: 37 additions & 11 deletions ores/feature_lists/trwiki.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,46 @@
from revscoring.features import (
diff, page, parent_revision, previous_user_revision, revision, user
)
from revscoring.features import (diff, page, parent_revision,
previous_user_revision, revision, user)
from revscoring.features.modifiers import log

from . import generic

damaging = generic.damaging + [
damaging = [
log(diff.added_symbolic_chars_ratio + 1),
log(diff.chars_added + 1),
log(diff.chars_removed + 1),
diff.longest_repeated_char_added,
diff.longest_token_added,
log(diff.markup_chars_added + 1),
log(diff.markup_chars_removed + 1),
log(diff.numeric_chars_added + 1),
log(diff.numeric_chars_removed + 1),
diff.proportion_of_chars_added,
diff.proportion_of_chars_removed,
diff.proportion_of_markup_chars_added,
diff.proportion_of_numeric_chars_added,
diff.proportion_of_symbolic_chars_added,
diff.proportion_of_uppercase_chars_added,
log(diff.segments_added + 1),
log(diff.segments_removed + 1),
log(diff.symbolic_chars_added + 1),
log(diff.symbolic_chars_removed + 1),
log(diff.uppercase_chars_added + 1),
log(diff.uppercase_chars_removed + 1),
log(diff.words_added + 1),
log(diff.words_removed + 1),
diff.bytes_changed + 1,
diff.bytes_changed_ratio,
page.is_content_namespace,
parent_revision.was_same_user,
log(parent_revision.words + 1),
log(user.age + 1),
user.is_anon,
user.is_bot,
log(diff.added_badwords_ratio + 1),
log(diff.badwords_added + 1),
log(diff.badwords_removed + 1),
log(diff.proportion_of_badwords_added + 1),
log(diff.proportion_of_badwords_removed + 1),
log(diff.removed_badwords_ratio + 1),
log(parent_revision.badwords + 1),
log(parent_revision.proportion_of_badwords + 1),
log(revision.badwords + 1),
log(revision.proportion_of_badwords + 1)
diff.proportion_of_badwords_added,
diff.proportion_of_badwords_removed
]

good_faith = generic.good_faith + [
Expand Down
6 changes: 6 additions & 0 deletions ores/scorer_models/fawiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from revscoring.languages import persian
from revscoring.scorers import LinearSVCModel

from ..feature_lists.fawiki import damaging

# LinearSVCModel over the fawiki `damaging` feature list, configured with
# the `persian` language object from revscoring.languages.
damaging_linear_svc = LinearSVCModel(
    damaging,
    language=persian,
)
6 changes: 6 additions & 0 deletions ores/scorer_models/frwiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from revscoring.languages import french
from revscoring.scorers import LinearSVCModel

from ..feature_lists.frwiki import damaging

# LinearSVCModel over the frwiki `damaging` feature list, configured with
# the `french` language object from revscoring.languages.
damaging_linear_svc = LinearSVCModel(
    damaging,
    language=french,
)
6 changes: 6 additions & 0 deletions ores/scorer_models/trwiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from revscoring.languages import turkish
from revscoring.scorers import LinearSVCModel

from ..feature_lists.trwiki import damaging

# LinearSVCModel over the trwiki `damaging` feature list, configured with
# the `turkish` language object from revscoring.languages.
damaging_linear_svc = LinearSVCModel(
    damaging,
    language=turkish,
)

0 comments on commit ec8862a

Please sign in to comment.