# Turkish-Intent-Analysis/data_preprocessing.py (Tek-nr/Turkish-Intent-Analysis, branch: main)
import re
import pandas as pd
import itertools


def data_cleaning(text):
    """Normalise raw text into lowercase, letter-only, single-spaced form.

    Steps, in order:
      1. Line breaks become spaces.
      2. Standalone ``RT`` (retweet marker) tokens are dropped.
      3. The text is lowercased.
      4. Punctuation is deleted (not replaced), so "don't" becomes "dont".
      5. Every remaining character that is not an ASCII or Turkish letter
         (digits, underscores, other scripts) is replaced with a space.
      6. Whitespace runs are collapsed and the ends are trimmed.

    Args:
        text: The input string.

    Returns:
        The cleaned string; may be empty.
    """
    text = text.replace("\n", " ")
    # The retweet marker has to be removed while the text is still cased:
    # after lowercasing, an upper-case 'RT' pattern can never match. The word
    # boundaries keep words that merely contain "RT" (e.g. "START") intact.
    text = re.sub(r'\bRT\b', '', text)
    # NOTE: str.lower() is locale-independent, so "I" maps to "i" rather than
    # the Turkish dotless "ı".
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Delete punctuation and symbols
    text = re.sub('[^a-zA-ZğüşıöçĞÜŞİÖÇ]', ' ', text)  # Keep letters only
    # Whitespace is normalised last because the previous substitution
    # introduces new spaces wherever digits or underscores were.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_stopwords(text, stopwords):
    """Clean *text* and drop every word that appears in *stopwords*.

    The text is first passed through ``data_cleaning``; the result is split
    on whitespace, words whose lowercase form is contained in *stopwords*
    are discarded, and the survivors are re-joined with single spaces.

    Args:
        text: The raw input string.
        stopwords: A container supporting ``in`` membership tests.

    Returns:
        The cleaned text without stopwords.
    """
    kept = []
    for token in data_cleaning(text).split():
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

def word_tokenize(text):
    """Return the list of word tokens found in *text*.

    A token is a maximal run of word characters (letters, digits,
    underscore); anything else acts as a separator.
    """
    return [match.group() for match in re.finditer(r'\b\w+\b', text)]

def dup_vanish(s1):
    """Collapse each run of consecutive identical characters to one.

    For example ``"heeelllooo"`` becomes ``"helo"``. Characters that repeat
    non-adjacently are kept (``"aabbaa"`` becomes ``"aba"``).
    """
    run_heads = [char for char, _run in itertools.groupby(s1)]
    return ''.join(run_heads)

def balance_data(df_patterns, random_state=None):
    """Oversample minority intents so every class matches the largest one.

    For each distinct value in the ``'intent'`` column that has fewer rows
    than the most frequent intent, rows of that intent are drawn at random
    with replacement and appended until the counts are equal. Rows whose
    intent is missing (NaN/None) are kept but never oversampled.

    Args:
        df_patterns: DataFrame with an ``'intent'`` column. It is not
            modified.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample`` for reproducible oversampling.

    Returns:
        A new DataFrame holding the original rows followed by the sampled
        rows. The sampled rows are indexed from 0 within each batch, so the
        resulting index may contain duplicate labels.
    """
    df_intent = df_patterns['intent']
    # Count every class once; value_counts() leaves out missing labels.
    class_counts = df_intent.value_counts()

    new_df = df_patterns.copy()
    if class_counts.empty:
        return new_df
    max_counts = class_counts.max()  # Number of examples in the largest class

    frames = [new_df]
    # Walk the labels in order of first appearance so the sampled batches
    # are appended in a stable order.
    for label in df_intent.dropna().unique():
        deficit = int(max_counts - class_counts.loc[label])
        if deficit > 0:
            frames.append(
                df_patterns[df_intent == label].sample(
                    deficit,
                    replace=True,
                    ignore_index=True,
                    random_state=random_state,
                )
            )

    if len(frames) == 1:
        return new_df
    # A single concat avoids re-copying the growing frame for every class.
    return pd.concat(frames)