-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathCVproject.py
More file actions
236 lines (198 loc) · 9.43 KB
/
CVproject.py
File metadata and controls
236 lines (198 loc) · 9.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import os
# Ensure the required NLTK corpora are available. On first run,
# uncomment the lines below and execute them once.
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')
class ResumeScreener:
    """
    Resume-screening pipeline in a single class: data loading,
    text cleaning, TF-IDF feature extraction, Naive Bayes model
    training, evaluation, and prediction on new resumes.
    """
    def __init__(self, data_path='SomeoneCV.csv', tfidf_max_features=5000):
        """
        Initialize the resume screener.

        Args:
            data_path (str): Path to the resume dataset CSV file.
                The file is expected to have 'Category' and 'Resume'
                columns (see load_data / preprocess_and_vectorize).
            tfidf_max_features (int): Maximum number of features kept
                by the TF-IDF vectorizer.
        """
        self.data_path = data_path
        self.tfidf_max_features = tfidf_max_features
        self.df = None
        self.tfidf_vectorizer = None
        self.label_encoder = None
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # NLTK tooling used by clean_resume_text(). Requires the
        # 'stopwords' and 'wordnet' corpora to be downloaded.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        print("ResumeScreener 已初始化。")

    def load_data(self):
        """
        Load the resume dataset into a DataFrame.

        Returns:
            bool: True on success, False if the file was not found
            (in which case self.df is reset to None).
        """
        try:
            self.df = pd.read_csv(self.data_path)
            print(f"資料 '{self.data_path}' 載入成功!")
            print("資料集形狀 (行, 列):", self.df.shape)
            print("資料集前5行:\n", self.df.head())
        except FileNotFoundError:
            print(f"錯誤:找不到檔案在 {self.data_path}。請確保檔案路徑正確。")
            self.df = None  # ensure df is None so later steps can detect failure
            return False
        return True

    def visualize_category_distribution(self):
        """
        Visualize the distribution of resume categories as a
        horizontal count plot. No-op (with a message) if data has
        not been loaded yet.
        """
        if self.df is None:
            print("請先載入資料。")
            return
        print("\n各類別履歷數量:")
        print(self.df['Category'].value_counts())
        plt.figure(figsize=(15, 8))
        sns.countplot(y='Category', data=self.df, palette='viridis')
        plt.title('各類別履歷分佈')
        plt.xlabel('數量')
        plt.ylabel('類別')
        plt.show()

    def clean_resume_text(self, text):
        """
        Clean a single resume text:
        - Remove URLs, standalone RT/cc tokens, hashtags, mentions,
          punctuation, non-ASCII characters and digits.
        - Lowercase the text.
        - Remove English stopwords and lemmatize the remaining words.

        Fixes vs. the previous version: all regex patterns are raw
        strings (the old non-raw patterns emit SyntaxWarning on
        Python 3.12+), and 'RT'/'cc' are matched on word boundaries
        so words such as "account" or "accept" are no longer mangled.

        Args:
            text (str): Raw resume text.

        Returns:
            str: Cleaned, lemmatized, lowercase text.
        """
        text = re.sub(r'http\S+\s*', ' ', text)     # remove URLs
        text = re.sub(r'\bRT\b|\bcc\b', ' ', text)  # remove RT / cc as whole tokens only
        text = re.sub(r'#\S+', '', text)            # remove hashtags
        text = re.sub(r'@\S+', ' ', text)           # remove mentions
        text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)  # remove punctuation
        text = re.sub(r'[^\x00-\x7f]', r' ', text)  # remove non-ASCII characters
        text = re.sub(r'\s+', ' ', text)            # collapse extra whitespace
        text = re.sub(r'\d+', '', text)             # remove digits
        text = text.lower()                         # lowercase
        # Drop stopwords, lemmatize what remains.
        text = ' '.join(self.lemmatizer.lemmatize(word)
                        for word in text.split()
                        if word not in self.stop_words)
        return text

    def preprocess_and_vectorize(self):
        """
        Clean every resume and turn the corpus into TF-IDF features.

        Returns:
            scipy.sparse matrix of TF-IDF features, or None if the
            data has not been loaded.
        """
        if self.df is None:
            print("請先載入資料。")
            return None
        print("\n開始文本預處理和特徵化...")
        self.df['cleaned_resume'] = self.df['Resume'].apply(self.clean_resume_text)
        print("文本清理完成。")
        print("清理後履歷的前5行:\n", self.df[['Resume', 'cleaned_resume']].head())
        # Fit the TF-IDF vectorizer on the cleaned corpus; the fitted
        # vectorizer is reused (transform only) in predict_new_resume.
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=self.tfidf_max_features)
        X = self.tfidf_vectorizer.fit_transform(self.df['cleaned_resume'])
        print(f"TF-IDF 特徵矩陣形狀:{X.shape}")
        return X

    def train_model(self, X, test_size=0.2, random_state=42):
        """
        Split the data and train a Multinomial Naive Bayes classifier.

        Args:
            X: TF-IDF feature matrix (sparse).
            test_size (float): Fraction of data held out for testing.
            random_state (int): Seed for reproducible splits.
        """
        if self.df is None:
            print("請先載入資料。")
            return
        print("\n開始模型訓練...")
        # Encode category labels as integers.
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(self.df['Category'])
        # Stratified split keeps class proportions in both sets.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        print(f"訓練集形狀 (X_train): {self.X_train.shape}, (y_train): {self.y_train.shape}")
        print(f"測試集形狀 (X_test): {self.X_test.shape}, (y_test): {self.y_test.shape}")
        # Multinomial NB suits non-negative TF-IDF count-like features.
        self.model = MultinomialNB()
        self.model.fit(self.X_train, self.y_train)
        print("模型訓練完成!")

    def evaluate_model(self):
        """
        Evaluate the trained model on the held-out test set and
        print accuracy plus a per-class classification report.
        """
        if self.model is None or self.X_test is None or self.y_test is None:
            print("模型尚未訓練或測試資料不可用。")
            return
        print("\n開始模型評估...")
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        print(f"模型準確度:{accuracy:.4f}")
        # Build the label list from the UNION of true and predicted
        # classes and pass it explicitly via labels=. The previous
        # version used only classes present in y_test, which raises a
        # length-mismatch ValueError in classification_report whenever
        # y_pred contains a class absent from y_test.
        present_labels = sorted(np.unique(np.concatenate([self.y_test, y_pred])))
        target_names = self.label_encoder.inverse_transform(present_labels)
        print("\n分類報告:")
        print(classification_report(self.y_test, y_pred,
                                    labels=present_labels,
                                    target_names=target_names))

    def predict_new_resume(self, new_resume_text):
        """
        Predict the category of a new, unseen resume.

        Args:
            new_resume_text (str): Raw resume text.

        Returns:
            str: Predicted category name, or "無法預測" if the model
            pipeline has not been trained yet.
        """
        if self.model is None or self.tfidf_vectorizer is None or self.label_encoder is None:
            print("模型尚未訓練,或TF-IDF向量化器/標籤編碼器不可用。")
            return "無法預測"
        print(f"\n預測新履歷:'{new_resume_text[:50]}...'")
        # 1. Clean the new resume text.
        cleaned_new_resume = self.clean_resume_text(new_resume_text)
        # 2. Transform with the ALREADY-FITTED vectorizer
        #    (transform, never fit_transform, at prediction time).
        new_resume_features = self.tfidf_vectorizer.transform([cleaned_new_resume])
        # 3. Predict the encoded label.
        predicted_category_encoded = self.model.predict(new_resume_features)
        # 4. Decode back to the original category name.
        predicted_category_name = self.label_encoder.inverse_transform(predicted_category_encoded)
        return predicted_category_name[0]
# Script entry point: run the full screening pipeline end to end.
if __name__ == "__main__":
    # Build a screener pointed at the resume dataset.
    screener = ResumeScreener(data_path='SomeoneCV.csv')
    # 1. Load the data; stop early if that fails.
    if not screener.load_data():
        print("資料載入失敗,程式終止。")
    else:
        # 2. Show how resumes are distributed across categories.
        screener.visualize_category_distribution()
        # 3. Clean the text and build TF-IDF features.
        X_features = screener.preprocess_and_vectorize()
        if X_features is not None:
            # 4. Fit the classifier.
            screener.train_model(X_features)
            # 5. Report held-out-set performance.
            screener.evaluate_model()
            # 6. Classify a few unseen sample resumes.
            sample_resumes = [
                "Highly skilled software developer with expertise in Python, Java, and C++. Seeking a challenging role in backend development.",
                "Experienced HR professional with strong background in talent acquisition, employee relations, and compensation & benefits.",
                "Data scientist with a master's degree in statistics. Proficient in machine learning, deep learning, and data visualization using R and Python.",
            ]
            for sample_no, resume_text in enumerate(sample_resumes, start=1):
                predicted = screener.predict_new_resume(resume_text)
                print(f"範例履歷 {sample_no} 預測類別:{predicted}")