-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataAnalysis.py
50 lines (36 loc) · 1.37 KB
/
DataAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
# --- Load and combine the scraped job postings ---------------------------
# Two scraped datasets: Java listings and Python listings.
df1 = pd.read_csv("java_sk.csv", encoding="utf-8")
df2 = pd.read_csv("python_sk.csv")

# Stack the two frames and drop exact duplicate rows.
# NOTE: the original redid this merge with DataFrame.append, which was
# removed in pandas 2.0 — pd.concat is the supported equivalent and
# produces the same result, so the append line is dropped.
df = pd.concat([df1, df2], axis=0).drop_duplicates()
print("after clean duplicates: ", df.shape)

# df = df[df.Sponsored != 'Sponsored']
print("after clean Sponsored: ", df.shape)

# Top 10 companies by number of postings (rows counted via "Title").
counts = df.groupby("Company").count()["Title"].sort_values(ascending=False)[:10]
counts.plot(kind="bar", figsize=(20, 5))
plt.savefig("img/companies.png")
plt.show()
def cleanData(desc):
    """Tokenize one job description and return its cleaned word list.

    Keeps only purely alphabetic tokens longer than two characters,
    lowercases them, and drops English stop words (module-level
    ``stop_words``).
    """
    kept = []
    for token in word_tokenize(desc):
        # Skip punctuation, numbers, and very short tokens.
        if not token.isalpha() or len(token) <= 2:
            continue
        word = token.lower()
        if word in stop_words:
            continue
        kept.append(word)
    return kept
# nltk.download('punkt')  # one-time download of the tokenizer model

# Use a set so each membership test inside cleanData is O(1); the
# original list made every stop-word lookup O(n) per token.
stop_words = set(stopwords.words('english'))

# Tokenize and clean every job description into a word list.
tags_df = df["Description"].apply(cleanData)

# Aggregate word frequencies across all descriptions, most frequent first.
result = tags_df.apply(Counter).sum().items()
result = sorted(result, key=lambda kv: kv[1], reverse=True)
result_series = pd.Series(dict(result))

# Frequency of seniority-level keywords across the postings.
skills = ["senior", "expert", "junior", "intermediate", "entry", "fresher"]
filter_series = result_series.filter(items=skills)
filter_series.plot(kind="bar", figsize=(10, 5))
plt.savefig("img/level.png")
plt.show()