-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculate_recall_precision_2ref_cases.py
69 lines (53 loc) · 2.45 KB
/
calculate_recall_precision_2ref_cases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
import json
import re
import numpy as np
import pickle
# Topic count of the model run; it is part of the name of the author-prediction
# file opened further down.
num_topics = 50

# Input table of cases. Only its column names are inspected in this step.
df = pd.read_csv('bverfg230107_with_break_noNaN.csv')
columns = list(df.columns)

# Collect, in file order, every column whose name begins with 'ref'.
# (The original author's note records 50 such columns for this dataset.)
rf_columns = [name for name in columns if name.startswith('ref')]
# Load the per-document author predictions for the chosen topic count.
# NOTE(review): the file name ends in '.json' but the content is read with
# pickle, so it is presumably a pickle dump -- confirm. Unpickling can execute
# arbitrary code, so this must only ever be pointed at a trusted file.
with open('WardNJU_authors_per_doc_topN=1_num_topics=' + str(num_topics) + '.json', 'rb') as f:
    authors_prob_per_doc_all = pickle.load(f)  # format: {0: {'winter': 0.42},...}

# Reduce every document to the first author key of its prediction dict;
# a document with an empty dict is recorded as NaN.
author_per_doc = {
    doc_id: next(iter(probs.keys()), np.nan)
    for doc_id, probs in authors_prob_per_doc_all.items()
}

# Distinct author labels across all documents (the original author's note
# records 53 for this dataset).
# NOTE(review): when NaN placeholders are mixed with string names, numpy
# coerces the list to a string array, so they appear here as the string
# 'nan' -- confirm that is intended.
unique_authors = np.unique(list(author_per_doc.values()))
# Build the lookup from referee column to author name
# (e.g. ref_kuehling -> kuehling). A column belongs to an author when the
# author name equals the column name with its first four or its first five
# characters removed.
ref_to_author_dict = {}
for author in unique_authors:
    for ref in rf_columns:
        # Skip pairs where neither stripped form of the column name matches.
        if author != ref[4:] and author != ref[5:]:
            continue
        print('author:', author)
        print('ref:', ref)
        ref_to_author_dict[ref] = author

print('ref_to_author_dict:', ref_to_author_dict)
# The original author's note records 50 entries for this dataset.
print('len(ref_to_author_dict):', len(ref_to_author_dict))
# Select the rows whose (sumref, sumref2) pair is (2, 0), (1, 1) or (0, 2)
# and persist them.
two_ref_mask = (
    ((df["sumref"] == 2) & (df["sumref2"] == 0))
    | ((df["sumref"] == 1) & (df["sumref2"] == 1))
    | ((df["sumref"] == 0) & (df["sumref2"] == 2))
)
df_with_2_ref = df[two_ref_mask]
df_with_2_ref.to_csv('bverfg230107_with_2_ref.csv', index=False)
print(len(df_with_2_ref.index))  # 3192 cases per the original author's note

# Reload the file that was just written. It is read ONCE and both views are
# sliced from the same frame; the original parsed the same CSV twice.
# Reloading (rather than reusing the filtered frame) yields a fresh 0..n-1
# index, because the file was written with index=False.
df_2_ref_reloaded = pd.read_csv('bverfg230107_with_2_ref.csv')
df_with_2_ref = df_2_ref_reloaded[rf_columns]
df_with_uid = df_2_ref_reloaded['uid']

# For each row, list the names of the 'ref...' columns whose value is 1
# (two names per row are expected for this subset).
df_with_ref_variable = df_with_2_ref.apply(lambda row: list(row[row == 1].index), axis=1)
print('head_ref:', df_with_ref_variable.head())

# Pair every uid with its list of referee columns and name the two columns.
df_with_uid_and_ref_variable = pd.concat([df_with_uid, df_with_ref_variable], axis=1)
df_with_uid_and_ref_variable.columns = ['uid', 'referees']
df_with_uid_and_ref_variable.to_csv('bverfg230107_with_uid_and_2ref_variable.csv', index=False)

# NOTE: after this round trip through CSV the 'referees' column no longer
# holds list objects but their text form (e.g. "['ref_a', 'ref_b']"); any
# later step has to parse that string.
df_with_uid_and_ref_variable = pd.read_csv('bverfg230107_with_uid_and_2ref_variable.csv')