-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_irrelevant_cases.py
43 lines (37 loc) · 1.81 KB
/
remove_irrelevant_cases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas as pd
import re
import numpy as np
# Input and output files of the clean-up run.
# NOTE(review): an earlier run apparently read 'case_scraping_Dec_04_2022.csv',
# wrote 'case_scraping_01_1998_to_07_2022.csv' and skipped the first 233 rows
# instead of 3 -- confirm before switching datasets.
INPUT_CSV = 'bverfg230107_with_break.csv'
OUTPUT_CSV = 'bverfg230107_with_break_noNaN.csv'

# Compiled once; collapses runs of spaces before words are counted.
_SPACE_RUN = re.compile(' +')


def _clean_text(text):
    """Return *text* with newlines removed and runs of spaces collapsed."""
    return _SPACE_RUN.sub(' ', text.replace('\n', ''))


def remove_irrelevant_cases(df, skip_rows=3, min_words=100,
                            min_judges_len=3, verbose=True):
    """Return a filtered copy of *df* with a leading ``uid`` column.

    The first ``skip_rows`` rows are discarded. Of the remaining rows, a row
    is dropped when either

    * its ``full_text`` has fewer than ``min_words`` space-separated words
      after newlines are removed and runs of spaces are collapsed, or
    * its ``judges`` entry has fewer than ``min_judges_len`` characters.

    Cells that are not strings (e.g. NaN read from an empty CSV field) are
    treated as missing and the row is dropped instead of raising.

    The whitespace clean-up is used for counting only: the ``full_text``
    values in the returned frame are the original ones. Only ``full_text``
    and ``judges`` are inspected, so other columns may still hold NaN.

    Args:
        df: Frame with at least the columns ``full_text`` and ``judges``.
            It is not modified.
        skip_rows: Non-negative number of leading rows to discard.
        min_words: Minimum word count a ``full_text`` needs to be kept.
        min_judges_len: Minimum character length of ``judges`` to be kept.
        verbose: When true, print the offending value of every dropped row.

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        kept) with ``uid`` inserted as first column, numbered 0..n-1.

    Raises:
        KeyError: If ``full_text`` or ``judges`` is not a column of *df*.
        ValueError: If *df* already has a ``uid`` column.
    """
    # Positional slice: selects the same rows as dropping the first labels of
    # a unique index, but does not fail when df has fewer than skip_rows rows.
    candidates = df.iloc[skip_rows:]

    # Pass 1: flag rows whose text is missing or too short.
    too_short = []
    for raw_text in candidates['full_text']:
        if isinstance(raw_text, str):
            shown = _clean_text(raw_text)
            words = shown.split(' ')
        else:
            # Missing / non-text cell: counts as zero words.
            shown, words = raw_text, []
        is_short = len(words) < min_words
        if is_short and verbose:
            print("row['full_text']: ", shown)
            print("len of full_text:", len(words))
            print("row['full_text'].split(' '):", words)
        too_short.append(is_short)

    # Pass 2: flag rows whose judges entry is missing or too short. Kept as a
    # separate pass so the diagnostics appear in the same order as before
    # (all text messages first, then all judges messages).
    bad_judges = []
    for judges in candidates['judges']:
        is_bad = not isinstance(judges, str) or len(judges) < min_judges_len
        if is_bad and verbose:
            print("row['judges']:", judges)
        bad_judges.append(is_bad)

    keep = ~(np.array(too_short, dtype=bool) | np.array(bad_judges, dtype=bool))
    # Explicit copy so that adding the uid column never touches the caller's df.
    cleaned = candidates.loc[keep].copy()

    # Additional uid column (updated id) running from 0 to len(cleaned) - 1.
    cleaned.insert(0, 'uid', value=np.arange(len(cleaned)))
    return cleaned


def main():
    """Read INPUT_CSV, filter it and write the result to OUTPUT_CSV."""
    df = pd.read_csv(INPUT_CSV)
    updated_df = remove_irrelevant_cases(df)
    updated_df.to_csv(OUTPUT_CSV, index=False)
    print('before:', len(df.index))
    print('after:', len(updated_df.index))


if __name__ == '__main__':
    main()